2012-02-18 20:03:15 +08:00
|
|
|
//===-- PPCInstrInfo.td - The PowerPC Instruction Set ------*- tablegen -*-===//
|
|
|
|
//
|
2004-06-22 00:55:25 +08:00
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
2007-12-30 04:36:04 +08:00
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
2012-02-18 20:03:15 +08:00
|
|
|
//
|
2004-06-22 00:55:25 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
2004-08-10 01:24:04 +08:00
|
|
|
// This file describes the subset of the 32-bit PowerPC instruction set, as used
|
|
|
|
// by the PowerPC instruction selector.
|
2004-06-22 00:55:25 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2005-10-15 07:40:39 +08:00
|
|
|
include "PPCInstrFormats.td"
|
2004-06-22 00:55:25 +08:00
|
|
|
|
2006-03-01 13:50:56 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// PowerPC specific type constraints.
|
|
|
|
//
|
|
|
|
// Type profiles for the FP <-> integer-word memory nodes and VEXTS.
// SDTypeProfile<R, O, ...> declares R results and O operands; constraint
// indices count the results first and then the operands.

// stfiwx: no result; operand 0 is f64, operand 1 is pointer-typed.
def SDT_PPCstfiwx : SDTypeProfile<0, 2, [
  SDTCisVT<0, f64>,
  SDTCisPtrTy<1>
]>;

// lfiw[az]x: f64 result (index 0) and one pointer-typed operand (index 1).
def SDT_PPClfiwx : SDTypeProfile<1, 1, [
  SDTCisVT<0, f64>,
  SDTCisPtrTy<1>
]>;

// f64 result with two pointer-typed operands.
def SDT_PPCLxsizx : SDTypeProfile<1, 2, [
  SDTCisVT<0, f64>,
  SDTCisPtrTy<1>,
  SDTCisPtrTy<2>
]>;

// No result; operand 0 is f64, operands 1 and 2 are pointer-typed.
def SDT_PPCstxsix : SDTypeProfile<0, 3, [
  SDTCisVT<0, f64>,
  SDTCisPtrTy<1>,
  SDTCisPtrTy<2>
]>;

// f64 result, f64 first operand, pointer-typed second operand.
def SDT_PPCVexts : SDTypeProfile<1, 2, [
  SDTCisVT<0, f64>,
  SDTCisVT<1, f64>,
  SDTCisPtrTy<2>
]>;
|
2013-03-31 18:12:51 +08:00
|
|
|
|
2007-11-13 17:19:02 +08:00
|
|
|
// PowerPC-specific operand typing for the call-sequence markers: every
// listed operand is constrained to i32.
def SDT_PPCCallSeqStart : SDCallSeqStart<[
  SDTCisVT<0, i32>
]>;
def SDT_PPCCallSeqEnd : SDCallSeqEnd<[
  SDTCisVT<0, i32>,
  SDTCisVT<1, i32>
]>;
|
2006-03-20 09:53:53 +08:00
|
|
|
// One result and three operands.  The first two operands share the result
// type; the last operand (index 3) must be v16i8.
def SDT_PPCvperm : SDTypeProfile<1, 3, [
  SDTCisVT<3, v16i8>,
  SDTCisSameAs<0, 1>,
  SDTCisSameAs<0, 2>
]>;
|
|
|
|
|
2016-05-05 00:04:02 +08:00
|
|
|
// Vector result, one vector operand and a trailing integer operand.
def SDT_PPCVecSplat : SDTypeProfile<1, 2, [
  SDTCisVec<0>,
  SDTCisVec<1>,
  SDTCisInt<2>
]>;

// Vector result, two vector operands and a trailing integer operand.
def SDT_PPCVecShift : SDTypeProfile<1, 3, [
  SDTCisVec<0>,
  SDTCisVec<1>,
  SDTCisVec<2>,
  SDTCisInt<3>
]>;

// Same constraint set as SDT_PPCVecShift, kept as a separate profile.
def SDT_PPCVecInsert : SDTypeProfile<1, 3, [
  SDTCisVec<0>,
  SDTCisVec<1>,
  SDTCisVec<2>,
  SDTCisInt<3>
]>;
|
|
|
|
|
2006-03-31 13:13:27 +08:00
|
|
|
// One result and three operands.  The result and the first two operands all
// share one type; the third operand (index 3) is an i32.
def SDT_PPCvcmp : SDTypeProfile<1, 3, [
  SDTCisSameAs<0, 1>,
  SDTCisSameAs<1, 2>,
  SDTCisVT<3, i32>
]>;
|
|
|
|
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
llvm-svn: 27804
2006-04-19 01:59:36 +08:00
|
|
|
// No result, three operands.  Operand 0 is an i32 and operand 2 is OtherVT;
// operand 1 is left unconstrained by this profile.
def SDT_PPCcondbr : SDTypeProfile<0, 3, [
  SDTCisVT<0, i32>,
  SDTCisVT<2, OtherVT>
]>;
|
|
|
|
|
2009-09-26 04:36:54 +08:00
|
|
|
// Integer result, a pointer-typed operand, then an OtherVT operand.
def SDT_PPClbrx : SDTypeProfile<1, 2, [
  SDTCisInt<0>,
  SDTCisPtrTy<1>,
  SDTCisVT<2, OtherVT>
]>;

// No result; integer operand, pointer-typed operand, then an OtherVT operand.
def SDT_PPCstbrx : SDTypeProfile<0, 3, [
  SDTCisInt<0>,
  SDTCisPtrTy<1>,
  SDTCisVT<2, OtherVT>
]>;
|
|
|
|
|
2008-04-30 17:16:33 +08:00
|
|
|
// No result; a pointer-typed operand followed by an i32 operand.
def SDT_PPCTC_ret : SDTypeProfile<0, 2, [
  SDTCisPtrTy<0>,
  SDTCisVT<1, i32>
]>;
|
|
|
|
|
2014-07-19 07:29:49 +08:00
|
|
|
// Pointer-typed operand whose machine-operand layout is a single i32
// immediate named $imm.
def tocentry32 : Operand<iPTR> {
  let MIOperandInfo = (ops i32imm:$imm);
}
|
2009-08-15 19:54:46 +08:00
|
|
|
|
[PowerPC] Add support for the QPX vector instruction set
This adds support for the QPX vector instruction set, which is used by the
enhanced A2 cores on the IBM BG/Q supercomputers. QPX vectors are 256 bits
wide, holding 4 double-precision floating-point values. Boolean values, modeled
here as <4 x i1> are actually also represented as floating-point values
(essentially { -1, 1 } for { false, true }). QPX shares many features with
Altivec and VSX, but is distinct from both of them. One major difference is
that, instead of adding completely-separate vector registers, QPX vector
registers are extensions of the scalar floating-point registers (lane 0 is the
corresponding scalar floating-point value). The operations supported on QPX
vectors mirrors that supported on the scalar floating-point values (with some
additional ones for permutations and logical/comparison operations).
I've been maintaining this support out-of-tree, as part of the bgclang project,
for several years. This is not the entire bgclang patch set, but is most of the
subset that can be cleanly integrated into LLVM proper at this time. Adding
this to the LLVM backend is part of my efforts to rebase bgclang to the current
LLVM trunk, but is independently useful (especially for codes that use LLVM as
a JIT in library form).
The assembler/disassembler test coverage is complete. The CodeGen test coverage
is not, but I've included some tests, and more will be added as follow-up work.
llvm-svn: 230413
2015-02-25 09:06:45 +08:00
|
|
|
// QPX type profiles.  Every profile below has exactly one vector result.

// Two operands of the result type plus a third vector operand.
def SDT_PPCqvfperm : SDTypeProfile<1, 3, [
  SDTCisVec<0>,
  SDTCisSameAs<0, 1>,
  SDTCisSameAs<0, 2>,
  SDTCisVec<3>
]>;

// A single integer operand.
def SDT_PPCqvgpci : SDTypeProfile<1, 1, [
  SDTCisVec<0>,
  SDTCisInt<1>
]>;

// Two operands of the result type plus a trailing integer operand.
def SDT_PPCqvaligni : SDTypeProfile<1, 3, [
  SDTCisVec<0>,
  SDTCisSameAs<0, 1>,
  SDTCisSameAs<0, 2>,
  SDTCisInt<3>
]>;

// One operand of the result type plus a trailing integer operand.
def SDT_PPCqvesplati : SDTypeProfile<1, 2, [
  SDTCisVec<0>,
  SDTCisSameAs<0, 1>,
  SDTCisInt<2>
]>;

// A single vector operand (not tied to the result type).
def SDT_PPCqbflt : SDTypeProfile<1, 1, [
  SDTCisVec<0>,
  SDTCisVec<1>
]>;

// A single pointer-typed operand.
def SDT_PPCqvlfsb : SDTypeProfile<1, 1, [
  SDTCisVec<0>,
  SDTCisPtrTy<1>
]>;
|
|
|
|
|
2005-10-26 04:41:46 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// PowerPC specific DAG Nodes.
|
|
|
|
//
|
|
|
|
|
2013-04-03 12:01:11 +08:00
|
|
|
// Floating-point nodes built on the generic FP profiles.  None of them
// carries any node property.
def PPCfre     : SDNode<"PPCISD::FRE",     SDTFPUnaryOp, []>;
def PPCfrsqrte : SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>;

// FCFID* family: the two 'S'-suffixed nodes use SDTFPRoundOp, the others
// SDTFPUnaryOp.
def PPCfcfid   : SDNode<"PPCISD::FCFID",   SDTFPUnaryOp, []>;
def PPCfcfidu  : SDNode<"PPCISD::FCFIDU",  SDTFPUnaryOp, []>;
def PPCfcfids  : SDNode<"PPCISD::FCFIDS",  SDTFPRoundOp, []>;
def PPCfcfidus : SDNode<"PPCISD::FCFIDUS", SDTFPRoundOp, []>;

// FCTI*Z family.
def PPCfctidz  : SDNode<"PPCISD::FCTIDZ",  SDTFPUnaryOp, []>;
def PPCfctiwz  : SDNode<"PPCISD::FCTIWZ",  SDTFPUnaryOp, []>;
def PPCfctiduz : SDNode<"PPCISD::FCTIDUZ", SDTFPUnaryOp, []>;
def PPCfctiwuz : SDNode<"PPCISD::FCTIWUZ", SDTFPUnaryOp, []>;
|
2008-01-06 14:44:58 +08:00
|
|
|
// Chained memory nodes.  Stores are marked SDNPMayStore and loads
// SDNPMayLoad; only LFIWAX and LFIWZX additionally carry SDNPMemOperand.
def PPCstfiwx
    : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx, [SDNPHasChain, SDNPMayStore]>;
def PPClfiwax
    : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx,
             [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPClfiwzx
    : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx,
             [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPClxsizx
    : SDNode<"PPCISD::LXSIZX", SDT_PPCLxsizx, [SDNPHasChain, SDNPMayLoad]>;
def PPCstxsix
    : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix, [SDNPHasChain, SDNPMayStore]>;

// VEXTS has no chain and no memory properties.
def PPCVexts
    : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>;
|
2005-10-26 04:41:46 +08:00
|
|
|
|
2013-03-26 18:56:22 +08:00
|
|
|
// Extract FPSCR (not modeled at the DAG level).
// Takes no operands and yields a single f64 result.
def PPCmffs
    : SDNode<"PPCISD::MFFS", SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>, []>;

// Perform FADD in round-to-zero mode.
def PPCfaddrtz
    : SDNode<"PPCISD::FADDRTZ", SDTFPBinOp, []>;
|
|
|
|
|
2007-10-10 09:01:31 +08:00
|
|
|
|
2005-10-26 04:55:47 +08:00
|
|
|
// Type constraint for fsel: one FP result and three operands.  Operand
// indices 2 and 3 share the result type; operand index 1 is always f64.
def PPCfsel : SDNode<"PPCISD::FSEL",
                     SDTypeProfile<1, 3, [
                       SDTCisSameAs<0, 2>,
                       SDTCisSameAs<0, 3>,
                       SDTCisFP<0>,
                       SDTCisVT<1, f64>
                     ]>,
                     []>;
|
2005-09-09 03:50:41 +08:00
|
|
|
|
2005-12-14 06:55:22 +08:00
|
|
|
// Hi/Lo nodes: integer binary-op profile, no node properties.
def PPChi : SDNode<"PPCISD::Hi", SDTIntBinOp, []>;
def PPClo : SDNode<"PPCISD::Lo", SDTIntBinOp, []>;
|
[PowerPC] Make LDtocL and friends invariant loads
LDtocL, and other loads that roughly correspond to the TOC_ENTRY SDAG node,
represent loads from the TOC, which is invariant. As a result, these loads can
be hoisted out of loops, etc. In order to do this, we need to generate
GOT-style MMOs for TOC_ENTRY, which requires treating it as a legitimate memory
intrinsic node type. Once this is done, the MMO transfer is automatically
handled for TableGen-driven instruction selection, and for nodes generated
directly in PPCISelDAGToDAG, we need to transfer the MMOs manually.
Also, we were not transferring MMOs associated with pre-increment loads, so do
that too.
Lastly, this fixes an exposed bug where R30 was not added as a defined operand of
UpdateGBR.
This problem was highlighted by an example (used to generate the test case)
posted to llvmdev by Francois Pichet.
llvm-svn: 230553
2015-02-26 05:36:59 +08:00
|
|
|
// TOC_ENTRY is typed as an integer binary op but is flagged as a load that
// carries a memory operand.
def PPCtoc_entry
    : SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, [SDNPMayLoad, SDNPMemOperand]>;
|
2005-12-14 06:55:22 +08:00
|
|
|
// Three-operand FP nodes (generic SDTFPTernaryOp profile).
def PPCvmaddfp  : SDNode<"PPCISD::VMADDFP",  SDTFPTernaryOp, []>;
def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>;

// Operand-less integer leaf node.
def PPCppc32GOT : SDNode<"PPCISD::PPC32_GOT", SDTIntLeaf, []>;
|
|
|
|
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
// GOT/TPREL TLS nodes, all typed as integer binary ops.  Only
// LD_GOT_TPREL_L is flagged as a load.
def PPCaddisGotTprelHA
    : SDNode<"PPCISD::ADDIS_GOT_TPREL_HA", SDTIntBinOp>;
def PPCldGotTprelL
    : SDNode<"PPCISD::LD_GOT_TPREL_L", SDTIntBinOp, [SDNPMayLoad]>;
def PPCaddTls
    : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>;
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
// TLSGD nodes.  The first three are plain integer binary ops.
def PPCaddisTlsgdHA
    : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>;
def PPCaddiTlsgdL
    : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>;
def PPCgetTlsAddr
    : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>;

// ADDI_TLSGD_L_ADDR takes three operands, all of the same integer type as
// its single result.
def PPCaddiTlsgdLAddr
    : SDNode<"PPCISD::ADDI_TLSGD_L_ADDR",
             SDTypeProfile<1, 3, [
               SDTCisSameAs<0, 1>,
               SDTCisSameAs<0, 2>,
               SDTCisSameAs<0, 3>,
               SDTCisInt<0>
             ]>>;
|
2012-12-13 03:29:35 +08:00
|
|
|
// TLSLD nodes.  The first three are plain integer binary ops.
def PPCaddisTlsldHA
    : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>;
def PPCaddiTlsldL
    : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>;
def PPCgetTlsldAddr
    : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>;

// ADDI_TLSLD_L_ADDR takes three operands, all of the same integer type as
// its single result.
def PPCaddiTlsldLAddr
    : SDNode<"PPCISD::ADDI_TLSLD_L_ADDR",
             SDTypeProfile<1, 3, [
               SDTCisSameAs<0, 1>,
               SDTCisSameAs<0, 2>,
               SDTCisSameAs<0, 3>,
               SDTCisInt<0>
             ]>>;

// DTPREL high-adjusted / low nodes, again integer binary ops.
def PPCaddisDtprelHA
    : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp>;
def PPCaddiDtprelL
    : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
|
2012-12-05 00:18:08 +08:00
|
|
|
|
2016-07-13 05:00:10 +08:00
|
|
|
// Vector permute / splat / insert / shift nodes.  Each pairs a PPCISD opcode
// with its dedicated type profile and carries no node properties.
def PPCvperm    : SDNode<"PPCISD::VPERM",    SDT_PPCvperm,     []>;
def PPCxxsplt   : SDNode<"PPCISD::XXSPLT",   SDT_PPCVecSplat,  []>;
def PPCxxinsert : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>;
def PPCvecshl   : SDNode<"PPCISD::VECSHL",   SDT_PPCVecShift,  []>;
|
2006-03-19 14:55:52 +08:00
|
|
|
|
[PowerPC] Add support for the QPX vector instruction set
This adds support for the QPX vector instruction set, which is used by the
enhanced A2 cores on the IBM BG/Q supercomputers. QPX vectors are 256 bits
wide, holding 4 double-precision floating-point values. Boolean values, modeled
here as <4 x i1> are actually also represented as floating-point values
(essentially { -1, 1 } for { false, true }). QPX shares many features with
Altivec and VSX, but is distinct from both of them. One major difference is
that, instead of adding completely-separate vector registers, QPX vector
registers are extensions of the scalar floating-point registers (lane 0 is the
corresponding scalar floating-point value). The operations supported on QPX
vectors mirrors that supported on the scalar floating-point values (with some
additional ones for permutations and logical/comparison operations).
I've been maintaining this support out-of-tree, as part of the bgclang project,
for several years. This is not the entire bgclang patch set, but is most of the
subset that can be cleanly integrated into LLVM proper at this time. Adding
this to the LLVM backend is part of my efforts to rebase bgclang to the current
LLVM trunk, but is independently useful (especially for codes that use LLVM as
a JIT in library form).
The assembler/disassembler test coverage is complete. The CodeGen test coverage
is not, but I've included some tests, and more will be added as follow-up work.
llvm-svn: 230413
2015-02-25 09:06:45 +08:00
|
|
|
// QPX nodes, each bound to the matching SDT_PPCq* profile.
def PPCqvfperm   : SDNode<"PPCISD::QVFPERM",   SDT_PPCqvfperm,   []>;
def PPCqvgpci    : SDNode<"PPCISD::QVGPCI",    SDT_PPCqvgpci,    []>;
def PPCqvaligni  : SDNode<"PPCISD::QVALIGNI",  SDT_PPCqvaligni,  []>;
def PPCqvesplati : SDNode<"PPCISD::QVESPLATI", SDT_PPCqvesplati, []>;

def PPCqbflt     : SDNode<"PPCISD::QBFLT",     SDT_PPCqbflt,     []>;

// The only QPX node here that is chained and flagged as a load.
def PPCqvlfsb    : SDNode<"PPCISD::QVLFSb",    SDT_PPCqvlfsb,
                          [SDNPHasChain, SDNPMayLoad]>;
|
|
|
|
|
2015-01-03 09:16:37 +08:00
|
|
|
// CMPB node: integer binary-op profile, no node properties.
def PPCcmpb : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>;
|
|
|
|
|
2005-12-06 10:10:38 +08:00
|
|
|
// These nodes represent the 32-bit PPC shifts that operate on 6-bit shift
// amounts.  These nodes are generated by the multi-precision shift code.
// All three use the generic integer shift profile.
def PPCsrl : SDNode<"PPCISD::SRL", SDTIntShiftOp>;
def PPCsra : SDNode<"PPCISD::SRA", SDTIntShiftOp>;
def PPCshl : SDNode<"PPCISD::SHL", SDTIntShiftOp>;
|
2005-12-06 10:10:38 +08:00
|
|
|
|
2005-12-05 03:01:59 +08:00
|
|
|
// These are target-independent nodes, but have target-specific formats.
// Both are chained and produce glue; only callseq_end may also consume glue.
def callseq_start
    : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart,
             [SDNPHasChain, SDNPOutGlue]>;
def callseq_end
    : SDNode<"ISD::CALLSEQ_END", SDT_PPCCallSeqEnd,
             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
|
2005-12-05 03:01:59 +08:00
|
|
|
|
2006-06-28 02:36:44 +08:00
|
|
|
// Call profile: no results, operand count given as -1, and operand 0 must
// be an integer.
def SDT_PPCCall : SDTypeProfile<0, -1, [SDTCisInt<0>]>;

// Call-like nodes.  All are chained, may consume glue and produce glue;
// every one except MTCTR is also variadic.
def PPCcall
    : SDNode<"PPCISD::CALL", SDT_PPCCall,
             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
def PPCcall_nop
    : SDNode<"PPCISD::CALL_NOP", SDT_PPCCall,
             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
def PPCmtctr
    : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def PPCbctrl
    : SDNode<"PPCISD::BCTRL", SDTNone,
             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;

// BCTRL_LOAD_TOC uses an inline profile: no results, one unconstrained
// operand.
def PPCbctrl_load_toc
    : SDNode<"PPCISD::BCTRL_LOAD_TOC", SDTypeProfile<0, 1, []>,
             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
|
2006-05-17 14:01:33 +08:00
|
|
|
|
2008-01-16 06:02:54 +08:00
|
|
|
// Return and tail-call-return nodes: chained, may consume glue, variadic.
// Neither produces glue.
def retflag
    : SDNode<"PPCISD::RET_FLAG", SDTNone,
             [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def PPCtc_return
    : SDNode<"PPCISD::TC_RETURN", SDT_PPCTC_ret,
             [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
|
2008-04-30 17:16:33 +08:00
|
|
|
|
2013-03-22 05:37:52 +08:00
|
|
|
// SjLj exception-handling nodes.  Both are chained and marked as having
// side effects.

// setjmp: integer result, one pointer-typed operand.
def PPCeh_sjlj_setjmp
    : SDNode<"PPCISD::EH_SJLJ_SETJMP",
             SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
             [SDNPHasChain, SDNPSideEffect]>;

// longjmp: no result, one pointer-typed operand.
def PPCeh_sjlj_longjmp
    : SDNode<"PPCISD::EH_SJLJ_LONGJMP",
             SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
             [SDNPHasChain, SDNPSideEffect]>;
|
|
|
|
|
2013-05-15 03:35:45 +08:00
|
|
|
// SC: no result and a single integer operand; chained, with side effects.
def SDT_PPCsc : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def PPCsc
    : SDNode<"PPCISD::SC", SDT_PPCsc, [SDNPHasChain, SDNPSideEffect]>;
|
|
|
|
|
2015-05-23 00:44:10 +08:00
|
|
|
// CLRBHRB / MFBHRBE / RFEBB.  All are chained; CLRBHRB and RFEBB are also
// marked as having side effects, MFBHRBE is not.
def PPCclrbhrb
    : SDNode<"PPCISD::CLRBHRB", SDTNone, [SDNPHasChain, SDNPSideEffect]>;
def PPCmfbhrbe
    : SDNode<"PPCISD::MFBHRBE", SDTIntBinOp, [SDNPHasChain]>;
def PPCrfebb
    : SDNode<"PPCISD::RFEBB", SDT_PPCsc, [SDNPHasChain, SDNPSideEffect]>;
|
|
|
|
|
2006-03-31 13:13:27 +08:00
|
|
|
// Vector compare nodes sharing SDT_PPCvcmp.  VCMPo differs from VCMP only in
// that it additionally produces glue.
def PPCvcmp   : SDNode<"PPCISD::VCMP",  SDT_PPCvcmp, []>;
def PPCvcmp_o : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>;
|
2006-03-26 18:06:40 +08:00
|
|
|
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
llvm-svn: 27804
2006-04-19 01:59:36 +08:00
|
|
|
// Conditional branch node: chained, and may consume incoming glue.
def PPCcondbranch
    : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
             [SDNPHasChain, SDNPOptInGlue]>;
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
llvm-svn: 27804
2006-04-19 01:59:36 +08:00
|
|
|
|
2008-01-10 13:12:37 +08:00
|
|
|
// LBRX / STBRX memory nodes.  Both are chained; LBRX is a load that also
// carries a memory operand, STBRX is a store without SDNPMemOperand.
def PPClbrx
    : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
             [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCstbrx
    : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
             [SDNPHasChain, SDNPMayStore]>;
|
2006-07-11 04:56:58 +08:00
|
|
|
|
2012-08-28 10:10:27 +08:00
|
|
|
// Instructions to set/unset CR bit 6 for SVR4 vararg calls.
// Both nodes take no typed operands, are chained, may consume glue and
// produce glue.
def PPCcr6set
    : SDNode<"PPCISD::CR6SET", SDTNone,
             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def PPCcr6unset
    : SDNode<"PPCISD::CR6UNSET", SDTNone,
             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
|
|
|
|
|
2006-11-17 06:43:37 +08:00
|
|
|
// Instructions to support dynamic alloca.
// Both profiles have one result and leave every type unconstrained; they
// differ only in operand count (two vs. one).
def SDTDynOp     : SDTypeProfile<1, 2, []>;
def SDTDynAreaOp : SDTypeProfile<1, 1, []>;

// Both nodes are chained.
def PPCdynalloc
    : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>;
def PPCdynareaoffset
    : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>;
|
2006-11-17 06:43:37 +08:00
|
|
|
|
2005-09-09 03:50:41 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
2005-09-09 08:39:56 +08:00
|
|
|
// PowerPC specific transformation functions and pattern fragments.
|
|
|
|
//
|
Woo, it kinda works. We now generate this atrociously bad, but correct,
code for long long foo(long long a, long long b) { return a + b; }
_foo:
or r2, r3, r3
or r3, r4, r4
or r4, r5, r5
or r5, r6, r6
rldicr r2, r2, 32, 31
rldicl r3, r3, 0, 32
rldicr r4, r4, 32, 31
rldicl r5, r5, 0, 32
or r2, r3, r2
or r3, r5, r4
add r4, r3, r2
rldicl r2, r4, 32, 32
or r4, r4, r4
or r3, r2, r2
blr
llvm-svn: 23809
2005-10-19 09:12:32 +08:00
|
|
|
|
2005-10-20 02:42:01 +08:00
|
|
|
def SHL32 : SDNodeXForm<imm, [{
  // Transformation function: rewrite the immediate as (31 - imm), emitted as
  // an i32 immediate at the node's source location.
  const uint64_t Amount = N->getZExtValue();
  return getI32Imm(31 - Amount, SDLoc(N));
}]>;
|
|
|
|
|
|
|
|
def SRL32 : SDNodeXForm<imm, [{
  // Transformation function: rewrite the immediate as (32 - imm), except
  // that an immediate of zero stays zero instead of becoming 32.
  const uint64_t Amount = N->getZExtValue();
  if (!Amount)
    return getI32Imm(0, SDLoc(N));
  return getI32Imm(32 - Amount, SDLoc(N));
}]>;
|
|
|
|
|
2005-09-09 08:39:56 +08:00
|
|
|
def LO16 : SDNodeXForm<imm, [{
  // Transformation function: keep only the low 16 bits of the immediate.
  const unsigned short LowHalf = (unsigned short)N->getZExtValue();
  return getI32Imm(LowHalf, SDLoc(N));
}]>;
|
|
|
|
|
|
|
|
def HI16 : SDNodeXForm<imm, [{
  // Transformation function: truncate the immediate to 32 bits, then shift
  // its upper half down into the low 16 bits.
  const unsigned Word = (unsigned)N->getZExtValue();
  return getI32Imm(Word >> 16, SDLoc(N));
}]>;
|
2005-09-09 01:33:10 +08:00
|
|
|
|
2005-09-29 07:07:13 +08:00
|
|
|
def HA16 : SDNodeXForm<imm, [{
  // Transformation function: compute the "high adjusted" 16 bits of the
  // immediate.  Unlike HI16 this is not a plain shift: the low 16 bits,
  // read as a signed short, are subtracted first.  When that low half is
  // negative the subtraction carries one into the upper half, so that
  // (result << 16) plus the sign-extended low half gives back the
  // (int-truncated) immediate.
  int Val = N->getZExtValue();
  int SignedLow = (signed short)Val;
  return getI32Imm((Val - SignedLow) >> 16, SDLoc(N));
}]>;
|
2006-09-22 13:01:56 +08:00
|
|
|
def MB : SDNodeXForm<imm, [{
  // Transformation function: get the start bit of a mask.
  // The boolean result of isRunOfOnes is deliberately ignored.  The start
  // index is pre-set to 0, so 0 is what gets emitted if the helper does not
  // write it (helper is defined elsewhere -- behaviour on failure not
  // visible here).
  unsigned MaskBegin = 0;
  unsigned MaskEnd;
  (void)isRunOfOnes((unsigned)N->getZExtValue(), MaskBegin, MaskEnd);
  return getI32Imm(MaskBegin, SDLoc(N));
}]>;
|
2005-09-29 07:07:13 +08:00
|
|
|
|
2006-09-22 13:01:56 +08:00
|
|
|
def ME : SDNodeXForm<imm, [{
  // Transformation function: get the end bit of a mask.
  // The boolean result of isRunOfOnes is deliberately ignored.  The end
  // index is pre-set to 0, so 0 is what gets emitted if the helper does not
  // write it (helper is defined elsewhere -- behaviour on failure not
  // visible here).
  unsigned MaskBegin;
  unsigned MaskEnd = 0;
  (void)isRunOfOnes((unsigned)N->getZExtValue(), MaskBegin, MaskEnd);
  return getI32Imm(MaskEnd, SDLoc(N));
}]>;
|
|
|
|
def maskimm32 : PatLeaf<(imm), [{
  // maskimm32 predicate - True if the node is an i32 immediate that
  // isRunOfOnes accepts.  Any other value type is rejected outright.
  if (N->getValueType(0) != MVT::i32)
    return false;

  unsigned mb, me;
  return isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
}]>;
|
2005-09-29 07:07:13 +08:00
|
|
|
|
Change some PowerPC PatLeaf definitions to ImmLeaf for fast-isel.
Using PatLeaf rather than ImmLeaf when defining immediate predicates
prevents simple patterns using those predicates from being recognized
for fast instruction selection. This patch replaces the immSExt16
PatLeaf predicate with two ImmLeaf predicates, imm32SExt16 and
imm64SExt16, allowing a few more patterns to be recognized (ADDI,
ADDIC, MULLI, ADDI8, and ADDIC8). Using the new predicates does not
help for LI, LI8, SUBFIC, and SUBFIC8 because these are rejected for
other reasons, but I see no reason to retain the PatLeaf predicate.
No functional change intended, and thus no test cases yet. This is
preliminary work for enabling fast-isel support for PowerPC. When
that support is ready, we'll be able to test this function.
llvm-svn: 182510
2013-05-23 04:09:24 +08:00
|
|
|
def imm32SExt16 : Operand<i32>, ImmLeaf<i32, [{
  // imm32SExt16 predicate - True if the i32 immediate fits in a 16-bit
  // sign extended field.  Used by instructions like 'addi'.
  // Implemented by checking that narrowing to a short loses nothing
  // relative to the 32-bit value.
  const int32_t Wide = (int32_t)Imm;
  const short Narrow = (short)Imm;
  return Wide == Narrow;
}]>;
|
|
|
|
def imm64SExt16 : Operand<i64>, ImmLeaf<i64, [{
  // imm64SExt16 predicate - True if the i64 immediate fits in a 16-bit
  // sign extended field.  Used by instructions like 'addi'.
  // Implemented by checking that narrowing to a short loses nothing
  // relative to the 64-bit value.
  const int64_t Wide = (int64_t)Imm;
  const short Narrow = (short)Imm;
  return Wide == Narrow;
}]>;
|
2005-09-09 01:40:49 +08:00
|
|
|
def immZExt16 : PatLeaf<(imm), [{
  // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended
  // field.  Used by instructions like 'ori'.
  // On a match the operand is rewritten through LO16.
  const uint64_t Val = N->getZExtValue();
  return Val == (unsigned short)Val;
}], LO16>;
|
2016-12-15 19:16:20 +08:00
|
|
|
// True if the i32 immediate fits in 8 bits under either a signed or an
// unsigned interpretation.
def immAnyExt8 : ImmLeaf<i32, [{
  return isInt<8>(Imm) || isUInt<8>(Imm);
}]>;

// True if the i32 immediate is non-zero and fits in a signed 5-bit field.
def immSExt5NonZero : ImmLeaf<i32, [{
  return Imm && isInt<5>(Imm);
}]>;
|
2005-09-09 08:39:56 +08:00
|
|
|
|
Add some 64-bit logical ops.
Split imm16Shifted into a sext/zext form for 64-bit support.
Add some patterns for immediate formation. For example, we now compile this:
static unsigned long long Y;
void test3() {
Y = 0xF0F00F00;
}
into:
_test3:
li r2, 3840
lis r3, ha16(_Y)
xoris r2, r2, 61680
std r2, lo16(_Y)(r3)
blr
GCC produces:
_test3:
li r0,0
lis r2,ha16(_Y)
ori r0,r0,61680
sldi r0,r0,16
ori r0,r0,3840
std r0,lo16(_Y)(r2)
blr
llvm-svn: 28883
2006-06-21 06:34:10 +08:00
|
|
|
// imm16Shifted* - These match immediates where the low 16-bits are zero. There
|
|
|
|
// are two forms: imm16ShiftedSExt and imm16ShiftedZExt. These two forms are
|
|
|
|
// identical in 32-bit mode, but in 64-bit mode, they return true if the
|
|
|
|
// immediate fits into a sign/zero extended 32-bit immediate (with the low bits
|
|
|
|
// clear).
|
|
|
|
def imm16ShiftedZExt : PatLeaf<(imm), [{
  // imm16ShiftedZExt predicate - True if only bits in the top 16-bits of the
  // immediate are set.  Used by instructions like 'xoris'.
  // Every bit outside 0xFFFF0000 -- the low half and everything above bit
  // 31 -- must be clear.  On a match the operand is rewritten through HI16.
  const uint64_t Val = N->getZExtValue();
  const uint64_t UpperHalf = 0xFFFF0000;
  return (Val & ~UpperHalf) == 0;
}], HI16>;
|
|
|
|
|
|
|
|
def imm16ShiftedSExt : PatLeaf<(imm), [{
  // imm16ShiftedSExt predicate - True if only bits in the top 16-bits of the
  // immediate are set.  Used by instructions like 'addis'.  Identical to
  // imm16ShiftedZExt in 32-bit mode.  On a match the operand is rewritten
  // through HI16.
  const uint64_t Val = N->getZExtValue();

  // Any bit set in the low half disqualifies the immediate.
  if (Val & 0xFFFF)
    return false;

  // An i32 immediate needs no further check.  For 64-bit, make sure the
  // value survives a round trip through a sign-extended int.
  return N->getValueType(0) == MVT::i32 || Val == (uint64_t)(int)Val;
}], HI16>;
|
2005-09-09 01:33:10 +08:00
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// imm64ZExt32 - An i64 immediate (also usable as an operand) that fits in a
// 32-bit zero-extended field.
def imm64ZExt32 : Operand<i64>, ImmLeaf<i64, [{
  const bool FitsInU32 = isUInt<32>(Imm);
  return FitsInU32;
}]>;
|
|
|
|
|
2013-03-19 07:00:58 +08:00
|
|
|
// Some r+i load/store instructions (such as LD, STD, LDU, etc.) that require
|
2013-05-17 01:58:02 +08:00
|
|
|
// restricted memrix (4-aligned) constants are alignment sensitive. If these
|
2013-03-19 07:00:58 +08:00
|
|
|
// offsets are hidden behind TOC entries, then the values of the lower-order
|
|
|
|
// bits cannot be checked directly. As a result, we need to also incorporate
|
|
|
|
// an alignment check into the relevant patterns.
|
|
|
|
|
|
|
|
// aligned4load - A plain load whose recorded alignment is at least 4.
def aligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
  const LoadSDNode *Load = cast<LoadSDNode>(N);
  return Load->getAlignment() >= 4;
}]>;
|
|
|
|
// aligned4store - A plain store whose recorded alignment is at least 4.
def aligned4store : PatFrag<(ops node:$val, node:$ptr),
                            (store node:$val, node:$ptr), [{
  const StoreSDNode *Store = cast<StoreSDNode>(N);
  return Store->getAlignment() >= 4;
}]>;
|
|
|
|
// aligned4sextloadi32 - A sextloadi32 whose recorded alignment is at least 4.
def aligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
  const LoadSDNode *Load = cast<LoadSDNode>(N);
  return Load->getAlignment() >= 4;
}]>;
|
|
|
|
// aligned4pre_store - A pre_store whose recorded alignment is at least 4.
def aligned4pre_store : PatFrag<
                          (ops node:$val, node:$base, node:$offset),
                          (pre_store node:$val, node:$base, node:$offset), [{
  const StoreSDNode *Store = cast<StoreSDNode>(N);
  return Store->getAlignment() >= 4;
}]>;
|
|
|
|
|
|
|
|
// unaligned4load - A plain load whose recorded alignment is below 4.
def unaligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
  const LoadSDNode *Load = cast<LoadSDNode>(N);
  return Load->getAlignment() < 4;
}]>;
|
|
|
|
// unaligned4store - A plain store whose recorded alignment is below 4.
def unaligned4store : PatFrag<(ops node:$val, node:$ptr),
                              (store node:$val, node:$ptr), [{
  const StoreSDNode *Store = cast<StoreSDNode>(N);
  return Store->getAlignment() < 4;
}]>;
|
|
|
|
// unaligned4sextloadi32 - A sextloadi32 whose recorded alignment is below 4.
def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
  const LoadSDNode *Load = cast<LoadSDNode>(N);
  return Load->getAlignment() < 4;
}]>;
|
2006-03-25 14:12:06 +08:00
|
|
|
|
2005-09-09 03:50:41 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// PowerPC Flag Definitions.
|
|
|
|
|
2005-04-19 12:32:54 +08:00
|
|
|
// Mixin class: sets the PPC64 bit to 1 on any record that inherits from it.
class isPPC64 {
  bit PPC64 = 1;
}
|
2013-04-13 02:17:57 +08:00
|
|
|
// Mixin class: sets the RC bit to 1 on any record that inherits from it.
class isDOT {
  bit RC = 1;
}
|
2005-04-19 12:32:54 +08:00
|
|
|
|
2006-11-08 10:13:12 +08:00
|
|
|
// Mixin class: stores the given string C in the Constraints field.
class RegConstraint<string C> {
  string Constraints = C;
}
|
2006-11-16 07:24:18 +08:00
|
|
|
// Mixin class: stores the given string E in the DisableEncoding field.
class NoEncode<string E> {
  string DisableEncoding = E;
}
|
2005-09-09 03:50:41 +08:00
|
|
|
|
|
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// PowerPC Operand Definitions.
|
2004-08-15 07:27:29 +08:00
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
// In the default PowerPC assembler syntax, registers are specified simply
|
|
|
|
// by number, so they cannot be distinguished from immediate values (without
|
|
|
|
// looking at the opcode). This means that the default operand matching logic
|
|
|
|
// for the asm parser does not work, and we need to specify custom matchers.
|
|
|
|
// Since those can only be specified with RegisterOperand classes and not
|
|
|
|
// directly on the RegisterClass, all instruction patterns used by the asm
|
|
|
|
// parser need to use a RegisterOperand (instead of a RegisterClass) for
|
|
|
|
// all their register operands.
|
|
|
|
// For this purpose, we define one RegisterOperand for each RegisterClass,
|
|
|
|
// using the same name as the class, just in lower case.
|
|
|
|
|
2013-05-04 03:49:39 +08:00
|
|
|
// Asm-parser match class and RegisterOperand wrapper for the GPRC class.
def PPCRegGPRCAsmOperand : AsmOperandClass {
  let Name = "RegGPRC";
  let PredicateMethod = "isRegNumber";
}
def gprc : RegisterOperand<GPRC> {
  let ParserMatchClass = PPCRegGPRCAsmOperand;
}
|
|
|
|
// Asm-parser match class and RegisterOperand wrapper for the G8RC class.
def PPCRegG8RCAsmOperand : AsmOperandClass {
  let Name = "RegG8RC";
  let PredicateMethod = "isRegNumber";
}
def g8rc : RegisterOperand<G8RC> {
  let ParserMatchClass = PPCRegG8RCAsmOperand;
}
|
|
|
|
// Asm-parser match class and RegisterOperand wrapper for the GPRC_NOR0 class.
def PPCRegGPRCNoR0AsmOperand : AsmOperandClass {
  let Name = "RegGPRCNoR0";
  let PredicateMethod = "isRegNumber";
}
def gprc_nor0 : RegisterOperand<GPRC_NOR0> {
  let ParserMatchClass = PPCRegGPRCNoR0AsmOperand;
}
|
|
|
|
// Asm-parser match class and RegisterOperand wrapper for the G8RC_NOX0 class.
def PPCRegG8RCNoX0AsmOperand : AsmOperandClass {
  let Name = "RegG8RCNoX0";
  let PredicateMethod = "isRegNumber";
}
def g8rc_nox0 : RegisterOperand<G8RC_NOX0> {
  let ParserMatchClass = PPCRegG8RCNoX0AsmOperand;
}
|
|
|
|
// Asm-parser match class and RegisterOperand wrapper for the F8RC class.
def PPCRegF8RCAsmOperand : AsmOperandClass {
  let Name = "RegF8RC";
  let PredicateMethod = "isRegNumber";
}
def f8rc : RegisterOperand<F8RC> {
  let ParserMatchClass = PPCRegF8RCAsmOperand;
}
|
|
|
|
// Asm-parser match class and RegisterOperand wrapper for the F4RC class.
def PPCRegF4RCAsmOperand : AsmOperandClass {
  let Name = "RegF4RC";
  let PredicateMethod = "isRegNumber";
}
def f4rc : RegisterOperand<F4RC> {
  let ParserMatchClass = PPCRegF4RCAsmOperand;
}
|
|
|
|
// Asm-parser match class and RegisterOperand wrapper for the VRRC class.
def PPCRegVRRCAsmOperand : AsmOperandClass {
  let Name = "RegVRRC";
  let PredicateMethod = "isRegNumber";
}
def vrrc : RegisterOperand<VRRC> {
  let ParserMatchClass = PPCRegVRRCAsmOperand;
}
|
2016-10-04 14:59:23 +08:00
|
|
|
// Asm-parser match class and RegisterOperand wrapper for the VFRC class.
def PPCRegVFRCAsmOperand : AsmOperandClass {
  let Name = "RegVFRC";
  let PredicateMethod = "isRegNumber";
}
def vfrc : RegisterOperand<VFRC> {
  let ParserMatchClass = PPCRegVFRCAsmOperand;
}
|
2013-05-04 03:49:39 +08:00
|
|
|
// Asm-parser match class and RegisterOperand wrapper for the CRBITRC class.
// Unlike the general register operands, this one is validated with the
// isCRBitNumber predicate.
def PPCRegCRBITRCAsmOperand : AsmOperandClass {
  let Name = "RegCRBITRC";
  let PredicateMethod = "isCRBitNumber";
}
def crbitrc : RegisterOperand<CRBITRC> {
  let ParserMatchClass = PPCRegCRBITRCAsmOperand;
}
|
|
|
|
// Asm-parser match class and RegisterOperand wrapper for the CRRC class.
// Validated with the isCCRegNumber predicate.
def PPCRegCRRCAsmOperand : AsmOperandClass {
  let Name = "RegCRRC";
  let PredicateMethod = "isCCRegNumber";
}
def crrc : RegisterOperand<CRRC> {
  let ParserMatchClass = PPCRegCRRCAsmOperand;
}
|
2015-03-26 03:36:23 +08:00
|
|
|
// RegisterOperand wrapper for the CRRC0 class; it reuses the CRRC asm-parser
// match class rather than defining its own.
def crrc0 : RegisterOperand<CRRC0> {
  let ParserMatchClass = PPCRegCRRCAsmOperand;
}
|
2013-05-04 03:49:39 +08:00
|
|
|
|
2015-03-05 04:44:33 +08:00
|
|
|
// 1-bit unsigned immediate operand and its asm-parser match class.
def PPCU1ImmAsmOperand : AsmOperandClass {
  let Name = "U1Imm";
  let PredicateMethod = "isU1Imm";
  let RenderMethod = "addImmOperands";
}
def u1imm : Operand<i32> {
  let PrintMethod = "printU1ImmOperand";
  let ParserMatchClass = PPCU1ImmAsmOperand;
  // Use the same templated disassembler hook as the wider unsigned
  // immediates (u5imm, u6imm, ...) so all uNimm operands decode uniformly.
  let DecoderMethod = "decodeUImmOperand<1>";
}
|
|
|
|
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the first 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
// 2-bit unsigned immediate operand and its asm-parser match class.
def PPCU2ImmAsmOperand : AsmOperandClass {
  let Name = "U2Imm";
  let PredicateMethod = "isU2Imm";
  let RenderMethod = "addImmOperands";
}
def u2imm : Operand<i32> {
  let PrintMethod = "printU2ImmOperand";
  let ParserMatchClass = PPCU2ImmAsmOperand;
  // Use the same templated disassembler hook as the wider unsigned
  // immediates (u5imm, u6imm, ...) so all uNimm operands decode uniformly.
  let DecoderMethod = "decodeUImmOperand<2>";
}
|
2014-07-30 06:21:57 +08:00
|
|
|
|
[PowerPC] Support asm parsing for bc[l][a][+-] mnemonics
PowerPC assembly code in the wild, so it seems, has things like this:
bc+ 12, 28, .L9
This is a bit odd because the '+' here becomes part of the BO field, and the BO
field is otherwise the first operand. Nevertheless, the ISA specification does
clearly say that the +- hint syntax applies to all conditional-branch mnemonics
(that test either CTR or a condition register, although not the forms which
check both), both basic and extended, so this is supposed to be valid.
This introduces some asm-parser-only definitions which take only the upper
three bits from the specified BO value, and the lower two bits are implied by
the +- suffix (via some associated aliases).
Fixes PR23646.
llvm-svn: 280571
2016-09-03 10:31:44 +08:00
|
|
|
// Operand carrying the AT bits, printed through printATBitsAsHint, together
// with its asm-parser match class.
def PPCATBitsAsHintAsmOperand : AsmOperandClass {
  let Name = "ATBitsAsHint";
  let PredicateMethod = "isATBitsAsHint";
  // Irrelevant, predicate always fails.
  let RenderMethod = "addImmOperands";
}
def atimm : Operand<i32> {
  let ParserMatchClass = PPCATBitsAsHintAsmOperand;
  let PrintMethod = "printATBitsAsHint";
}
|
|
|
|
|
2015-03-26 03:36:23 +08:00
|
|
|
// 3-bit unsigned immediate operand and its asm-parser match class.
def PPCU3ImmAsmOperand : AsmOperandClass {
  let Name = "U3Imm";
  let PredicateMethod = "isU3Imm";
  let RenderMethod = "addImmOperands";
}
def u3imm : Operand<i32> {
  let PrintMethod = "printU3ImmOperand";
  let ParserMatchClass = PPCU3ImmAsmOperand;
  // Use the same templated disassembler hook as the wider unsigned
  // immediates (u5imm, u6imm, ...) so all uNimm operands decode uniformly.
  let DecoderMethod = "decodeUImmOperand<3>";
}
|
|
|
|
|
2014-07-30 06:21:57 +08:00
|
|
|
// 4-bit unsigned immediate operand and its asm-parser match class.
def PPCU4ImmAsmOperand : AsmOperandClass {
  let Name = "U4Imm";
  let PredicateMethod = "isU4Imm";
  let RenderMethod = "addImmOperands";
}
def u4imm : Operand<i32> {
  let PrintMethod = "printU4ImmOperand";
  let ParserMatchClass = PPCU4ImmAsmOperand;
  // Use the same templated disassembler hook as the wider unsigned
  // immediates (u5imm, u6imm, ...) so all uNimm operands decode uniformly.
  let DecoderMethod = "decodeUImmOperand<4>";
}
|
2013-05-04 03:49:39 +08:00
|
|
|
// 5-bit signed immediate operand and its asm-parser match class.
def PPCS5ImmAsmOperand : AsmOperandClass {
  let Name = "S5Imm";
  let PredicateMethod = "isS5Imm";
  let RenderMethod = "addImmOperands";
}
def s5imm : Operand<i32> {
  let ParserMatchClass = PPCS5ImmAsmOperand;
  let PrintMethod = "printS5ImmOperand";
  let DecoderMethod = "decodeSImmOperand<5>";
}
|
|
|
|
// 5-bit unsigned immediate operand and its asm-parser match class.
def PPCU5ImmAsmOperand : AsmOperandClass {
  let Name = "U5Imm";
  let PredicateMethod = "isU5Imm";
  let RenderMethod = "addImmOperands";
}
def u5imm : Operand<i32> {
  let ParserMatchClass = PPCU5ImmAsmOperand;
  let PrintMethod = "printU5ImmOperand";
  let DecoderMethod = "decodeUImmOperand<5>";
}
|
|
|
|
// 6-bit unsigned immediate operand and its asm-parser match class.
def PPCU6ImmAsmOperand : AsmOperandClass {
  let Name = "U6Imm";
  let PredicateMethod = "isU6Imm";
  let RenderMethod = "addImmOperands";
}
def u6imm : Operand<i32> {
  let ParserMatchClass = PPCU6ImmAsmOperand;
  let PrintMethod = "printU6ImmOperand";
  let DecoderMethod = "decodeUImmOperand<6>";
}
|
[Power9] Implement new vsx instructions: insert, extract, test data class, min/max, reverse, permute, splat
This change implements the following vsx instructions:
- Scalar Insert/Extract
xsiexpdp xsiexpqp xsxexpdp xsxsigdp xsxexpqp xsxsigqp
- Vector Insert/Extract
xviexpdp xviexpsp xvxexpdp xvxexpsp xvxsigdp xvxsigsp
xxextractuw xxinsertw
- Scalar/Vector Test Data Class
xststdcdp xststdcsp xststdcqp
xvtstdcdp xvtstdcsp
- Maximum/Minimum
xsmaxcdp xsmaxjdp
xsmincdp xsminjdp
- Vector Byte-Reverse/Permute/Splat
xxbrd xxbrh xxbrq xxbrw
xxperm xxpermr
xxspltib
30 instructions
Thanks Nemanja for invaluable discussion! Thanks Kit's great help!
Reviewers: hal, nemanja, kbarton, tjablin, amehsan
http://reviews.llvm.org/D16842
llvm-svn: 264567
2016-03-28 16:34:28 +08:00
|
|
|
// 7-bit unsigned immediate operand and its asm-parser match class.
def PPCU7ImmAsmOperand : AsmOperandClass {
  let Name = "U7Imm";
  let PredicateMethod = "isU7Imm";
  let RenderMethod = "addImmOperands";
}
def u7imm : Operand<i32> {
  let ParserMatchClass = PPCU7ImmAsmOperand;
  let PrintMethod = "printU7ImmOperand";
  let DecoderMethod = "decodeUImmOperand<7>";
}
|
|
|
|
// 8-bit unsigned immediate operand and its asm-parser match class.
def PPCU8ImmAsmOperand : AsmOperandClass {
  let Name = "U8Imm";
  let PredicateMethod = "isU8Imm";
  let RenderMethod = "addImmOperands";
}
def u8imm : Operand<i32> {
  let ParserMatchClass = PPCU8ImmAsmOperand;
  let PrintMethod = "printU8ImmOperand";
  let DecoderMethod = "decodeUImmOperand<8>";
}
|
2015-05-23 00:44:10 +08:00
|
|
|
// 10-bit unsigned immediate operand and its asm-parser match class.
def PPCU10ImmAsmOperand : AsmOperandClass {
  let Name = "U10Imm";
  let PredicateMethod = "isU10Imm";
  let RenderMethod = "addImmOperands";
}
def u10imm : Operand<i32> {
  let ParserMatchClass = PPCU10ImmAsmOperand;
  let PrintMethod = "printU10ImmOperand";
  let DecoderMethod = "decodeUImmOperand<10>";
}
|
[PowerPC] Add support for the QPX vector instruction set
This adds support for the QPX vector instruction set, which is used by the
enhanced A2 cores on the IBM BG/Q supercomputers. QPX vectors are 256 bits
wide, holding 4 double-precision floating-point values. Boolean values, modeled
here as <4 x i1> are actually also represented as floating-point values
(essentially { -1, 1 } for { false, true }). QPX shares many features with
Altivec and VSX, but is distinct from both of them. One major difference is
that, instead of adding completely-separate vector registers, QPX vector
registers are extensions of the scalar floating-point registers (lane 0 is the
corresponding scalar floating-point value). The operations supported on QPX
vectors mirrors that supported on the scalar floating-point values (with some
additional ones for permutations and logical/comparison operations).
I've been maintaining this support out-of-tree, as part of the bgclang project,
for several years. This is not the entire bgclang patch set, but is most of the
subset that can be cleanly integrated into LLVM proper at this time. Adding
this to the LLVM backend is part of my efforts to rebase bgclang to the current
LLVM trunk, but is independently useful (especially for codes that use LLVM as
a JIT in library form).
The assembler/disassembler test coverage is complete. The CodeGen test coverage
is not, but I've included some tests, and more will be added as follow-up work.
llvm-svn: 230413
2015-02-25 09:06:45 +08:00
|
|
|
// 12-bit unsigned immediate operand and its asm-parser match class.
def PPCU12ImmAsmOperand : AsmOperandClass {
  let Name = "U12Imm";
  let PredicateMethod = "isU12Imm";
  let RenderMethod = "addImmOperands";
}
def u12imm : Operand<i32> {
  let ParserMatchClass = PPCU12ImmAsmOperand;
  let PrintMethod = "printU12ImmOperand";
  let DecoderMethod = "decodeUImmOperand<12>";
}
|
2013-05-04 03:49:39 +08:00
|
|
|
// 16-bit signed immediate operand and its asm-parser match class. The match
// class renders through addS16ImmOperands and the operand is encoded through
// getImm16Encoding.
def PPCS16ImmAsmOperand : AsmOperandClass {
  let Name = "S16Imm";
  let PredicateMethod = "isS16Imm";
  let RenderMethod = "addS16ImmOperands";
}
def s16imm : Operand<i32> {
  let ParserMatchClass = PPCS16ImmAsmOperand;
  let PrintMethod = "printS16ImmOperand";
  let EncoderMethod = "getImm16Encoding";
  let DecoderMethod = "decodeSImmOperand<16>";
}
|
|
|
|
// 16-bit unsigned immediate operand and its asm-parser match class. The match
// class renders through addU16ImmOperands and the operand is encoded through
// getImm16Encoding.
def PPCU16ImmAsmOperand : AsmOperandClass {
  let Name = "U16Imm";
  let PredicateMethod = "isU16Imm";
  let RenderMethod = "addU16ImmOperands";
}
def u16imm : Operand<i32> {
  let ParserMatchClass = PPCU16ImmAsmOperand;
  let PrintMethod = "printU16ImmOperand";
  let EncoderMethod = "getImm16Encoding";
  let DecoderMethod = "decodeUImmOperand<16>";
}
|
2013-06-26 21:49:53 +08:00
|
|
|
// Parser-side "17-bit" signed immediate: matched with isS17Imm, but rendered,
// printed, encoded and decoded with the 16-bit hooks.
def PPCS17ImmAsmOperand : AsmOperandClass {
  let Name = "S17Imm";
  let PredicateMethod = "isS17Imm";
  let RenderMethod = "addS16ImmOperands";
}
def s17imm : Operand<i32> {
  // This operand type is used for addis/lis to allow the assembler parser
  // to accept immediates in the range -65536..65535 for compatibility with
  // the GNU assembler. The operand is treated as 16-bit otherwise.
  let ParserMatchClass = PPCS17ImmAsmOperand;
  let PrintMethod = "printS16ImmOperand";
  let EncoderMethod = "getImm16Encoding";
  let DecoderMethod = "decodeSImmOperand<16>";
}
|
2016-10-25 01:31:09 +08:00
|
|
|
|
|
|
|
// fpimm0 - A floating-point immediate that is exactly +0.0.
def fpimm0 : PatLeaf<(fpimm), [{
  return N->isExactlyValue(+0.0);
}]>;
|
|
|
|
|
2013-06-24 19:03:33 +08:00
|
|
|
// Direct branch target operand and the asm-parser match class shared by the
// direct-branch and call target operands.
def PPCDirectBrAsmOperand : AsmOperandClass {
  let Name = "DirectBr";
  let PredicateMethod = "isDirectBr";
  let RenderMethod = "addBranchTargetOperands";
}
def directbrtarget : Operand<OtherVT> {
  let ParserMatchClass = PPCDirectBrAsmOperand;
  let PrintMethod = "printBranchOperand";
  let EncoderMethod = "getDirectBrEncoding";
}
|
|
|
|
// Absolute variant of directbrtarget: same parser match class, but printed
// and encoded with the "Abs" hooks.
def absdirectbrtarget : Operand<OtherVT> {
  let ParserMatchClass = PPCDirectBrAsmOperand;
  let PrintMethod = "printAbsBranchOperand";
  let EncoderMethod = "getAbsDirectBrEncoding";
}
|
|
|
|
// Conditional branch target operand and its asm-parser match class.
def PPCCondBrAsmOperand : AsmOperandClass {
  let Name = "CondBr";
  let PredicateMethod = "isCondBr";
  let RenderMethod = "addBranchTargetOperands";
}
def condbrtarget : Operand<OtherVT> {
  let ParserMatchClass = PPCCondBrAsmOperand;
  let PrintMethod = "printBranchOperand";
  let EncoderMethod = "getCondBrEncoding";
}
|
|
|
|
// Absolute variant of condbrtarget: same parser match class, but printed and
// encoded with the "Abs" hooks.
def abscondbrtarget : Operand<OtherVT> {
  let ParserMatchClass = PPCCondBrAsmOperand;
  let PrintMethod = "printAbsBranchOperand";
  let EncoderMethod = "getAbsCondBrEncoding";
}
|
2006-06-17 05:01:35 +08:00
|
|
|
// Call target operand (pointer-typed); it reuses the direct-branch parser
// class, printer and encoder.
def calltarget : Operand<iPTR> {
  let ParserMatchClass = PPCDirectBrAsmOperand;
  let PrintMethod = "printBranchOperand";
  let EncoderMethod = "getDirectBrEncoding";
}
|
2013-06-24 19:03:33 +08:00
|
|
|
// Absolute variant of calltarget: same parser match class, but printed and
// encoded with the "Abs" direct-branch hooks.
def abscalltarget : Operand<iPTR> {
  let ParserMatchClass = PPCDirectBrAsmOperand;
  let PrintMethod = "printAbsBranchOperand";
  let EncoderMethod = "getAbsDirectBrEncoding";
}
|
2013-05-04 03:49:39 +08:00
|
|
|
// CR bit-mask operand (i8) and its asm-parser match class. The operand has
// dedicated print, encode and decode hooks.
def PPCCRBitMaskOperand : AsmOperandClass {
  let Name = "CRBitMask";
  let PredicateMethod = "isCRBitMask";
}
def crbitm : Operand<i8> {
  let ParserMatchClass = PPCCRBitMaskOperand;
  let PrintMethod = "printcrbitm";
  let EncoderMethod = "get_crbitm_encoding";
  let DecoderMethod = "decodeCRBitMOperand";
}
|
2005-12-20 07:25:09 +08:00
|
|
|
// Address operands
|
Prepare to make r0 an allocatable register on PPC
Currently the PPC r0 register is unconditionally reserved. There are two reasons
for this:
1. r0 is treated specially (as the constant 0) by certain instructions, and so
cannot be used with those instructions as a regular register.
2. r0 is used as a temporary register in the CR-register spilling process
(where, under some circumstances, we require two GPRs).
This change addresses the first reason by introducing a restricted register
class (without r0) for use by those instructions that treat r0 specially. These
register classes have a new pseudo-register, ZERO, which represents the r0-as-0
use. This has the side benefit of making the existing target code simpler (and
easier to understand), and will make it clear to the register allocator that
uses of r0 as 0 don't conflict with real uses of the r0 register.
Once the CR spilling code is improved, we'll be able to allocate r0.
Adding these extra register classes, for some reason unclear to me, causes
requests to the target to copy 32-bit registers to 64-bit registers. The
resulting code seems correct (and causes no test-suite failures), and the new
test case covers this new kind of asymmetric copy.
As r0 is still reserved, no functionality change intended.
llvm-svn: 177423
2013-03-20 02:51:05 +08:00
|
|
|
// A version of ptr_rc which excludes R0 (or X0 in 64-bit mode).
|
2013-05-04 03:49:39 +08:00
|
|
|
// Pointer-like register operand using PointerLikeRegClass kind 1, together
// with its asm-parser match class.
def PPCRegGxRCNoR0Operand : AsmOperandClass {
  let Name = "RegGxRCNoR0";
  let PredicateMethod = "isRegNumber";
}
def ptr_rc_nor0 : Operand<iPTR>, PointerLikeRegClass<1> {
  let ParserMatchClass = PPCRegGxRCNoR0Operand;
}
|
|
|
|
// A version of ptr_rc usable with the asm parser.
|
|
|
|
// Pointer-like register operand using PointerLikeRegClass kind 0, together
// with its asm-parser match class.
def PPCRegGxRCOperand : AsmOperandClass {
  let Name = "RegGxRC";
  let PredicateMethod = "isRegNumber";
}
def ptr_rc_idx : Operand<iPTR>, PointerLikeRegClass<0> {
  let ParserMatchClass = PPCRegGxRCOperand;
}
|
Prepare to make r0 an allocatable register on PPC
Currently the PPC r0 register is unconditionally reserved. There are two reasons
for this:
1. r0 is treated specially (as the constant 0) by certain instructions, and so
cannot be used with those instructions as a regular register.
2. r0 is used as a temporary register in the CR-register spilling process
(where, under some circumstances, we require two GPRs).
This change addresses the first reason by introducing a restricted register
class (without r0) for use by those instructions that treat r0 specially. These
register classes have a new pseudo-register, ZERO, which represents the r0-as-0
use. This has the side benefit of making the existing target code simpler (and
easier to understand), and will make it clear to the register allocator that
uses of r0 as 0 don't conflict with real uses of the r0 register.
Once the CR spilling code is improved, we'll be able to allocate r0.
Adding these extra register classes, for some reason unclear to me, causes
requests to the target to copy 32-bit registers to 64-bit registers. The
resulting code seems correct (and causes no test-suite failures), and the new
test case covers this new kind of asymmetric copy.
As r0 is still reserved, no functionality change intended.
llvm-svn: 177423
2013-03-20 02:51:05 +08:00
|
|
|
|
2013-05-04 03:49:39 +08:00
|
|
|
// Displacement operand accepted by the isS16Imm predicate and rendered with
// addS16ImmOperands, together with its asm-parser match class.
def PPCDispRIOperand : AsmOperandClass {
  let Name = "DispRI";
  let PredicateMethod = "isS16Imm";
  let RenderMethod = "addS16ImmOperands";
}
def dispRI : Operand<iPTR> {
  let ParserMatchClass = PPCDispRIOperand;
}
|
|
|
|
// Displacement operand accepted by the isS16ImmX4 predicate, together with
// its asm-parser match class.
def PPCDispRIXOperand : AsmOperandClass {
  let Name = "DispRIX";
  let PredicateMethod = "isS16ImmX4";
  let RenderMethod = "addImmOperands";
}
def dispRIX : Operand<iPTR> {
  let ParserMatchClass = PPCDispRIXOperand;
}
|
2016-03-08 11:49:13 +08:00
|
|
|
// Displacement operand accepted by the isS16ImmX16 predicate, together with
// its asm-parser match class.
def PPCDispRIX16Operand : AsmOperandClass {
  let Name = "DispRIX16";
  let PredicateMethod = "isS16ImmX16";
  let RenderMethod = "addImmOperands";
}
def dispRIX16 : Operand<iPTR> {
  let ParserMatchClass = PPCDispRIX16Operand;
}
|
2014-08-09 00:43:49 +08:00
|
|
|
// SPE displacement operand accepted by the isU8ImmX8 predicate, together with
// its asm-parser match class.
def PPCDispSPE8Operand : AsmOperandClass {
  let Name = "DispSPE8";
  let PredicateMethod = "isU8ImmX8";
  let RenderMethod = "addImmOperands";
}
def dispSPE8 : Operand<iPTR> {
  let ParserMatchClass = PPCDispSPE8Operand;
}
|
|
|
|
// SPE displacement operand accepted by the isU7ImmX4 predicate, together with
// its asm-parser match class.
def PPCDispSPE4Operand : AsmOperandClass {
  let Name = "DispSPE4";
  let PredicateMethod = "isU7ImmX4";
  let RenderMethod = "addImmOperands";
}
def dispSPE4 : Operand<iPTR> {
  let ParserMatchClass = PPCDispSPE4Operand;
}
|
|
|
|
// SPE displacement operand accepted by the isU6ImmX2 predicate, together with
// its asm-parser match class.
def PPCDispSPE2Operand : AsmOperandClass {
  let Name = "DispSPE2";
  let PredicateMethod = "isU6ImmX2";
  let RenderMethod = "addImmOperands";
}
def dispSPE2 : Operand<iPTR> {
  let ParserMatchClass = PPCDispSPE2Operand;
}
|
2013-03-26 18:55:45 +08:00
|
|
|
|
2006-06-17 05:01:35 +08:00
|
|
|
// memri - Register+immediate memory operand. It is made of two sub-operands:
// a dispRI displacement ($imm) and a ptr_rc_nor0 base register ($reg), and it
// has dedicated print, encode and decode hooks.
def memri : Operand<iPTR> {
|
2005-12-20 07:25:09 +08:00
|
|
|
let PrintMethod = "printMemRegImm";
|
2013-03-26 18:55:45 +08:00
|
|
|
let MIOperandInfo = (ops dispRI:$imm, ptr_rc_nor0:$reg);
|
split out an encoder for memri operands, allowing a relocation to be plopped
into the immediate field. This allows us to encode stuff like this:
lbz r3, lo16(__ZL4init)(r4) ; globalopt.cpp:5
; encoding: [0x88,0x64,A,A]
; fixup A - offset: 0, value: lo16(__ZL4init), kind: fixup_ppc_lo16
stw r3, lo16(__ZL1s)(r5) ; globalopt.cpp:6
; encoding: [0x90,0x65,A,A]
; fixup A - offset: 0, value: lo16(__ZL1s), kind: fixup_ppc_lo16
With this, we should have a completely function MCCodeEmitter for PPC, wewt.
llvm-svn: 119134
2010-11-15 16:22:03 +08:00
|
|
|
let EncoderMethod = "getMemRIEncoding";
|
2013-12-20 00:13:01 +08:00
|
|
|
let DecoderMethod = "decodeMemRIOperands";
|
2005-12-20 07:25:09 +08:00
|
|
|
}
|
2006-06-17 05:01:35 +08:00
|
|
|
// memrr - Register+register memory operand: a ptr_rc_nor0 register ($ptrreg)
// followed by a ptr_rc_idx register ($offreg).
def memrr : Operand<iPTR> {
  let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc_idx:$offreg);
  let PrintMethod = "printMemRegReg";
}
|
2013-05-17 01:58:02 +08:00
|
|
|
// memrix - Register+immediate memory operand made of a dispRIX displacement
// ($imm) and a ptr_rc_nor0 base register ($reg). It shares memri's printer
// but has its own encoder and decoder hooks.
def memrix : Operand<iPTR> { // memri where the imm is 4-aligned.
|
|
|
|
let PrintMethod = "printMemRegImm";
|
2013-03-26 18:55:45 +08:00
|
|
|
let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg);
|
2010-11-15 16:02:41 +08:00
|
|
|
let EncoderMethod = "getMemRIXEncoding";
|
2013-12-20 00:13:01 +08:00
|
|
|
let DecoderMethod = "decodeMemRIXOperands";
|
When possible, custom lower 32-bit SINT_TO_FP to this:
_foo2:
extsw r2, r3
std r2, -8(r1)
lfd f0, -8(r1)
fcfid f0, f0
frsp f1, f0
blr
instead of this:
_foo2:
lis r2, ha16(LCPI2_0)
lis r4, 17200
xoris r3, r3, 32768
stw r3, -4(r1)
stw r4, -8(r1)
lfs f0, lo16(LCPI2_0)(r2)
lfd f1, -8(r1)
fsub f0, f1, f0
frsp f1, f0
blr
This speeds up Misc/pi from 2.44s->2.09s with LLC and from 3.01->2.18s
with llcbeta (16.7% and 38.1% respectively).
llvm-svn: 26943
2006-03-22 13:30:33 +08:00
|
|
|
}
|
2016-03-08 11:49:13 +08:00
|
|
|
// memrix16 - iPTR operand made of an immediate sub-operand $imm (operand type
// dispRIX16) and a base-register sub-operand $reg (class ptr_rc_nor0).
// It shares the "printMemRegImm" printer with the other reg+imm operands but
// has its own encoder ("getMemRIX16Encoding") and decoder
// ("decodeMemRIX16Operands").
def memrix16 : Operand<iPTR> { // memri, imm is 16-aligned, 12-bit, Inst{16:27}
|
|
|
|
let PrintMethod = "printMemRegImm";
|
|
|
|
let MIOperandInfo = (ops dispRIX16:$imm, ptr_rc_nor0:$reg);
|
|
|
|
let EncoderMethod = "getMemRIX16Encoding";
|
|
|
|
let DecoderMethod = "decodeMemRIX16Operands";
|
|
|
|
}
|
2014-08-09 00:43:49 +08:00
|
|
|
def spe8dis : Operand<iPTR> { // SPE displacement where the imm is 8-aligned.
|
|
|
|
let PrintMethod = "printMemRegImm";
|
|
|
|
let MIOperandInfo = (ops dispSPE8:$imm, ptr_rc_nor0:$reg);
|
|
|
|
let EncoderMethod = "getSPE8DisEncoding";
|
|
|
|
}
|
|
|
|
def spe4dis : Operand<iPTR> { // SPE displacement where the imm is 4-aligned.
|
|
|
|
let PrintMethod = "printMemRegImm";
|
|
|
|
let MIOperandInfo = (ops dispSPE4:$imm, ptr_rc_nor0:$reg);
|
|
|
|
let EncoderMethod = "getSPE4DisEncoding";
|
|
|
|
}
|
|
|
|
def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned.
|
|
|
|
let PrintMethod = "printMemRegImm";
|
|
|
|
let MIOperandInfo = (ops dispSPE2:$imm, ptr_rc_nor0:$reg);
|
|
|
|
let EncoderMethod = "getSPE2DisEncoding";
|
|
|
|
}
|
2005-12-20 07:25:09 +08:00
|
|
|
|
2013-03-22 05:37:52 +08:00
|
|
|
// A single-register address. This is used with the SjLj
|
|
|
|
// pseudo-instructions.
|
|
|
|
// memr - iPTR operand wrapping exactly one sub-operand, $ptrreg, of register
// class ptr_rc. No print, encoder or decoder method is overridden here.
def memr : Operand<iPTR> {
|
|
|
|
let MIOperandInfo = (ops ptr_rc:$ptrreg);
|
|
|
|
}
|
2013-12-21 02:08:54 +08:00
|
|
|
def PPCTLSRegOperand : AsmOperandClass {
|
|
|
|
let Name = "TLSReg"; let PredicateMethod = "isTLSReg";
|
|
|
|
let RenderMethod = "addTLSRegOperands";
|
|
|
|
}
|
|
|
|
def tlsreg32 : Operand<i32> {
|
|
|
|
let EncoderMethod = "getTLSRegEncoding";
|
|
|
|
let ParserMatchClass = PPCTLSRegOperand;
|
|
|
|
}
|
2014-07-26 01:47:22 +08:00
|
|
|
// tlsgd32 - plain i32 operand with no overrides; it is used as the $sym
// sub-operand of tlscall32.
def tlsgd32 : Operand<i32> {}
|
|
|
|
// tlscall32 - i32 operand bundling two sub-operands: $func (operand type
// calltarget) and $sym (operand type tlsgd32). Printing is delegated to
// "printTLSCall" and encoding to "getTLSCallEncoding".
// NOTE(review): presumably the call target plus the TLS symbol it is
// associated with -- confirm against the users of this operand.
def tlscall32 : Operand<i32> {
|
|
|
|
let PrintMethod = "printTLSCall";
|
|
|
|
let MIOperandInfo = (ops calltarget:$func, tlsgd32:$sym);
|
|
|
|
let EncoderMethod = "getTLSCallEncoding";
|
|
|
|
}
|
2013-03-22 05:37:52 +08:00
|
|
|
|
2013-03-26 18:53:27 +08:00
|
|
|
// PowerPC Predicate operand.
|
|
|
|
// pred - OtherVT operand made of two sub-operands: an i32 immediate $bibo and
// a register $reg of class crrc. It is printed by "printPredicateOperand".
// NOTE(review): $bibo presumably carries the branch BI/BO condition
// encoding -- confirm against the printer implementation.
def pred : Operand<OtherVT> {
|
2006-11-04 13:27:39 +08:00
|
|
|
let PrintMethod = "printPredicateOperand";
|
2013-04-27 00:53:15 +08:00
|
|
|
let MIOperandInfo = (ops i32imm:$bibo, crrc:$reg);
|
2006-11-04 13:27:39 +08:00
|
|
|
}
|
2006-11-04 07:53:25 +08:00
|
|
|
|
2006-01-12 10:05:36 +08:00
|
|
|
// Define PowerPC specific addressing mode.
|
2006-10-12 05:03:53 +08:00
|
|
|
def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>;
|
|
|
|
def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>;
|
|
|
|
def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>;
|
2013-05-17 01:58:02 +08:00
|
|
|
def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
|
2004-08-15 13:20:16 +08:00
|
|
|
|
2013-03-22 05:37:52 +08:00
|
|
|
// The address in a single register. This is used with the SjLj
|
|
|
|
// pseudo-instructions.
|
|
|
|
def addr : ComplexPattern<iPTR, 1, "SelectAddr",[], []>;
|
|
|
|
|
2006-11-16 08:41:37 +08:00
|
|
|
/// This is just the offset part of iaddr, used for preinc.
|
|
|
|
def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>;
|
2006-11-15 10:43:19 +08:00
|
|
|
|
2005-12-15 06:07:12 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// PowerPC Instruction Predicate Definitions.
|
2014-05-22 09:07:24 +08:00
|
|
|
def In32BitMode : Predicate<"!PPCSubTarget->isPPC64()">;
|
|
|
|
def In64BitMode : Predicate<"PPCSubTarget->isPPC64()">;
|
|
|
|
def IsBookE : Predicate<"PPCSubTarget->isBookE()">;
|
|
|
|
def IsNotBookE : Predicate<"!PPCSubTarget->isBookE()">;
|
2014-10-03 06:34:22 +08:00
|
|
|
def HasOnlyMSYNC : Predicate<"PPCSubTarget->hasOnlyMSYNC()">;
|
|
|
|
def HasSYNC : Predicate<"!PPCSubTarget->hasOnlyMSYNC()">;
|
2014-08-04 23:47:38 +08:00
|
|
|
def IsPPC4xx : Predicate<"PPCSubTarget->isPPC4xx()">;
|
2014-08-05 01:07:41 +08:00
|
|
|
def IsPPC6xx : Predicate<"PPCSubTarget->isPPC6xx()">;
|
2014-08-04 23:47:38 +08:00
|
|
|
def IsE500 : Predicate<"PPCSubTarget->isE500()">;
|
2014-08-07 20:18:21 +08:00
|
|
|
// HasSPE - true when the selected subtarget reports SPE support.
// The predicate string is pasted verbatim into generated C++, so it must call
// the accessor (lower-camel hasSPE(), like hasICBT()/hasBPERMD() below), not
// the upper-case spelling, which would not compile once a pattern guarded by
// this predicate is emitted.
def HasSPE : Predicate<"PPCSubTarget->hasSPE()">;
|
2015-01-15 04:17:10 +08:00
|
|
|
def HasICBT : Predicate<"PPCSubTarget->hasICBT()">;
|
2015-03-11 04:51:07 +08:00
|
|
|
def HasPartwordAtomics : Predicate<"PPCSubTarget->hasPartwordAtomics()">;
|
[PowerPC] Add support for the QPX vector instruction set
This adds support for the QPX vector instruction set, which is used by the
enhanced A2 cores on the IBM BG/Q supercomputers. QPX vectors are 256 bits
wide, holding 4 double-precision floating-point values. Boolean values, modeled
here as <4 x i1> are actually also represented as floating-point values
(essentially { -1, 1 } for { false, true }). QPX shares many features with
Altivec and VSX, but is distinct from both of them. One major difference is
that, instead of adding completely-separate vector registers, QPX vector
registers are extensions of the scalar floating-point registers (lane 0 is the
corresponding scalar floating-point value). The operations supported on QPX
vectors mirror those supported on the scalar floating-point values (with some
additional ones for permutations and logical/comparison operations).
I've been maintaining this support out-of-tree, as part of the bgclang project,
for several years. This is not the entire bgclang patch set, but is most of the
subset that can be cleanly integrated into LLVM proper at this time. Adding
this to the LLVM backend is part of my efforts to rebase bgclang to the current
LLVM trunk, but is independently useful (especially for codes that use LLVM as
a JIT in library form).
The assembler/disassembler test coverage is complete. The CodeGen test coverage
is not, but I've included some tests, and more will be added as follow-up work.
llvm-svn: 230413
2015-02-25 09:06:45 +08:00
|
|
|
def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
|
|
|
|
def NaNsFPMath : Predicate<"!TM.Options.NoNaNsFPMath">;
|
2015-04-10 07:54:37 +08:00
|
|
|
def HasBPERMD : Predicate<"PPCSubTarget->hasBPERMD()">;
|
|
|
|
def HasExtDiv : Predicate<"PPCSubTarget->hasExtDiv()">;
|
2016-03-31 23:26:37 +08:00
|
|
|
def IsISA3_0 : Predicate<"PPCSubTarget->isISA3_0()">;
|
[PowerPC] Add support for the QPX vector instruction set
This adds support for the QPX vector instruction set, which is used by the
enhanced A2 cores on the IBM BG/Q supercomputers. QPX vectors are 256 bits
wide, holding 4 double-precision floating-point values. Boolean values, modeled
here as <4 x i1> are actually also represented as floating-point values
(essentially { -1, 1 } for { false, true }). QPX shares many features with
Altivec and VSX, but is distinct from both of them. One major difference is
that, instead of adding completely-separate vector registers, QPX vector
registers are extensions of the scalar floating-point registers (lane 0 is the
corresponding scalar floating-point value). The operations supported on QPX
vectors mirror those supported on the scalar floating-point values (with some
additional ones for permutations and logical/comparison operations).
I've been maintaining this support out-of-tree, as part of the bgclang project,
for several years. This is not the entire bgclang patch set, but is most of the
subset that can be cleanly integrated into LLVM proper at this time. Adding
this to the LLVM backend is part of my efforts to rebase bgclang to the current
LLVM trunk, but is independently useful (especially for codes that use LLVM as
a JIT in library form).
The assembler/disassembler test coverage is complete. The CodeGen test coverage
is not, but I've included some tests, and more will be added as follow-up work.
llvm-svn: 230413
2015-02-25 09:06:45 +08:00
|
|
|
|
2013-04-12 10:18:09 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// PowerPC Multiclass Definitions.
|
|
|
|
|
|
|
|
multiclass XForm_6r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
def NAME : XForm_6<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CR0] in
|
|
|
|
def o : XForm_6<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
|
|
|
[]>, isDOT, RecFormRel;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
multiclass XForm_6rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
let Defs = [CARRY] in
|
|
|
|
def NAME : XForm_6<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
|
|
|
let Defs = [CARRY, CR0] in
|
2013-04-12 10:18:09 +08:00
|
|
|
def o : XForm_6<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
|
|
|
[]>, isDOT, RecFormRel;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-04-13 02:17:57 +08:00
|
|
|
multiclass XForm_10rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
let Defs = [CARRY] in
|
|
|
|
def NAME : XForm_10<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
|
|
|
let Defs = [CARRY, CR0] in
|
2013-04-12 10:18:09 +08:00
|
|
|
def o : XForm_10<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
|
|
|
[]>, isDOT, RecFormRel;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
multiclass XForm_11r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
def NAME : XForm_11<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CR0] in
|
2013-04-12 10:18:09 +08:00
|
|
|
def o : XForm_11<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
|
|
|
[]>, isDOT, RecFormRel;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
multiclass XOForm_1r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
def NAME : XOForm_1<opcode, xo, oe, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CR0] in
|
|
|
|
def o : XOForm_1<opcode, xo, oe, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
|
|
|
[]>, isDOT, RecFormRel;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-04-10 07:54:37 +08:00
|
|
|
// Multiclass for instructions for which the non record form is not cracked
|
|
|
|
// and the record form is cracked (e.g. divw, mullw, etc.)
|
|
|
|
multiclass XOForm_1rcr<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
def NAME : XOForm_1<opcode, xo, oe, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
|
|
|
let Defs = [CR0] in
|
|
|
|
def o : XOForm_1<opcode, xo, oe, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
|
|
|
[]>, isDOT, RecFormRel, PPC970_DGroup_First,
|
|
|
|
PPC970_DGroup_Cracked;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-04-13 02:17:57 +08:00
|
|
|
multiclass XOForm_1rc<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
let Defs = [CARRY] in
|
|
|
|
def NAME : XOForm_1<opcode, xo, oe, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
|
|
|
let Defs = [CARRY, CR0] in
|
2013-04-12 10:18:09 +08:00
|
|
|
def o : XOForm_1<opcode, xo, oe, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
|
|
|
[]>, isDOT, RecFormRel;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
multiclass XOForm_3r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
def NAME : XOForm_3<opcode, xo, oe, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CR0] in
|
|
|
|
def o : XOForm_3<opcode, xo, oe, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
|
|
|
[]>, isDOT, RecFormRel;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
multiclass XOForm_3rc<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
let Defs = [CARRY] in
|
|
|
|
def NAME : XOForm_3<opcode, xo, oe, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
|
|
|
let Defs = [CARRY, CR0] in
|
2013-04-12 10:18:09 +08:00
|
|
|
def o : XOForm_3<opcode, xo, oe, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
|
|
|
[]>, isDOT, RecFormRel;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
multiclass MForm_2r<bits<6> opcode, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
def NAME : MForm_2<opcode, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CR0] in
|
2013-04-12 10:18:09 +08:00
|
|
|
def o : MForm_2<opcode, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
|
|
|
[]>, isDOT, RecFormRel;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
multiclass MDForm_1r<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
def NAME : MDForm_1<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CR0] in
|
2013-04-12 10:18:09 +08:00
|
|
|
def o : MDForm_1<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
|
|
|
[]>, isDOT, RecFormRel;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-04-26 23:39:12 +08:00
|
|
|
multiclass MDSForm_1r<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
def NAME : MDSForm_1<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
|
|
|
let Defs = [CR0] in
|
|
|
|
def o : MDSForm_1<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
|
|
|
[]>, isDOT, RecFormRel;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-04-13 02:17:57 +08:00
|
|
|
multiclass XSForm_1rc<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
2013-04-12 10:18:09 +08:00
|
|
|
let BaseName = asmbase in {
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CARRY] in
|
2013-04-12 10:18:09 +08:00
|
|
|
def NAME : XSForm_1<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CARRY, CR0] in
|
2013-04-12 10:18:09 +08:00
|
|
|
def o : XSForm_1<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
|
|
|
[]>, isDOT, RecFormRel;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
multiclass XForm_26r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
def NAME : XForm_26<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CR1] in
|
2013-04-12 10:18:09 +08:00
|
|
|
def o : XForm_26<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
2013-04-13 02:17:57 +08:00
|
|
|
[]>, isDOT, RecFormRel;
|
2013-04-12 10:18:09 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-08-19 13:01:02 +08:00
|
|
|
multiclass XForm_28r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
def NAME : XForm_28<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
|
|
|
let Defs = [CR1] in
|
|
|
|
def o : XForm_28<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
|
|
|
[]>, isDOT, RecFormRel;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-04-12 10:18:09 +08:00
|
|
|
multiclass AForm_1r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
def NAME : AForm_1<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CR1] in
|
2013-04-12 10:18:09 +08:00
|
|
|
def o : AForm_1<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
2013-04-13 02:17:57 +08:00
|
|
|
[]>, isDOT, RecFormRel;
|
2013-04-12 10:18:09 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
multiclass AForm_2r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
def NAME : AForm_2<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CR1] in
|
2013-04-12 10:18:09 +08:00
|
|
|
def o : AForm_2<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
2013-04-13 02:17:57 +08:00
|
|
|
[]>, isDOT, RecFormRel;
|
2013-04-12 10:18:09 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
multiclass AForm_3r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
|
|
|
|
string asmbase, string asmstr, InstrItinClass itin,
|
|
|
|
list<dag> pattern> {
|
|
|
|
let BaseName = asmbase in {
|
|
|
|
def NAME : AForm_3<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
|
|
|
|
pattern>, RecFormRel;
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CR1] in
|
2013-04-12 10:18:09 +08:00
|
|
|
def o : AForm_3<opcode, xo, OOL, IOL,
|
|
|
|
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
|
2013-04-13 02:17:57 +08:00
|
|
|
[]>, isDOT, RecFormRel;
|
2013-04-12 10:18:09 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-09-09 03:50:41 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// PowerPC Instruction Definitions.
|
|
|
|
|
2004-06-22 00:55:25 +08:00
|
|
|
// Pseudo-instructions:
|
2005-09-09 03:50:41 +08:00
|
|
|
|
2006-03-12 17:13:49 +08:00
|
|
|
let hasCtrlDep = 1 in {
|
2007-09-12 03:55:27 +08:00
|
|
|
let Defs = [R1], Uses = [R1] in {
|
2012-10-05 02:14:28 +08:00
|
|
|
def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt",
|
2008-10-12 06:08:30 +08:00
|
|
|
[(callseq_start timm:$amt)]>;
|
2012-10-05 02:14:28 +08:00
|
|
|
def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2",
|
2008-10-12 06:08:30 +08:00
|
|
|
[(callseq_end timm:$amt1, timm:$amt2)]>;
|
2007-09-12 03:55:27 +08:00
|
|
|
}
|
For functions that use vector registers, save VRSAVE, mark used
registers, and update it on entry to each function, then restore it on exit.
This compiles:
void func(vfloat *a, vfloat *b, vfloat *c) {
*a = *b * *c + *c;
}
to this:
_func:
mfspr r2, 256
oris r6, r2, 49152
mtspr 256, r6
lvx v0, 0, r5
lvx v1, 0, r4
vmaddfp v0, v1, v0, v0
stvx v0, 0, r3
mtspr 256, r2
blr
GCC produces this (which has additional stack accesses):
_func:
mfspr r0,256
stw r0,-4(r1)
oris r0,r0,0xc000
mtspr 256,r0
lvx v0,0,r5
lvx v1,0,r4
lwz r12,-4(r1)
vmaddfp v0,v0,v1,v0
stvx v0,0,r3
mtspr 256,r12
blr
llvm-svn: 26733
2006-03-14 05:52:10 +08:00
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
def UPDATE_VRSAVE : Pseudo<(outs gprc:$rD), (ins gprc:$rS),
|
For functions that use vector registers, save VRSAVE, mark used
registers, and update it on entry to each function, then restore it on exit.
This compiles:
void func(vfloat *a, vfloat *b, vfloat *c) {
*a = *b * *c + *c;
}
to this:
_func:
mfspr r2, 256
oris r6, r2, 49152
mtspr 256, r6
lvx v0, 0, r5
lvx v1, 0, r4
vmaddfp v0, v1, v0, v0
stvx v0, 0, r3
mtspr 256, r2
blr
GCC produces this (which has additional stack accesses):
_func:
mfspr r0,256
stw r0,-4(r1)
oris r0,r0,0xc000
mtspr 256,r0
lvx v0,0,r5
lvx v1,0,r4
lwz r12,-4(r1)
vmaddfp v0,v0,v1,v0
stvx v0,0,r3
mtspr 256,r12
blr
llvm-svn: 26733
2006-03-14 05:52:10 +08:00
|
|
|
"UPDATE_VRSAVE $rD, $rS", []>;
|
2004-10-08 06:30:03 +08:00
|
|
|
}
|
2006-11-17 06:43:37 +08:00
|
|
|
|
2007-09-12 03:55:27 +08:00
|
|
|
let Defs = [R1], Uses = [R1] in
|
2013-04-27 00:53:15 +08:00
|
|
|
def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$result,
|
|
|
|
(PPCdynalloc i32:$negsize, iaddr:$fpsi))]>;
|
2015-12-01 19:40:55 +08:00
|
|
|
def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET",
|
|
|
|
[(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>;
|
2006-11-17 06:43:37 +08:00
|
|
|
|
2009-10-30 02:10:34 +08:00
|
|
|
// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
|
|
|
|
// instruction selection into a branch sequence.
|
|
|
|
let usesCustomInserter = 1, // Expanded after instruction selection.
|
2006-03-12 17:13:49 +08:00
|
|
|
PPC970_Single = 1 in {
|
2013-03-27 13:57:58 +08:00
|
|
|
// Note that SELECT_CC_I4 and SELECT_CC_I8 use the no-r0 register classes
|
|
|
|
// because either operand might become the first operand in an isel, and
|
|
|
|
// that operand cannot be r0.
|
2013-04-27 00:53:15 +08:00
|
|
|
def SELECT_CC_I4 : Pseudo<(outs gprc:$dst), (ins crrc:$cond,
|
|
|
|
gprc_nor0:$T, gprc_nor0:$F,
|
2012-10-05 02:14:28 +08:00
|
|
|
i32imm:$BROPC), "#SELECT_CC_I4",
|
2006-09-27 10:55:21 +08:00
|
|
|
[]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
def SELECT_CC_I8 : Pseudo<(outs g8rc:$dst), (ins crrc:$cond,
|
|
|
|
g8rc_nox0:$T, g8rc_nox0:$F,
|
2012-10-05 02:14:28 +08:00
|
|
|
i32imm:$BROPC), "#SELECT_CC_I8",
|
2006-09-27 10:55:21 +08:00
|
|
|
[]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
def SELECT_CC_F4 : Pseudo<(outs f4rc:$dst), (ins crrc:$cond, f4rc:$T, f4rc:$F,
|
2012-10-05 02:14:28 +08:00
|
|
|
i32imm:$BROPC), "#SELECT_CC_F4",
|
2006-09-27 10:55:21 +08:00
|
|
|
[]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
def SELECT_CC_F8 : Pseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F,
|
2012-10-05 02:14:28 +08:00
|
|
|
i32imm:$BROPC), "#SELECT_CC_F8",
|
2006-09-27 10:55:21 +08:00
|
|
|
[]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
def SELECT_CC_VRRC: Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F,
|
2012-10-05 02:14:28 +08:00
|
|
|
i32imm:$BROPC), "#SELECT_CC_VRRC",
|
2006-09-27 10:55:21 +08:00
|
|
|
[]>;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
|
|
|
|
// SELECT_* pseudo instructions, like SELECT_CC_* but taking condition
|
|
|
|
// register bit directly.
|
|
|
|
def SELECT_I4 : Pseudo<(outs gprc:$dst), (ins crbitrc:$cond,
|
|
|
|
gprc_nor0:$T, gprc_nor0:$F), "#SELECT_I4",
|
|
|
|
[(set i32:$dst, (select i1:$cond, i32:$T, i32:$F))]>;
|
|
|
|
def SELECT_I8 : Pseudo<(outs g8rc:$dst), (ins crbitrc:$cond,
|
|
|
|
g8rc_nox0:$T, g8rc_nox0:$F), "#SELECT_I8",
|
|
|
|
[(set i64:$dst, (select i1:$cond, i64:$T, i64:$F))]>;
|
|
|
|
def SELECT_F4 : Pseudo<(outs f4rc:$dst), (ins crbitrc:$cond,
|
|
|
|
f4rc:$T, f4rc:$F), "#SELECT_F4",
|
|
|
|
[(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>;
|
|
|
|
def SELECT_F8 : Pseudo<(outs f8rc:$dst), (ins crbitrc:$cond,
|
|
|
|
f8rc:$T, f8rc:$F), "#SELECT_F8",
|
|
|
|
[(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>;
|
|
|
|
def SELECT_VRRC: Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond,
|
|
|
|
vrrc:$T, vrrc:$F), "#SELECT_VRRC",
|
|
|
|
[(set v4i32:$dst,
|
|
|
|
(select i1:$cond, v4i32:$T, v4i32:$F))]>;
|
2005-08-27 05:23:58 +08:00
|
|
|
}
|
|
|
|
|
2008-03-04 06:19:16 +08:00
|
|
|
// SPILL_CR - Indicate that we're dumping the CR register, so we'll need to
|
|
|
|
// scavenge a register for it.
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
let mayStore = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
// SPILL_CR - Pseudo with no results and an empty pattern list; its inputs
// are a crrc operand ($cond) and a memri operand ($F). Declared under the
// enclosing `let mayStore = 1`.
def SPILL_CR : Pseudo<(outs), (ins crrc:$cond, memri:$F),
|
2012-10-05 02:14:28 +08:00
|
|
|
"#SPILL_CR", []>;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// SPILL_CRBIT - Same shape as SPILL_CR, but $cond is a crbitrc operand
// rather than a crrc operand. Also under the enclosing `let mayStore = 1`.
def SPILL_CRBIT : Pseudo<(outs), (ins crbitrc:$cond, memri:$F),
|
|
|
|
"#SPILL_CRBIT", []>;
|
|
|
|
}
|
2008-03-04 06:19:16 +08:00
|
|
|
|
2011-12-07 04:55:36 +08:00
|
|
|
// RESTORE_CR - Indicate that we're restoring the CR register (previously
|
|
|
|
// spilled), so we'll need to scavenge a register for it.
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
let mayLoad = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
// Pseudo with a crrc result ($cond), a memri input ($F) and an empty
// pattern list. Declared under the enclosing `let mayLoad = 1`.
def RESTORE_CR : Pseudo<(outs crrc:$cond), (ins memri:$F),
|
2012-10-05 02:14:28 +08:00
|
|
|
"#RESTORE_CR", []>;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// RESTORE_CRBIT - Same shape as RESTORE_CR, but the result is a crbitrc
// operand rather than a crrc operand. Also under `let mayLoad = 1`.
def RESTORE_CRBIT : Pseudo<(outs crbitrc:$cond), (ins memri:$F),
|
|
|
|
"#RESTORE_CRBIT", []>;
|
|
|
|
}
|
2011-12-07 04:55:36 +08:00
|
|
|
|
2007-07-21 08:34:19 +08:00
|
|
|
let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
|
2013-03-26 18:53:27 +08:00
|
|
|
// BLR - `blr`, selected for the `retflag` node. Flagged isReturn, with
// implicit uses of LR and RM, and guarded by the In32BitMode predicate.
// The enclosing let additionally sets isTerminator, isBarrier and
// PPC970_Unit = 7.
let isReturn = 1, Uses = [LR, RM] in
|
2013-11-28 07:26:09 +08:00
|
|
|
def BLR : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB,
|
2015-01-14 01:47:54 +08:00
|
|
|
[(retflag)]>, Requires<[In32BitMode]>;
|
PPC: Prep for if conversion of bctr[l]
This adds in-principle support for if-converting the bctr[l] instructions.
These instructions are used for indirect branching. It seems, however, that the
current if converter will never actually predicate these. To do so, it would
need the ability to hoist a few setup insts. out of the conditionally-executed
block. For example, code like this:
void foo(int a, int (*bar)()) { if (a != 0) bar(); }
becomes:
...
beq 0, .LBB0_2
std 2, 40(1)
mr 12, 4
ld 3, 0(4)
ld 11, 16(4)
ld 2, 8(4)
mtctr 3
bctrl
ld 2, 40(1)
.LBB0_2:
...
and it would be safe to do all of this unconditionally with a predicated
beqctrl instruction.
llvm-svn: 179156
2013-04-10 14:42:34 +08:00
|
|
|
let isBranch = 1, isIndirectBranch = 1, Uses = [CTR] in {
|
2013-11-28 07:26:09 +08:00
|
|
|
// BCTR - `bctr`: no explicit operands and no selection pattern. The
// enclosing let marks it isBranch / isIndirectBranch with Uses = [CTR].
def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
|
|
|
|
[]>;
|
PPC: Prep for if conversion of bctr[l]
This adds in-principle support for if-converting the bctr[l] instructions.
These instructions are used for indirect branching. It seems, however, that the
current if converter will never actually predicate these. To do so, it would
need the ability to hoist a few setup insts. out of the conditionally-executed
block. For example, code like this:
void foo(int a, int (*bar)()) { if (a != 0) bar(); }
becomes:
...
beq 0, .LBB0_2
std 2, 40(1)
mr 12, 4
ld 3, 0(4)
ld 11, 16(4)
ld 2, 8(4)
mtctr 3
bctrl
ld 2, 40(1)
.LBB0_2:
...
and it would be safe to do all of this unconditionally with a predicated
beqctrl instruction.
llvm-svn: 179156
2013-04-10 14:42:34 +08:00
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
let isCodeGenOnly = 1 in {
|
|
|
|
// BCCCTR - conditional counterpart of BCTR taking a `pred` operand; the asm
// string is assembled from that operand's cc, pm and reg modifiers.
// No selection pattern.
def BCCCTR : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
|
|
|
|
"b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB,
|
|
|
|
[]>;
|
|
|
|
|
|
|
|
// BCCTR / BCCTRn - variants keyed on a single crbitrc operand ($bi). They
// differ only in the literal printed before $bi (12 vs. 4) and the matching
// third XLForm_2_br2 argument.
def BCCTR : XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi),
|
|
|
|
"bcctr 12, $bi, 0", IIC_BrB, []>;
|
|
|
|
def BCCTRn : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi),
|
|
|
|
"bcctr 4, $bi, 0", IIC_BrB, []>;
|
|
|
|
}
|
PPC: Prep for if conversion of bctr[l]
This adds in-principle support for if-converting the bctr[l] instructions.
These instructions are used for indirect branching. It seems, however, that the
current if converter will never actually predicate these. To do so, it would
need the ability to hoist a few setup insts. out of the conditionally-executed
block. For example, code like this:
void foo(int a, int (*bar)()) { if (a != 0) bar(); }
becomes:
...
beq 0, .LBB0_2
std 2, 40(1)
mr 12, 4
ld 3, 0(4)
ld 11, 16(4)
ld 2, 8(4)
mtctr 3
bctrl
ld 2, 40(1)
.LBB0_2:
...
and it would be safe to do all of this unconditionally with a predicated
beqctrl instruction.
llvm-svn: 179156
2013-04-10 14:42:34 +08:00
|
|
|
}
|
2005-09-09 03:50:41 +08:00
|
|
|
}
|
|
|
|
|
2005-02-16 04:26:49 +08:00
|
|
|
// MovePCtoLR - operand-less pseudo with no pattern, declared to define LR
// and tagged PPC970_Unit_BRU.
// NOTE(review): per its name it presumably materializes the current PC in
// LR; the expansion is not visible here -- confirm.
let Defs = [LR] in
|
2012-10-05 02:14:28 +08:00
|
|
|
def MovePCtoLR : Pseudo<(outs), (ins), "#MovePCtoLR", []>,
|
2006-03-12 17:13:49 +08:00
|
|
|
PPC970_Unit_BRU;
|
2014-11-12 23:16:30 +08:00
|
|
|
// MoveGOTtoLR - operand-less pseudo with no pattern, declared to define LR
// and tagged PPC970_Unit_BRU (same shape as MovePCtoLR).
// NOTE(review): presumably places a GOT-related address in LR, per its
// name; the expansion is not visible here -- confirm.
let Defs = [LR] in
|
|
|
|
def MoveGOTtoLR : Pseudo<(outs), (ins), "#MoveGOTtoLR", []>,
|
|
|
|
PPC970_Unit_BRU;
|
2004-06-22 00:55:25 +08:00
|
|
|
|
2007-07-21 08:34:19 +08:00
|
|
|
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
|
2006-10-14 03:10:34 +08:00
|
|
|
let isBarrier = 1 in {
|
2010-11-15 14:09:35 +08:00
|
|
|
// B - `b $dst` to a directbrtarget; selected for `(br bb:$dst)`.
// isBarrier comes from the enclosing let.
def B : IForm<18, 0, 0, (outs), (ins directbrtarget:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
|
|
"b $dst", IIC_BrB,
|
2005-12-05 02:42:54 +08:00
|
|
|
[(br bb:$dst)]>;
|
2013-06-24 19:03:33 +08:00
|
|
|
// BA - `ba $dst` to an absdirectbrtarget; no selection pattern.
def BA : IForm<18, 1, 0, (outs), (ins absdirectbrtarget:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
"ba $dst", IIC_BrB, []>;
|
2006-10-14 03:10:34 +08:00
|
|
|
}
|
2004-11-23 07:07:01 +08:00
|
|
|
|
2006-11-18 06:37:34 +08:00
|
|
|
// BCC represents an arbitrary conditional branch on a predicate.
|
|
|
|
// FIXME: should be able to write a pattern for PPCcondbranch, but can't use
|
2012-10-05 23:16:11 +08:00
|
|
|
// a two-value operand where a dag node expects two operands. :(
|
2013-04-09 00:24:03 +08:00
|
|
|
let isCodeGenOnly = 1 in {
|
2012-10-05 23:16:11 +08:00
|
|
|
// BCC takes a `pred` operand plus a condbrtarget; the mnemonic is built from
// the predicate's cc / pm / reg modifiers. The selection pattern is left
// commented out (see the FIXME preceding this def).
def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
|
2013-06-25 00:52:04 +08:00
|
|
|
"b${cond:cc}${cond:pm} ${cond:reg}, $dst"
|
2013-04-27 00:53:15 +08:00
|
|
|
/*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>;
|
2013-06-24 19:03:33 +08:00
|
|
|
// BCCA - same operands as BCC but with an abscondbrtarget destination and
// an `a` inserted in the mnemonic.
def BCCA : BForm<16, 1, 0, (outs), (ins pred:$cond, abscondbrtarget:$dst),
|
2013-06-25 00:52:04 +08:00
|
|
|
"b${cond:cc}a${cond:pm} ${cond:reg}, $dst">;
|
2013-06-24 19:03:33 +08:00
|
|
|
|
2013-04-09 00:24:03 +08:00
|
|
|
let isReturn = 1, Uses = [LR, RM] in
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// BCCLR - predicate-conditional `b<cc>lr`; takes only a `pred` operand and
// has no selection pattern. The `let` preceding this def marks it isReturn
// with Uses = [LR, RM].
def BCCLR : XLForm_2_br<19, 16, 0, (outs), (ins pred:$cond),
|
2013-11-28 07:26:09 +08:00
|
|
|
"b${cond:cc}lr${cond:pm} ${cond:reg}", IIC_BrB, []>;
|
2013-06-25 00:52:04 +08:00
|
|
|
}
|
2013-04-10 06:58:37 +08:00
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
let isCodeGenOnly = 1 in {
|
|
|
|
// BC - branch on a single CR bit; selected for `brcond` on an i1 value.
// Prints literal 12 before $bi.
let Pattern = [(brcond i1:$bi, bb:$dst)] in
|
|
|
|
def BC : BForm_4<16, 12, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst),
|
|
|
|
"bc 12, $bi, $dst">;
|
|
|
|
|
|
|
|
// BCn - negated form; selected for `brcond` on `(not i1)`. Prints literal 4
// before $bi.
let Pattern = [(brcond (not i1:$bi), bb:$dst)] in
|
|
|
|
def BCn : BForm_4<16, 4, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst),
|
|
|
|
"bc 4, $bi, $dst">;
|
|
|
|
|
|
|
|
// BCLR / BCLRn - conditional return through LR keyed on a single CR bit.
// BCLR prints literal 12 before $bi and BCLRn prints literal 4, the same
// 12 / 4 pairing that BC and BCn use for `brcond` and `brcond (not ...)`.
//
// Both defs sit inside a braced `let`. A brace-less `let ... in` binds only
// to the single def that immediately follows it, which previously left
// BCLRn without isReturn = 1 and without the implicit uses of LR and RM.
let isReturn = 1, Uses = [LR, RM] in {
def BCLR : XLForm_2_br2<19, 16, 12, 0, (outs), (ins crbitrc:$bi),
                        "bclr 12, $bi, 0", IIC_BrB, []>;
def BCLRn : XLForm_2_br2<19, 16, 4, 0, (outs), (ins crbitrc:$bi),
                         "bclr 4, $bi, 0", IIC_BrB, []>;
}
|
|
|
|
}
|
|
|
|
|
2013-06-25 00:52:04 +08:00
|
|
|
// bd[n]zlr family: operand-less, pattern-less records that are all marked
// isReturn, define CTR, and use CTR, LR and RM. The six defs differ in the
// third XLForm_2_ext argument and in the mnemonic: plain, `+` suffix (p)
// and `-` suffix (m), each in a `bdz` and a `bdnz` flavour.
let isReturn = 1, Defs = [CTR], Uses = [CTR, LR, RM] in {
|
|
|
|
def BDZLR : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bdzlr", IIC_BrB, []>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def BDNZLR : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bdnzlr", IIC_BrB, []>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def BDZLRp : XLForm_2_ext<19, 16, 27, 0, 0, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bdzlr+", IIC_BrB, []>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def BDNZLRp: XLForm_2_ext<19, 16, 25, 0, 0, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bdnzlr+", IIC_BrB, []>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def BDZLRm : XLForm_2_ext<19, 16, 26, 0, 0, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bdzlr-", IIC_BrB, []>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def BDNZLRm: XLForm_2_ext<19, 16, 24, 0, 0, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bdnzlr-", IIC_BrB, []>;
|
2013-04-09 00:24:03 +08:00
|
|
|
}
|
2012-06-08 23:38:21 +08:00
|
|
|
|
|
|
|
// bd[n]z family: every def here both defines and uses CTR. The variants
// are the cross product of:
//   - bdz vs. bdnz            (second BForm_1 argument),
//   - condbrtarget vs. abscondbrtarget destination (the `A` defs, third
//     BForm_1 argument = 1, `a` in the mnemonic),
//   - no suffix, `+` (p defs) or `-` (m defs) on the mnemonic.
// None of them carries a selection pattern.
let Defs = [CTR], Uses = [CTR] in {
|
2012-11-14 03:15:52 +08:00
|
|
|
def BDZ : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bdz $dst">;
|
|
|
|
def BDNZ : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bdnz $dst">;
|
2013-06-24 19:03:33 +08:00
|
|
|
def BDZA : BForm_1<16, 18, 1, 0, (outs), (ins abscondbrtarget:$dst),
|
|
|
|
"bdza $dst">;
|
|
|
|
def BDNZA : BForm_1<16, 16, 1, 0, (outs), (ins abscondbrtarget:$dst),
|
|
|
|
"bdnza $dst">;
|
2013-06-25 00:52:04 +08:00
|
|
|
def BDZp : BForm_1<16, 27, 0, 0, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bdz+ $dst">;
|
|
|
|
def BDNZp: BForm_1<16, 25, 0, 0, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bdnz+ $dst">;
|
|
|
|
def BDZAp : BForm_1<16, 27, 1, 0, (outs), (ins abscondbrtarget:$dst),
|
|
|
|
"bdza+ $dst">;
|
|
|
|
def BDNZAp: BForm_1<16, 25, 1, 0, (outs), (ins abscondbrtarget:$dst),
|
|
|
|
"bdnza+ $dst">;
|
|
|
|
def BDZm : BForm_1<16, 26, 0, 0, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bdz- $dst">;
|
|
|
|
def BDNZm: BForm_1<16, 24, 0, 0, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bdnz- $dst">;
|
|
|
|
def BDZAm : BForm_1<16, 26, 1, 0, (outs), (ins abscondbrtarget:$dst),
|
|
|
|
"bdza- $dst">;
|
|
|
|
def BDNZAm: BForm_1<16, 24, 1, 0, (outs), (ins abscondbrtarget:$dst),
|
|
|
|
"bdnza- $dst">;
|
2012-06-08 23:38:21 +08:00
|
|
|
}
|
2004-06-29 02:23:35 +08:00
|
|
|
}
|
|
|
|
|
2013-04-05 06:55:54 +08:00
|
|
|
// The unconditional BCL used by the SjLj setjmp code.
|
2013-03-26 18:57:16 +08:00
|
|
|
// BCLalways is modelled as a call (isCall, hasCtrlDep), is isCodeGenOnly,
// defines LR and uses RM. It prints the fixed form `bcl 20, 31, $dst`.
let isCall = 1, hasCtrlDep = 1, isCodeGenOnly = 1, PPC970_Unit = 7 in {
|
2013-03-22 05:37:52 +08:00
|
|
|
|
|
let Defs = [LR], Uses = [RM] in {
|
2013-04-05 06:55:54 +08:00
|
|
|
|
|
def BCLalways : BForm_2<16, 20, 31, 0, 1, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bcl 20, 31, $dst">;
|
2013-03-22 05:37:52 +08:00
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-03-07 00:41:49 +08:00
|
|
|
let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
|
2007-02-25 13:34:32 +08:00
|
|
|
// Convenient aliases for call instructions
|
2008-10-30 02:26:45 +08:00
|
|
|
let Uses = [RM] in {
|
2013-03-22 23:24:13 +08:00
|
|
|
// BL - `bl $func` on a calltarget operand; carries no inline pattern (the
// trailing comment points at separate Pat definitions).
def BL : IForm<18, 0, 1, (outs), (ins calltarget:$func),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bl $func", IIC_BrB, []>; // See Pat patterns below.
|
2013-06-24 19:03:33 +08:00
|
|
|
// BLA - `bla $func` on an abscalltarget operand; selected for PPCcall with
// an i32 immediate target.
def BLA : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bla $func", IIC_BrB, [(PPCcall (i32 imm:$func))]>;
|
2013-06-24 19:02:19 +08:00
|
|
|
|
2013-06-24 19:03:33 +08:00
|
|
|
let isCodeGenOnly = 1 in {
|
2014-07-26 01:47:22 +08:00
|
|
|
// BL_TLS - prints the same `bl $func` text as BL but takes a tlscall32
// operand. No selection pattern.
def BL_TLS : IForm<18, 0, 1, (outs), (ins tlscall32:$func),
|
|
|
|
"bl $func", IIC_BrB, []>;
|
2013-06-24 19:03:33 +08:00
|
|
|
// BCCL / BCCLA - predicate-conditional branch-and-link forms; the mnemonic
// is built from the pred operand's cc / pm / reg modifiers with `l` (and
// `la` for the abscondbrtarget variant) inserted.
def BCCL : BForm<16, 0, 1, (outs), (ins pred:$cond, condbrtarget:$dst),
|
2013-06-25 00:52:04 +08:00
|
|
|
"b${cond:cc}l${cond:pm} ${cond:reg}, $dst">;
|
2013-06-24 19:03:33 +08:00
|
|
|
def BCCLA : BForm<16, 1, 1, (outs), (ins pred:$cond, abscondbrtarget:$dst),
|
2013-06-25 00:52:04 +08:00
|
|
|
"b${cond:cc}la${cond:pm} ${cond:reg}, $dst">;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
|
|
|
|
def BCL : BForm_4<16, 12, 0, 1, (outs),
|
|
|
|
(ins crbitrc:$bi, condbrtarget:$dst),
|
|
|
|
"bcl 12, $bi, $dst">;
|
|
|
|
def BCLn : BForm_4<16, 4, 0, 1, (outs),
|
|
|
|
(ins crbitrc:$bi, condbrtarget:$dst),
|
|
|
|
"bcl 4, $bi, $dst">;
|
2013-06-24 19:03:33 +08:00
|
|
|
}
|
2008-10-30 02:26:45 +08:00
|
|
|
}
|
|
|
|
let Uses = [CTR, RM] in {
|
2013-03-22 23:24:13 +08:00
|
|
|
def BCTRL : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bctrl", IIC_BrB, [(PPCbctrl)]>,
|
2013-03-22 23:24:13 +08:00
|
|
|
Requires<[In32BitMode]>;
|
2013-04-18 01:19:05 +08:00
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
let isCodeGenOnly = 1 in {
|
|
|
|
def BCCCTRL : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
|
|
|
|
"b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB,
|
|
|
|
[]>;
|
|
|
|
|
|
|
|
def BCCTRL : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi),
|
|
|
|
"bcctrl 12, $bi, 0", IIC_BrB, []>;
|
|
|
|
def BCCTRLn : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi),
|
|
|
|
"bcctrl 4, $bi, 0", IIC_BrB, []>;
|
|
|
|
}
|
2008-10-24 04:41:28 +08:00
|
|
|
}
|
2013-06-24 19:01:55 +08:00
|
|
|
let Uses = [LR, RM] in {
|
|
|
|
def BLRL : XLForm_2_ext<19, 16, 20, 0, 1, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"blrl", IIC_BrB, []>;
|
2013-06-24 19:01:55 +08:00
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
let isCodeGenOnly = 1 in {
|
|
|
|
def BCCLRL : XLForm_2_br<19, 16, 1, (outs), (ins pred:$cond),
|
|
|
|
"b${cond:cc}lrl${cond:pm} ${cond:reg}", IIC_BrB,
|
|
|
|
[]>;
|
|
|
|
|
|
|
|
def BCLRL : XLForm_2_br2<19, 16, 12, 1, (outs), (ins crbitrc:$bi),
|
|
|
|
"bclrl 12, $bi, 0", IIC_BrB, []>;
|
|
|
|
def BCLRLn : XLForm_2_br2<19, 16, 4, 1, (outs), (ins crbitrc:$bi),
|
|
|
|
"bclrl 4, $bi, 0", IIC_BrB, []>;
|
|
|
|
}
|
2013-06-24 19:01:55 +08:00
|
|
|
}
|
2013-06-24 19:02:38 +08:00
|
|
|
let Defs = [CTR], Uses = [CTR, RM] in {
|
|
|
|
def BDZL : BForm_1<16, 18, 0, 1, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bdzl $dst">;
|
|
|
|
def BDNZL : BForm_1<16, 16, 0, 1, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bdnzl $dst">;
|
2013-06-24 19:03:33 +08:00
|
|
|
def BDZLA : BForm_1<16, 18, 1, 1, (outs), (ins abscondbrtarget:$dst),
|
|
|
|
"bdzla $dst">;
|
|
|
|
def BDNZLA : BForm_1<16, 16, 1, 1, (outs), (ins abscondbrtarget:$dst),
|
|
|
|
"bdnzla $dst">;
|
2013-06-25 00:52:04 +08:00
|
|
|
def BDZLp : BForm_1<16, 27, 0, 1, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bdzl+ $dst">;
|
|
|
|
def BDNZLp: BForm_1<16, 25, 0, 1, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bdnzl+ $dst">;
|
|
|
|
def BDZLAp : BForm_1<16, 27, 1, 1, (outs), (ins abscondbrtarget:$dst),
|
|
|
|
"bdzla+ $dst">;
|
|
|
|
def BDNZLAp: BForm_1<16, 25, 1, 1, (outs), (ins abscondbrtarget:$dst),
|
|
|
|
"bdnzla+ $dst">;
|
|
|
|
def BDZLm : BForm_1<16, 26, 0, 1, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bdzl- $dst">;
|
|
|
|
def BDNZLm: BForm_1<16, 24, 0, 1, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bdnzl- $dst">;
|
|
|
|
def BDZLAm : BForm_1<16, 26, 1, 1, (outs), (ins abscondbrtarget:$dst),
|
|
|
|
"bdzla- $dst">;
|
|
|
|
def BDNZLAm: BForm_1<16, 24, 1, 1, (outs), (ins abscondbrtarget:$dst),
|
|
|
|
"bdnzla- $dst">;
|
2013-06-24 19:02:38 +08:00
|
|
|
}
|
|
|
|
let Defs = [CTR], Uses = [CTR, LR, RM] in {
|
|
|
|
def BDZLRL : XLForm_2_ext<19, 16, 18, 0, 1, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bdzlrl", IIC_BrB, []>;
|
2013-06-24 19:02:38 +08:00
|
|
|
def BDNZLRL : XLForm_2_ext<19, 16, 16, 0, 1, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bdnzlrl", IIC_BrB, []>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def BDZLRLp : XLForm_2_ext<19, 16, 27, 0, 1, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bdzlrl+", IIC_BrB, []>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def BDNZLRLp: XLForm_2_ext<19, 16, 25, 0, 1, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bdnzlrl+", IIC_BrB, []>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def BDZLRLm : XLForm_2_ext<19, 16, 26, 0, 1, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bdzlrl-", IIC_BrB, []>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def BDNZLRLm: XLForm_2_ext<19, 16, 24, 0, 1, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bdnzlrl-", IIC_BrB, []>;
|
2013-06-24 19:02:38 +08:00
|
|
|
}
|
2004-06-30 07:37:36 +08:00
|
|
|
}
|
|
|
|
|
2008-10-30 02:26:45 +08:00
|
|
|
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
|
2008-04-30 17:16:33 +08:00
|
|
|
def TCRETURNdi :Pseudo< (outs),
|
2012-07-14 04:44:29 +08:00
|
|
|
(ins calltarget:$dst, i32imm:$offset),
|
2008-04-30 17:16:33 +08:00
|
|
|
"#TC_RETURNd $dst $offset",
|
|
|
|
[]>;
|
|
|
|
|
|
|
|
|
2008-10-30 02:26:45 +08:00
|
|
|
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
|
2013-06-24 19:03:33 +08:00
|
|
|
def TCRETURNai :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
|
2008-04-30 17:16:33 +08:00
|
|
|
"#TC_RETURNa $func $offset",
|
|
|
|
[(PPCtc_return (i32 imm:$func), imm:$offset)]>;
|
|
|
|
|
2008-10-30 02:26:45 +08:00
|
|
|
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
|
2012-07-14 04:44:29 +08:00
|
|
|
def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset),
|
2008-04-30 17:16:33 +08:00
|
|
|
"#TC_RETURNr $dst $offset",
|
|
|
|
[]>;
|
|
|
|
|
|
|
|
|
2013-03-26 18:57:16 +08:00
|
|
|
let isCodeGenOnly = 1 in {
|
|
|
|
|
2008-04-30 17:16:33 +08:00
|
|
|
let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
|
2008-10-30 02:26:45 +08:00
|
|
|
isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM] in
|
2013-11-28 07:26:09 +08:00
|
|
|
def TAILBCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
|
|
|
|
[]>, Requires<[In32BitMode]>;
|
2008-04-30 17:16:33 +08:00
|
|
|
|
|
|
|
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
|
2008-10-30 02:26:45 +08:00
|
|
|
isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
|
2008-04-30 17:16:33 +08:00
|
|
|
def TAILB : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
"b $dst", IIC_BrB,
|
2008-04-30 17:16:33 +08:00
|
|
|
[]>;
|
|
|
|
|
|
|
|
// TAILBA: tail-call variant of the absolute unconditional branch. It prints
// as "ba $dst" and takes an abscalltarget operand, and is flagged as a
// call + return + barrier terminator that reads RM.
//
// The second IForm argument is set to 1 here so the encoding agrees with the
// "ba" mnemonic and the absolute target operand; the relative TAILB keeps 0.
// This mirrors the BForm_1 absolute variants in this file (e.g. BDZLA,
// BDNZLA), which pass 1 alongside an abs*target operand.
// NOTE(review): assumes IForm's parameters are <opcode, aa, lk, ...> --
// confirm against the class definition in PPCInstrFormats.td.
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
def TAILBA : IForm<18, 1, 0, (outs), (ins abscalltarget:$dst),
                   "ba $dst", IIC_BrB,
                   []>;
|
|
|
|
|
2013-06-24 19:03:33 +08:00
|
|
|
}
|
|
|
|
|
2013-03-26 18:57:16 +08:00
|
|
|
let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
|
2013-07-17 13:35:44 +08:00
|
|
|
let Defs = [CTR] in
|
2013-04-27 00:53:15 +08:00
|
|
|
def EH_SjLj_SetJmp32 : Pseudo<(outs gprc:$dst), (ins memr:$buf),
|
2013-03-22 05:37:52 +08:00
|
|
|
"#EH_SJLJ_SETJMP32",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
|
2013-03-22 05:37:52 +08:00
|
|
|
Requires<[In32BitMode]>;
|
|
|
|
let isTerminator = 1 in
|
|
|
|
def EH_SjLj_LongJmp32 : Pseudo<(outs), (ins memr:$buf),
|
|
|
|
"#EH_SJLJ_LONGJMP32",
|
|
|
|
[(PPCeh_sjlj_longjmp addr:$buf)]>,
|
|
|
|
Requires<[In32BitMode]>;
|
|
|
|
}
|
|
|
|
|
2016-04-29 05:24:37 +08:00
|
|
|
// This pseudo is never removed from the function, as it serves as
|
|
|
|
// a terminator. Size is set to 0 to prevent the builtin assembler
|
|
|
|
// from emitting it.
|
|
|
|
let isBranch = 1, isTerminator = 1, Size = 0 in {
|
2013-03-22 05:37:52 +08:00
|
|
|
def EH_SjLj_Setup : Pseudo<(outs), (ins directbrtarget:$dst),
|
|
|
|
"#EH_SjLj_Setup\t$dst", []>;
|
|
|
|
}
|
2008-04-30 17:16:33 +08:00
|
|
|
|
2013-05-15 03:35:45 +08:00
|
|
|
// System call.
|
|
|
|
let PPC970_Unit = 7 in {
|
|
|
|
def SC : SCForm<17, 1, (outs), (ins i32imm:$lev),
|
2013-11-28 07:26:09 +08:00
|
|
|
"sc $lev", IIC_BrB, [(PPCsc (i32 imm:$lev))]>;
|
2013-05-15 03:35:45 +08:00
|
|
|
}
|
|
|
|
|
2015-05-23 00:44:10 +08:00
|
|
|
// Branch history rolling buffer.
|
|
|
|
def CLRBHRB : XForm_0<31, 430, (outs), (ins), "clrbhrb", IIC_BrB,
|
|
|
|
[(PPCclrbhrb)]>,
|
|
|
|
PPC970_DGroup_Single;
|
|
|
|
// The $dmy argument used for MFBHRBE is not needed; however, including
|
|
|
|
// it avoids automatic generation of PPCFastISel::fastEmit_i(), which
|
|
|
|
// interferes with necessary special handling (see PPCFastISel.cpp).
|
|
|
|
def MFBHRBE : XFXForm_3p<31, 302, (outs gprc:$rD),
|
|
|
|
(ins u10imm:$imm, u10imm:$dmy),
|
|
|
|
"mfbhrbe $rD, $imm", IIC_BrB,
|
|
|
|
[(set i32:$rD,
|
|
|
|
(PPCmfbhrbe imm:$imm, imm:$dmy))]>,
|
|
|
|
PPC970_DGroup_First;
|
|
|
|
|
|
|
|
def RFEBB : XLForm_S<19, 146, (outs), (ins u1imm:$imm), "rfebb $imm",
|
|
|
|
IIC_BrB, [(PPCrfebb (i32 imm:$imm))]>,
|
|
|
|
PPC970_DGroup_Single;
|
|
|
|
|
2006-06-07 05:29:23 +08:00
|
|
|
// DCB* instructions.
|
2013-11-28 07:26:09 +08:00
|
|
|
def DCBA : DCB_Form<758, 0, (outs), (ins memrr:$dst), "dcba $dst",
|
|
|
|
IIC_LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
|
2006-10-24 09:08:42 +08:00
|
|
|
PPC970_DGroup_Single;
|
2013-11-28 07:26:09 +08:00
|
|
|
def DCBI : DCB_Form<470, 0, (outs), (ins memrr:$dst), "dcbi $dst",
|
|
|
|
IIC_LdStDCBF, [(int_ppc_dcbi xoaddr:$dst)]>,
|
2006-10-24 09:08:42 +08:00
|
|
|
PPC970_DGroup_Single;
|
2013-11-28 07:26:09 +08:00
|
|
|
def DCBST : DCB_Form<54, 0, (outs), (ins memrr:$dst), "dcbst $dst",
|
|
|
|
IIC_LdStDCBF, [(int_ppc_dcbst xoaddr:$dst)]>,
|
2006-10-24 09:08:42 +08:00
|
|
|
PPC970_DGroup_Single;
|
2013-11-28 07:26:09 +08:00
|
|
|
def DCBZ : DCB_Form<1014, 0, (outs), (ins memrr:$dst), "dcbz $dst",
|
|
|
|
IIC_LdStDCBF, [(int_ppc_dcbz xoaddr:$dst)]>,
|
2006-10-24 09:08:42 +08:00
|
|
|
PPC970_DGroup_Single;
|
2013-11-28 07:26:09 +08:00
|
|
|
def DCBZL : DCB_Form<1014, 1, (outs), (ins memrr:$dst), "dcbzl $dst",
|
|
|
|
IIC_LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>,
|
2006-10-24 09:08:42 +08:00
|
|
|
PPC970_DGroup_Single;
|
2006-11-15 03:19:53 +08:00
|
|
|
|
2016-09-03 07:41:54 +08:00
|
|
|
def DCBF : DCB_Form_hint<86, (outs), (ins u5imm:$TH, memrr:$dst),
|
|
|
|
"dcbf $dst, $TH", IIC_LdStDCBF, []>,
|
|
|
|
PPC970_DGroup_Single;
|
|
|
|
|
2015-04-24 06:47:57 +08:00
|
|
|
let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in {
|
|
|
|
def DCBT : DCB_Form_hint<278, (outs), (ins u5imm:$TH, memrr:$dst),
|
|
|
|
"dcbt $dst, $TH", IIC_LdStDCBF, []>,
|
|
|
|
PPC970_DGroup_Single;
|
|
|
|
def DCBTST : DCB_Form_hint<246, (outs), (ins u5imm:$TH, memrr:$dst),
|
|
|
|
"dcbtst $dst, $TH", IIC_LdStDCBF, []>,
|
|
|
|
PPC970_DGroup_Single;
|
|
|
|
} // hasSideEffects = 0
|
|
|
|
|
2014-08-24 07:21:04 +08:00
|
|
|
def ICBT : XForm_icbt<31, 22, (outs), (ins u4imm:$CT, memrr:$src),
|
2015-01-15 04:17:10 +08:00
|
|
|
"icbt $CT, $src", IIC_LdStLoad>, Requires<[HasICBT]>;
|
2014-08-24 07:21:04 +08:00
|
|
|
|
2015-04-24 06:47:57 +08:00
|
|
|
def : Pat<(int_ppc_dcbt xoaddr:$dst),
|
|
|
|
(DCBT 0, xoaddr:$dst)>;
|
|
|
|
def : Pat<(int_ppc_dcbtst xoaddr:$dst),
|
|
|
|
(DCBTST 0, xoaddr:$dst)>;
|
2016-09-03 07:41:54 +08:00
|
|
|
def : Pat<(int_ppc_dcbf xoaddr:$dst),
|
|
|
|
(DCBF 0, xoaddr:$dst)>;
|
2015-04-24 06:47:57 +08:00
|
|
|
|
2012-04-02 04:08:17 +08:00
|
|
|
def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 1)),
|
2015-04-24 06:47:57 +08:00
|
|
|
(DCBT 0, xoaddr:$dst)>; // data prefetch for loads
|
2014-08-24 07:21:04 +08:00
|
|
|
def : Pat<(prefetch xoaddr:$dst, (i32 1), imm, (i32 1)),
|
2015-04-24 06:47:57 +08:00
|
|
|
(DCBTST 0, xoaddr:$dst)>; // data prefetch for stores
|
2014-08-24 07:21:04 +08:00
|
|
|
def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 0)),
|
2015-01-15 04:17:10 +08:00
|
|
|
(ICBT 0, xoaddr:$dst)>, Requires<[HasICBT]>; // inst prefetch (for read)
|
2012-04-02 04:08:17 +08:00
|
|
|
|
2008-07-12 10:23:19 +08:00
|
|
|
// Atomic operations
|
2009-10-30 02:10:34 +08:00
|
|
|
let usesCustomInserter = 1 in {
|
2011-04-05 01:07:09 +08:00
|
|
|
let Defs = [CR0] in {
|
2008-08-29 01:53:09 +08:00
|
|
|
def ATOMIC_LOAD_ADD_I8 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I8",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_add_8 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-29 01:53:09 +08:00
|
|
|
def ATOMIC_LOAD_SUB_I8 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I8",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_sub_8 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-29 01:53:09 +08:00
|
|
|
def ATOMIC_LOAD_AND_I8 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I8",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_and_8 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-29 01:53:09 +08:00
|
|
|
def ATOMIC_LOAD_OR_I8 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I8",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_or_8 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-29 01:53:09 +08:00
|
|
|
// ATOMIC_LOAD_XOR_I8: pseudo selected for the atomic_load_xor_8 node.
//   $dst  - i32 result of the node (gprc)
//   $ptr  - memory operand, matched with xoaddr
//   $incr - i32 value operand (gprc)
// Defined inside the enclosing `usesCustomInserter = 1` / `Defs = [CR0]`
// regions. The asm string carries the same leading '#' as every sibling
// ATOMIC_* pseudo in this group (it was previously missing here).
def ATOMIC_LOAD_XOR_I8 : Pseudo<
  (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I8",
  [(set i32:$dst, (atomic_load_xor_8 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-29 01:53:09 +08:00
|
|
|
def ATOMIC_LOAD_NAND_I8 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I8",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_nand_8 xoaddr:$ptr, i32:$incr))]>;
|
2016-08-29 00:17:58 +08:00
|
|
|
def ATOMIC_LOAD_MIN_I8 : Pseudo<
|
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I8",
|
|
|
|
[(set i32:$dst, (atomic_load_min_8 xoaddr:$ptr, i32:$incr))]>;
|
|
|
|
def ATOMIC_LOAD_MAX_I8 : Pseudo<
|
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I8",
|
|
|
|
[(set i32:$dst, (atomic_load_max_8 xoaddr:$ptr, i32:$incr))]>;
|
|
|
|
def ATOMIC_LOAD_UMIN_I8 : Pseudo<
|
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I8",
|
|
|
|
[(set i32:$dst, (atomic_load_umin_8 xoaddr:$ptr, i32:$incr))]>;
|
|
|
|
def ATOMIC_LOAD_UMAX_I8 : Pseudo<
|
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I8",
|
|
|
|
[(set i32:$dst, (atomic_load_umax_8 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-29 01:53:09 +08:00
|
|
|
def ATOMIC_LOAD_ADD_I16 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I16",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_add_16 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-29 01:53:09 +08:00
|
|
|
def ATOMIC_LOAD_SUB_I16 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I16",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_sub_16 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-29 01:53:09 +08:00
|
|
|
def ATOMIC_LOAD_AND_I16 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I16",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_and_16 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-29 01:53:09 +08:00
|
|
|
def ATOMIC_LOAD_OR_I16 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I16",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_or_16 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-29 01:53:09 +08:00
|
|
|
def ATOMIC_LOAD_XOR_I16 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I16",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_xor_16 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-29 01:53:09 +08:00
|
|
|
def ATOMIC_LOAD_NAND_I16 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I16",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_nand_16 xoaddr:$ptr, i32:$incr))]>;
|
2016-08-29 00:17:58 +08:00
|
|
|
def ATOMIC_LOAD_MIN_I16 : Pseudo<
|
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I16",
|
|
|
|
[(set i32:$dst, (atomic_load_min_16 xoaddr:$ptr, i32:$incr))]>;
|
|
|
|
def ATOMIC_LOAD_MAX_I16 : Pseudo<
|
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I16",
|
|
|
|
[(set i32:$dst, (atomic_load_max_16 xoaddr:$ptr, i32:$incr))]>;
|
|
|
|
def ATOMIC_LOAD_UMIN_I16 : Pseudo<
|
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I16",
|
|
|
|
[(set i32:$dst, (atomic_load_umin_16 xoaddr:$ptr, i32:$incr))]>;
|
|
|
|
def ATOMIC_LOAD_UMAX_I16 : Pseudo<
|
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I16",
|
|
|
|
[(set i32:$dst, (atomic_load_umax_16 xoaddr:$ptr, i32:$incr))]>;
|
2008-07-12 10:23:19 +08:00
|
|
|
def ATOMIC_LOAD_ADD_I32 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I32",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_add_32 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-26 06:34:37 +08:00
|
|
|
def ATOMIC_LOAD_SUB_I32 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I32",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_sub_32 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-26 06:34:37 +08:00
|
|
|
def ATOMIC_LOAD_AND_I32 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I32",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_and_32 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-26 06:34:37 +08:00
|
|
|
def ATOMIC_LOAD_OR_I32 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I32",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_or_32 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-26 06:34:37 +08:00
|
|
|
def ATOMIC_LOAD_XOR_I32 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I32",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_xor_32 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-26 06:34:37 +08:00
|
|
|
def ATOMIC_LOAD_NAND_I32 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I32",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_load_nand_32 xoaddr:$ptr, i32:$incr))]>;
|
2016-08-29 00:17:58 +08:00
|
|
|
def ATOMIC_LOAD_MIN_I32 : Pseudo<
|
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I32",
|
|
|
|
[(set i32:$dst, (atomic_load_min_32 xoaddr:$ptr, i32:$incr))]>;
|
|
|
|
def ATOMIC_LOAD_MAX_I32 : Pseudo<
|
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I32",
|
|
|
|
[(set i32:$dst, (atomic_load_max_32 xoaddr:$ptr, i32:$incr))]>;
|
|
|
|
def ATOMIC_LOAD_UMIN_I32 : Pseudo<
|
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I32",
|
|
|
|
[(set i32:$dst, (atomic_load_umin_32 xoaddr:$ptr, i32:$incr))]>;
|
|
|
|
def ATOMIC_LOAD_UMAX_I32 : Pseudo<
|
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I32",
|
|
|
|
[(set i32:$dst, (atomic_load_umax_32 xoaddr:$ptr, i32:$incr))]>;
|
2008-08-26 06:34:37 +08:00
|
|
|
|
2008-08-29 01:53:09 +08:00
|
|
|
def ATOMIC_CMP_SWAP_I8 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I8",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_cmp_swap_8 xoaddr:$ptr, i32:$old, i32:$new))]>;
|
2008-08-29 01:53:09 +08:00
|
|
|
def ATOMIC_CMP_SWAP_I16 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_cmp_swap_16 xoaddr:$ptr, i32:$old, i32:$new))]>;
|
2008-08-22 11:49:10 +08:00
|
|
|
def ATOMIC_CMP_SWAP_I32 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_cmp_swap_32 xoaddr:$ptr, i32:$old, i32:$new))]>;
|
2008-08-26 06:34:37 +08:00
|
|
|
|
2008-08-29 01:53:09 +08:00
|
|
|
// ATOMIC_SWAP_I8: pseudo selected for the atomic_swap_8 node.
//   $dst - i32 result of the node (gprc)
//   $ptr - memory operand, matched with xoaddr
//   $new - i32 value operand (gprc)
// Defined inside the enclosing `usesCustomInserter = 1` / `Defs = [CR0]`
// regions. The asm string is spelled "#ATOMIC_SWAP_I8" (capital I) so it
// matches the def name and the I16/I32 siblings.
def ATOMIC_SWAP_I8 : Pseudo<
  (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I8",
  [(set i32:$dst, (atomic_swap_8 xoaddr:$ptr, i32:$new))]>;
|
2008-08-29 01:53:09 +08:00
|
|
|
def ATOMIC_SWAP_I16 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I16",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_swap_16 xoaddr:$ptr, i32:$new))]>;
|
2008-08-26 05:09:52 +08:00
|
|
|
def ATOMIC_SWAP_I32 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I32",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (atomic_swap_32 xoaddr:$ptr, i32:$new))]>;
|
2008-08-22 11:49:10 +08:00
|
|
|
}
|
2008-04-19 09:30:48 +08:00
|
|
|
}
|
|
|
|
|
2008-07-12 10:23:19 +08:00
|
|
|
// Instructions to support atomic operations
|
2017-01-27 02:59:15 +08:00
|
|
|
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
|
2015-03-11 04:51:07 +08:00
|
|
|
def LBARX : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src),
|
|
|
|
"lbarx $rD, $src", IIC_LdStLWARX, []>,
|
|
|
|
Requires<[HasPartwordAtomics]>;
|
|
|
|
|
|
|
|
def LHARX : XForm_1<31, 116, (outs gprc:$rD), (ins memrr:$src),
|
|
|
|
"lharx $rD, $src", IIC_LdStLWARX, []>,
|
|
|
|
Requires<[HasPartwordAtomics]>;
|
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
def LWARX : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src),
|
2015-03-11 04:51:07 +08:00
|
|
|
"lwarx $rD, $src", IIC_LdStLWARX, []>;
|
|
|
|
|
|
|
|
// Instructions to support lock versions of atomics
|
|
|
|
// (EH=1 - see Power ISA 2.07 Book II 4.4.2)
|
|
|
|
def LBARXL : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src),
|
|
|
|
"lbarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT,
|
|
|
|
Requires<[HasPartwordAtomics]>;
|
|
|
|
|
|
|
|
def LHARXL : XForm_1<31, 116, (outs gprc:$rD), (ins memrr:$src),
|
|
|
|
"lharx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT,
|
|
|
|
Requires<[HasPartwordAtomics]>;
|
|
|
|
|
|
|
|
def LWARXL : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src),
|
|
|
|
"lwarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT;
|
2016-03-31 23:26:37 +08:00
|
|
|
|
|
|
|
// The atomic instructions use the destination register as well as the next one
|
|
|
|
// or two registers in order (modulo 31).
|
|
|
|
let hasExtraSrcRegAllocReq = 1 in
|
|
|
|
def LWAT : X_RD5_RS5_IM5<31, 582, (outs gprc:$rD), (ins gprc:$rA, u5imm:$FC),
|
|
|
|
"lwat $rD, $rA, $FC", IIC_LdStLoad>,
|
|
|
|
Requires<[IsISA3_0]>;
|
2015-03-11 04:51:07 +08:00
|
|
|
}
|
|
|
|
|
2017-01-27 02:59:15 +08:00
|
|
|
let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in {
|
2015-03-11 04:51:07 +08:00
|
|
|
def STBCX : XForm_1<31, 694, (outs), (ins gprc:$rS, memrr:$dst),
|
|
|
|
"stbcx. $rS, $dst", IIC_LdStSTWCX, []>,
|
|
|
|
isDOT, Requires<[HasPartwordAtomics]>;
|
|
|
|
|
|
|
|
def STHCX : XForm_1<31, 726, (outs), (ins gprc:$rS, memrr:$dst),
|
|
|
|
"sthcx. $rS, $dst", IIC_LdStSTWCX, []>,
|
|
|
|
isDOT, Requires<[HasPartwordAtomics]>;
|
2008-07-12 10:23:19 +08:00
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
|
2015-03-11 04:51:07 +08:00
|
|
|
"stwcx. $rS, $dst", IIC_LdStSTWCX, []>, isDOT;
|
|
|
|
}
|
2008-07-12 10:23:19 +08:00
|
|
|
|
2017-01-27 02:59:15 +08:00
|
|
|
let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
|
2016-03-31 23:26:37 +08:00
|
|
|
def STWAT : X_RD5_RS5_IM5<31, 710, (outs), (ins gprc:$rS, gprc:$rA, u5imm:$FC),
|
|
|
|
"stwat $rS, $rA, $FC", IIC_LdStStore>,
|
|
|
|
Requires<[IsISA3_0]>;
|
|
|
|
|
2010-05-15 00:46:02 +08:00
|
|
|
let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
|
2013-11-28 07:26:09 +08:00
|
|
|
def TRAP : XForm_24<31, 4, (outs), (ins), "trap", IIC_LdStLoad, [(trap)]>;
|
2008-08-12 01:36:31 +08:00
|
|
|
|
2013-07-04 22:40:12 +08:00
|
|
|
def TWI : DForm_base<3, (outs), (ins u5imm:$to, gprc:$rA, s16imm:$imm),
|
2013-11-28 07:26:09 +08:00
|
|
|
"twi $to, $rA, $imm", IIC_IntTrapW, []>;
|
2013-07-04 22:40:12 +08:00
|
|
|
def TW : XForm_1<31, 4, (outs), (ins u5imm:$to, gprc:$rA, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"tw $to, $rA, $rB", IIC_IntTrapW, []>;
|
2013-07-04 22:40:12 +08:00
|
|
|
def TDI : DForm_base<2, (outs), (ins u5imm:$to, g8rc:$rA, s16imm:$imm),
|
2013-11-28 07:26:09 +08:00
|
|
|
"tdi $to, $rA, $imm", IIC_IntTrapD, []>;
|
2013-07-04 22:40:12 +08:00
|
|
|
def TD : XForm_1<31, 68, (outs), (ins u5imm:$to, g8rc:$rA, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"td $to, $rA, $rB", IIC_IntTrapD, []>;
|
2013-07-04 22:40:12 +08:00
|
|
|
|
2006-11-15 03:19:53 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// PPC32 Load Instructions.
|
2004-08-30 10:28:06 +08:00
|
|
|
//
|
2006-11-15 03:19:53 +08:00
|
|
|
|
2006-11-15 10:43:19 +08:00
|
|
|
// Unindexed (r+i) Loads.
// Every record in this block gets PPC970_Unit = 2. The plain loads below carry
// selection patterns (zero-/sign-extending or plain `load` on an iaddr
// address), so no explicit mayLoad is written for them.
let PPC970_Unit = 2 in {
def LBZ : DForm_1<34, (outs gprc:$rD), (ins memri:$src),
                  "lbz $rD, $src", IIC_LdStLoad,
                  [(set i32:$rD, (zextloadi8 iaddr:$src))]>;
def LHA : DForm_1<42, (outs gprc:$rD), (ins memri:$src),
                  "lha $rD, $src", IIC_LdStLHA,
                  [(set i32:$rD, (sextloadi16 iaddr:$src))]>,
                  PPC970_DGroup_Cracked;
def LHZ : DForm_1<40, (outs gprc:$rD), (ins memri:$src),
                  "lhz $rD, $src", IIC_LdStLoad,
                  [(set i32:$rD, (zextloadi16 iaddr:$src))]>;
def LWZ : DForm_1<32, (outs gprc:$rD), (ins memri:$src),
                  "lwz $rD, $src", IIC_LdStLoad,
                  [(set i32:$rD, (load iaddr:$src))]>;

// Floating-point loads: f4rc for the f32 form, f8rc for the f64 form.
def LFS : DForm_1<48, (outs f4rc:$rD), (ins memri:$src),
                  "lfs $rD, $src", IIC_LdStLFD,
                  [(set f32:$rD, (load iaddr:$src))]>;
def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src),
                  "lfd $rD, $src", IIC_LdStLFD,
                  [(set f64:$rD, (load iaddr:$src))]>;


// Unindexed (r+i) Loads with Update (preinc).
// These records have empty pattern lists, so the memory/side-effect flags are
// stated explicitly by the enclosing `let`. Each one has a second output,
// $ea_result (class ptr_rc_nor0), which RegConstraint equates with the
// register sub-operand of the address and which NoEncode names as not being
// part of the encoding.
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
def LBZU : DForm_1<35, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
                   "lbzu $rD, $addr", IIC_LdStLoadUpd,
                   []>, RegConstraint<"$addr.reg = $ea_result">,
                   NoEncode<"$ea_result">;

def LHAU : DForm_1<43, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
                   "lhau $rD, $addr", IIC_LdStLHAU,
                   []>, RegConstraint<"$addr.reg = $ea_result">,
                   NoEncode<"$ea_result">;

def LHZU : DForm_1<41, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
                   "lhzu $rD, $addr", IIC_LdStLoadUpd,
                   []>, RegConstraint<"$addr.reg = $ea_result">,
                   NoEncode<"$ea_result">;

def LWZU : DForm_1<33, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
                   "lwzu $rD, $addr", IIC_LdStLoadUpd,
                   []>, RegConstraint<"$addr.reg = $ea_result">,
                   NoEncode<"$ea_result">;

def LFSU : DForm_1<49, (outs f4rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
                   "lfsu $rD, $addr", IIC_LdStLFDU,
                   []>, RegConstraint<"$addr.reg = $ea_result">,
                   NoEncode<"$ea_result">;

def LFDU : DForm_1<51, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
                   "lfdu $rD, $addr", IIC_LdStLFDU,
                   []>, RegConstraint<"$addr.reg = $ea_result">,
                   NoEncode<"$ea_result">;


// Indexed (r+r) Loads with Update (preinc).
// Same shape as the r+i update forms above, but the address is a memrr operand
// and the constraint is on its `ptrreg` sub-operand.
def LBZUX : XForm_1<31, 119, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
                    (ins memrr:$addr),
                    "lbzux $rD, $addr", IIC_LdStLoadUpdX,
                    []>, RegConstraint<"$addr.ptrreg = $ea_result">,
                    NoEncode<"$ea_result">;

def LHAUX : XForm_1<31, 375, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
                    (ins memrr:$addr),
                    "lhaux $rD, $addr", IIC_LdStLHAUX,
                    []>, RegConstraint<"$addr.ptrreg = $ea_result">,
                    NoEncode<"$ea_result">;

def LHZUX : XForm_1<31, 311, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
                    (ins memrr:$addr),
                    "lhzux $rD, $addr", IIC_LdStLoadUpdX,
                    []>, RegConstraint<"$addr.ptrreg = $ea_result">,
                    NoEncode<"$ea_result">;

def LWZUX : XForm_1<31, 55, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
                    (ins memrr:$addr),
                    "lwzux $rD, $addr", IIC_LdStLoadUpdX,
                    []>, RegConstraint<"$addr.ptrreg = $ea_result">,
                    NoEncode<"$ea_result">;

def LFSUX : XForm_1<31, 567, (outs f4rc:$rD, ptr_rc_nor0:$ea_result),
                    (ins memrr:$addr),
                    "lfsux $rD, $addr", IIC_LdStLFDUX,
                    []>, RegConstraint<"$addr.ptrreg = $ea_result">,
                    NoEncode<"$ea_result">;

def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result),
                    (ins memrr:$addr),
                    "lfdux $rD, $addr", IIC_LdStLFDUX,
                    []>, RegConstraint<"$addr.ptrreg = $ea_result">,
                    NoEncode<"$ea_result">;
} // mayLoad = 1, mayStore = 0, hasSideEffects = 0
} // PPC970_Unit = 2
|
2006-11-08 10:13:12 +08:00
|
|
|
|
2006-11-15 10:43:19 +08:00
|
|
|
// Indexed (r+r) Loads.
//
// All records take a memrr address operand. The ordinary loads match an xaddr
// address; the byte-reversed loads (PPClbrx) and the lfiw[az]x loads match an
// xoaddr address instead.
let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in {
def LBZX : XForm_1<31, 87, (outs gprc:$rD), (ins memrr:$src),
                   "lbzx $rD, $src", IIC_LdStLoad,
                   [(set i32:$rD, (zextloadi8 xaddr:$src))]>;
def LHAX : XForm_1<31, 343, (outs gprc:$rD), (ins memrr:$src),
                   "lhax $rD, $src", IIC_LdStLHA,
                   [(set i32:$rD, (sextloadi16 xaddr:$src))]>,
                   PPC970_DGroup_Cracked;
def LHZX : XForm_1<31, 279, (outs gprc:$rD), (ins memrr:$src),
                   "lhzx $rD, $src", IIC_LdStLoad,
                   [(set i32:$rD, (zextloadi16 xaddr:$src))]>;
def LWZX : XForm_1<31, 23, (outs gprc:$rD), (ins memrr:$src),
                   "lwzx $rD, $src", IIC_LdStLoad,
                   [(set i32:$rD, (load xaddr:$src))]>;
// PPClbrx carries the access width as its second operand (i16 / i32).
def LHBRX : XForm_1<31, 790, (outs gprc:$rD), (ins memrr:$src),
                    "lhbrx $rD, $src", IIC_LdStLoad,
                    [(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>;
def LWBRX : XForm_1<31, 534, (outs gprc:$rD), (ins memrr:$src),
                    "lwbrx $rD, $src", IIC_LdStLoad,
                    [(set i32:$rD, (PPClbrx xoaddr:$src, i32))]>;

def LFSX : XForm_25<31, 535, (outs f4rc:$frD), (ins memrr:$src),
                    "lfsx $frD, $src", IIC_LdStLFD,
                    [(set f32:$frD, (load xaddr:$src))]>;
def LFDX : XForm_25<31, 599, (outs f8rc:$frD), (ins memrr:$src),
                    "lfdx $frD, $src", IIC_LdStLFD,
                    [(set f64:$frD, (load xaddr:$src))]>;

// Both produce an f64 result in an f8rc register from the PPClfiwax /
// PPClfiwzx nodes respectively.
def LFIWAX : XForm_25<31, 855, (outs f8rc:$frD), (ins memrr:$src),
                      "lfiwax $frD, $src", IIC_LdStLFD,
                      [(set f64:$frD, (PPClfiwax xoaddr:$src))]>;
def LFIWZX : XForm_25<31, 887, (outs f8rc:$frD), (ins memrr:$src),
                      "lfiwzx $frD, $src", IIC_LdStLFD,
                      [(set f64:$frD, (PPClfiwzx xoaddr:$src))]>;
}
|
|
|
|
|
2013-07-04 02:29:47 +08:00
|
|
|
// Load Multiple
// The pattern list is empty, so TableGen has no DAG to infer memory behaviour
// from; mayLoad is therefore stated explicitly. hasSideEffects is left
// unspecified on purpose: only $rD is modelled as an output here.
// NOTE(review): the architectural lmw writes more registers than $rD --
// confirm against the Power ISA before tightening the flags further.
let mayLoad = 1 in
def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src),
                  "lmw $rD, $src", IIC_LdStLMW, []>;
|
2013-07-04 02:29:47 +08:00
|
|
|
|
2006-11-15 03:19:53 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// PPC32 Store Instructions.
|
|
|
|
//
|
|
|
|
|
2006-11-15 10:43:19 +08:00
|
|
|
// Unindexed (r+i) Stores.
// Integer stores take the value in a gprc register; stb/sth match truncating
// stores of an i32 value, stw matches a plain i32 store. The floating-point
// stores take f4rc (f32) or f8rc (f64). All addresses are memri operands
// matched as iaddr.
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
def STB : DForm_1<38, (outs), (ins gprc:$rS, memri:$src),
                  "stb $rS, $src", IIC_LdStStore,
                  [(truncstorei8 i32:$rS, iaddr:$src)]>;
def STH : DForm_1<44, (outs), (ins gprc:$rS, memri:$src),
                  "sth $rS, $src", IIC_LdStStore,
                  [(truncstorei16 i32:$rS, iaddr:$src)]>;
def STW : DForm_1<36, (outs), (ins gprc:$rS, memri:$src),
                  "stw $rS, $src", IIC_LdStStore,
                  [(store i32:$rS, iaddr:$src)]>;
def STFS : DForm_1<52, (outs), (ins f4rc:$rS, memri:$dst),
                   "stfs $rS, $dst", IIC_LdStSTFD,
                   [(store f32:$rS, iaddr:$dst)]>;
def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst),
                   "stfd $rS, $dst", IIC_LdStSTFD,
                   [(store f64:$rS, iaddr:$dst)]>;
}
|
|
|
|
|
2006-11-15 10:43:19 +08:00
|
|
|
// Unindexed (r+i) Stores with Update (preinc).
// The defs carry no patterns (separate `Pat` records select them), so the
// memory flags come from the enclosing `let`. The single output $ea_res
// (class ptr_rc_nor0) is equated with the register sub-operand of $dst by
// RegConstraint and named in NoEncode.
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
def STBU : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
                   "stbu $rS, $dst", IIC_LdStStoreUpd, []>,
                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
def STHU : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
                   "sthu $rS, $dst", IIC_LdStStoreUpd, []>,
                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
def STWU : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
                   "stwu $rS, $dst", IIC_LdStStoreUpd, []>,
                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memri:$dst),
                    "stfsu $rS, $dst", IIC_LdStSTFDU, []>,
                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
def STFDU : DForm_1<55, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memri:$dst),
                    "stfdu $rS, $dst", IIC_LdStSTFDU, []>,
                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
}
|
|
|
|
|
|
|
|
// Patterns to match the pre-inc stores.  We can't put the patterns on
// the instruction definitions directly as ISel wants the address base
// and offset to be separate operands, not a single complex operand.
// In each result the offset ($ptroff, matched as iaddroff) is passed before
// the base register ($ptrreg).
def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
          (STBU $rS, iaddroff:$ptroff, $ptrreg)>;
def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
          (STHU $rS, iaddroff:$ptroff, $ptrreg)>;
def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
          (STWU $rS, iaddroff:$ptroff, $ptrreg)>;
def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
          (STFSU $rS, iaddroff:$ptroff, $ptrreg)>;
def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
          (STFDU $rS, iaddroff:$ptroff, $ptrreg)>;
|
2006-11-15 10:43:19 +08:00
|
|
|
|
2006-11-15 03:19:53 +08:00
|
|
|
// Indexed (r+r) Stores.
// Every def has a selection pattern, so no explicit mayStore is written here.
// The integer stores are additionally tagged PPC970_DGroup_Cracked. Ordinary
// stores match an xaddr address; the byte-reversed stores (PPCstbrx, width
// given as i16 / i32) and stfiwx match xoaddr.
let PPC970_Unit = 2 in {
def STBX : XForm_8<31, 215, (outs), (ins gprc:$rS, memrr:$dst),
                   "stbx $rS, $dst", IIC_LdStStore,
                   [(truncstorei8 i32:$rS, xaddr:$dst)]>,
                   PPC970_DGroup_Cracked;
def STHX : XForm_8<31, 407, (outs), (ins gprc:$rS, memrr:$dst),
                   "sthx $rS, $dst", IIC_LdStStore,
                   [(truncstorei16 i32:$rS, xaddr:$dst)]>,
                   PPC970_DGroup_Cracked;
def STWX : XForm_8<31, 151, (outs), (ins gprc:$rS, memrr:$dst),
                   "stwx $rS, $dst", IIC_LdStStore,
                   [(store i32:$rS, xaddr:$dst)]>,
                   PPC970_DGroup_Cracked;

def STHBRX: XForm_8<31, 918, (outs), (ins gprc:$rS, memrr:$dst),
                    "sthbrx $rS, $dst", IIC_LdStStore,
                    [(PPCstbrx i32:$rS, xoaddr:$dst, i16)]>,
                    PPC970_DGroup_Cracked;
def STWBRX: XForm_8<31, 662, (outs), (ins gprc:$rS, memrr:$dst),
                    "stwbrx $rS, $dst", IIC_LdStStore,
                    [(PPCstbrx i32:$rS, xoaddr:$dst, i32)]>,
                    PPC970_DGroup_Cracked;

def STFIWX: XForm_28<31, 983, (outs), (ins f8rc:$frS, memrr:$dst),
                     "stfiwx $frS, $dst", IIC_LdStSTFD,
                     [(PPCstfiwx f64:$frS, xoaddr:$dst)]>;

def STFSX : XForm_28<31, 663, (outs), (ins f4rc:$frS, memrr:$dst),
                     "stfsx $frS, $dst", IIC_LdStSTFD,
                     [(store f32:$frS, xaddr:$dst)]>;
def STFDX : XForm_28<31, 727, (outs), (ins f8rc:$frS, memrr:$dst),
                     "stfdx $frS, $dst", IIC_LdStSTFD,
                     [(store f64:$frS, xaddr:$dst)]>;
}
|
|
|
|
|
2013-03-20 03:52:04 +08:00
|
|
|
// Indexed (r+r) Stores with Update (preinc).
// Pattern-less defs (selected through separate `Pat` records); memory flags
// come from the enclosing `let`. $ea_res is equated with the `ptrreg`
// sub-operand of the memrr address and named in NoEncode. All five are tagged
// PPC970_DGroup_Cracked.
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
def STBUX : XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
                    "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
                    PPC970_DGroup_Cracked;
def STHUX : XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
                    "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
                    PPC970_DGroup_Cracked;
def STWUX : XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
                    "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
                    PPC970_DGroup_Cracked;
def STFSUX: XForm_8<31, 695, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memrr:$dst),
                    "stfsux $rS, $dst", IIC_LdStSTFDU, []>,
                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
                    PPC970_DGroup_Cracked;
def STFDUX: XForm_8<31, 759, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memrr:$dst),
                    "stfdux $rS, $dst", IIC_LdStSTFDU, []>,
                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
                    PPC970_DGroup_Cracked;
}
|
|
|
|
|
|
|
|
// Patterns to match the pre-inc stores.  We can't put the patterns on
// the instruction definitions directly as ISel wants the address base
// and offset to be separate operands, not a single complex operand.
// Here both the base ($ptrreg) and the offset ($ptroff) are iPTR values and
// are passed to the instruction in that order.
def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
          (STBUX $rS, $ptrreg, $ptroff)>;
def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
          (STHUX $rS, $ptrreg, $ptroff)>;
def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
          (STWUX $rS, $ptrreg, $ptroff)>;
def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
          (STFSUX $rS, $ptrreg, $ptroff)>;
def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
          (STFDUX $rS, $ptrreg, $ptroff)>;
|
2013-03-20 03:52:04 +08:00
|
|
|
|
2013-07-04 02:29:47 +08:00
|
|
|
// Store Multiple
// The pattern list is empty, so TableGen has no DAG to infer memory behaviour
// from; mayStore is therefore stated explicitly. hasSideEffects is left
// unspecified on purpose: only $rS is modelled as an input here.
// NOTE(review): the architectural stmw reads more registers than $rS --
// confirm against the Power ISA before tightening the flags further.
let mayStore = 1 in
def STMW : DForm_1<47, (outs), (ins gprc:$rS, memri:$dst),
                   "stmw $rS, $dst", IIC_LdStLMW, []>;
|
2013-07-04 02:29:47 +08:00
|
|
|
|
2013-07-02 00:37:52 +08:00
|
|
|
// sync with an explicit $L immediate operand. No pattern on the def itself;
// it is referenced from separate `Pat` records.
def SYNC : XForm_24_sync<31, 598, (outs), (ins i32imm:$L),
                         "sync $L", IIC_LdStSync, []>;
|
2014-01-23 04:20:52 +08:00
|
|
|
|
|
|
|
// msync: same XForm_24_sync opcode values as SYNC (31, 598) but with no
// operands and the L field fixed to 0. Marked isCodeGenOnly.
let isCodeGenOnly = 1 in {
def MSYNC : XForm_24_sync<31, 598, (outs), (ins),
                          "msync", IIC_LdStSync, []> {
  let L = 0;
}
}
|
|
|
|
|
2014-10-03 06:34:22 +08:00
|
|
|
// Selection of the sync intrinsics. Under HasSYNC, int_ppc_sync maps to
// SYNC 0 and int_ppc_lwsync to SYNC 1; under HasOnlyMSYNC both map to MSYNC.
def : Pat<(int_ppc_sync),   (SYNC 0)>, Requires<[HasSYNC]>;
def : Pat<(int_ppc_lwsync), (SYNC 1)>, Requires<[HasSYNC]>;
def : Pat<(int_ppc_sync),   (MSYNC)>, Requires<[HasOnlyMSYNC]>;
def : Pat<(int_ppc_lwsync), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
|
2006-11-15 03:19:53 +08:00
|
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// PPC32 Arithmetic Instructions.
|
|
|
|
//
|
2006-11-08 10:13:12 +08:00
|
|
|
|
2006-03-12 17:13:49 +08:00
|
|
|
// Immediate-operand integer arithmetic and load-immediate instructions.
// Patterns use imm32SExt16 for the s16imm forms and imm16ShiftedSExt for the
// s17imm (addis/lis) forms.
let PPC970_Unit = 1 in { // FXU Operations.
// addi/addis/la take $rA from gprc_nor0 rather than gprc.
def ADDI : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$imm),
                   "addi $rD, $rA, $imm", IIC_IntSimple,
                   [(set i32:$rD, (add i32:$rA, imm32SExt16:$imm))]>;
// addic and its record form addic. share a BaseName and are linked through
// RecFormRel. addic defines CARRY; addic. defines CARRY and CR0 and has no
// pattern.
let BaseName = "addic" in {
let Defs = [CARRY] in
def ADDIC : DForm_2<12, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
                    "addic $rD, $rA, $imm", IIC_IntGeneral,
                    [(set i32:$rD, (addc i32:$rA, imm32SExt16:$imm))]>,
                    RecFormRel, PPC970_DGroup_Cracked;
let Defs = [CARRY, CR0] in
def ADDICo : DForm_2<13, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
                     "addic. $rD, $rA, $imm", IIC_IntGeneral,
                     []>, isDOT, RecFormRel;
}
def ADDIS : DForm_2<15, (outs gprc:$rD), (ins gprc_nor0:$rA, s17imm:$imm),
                    "addis $rD, $rA, $imm", IIC_IntSimple,
                    [(set i32:$rD, (add i32:$rA, imm16ShiftedSExt:$imm))]>;
// la: same DForm_2 opcode as ADDI (14), marked isCodeGenOnly; matches an add
// of $rA and the PPClo part of a tglobaladdr.
let isCodeGenOnly = 1 in
def LA : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$sym),
                 "la $rD, $sym($rA)", IIC_IntGeneral,
                 [(set i32:$rD, (add i32:$rA,
                                     (PPClo tglobaladdr:$sym, 0)))]>;
def MULLI : DForm_2< 7, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
                     "mulli $rD, $rA, $imm", IIC_IntMulLI,
                     [(set i32:$rD, (mul i32:$rA, imm32SExt16:$imm))]>;
// subfic computes imm - rA (note the operand order in the subc pattern) and
// defines CARRY.
let Defs = [CARRY] in
def SUBFIC : DForm_2< 8, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
                      "subfic $rD, $rA, $imm", IIC_IntGeneral,
                      [(set i32:$rD, (subc imm32SExt16:$imm, i32:$rA))]>;

// Load-immediate forms: flagged rematerializable, as cheap as a move, and as
// move-immediates.
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
  def LI  : DForm_2_r0<14, (outs gprc:$rD), (ins s16imm:$imm),
                       "li $rD, $imm", IIC_IntSimple,
                       [(set i32:$rD, imm32SExt16:$imm)]>;
  def LIS : DForm_2_r0<15, (outs gprc:$rD), (ins s17imm:$imm),
                       "lis $rD, $imm", IIC_IntSimple,
                       [(set i32:$rD, imm16ShiftedSExt:$imm)]>;
}
}
|
2006-11-15 03:19:53 +08:00
|
|
|
|
2006-03-12 17:13:49 +08:00
|
|
|
// Immediate-operand logical instructions, nops, and integer compares.
let PPC970_Unit = 1 in { // FXU Operations.
// andi. / andis. are record forms (isDOT) and define CR0.
let Defs = [CR0] in {
def ANDIo : DForm_4<28, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
                    "andi. $dst, $src1, $src2", IIC_IntGeneral,
                    [(set i32:$dst, (and i32:$src1, immZExt16:$src2))]>,
                    isDOT;
def ANDISo : DForm_4<29, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
                    "andis. $dst, $src1, $src2", IIC_IntGeneral,
                    [(set i32:$dst, (and i32:$src1, imm16ShiftedZExt:$src2))]>,
                    isDOT;
}
// The plain forms match immZExt16; the `s`-suffixed forms match
// imm16ShiftedZExt.
def ORI   : DForm_4<24, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
                    "ori $dst, $src1, $src2", IIC_IntSimple,
                    [(set i32:$dst, (or i32:$src1, immZExt16:$src2))]>;
def ORIS  : DForm_4<25, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
                    "oris $dst, $src1, $src2", IIC_IntSimple,
                    [(set i32:$dst, (or i32:$src1, imm16ShiftedZExt:$src2))]>;
def XORI  : DForm_4<26, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
                    "xori $dst, $src1, $src2", IIC_IntSimple,
                    [(set i32:$dst, (xor i32:$src1, immZExt16:$src2))]>;
def XORIS : DForm_4<27, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
                    "xoris $dst, $src1, $src2", IIC_IntSimple,
                    [(set i32:$dst, (xor i32:$src1, imm16ShiftedZExt:$src2))]>;

// nop shares primary opcode 24 with ORI.
def NOP : DForm_4_zero<24, (outs), (ins), "nop", IIC_IntSimple,
                       []>;
let isCodeGenOnly = 1 in {
// The POWER6 and POWER7 have special group-terminating nops.
def NOP_GT_PWR6 : DForm_4_fixedreg_zero<24, 1, (outs), (ins),
                                        "ori 1, 1, 0", IIC_IntSimple, []>;
def NOP_GT_PWR7 : DForm_4_fixedreg_zero<24, 2, (outs), (ins),
                                        "ori 2, 2, 0", IIC_IntSimple, []>;
}

let isCompare = 1, hasSideEffects = 0 in {
  def CMPWI : DForm_5_ext<11, (outs crrc:$crD), (ins gprc:$rA, s16imm:$imm),
                          "cmpwi $crD, $rA, $imm", IIC_IntCompare>;
  def CMPLWI : DForm_6_ext<10, (outs crrc:$dst), (ins gprc:$src1, u16imm:$src2),
                           "cmplwi $dst, $src1, $src2", IIC_IntCompare>;
  // Only available under IsISA3_0.
  // NOTE(review): unlike the two compares above, the result is crbitrc (not
  // crrc) and the sources are g8rc (not gprc) -- confirm these register
  // classes are intended.
  def CMPRB : X_BF3_L1_RS5_RS5<31, 192, (outs crbitrc:$BF),
                               (ins u1imm:$L, g8rc:$rA, g8rc:$rB),
                               "cmprb $BF, $L, $rA, $rB", IIC_IntCompare, []>,
              Requires<[IsISA3_0]>;
}
}
|
2006-07-11 04:56:58 +08:00
|
|
|
|
2014-11-26 08:46:26 +08:00
|
|
|
let PPC970_Unit = 1, hasSideEffects = 0 in { // FXU Operations.
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
defm NAND : XForm_6r<31, 476, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"nand", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rA, (not (and i32:$rS, i32:$rB)))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm AND : XForm_6r<31, 28, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"and", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rA, (and i32:$rS, i32:$rB))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
} // isCommutable
|
2013-04-27 00:53:15 +08:00
|
|
|
defm ANDC : XForm_6r<31, 60, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"andc", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rA, (and i32:$rS, (not i32:$rB)))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
defm OR : XForm_6r<31, 444, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"or", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rA, (or i32:$rS, i32:$rB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm NOR : XForm_6r<31, 124, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"nor", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rA, (not (or i32:$rS, i32:$rB)))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
} // isCommutable
|
2013-04-27 00:53:15 +08:00
|
|
|
defm ORC : XForm_6r<31, 412, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"orc", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rA, (or i32:$rS, (not i32:$rB)))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
// EQV: 32-bit bitwise equivalence; the pattern selects (not (xor $rS, $rB)).
defm EQV : XForm_6r<31, 284, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"eqv", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rA, (not (xor i32:$rS, i32:$rB)))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// XOR: 32-bit bitwise XOR of $rS and $rB, result in $rA.
defm XOR : XForm_6r<31, 316, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"xor", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rA, (xor i32:$rS, i32:$rB))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
} // isCommutable
|
2013-04-27 00:53:15 +08:00
|
|
|
// SLW: "slw" with a register shift amount ($rB). Selected from the
// target-specific PPCshl node (declared outside this chunk), not generic shl.
defm SLW : XForm_6r<31, 24, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"slw", "$rA, $rS, $rB", IIC_IntGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rA, (PPCshl i32:$rS, i32:$rB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// SRW: "srw" with a register shift amount ($rB). Selected from the
// target-specific PPCsrl node (declared outside this chunk), not generic srl.
defm SRW : XForm_6r<31, 536, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"srw", "$rA, $rS, $rB", IIC_IntGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rA, (PPCsrl i32:$rS, i32:$rB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// SRAW: "sraw" with a register shift amount, selected from the
// target-specific PPCsra node. Instantiates XForm_6rc (not XForm_6r as the
// other register shifts in this group do) and uses itinerary IIC_IntShift.
defm SRAW : XForm_6rc<31, 792, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"sraw", "$rA, $rS, $rB", IIC_IntShift,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i32:$rA, (PPCsra i32:$rS, i32:$rB))]>;
|
2009-09-19 04:15:22 +08:00
|
|
|
}
|
2006-11-15 03:19:53 +08:00
|
|
|
|
2006-03-12 17:13:49 +08:00
|
|
|
let PPC970_Unit = 1 in { // FXU Operations.
|
2014-11-26 08:46:26 +08:00
|
|
|
let hasSideEffects = 0 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
// SRAWI: immediate-shift form; selects the generic (sra $rS, imm) node with
// the shift amount carried in a u5imm operand ($SH).
defm SRAWI : XForm_10rc<31, 824, (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH),
|
2013-11-28 07:26:09 +08:00
|
|
|
"srawi", "$rA, $rS, $SH", IIC_IntShift,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i32:$rA, (sra i32:$rS, (i32 imm:$SH)))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// CNTLZW: selects the ctlz node on an i32 operand.
defm CNTLZW : XForm_11r<31, 26, (outs gprc:$rA), (ins gprc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"cntlzw", "$rA, $rS", IIC_IntGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rA, (ctlz i32:$rS))]>;
|
2016-04-14 02:51:18 +08:00
|
|
|
// CNTTZW: selects the cttz node on an i32 operand. Unlike its neighbours it
// is gated by a subtarget predicate: Requires<[IsISA3_0]>.
defm CNTTZW : XForm_11r<31, 538, (outs gprc:$rA), (ins gprc:$rS),
|
|
|
|
"cnttzw", "$rA, $rS", IIC_IntGeneral,
|
|
|
|
[(set i32:$rA, (cttz i32:$rS))]>, Requires<[IsISA3_0]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// EXTSB: selects (sext_inreg $rS, i8), i.e. sign-extension from 8 bits
// within an i32 register.
defm EXTSB : XForm_11r<31, 954, (outs gprc:$rA), (ins gprc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"extsb", "$rA, $rS", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rA, (sext_inreg i32:$rS, i8))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// EXTSH: selects (sext_inreg $rS, i16), i.e. sign-extension from 16 bits
// within an i32 register.
defm EXTSH : XForm_11r<31, 922, (outs gprc:$rA), (ins gprc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"extsh", "$rA, $rS", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rA, (sext_inreg i32:$rS, i16))]>;
|
2015-01-03 09:16:37 +08:00
|
|
|
|
|
|
|
// CMPB: selects the target-specific PPCcmpb node on i32 operands. This is a
// plain `def` of XForm_6 (operands are embedded in the single asm string),
// and the brace-less `let` marks only this one record as commutable.
let isCommutable = 1 in
|
|
|
|
def CMPB : XForm_6<31, 508, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
|
|
|
|
"cmpb $rA, $rS, $rB", IIC_IntGeneral,
|
|
|
|
[(set i32:$rA, (PPCcmpb i32:$rS, i32:$rB))]>;
|
2013-04-12 10:18:09 +08:00
|
|
|
}
|
2014-11-26 08:46:26 +08:00
|
|
|
let isCompare = 1, hasSideEffects = 0 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
// CMPW: "cmpw" compares two gprc registers and writes a crrc result ($crD).
// No pattern list is passed to XForm_16_ext here.
def CMPW : XForm_16_ext<31, 0, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"cmpw $crD, $rA, $rB", IIC_IntCompare>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// CMPLW: "cmplw" has the same operand shape as CMPW (two gprc inputs, crrc
// result) but passes 32 as the second class argument. No pattern list is
// passed to XForm_16_ext here.
def CMPLW : XForm_16_ext<31, 32, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"cmplw $crD, $rA, $rB", IIC_IntCompare>;
|
2013-04-15 10:37:46 +08:00
|
|
|
}
|
2006-03-12 17:13:49 +08:00
|
|
|
}
|
|
|
|
let PPC970_Unit = 3 in { // FPU Operations.
|
Change instruction description to split OperandList into OutOperandList and
InOperandList. This gives one piece of important information: # of results
produced by an instruction.
An example of the change:
def ADD32rr : I<0x01, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
"add{l} {$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (add GR32:$src1, GR32:$src2))]>;
=>
def ADD32rr : I<0x01, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
"add{l} {$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (add GR32:$src1, GR32:$src2))]>;
llvm-svn: 40033
2007-07-19 09:14:50 +08:00
|
|
|
//def FCMPO : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB),
|
2013-11-28 07:26:09 +08:00
|
|
|
// "fcmpo $crD, $fA, $fB", IIC_FPCompare>;
|
2014-11-26 08:46:26 +08:00
|
|
|
let isCompare = 1, hasSideEffects = 0 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
// FCMPUS: "fcmpu" over the f4rc register class, writing a crrc result.
// No pattern list is passed to XForm_17 here.
def FCMPUS : XForm_17<63, 0, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fcmpu $crD, $fA, $fB", IIC_FPCompare>;
|
2013-12-18 07:05:18 +08:00
|
|
|
// FCMPUD: f8rc variant of "fcmpu". It reuses the same class arguments
// (63, 0) and asm string as FCMPUS and is flagged isCodeGenOnly with
// Interpretation64Bit.
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
def FCMPUD : XForm_17<63, 0, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fcmpu $crD, $fA, $fB", IIC_FPCompare>;
|
2013-04-15 10:37:46 +08:00
|
|
|
}
|
2006-11-15 03:19:53 +08:00
|
|
|
|
2017-01-05 23:00:45 +08:00
|
|
|
// FTDIV: "ftdiv" takes two f8rc inputs and writes a crrc result. No pattern
// list is passed, and the def sits outside the isCompare/hasSideEffects = 0
// group that closes just above it.
def FTDIV: XForm_17<63, 128, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB),
|
|
|
|
"ftdiv $crD, $fA, $fB", IIC_FPCompare>;
|
|
|
|
// FTSQRT: "ftsqrt" takes a single f8rc input ($fB) and writes a crrc result.
// Uses XForm_17a rather than XForm_17; no pattern list is passed.
def FTSQRT: XForm_17a<63, 160, (outs crrc:$crD), (ins f8rc:$fB),
|
|
|
|
"ftsqrt $crD, $fB", IIC_FPCompare>;
|
|
|
|
|
2008-10-30 02:26:45 +08:00
|
|
|
let Uses = [RM] in {
|
2014-11-26 08:46:26 +08:00
|
|
|
let hasSideEffects = 0 in {
|
2013-09-26 12:11:24 +08:00
|
|
|
// FCTIW: "fctiw" on f8rc registers. The pattern list is empty, so this
// record is not selected from any DAG node by this definition.
defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fctiw", "$frD, $frB", IIC_FPGeneral,
|
2013-09-26 13:22:11 +08:00
|
|
|
[]>;
|
2017-01-05 23:00:45 +08:00
|
|
|
// FCTIWU: "fctiwu" on f8rc registers. The pattern list is empty, so this
// record is not selected from any DAG node by this definition.
defm FCTIWU : XForm_26r<63, 142, (outs f8rc:$frD), (ins f8rc:$frB),
|
|
|
|
"fctiwu", "$frD, $frB", IIC_FPGeneral,
|
|
|
|
[]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// FCTIWZ: selects the target-specific PPCfctiwz node; both the input and the
// result are typed f64 (the result stays in an f8rc register).
defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fctiwz", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (PPCfctiwz f64:$frB))]>;
|
2013-03-29 16:57:48 +08:00
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
// FRSP: selects fpround from f64 to f32 (f8rc input, f4rc result).
defm FRSP : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"frsp", "$frD, $frB", IIC_FPGeneral,
|
2016-08-19 04:08:15 +08:00
|
|
|
[(set f32:$frD, (fpround f64:$frB))]>;
|
2013-03-29 16:57:48 +08:00
|
|
|
|
2013-12-18 07:05:18 +08:00
|
|
|
// FRIND: f64 form of "frin"; selects the fround node. Shares class arguments
// (63, 392) and asm string with FRINS and is flagged isCodeGenOnly.
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FRIND : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"frin", "$frD, $frB", IIC_FPGeneral,
|
2016-08-19 04:08:15 +08:00
|
|
|
[(set f64:$frD, (fround f64:$frB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// FRINS: f32 form of "frin"; selects the fround node on f4rc registers.
defm FRINS : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"frin", "$frD, $frB", IIC_FPGeneral,
|
2016-08-19 04:08:15 +08:00
|
|
|
[(set f32:$frD, (fround f32:$frB))]>;
|
2013-03-30 03:41:55 +08:00
|
|
|
}
|
|
|
|
|
2014-11-26 08:46:26 +08:00
|
|
|
let hasSideEffects = 0 in {
|
2013-12-18 07:05:18 +08:00
|
|
|
// FRIPD: f64 form of "frip"; selects the fceil node. Shares class arguments
// (63, 456) and asm string with FRIPS and is flagged isCodeGenOnly.
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FRIPD : XForm_26r<63, 456, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"frip", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (fceil f64:$frB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// FRIPS: f32 form of "frip"; selects the fceil node on f4rc registers.
defm FRIPS : XForm_26r<63, 456, (outs f4rc:$frD), (ins f4rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"frip", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$frD, (fceil f32:$frB))]>;
|
2013-12-18 07:05:18 +08:00
|
|
|
// FRIZD: f64 form of "friz"; selects the ftrunc node. Shares class arguments
// (63, 424) and asm string with FRIZS and is flagged isCodeGenOnly.
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FRIZD : XForm_26r<63, 424, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"friz", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (ftrunc f64:$frB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// FRIZS: f32 form of "friz"; selects the ftrunc node on f4rc registers.
defm FRIZS : XForm_26r<63, 424, (outs f4rc:$frD), (ins f4rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"friz", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$frD, (ftrunc f32:$frB))]>;
|
2013-12-18 07:05:18 +08:00
|
|
|
// FRIMD: f64 form of "frim"; selects the ffloor node. Shares class arguments
// (63, 488) and asm string with FRIMS and is flagged isCodeGenOnly.
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FRIMD : XForm_26r<63, 488, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"frim", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (ffloor f64:$frB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// FRIMS: f32 form of "frim"; selects the ffloor node on f4rc registers.
defm FRIMS : XForm_26r<63, 488, (outs f4rc:$frD), (ins f4rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"frim", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$frD, (ffloor f32:$frB))]>;
|
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
// FSQRT: selects the fsqrt node on f64; scheduled with itinerary IIC_FPSqrtD.
defm FSQRT : XForm_26r<63, 22, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-12-01 04:41:13 +08:00
|
|
|
"fsqrt", "$frD, $frB", IIC_FPSqrtD,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (fsqrt f64:$frB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// FSQRTS: selects the fsqrt node on f32. Differs from FSQRT in the first
// class argument (59 instead of 63), its own mnemonic, and IIC_FPSqrtS.
defm FSQRTS : XForm_26r<59, 22, (outs f4rc:$frD), (ins f4rc:$frB),
|
2013-12-01 04:41:13 +08:00
|
|
|
"fsqrts", "$frD, $frB", IIC_FPSqrtS,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$frD, (fsqrt f32:$frB))]>;
|
|
|
|
}
|
2008-10-30 02:26:45 +08:00
|
|
|
}
|
2006-03-12 17:13:49 +08:00
|
|
|
}
|
2005-10-01 09:35:02 +08:00
|
|
|
|
2010-07-17 05:03:52 +08:00
|
|
|
/// Note that FMR is defined as a pseudo-op on the PPC970 because it is
|
2006-03-24 15:12:19 +08:00
|
|
|
/// often coalesced away and we don't want the dispatch group builder to think
|
2006-03-12 17:13:49 +08:00
|
|
|
/// that it will fill a slot (which could cause the load of an LSU reject to
|
|
|
|
/// sneak into a d-group with a store).
|
2014-11-26 08:46:26 +08:00
|
|
|
// FMR: "fmr" on f4rc registers with an empty pattern list; a candidate
// pattern survives only as the trailing comment on the `[]>` line. The record
// also mixes in PPC970_Unit_Pseudo.
let hasSideEffects = 0 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FMR : XForm_26r<63, 72, (outs f4rc:$frD), (ins f4rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fmr", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[]>, // (set f32:$frD, f32:$frB)
|
|
|
|
PPC970_Unit_Pseudo;
|
2005-10-01 09:35:02 +08:00
|
|
|
|
2014-11-26 08:46:26 +08:00
|
|
|
let PPC970_Unit = 3, hasSideEffects = 0 in { // FPU Operations.
|
2005-10-01 09:35:02 +08:00
|
|
|
// These are artificially split into two different forms, for 4/8 byte FP.
|
2013-04-27 00:53:15 +08:00
|
|
|
// FABSS: f32 form of "fabs"; selects the fabs node on f4rc registers.
defm FABSS : XForm_26r<63, 264, (outs f4rc:$frD), (ins f4rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fabs", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$frD, (fabs f32:$frB))]>;
|
2013-12-18 07:05:18 +08:00
|
|
|
// FABSD: f64 form of "fabs"; selects the fabs node. Shares class arguments
// (63, 264) and asm string with FABSS and is flagged isCodeGenOnly.
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FABSD : XForm_26r<63, 264, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fabs", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (fabs f64:$frB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// FNABSS: f32 form of "fnabs"; selects the composite (fneg (fabs $frB)).
defm FNABSS : XForm_26r<63, 136, (outs f4rc:$frD), (ins f4rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fnabs", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$frD, (fneg (fabs f32:$frB)))]>;
|
2013-12-18 07:05:18 +08:00
|
|
|
// FNABSD: f64 form of "fnabs"; selects (fneg (fabs $frB)). Shares class
// arguments (63, 136) and asm string with FNABSS and is flagged isCodeGenOnly.
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FNABSD : XForm_26r<63, 136, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fnabs", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (fneg (fabs f64:$frB)))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// FNEGS: f32 form of "fneg"; selects the fneg node on f4rc registers.
defm FNEGS : XForm_26r<63, 40, (outs f4rc:$frD), (ins f4rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fneg", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$frD, (fneg f32:$frB))]>;
|
2013-12-18 07:05:18 +08:00
|
|
|
// FNEGD: f64 form of "fneg"; selects the fneg node. Shares class arguments
// (63, 40) and asm string with FNEGS and is flagged isCodeGenOnly.
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FNEGD : XForm_26r<63, 40, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fneg", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (fneg f64:$frB))]>;
|
2013-04-03 12:01:11 +08:00
|
|
|
|
2013-08-19 13:01:02 +08:00
|
|
|
// FCPSGNS: f32 form of "fcpsgn"; selects the fcopysign node. Note the operand
// swap: the asm string lists $frA before $frB, but the pattern is
// (fcopysign $frB, $frA).
defm FCPSGNS : XForm_28r<63, 8, (outs f4rc:$frD), (ins f4rc:$frA, f4rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
|
2013-08-19 13:01:02 +08:00
|
|
|
[(set f32:$frD, (fcopysign f32:$frB, f32:$frA))]>;
|
2013-12-18 07:05:18 +08:00
|
|
|
// FCPSGND: f64 form of "fcpsgn", flagged isCodeGenOnly. As in the f32 form,
// the pattern is (fcopysign $frB, $frA) while the asm string lists $frA
// before $frB.
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-08-19 13:01:02 +08:00
|
|
|
defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$frD), (ins f8rc:$frA, f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
|
2013-08-19 13:01:02 +08:00
|
|
|
[(set f64:$frD, (fcopysign f64:$frB, f64:$frA))]>;
|
|
|
|
|
2013-04-03 12:01:11 +08:00
|
|
|
// Reciprocal estimates.
|
2013-04-27 00:53:15 +08:00
|
|
|
// FRE: selects the target-specific PPCfre node on f64 (grouped under the
// "Reciprocal estimates" heading in this file).
defm FRE : XForm_26r<63, 24, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fre", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (PPCfre f64:$frB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// FRES: selects the same PPCfre node as FRE but on f32; the first class
// argument is 59 instead of 63 and it has its own mnemonic.
defm FRES : XForm_26r<59, 24, (outs f4rc:$frD), (ins f4rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fres", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$frD, (PPCfre f32:$frB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// FRSQRTE: selects the target-specific PPCfrsqrte node on f64.
defm FRSQRTE : XForm_26r<63, 26, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"frsqrte", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (PPCfrsqrte f64:$frB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
// FRSQRTES: selects the same PPCfrsqrte node as FRSQRTE but on f32; the first
// class argument is 59 instead of 63 and it has its own mnemonic.
defm FRSQRTES : XForm_26r<59, 26, (outs f4rc:$frD), (ins f4rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"frsqrtes", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$frD, (PPCfrsqrte f32:$frB))]>;
|
2006-03-12 17:13:49 +08:00
|
|
|
}
|
2004-08-30 06:45:13 +08:00
|
|
|
|
2004-08-30 10:28:06 +08:00
|
|
|
// XL-Form instructions. condition register logical ops.
|
|
|
|
//
|
2014-11-26 08:46:26 +08:00
|
|
|
// MCRF: "mcrf $BF, $BFA" with a crrc source and a crrc destination. No
// pattern list is passed; the record mixes in PPC970_DGroup_First and
// PPC970_Unit_CRU.
let hasSideEffects = 0 in
|
2013-04-27 00:53:15 +08:00
|
|
|
def MCRF : XLForm_3<19, 0, (outs crrc:$BF), (ins crrc:$BFA),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mcrf $BF, $BFA", IIC_BrMCR>,
|
2006-03-12 17:13:49 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_CRU;
|
2004-08-30 10:28:06 +08:00
|
|
|
|
2015-01-07 08:15:29 +08:00
|
|
|
// FIXME: According to the ISA (section 2.5.1 of version 2.06), the
|
|
|
|
// condition-register logical instructions have preferred forms. Specifically,
|
|
|
|
// it is preferred that the bit specified by the BT field be in the same
|
|
|
|
// condition register as that specified by the bit BB. We might want to account
|
|
|
|
// for this via hinting the register allocator and anti-dep breakers, or we
|
|
|
|
// could constrain the register class to force this constraint and then loosen
|
|
|
|
// it during register allocation via convertToThreeAddress or some similar
|
|
|
|
// mechanism.
|
|
|
|
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in {
|
2013-07-02 05:40:54 +08:00
|
|
|
// CRAND: i1 AND of two crbitrc operands ($CRA, $CRB) into $CRD.
def CRAND : XLForm_1<19, 257, (outs crbitrc:$CRD),
|
|
|
|
(ins crbitrc:$CRA, crbitrc:$CRB),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
"crand $CRD, $CRA, $CRB", IIC_BrCR,
|
|
|
|
[(set i1:$CRD, (and i1:$CRA, i1:$CRB))]>;
|
2013-07-02 05:40:54 +08:00
|
|
|
|
|
|
|
// CRNAND: i1 NAND; the pattern selects (not (and $CRA, $CRB)) into $CRD.
def CRNAND : XLForm_1<19, 225, (outs crbitrc:$CRD),
|
|
|
|
(ins crbitrc:$CRA, crbitrc:$CRB),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
"crnand $CRD, $CRA, $CRB", IIC_BrCR,
|
|
|
|
[(set i1:$CRD, (not (and i1:$CRA, i1:$CRB)))]>;
|
2013-07-02 05:40:54 +08:00
|
|
|
|
|
|
|
// CROR: i1 OR of two crbitrc operands ($CRA, $CRB) into $CRD.
def CROR : XLForm_1<19, 449, (outs crbitrc:$CRD),
|
|
|
|
(ins crbitrc:$CRA, crbitrc:$CRB),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
"cror $CRD, $CRA, $CRB", IIC_BrCR,
|
|
|
|
[(set i1:$CRD, (or i1:$CRA, i1:$CRB))]>;
|
2013-07-02 05:40:54 +08:00
|
|
|
|
|
|
|
// CRXOR: i1 XOR of two crbitrc operands ($CRA, $CRB) into $CRD.
def CRXOR : XLForm_1<19, 193, (outs crbitrc:$CRD),
|
|
|
|
(ins crbitrc:$CRA, crbitrc:$CRB),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
"crxor $CRD, $CRA, $CRB", IIC_BrCR,
|
|
|
|
[(set i1:$CRD, (xor i1:$CRA, i1:$CRB))]>;
|
2013-07-02 05:40:54 +08:00
|
|
|
|
|
|
|
// CRNOR: i1 NOR; the pattern selects (not (or $CRA, $CRB)) into $CRD.
def CRNOR : XLForm_1<19, 33, (outs crbitrc:$CRD),
|
|
|
|
(ins crbitrc:$CRA, crbitrc:$CRB),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
"crnor $CRD, $CRA, $CRB", IIC_BrCR,
|
|
|
|
[(set i1:$CRD, (not (or i1:$CRA, i1:$CRB)))]>;
|
2013-07-02 05:40:54 +08:00
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
// CREQV - CR-bit equivalence. Per the selection pattern at the end of this
// def, the destination bit $CRD receives not(xor($CRA, $CRB)), i.e. it is set
// when the two source bits are equal. Printed as "creqv $CRD, $CRA, $CRB".
def CREQV : XLForm_1<19, 289, (outs crbitrc:$CRD),
|
|
|
|
(ins crbitrc:$CRA, crbitrc:$CRB),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
"creqv $CRD, $CRA, $CRB", IIC_BrCR,
|
|
|
|
[(set i1:$CRD, (not (xor i1:$CRA, i1:$CRB)))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
} // isCommutable
|
2007-02-25 13:34:32 +08:00
|
|
|
|
2013-07-02 05:40:54 +08:00
|
|
|
// CRANDC - CR-bit AND with complement. Per the selection pattern at the end of
// this def, $CRD receives $CRA and (not $CRB). The operand order matters: only
// the second source ($CRB) is complemented.
def CRANDC : XLForm_1<19, 129, (outs crbitrc:$CRD),
|
2013-04-27 00:53:15 +08:00
|
|
|
(ins crbitrc:$CRA, crbitrc:$CRB),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
"crandc $CRD, $CRA, $CRB", IIC_BrCR,
|
|
|
|
[(set i1:$CRD, (and i1:$CRA, (not i1:$CRB)))]>;
|
2013-07-02 05:40:54 +08:00
|
|
|
|
|
|
|
// CRORC - CR-bit OR with complement. Per the selection pattern at the end of
// this def, $CRD receives $CRA or (not $CRB). As with CRANDC, only the second
// source ($CRB) is complemented.
def CRORC : XLForm_1<19, 417, (outs crbitrc:$CRD),
|
|
|
|
(ins crbitrc:$CRA, crbitrc:$CRB),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
"crorc $CRD, $CRA, $CRB", IIC_BrCR,
|
|
|
|
[(set i1:$CRD, (or i1:$CRA, (not i1:$CRB)))]>;
|
2008-03-10 22:12:10 +08:00
|
|
|
|
2013-03-26 18:57:16 +08:00
|
|
|
// Code-generation-only helpers that materialize constant CR bits. Each one
// takes no inputs and prints an ordinary CR-logical mnemonic (creqv or crxor)
// with the same bit in all three operand slots:
//   CRSET / CRUNSET   - allocatable destination bit $dst; selected for the i1
//                       constants 1 and 0 respectively.
//   CR6SET / CR6UNSET - hard-wired to bit 6 (see the inner let below).
let isCodeGenOnly = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
// CRSET: $dst := 1, printed as "creqv $dst, $dst, $dst".
def CRSET : XLForm_1_ext<19, 289, (outs crbitrc:$dst), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"creqv $dst, $dst, $dst", IIC_BrCR,
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
[(set i1:$dst, 1)]>;
|
2007-02-25 13:34:32 +08:00
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
// CRUNSET: $dst := 0, printed as "crxor $dst, $dst, $dst".
def CRUNSET: XLForm_1_ext<19, 193, (outs crbitrc:$dst), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"crxor $dst, $dst, $dst", IIC_BrCR,
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
[(set i1:$dst, 0)]>;
|
2011-08-31 01:04:16 +08:00
|
|
|
|
2012-08-28 10:10:27 +08:00
|
|
|
// Fixed-operand forms: no register operands at all. The CRD field is forced to
// 6 and CR1EQ is declared as the implicit def; the asm strings hard-code bit 6.
// Selected for the PPCcr6set / PPCcr6unset nodes.
let Defs = [CR1EQ], CRD = 6 in {
|
|
|
|
def CR6SET : XLForm_1_ext<19, 289, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"creqv 6, 6, 6", IIC_BrCR,
|
2012-08-28 10:10:27 +08:00
|
|
|
[(PPCcr6set)]>;
|
|
|
|
|
|
|
|
def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"crxor 6, 6, 6", IIC_BrCR,
|
2012-08-28 10:10:27 +08:00
|
|
|
[(PPCcr6unset)]>;
|
|
|
|
}
|
2013-03-26 18:57:16 +08:00
|
|
|
}
|
2012-08-28 10:10:27 +08:00
|
|
|
|
2006-03-12 17:13:49 +08:00
|
|
|
// XFX-Form instructions. Instructions that deal with SPRs.
|
2004-08-30 10:28:06 +08:00
|
|
|
//
|
2013-07-03 20:32:41 +08:00
|
|
|
|
|
|
|
// MFSPR - generic move-from-SPR into GPR $RT. The SPR is passed as a plain
// i32 immediate operand ($SPR); no selection pattern is attached to this def.
def MFSPR : XFXForm_1<31, 339, (outs gprc:$RT), (ins i32imm:$SPR),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mfspr $RT, $SPR", IIC_SprMFSPR>;
|
2013-07-03 20:32:41 +08:00
|
|
|
// MTSPR - generic move-to-SPR from GPR $RT. The SPR is passed as a plain i32
// immediate operand ($SPR) and is printed first; the def has no outputs and no
// selection pattern.
def MTSPR : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, gprc:$RT),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtspr $SPR, $RT", IIC_SprMTSPR>;
|
2013-07-03 20:32:41 +08:00
|
|
|
|
2013-07-08 23:20:38 +08:00
|
|
|
// MFTB - same operand shape as MFSPR (GPR result, i32 immediate $SPR) but with
// extended opcode 371, the "mftb" mnemonic and its own itinerary class
// (IIC_SprMFTB). No selection pattern is attached.
def MFTB : XFXForm_1<31, 371, (outs gprc:$RT), (ins i32imm:$SPR),
|
2015-06-17 00:01:15 +08:00
|
|
|
"mftb $RT, $SPR", IIC_SprMFTB>;
|
2013-07-08 23:20:38 +08:00
|
|
|
|
2014-12-03 06:01:00 +08:00
|
|
|
// A pseudo-instruction used to implement the read of the 64-bit cycle counter
|
|
|
|
// on a 32-bit target.
// It yields two GPR results ($lo, $hi), takes no inputs and has an empty
// pattern list. It is flagged usesCustomInserter, so its expansion is not
// described in this file (presumably it lives in the target's custom-inserter
// code - confirm), and hasSideEffects.
|
|
|
|
|
let hasSideEffects = 1, usesCustomInserter = 1 in
|
|
|
|
def ReadTB : Pseudo<(outs gprc:$lo, gprc:$hi), (ins),
|
|
|
|
"#ReadTB", []>;
|
|
|
|
|
2008-10-24 04:41:28 +08:00
|
|
|
// MFCTR - copy the count register into GPR $rT. CTR is not an explicit
// operand; it is declared as an implicit use by the enclosing let.
// NOTE(review): the third template argument (9) is presumably the SPR number
// baked into the encoding - confirm against XFXForm_1_ext.
let Uses = [CTR] in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def MFCTR : XFXForm_1_ext<31, 339, 9, (outs gprc:$rT), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mfctr $rT", IIC_SprMFSPR>,
|
2006-03-12 17:13:49 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_FXU;
|
2008-10-24 04:41:28 +08:00
|
|
|
}
|
2013-03-26 03:05:30 +08:00
|
|
|
// MTCTR - copy GPR $rS into the count register. CTR is an implicit def, and
// the selection pattern (the PPCmtctr node on an i32 value) is supplied by the
// enclosing let rather than by the def's own argument list.
let Defs = [CTR], Pattern = [(PPCmtctr i32:$rS)] in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtctr $rS", IIC_SprMTSPR>,
|
For functions that use vector registers, save VRSAVE, mark used
registers, and update it on entry to each function, then restore it on exit.
This compiles:
void func(vfloat *a, vfloat *b, vfloat *c) {
*a = *b * *c + *c;
}
to this:
_func:
mfspr r2, 256
oris r6, r2, 49152
mtspr 256, r6
lvx v0, 0, r5
lvx v1, 0, r4
vmaddfp v0, v1, v0, v0
stvx v0, 0, r3
mtspr 256, r2
blr
GCC produces this (which has additional stack accesses):
_func:
mfspr r0,256
stw r0,-4(r1)
oris r0,r0,0xc000
mtspr 256,r0
lvx v0,0,r5
lvx v1,0,r4
lwz r12,-4(r1)
vmaddfp v0,v0,v1,v0
stvx v0,0,r3
mtspr 256,r12
blr
llvm-svn: 26733
2006-03-14 05:52:10 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_FXU;
|
2006-05-18 03:00:46 +08:00
|
|
|
}
|
Implement PPC counter loops as a late IR-level pass
The old PPCCTRLoops pass, like the Hexagon pass version from which it was
derived, could only handle some simple loops in canonical form. We cannot
directly adapt the new Hexagon hardware loops pass, however, because the
Hexagon pass contains a fundamental assumption that non-constant-trip-count
loops will contain a guard, and this is not always true (the result being that
incorrect negative counts can be generated). With this commit, we replace the
pass with a late IR-level pass which makes use of SE to calculate the
backedge-taken counts and safely generate the loop-count expressions (including
any necessary max() parts). This IR level pass inserts custom intrinsics that
are lowered into the desired decrement-and-branch instructions.
The most fragile part of this new implementation is that interfering uses of
the counter register must be detected on the IR level (and, on PPC, this also
includes any indirect branches in addition to function calls). Also, to make
all of this work, we need a variant of the mtctr instruction that is marked
as having side effects. Without this, machine-code level CSE, DCE, etc.
illegally transform the resulting code. Hopefully, this can be improved
in the future.
This new pass is smaller than the original (and much smaller than the new
Hexagon hardware loops pass), and can handle many additional cases correctly.
In addition, the preheader-creation code has been copied from LoopSimplify, and
after we decide on where it belongs, this code will be refactored so that it
can be explicitly shared (making this implementation even smaller).
The new test-case files ctrloop-{le,lt,ne}.ll have been adapted from tests for
the new Hexagon pass. There are a few classes of loops that this pass does not
transform (noted by FIXMEs in the files), but these deficiencies can be
addressed within the SE infrastructure (thus helping many other passes as well).
llvm-svn: 181927
2013-05-16 05:37:41 +08:00
|
|
|
// MTCTRloop - code-generation-only twin of MTCTR (same XFXForm_7_ext
// arguments and asm string) that is selected from the int_ppc_mtctr intrinsic
// and is marked hasSideEffects. The commit note embedded in this definition
// gives the reason: without the side-effect marking, machine-code level CSE,
// DCE, etc. illegally transform the counter-loop code.
let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
|
|
|
|
let Pattern = [(int_ppc_mtctr i32:$rS)] in
|
2013-05-21 00:08:37 +08:00
|
|
|
def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtctr $rS", IIC_SprMTSPR>,
|
2013-05-21 00:08:37 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_FXU;
|
Implement PPC counter loops as a late IR-level pass
The old PPCCTRLoops pass, like the Hexagon pass version from which it was
derived, could only handle some simple loops in canonical form. We cannot
directly adapt the new Hexagon hardware loops pass, however, because the
Hexagon pass contains a fundamental assumption that non-constant-trip-count
loops will contain a guard, and this is not always true (the result being that
incorrect negative counts can be generated). With this commit, we replace the
pass with a late IR-level pass which makes use of SE to calculate the
backedge-taken counts and safely generate the loop-count expressions (including
any necessary max() parts). This IR level pass inserts custom intrinsics that
are lowered into the desired decrement-and-branch instructions.
The most fragile part of this new implementation is that interfering uses of
the counter register must be detected on the IR level (and, on PPC, this also
includes any indirect branches in addition to function calls). Also, to make
all of this work, we need a variant of the mtctr instruction that is marked
as having side effects. Without this, machine-code level CSE, DCE, etc.
illegally transform the resulting code. Hopefully, this can be improved
in the future.
This new pass is smaller than the original (and much smaller than the new
Hexagon hardware loops pass), and can handle many additional cases correctly.
In addition, the preheader-creation code has been copied from LoopSimplify, and
after we decide on where it belongs, this code will be refactored so that it
can be explicitly shared (making this implementation even smaller).
The new test-case files ctrloop-{le,lt,ne}.ll have been adapted from tests for
the new Hexagon pass. There are a few classes of loops that this pass does not
transform (noted by FIXMEs in the files), but these deficiencies can be
addressed within the SE infrastructure (thus helping many other passes as well).
llvm-svn: 181927
2013-05-16 05:37:41 +08:00
|
|
|
}
|
For functions that use vector registers, save VRSAVE, mark used
registers, and update it on entry to each function, then restore it on exit.
This compiles:
void func(vfloat *a, vfloat *b, vfloat *c) {
*a = *b * *c + *c;
}
to this:
_func:
mfspr r2, 256
oris r6, r2, 49152
mtspr 256, r6
lvx v0, 0, r5
lvx v1, 0, r4
vmaddfp v0, v1, v0, v0
stvx v0, 0, r3
mtspr 256, r2
blr
GCC produces this (which has additional stack accesses):
_func:
mfspr r0,256
stw r0,-4(r1)
oris r0,r0,0xc000
mtspr 256,r0
lvx v0,0,r5
lvx v1,0,r4
lwz r12,-4(r1)
vmaddfp v0,v0,v1,v0
stvx v0,0,r3
mtspr 256,r12
blr
llvm-svn: 26733
2006-03-14 05:52:10 +08:00
|
|
|
|
2008-10-24 04:41:28 +08:00
|
|
|
// MTLR - copy GPR $rS into the link register. LR is not an explicit operand;
// it is declared as an implicit def by the enclosing let. No selection pattern
// is attached.
// NOTE(review): the third template argument (8) is presumably the SPR number
// baked into the encoding - confirm against XFXForm_7_ext.
let Defs = [LR] in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def MTLR : XFXForm_7_ext<31, 467, 8, (outs), (ins gprc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtlr $rS", IIC_SprMTSPR>,
|
For functions that use vector registers, save VRSAVE, mark used
registers, and update it on entry to each function, then restore it on exit.
This compiles:
void func(vfloat *a, vfloat *b, vfloat *c) {
*a = *b * *c + *c;
}
to this:
_func:
mfspr r2, 256
oris r6, r2, 49152
mtspr 256, r6
lvx v0, 0, r5
lvx v1, 0, r4
vmaddfp v0, v1, v0, v0
stvx v0, 0, r3
mtspr 256, r2
blr
GCC produces this (which has additional stack accesses):
_func:
mfspr r0,256
stw r0,-4(r1)
oris r0,r0,0xc000
mtspr 256,r0
lvx v0,0,r5
lvx v1,0,r4
lwz r12,-4(r1)
vmaddfp v0,v0,v1,v0
stvx v0,0,r3
mtspr 256,r12
blr
llvm-svn: 26733
2006-03-14 05:52:10 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_FXU;
|
2008-10-24 04:41:28 +08:00
|
|
|
}
|
|
|
|
// MFLR - copy the link register into GPR $rT. LR is declared as an implicit
// use by the enclosing let; the def has no explicit inputs and no selection
// pattern.
let Uses = [LR] in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def MFLR : XFXForm_1_ext<31, 339, 8, (outs gprc:$rT), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mflr $rT", IIC_SprMFSPR>,
|
2006-03-12 17:13:49 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_FXU;
|
2008-10-24 04:41:28 +08:00
|
|
|
}
|
For functions that use vector registers, save VRSAVE, mark used
registers, and update it on entry to each function, then restore it on exit.
This compiles:
void func(vfloat *a, vfloat *b, vfloat *c) {
*a = *b * *c + *c;
}
to this:
_func:
mfspr r2, 256
oris r6, r2, 49152
mtspr 256, r6
lvx v0, 0, r5
lvx v1, 0, r4
vmaddfp v0, v1, v0, v0
stvx v0, 0, r3
mtspr 256, r2
blr
GCC produces this (which has additional stack accesses):
_func:
mfspr r0,256
stw r0,-4(r1)
oris r0,r0,0xc000
mtspr 256,r0
lvx v0,0,r5
lvx v1,0,r4
lwz r12,-4(r1)
vmaddfp v0,v0,v1,v0
stvx v0,0,r3
mtspr 256,r12
blr
llvm-svn: 26733
2006-03-14 05:52:10 +08:00
|
|
|
|
2013-03-22 03:03:21 +08:00
|
|
|
// Code-generation-only moves to and from VRSAVE. All four defs print as
// mtspr/mfspr with the literal SPR number 256 and use the IIC_IntGeneral
// itinerary. Two flavours exist:
//   MTVRSAVE  / MFVRSAVE  - only the GPR appears as an operand.
//   MTVRSAVEv / MFVRSAVEv - VRSAVE is additionally exposed as an explicit
//                           VRSAVERC register operand ($reg).
let isCodeGenOnly = 1 in {
|
2013-07-03 20:32:41 +08:00
|
|
|
|
// Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed
|
|
|
|
// like a GPR on the PPC970. As such, copies in and out have the same
|
|
|
|
// performance characteristics as an OR instruction.
|
|
|
|
def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (outs), (ins gprc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtspr 256, $rS", IIC_IntGeneral>,
|
2013-07-03 20:32:41 +08:00
|
|
|
PPC970_DGroup_Single, PPC970_Unit_FXU;
|
|
|
|
def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mfspr $rT, 256", IIC_IntGeneral>,
|
2013-07-03 20:32:41 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_FXU;
|
|
|
|
|
2013-03-22 03:03:21 +08:00
|
|
|
// Variants with VRSAVE as an explicit register-class operand.
def MTVRSAVEv : XFXForm_7_ext<31, 467, 256,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs VRSAVERC:$reg), (ins gprc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtspr 256, $rS", IIC_IntGeneral>,
|
2013-03-22 03:03:21 +08:00
|
|
|
PPC970_DGroup_Single, PPC970_Unit_FXU;
|
2013-04-27 00:53:15 +08:00
|
|
|
def MFVRSAVEv : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT),
|
2013-03-22 03:03:21 +08:00
|
|
|
(ins VRSAVERC:$reg),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mfspr $rT, 256", IIC_IntGeneral>,
|
2013-03-22 03:03:21 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_FXU;
|
|
|
|
}
|
|
|
|
|
2016-06-10 07:27:48 +08:00
|
|
|
// Assembler aliases: "mtvrsave" maps onto MTVRSAVE and "mfvrsave" onto
// MFVRSAVE, so the dedicated mnemonics are accepted for the VRSAVE moves.
// The alias operand is named $rS in both strings; for the mfvrsave alias it
// names the destination GPR.
|
|
|
|
def : InstAlias<"mtvrsave $rS", (MTVRSAVE gprc:$rS)>;
|
|
|
|
def : InstAlias<"mfvrsave $rS", (MFVRSAVE gprc:$rS)>;
|
|
|
|
|
2013-03-22 03:03:21 +08:00
|
|
|
// SPILL_VRSAVE - Indicate that we're dumping the VRSAVE register,
|
|
|
|
// so we'll need to scavenge a register for it.
// Pseudo with no outputs; inputs are the VRSAVERC register and a memri
// address operand ($F). Flagged mayStore; pattern list is empty.
|
|
|
|
|
let mayStore = 1 in
|
|
|
|
def SPILL_VRSAVE : Pseudo<(outs), (ins VRSAVERC:$vrsave, memri:$F),
|
|
|
|
"#SPILL_VRSAVE", []>;
|
|
|
|
|
|
|
|
// RESTORE_VRSAVE - Indicate that we're restoring the VRSAVE register (previously
|
|
|
|
// spilled), so we'll need to scavenge a register for it.
// Pseudo whose single output is the VRSAVERC register and whose only input is
// a memri address operand ($F). Flagged mayLoad; pattern list is empty.
|
|
|
|
|
let mayLoad = 1 in
|
|
|
|
def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F),
|
|
|
|
"#RESTORE_VRSAVE", []>;
|
|
|
|
|
2014-11-26 08:46:26 +08:00
|
|
|
// Moves between GPRs and the condition register, all declared free of
// unmodeled side effects. The two inner scopes additionally pin the register
// assignment: hasExtraDefRegAllocReq for the moves into CR (MTOCRF, MTCRF) and
// hasExtraSrcRegAllocReq for the moves out of CR (MFOCRF, MFCR). MCRXRX sits
// outside both inner scopes and is gated on IsISA3_0.
let hasSideEffects = 0 in {
|
2016-01-08 21:09:54 +08:00
|
|
|
|
// mtocrf's input needs to be prepared by shifting by an amount dependent
|
|
|
|
// on the cr register selected. Thus, post-ra anti-dep breaking must not
|
|
|
|
// later change that register assignment.
|
|
|
|
let hasExtraDefRegAllocReq = 1 in {
|
2013-07-04 01:59:07 +08:00
|
|
|
def MTOCRF: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins gprc:$ST),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtocrf $FXM, $ST", IIC_BrMCRX>,
|
2013-07-04 01:59:07 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_CRU;
|
|
|
|
|
2016-01-08 21:09:54 +08:00
|
|
|
// Similarly to mtocrf, the mask for mtcrf must be prepared in a way that
|
|
|
|
// is dependent on the cr fields being set.
|
2013-07-04 01:59:07 +08:00
|
|
|
def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtcrf $FXM, $rS", IIC_BrMCRX>,
|
2006-03-12 17:13:49 +08:00
|
|
|
PPC970_MicroCode, PPC970_Unit_CRU;
|
2016-01-08 21:09:54 +08:00
|
|
|
} // hasExtraDefRegAllocReq = 1
|
2010-05-21 01:48:26 +08:00
|
|
|
|
2016-01-08 21:09:54 +08:00
|
|
|
// mfocrf's input needs to be prepared by shifting by an amount dependent
|
|
|
|
// on the cr register selected. Thus, post-ra anti-dep breaking must not
|
|
|
|
// later change that register assignment.
|
|
|
|
let hasExtraSrcRegAllocReq = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM),
|
2013-12-01 04:41:13 +08:00
|
|
|
"mfocrf $rT, $FXM", IIC_SprMFCRF>,
|
2006-03-12 17:13:49 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_CRU;
|
2013-04-07 22:33:13 +08:00
|
|
|
|
2016-01-08 21:09:54 +08:00
|
|
|
// Similarly to mfocrf, the mask for mfcrf must be prepared in a way that
|
|
|
|
// is dependent on the cr fields being copied.
// NOTE(review): the def that follows is MFCR ("mfcr $rT"), whose operand list
// has no mask operand - confirm which instruction the sentence above is meant
// to describe.
|
2013-04-27 00:53:15 +08:00
|
|
|
def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mfcr $rT", IIC_SprMFCR>,
|
2013-04-07 22:33:13 +08:00
|
|
|
PPC970_MicroCode, PPC970_Unit_CRU;
|
2016-01-08 21:09:54 +08:00
|
|
|
} // hasExtraSrcRegAllocReq = 1
|
2016-03-31 23:26:37 +08:00
|
|
|
|
|
|
|
|
// MCRXRX - writes a whole CR field ($BF); no explicit inputs. Only available
// when the IsISA3_0 predicate holds.
def MCRXRX : X_BF3<31, 576, (outs crrc:$BF), (ins),
|
|
|
|
"mcrxrx $BF", IIC_BrMCRX>, Requires<[IsISA3_0]>;
|
2014-11-26 08:46:26 +08:00
|
|
|
} // hasSideEffects = 0
|
2004-08-30 10:28:06 +08:00
|
|
|
|
2013-03-26 18:56:22 +08:00
|
|
|
// Pseudo instruction to perform FADD in round-to-zero mode.
// Selected for the PPCfaddrtz node on f64 operands. It has an empty asm string
// and is flagged usesCustomInserter, so its expansion is not described by this
// def. RM is declared as an implicit use.
|
|
|
|
|
let usesCustomInserter = 1, Uses = [RM] in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def FADDrtz: Pseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "",
|
2013-03-26 18:56:22 +08:00
|
|
|
[(set f64:$FRT, (PPCfaddrtz f64:$FRA, f64:$FRB))]>;
|
|
|
|
}
|
2007-10-10 09:01:31 +08:00
|
|
|
|
2013-03-26 18:56:22 +08:00
|
|
|
// The above pseudo gets expanded to make use of the following instructions
|
|
|
|
// to manipulate FPSCR. Note that FPSCR is not modeled at the DAG level.
// All three defs below both use and define RM, have no outputs and carry empty
// pattern lists (consistent with the note above), and share the
// IIC_IntMTFSB0 itinerary:
//   MTFSB0 / MTFSB1 - take a 5-bit unsigned immediate ($FM).
//   MTFSFb          - code-generation-only "mtfsf" taking an i32 immediate
//                     ($FM) and an f8rc register ($rT).
|
2008-10-30 02:26:45 +08:00
|
|
|
let Uses = [RM], Defs = [RM] in {
|
|
|
|
def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtfsb0 $FM", IIC_IntMTFSB0, []>,
|
2008-10-30 02:26:45 +08:00
|
|
|
PPC970_DGroup_Single, PPC970_Unit_FPU;
|
|
|
|
def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtfsb1 $FM", IIC_IntMTFSB0, []>,
|
2008-10-30 02:26:45 +08:00
|
|
|
PPC970_DGroup_Single, PPC970_Unit_FPU;
|
2015-01-15 09:00:53 +08:00
|
|
|
let isCodeGenOnly = 1 in
|
|
|
|
def MTFSFb : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT),
|
|
|
|
"mtfsf $FM, $rT", IIC_IntMTFSB0, []>,
|
|
|
|
PPC970_DGroup_Single, PPC970_Unit_FPU;
|
2008-10-30 02:26:45 +08:00
|
|
|
}
|
|
|
|
let Uses = [RM] in {
|
2013-04-27 00:53:15 +08:00
|
|
|
// mffs: selected for the PPCmffs node, producing an f64 result in $rT.
// Declared inside a Uses = [RM] scope.
def MFFS : XForm_42<63, 583, (outs f8rc:$rT), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mffs $rT", IIC_IntMFFS,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set f64:$rT, (PPCmffs))]>,
|
2008-10-30 02:26:45 +08:00
|
|
|
PPC970_DGroup_Single, PPC970_Unit_FPU;
|
2015-01-15 09:00:53 +08:00
|
|
|
|
|
|
|
// Record form of mffs ("mffs."): same opcode pair (63, 583) as MFFS, no
// selection pattern, and it additionally defines CR1.
let Defs = [CR1] in
|
|
|
|
def MFFSo : XForm_42<63, 583, (outs f8rc:$rT), (ins),
|
|
|
|
"mffs. $rT", IIC_IntMFFS, []>, isDOT;
|
2008-10-30 02:26:45 +08:00
|
|
|
}
|
|
|
|
|
2007-10-10 09:01:31 +08:00
|
|
|
|
2014-11-26 08:46:26 +08:00
|
|
|
let PPC970_Unit = 1, hasSideEffects = 0 in { // FXU Operations.
|
2004-08-30 10:28:06 +08:00
|
|
|
// XO-Form instructions. Arithmetic instructions that can set the overflow bit.
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm ADD4 : XOForm_1r<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"add", "$rT, $rA, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rT, (add i32:$rA, i32:$rB))]>;
|
2013-12-21 02:08:54 +08:00
|
|
|
// Codegen-only variant of ADD4 (same 31/266 opcode pair and "add" mnemonic)
// whose second source is a tlsreg32 operand matched from a tglobaltlsaddr.
let isCodeGenOnly = 1 in
|
|
|
|
def ADD4TLS : XOForm_1<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, tlsreg32:$rB),
|
|
|
|
"add $rT, $rA, $rB", IIC_IntSimple,
|
|
|
|
[(set i32:$rT, (add i32:$rA, tglobaltlsaddr:$rB))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm ADDC : XOForm_1rc<31, 10, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"addc", "$rT, $rA, $rB", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i32:$rT, (addc i32:$rA, i32:$rB))]>,
|
|
|
|
PPC970_DGroup_Cracked;
|
2014-03-24 23:07:28 +08:00
|
|
|
|
2015-04-10 07:54:37 +08:00
|
|
|
defm DIVW : XOForm_1rcr<31, 491, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
|
|
|
"divw", "$rT, $rA, $rB", IIC_IntDivW,
|
|
|
|
[(set i32:$rT, (sdiv i32:$rA, i32:$rB))]>;
|
|
|
|
defm DIVWU : XOForm_1rcr<31, 459, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
|
|
|
"divwu", "$rT, $rA, $rB", IIC_IntDivW,
|
|
|
|
[(set i32:$rT, (udiv i32:$rA, i32:$rB))]>;
|
|
|
|
// divwe: selected only for the int_ppc_divwe intrinsic and gated on the
// HasExtDiv predicate.
def DIVWE : XOForm_1<31, 427, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
|
|
|
"divwe $rT, $rA, $rB", IIC_IntDivW,
|
|
|
|
[(set i32:$rT, (int_ppc_divwe gprc:$rA, gprc:$rB))]>,
|
|
|
|
Requires<[HasExtDiv]>;
|
|
|
|
// Record form of divwe ("divwe."): same encoding parameters as DIVWE, no
// selection pattern, and it additionally defines CR0. Also gated on HasExtDiv.
let Defs = [CR0] in
|
|
|
|
def DIVWEo : XOForm_1<31, 427, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
|
|
|
"divwe. $rT, $rA, $rB", IIC_IntDivW,
|
|
|
|
[]>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First,
|
|
|
|
Requires<[HasExtDiv]>;
|
|
|
|
// divweu: selected only for the int_ppc_divweu intrinsic and gated on the
// HasExtDiv predicate.
def DIVWEU : XOForm_1<31, 395, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
|
|
|
"divweu $rT, $rA, $rB", IIC_IntDivW,
|
|
|
|
[(set i32:$rT, (int_ppc_divweu gprc:$rA, gprc:$rB))]>,
|
|
|
|
Requires<[HasExtDiv]>;
|
|
|
|
// Record form of divweu ("divweu."): same encoding parameters as DIVWEU, no
// selection pattern, and it additionally defines CR0. Also gated on HasExtDiv.
let Defs = [CR0] in
|
|
|
|
def DIVWEUo : XOForm_1<31, 395, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
|
|
|
"divweu. $rT, $rA, $rB", IIC_IntDivW,
|
|
|
|
[]>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First,
|
|
|
|
Requires<[HasExtDiv]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
defm MULHW : XOForm_1r<31, 75, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mulhw", "$rT, $rA, $rB", IIC_IntMulHW,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rT, (mulhs i32:$rA, i32:$rB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm MULHWU : XOForm_1r<31, 11, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mulhwu", "$rT, $rA, $rB", IIC_IntMulHWU,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rT, (mulhu i32:$rA, i32:$rB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm MULLW : XOForm_1r<31, 235, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mullw", "$rT, $rA, $rB", IIC_IntMulHW,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rT, (mul i32:$rA, i32:$rB))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
} // isCommutable
|
2013-04-27 00:53:15 +08:00
|
|
|
defm SUBF : XOForm_1r<31, 40, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"subf", "$rT, $rA, $rB", IIC_IntGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rT, (sub i32:$rB, i32:$rA))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm SUBFC : XOForm_1rc<31, 8, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"subfc", "$rT, $rA, $rB", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i32:$rT, (subc i32:$rB, i32:$rA))]>,
|
|
|
|
PPC970_DGroup_Cracked;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm NEG : XOForm_3r<31, 104, 0, (outs gprc:$rT), (ins gprc:$rA),
|
2013-11-28 07:26:09 +08:00
|
|
|
"neg", "$rT, $rA", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i32:$rT, (ineg i32:$rA))]>;
|
2013-04-13 02:17:57 +08:00
|
|
|
let Uses = [CARRY] in {
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm ADDE : XOForm_1rc<31, 138, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"adde", "$rT, $rA, $rB", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i32:$rT, (adde i32:$rA, i32:$rB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm ADDME : XOForm_3rc<31, 234, 0, (outs gprc:$rT), (ins gprc:$rA),
|
2013-11-28 07:26:09 +08:00
|
|
|
"addme", "$rT, $rA", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i32:$rT, (adde i32:$rA, -1))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm ADDZE : XOForm_3rc<31, 202, 0, (outs gprc:$rT), (ins gprc:$rA),
|
2013-11-28 07:26:09 +08:00
|
|
|
"addze", "$rT, $rA", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i32:$rT, (adde i32:$rA, 0))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm SUBFE : XOForm_1rc<31, 136, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"subfe", "$rT, $rA, $rB", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i32:$rT, (sube i32:$rB, i32:$rA))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm SUBFME : XOForm_3rc<31, 232, 0, (outs gprc:$rT), (ins gprc:$rA),
|
2013-11-28 07:26:09 +08:00
|
|
|
"subfme", "$rT, $rA", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i32:$rT, (sube -1, i32:$rA))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$rT), (ins gprc:$rA),
|
2013-11-28 07:26:09 +08:00
|
|
|
"subfze", "$rT, $rA", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i32:$rT, (sube 0, i32:$rA))]>;
|
2006-03-12 17:13:49 +08:00
|
|
|
}
|
2009-09-19 04:15:22 +08:00
|
|
|
}
|
2004-08-30 10:28:06 +08:00
|
|
|
|
|
|
|
// A-Form instructions. Most of the instructions executed in the FPU are of
|
|
|
|
// this type.
|
|
|
|
//
|
2014-11-26 08:46:26 +08:00
|
|
|
let PPC970_Unit = 3, hasSideEffects = 0 in { // FPU Operations.
|
2008-10-30 02:26:45 +08:00
|
|
|
let Uses = [RM] in {
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in {
|
2013-04-12 10:18:09 +08:00
|
|
|
defm FMADD : AForm_1r<63, 29,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set f64:$FRT, (fma f64:$FRA, f64:$FRC, f64:$FRB))]>;
|
2013-04-12 10:18:09 +08:00
|
|
|
defm FMADDS : AForm_1r<59, 29,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set f32:$FRT, (fma f32:$FRA, f32:$FRC, f32:$FRB))]>;
|
2013-04-12 10:18:09 +08:00
|
|
|
defm FMSUB : AForm_1r<63, 28,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set f64:$FRT,
|
|
|
|
(fma f64:$FRA, f64:$FRC, (fneg f64:$FRB)))]>;
|
2013-04-12 10:18:09 +08:00
|
|
|
defm FMSUBS : AForm_1r<59, 28,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set f32:$FRT,
|
|
|
|
(fma f32:$FRA, f32:$FRC, (fneg f32:$FRB)))]>;
|
2013-04-12 10:18:09 +08:00
|
|
|
defm FNMADD : AForm_1r<63, 31,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fnmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set f64:$FRT,
|
|
|
|
(fneg (fma f64:$FRA, f64:$FRC, f64:$FRB)))]>;
|
2013-04-12 10:18:09 +08:00
|
|
|
defm FNMADDS : AForm_1r<59, 31,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fnmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set f32:$FRT,
|
|
|
|
(fneg (fma f32:$FRA, f32:$FRC, f32:$FRB)))]>;
|
2013-04-12 10:18:09 +08:00
|
|
|
defm FNMSUB : AForm_1r<63, 30,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fnmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set f64:$FRT, (fneg (fma f64:$FRA, f64:$FRC,
|
|
|
|
(fneg f64:$FRB))))]>;
|
2013-04-12 10:18:09 +08:00
|
|
|
defm FNMSUBS : AForm_1r<59, 30,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fnmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set f32:$FRT, (fneg (fma f32:$FRA, f32:$FRC,
|
|
|
|
(fneg f32:$FRB))))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
} // isCommutable
|
2008-10-30 02:26:45 +08:00
|
|
|
}
|
2005-10-02 15:07:49 +08:00
|
|
|
// FSEL is artificially split into 4 and 8-byte forms for the result. To avoid
|
|
|
|
// having 4 of these, force the comparison to always be an 8-byte double (code
|
|
|
|
// should use an FMRSD if the input comparison value really wants to be a float)
|
2005-10-02 14:58:23 +08:00
|
|
|
// and 4/8 byte forms for the result and operand type.
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-04-12 10:18:09 +08:00
|
|
|
defm FSELD : AForm_1r<63, 23,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$FRT, (PPCfsel f64:$FRA, f64:$FRC, f64:$FRB))]>;
|
|
|
|
defm FSELS : AForm_1r<63, 23,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f4rc:$FRT), (ins f8rc:$FRA, f4rc:$FRC, f4rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$FRT, (PPCfsel f64:$FRA, f32:$FRC, f32:$FRB))]>;
|
2008-10-30 02:26:45 +08:00
|
|
|
let Uses = [RM] in {
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in {
|
2013-04-12 10:18:09 +08:00
|
|
|
defm FADD : AForm_2r<63, 21,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fadd", "$FRT, $FRA, $FRB", IIC_FPAddSub,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$FRT, (fadd f64:$FRA, f64:$FRB))]>;
|
|
|
|
defm FADDS : AForm_2r<59, 21,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fadds", "$FRT, $FRA, $FRB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$FRT, (fadd f32:$FRA, f32:$FRB))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
} // isCommutable
|
2013-04-12 10:18:09 +08:00
|
|
|
defm FDIV : AForm_2r<63, 18,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fdiv", "$FRT, $FRA, $FRB", IIC_FPDivD,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$FRT, (fdiv f64:$FRA, f64:$FRB))]>;
|
|
|
|
defm FDIVS : AForm_2r<59, 18,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fdivs", "$FRT, $FRA, $FRB", IIC_FPDivS,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$FRT, (fdiv f32:$FRA, f32:$FRB))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in {
|
2013-04-12 10:18:09 +08:00
|
|
|
defm FMUL : AForm_3r<63, 25,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fmul", "$FRT, $FRA, $FRC", IIC_FPFused,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$FRT, (fmul f64:$FRA, f64:$FRC))]>;
|
|
|
|
defm FMULS : AForm_3r<59, 25,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fmuls", "$FRT, $FRA, $FRC", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$FRT, (fmul f32:$FRA, f32:$FRC))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
} // isCommutable
|
2013-04-12 10:18:09 +08:00
|
|
|
defm FSUB : AForm_2r<63, 20,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fsub", "$FRT, $FRA, $FRB", IIC_FPAddSub,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$FRT, (fsub f64:$FRA, f64:$FRB))]>;
|
|
|
|
defm FSUBS : AForm_2r<59, 20,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fsubs", "$FRT, $FRA, $FRB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$FRT, (fsub f32:$FRA, f32:$FRB))]>;
|
2008-10-30 02:26:45 +08:00
|
|
|
}
|
2006-03-12 17:13:49 +08:00
|
|
|
}
|
2004-08-30 10:28:06 +08:00
|
|
|
|
2014-11-26 08:46:26 +08:00
|
|
|
let hasSideEffects = 0 in {
|
2012-06-23 07:10:08 +08:00
|
|
|
let PPC970_Unit = 1 in { // FXU Operations.
|
2013-04-07 23:06:53 +08:00
|
|
|
// isel, flagged isSelect. Operands are $rA (register class gprc_nor0), $rB and
// a CR bit $cond; no selection pattern is attached to the definition itself.
let isSelect = 1 in
|
2012-11-14 03:14:19 +08:00
|
|
|
def ISEL : AForm_4<31, 15,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$rT), (ins gprc_nor0:$rA, gprc:$rB, crbitrc:$cond),
|
2015-02-02 01:52:16 +08:00
|
|
|
"isel $rT, $rA, $rB, $cond", IIC_IntISEL,
|
2012-06-23 07:10:08 +08:00
|
|
|
[]>;
|
|
|
|
}
|
|
|
|
|
2006-03-12 17:13:49 +08:00
|
|
|
let PPC970_Unit = 1 in { // FXU Operations.
|
2004-08-31 10:28:08 +08:00
|
|
|
// M-Form instructions. rotate and mask instructions.
|
|
|
|
//
|
2006-11-16 07:24:18 +08:00
|
|
|
let isCommutable = 1 in {
|
2005-09-10 02:17:41 +08:00
|
|
|
// RLWIMI can be commuted if the rotate amount is zero.
|
2013-04-27 00:53:15 +08:00
|
|
|
defm RLWIMI : MForm_2r<20, (outs gprc:$rA),
|
|
|
|
(ins gprc:$rSi, gprc:$rS, u5imm:$SH, u5imm:$MB,
|
2013-11-28 07:26:09 +08:00
|
|
|
u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
|
|
|
|
IIC_IntRotate, []>, PPC970_DGroup_Cracked,
|
|
|
|
RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
|
2004-10-17 04:43:38 +08:00
|
|
|
}
|
2013-04-12 10:18:09 +08:00
|
|
|
let BaseName = "rlwinm" in {
|
2005-04-19 13:21:30 +08:00
|
|
|
def RLWINM : MForm_2<21,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
|
2013-11-28 07:26:09 +08:00
|
|
|
"rlwinm $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[]>, RecFormRel;
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CR0] in
|
2005-04-19 13:21:30 +08:00
|
|
|
def RLWINMo : MForm_2<21,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
|
2013-11-28 07:26:09 +08:00
|
|
|
"rlwinm. $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[]>, isDOT, RecFormRel, PPC970_DGroup_Cracked;
|
|
|
|
}
|
2013-04-27 00:53:15 +08:00
|
|
|
defm RLWNM : MForm_2r<23, (outs gprc:$rA),
|
|
|
|
(ins gprc:$rS, gprc:$rB, u5imm:$MB, u5imm:$ME),
|
2013-11-28 07:26:09 +08:00
|
|
|
"rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[]>;
|
2006-03-12 17:13:49 +08:00
|
|
|
}
|
2014-11-26 08:46:26 +08:00
|
|
|
} // hasSideEffects = 0
|
2006-03-20 14:15:45 +08:00
|
|
|
|
2005-09-09 08:39:56 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// PowerPC Instruction Patterns
|
|
|
|
//
|
|
|
|
|
2005-09-27 06:20:16 +08:00
|
|
|
// Arbitrary immediate support. Implement in terms of LIS/ORI.
|
|
|
|
def : Pat<(i32 imm:$imm),
|
|
|
|
(ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))>;
|
2005-09-29 01:13:15 +08:00
|
|
|
|
|
|
|
// Implement the 'not' operation with the NOR instruction.
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
def i32not : OutPatFrag<(ops node:$in),
|
|
|
|
(NOR $in, $in)>;
|
|
|
|
def : Pat<(not i32:$in),
|
|
|
|
(i32not $in)>;
|
2005-09-29 01:13:15 +08:00
|
|
|
|
2005-09-29 07:07:13 +08:00
|
|
|
// ADD an arbitrary immediate.
|
2013-03-26 03:04:58 +08:00
|
|
|
def : Pat<(add i32:$in, imm:$imm),
|
|
|
|
(ADDIS (ADDI $in, (LO16 imm:$imm)), (HA16 imm:$imm))>;
|
2005-09-29 07:07:13 +08:00
|
|
|
// OR an arbitrary immediate.
|
2013-03-26 03:04:58 +08:00
|
|
|
def : Pat<(or i32:$in, imm:$imm),
|
|
|
|
(ORIS (ORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
|
2005-09-29 07:07:13 +08:00
|
|
|
// XOR an arbitrary immediate.
|
2013-03-26 03:04:58 +08:00
|
|
|
def : Pat<(xor i32:$in, imm:$imm),
|
|
|
|
(XORIS (XORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
|
2006-02-17 13:43:56 +08:00
|
|
|
// SUBFIC
|
Change some PowerPC PatLeaf definitions to ImmLeaf for fast-isel.
Using PatLeaf rather than ImmLeaf when defining immediate predicates
prevents simple patterns using those predicates from being recognized
for fast instruction selection. This patch replaces the immSExt16
PatLeaf predicate with two ImmLeaf predicates, imm32SExt16 and
imm64SExt16, allowing a few more patterns to be recognized (ADDI,
ADDIC, MULLI, ADDI8, and ADDIC8). Using the new predicates does not
help for LI, LI8, SUBFIC, and SUBFIC8 because these are rejected for
other reasons, but I see no reason to retain the PatLeaf predicate.
No functional change intended, and thus no test cases yet. This is
preliminary work for enabling fast-isel support for PowerPC. When
that support is ready, we'll be able to test this function.
llvm-svn: 182510
2013-05-23 04:09:24 +08:00
|
|
|
def : Pat<(sub imm32SExt16:$imm, i32:$in),
|
2013-03-26 03:04:58 +08:00
|
|
|
(SUBFIC $in, imm:$imm)>;
|
2005-10-19 09:38:02 +08:00
|
|
|
|
2006-06-17 04:22:01 +08:00
|
|
|
// SHL/SRL
|
2013-03-26 03:04:58 +08:00
|
|
|
def : Pat<(shl i32:$in, (i32 imm:$imm)),
|
|
|
|
(RLWINM $in, imm:$imm, 0, (SHL32 imm:$imm))>;
|
|
|
|
def : Pat<(srl i32:$in, (i32 imm:$imm)),
|
|
|
|
(RLWINM $in, (SRL32 imm:$imm), imm:$imm, 31)>;
|
2005-10-20 02:42:01 +08:00
|
|
|
|
2006-01-12 05:21:00 +08:00
|
|
|
// ROTL
|
2013-03-26 03:04:58 +08:00
|
|
|
def : Pat<(rotl i32:$in, i32:$sh),
|
|
|
|
(RLWNM $in, $sh, 0, 31)>;
|
|
|
|
def : Pat<(rotl i32:$in, (i32 imm:$imm)),
|
|
|
|
(RLWINM $in, imm:$imm, 0, 31)>;
|
2006-05-18 03:00:46 +08:00
|
|
|
|
2006-09-22 13:01:56 +08:00
|
|
|
// RLWNM
|
2013-03-26 03:04:58 +08:00
|
|
|
def : Pat<(and (rotl i32:$in, i32:$sh), maskimm32:$imm),
|
|
|
|
(RLWNM $in, $sh, (MB maskimm32:$imm), (ME maskimm32:$imm))>;
|
2006-09-22 13:01:56 +08:00
|
|
|
|
2006-05-18 03:00:46 +08:00
|
|
|
// Calls
|
2013-03-22 23:24:13 +08:00
|
|
|
def : Pat<(PPCcall (i32 tglobaladdr:$dst)),
|
|
|
|
(BL tglobaladdr:$dst)>;
|
|
|
|
def : Pat<(PPCcall (i32 texternalsym:$dst)),
|
|
|
|
(BL texternalsym:$dst)>;
|
2006-05-18 03:00:46 +08:00
|
|
|
|
2008-04-30 17:16:33 +08:00
|
|
|
// Tail calls. PPCtc_return to a global address or an external symbol is
// selected as TCRETURNdi; a target held in CTRRC is selected as TCRETURNri.
def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm),
|
|
|
|
(TCRETURNdi tglobaladdr:$dst, imm:$imm)>;
|
|
|
|
|
|
|
|
def : Pat<(PPCtc_return (i32 texternalsym:$dst), imm:$imm),
|
|
|
|
(TCRETURNdi texternalsym:$dst, imm:$imm)>;
|
|
|
|
|
|
|
|
def : Pat<(PPCtc_return CTRRC:$dst, imm:$imm),
|
|
|
|
(TCRETURNri CTRRC:$dst, imm:$imm)>;
|
|
|
|
|
|
|
|
|
|
|
|
|
2005-11-17 15:30:41 +08:00
|
|
|
// Hi and Lo for Darwin Global Addresses.
|
2005-12-11 15:45:47 +08:00
|
|
|
def : Pat<(PPChi tglobaladdr:$in, 0), (LIS tglobaladdr:$in)>;
|
|
|
|
def : Pat<(PPClo tglobaladdr:$in, 0), (LI tglobaladdr:$in)>;
|
|
|
|
def : Pat<(PPChi tconstpool:$in, 0), (LIS tconstpool:$in)>;
|
|
|
|
def : Pat<(PPClo tconstpool:$in, 0), (LI tconstpool:$in)>;
|
2006-04-23 02:53:45 +08:00
|
|
|
def : Pat<(PPChi tjumptable:$in, 0), (LIS tjumptable:$in)>;
|
|
|
|
def : Pat<(PPClo tjumptable:$in, 0), (LI tjumptable:$in)>;
|
2009-11-05 05:31:18 +08:00
|
|
|
def : Pat<(PPChi tblockaddress:$in, 0), (LIS tblockaddress:$in)>;
|
|
|
|
def : Pat<(PPClo tblockaddress:$in, 0), (LI tblockaddress:$in)>;
|
2013-03-26 03:04:58 +08:00
|
|
|
def : Pat<(PPChi tglobaltlsaddr:$g, i32:$in),
|
|
|
|
(ADDIS $in, tglobaltlsaddr:$g)>;
|
|
|
|
def : Pat<(PPClo tglobaltlsaddr:$g, i32:$in),
|
2013-03-26 18:55:20 +08:00
|
|
|
(ADDI $in, tglobaltlsaddr:$g)>;
|
2013-03-26 03:04:58 +08:00
|
|
|
def : Pat<(add i32:$in, (PPChi tglobaladdr:$g, 0)),
|
|
|
|
(ADDIS $in, tglobaladdr:$g)>;
|
|
|
|
def : Pat<(add i32:$in, (PPChi tconstpool:$g, 0)),
|
|
|
|
(ADDIS $in, tconstpool:$g)>;
|
|
|
|
def : Pat<(add i32:$in, (PPChi tjumptable:$g, 0)),
|
|
|
|
(ADDIS $in, tjumptable:$g)>;
|
|
|
|
def : Pat<(add i32:$in, (PPChi tblockaddress:$g, 0)),
|
|
|
|
(ADDIS $in, tblockaddress:$g)>;
|
2005-11-17 15:30:41 +08:00
|
|
|
|
2013-12-21 02:08:54 +08:00
|
|
|
// Support for thread-local storage.
|
|
|
|
def PPC32GOT: Pseudo<(outs gprc:$rD), (ins), "#PPC32GOT",
|
|
|
|
[(set i32:$rD, (PPCppc32GOT))]>;
|
|
|
|
|
2014-07-26 01:47:22 +08:00
|
|
|
// Get the _GLOBAL_OFFSET_TABLE_ in PIC mode.
|
|
|
|
// This uses two output registers, the first as the real output, the second as a
|
|
|
|
// temporary register, used internally in code generation.
|
|
|
|
def PPC32PICGOT: Pseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT",
|
|
|
|
[]>, NoEncode<"$rT">;
|
|
|
|
|
2013-12-21 02:08:54 +08:00
|
|
|
// Pseudo selected for the PPCldGotTprelL node applied to a TLS global address
// ($disp) and a base register ($reg).
def LDgotTprelL32: Pseudo<(outs gprc:$rD), (ins s16imm:$disp, gprc_nor0:$reg),
|
2014-07-26 01:47:22 +08:00
|
|
|
"#LDgotTprelL32",
|
|
|
|
[(set i32:$rD,
|
|
|
|
(PPCldGotTprelL tglobaltlsaddr:$disp, i32:$reg))]>;
|
2013-12-21 02:08:54 +08:00
|
|
|
// PPCaddTls of a register and a TLS global address is selected as ADD4TLS.
def : Pat<(PPCaddTls i32:$in, tglobaltlsaddr:$g),
|
|
|
|
(ADD4TLS $in, tglobaltlsaddr:$g)>;
|
|
|
|
|
2014-07-26 01:47:22 +08:00
|
|
|
// Pseudo selected for the PPCaddiTlsgdL node: a base register ($reg) combined
// with a TLS global address ($disp).
def ADDItlsgdL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
|
|
|
|
"#ADDItlsgdL32",
|
|
|
|
[(set i32:$rD,
|
|
|
|
(PPCaddiTlsgdL i32:$reg, tglobaltlsaddr:$disp))]>;
|
2015-02-11 03:09:05 +08:00
|
|
|
// LR is a true define, while the rest of the Defs are clobbers. R3 is
|
|
|
|
// explicitly defined when this op is created, so not mentioned here.
|
|
|
|
let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
|
|
|
|
Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
|
|
|
|
def GETtlsADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
|
|
|
|
"GETtlsADDR32",
|
|
|
|
[(set i32:$rD,
|
|
|
|
(PPCgetTlsAddr i32:$reg, tglobaltlsaddr:$sym))]>;
|
|
|
|
// Combined op for ADDItlsgdL32 and GETtlsADDR32, late expanded. R3 and LR
|
|
|
|
// are true defines while the rest of the Defs are clobbers.
|
|
|
|
let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
|
|
|
|
Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
|
|
|
|
def ADDItlsgdLADDR32 : Pseudo<(outs gprc:$rD),
|
|
|
|
(ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym),
|
|
|
|
"#ADDItlsgdLADDR32",
|
|
|
|
[(set i32:$rD,
|
|
|
|
(PPCaddiTlsgdLAddr i32:$reg,
|
|
|
|
tglobaltlsaddr:$disp,
|
|
|
|
tglobaltlsaddr:$sym))]>;
|
2014-07-26 01:47:22 +08:00
|
|
|
// Pseudo selected for the PPCaddiTlsldL node: a base register ($reg) combined
// with a TLS global address ($disp).
def ADDItlsldL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
|
|
|
|
"#ADDItlsldL32",
|
|
|
|
[(set i32:$rD,
|
|
|
|
(PPCaddiTlsldL i32:$reg, tglobaltlsaddr:$disp))]>;
|
2015-02-11 03:09:05 +08:00
|
|
|
// LR is a true define, while the rest of the Defs are clobbers. R3 is
|
|
|
|
// explicitly defined when this op is created, so not mentioned here.
|
|
|
|
let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
|
|
|
|
Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
|
|
|
|
def GETtlsldADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
|
|
|
|
"GETtlsldADDR32",
|
|
|
|
[(set i32:$rD,
|
|
|
|
(PPCgetTlsldAddr i32:$reg,
|
|
|
|
tglobaltlsaddr:$sym))]>;
|
|
|
|
// Combined op for ADDItlsldL32 and GETtlsldADDR32, late expanded. R3 and LR
|
|
|
|
// are true defines while the rest of the Defs are clobbers.
|
|
|
|
let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
|
|
|
|
Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
|
|
|
|
def ADDItlsldLADDR32 : Pseudo<(outs gprc:$rD),
|
|
|
|
(ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym),
|
|
|
|
"#ADDItlsldLADDR32",
|
|
|
|
[(set i32:$rD,
|
|
|
|
(PPCaddiTlsldLAddr i32:$reg,
|
|
|
|
tglobaltlsaddr:$disp,
|
|
|
|
tglobaltlsaddr:$sym))]>;
|
2014-07-26 01:47:22 +08:00
|
|
|
// Pseudo selected for the PPCaddiDtprelL node: a base register ($reg) combined
// with a TLS global address ($disp).
def ADDIdtprelL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
|
|
|
|
"#ADDIdtprelL32",
|
|
|
|
[(set i32:$rD,
|
|
|
|
(PPCaddiDtprelL i32:$reg, tglobaltlsaddr:$disp))]>;
|
|
|
|
// Pseudo selected for the PPCaddisDtprelHA node: a base register ($reg)
// combined with a TLS global address ($disp).
def ADDISdtprelHA32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
|
|
|
|
"#ADDISdtprelHA32",
|
|
|
|
[(set i32:$rD,
|
|
|
|
(PPCaddisDtprelHA i32:$reg,
|
|
|
|
tglobaltlsaddr:$disp))]>;
|
|
|
|
|
2014-07-19 07:29:49 +08:00
|
|
|
// Support for Position-independent code
|
2014-11-12 23:16:30 +08:00
|
|
|
// Pseudo selected for the PPCtoc_entry node applied to a global address
// ($disp, a tocentry32 operand) and a register ($reg).
def LWZtoc : Pseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
|
|
|
|
"#LWZtoc",
|
|
|
|
[(set i32:$rD,
|
|
|
|
(PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>;
|
2014-07-19 07:29:49 +08:00
|
|
|
// Get Global (GOT) Base Register offset, from the word immediately preceding
|
|
|
|
// the function label.
|
2014-11-12 23:16:30 +08:00
|
|
|
def UpdateGBR : Pseudo<(outs gprc:$rD, gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>;
|
2014-07-19 07:29:49 +08:00
|
|
|
|
|
|
|
|
2005-12-06 10:10:38 +08:00
|
|
|
// Standard shifts. These are represented separately from the real shifts above
|
|
|
|
// so that we can distinguish between shifts that allow 5-bit and 6-bit shift
|
|
|
|
// amounts.
|
2013-03-26 03:04:58 +08:00
|
|
|
def : Pat<(sra i32:$rS, i32:$rB),
|
|
|
|
(SRAW $rS, $rB)>;
|
|
|
|
def : Pat<(srl i32:$rS, i32:$rB),
|
|
|
|
(SRW $rS, $rB)>;
|
|
|
|
def : Pat<(shl i32:$rS, i32:$rB),
|
|
|
|
(SLW $rS, $rB)>;
|
2005-12-06 10:10:38 +08:00
|
|
|
|
2006-10-10 04:57:25 +08:00
|
|
|
// Extending integer loads. Zero- and any-extending i1 loads and any-extending
// i8 loads are selected as LBZ (iaddr) or LBZX (xaddr); any-extending i16
// loads are selected as LHZ (iaddr) or LHZX (xaddr).
def : Pat<(zextloadi1 iaddr:$src),
|
2005-12-20 07:25:09 +08:00
|
|
|
(LBZ iaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
def : Pat<(zextloadi1 xaddr:$src),
|
2005-12-20 07:25:09 +08:00
|
|
|
(LBZX xaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
def : Pat<(extloadi1 iaddr:$src),
|
2005-12-20 07:25:09 +08:00
|
|
|
(LBZ iaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
def : Pat<(extloadi1 xaddr:$src),
|
2005-12-20 07:25:09 +08:00
|
|
|
(LBZX xaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
def : Pat<(extloadi8 iaddr:$src),
|
2005-12-20 07:25:09 +08:00
|
|
|
(LBZ iaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
def : Pat<(extloadi8 xaddr:$src),
|
2005-12-20 07:25:09 +08:00
|
|
|
(LBZX xaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
def : Pat<(extloadi16 iaddr:$src),
|
2005-12-20 07:25:09 +08:00
|
|
|
(LHZ iaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
def : Pat<(extloadi16 xaddr:$src),
|
2005-12-20 07:25:09 +08:00
|
|
|
(LHZX xaddr:$src)>;
|
2010-07-17 05:03:52 +08:00
|
|
|
// An f32 load extended to f64 is selected as the single-precision load
// (LFS here, LFSX for xaddr) whose result is copied into the F8RC class.
def : Pat<(f64 (extloadf32 iaddr:$src)),
|
|
|
|
(COPY_TO_REGCLASS (LFS iaddr:$src), F8RC)>;
|
|
|
|
def : Pat<(f64 (extloadf32 xaddr:$src)),
|
|
|
|
(COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>;
|
|
|
|
|
2016-08-19 04:08:15 +08:00
|
|
|
// f32 -> f64 fpextend emits no conversion instruction: the value is only
// copied into the F8RC register class.
def : Pat<(f64 (fpextend f32:$src)),
|
2013-03-26 03:04:58 +08:00
|
|
|
(COPY_TO_REGCLASS $src, F8RC)>;
|
2005-12-20 07:25:09 +08:00
|
|
|
|
2014-10-04 02:04:36 +08:00
|
|
|
// Only seq_cst fences require the heavyweight sync (SYNC 0).
|
|
|
|
// All others can use the lightweight sync (SYNC 1).
|
|
|
|
// source: http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
|
|
|
|
// The rule for seq_cst is duplicated to work with both 64-bit and 32-bit
|
|
|
|
// versions of Power.
|
|
|
|
def : Pat<(atomic_fence (i64 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>;
|
|
|
|
def : Pat<(atomic_fence (i32 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>;
|
|
|
|
def : Pat<(atomic_fence (imm), (imm)), (SYNC 1)>, Requires<[HasSYNC]>;
|
2014-10-03 06:34:22 +08:00
|
|
|
// Under the HasOnlyMSYNC predicate, every atomic_fence is selected as MSYNC
// regardless of its ordering operand.
def : Pat<(atomic_fence (imm), (imm)), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
|
2011-07-28 06:21:52 +08:00
|
|
|
|
2013-04-03 12:01:11 +08:00
|
|
|
// Additional FNMSUB patterns: -a*c + b == -(a*c - b)
|
|
|
|
def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
|
|
|
|
(FNMSUB $A, $C, $B)>;
|
|
|
|
def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
|
|
|
|
(FNMSUB $A, $C, $B)>;
|
|
|
|
def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B),
|
|
|
|
(FNMSUBS $A, $C, $B)>;
|
|
|
|
def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
|
|
|
|
(FNMSUBS $A, $C, $B)>;
|
|
|
|
|
2013-08-19 13:01:02 +08:00
|
|
|
// FCOPYSIGN's operand types need not agree.
|
|
|
|
// f64 result with an f32 second operand: $frA is first copied into F8RC so
// both FCPSGND operands are in the f64 register class. Note the output operand
// order ($frA, $frB) is the reverse of the fcopysign input order ($frB, $frA).
def : Pat<(fcopysign f64:$frB, f32:$frA),
|
|
|
|
(FCPSGND (COPY_TO_REGCLASS $frA, F8RC), $frB)>;
|
|
|
|
// f32 result with an f64 second operand: $frA is copied into F4RC and
// FCPSGNS is used, with the same operand reversal.
def : Pat<(fcopysign f32:$frB, f64:$frA),
|
|
|
|
(FCPSGNS (COPY_TO_REGCLASS $frA, F4RC), $frB)>;
|
|
|
|
|
2006-03-25 15:51:43 +08:00
|
|
|
include "PPCInstrAltivec.td"
|
2014-08-07 20:18:21 +08:00
|
|
|
include "PPCInstrSPE.td"
|
2006-06-17 04:22:01 +08:00
|
|
|
include "PPCInstr64Bit.td"
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the first 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work on this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
llvm-svn: 203768
2014-03-13 15:58:58 +08:00
|
|
|
include "PPCInstrVSX.td"
|
[PowerPC] Add support for the QPX vector instruction set
This adds support for the QPX vector instruction set, which is used by the
enhanced A2 cores on the IBM BG/Q supercomputers. QPX vectors are 256 bits
wide, holding 4 double-precision floating-point values. Boolean values, modeled
here as <4 x i1> are actually also represented as floating-point values
(essentially { -1, 1 } for { false, true }). QPX shares many features with
Altivec and VSX, but is distinct from both of them. One major difference is
that, instead of adding completely-separate vector registers, QPX vector
registers are extensions of the scalar floating-point registers (lane 0 is the
corresponding scalar floating-point value). The operations supported on QPX
vectors mirror those supported on the scalar floating-point values (with some
additional ones for permutations and logical/comparison operations).
I've been maintaining this support out-of-tree, as part of the bgclang project,
for several years. This is not the entire bgclang patch set, but is most of the
subset that can be cleanly integrated into LLVM proper at this time. Adding
this to the LLVM backend is part of my efforts to rebase bgclang to the current
LLVM trunk, but is independently useful (especially for codes that use LLVM as
a JIT in library form).
The assembler/disassembler test coverage is complete. The CodeGen test coverage
is not, but I've included some tests, and more will be added as follow-up work.
llvm-svn: 230413
2015-02-25 09:06:45 +08:00
|
|
|
include "PPCInstrQPX.td"
|
2015-03-26 03:36:23 +08:00
|
|
|
include "PPCInstrHTM.td"
|
2013-05-04 03:50:27 +08:00
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// crnot: output-pattern fragment taking one operand and expanding to CRNOR
// with that operand supplied for both inputs.
// NOTE(review): presumably relies on NOR(x, x) == NOT x -- confirm against the
// CRNOR definition.
def crnot : OutPatFrag<(ops node:$in),
|
|
|
|
(CRNOR $in, $in)>;
|
|
|
|
// Logical 'not' of an i1 value is selected through the crnot fragment.
def : Pat<(not i1:$in),
|
|
|
|
(crnot $in)>;
|
|
|
|
|
|
|
|
// Patterns for arithmetic i1 operations.
|
|
|
|
// On a single bit, arithmetic is modulo 2: a+b and a-b are both a xor b,
// and a*b is a and b. Hence add and sub select the same CRXOR.
def : Pat<(add i1:$a, i1:$b),
|
|
|
|
(CRXOR $a, $b)>;
|
|
|
|
// sub: identical output to add.
def : Pat<(sub i1:$a, i1:$b),
|
|
|
|
(CRXOR $a, $b)>;
|
|
|
|
// mul: selects CRAND.
def : Pat<(mul i1:$a, i1:$b),
|
|
|
|
(CRAND $a, $b)>;
|
|
|
|
|
|
|
|
// We're sometimes asked to materialize i1 -1, which is just 1 in this case
|
|
|
|
// (-1 is used to mean all bits set).
|
|
|
|
// The i1 constant -1 is materialized with CRSET (no input operands).
def : Pat<(i1 -1), (CRSET)>;
|
|
|
|
|
|
|
|
// i1 extensions, implemented in terms of isel.
|
|
|
|
// i32 zext: SELECT_I4 keyed on the i1 input, choosing between LI 1 and LI 0.
def : Pat<(i32 (zext i1:$in)),
|
|
|
|
(SELECT_I4 $in, (LI 1), (LI 0))>;
|
|
|
|
// i32 sext: same shape, but the first choice is LI -1.
def : Pat<(i32 (sext i1:$in)),
|
|
|
|
(SELECT_I4 $in, (LI -1), (LI 0))>;
|
|
|
|
|
|
|
|
// i64 zext: SELECT_I8 with LI8 1 / LI8 0.
def : Pat<(i64 (zext i1:$in)),
|
|
|
|
(SELECT_I8 $in, (LI8 1), (LI8 0))>;
|
|
|
|
// i64 sext: SELECT_I8 with LI8 -1 / LI8 0.
def : Pat<(i64 (sext i1:$in)),
|
|
|
|
(SELECT_I8 $in, (LI8 -1), (LI8 0))>;
|
|
|
|
|
|
|
|
// FIXME: We should choose either a zext or a sext based on other constants
|
|
|
|
// already around.
|
|
|
|
// i32 anyext: currently the same output as the i32 zext pattern (1 / 0).
def : Pat<(i32 (anyext i1:$in)),
|
|
|
|
(SELECT_I4 $in, (LI 1), (LI 0))>;
|
|
|
|
// i64 anyext: currently the same output as the i64 zext pattern (1 / 0).
def : Pat<(i64 (anyext i1:$in)),
|
|
|
|
(SELECT_I8 $in, (LI8 1), (LI8 0))>;
|
|
|
|
|
|
|
|
// match setcc on i1 variables.
|
2015-08-31 06:12:50 +08:00
|
|
|
// CRANDC is:
|
|
|
|
// 1 1 : F
|
|
|
|
// 1 0 : T
|
|
|
|
// 0 1 : F
|
|
|
|
// 0 0 : F
|
|
|
|
//
|
|
|
|
// LT is:
|
|
|
|
// -1 -1 : F
|
|
|
|
// -1 0 : T
|
|
|
|
// 0 -1 : F
|
|
|
|
// 0 0 : F
|
|
|
|
//
|
|
|
|
// ULT is:
|
|
|
|
// 1 1 : F
|
|
|
|
// 1 0 : F
|
|
|
|
// 0 1 : T
|
|
|
|
// 0 0 : F
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// Signed i1 less-than is true only for ($s1, $s2) = (-1, 0), i.e. $s1 set and
// $s2 clear; that is the single true row of CRANDC with operands in source
// order.
def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLT)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(CRANDC $s1, $s2)>;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// Unsigned i1 less-than is true only for ($s1, $s2) = (0, 1), so CRANDC is
// used with the operands swapped relative to the setcc.
def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULT)),
|
|
|
|
(CRANDC $s2, $s1)>;
|
2015-08-31 06:12:50 +08:00
|
|
|
// CRORC is:
|
|
|
|
// 1 1 : T
|
|
|
|
// 1 0 : T
|
|
|
|
// 0 1 : F
|
|
|
|
// 0 0 : T
|
|
|
|
//
|
|
|
|
// LE is:
|
|
|
|
// -1 -1 : T
|
|
|
|
// -1 0 : T
|
|
|
|
// 0 -1 : F
|
|
|
|
// 0 0 : T
|
|
|
|
//
|
|
|
|
// ULE is:
|
|
|
|
// 1 1 : T
|
|
|
|
// 1 0 : F
|
|
|
|
// 0 1 : T
|
|
|
|
// 0 0 : T
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// Signed i1 less-or-equal is false only for ($s1, $s2) = (0, -1), i.e. $s1
// clear and $s2 set; that is the single false row of CRORC with operands in
// source order.
def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLE)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(CRORC $s1, $s2)>;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// Unsigned i1 less-or-equal is false only for ($s1, $s2) = (1, 0), so CRORC is
// used with the operands swapped relative to the setcc.
def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULE)),
|
|
|
|
(CRORC $s2, $s1)>;
|
2015-08-31 06:12:50 +08:00
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// i1 equality selects CREQV on the two operands in source order.
def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETEQ)),
|
|
|
|
(CREQV $s1, $s2)>;
|
2015-08-31 06:12:50 +08:00
|
|
|
|
|
|
|
// GE is:
|
|
|
|
// -1 -1 : T
|
|
|
|
// -1 0 : F
|
|
|
|
// 0 -1 : T
|
|
|
|
// 0 0 : T
|
|
|
|
//
|
|
|
|
// UGE is:
|
|
|
|
// 1 1 : T
|
|
|
|
// 1 0 : T
|
|
|
|
// 0 1 : F
|
|
|
|
// 0 0 : T
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// Signed i1 greater-or-equal is false only for ($s1, $s2) = (-1, 0), i.e. $s1
// set and $s2 clear, so CRORC is used with the operands swapped relative to
// the setcc.
def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGE)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(CRORC $s2, $s1)>;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// Unsigned i1 greater-or-equal is false only for ($s1, $s2) = (0, 1), which is
// the single false row of CRORC with operands in source order.
def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGE)),
|
|
|
|
(CRORC $s1, $s2)>;
|
2015-08-31 06:12:50 +08:00
|
|
|
|
|
|
|
// GT is:
|
|
|
|
// -1 -1 : F
|
|
|
|
// -1 0 : F
|
|
|
|
// 0 -1 : T
|
|
|
|
// 0 0 : F
|
|
|
|
//
|
|
|
|
// UGT is:
|
|
|
|
// 1 1 : F
|
|
|
|
// 1 0 : T
|
|
|
|
// 0 1 : F
|
|
|
|
// 0 0 : F
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGT)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(CRANDC $s2, $s1)>;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGT)),
|
|
|
|
(CRANDC $s1, $s2)>;
|
2015-08-31 06:12:50 +08:00
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETNE)),
|
|
|
|
(CRXOR $s1, $s2)>;
|
|
|
|
|
|
|
|
// match setcc on non-i1 (non-vector) variables. Note that SETUEQ, SETOGE,
|
|
|
|
// SETOLE, SETONE, SETULT and SETUGT should be expanded by legalize for
|
|
|
|
// floating-point types.
|
|
|
|
|
|
|
|
multiclass CRNotPat<dag pattern, dag result> {
|
|
|
|
def : Pat<pattern, (crnot result)>;
|
|
|
|
def : Pat<(not pattern), result>;
|
|
|
|
|
|
|
|
// We can also fold the crnot into an extension:
|
|
|
|
def : Pat<(i32 (zext pattern)),
|
|
|
|
(SELECT_I4 result, (LI 0), (LI 1))>;
|
|
|
|
def : Pat<(i32 (sext pattern)),
|
|
|
|
(SELECT_I4 result, (LI 0), (LI -1))>;
|
|
|
|
|
|
|
|
// Likewise, fold the crnot into a 64-bit extension:
|
|
|
|
def : Pat<(i64 (zext pattern)),
|
|
|
|
(SELECT_I8 result, (LI8 0), (LI8 1))>;
|
|
|
|
def : Pat<(i64 (sext pattern)),
|
|
|
|
(SELECT_I8 result, (LI8 0), (LI8 -1))>;
|
|
|
|
|
|
|
|
// FIXME: We should choose either a zext or a sext based on other constants
|
|
|
|
// already around.
|
|
|
|
def : Pat<(i32 (anyext pattern)),
|
|
|
|
(SELECT_I4 result, (LI 0), (LI 1))>;
|
|
|
|
|
|
|
|
def : Pat<(i64 (anyext pattern)),
|
|
|
|
(SELECT_I8 result, (LI8 0), (LI8 1))>;
|
|
|
|
}
|
|
|
|
|
|
|
|
// FIXME: Because of what seems like a bug in TableGen's type-inference code,
|
|
|
|
// we need to write imm:$imm in the output patterns below, not just $imm, or
|
|
|
|
// else the resulting matcher will not correctly add the immediate operand
|
|
|
|
// (making it a register operand instead).
|
|
|
|
|
|
|
|
// Extended SETCC: match a zext or anyext of an i1 setcc result to i32/i64.
|
|
|
|
multiclass ExtSetCCPat<CondCode cc, PatFrag pfrag,
|
|
|
|
OutPatFrag rfrag, OutPatFrag rfrag8> {
|
|
|
|
def : Pat<(i32 (zext (i1 (pfrag i32:$s1, cc)))),
|
|
|
|
(rfrag $s1)>;
|
|
|
|
def : Pat<(i64 (zext (i1 (pfrag i64:$s1, cc)))),
|
|
|
|
(rfrag8 $s1)>;
|
|
|
|
def : Pat<(i64 (zext (i1 (pfrag i32:$s1, cc)))),
|
|
|
|
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>;
|
|
|
|
def : Pat<(i32 (zext (i1 (pfrag i64:$s1, cc)))),
|
|
|
|
(EXTRACT_SUBREG (rfrag8 $s1), sub_32)>;
|
|
|
|
|
|
|
|
def : Pat<(i32 (anyext (i1 (pfrag i32:$s1, cc)))),
|
|
|
|
(rfrag $s1)>;
|
|
|
|
def : Pat<(i64 (anyext (i1 (pfrag i64:$s1, cc)))),
|
|
|
|
(rfrag8 $s1)>;
|
|
|
|
def : Pat<(i64 (anyext (i1 (pfrag i32:$s1, cc)))),
|
|
|
|
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>;
|
|
|
|
def : Pat<(i32 (anyext (i1 (pfrag i64:$s1, cc)))),
|
|
|
|
(EXTRACT_SUBREG (rfrag8 $s1), sub_32)>;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Note that we do all inversions below with i(32|64)not, instead of using
|
|
|
|
// (xori x, 1) because on the A2 nor has single-cycle latency while xori
|
|
|
|
// has 2-cycle latency.
|
|
|
|
|
|
|
|
defm : ExtSetCCPat<SETEQ,
|
|
|
|
PatFrag<(ops node:$in, node:$cc),
|
|
|
|
(setcc $in, 0, $cc)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLWINM (CNTLZW $in), 27, 31, 31)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLDICL (CNTLZD $in), 58, 63)> >;
|
|
|
|
|
|
|
|
defm : ExtSetCCPat<SETNE,
|
|
|
|
PatFrag<(ops node:$in, node:$cc),
|
|
|
|
(setcc $in, 0, $cc)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLWINM (i32not (CNTLZW $in)), 27, 31, 31)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLDICL (i64not (CNTLZD $in)), 58, 63)> >;
|
|
|
|
|
|
|
|
defm : ExtSetCCPat<SETLT,
|
|
|
|
PatFrag<(ops node:$in, node:$cc),
|
|
|
|
(setcc $in, 0, $cc)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLWINM $in, 1, 31, 31)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLDICL $in, 1, 63)> >;
|
|
|
|
|
|
|
|
defm : ExtSetCCPat<SETGE,
|
|
|
|
PatFrag<(ops node:$in, node:$cc),
|
|
|
|
(setcc $in, 0, $cc)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLWINM (i32not $in), 1, 31, 31)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLDICL (i64not $in), 1, 63)> >;
|
|
|
|
|
|
|
|
defm : ExtSetCCPat<SETGT,
|
|
|
|
PatFrag<(ops node:$in, node:$cc),
|
|
|
|
(setcc $in, 0, $cc)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLWINM (ANDC (NEG $in), $in), 1, 31, 31)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLDICL (ANDC8 (NEG8 $in), $in), 1, 63)> >;
|
|
|
|
|
|
|
|
defm : ExtSetCCPat<SETLE,
|
|
|
|
PatFrag<(ops node:$in, node:$cc),
|
|
|
|
(setcc $in, 0, $cc)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLWINM (ORC $in, (NEG $in)), 1, 31, 31)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLDICL (ORC8 $in, (NEG8 $in)), 1, 63)> >;
|
|
|
|
|
|
|
|
defm : ExtSetCCPat<SETLT,
|
|
|
|
PatFrag<(ops node:$in, node:$cc),
|
|
|
|
(setcc $in, -1, $cc)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLWINM (AND $in, (ADDI $in, 1)), 1, 31, 31)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLDICL (AND8 $in, (ADDI8 $in, 1)), 1, 63)> >;
|
|
|
|
|
|
|
|
defm : ExtSetCCPat<SETGE,
|
|
|
|
PatFrag<(ops node:$in, node:$cc),
|
|
|
|
(setcc $in, -1, $cc)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLWINM (NAND $in, (ADDI $in, 1)), 1, 31, 31)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLDICL (NAND8 $in, (ADDI8 $in, 1)), 1, 63)> >;
|
|
|
|
|
|
|
|
defm : ExtSetCCPat<SETGT,
|
|
|
|
PatFrag<(ops node:$in, node:$cc),
|
|
|
|
(setcc $in, -1, $cc)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLWINM (i32not $in), 1, 31, 31)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLDICL (i64not $in), 1, 63)> >;
|
|
|
|
|
|
|
|
defm : ExtSetCCPat<SETLE,
|
|
|
|
PatFrag<(ops node:$in, node:$cc),
|
|
|
|
(setcc $in, -1, $cc)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLWINM $in, 1, 31, 31)>,
|
|
|
|
OutPatFrag<(ops node:$in),
|
|
|
|
(RLDICL $in, 1, 63)> >;
|
|
|
|
|
[PowerPC] Add a pattern for a runtime bit check
Following a suggestion by Sanjay, we should lower:
%shl = shl i32 1, %y
%and = and i32 %x, %shl
%cmp = icmp eq i32 %and, %shl
ret i1 %cmp
into:
subfic r4, r4, 32
rlwnm r3, r3, r4, 31, 31
Add this pattern and some associated patterns for the 64-bit case and the
not-equal case. Fixes PR27356.
llvm-svn: 280454
2016-09-02 10:34:44 +08:00
|
|
|
// An extended SETCC with shift amount.
|
|
|
|
multiclass ExtSetCCShiftPat<CondCode cc, PatFrag pfrag,
|
|
|
|
OutPatFrag rfrag, OutPatFrag rfrag8> {
|
|
|
|
def : Pat<(i32 (zext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
|
|
|
|
(rfrag $s1, $sa)>;
|
|
|
|
def : Pat<(i64 (zext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
|
|
|
|
(rfrag8 $s1, $sa)>;
|
|
|
|
def : Pat<(i64 (zext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
|
|
|
|
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1, $sa), sub_32)>;
|
|
|
|
def : Pat<(i32 (zext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
|
|
|
|
(EXTRACT_SUBREG (rfrag8 $s1, $sa), sub_32)>;
|
|
|
|
|
|
|
|
def : Pat<(i32 (anyext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
|
|
|
|
(rfrag $s1, $sa)>;
|
|
|
|
def : Pat<(i64 (anyext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
|
|
|
|
(rfrag8 $s1, $sa)>;
|
|
|
|
def : Pat<(i64 (anyext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
|
|
|
|
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1, $sa), sub_32)>;
|
|
|
|
def : Pat<(i32 (anyext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
|
|
|
|
(EXTRACT_SUBREG (rfrag8 $s1, $sa), sub_32)>;
|
|
|
|
}
|
|
|
|
|
|
|
|
defm : ExtSetCCShiftPat<SETNE,
|
|
|
|
PatFrag<(ops node:$in, node:$sa, node:$cc),
|
|
|
|
(setcc (and $in, (shl 1, $sa)), 0, $cc)>,
|
|
|
|
OutPatFrag<(ops node:$in, node:$sa),
|
|
|
|
(RLWNM $in, (SUBFIC $sa, 32), 31, 31)>,
|
|
|
|
OutPatFrag<(ops node:$in, node:$sa),
|
|
|
|
(RLDCL $in, (SUBFIC $sa, 64), 63)> >;
|
|
|
|
|
|
|
|
defm : ExtSetCCShiftPat<SETEQ,
|
|
|
|
PatFrag<(ops node:$in, node:$sa, node:$cc),
|
|
|
|
(setcc (and $in, (shl 1, $sa)), 0, $cc)>,
|
|
|
|
OutPatFrag<(ops node:$in, node:$sa),
|
|
|
|
(RLWNM (i32not $in),
|
|
|
|
(SUBFIC $sa, 32), 31, 31)>,
|
|
|
|
OutPatFrag<(ops node:$in, node:$sa),
|
|
|
|
(RLDCL (i64not $in),
|
|
|
|
(SUBFIC $sa, 64), 63)> >;
|
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// SETCC for i32.
|
|
|
|
def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULT)),
|
|
|
|
(EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
|
|
|
|
def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLT)),
|
|
|
|
(EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
|
|
|
|
def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGT)),
|
|
|
|
(EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
|
|
|
|
def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGT)),
|
|
|
|
(EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
|
|
|
|
def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETEQ)),
|
|
|
|
(EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
|
|
|
|
def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETEQ)),
|
|
|
|
(EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
|
|
|
|
|
|
|
|
// For an immediate wider than 16 bits, the default code would materialize the
|
|
|
|
// constant, then compare against it, like this:
|
|
|
|
// lis r2, 4660
|
|
|
|
// ori r2, r2, 22136
|
|
|
|
// cmpw cr0, r3, r2
|
|
|
|
// beq cr0,L6
|
|
|
|
// Since we are just comparing for equality, we can emit this instead:
|
|
|
|
// xoris r0,r3,0x1234
|
|
|
|
// cmplwi cr0,r0,0x5678
|
|
|
|
// beq cr0,L6
|
|
|
|
|
|
|
|
def : Pat<(i1 (setcc i32:$s1, imm:$imm, SETEQ)),
|
|
|
|
(EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
|
|
|
|
(LO16 imm:$imm)), sub_eq)>;
|
|
|
|
|
|
|
|
defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)),
|
|
|
|
(EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)),
|
|
|
|
(EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)),
|
|
|
|
(EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)),
|
|
|
|
(EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)),
|
|
|
|
(EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)),
|
|
|
|
(EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
|
|
|
|
|
|
|
|
defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)),
|
|
|
|
(EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
|
|
|
|
(LO16 imm:$imm)), sub_eq)>;
|
|
|
|
|
|
|
|
def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETULT)),
|
|
|
|
(EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
|
|
|
|
def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETLT)),
|
|
|
|
(EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
|
|
|
|
def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETUGT)),
|
|
|
|
(EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
|
|
|
|
def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETGT)),
|
|
|
|
(EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
|
|
|
|
def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETEQ)),
|
|
|
|
(EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
|
|
|
|
|
|
|
|
defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)),
|
|
|
|
(EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)),
|
|
|
|
(EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)),
|
|
|
|
(EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)),
|
|
|
|
(EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)),
|
|
|
|
(EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
|
|
|
|
|
|
|
|
// SETCC for i64.
|
|
|
|
def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULT)),
|
|
|
|
(EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
|
|
|
|
def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLT)),
|
|
|
|
(EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
|
|
|
|
def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGT)),
|
|
|
|
(EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
|
|
|
|
def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGT)),
|
|
|
|
(EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
|
|
|
|
def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETEQ)),
|
|
|
|
(EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
|
|
|
|
def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETEQ)),
|
|
|
|
(EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;
|
|
|
|
|
|
|
|
// For an immediate wider than 16 bits, the default code would materialize the
|
|
|
|
// constant, then compare against it, like this:
|
|
|
|
// lis r2, 4660
|
|
|
|
// ori r2, r2, 22136
|
|
|
|
// cmpd cr0, r3, r2
|
|
|
|
// beq cr0,L6
|
|
|
|
// Since we are just comparing for equality, we can emit this instead:
|
|
|
|
// xoris r0,r3,0x1234
|
|
|
|
// cmpldi cr0,r0,0x5678
|
|
|
|
// beq cr0,L6
|
|
|
|
|
|
|
|
def : Pat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETEQ)),
|
|
|
|
(EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
|
|
|
|
(LO16 imm:$imm)), sub_eq)>;
|
|
|
|
|
|
|
|
defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGE)),
|
|
|
|
(EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGE)),
|
|
|
|
(EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULE)),
|
|
|
|
(EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLE)),
|
|
|
|
(EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETNE)),
|
|
|
|
(EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETNE)),
|
|
|
|
(EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;
|
|
|
|
|
|
|
|
defm : CRNotPat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)),
|
|
|
|
(EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
|
|
|
|
(LO16 imm:$imm)), sub_eq)>;
|
|
|
|
|
|
|
|
def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)),
|
|
|
|
(EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
|
|
|
|
def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)),
|
|
|
|
(EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
|
|
|
|
def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)),
|
|
|
|
(EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
|
|
|
|
def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)),
|
|
|
|
(EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
|
|
|
|
def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)),
|
|
|
|
(EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
|
|
|
|
|
|
|
|
defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETUGE)),
|
|
|
|
(EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETGE)),
|
|
|
|
(EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETULE)),
|
|
|
|
(EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)),
|
|
|
|
(EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)),
|
|
|
|
(EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
|
|
|
|
|
|
|
|
// SETCC for f32.
|
|
|
|
def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
|
|
|
|
def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
|
|
|
|
def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOGT)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
|
|
|
|
def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETGT)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
|
|
|
|
def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOEQ)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
|
|
|
|
def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
|
|
|
|
def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
|
|
|
|
|
|
|
|
defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
|
|
|
|
|
|
|
|
// SETCC for f64.
|
|
|
|
def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
|
|
|
|
def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETLT)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
|
|
|
|
def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOGT)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
|
|
|
|
def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETGT)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
|
|
|
|
def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOEQ)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
|
|
|
|
def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
|
|
|
|
def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
|
|
|
|
|
|
|
|
defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
|
|
|
|
defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)),
|
|
|
|
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
|
|
|
|
|
|
|
|
// match select on i1 variables:
|
|
|
|
def : Pat<(i1 (select i1:$cond, i1:$tval, i1:$fval)),
|
|
|
|
(CROR (CRAND $cond , $tval),
|
|
|
|
(CRAND (crnot $cond), $fval))>;
|
|
|
|
|
|
|
|
// match selectcc on i1 variables:
|
|
|
|
// select (lhs == rhs), tval, fval is:
|
|
|
|
// ((lhs == rhs) & tval) | (!(lhs == rhs) & fval)
|
|
|
|
def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLT)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(CROR (CRAND (CRANDC $lhs, $rhs), $tval),
|
|
|
|
(CRAND (CRORC $rhs, $lhs), $fval))>;
|
|
|
|
// i1 selectcc, unsigned SETULT. Operands are swapped relative to the signed
// SETLT pattern: $tval is gated by (CRANDC $rhs, $lhs), i.e. $rhs set and
// $lhs clear (0 < 1 unsigned); $fval is gated by (CRORC $lhs, $rhs), the
// complement (reading crandc as A & ~B and crorc as A | ~B, per the Power ISA).
def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETULT)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(CROR (CRAND (CRANDC $rhs, $lhs), $tval),
|
|
|
|
(CRAND (CRORC $lhs, $rhs), $fval))>;
|
|
|
|
// i1 selectcc, signed SETLE. $tval is gated by (CRORC $lhs, $rhs), i.e. $lhs
// set or $rhs clear; with a set bit reading as -1 that is lhs <= rhs. $fval
// is gated by (CRANDC $rhs, $lhs), the complement of that condition (reading
// crorc as A | ~B and crandc as A & ~B, per the Power ISA).
def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLE)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(CROR (CRAND (CRORC $lhs, $rhs), $tval),
|
|
|
|
(CRAND (CRANDC $rhs, $lhs), $fval))>;
|
|
|
|
// i1 selectcc, unsigned SETULE. Operands are swapped relative to the signed
// SETLE pattern: $tval is gated by (CRORC $rhs, $lhs), i.e. $rhs set or $lhs
// clear; $fval is gated by (CRANDC $lhs, $rhs), the complement (reading crorc
// as A | ~B and crandc as A & ~B, per the Power ISA).
def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETULE)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(CROR (CRAND (CRORC $rhs, $lhs), $tval),
|
|
|
|
(CRAND (CRANDC $lhs, $rhs), $fval))>;
|
|
|
|
// i1 selectcc, SETEQ. $tval is gated by (CREQV $lhs, $rhs) (bits equal) and
// $fval by (CRXOR $lhs, $rhs) (bits differ); CROR merges the two arms. This
// is the ((lhs == rhs) & tval) | (!(lhs == rhs) & fval) form directly.
def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETEQ)),
|
|
|
|
(CROR (CRAND (CREQV $lhs, $rhs), $tval),
|
|
|
|
(CRAND (CRXOR $lhs, $rhs), $fval))>;
|
|
|
|
// i1 selectcc, signed SETGE. $tval is gated by (CRORC $rhs, $lhs), i.e. $rhs
// set or $lhs clear; with a set bit reading as -1 that is lhs >= rhs. $fval
// is gated by (CRANDC $lhs, $rhs), the complement of that condition (reading
// crorc as A | ~B and crandc as A & ~B, per the Power ISA).
def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGE)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(CROR (CRAND (CRORC $rhs, $lhs), $tval),
|
|
|
|
(CRAND (CRANDC $lhs, $rhs), $fval))>;
|
|
|
|
// i1 selectcc, unsigned SETUGE. Operands are swapped relative to the signed
// SETGE pattern: $tval is gated by (CRORC $lhs, $rhs), i.e. $lhs set or $rhs
// clear; $fval is gated by (CRANDC $rhs, $lhs), the complement (reading crorc
// as A | ~B and crandc as A & ~B, per the Power ISA).
def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETUGE)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(CROR (CRAND (CRORC $lhs, $rhs), $tval),
|
|
|
|
(CRAND (CRANDC $rhs, $lhs), $fval))>;
|
|
|
|
// i1 selectcc, signed SETGT. $tval is gated by (CRANDC $rhs, $lhs), i.e. $rhs
// set and $lhs clear; with a set bit reading as -1 that is lhs > rhs. $fval
// is gated by (CRORC $lhs, $rhs), the complement of that condition (reading
// crandc as A & ~B and crorc as A | ~B, per the Power ISA).
def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGT)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(CROR (CRAND (CRANDC $rhs, $lhs), $tval),
|
|
|
|
(CRAND (CRORC $lhs, $rhs), $fval))>;
|
|
|
|
// i1 selectcc, unsigned SETUGT. Operands are swapped relative to the signed
// SETGT pattern: $tval is gated by (CRANDC $lhs, $rhs), i.e. $lhs set and
// $rhs clear (1 > 0 unsigned); $fval is gated by (CRORC $rhs, $lhs), the
// complement (reading crandc as A & ~B and crorc as A | ~B, per the Power ISA).
def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETUGT)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(CROR (CRAND (CRANDC $lhs, $rhs), $tval),
|
|
|
|
(CRAND (CRORC $rhs, $lhs), $fval))>;
|
|
|
|
// i1 selectcc, SETNE. Same two terms as the SETEQ pattern with the value
// operands exchanged: (CREQV $lhs, $rhs) (bits equal) gates $fval and
// (CRXOR $lhs, $rhs) (bits differ) gates $tval; CROR merges the two arms.
def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETNE)),
|
|
|
|
(CROR (CRAND (CREQV $lhs, $rhs), $fval),
|
|
|
|
(CRAND (CRXOR $lhs, $rhs), $tval))>;
|
|
|
|
|
|
|
|
// match selectcc on i1 variables with non-i1 output.
|
|
|
|
// i32-valued selectcc on i1 operands, signed SETLT: SELECT_I4 is given the
// condition bit (CRANDC $lhs, $rhs) ($lhs set, $rhs clear; a set signed i1
// reads as -1) followed by $tval and $fval.
def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETLT)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_I4 (CRANDC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
// i32-valued selectcc on i1 operands, unsigned SETULT: SELECT_I4 is given the
// condition bit (CRANDC $rhs, $lhs) ($rhs set, $lhs clear) followed by $tval
// and $fval. Operands are swapped relative to the signed SETLT pattern.
def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETULT)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_I4 (CRANDC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
// i32-valued selectcc on i1 operands, signed SETLE: SELECT_I4 is given the
// condition bit (CRORC $lhs, $rhs) ($lhs set or $rhs clear; a set signed i1
// reads as -1) followed by $tval and $fval.
def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETLE)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_I4 (CRORC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
// i32-valued selectcc on i1 operands, unsigned SETULE: SELECT_I4 is given the
// condition bit (CRORC $rhs, $lhs) ($rhs set or $lhs clear) followed by $tval
// and $fval. Operands are swapped relative to the signed SETLE pattern.
def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETULE)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_I4 (CRORC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
// i32-valued selectcc on i1 operands, SETEQ: SELECT_I4 is given the condition
// bit (CREQV $lhs, $rhs) (bits equal) followed by $tval and $fval.
def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETEQ)),
|
|
|
|
(SELECT_I4 (CREQV $lhs, $rhs), $tval, $fval)>;
|
|
|
|
// i32-valued selectcc on i1 operands, signed SETGE: SELECT_I4 is given the
// condition bit (CRORC $rhs, $lhs) ($rhs set or $lhs clear; a set signed i1
// reads as -1) followed by $tval and $fval.
def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETGE)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_I4 (CRORC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
// i32-valued selectcc on i1 operands, unsigned SETUGE: SELECT_I4 is given the
// condition bit (CRORC $lhs, $rhs) ($lhs set or $rhs clear) followed by $tval
// and $fval. Operands are swapped relative to the signed SETGE pattern.
def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETUGE)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_I4 (CRORC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
// i32-valued selectcc on i1 operands, signed SETGT: SELECT_I4 is given the
// condition bit (CRANDC $rhs, $lhs) ($rhs set, $lhs clear; a set signed i1
// reads as -1) followed by $tval and $fval.
def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETGT)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_I4 (CRANDC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
// i32-valued selectcc on i1 operands, unsigned SETUGT: SELECT_I4 is given the
// condition bit (CRANDC $lhs, $rhs) ($lhs set, $rhs clear) followed by $tval
// and $fval. Operands are swapped relative to the signed SETGT pattern.
def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETUGT)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_I4 (CRANDC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
// i32-valued selectcc on i1 operands, SETNE: SELECT_I4 is given the condition
// bit (CRXOR $lhs, $rhs) (bits differ) followed by $tval and $fval.
def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETNE)),
|
|
|
|
(SELECT_I4 (CRXOR $lhs, $rhs), $tval, $fval)>;
|
|
|
|
|
|
|
|
// i64-valued selectcc on i1 operands, signed SETLT: SELECT_I8 is given the
// condition bit (CRANDC $lhs, $rhs) ($lhs set, $rhs clear; a set signed i1
// reads as -1) followed by $tval and $fval. Mirrors the i32 SETLT pattern.
def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETLT)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_I8 (CRANDC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
// i64-valued selectcc on i1 operands, unsigned SETULT: SELECT_I8 is given the
// condition bit (CRANDC $rhs, $lhs) ($rhs set, $lhs clear) followed by $tval
// and $fval. Operands are swapped relative to the signed SETLT pattern.
def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETULT)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_I8 (CRANDC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
// i64-valued selectcc on i1 operands, signed SETLE: SELECT_I8 is given the
// condition bit (CRORC $lhs, $rhs) ($lhs set or $rhs clear; a set signed i1
// reads as -1) followed by $tval and $fval. Mirrors the i32 SETLE pattern.
def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETLE)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_I8 (CRORC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETULE)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_I8 (CRORC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
// i64 select on i1 equality: the condition bit is (CREQV $lhs, $rhs) and
// SELECT_I8 uses it to pick $tval or $fval.
// NOTE(review): presumably creqv yields 1 when both CR bits are equal --
// confirm against the Power ISA.
def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETEQ)),
|
|
|
|
(SELECT_I8 (CREQV $lhs, $rhs), $tval, $fval)>;
|
|
|
|
// i64 select on a signed i1 compare "lhs >= rhs": same CRORC form as the
// SETLE pattern but with the operands swapped, i.e. (CRORC $rhs, $lhs).
// NOTE(review): presumably crorc computes a | ~b and a signed i1 "true" is
// -1 -- confirm against the Power ISA.
def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETGE)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_I8 (CRORC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETUGE)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_I8 (CRORC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
// i64 select on a signed i1 compare "lhs > rhs": the condition bit is
// (CRANDC $rhs, $lhs), which SELECT_I8 uses to pick $tval or $fval.
// NOTE(review): presumably crandc computes a & ~b and a signed i1 "true" is
// -1, so "greater" means lhs clear and rhs set -- confirm against the ISA.
def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETGT)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_I8 (CRANDC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETUGT)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_I8 (CRANDC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
// i64 select on i1 inequality: the condition bit is (CRXOR $lhs, $rhs) and
// SELECT_I8 uses it to pick $tval or $fval.
// NOTE(review): presumably crxor yields 1 when the two CR bits differ --
// confirm against the Power ISA.
def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETNE)),
|
|
|
|
(SELECT_I8 (CRXOR $lhs, $rhs), $tval, $fval)>;
|
|
|
|
|
|
|
|
// f32 select on a signed i1 compare "lhs < rhs": the condition bit is
// (CRANDC $lhs, $rhs), which SELECT_F4 uses to pick $tval or $fval.
// NOTE(review): presumably crandc computes a & ~b and a signed i1 "true" is
// -1, so "less" means lhs set and rhs clear -- confirm against the ISA.
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_F4 (CRANDC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_F4 (CRANDC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
// f32 select on a signed i1 compare "lhs <= rhs": the condition bit is built
// as (CRORC $lhs, $rhs) and SELECT_F4 uses it to pick $tval or $fval.
// NOTE(review): presumably crorc computes lhs | ~rhs and a signed i1 "true"
// is -1 (orders below "false") -- confirm against the Power ISA.
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_F4 (CRORC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULE)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_F4 (CRORC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
// f32 select on i1 equality: the condition bit is (CREQV $lhs, $rhs) and
// SELECT_F4 uses it to pick $tval or $fval.
// NOTE(review): presumably creqv yields 1 when both CR bits are equal --
// confirm against the Power ISA.
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)),
|
|
|
|
(SELECT_F4 (CREQV $lhs, $rhs), $tval, $fval)>;
|
|
|
|
// f32 select on a signed i1 compare "lhs >= rhs": same CRORC form as the
// SETLE pattern but with the operands swapped, i.e. (CRORC $rhs, $lhs).
// NOTE(review): presumably crorc computes a | ~b and a signed i1 "true" is
// -1 -- confirm against the Power ISA.
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_F4 (CRORC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGE)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_F4 (CRORC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
// f32 select on a signed i1 compare "lhs > rhs": the condition bit is
// (CRANDC $rhs, $lhs), which SELECT_F4 uses to pick $tval or $fval.
// NOTE(review): presumably crandc computes a & ~b and a signed i1 "true" is
// -1, so "greater" means lhs clear and rhs set -- confirm against the ISA.
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_F4 (CRANDC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGT)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_F4 (CRANDC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
// f32 select on i1 inequality: the condition bit is (CRXOR $lhs, $rhs) and
// SELECT_F4 uses it to pick $tval or $fval.
// NOTE(review): presumably crxor yields 1 when the two CR bits differ --
// confirm against the Power ISA.
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
|
|
|
|
(SELECT_F4 (CRXOR $lhs, $rhs), $tval, $fval)>;
|
|
|
|
|
|
|
|
// f64 select on a signed i1 compare "lhs < rhs": the condition bit is
// (CRANDC $lhs, $rhs), which SELECT_F8 uses to pick $tval or $fval.
// NOTE(review): presumably crandc computes a & ~b and a signed i1 "true" is
// -1, so "less" means lhs set and rhs clear -- confirm against the ISA.
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_F8 (CRANDC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULT)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_F8 (CRANDC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
// f64 select on a signed i1 compare "lhs <= rhs": the condition bit is built
// as (CRORC $lhs, $rhs) and SELECT_F8 uses it to pick $tval or $fval.
// NOTE(review): presumably crorc computes lhs | ~rhs and a signed i1 "true"
// is -1 (orders below "false") -- confirm against the Power ISA.
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_F8 (CRORC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULE)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_F8 (CRORC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
// f64 select on i1 equality: the condition bit is (CREQV $lhs, $rhs) and
// SELECT_F8 uses it to pick $tval or $fval.
// NOTE(review): presumably creqv yields 1 when both CR bits are equal --
// confirm against the Power ISA.
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)),
|
|
|
|
(SELECT_F8 (CREQV $lhs, $rhs), $tval, $fval)>;
|
|
|
|
// f64 select on a signed i1 compare "lhs >= rhs": same CRORC form as the
// SETLE pattern but with the operands swapped, i.e. (CRORC $rhs, $lhs).
// NOTE(review): presumably crorc computes a | ~b and a signed i1 "true" is
// -1 -- confirm against the Power ISA.
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_F8 (CRORC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGE)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_F8 (CRORC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_F8 (CRANDC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_F8 (CRANDC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
|
|
|
|
(SELECT_F8 (CRXOR $lhs, $rhs), $tval, $fval)>;
|
|
|
|
|
|
|
|
def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLT)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETULT)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_VRRC (CRANDC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLE)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_VRRC (CRORC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETULE)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_VRRC (CRORC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETEQ)),
|
|
|
|
(SELECT_VRRC (CREQV $lhs, $rhs), $tval, $fval)>;
|
|
|
|
def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGE)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_VRRC (CRORC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETUGE)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_VRRC (CRORC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGT)),
|
2015-08-31 06:12:50 +08:00
|
|
|
(SELECT_VRRC (CRANDC $rhs, $lhs), $tval, $fval)>;
|
|
|
|
def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETUGT)),
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>;
|
|
|
|
def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETNE)),
|
|
|
|
(SELECT_VRRC (CRXOR $lhs, $rhs), $tval, $fval)>;
|
|
|
|
|
|
|
|
let usesCustomInserter = 1 in {
|
|
|
|
def ANDIo_1_EQ_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in),
|
|
|
|
"#ANDIo_1_EQ_BIT",
|
|
|
|
[(set i1:$dst, (trunc (not i32:$in)))]>;
|
|
|
|
def ANDIo_1_GT_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in),
|
|
|
|
"#ANDIo_1_GT_BIT",
|
|
|
|
[(set i1:$dst, (trunc i32:$in))]>;
|
|
|
|
|
|
|
|
def ANDIo_1_EQ_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in),
|
|
|
|
"#ANDIo_1_EQ_BIT8",
|
|
|
|
[(set i1:$dst, (trunc (not i64:$in)))]>;
|
|
|
|
def ANDIo_1_GT_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in),
|
|
|
|
"#ANDIo_1_GT_BIT8",
|
|
|
|
[(set i1:$dst, (trunc i64:$in))]>;
|
|
|
|
}
|
|
|
|
|
|
|
|
def : Pat<(i1 (not (trunc i32:$in))),
|
|
|
|
(ANDIo_1_EQ_BIT $in)>;
|
|
|
|
def : Pat<(i1 (not (trunc i64:$in))),
|
|
|
|
(ANDIo_1_EQ_BIT8 $in)>;
|
2013-05-04 03:51:09 +08:00
|
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// PowerPC Instructions used for assembler/disassembler only
|
|
|
|
//
|
|
|
|
|
2014-08-05 21:34:01 +08:00
|
|
|
// FIXME: For B=0 or B > 4 (more than one 4-byte word), the registers following RT are used.
|
|
|
|
// WARNING: Do not add patterns for this instruction without fixing this.
|
|
|
|
def LSWI : XForm_base_r3xo<31, 597, (outs gprc:$RT), (ins gprc:$A, u5imm:$B),
|
|
|
|
"lswi $RT, $A, $B", IIC_LdStLoad, []>;
|
|
|
|
|
|
|
|
// FIXME: For B=0 or B > 4 (more than one 4-byte word), the registers following RT are used.
|
|
|
|
// WARNING: Do not add patterns for this instruction without fixing this.
|
|
|
|
def STSWI : XForm_base_r3xo<31, 725, (outs), (ins gprc:$RT, gprc:$A, u5imm:$B),
|
|
|
|
"stswi $RT, $A, $B", IIC_LdStLoad, []>;
|
|
|
|
|
2013-05-04 03:51:09 +08:00
|
|
|
def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"isync", IIC_SprISYNC, []>;
|
2013-05-04 03:51:09 +08:00
|
|
|
|
|
|
|
def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"icbi $src", IIC_LdStICBI, []>;
|
2013-05-04 03:51:09 +08:00
|
|
|
|
2015-02-06 02:57:02 +08:00
|
|
|
// This definition used to be named EIEIO, but names matching E[0-9A-Z] are
// reserved, so it is called EnforceIEIO instead.
|
|
|
|
def EnforceIEIO : XForm_24_eieio<31, 854, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"eieio", IIC_LdStLoad, []>;
|
2013-07-02 01:06:26 +08:00
|
|
|
|
2013-07-02 01:21:23 +08:00
|
|
|
def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L),
|
2013-11-28 07:26:09 +08:00
|
|
|
"wait $L", IIC_LdStLoad, []>;
|
2013-07-02 01:21:23 +08:00
|
|
|
|
2014-07-30 07:16:31 +08:00
|
|
|
def MBAR : XForm_mbar<31, 854, (outs), (ins u5imm:$MO),
|
|
|
|
"mbar $MO", IIC_LdStLoad>, Requires<[IsBookE]>;
|
|
|
|
|
2014-07-30 06:21:57 +08:00
|
|
|
def MTSR: XForm_sr<31, 210, (outs), (ins gprc:$RS, u4imm:$SR),
|
|
|
|
"mtsr $SR, $RS", IIC_SprMTSR>;
|
|
|
|
|
|
|
|
def MFSR: XForm_sr<31, 595, (outs gprc:$RS), (ins u4imm:$SR),
|
|
|
|
"mfsr $RS, $SR", IIC_SprMFSR>;
|
|
|
|
|
|
|
|
def MTSRIN: XForm_srin<31, 242, (outs), (ins gprc:$RS, gprc:$RB),
|
|
|
|
"mtsrin $RS, $RB", IIC_SprMTSR>;
|
|
|
|
|
|
|
|
def MFSRIN: XForm_srin<31, 659, (outs gprc:$RS), (ins gprc:$RB),
|
|
|
|
"mfsrin $RS, $RB", IIC_SprMFSR>;
|
|
|
|
|
2013-09-13 01:50:54 +08:00
|
|
|
def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, i32imm:$L),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtmsr $RS, $L", IIC_SprMTMSR>;
|
2013-09-13 01:50:54 +08:00
|
|
|
|
2014-07-30 18:32:51 +08:00
|
|
|
def WRTEE: XForm_mtmsr<31, 131, (outs), (ins gprc:$RS),
|
|
|
|
"wrtee $RS", IIC_SprMTMSR>, Requires<[IsBookE]> {
|
|
|
|
let L = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
def WRTEEI: I<31, (outs), (ins i1imm:$E), "wrteei $E", IIC_SprMTMSR>,
|
|
|
|
Requires<[IsBookE]> {
|
|
|
|
bits<1> E;
|
|
|
|
|
|
|
|
let Inst{16} = E;
|
|
|
|
let Inst{21-30} = 163;
|
|
|
|
}
|
|
|
|
|
2014-08-09 21:58:31 +08:00
|
|
|
def DCCCI : XForm_tlb<454, (outs), (ins gprc:$A, gprc:$B),
|
|
|
|
"dccci $A, $B", IIC_LdStLoad>, Requires<[IsPPC4xx]>;
|
|
|
|
def ICCCI : XForm_tlb<966, (outs), (ins gprc:$A, gprc:$B),
|
|
|
|
"iccci $A, $B", IIC_LdStLoad>, Requires<[IsPPC4xx]>;
|
|
|
|
|
|
|
|
// "dci 0" assembles to DCCCI with both register operands fixed to R0
// (only when the IsPPC4xx predicate holds).
def : InstAlias<"dci 0", (DCCCI R0, R0)>, Requires<[IsPPC4xx]>;
|
|
|
|
// Operand-less "dccci" is shorthand for DCCCI R0, R0 (IsPPC4xx only).
def : InstAlias<"dccci", (DCCCI R0, R0)>, Requires<[IsPPC4xx]>;
|
|
|
|
// "ici 0" assembles to ICCCI with both register operands fixed to R0
// (only when the IsPPC4xx predicate holds).
def : InstAlias<"ici 0", (ICCCI R0, R0)>, Requires<[IsPPC4xx]>;
|
|
|
|
// Operand-less "iccci" is shorthand for ICCCI R0, R0 (IsPPC4xx only).
def : InstAlias<"iccci", (ICCCI R0, R0)>, Requires<[IsPPC4xx]>;
|
2014-08-05 22:40:32 +08:00
|
|
|
|
2013-09-13 01:50:54 +08:00
|
|
|
def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mfmsr $RT", IIC_SprMFMSR, []>;
|
2013-09-13 01:50:54 +08:00
|
|
|
|
|
|
|
def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtmsrd $RS, $L", IIC_SprMTMSRD>;
|
2013-09-13 01:50:54 +08:00
|
|
|
|
2015-01-15 09:00:53 +08:00
|
|
|
def MCRFS : XLForm_3<63, 64, (outs crrc:$BF), (ins crrc:$BFA),
|
|
|
|
"mcrfs $BF, $BFA", IIC_BrMCR>;
|
|
|
|
|
|
|
|
def MTFSFI : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
|
|
|
|
"mtfsfi $BF, $U, $W", IIC_IntMFFS>;
|
|
|
|
|
|
|
|
def MTFSFIo : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
|
|
|
|
"mtfsfi. $BF, $U, $W", IIC_IntMFFS>, isDOT;
|
|
|
|
|
|
|
|
// Two-operand "mtfsfi": same as MTFSFI with its last operand ($W) fixed to 0.
def : InstAlias<"mtfsfi $BF, $U", (MTFSFI crrc:$BF, i32imm:$U, 0)>;
|
|
|
|
// Two-operand "mtfsfi.": same as MTFSFIo with its last operand ($W) fixed to 0.
def : InstAlias<"mtfsfi. $BF, $U", (MTFSFIo crrc:$BF, i32imm:$U, 0)>;
|
|
|
|
|
|
|
|
def MTFSF : XFLForm_1<63, 711, (outs),
|
|
|
|
(ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W),
|
|
|
|
"mtfsf $FLM, $FRB, $L, $W", IIC_IntMFFS, []>;
|
|
|
|
def MTFSFo : XFLForm_1<63, 711, (outs),
|
|
|
|
(ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W),
|
|
|
|
"mtfsf. $FLM, $FRB, $L, $W", IIC_IntMFFS, []>, isDOT;
|
|
|
|
|
|
|
|
// Two-operand "mtfsf": same as MTFSF with its trailing $L and $W operands
// both fixed to 0.
def : InstAlias<"mtfsf $FLM, $FRB", (MTFSF i32imm:$FLM, f8rc:$FRB, 0, 0)>;
|
|
|
|
// Two-operand "mtfsf.": same as MTFSFo with its trailing $L and $W operands
// both fixed to 0.
def : InstAlias<"mtfsf. $FLM, $FRB", (MTFSFo i32imm:$FLM, f8rc:$FRB, 0, 0)>;
|
|
|
|
|
2013-09-13 01:50:54 +08:00
|
|
|
def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"slbie $RB", IIC_SprSLBIE, []>;
|
2013-09-13 01:50:54 +08:00
|
|
|
|
|
|
|
def SLBMTE : XForm_26<31, 402, (outs), (ins gprc:$RS, gprc:$RB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"slbmte $RS, $RB", IIC_SprSLBMTE, []>;
|
2013-09-13 01:50:54 +08:00
|
|
|
|
|
|
|
def SLBMFEE : XForm_26<31, 915, (outs gprc:$RT), (ins gprc:$RB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"slbmfee $RT, $RB", IIC_SprSLBMFEE, []>;
|
2013-09-13 01:50:54 +08:00
|
|
|
|
2016-09-03 07:42:01 +08:00
|
|
|
def SLBMFEV : XLForm_1_gen<31, 851, (outs gprc:$RT), (ins gprc:$RB),
|
|
|
|
"slbmfev $RT, $RB", IIC_SprSLBMFEV, []>;
|
|
|
|
|
2013-11-28 07:26:09 +08:00
|
|
|
// slbia: no input or output operands and an empty selection-pattern list;
// defined for the assembler/disassembler only.
def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", IIC_SprSLBIA, []>;
|
2013-09-13 01:50:54 +08:00
|
|
|
|
2014-08-03 04:16:29 +08:00
|
|
|
def TLBIA : XForm_0<31, 370, (outs), (ins),
|
|
|
|
"tlbia", IIC_SprTLBIA, []>;
|
|
|
|
|
2013-09-13 01:50:54 +08:00
|
|
|
def TLBSYNC : XForm_0<31, 566, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"tlbsync", IIC_SprTLBSYNC, []>;
|
2013-09-13 01:50:54 +08:00
|
|
|
|
|
|
|
def TLBIEL : XForm_16b<31, 274, (outs), (ins gprc:$RB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"tlbiel $RB", IIC_SprTLBIEL, []>;
|
2013-09-13 01:50:54 +08:00
|
|
|
|
2014-08-05 07:49:45 +08:00
|
|
|
def TLBLD : XForm_16b<31, 978, (outs), (ins gprc:$RB),
|
|
|
|
"tlbld $RB", IIC_LdStLoad, []>, Requires<[IsPPC6xx]>;
|
|
|
|
def TLBLI : XForm_16b<31, 1010, (outs), (ins gprc:$RB),
|
|
|
|
"tlbli $RB", IIC_LdStLoad, []>, Requires<[IsPPC6xx]>;
|
|
|
|
|
2013-09-13 01:50:54 +08:00
|
|
|
def TLBIE : XForm_26<31, 306, (outs), (ins gprc:$RS, gprc:$RB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"tlbie $RB,$RS", IIC_SprTLBIE, []>;
|
2013-09-13 01:50:54 +08:00
|
|
|
|
2014-07-31 06:51:15 +08:00
|
|
|
def TLBSX : XForm_tlb<914, (outs), (ins gprc:$A, gprc:$B), "tlbsx $A, $B",
|
|
|
|
IIC_LdStLoad>, Requires<[IsBookE]>;
|
|
|
|
|
|
|
|
def TLBIVAX : XForm_tlb<786, (outs), (ins gprc:$A, gprc:$B), "tlbivax $A, $B",
|
|
|
|
IIC_LdStLoad>, Requires<[IsBookE]>;
|
2014-07-31 04:44:04 +08:00
|
|
|
|
|
|
|
def TLBRE : XForm_24_eieio<31, 946, (outs), (ins),
|
|
|
|
"tlbre", IIC_LdStLoad, []>, Requires<[IsBookE]>;
|
|
|
|
|
|
|
|
def TLBWE : XForm_24_eieio<31, 978, (outs), (ins),
|
|
|
|
"tlbwe", IIC_LdStLoad, []>, Requires<[IsBookE]>;
|
|
|
|
|
2014-08-05 05:28:22 +08:00
|
|
|
def TLBRE2 : XForm_tlbws<31, 946, (outs gprc:$RS), (ins gprc:$A, i1imm:$WS),
|
|
|
|
"tlbre $RS, $A, $WS", IIC_LdStLoad, []>, Requires<[IsPPC4xx]>;
|
|
|
|
|
|
|
|
def TLBWE2 : XForm_tlbws<31, 978, (outs), (ins gprc:$RS, gprc:$A, i1imm:$WS),
|
|
|
|
"tlbwe $RS, $A, $WS", IIC_LdStLoad, []>, Requires<[IsPPC4xx]>;
|
|
|
|
|
|
|
|
def TLBSX2 : XForm_base_r3xo<31, 914, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
|
|
|
|
"tlbsx $RST, $A, $B", IIC_LdStLoad, []>,
|
|
|
|
Requires<[IsPPC4xx]>;
|
|
|
|
def TLBSX2D : XForm_base_r3xo<31, 914, (outs),
|
|
|
|
(ins gprc:$RST, gprc:$A, gprc:$B),
|
|
|
|
"tlbsx. $RST, $A, $B", IIC_LdStLoad, []>,
|
|
|
|
Requires<[IsPPC4xx]>, isDOT;
|
|
|
|
|
2014-08-07 20:39:59 +08:00
|
|
|
// rfid: no input or output operands and an empty selection-pattern list.
// Unlike RFI/RFCI below, this definition carries no Requires<> predicate.
def RFID : XForm_0<19, 18, (outs), (ins), "rfid", IIC_IntRFID, []>;
|
|
|
|
|
2014-08-07 20:35:16 +08:00
|
|
|
def RFI : XForm_0<19, 50, (outs), (ins), "rfi", IIC_SprRFI, []>,
|
2014-07-30 07:45:20 +08:00
|
|
|
Requires<[IsBookE]>;
|
|
|
|
def RFCI : XForm_0<19, 51, (outs), (ins), "rfci", IIC_BrB, []>,
|
|
|
|
Requires<[IsBookE]>;
|
2014-07-29 23:49:09 +08:00
|
|
|
|
2014-08-04 23:47:38 +08:00
|
|
|
def RFDI : XForm_0<19, 39, (outs), (ins), "rfdi", IIC_BrB, []>,
|
|
|
|
Requires<[IsE500]>;
|
|
|
|
def RFMCI : XForm_0<19, 38, (outs), (ins), "rfmci", IIC_BrB, []>,
|
|
|
|
Requires<[IsE500]>;
|
2014-07-31 05:09:03 +08:00
|
|
|
|
2014-08-03 04:00:26 +08:00
|
|
|
def MFDCR : XFXForm_1<31, 323, (outs gprc:$RT), (ins i32imm:$SPR),
|
2014-08-04 23:47:38 +08:00
|
|
|
"mfdcr $RT, $SPR", IIC_SprMFSPR>, Requires<[IsPPC4xx]>;
|
2014-08-03 04:00:26 +08:00
|
|
|
def MTDCR : XFXForm_1<31, 451, (outs), (ins gprc:$RT, i32imm:$SPR),
|
2014-08-04 23:47:38 +08:00
|
|
|
"mtdcr $SPR, $RT", IIC_SprMTSPR>, Requires<[IsPPC4xx]>;
|
2014-08-03 04:00:26 +08:00
|
|
|
|
2016-09-03 07:42:01 +08:00
|
|
|
// hrfid: no input or output operands and an empty selection-pattern list.
def HRFID : XLForm_1_np<19, 274, (outs), (ins), "hrfid", IIC_BrB, []>;
|
|
|
|
// nap: no input or output operands and an empty selection-pattern list.
def NAP : XLForm_1_np<19, 434, (outs), (ins), "nap", IIC_BrB, []>;
|
|
|
|
|
2014-11-25 08:30:11 +08:00
|
|
|
// attn: no input or output operands; uses the dedicated XForm_attn format.
def ATTN : XForm_attn<0, 256, (outs), (ins), "attn", IIC_BrB>;
|
|
|
|
|
2014-11-30 18:15:56 +08:00
|
|
|
def LBZCIX : XForm_base_r3xo<31, 853, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
|
|
|
|
"lbzcix $RST, $A, $B", IIC_LdStLoad, []>;
|
|
|
|
def LHZCIX : XForm_base_r3xo<31, 821, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
|
|
|
|
"lhzcix $RST, $A, $B", IIC_LdStLoad, []>;
|
|
|
|
def LWZCIX : XForm_base_r3xo<31, 789, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
|
|
|
|
"lwzcix $RST, $A, $B", IIC_LdStLoad, []>;
|
|
|
|
def LDCIX : XForm_base_r3xo<31, 885, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
|
|
|
|
"ldcix $RST, $A, $B", IIC_LdStLoad, []>;
|
|
|
|
|
|
|
|
def STBCIX : XForm_base_r3xo<31, 981, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
|
|
|
|
"stbcix $RST, $A, $B", IIC_LdStLoad, []>;
|
|
|
|
def STHCIX : XForm_base_r3xo<31, 949, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
|
|
|
|
"sthcix $RST, $A, $B", IIC_LdStLoad, []>;
|
|
|
|
def STWCIX : XForm_base_r3xo<31, 917, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
|
|
|
|
"stwcix $RST, $A, $B", IIC_LdStLoad, []>;
|
|
|
|
def STDCIX : XForm_base_r3xo<31, 1013, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
|
|
|
|
"stdcix $RST, $A, $B", IIC_LdStLoad, []>;
|
|
|
|
|
2013-05-04 03:50:27 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// PowerPC Assembler Instruction Aliases
|
|
|
|
//
|
|
|
|
|
|
|
|
// Pseudo-instructions for alternate assembly syntax (never used by codegen).
|
|
|
|
// These are aliases that require C++ handling to convert to the target
|
|
|
|
// instruction, while InstAliases can be handled directly by tblgen.
|
|
|
|
// Base class for assembler-only pseudo instructions.
//   asm  - assembly string for the pseudo.
//   iops - input operand dag; the output operand list is always empty.
class PPCAsmPseudo<string asm, dag iops>
  : Instruction {
  let Namespace = "PPC";
  bit PPC64 = 0;  // Default value, override with isPPC64

  // Operands and syntax: inputs only, no selection pattern.
  let OutOperandList = (outs);
  let InOperandList = iops;
  let AsmString = asm;
  let Pattern = [];

  // Flagged as both a pseudo and an asm-parser-only instruction.
  let isPseudo = 1;
  let isAsmParserOnly = 1;
}
|
|
|
|
|
2013-06-11 01:19:43 +08:00
|
|
|
// Operand-less spellings that expand to an instruction with a fixed immediate.

// "sc" -> SC 0.
def : InstAlias<"sc", (SC 0)>;

// SYNC variants (all require HasSYNC): the immediate selects the flavour.
def : InstAlias<"sync",    (SYNC 0)>,    Requires<[HasSYNC]>;
// NOTE(review): the extra trailing 0 argument presumably makes "msync" a
// parse-only spelling of SYNC 0 -- confirm against the InstAlias class.
def : InstAlias<"msync",   (SYNC 0), 0>, Requires<[HasSYNC]>;
def : InstAlias<"lwsync",  (SYNC 1)>,    Requires<[HasSYNC]>;
def : InstAlias<"ptesync", (SYNC 2)>,    Requires<[HasSYNC]>;

// WAIT variants: immediate 0, 1 or 2.
def : InstAlias<"wait",     (WAIT 0)>;
def : InstAlias<"waitrsv",  (WAIT 1)>;
def : InstAlias<"waitimpl", (WAIT 2)>;

// "mbar" with no operand -> MBAR 0 (requires IsBookE).
def : InstAlias<"mbar", (MBAR 0)>, Requires<[IsBookE]>;
|
|
|
|
|
2015-04-24 06:47:57 +08:00
|
|
|
// Assembler pseudos for the dcbt / dcbtst / dcbf mnemonic families. Each takes
// a memrr address operand; the *CT and *DS forms also take a u5imm $TH.

// dcbt / dcbtst with only an address operand.
def DCBTx   : PPCAsmPseudo<"dcbt $dst",
                           (ins memrr:$dst)>;
def DCBTSTx : PPCAsmPseudo<"dcbtst $dst",
                           (ins memrr:$dst)>;

// dcbt variants.
def DCBTCT : PPCAsmPseudo<"dcbtct $dst, $TH",
                          (ins memrr:$dst, u5imm:$TH)>;
def DCBTDS : PPCAsmPseudo<"dcbtds $dst, $TH",
                          (ins memrr:$dst, u5imm:$TH)>;
def DCBTT  : PPCAsmPseudo<"dcbtt $dst",
                          (ins memrr:$dst)>;

// dcbtst variants.
def DCBTSTCT : PPCAsmPseudo<"dcbtstct $dst, $TH",
                            (ins memrr:$dst, u5imm:$TH)>;
def DCBTSTDS : PPCAsmPseudo<"dcbtstds $dst, $TH",
                            (ins memrr:$dst, u5imm:$TH)>;
def DCBTSTT  : PPCAsmPseudo<"dcbtstt $dst",
                            (ins memrr:$dst)>;

// dcbf variants.
def DCBFx  : PPCAsmPseudo<"dcbf $dst",
                          (ins memrr:$dst)>;
def DCBFL  : PPCAsmPseudo<"dcbfl $dst",
                          (ins memrr:$dst)>;
def DCBFLP : PPCAsmPseudo<"dcbflp $dst",
                          (ins memrr:$dst)>;
|
|
|
|
|
2013-07-02 05:40:54 +08:00
|
|
|
// Condition-register bit shorthands, expressed as CR logical instructions
// with repeated operands.

// One-operand forms: the same bit is used for all three operands.
def : InstAlias<"crset $bx",
                (CREQV crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
def : InstAlias<"crclr $bx",
                (CRXOR crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;

// Two-operand forms: $by is used for both source operands.
def : InstAlias<"crmove $bx, $by",
                (CROR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
def : InstAlias<"crnot $bx, $by",
                (CRNOR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
|
|
|
|
|
2013-07-03 20:32:41 +08:00
|
|
|
// Named move-to/move-from aliases: each mnemonic expands to MTSPR/MFSPR with
// a fixed SPR number (shown in the trailing comment).

def : InstAlias<"mtxer $Rx", (MTSPR 1, gprc:$Rx)>;      // SPR 1
def : InstAlias<"mfxer $Rx", (MFSPR gprc:$Rx, 1)>;

def : InstAlias<"mfrtcu $Rx", (MFSPR gprc:$Rx, 4)>;     // SPR 4
def : InstAlias<"mfrtcl $Rx", (MFSPR gprc:$Rx, 5)>;     // SPR 5

def : InstAlias<"mtdscr $Rx", (MTSPR 17, gprc:$Rx)>;    // SPR 17
def : InstAlias<"mfdscr $Rx", (MFSPR gprc:$Rx, 17)>;

def : InstAlias<"mtdsisr $Rx", (MTSPR 18, gprc:$Rx)>;   // SPR 18
def : InstAlias<"mfdsisr $Rx", (MFSPR gprc:$Rx, 18)>;

def : InstAlias<"mtdar $Rx", (MTSPR 19, gprc:$Rx)>;     // SPR 19
def : InstAlias<"mfdar $Rx", (MFSPR gprc:$Rx, 19)>;

def : InstAlias<"mtdec $Rx", (MTSPR 22, gprc:$Rx)>;     // SPR 22
def : InstAlias<"mfdec $Rx", (MFSPR gprc:$Rx, 22)>;

def : InstAlias<"mtsdr1 $Rx", (MTSPR 25, gprc:$Rx)>;    // SPR 25
def : InstAlias<"mfsdr1 $Rx", (MFSPR gprc:$Rx, 25)>;

def : InstAlias<"mtsrr0 $Rx", (MTSPR 26, gprc:$Rx)>;    // SPR 26
def : InstAlias<"mfsrr0 $Rx", (MFSPR gprc:$Rx, 26)>;

def : InstAlias<"mtsrr1 $Rx", (MTSPR 27, gprc:$Rx)>;    // SPR 27
def : InstAlias<"mfsrr1 $Rx", (MFSPR gprc:$Rx, 27)>;
|
|
|
|
|
2014-08-05 22:53:05 +08:00
|
|
|
// More fixed-SPR aliases. srr2/srr3 require IsPPC4xx, pid requires IsBookE;
// cfar and amr carry no predicate.

def : InstAlias<"mtsrr2 $Rx", (MTSPR 990, gprc:$Rx)>,
      Requires<[IsPPC4xx]>;
def : InstAlias<"mfsrr2 $Rx", (MFSPR gprc:$Rx, 990)>,
      Requires<[IsPPC4xx]>;

def : InstAlias<"mtsrr3 $Rx", (MTSPR 991, gprc:$Rx)>,
      Requires<[IsPPC4xx]>;
def : InstAlias<"mfsrr3 $Rx", (MFSPR gprc:$Rx, 991)>,
      Requires<[IsPPC4xx]>;

def : InstAlias<"mtcfar $Rx", (MTSPR 28, gprc:$Rx)>;
def : InstAlias<"mfcfar $Rx", (MFSPR gprc:$Rx, 28)>;

def : InstAlias<"mtamr $Rx", (MTSPR 29, gprc:$Rx)>;
def : InstAlias<"mfamr $Rx", (MFSPR gprc:$Rx, 29)>;

def : InstAlias<"mtpid $Rx", (MTSPR 48, gprc:$Rx)>,
      Requires<[IsBookE]>;
def : InstAlias<"mfpid $Rx", (MFSPR gprc:$Rx, 48)>,
      Requires<[IsBookE]>;
|
|
|
|
|
2013-07-08 23:20:38 +08:00
|
|
|
// Time-base aliases.

// Reads go through MFTB; "mftb" and "mftbl" both select 268, "mftbu" 269.
def : InstAlias<"mftb $Rx",  (MFTB gprc:$Rx, 268)>;
def : InstAlias<"mftbl $Rx", (MFTB gprc:$Rx, 268)>;
def : InstAlias<"mftbu $Rx", (MFTB gprc:$Rx, 269)>;

// Writes go through MTSPR with SPR 284 / 285.
def : InstAlias<"mttbl $Rx", (MTSPR 284, gprc:$Rx)>;
def : InstAlias<"mttbu $Rx", (MTSPR 285, gprc:$Rx)>;

// IsPPC4xx spellings use SPR 989 (lo) and 988 (hi) for both directions.
def : InstAlias<"mftblo $Rx", (MFSPR gprc:$Rx, 989)>,
      Requires<[IsPPC4xx]>;
def : InstAlias<"mttblo $Rx", (MTSPR 989, gprc:$Rx)>,
      Requires<[IsPPC4xx]>;
def : InstAlias<"mftbhi $Rx", (MFSPR gprc:$Rx, 988)>,
      Requires<[IsPPC4xx]>;
def : InstAlias<"mttbhi $Rx", (MTSPR 988, gprc:$Rx)>,
      Requires<[IsPPC4xx]>;
|
|
|
|
|
2013-06-25 02:08:03 +08:00
|
|
|
// "xnop" -> XORI R0, R0, 0.
def : InstAlias<"xnop", (XORI R0, R0, 0)>;

// Register move: OR8 with $rB as both source operands ("mr." uses OR8o).
def : InstAlias<"mr $rA, $rB",
                (OR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
def : InstAlias<"mr. $rA, $rB",
                (OR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>;

// Complement: NOR8 with $rB as both source operands ("not." uses NOR8o).
def : InstAlias<"not $rA, $rB",
                (NOR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
def : InstAlias<"not. $rA, $rB",
                (NOR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>;

// "mtcr" -> MTCRF8 with the immediate fixed at 255.
def : InstAlias<"mtcr $rA", (MTCRF8 255, g8rc:$rA)>;
|
|
|
|
|
2014-08-05 01:07:41 +08:00
|
|
|
// BAT register aliases for indices 0-3 (IsPPC6xx only). The SPR number is
// base + 2*BATR, with bases 536 (dbatu), 537 (dbatl), 528 (ibatu), 529 (ibatl).
// The mt* forms put the index first; the mf* forms put it last.
foreach BATR = 0-3 in {
  // dbatu: 536 + 2*BATR
  def : InstAlias<"mtdbatu "#BATR#", $Rx",
                  (MTSPR !add(!add(BATR, BATR), 536), gprc:$Rx)>,
        Requires<[IsPPC6xx]>;
  def : InstAlias<"mfdbatu $Rx, "#BATR,
                  (MFSPR gprc:$Rx, !add(!add(BATR, BATR), 536))>,
        Requires<[IsPPC6xx]>;

  // dbatl: 537 + 2*BATR
  def : InstAlias<"mtdbatl "#BATR#", $Rx",
                  (MTSPR !add(!add(BATR, BATR), 537), gprc:$Rx)>,
        Requires<[IsPPC6xx]>;
  def : InstAlias<"mfdbatl $Rx, "#BATR,
                  (MFSPR gprc:$Rx, !add(!add(BATR, BATR), 537))>,
        Requires<[IsPPC6xx]>;

  // ibatu: 528 + 2*BATR
  def : InstAlias<"mtibatu "#BATR#", $Rx",
                  (MTSPR !add(!add(BATR, BATR), 528), gprc:$Rx)>,
        Requires<[IsPPC6xx]>;
  def : InstAlias<"mfibatu $Rx, "#BATR,
                  (MFSPR gprc:$Rx, !add(!add(BATR, BATR), 528))>,
        Requires<[IsPPC6xx]>;

  // ibatl: 529 + 2*BATR
  def : InstAlias<"mtibatl "#BATR#", $Rx",
                  (MTSPR !add(!add(BATR, BATR), 529), gprc:$Rx)>,
        Requires<[IsPPC6xx]>;
  def : InstAlias<"mfibatl $Rx, "#BATR,
                  (MFSPR gprc:$Rx, !add(!add(BATR, BATR), 529))>,
        Requires<[IsPPC6xx]>;
}
|
|
|
|
|
2014-08-05 23:45:15 +08:00
|
|
|
// mfbrN / mtbrN for N = 0..7 (IsPPC4xx only): expand to MFDCR / MTDCR with
// the immediate 0x80 + N.
foreach BR = 0-7 in {
  def : InstAlias<"mfbr"#BR#" $Rx",
                  (MFDCR gprc:$Rx, !add(BR, 0x80))>,
        Requires<[IsPPC4xx]>;
  def : InstAlias<"mtbr"#BR#" $Rx",
                  (MTDCR gprc:$Rx, !add(BR, 0x80))>,
        Requires<[IsPPC4xx]>;
}
|
|
|
|
|
2014-08-05 06:56:42 +08:00
|
|
|
// Fixed-SPR aliases; all but the spefscr pair require IsPPC4xx.

def : InstAlias<"mtdccr $Rx", (MTSPR 1018, gprc:$Rx)>,
      Requires<[IsPPC4xx]>;
def : InstAlias<"mfdccr $Rx", (MFSPR gprc:$Rx, 1018)>,
      Requires<[IsPPC4xx]>;

def : InstAlias<"mticcr $Rx", (MTSPR 1019, gprc:$Rx)>,
      Requires<[IsPPC4xx]>;
def : InstAlias<"mficcr $Rx", (MFSPR gprc:$Rx, 1019)>,
      Requires<[IsPPC4xx]>;

def : InstAlias<"mtdear $Rx", (MTSPR 981, gprc:$Rx)>,
      Requires<[IsPPC4xx]>;
def : InstAlias<"mfdear $Rx", (MFSPR gprc:$Rx, 981)>,
      Requires<[IsPPC4xx]>;

def : InstAlias<"mtesr $Rx", (MTSPR 980, gprc:$Rx)>,
      Requires<[IsPPC4xx]>;
def : InstAlias<"mfesr $Rx", (MFSPR gprc:$Rx, 980)>,
      Requires<[IsPPC4xx]>;

// spefscr (SPR 512) carries no predicate.
def : InstAlias<"mfspefscr $Rx", (MFSPR gprc:$Rx, 512)>;
def : InstAlias<"mtspefscr $Rx", (MTSPR 512, gprc:$Rx)>;

def : InstAlias<"mttcr $Rx", (MTSPR 986, gprc:$Rx)>,
      Requires<[IsPPC4xx]>;
def : InstAlias<"mftcr $Rx", (MFSPR gprc:$Rx, 986)>,
      Requires<[IsPPC4xx]>;
|
|
|
|
|
2013-06-25 02:08:03 +08:00
|
|
|
// "la": assembler pseudo taking a GPR and a memri address.
def LAx : PPCAsmPseudo<"la $rA, $addr",
                       (ins gprc:$rA, memri:$addr)>;

// Subtract-immediate assembler pseudos: two GPRs plus an s16imm.
def SUBI   : PPCAsmPseudo<"subi $rA, $rB, $imm",
                          (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
def SUBIS  : PPCAsmPseudo<"subis $rA, $rB, $imm",
                          (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
def SUBIC  : PPCAsmPseudo<"subic $rA, $rB, $imm",
                          (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
def SUBICo : PPCAsmPseudo<"subic. $rA, $rB, $imm",
                          (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;

// "sub"/"subc" map onto the subtract-from instructions, so the two source
// operands are passed in swapped order ($rC before $rB).
def : InstAlias<"sub $rA, $rB, $rC",
                (SUBF8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
def : InstAlias<"sub. $rA, $rB, $rC",
                (SUBF8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
def : InstAlias<"subc $rA, $rB, $rC",
                (SUBFC8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
def : InstAlias<"subc. $rA, $rB, $rC",
                (SUBFC8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
|
|
|
|
|
2013-09-13 01:50:54 +08:00
|
|
|
// One-operand mtmsrd / mtmsr: the second operand defaults to 0.
def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>;
def : InstAlias<"mtmsr $RS",  (MTMSR gprc:$RS, 0)>;

// asr accessors use SPR 280.
def : InstAlias<"mfasr $RT", (MFSPR gprc:$RT, 280)>;
def : InstAlias<"mtasr $RT", (MTSPR 280, gprc:$RT)>;
|
|
|
|
|
2014-08-05 01:26:15 +08:00
|
|
|
// sprg0..sprg3 map to SPR 272 + N. Each direction accepts two spellings: the
// index as a separate operand ("mfsprg $RT, N") or fused into the mnemonic
// ("mfsprgN $RT").
foreach SPRG = 0-3 in {
  def : InstAlias<"mfsprg $RT, "#SPRG,
                  (MFSPR gprc:$RT, !add(SPRG, 272))>;
  def : InstAlias<"mfsprg"#SPRG#" $RT",
                  (MFSPR gprc:$RT, !add(SPRG, 272))>;
  def : InstAlias<"mtsprg "#SPRG#", $RT",
                  (MTSPR !add(SPRG, 272), gprc:$RT)>;
  def : InstAlias<"mtsprg"#SPRG#" $RT",
                  (MTSPR !add(SPRG, 272), gprc:$RT)>;
}
|
|
|
|
// sprg4..sprg7 (IsBookE only) map to SPR 256 + N, i.e. 260..263, with the
// same two spellings per direction as the sprg0..sprg3 aliases.
foreach SPRG = 4-7 in {
  def : InstAlias<"mfsprg $RT, "#SPRG,
                  (MFSPR gprc:$RT, !add(SPRG, 256))>,
        Requires<[IsBookE]>;
  def : InstAlias<"mfsprg"#SPRG#" $RT",
                  (MFSPR gprc:$RT, !add(SPRG, 256))>,
        Requires<[IsBookE]>;
  def : InstAlias<"mtsprg "#SPRG#", $RT",
                  (MTSPR !add(SPRG, 256), gprc:$RT)>,
        Requires<[IsBookE]>;
  def : InstAlias<"mtsprg"#SPRG#" $RT",
                  (MTSPR !add(SPRG, 256), gprc:$RT)>,
        Requires<[IsBookE]>;
}
|
2013-09-13 01:50:54 +08:00
|
|
|
|
|
|
|
// Second run of fixed-SPR aliases.
// NOTE(review): apart from mfpvr and tlbie, every mnemonic here (mtasr,
// mfdec/mtdec, mfsdr1/mtsdr1, mfsrr0/1, mtsrr0/1) is also defined earlier in
// this file with the same SPR number and a differently named operand --
// confirm whether the repetition is intentional.

def : InstAlias<"mtasr $RS", (MTSPR 280, gprc:$RS)>;

def : InstAlias<"mfdec $RT", (MFSPR gprc:$RT, 22)>;
def : InstAlias<"mtdec $RT", (MTSPR 22, gprc:$RT)>;

def : InstAlias<"mfpvr $RT", (MFSPR gprc:$RT, 287)>;

def : InstAlias<"mfsdr1 $RT", (MFSPR gprc:$RT, 25)>;
def : InstAlias<"mtsdr1 $RT", (MTSPR 25, gprc:$RT)>;

def : InstAlias<"mfsrr0 $RT", (MFSPR gprc:$RT, 26)>;
def : InstAlias<"mfsrr1 $RT", (MFSPR gprc:$RT, 27)>;
def : InstAlias<"mtsrr0 $RT", (MTSPR 26, gprc:$RT)>;
def : InstAlias<"mtsrr1 $RT", (MTSPR 27, gprc:$RT)>;

// One-operand tlbie: the first operand is fixed to R0.
def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>;
|
|
|
|
|
2014-08-05 05:28:22 +08:00
|
|
|
// IsPPC4xx TLB read/write shorthands: the "hi" spellings pass 0 and the "lo"
// spellings pass 1 as the final operand of TLBRE2 / TLBWE2.
def : InstAlias<"tlbrehi $RS, $A",
                (TLBRE2 gprc:$RS, gprc:$A, 0)>,
      Requires<[IsPPC4xx]>;
def : InstAlias<"tlbrelo $RS, $A",
                (TLBRE2 gprc:$RS, gprc:$A, 1)>,
      Requires<[IsPPC4xx]>;
def : InstAlias<"tlbwehi $RS, $A",
                (TLBWE2 gprc:$RS, gprc:$A, 0)>,
      Requires<[IsPPC4xx]>;
def : InstAlias<"tlbwelo $RS, $A",
                (TLBWE2 gprc:$RS, gprc:$A, 1)>,
      Requires<[IsPPC4xx]>;
|
|
|
|
|
2013-06-25 21:17:41 +08:00
|
|
|
// 32-bit rotate/shift extended mnemonics as assembler pseudos. All operate on
// gprc registers with u5imm immediates; the "o" defs are the dotted spellings.

// Extract / insert: destination, source, and two immediates ($n, $b).
def EXTLWI  : PPCAsmPseudo<"extlwi $rA, $rS, $n, $b",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
def EXTLWIo : PPCAsmPseudo<"extlwi. $rA, $rS, $n, $b",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
def EXTRWI  : PPCAsmPseudo<"extrwi $rA, $rS, $n, $b",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
def EXTRWIo : PPCAsmPseudo<"extrwi. $rA, $rS, $n, $b",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
def INSLWI  : PPCAsmPseudo<"inslwi $rA, $rS, $n, $b",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
def INSLWIo : PPCAsmPseudo<"inslwi. $rA, $rS, $n, $b",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
def INSRWI  : PPCAsmPseudo<"insrwi $rA, $rS, $n, $b",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
def INSRWIo : PPCAsmPseudo<"insrwi. $rA, $rS, $n, $b",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;

// Rotate / shift / clear: destination, source, and one immediate ($n).
def ROTRWI  : PPCAsmPseudo<"rotrwi $rA, $rS, $n",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
def ROTRWIo : PPCAsmPseudo<"rotrwi. $rA, $rS, $n",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
def SLWI    : PPCAsmPseudo<"slwi $rA, $rS, $n",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
def SLWIo   : PPCAsmPseudo<"slwi. $rA, $rS, $n",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
def SRWI    : PPCAsmPseudo<"srwi $rA, $rS, $n",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
def SRWIo   : PPCAsmPseudo<"srwi. $rA, $rS, $n",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
def CLRRWI  : PPCAsmPseudo<"clrrwi $rA, $rS, $n",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
def CLRRWIo : PPCAsmPseudo<"clrrwi. $rA, $rS, $n",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n)>;

// Clear-left-and-shift-left: note the immediates are written $b, then $n.
def CLRLSLWI  : PPCAsmPseudo<"clrlslwi $rA, $rS, $b, $n",
                             (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>;
def CLRLSLWIo : PPCAsmPseudo<"clrlslwi. $rA, $rS, $b, $n",
                             (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>;
|
|
|
|
|
|
|
|
// 32-bit rotate-left shorthands: full mask (begin 0, end 31).
def : InstAlias<"rotlwi $rA, $rS, $n",
                (RLWINM gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>;
def : InstAlias<"rotlwi. $rA, $rS, $n",
                (RLWINMo gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>;
def : InstAlias<"rotlw $rA, $rS, $rB",
                (RLWNM gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>;
def : InstAlias<"rotlw. $rA, $rS, $rB",
                (RLWNMo gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>;

// clrlwi: rotate amount 0, mask from $n through 31.
def : InstAlias<"clrlwi $rA, $rS, $n",
                (RLWINM gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;
def : InstAlias<"clrlwi. $rA, $rS, $n",
                (RLWINMo gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;

// cntlzw / cntlzw. aliases onto CNTLZW / CNTLZWo.
def : InstAlias<"cntlzw $rA, $rS",
                (CNTLZW gprc:$rA, gprc:$rS)>;
def : InstAlias<"cntlzw. $rA, $rS",
                (CNTLZWo gprc:$rA, gprc:$rS)>;
// The POWER variant
def : MnemonicAlias<"cntlz",  "cntlzw">;
def : MnemonicAlias<"cntlz.", "cntlzw.">;
|
2015-02-11 02:45:02 +08:00
|
|
|
|
2013-06-25 21:17:41 +08:00
|
|
|
// 64-bit rotate/shift extended mnemonics as assembler pseudos. All operate on
// g8rc registers with u6imm immediates; the "o" defs are the dotted spellings.

// Extract / insert: destination, source, and two immediates ($n, $b).
def EXTLDI  : PPCAsmPseudo<"extldi $rA, $rS, $n, $b",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
def EXTLDIo : PPCAsmPseudo<"extldi. $rA, $rS, $n, $b",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
def EXTRDI  : PPCAsmPseudo<"extrdi $rA, $rS, $n, $b",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
def EXTRDIo : PPCAsmPseudo<"extrdi. $rA, $rS, $n, $b",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
def INSRDI  : PPCAsmPseudo<"insrdi $rA, $rS, $n, $b",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
def INSRDIo : PPCAsmPseudo<"insrdi. $rA, $rS, $n, $b",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;

// Rotate / shift / clear: destination, source, and one immediate ($n).
def ROTRDI  : PPCAsmPseudo<"rotrdi $rA, $rS, $n",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
def ROTRDIo : PPCAsmPseudo<"rotrdi. $rA, $rS, $n",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
def SLDI    : PPCAsmPseudo<"sldi $rA, $rS, $n",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
def SLDIo   : PPCAsmPseudo<"sldi. $rA, $rS, $n",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
def SRDI    : PPCAsmPseudo<"srdi $rA, $rS, $n",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
def SRDIo   : PPCAsmPseudo<"srdi. $rA, $rS, $n",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
def CLRRDI  : PPCAsmPseudo<"clrrdi $rA, $rS, $n",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
def CLRRDIo : PPCAsmPseudo<"clrrdi. $rA, $rS, $n",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;

// Clear-left-and-shift-left: note the immediates are written $b, then $n.
def CLRLSLDI  : PPCAsmPseudo<"clrlsldi $rA, $rS, $b, $n",
                             (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>;
def CLRLSLDIo : PPCAsmPseudo<"clrlsldi. $rA, $rS, $b, $n",
                             (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>;
|
|
|
|
|
|
|
|
// 64-bit rotate-left shorthands: mask-begin fixed at 0.
def : InstAlias<"rotldi $rA, $rS, $n",
                (RLDICL g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
def : InstAlias<"rotldi. $rA, $rS, $n",
                (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
// The register-amount forms take $rB from the gprc class.
def : InstAlias<"rotld $rA, $rS, $rB",
                (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
def : InstAlias<"rotld. $rA, $rS, $rB",
                (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;

// clrldi: rotate amount 0, mask-begin $n.
def : InstAlias<"clrldi $rA, $rS, $n",
                (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
def : InstAlias<"clrldi. $rA, $rS, $n",
                (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
|
2013-05-04 03:50:27 +08:00
|
|
|
|
2015-03-29 03:42:41 +08:00
|
|
|
// Four-operand spellings of rlwinm / rlwimi / rlwnm (and dotted forms) as
// assembler pseudos: two g8rc registers, a u5imm $n and a single i32imm $b.
// NOTE(review): the "bm" suffix suggests $b is a bitmask standing in for the
// usual begin/end pair -- confirm against the asm parser's handling.
def RLWINMbm  : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b",
                             (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
def RLWINMobm : PPCAsmPseudo<"rlwinm. $rA, $rS, $n, $b",
                             (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
def RLWIMIbm  : PPCAsmPseudo<"rlwimi $rA, $rS, $n, $b",
                             (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
def RLWIMIobm : PPCAsmPseudo<"rlwimi. $rA, $rS, $n, $b",
                             (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
def RLWNMbm   : PPCAsmPseudo<"rlwnm $rA, $rS, $n, $b",
                             (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
def RLWNMobm  : PPCAsmPseudo<"rlwnm. $rA, $rS, $n, $b",
                             (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
|
|
|
|
|
2013-06-24 19:55:21 +08:00
|
|
|
// Generic conditional-branch forms, used only by the assembler parser.
// Because the BO value is not known here, Defs and Uses are conservative.
let PPC970_Unit = 7 in {
  // bc / bca: non-linking.
  let Defs = [CTR], Uses = [CTR, RM] in {
    def gBC  : BForm_3<16, 0, 0, (outs),
                       (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst),
                       "bc $bo, $bi, $dst">;
    def gBCA : BForm_3<16, 1, 0, (outs),
                       (ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst),
                       "bca $bo, $bi, $dst">;

    // Hinted ("+"/"-") variants: only the upper three bits of BO come from
    // the $bo operand; the lower two are supplied through $at, which the
    // associated aliases derive from the mnemonic suffix.
    let isAsmParserOnly = 1 in {
      def gBCat  : BForm_3_at<16, 0, 0, (outs),
                              (ins u5imm:$bo, atimm:$at, crbitrc:$bi,
                                   condbrtarget:$dst),
                              "bc$at $bo, $bi, $dst">;
      def gBCAat : BForm_3_at<16, 1, 0, (outs),
                              (ins u5imm:$bo, atimm:$at, crbitrc:$bi,
                                   abscondbrtarget:$dst),
                              "bca$at $bo, $bi, $dst">;
    } // isAsmParserOnly = 1
  }

  // bcl / bcla: linking forms additionally define LR.
  let Defs = [LR, CTR], Uses = [CTR, RM] in {
    def gBCL  : BForm_3<16, 0, 1, (outs),
                        (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst),
                        "bcl $bo, $bi, $dst">;
    def gBCLA : BForm_3<16, 1, 1, (outs),
                        (ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst),
                        "bcla $bo, $bi, $dst">;

    // Hinted variants of the linking forms (see gBCat / gBCAat).
    let isAsmParserOnly = 1 in {
      def gBCLat  : BForm_3_at<16, 0, 1, (outs),
                               (ins u5imm:$bo, atimm:$at, crbitrc:$bi,
                                    condbrtarget:$dst),
                               "bcl$at $bo, $bi, $dst">;
      def gBCLAat : BForm_3_at<16, 1, 1, (outs),
                               (ins u5imm:$bo, atimm:$at, crbitrc:$bi,
                                    abscondbrtarget:$dst),
                               "bcla$at $bo, $bi, $dst">;
    } // isAsmParserOnly = 1
  }

  // Branch-to-LR / branch-to-CTR forms take a third operand $bh and also
  // list LR among their uses; the linking variants define LR as well.
  let Defs = [CTR], Uses = [CTR, LR, RM] in
    def gBCLR   : XLForm_2<19, 16, 0, (outs),
                           (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
                           "bclr $bo, $bi, $bh", IIC_BrB, []>;
  let Defs = [LR, CTR], Uses = [CTR, LR, RM] in
    def gBCLRL  : XLForm_2<19, 16, 1, (outs),
                           (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
                           "bclrl $bo, $bi, $bh", IIC_BrB, []>;
  let Defs = [CTR], Uses = [CTR, LR, RM] in
    def gBCCTR  : XLForm_2<19, 528, 0, (outs),
                           (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
                           "bcctr $bo, $bi, $bh", IIC_BrB, []>;
  let Defs = [LR, CTR], Uses = [CTR, LR, RM] in
    def gBCCTRL : XLForm_2<19, 528, 1, (outs),
                           (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
                           "bcctrl $bo, $bi, $bh", IIC_BrB, []>;
}
|
[PowerPC] Support asm parsing for bc[l][a][+-] mnemonics
PowerPC assembly code in the wild, so it seems, has things like this:
bc+ 12, 28, .L9
This is a bit odd because the '+' here becomes part of the BO field, and the BO
field is otherwise the first operand. Nevertheless, the ISA specification does
clearly say that the +- hint syntax applies to all conditional-branch mnemonics
(that test either CTR or a condition register, although not the forms which
check both), both basic and extended, so this is supposed to be valid.
This introduces some asm-parser-only definitions which take only the upper
three bits from the specified BO value, and the lower two bits are implied by
the +- suffix (via some associated aliases).
Fixes PR23646.
llvm-svn: 280571
2016-09-03 10:31:44 +08:00
|
|
|
|
|
|
|
// Aliases for the basic conditional-branch mnemonics (bc, bca, bcl, bcla)
// carrying a hint suffix.
//   pm - suffix text appended to the mnemonic.
//   at - constant passed as the atimm:$at operand of the g*at definitions.
// The absolute forms (bca / bcla) name abscondbrtarget for $dst so that the
// alias operand agrees with the operand declared by gBCAat / gBCLAat and with
// the absolute-branch aliases in BranchSimpleMnemonic1; the relative forms
// keep condbrtarget.
multiclass BranchSimpleMnemonicAT<string pm, int at> {
  def : InstAlias<"bc"#pm#" $bo, $bi, $dst",
                  (gBCat u5imm:$bo, at, crbitrc:$bi, condbrtarget:$dst)>;
  def : InstAlias<"bca"#pm#" $bo, $bi, $dst",
                  (gBCAat u5imm:$bo, at, crbitrc:$bi, abscondbrtarget:$dst)>;
  def : InstAlias<"bcl"#pm#" $bo, $bi, $dst",
                  (gBCLat u5imm:$bo, at, crbitrc:$bi, condbrtarget:$dst)>;
  def : InstAlias<"bcla"#pm#" $bo, $bi, $dst",
                  (gBCLAat u5imm:$bo, at, crbitrc:$bi, abscondbrtarget:$dst)>;
}
|
|
|
|
// Instantiate the hinted bc/bca/bcl/bcla aliases: "+" passes at = 3 and
// "-" passes at = 2.
defm : BranchSimpleMnemonicAT<"+", 3>;
defm : BranchSimpleMnemonicAT<"-", 2>;

// Two-operand spellings of the LR / CTR branches: $bh defaults to 0.
def : InstAlias<"bclr $bo, $bi",
                (gBCLR u5imm:$bo, crbitrc:$bi, 0)>;
def : InstAlias<"bclrl $bo, $bi",
                (gBCLRL u5imm:$bo, crbitrc:$bi, 0)>;
def : InstAlias<"bcctr $bo, $bi",
                (gBCCTR u5imm:$bo, crbitrc:$bi, 0)>;
def : InstAlias<"bcctrl $bo, $bi",
                (gBCCTRL u5imm:$bo, crbitrc:$bi, 0)>;
|
|
|
|
|
2013-06-25 00:52:04 +08:00
|
|
|
// Simple branch mnemonics with the BO value folded into the name.
//   name - text inserted after the leading "b".
//   pm   - suffix appended at the end of the mnemonic (may be empty).
//   bo   - constant supplied as the BO operand.
// Produces the relative, absolute and LR forms plus their linking variants;
// the LR forms pass 0 as the final operand.
multiclass BranchSimpleMnemonic1<string name, string pm, int bo> {
  def : InstAlias<"b"#name#pm#" $bi, $dst",
                  (gBC bo, crbitrc:$bi, condbrtarget:$dst)>;
  def : InstAlias<"b"#name#"a"#pm#" $bi, $dst",
                  (gBCA bo, crbitrc:$bi, abscondbrtarget:$dst)>;
  def : InstAlias<"b"#name#"lr"#pm#" $bi",
                  (gBCLR bo, crbitrc:$bi, 0)>;
  def : InstAlias<"b"#name#"l"#pm#" $bi, $dst",
                  (gBCL bo, crbitrc:$bi, condbrtarget:$dst)>;
  def : InstAlias<"b"#name#"la"#pm#" $bi, $dst",
                  (gBCLA bo, crbitrc:$bi, abscondbrtarget:$dst)>;
  def : InstAlias<"b"#name#"lrl"#pm#" $bi",
                  (gBCLRL bo, crbitrc:$bi, 0)>;
}

// Everything from BranchSimpleMnemonic1, plus the CTR forms (also with a
// final operand of 0).
multiclass BranchSimpleMnemonic2<string name, string pm, int bo>
  : BranchSimpleMnemonic1<name, pm, bo> {
  def : InstAlias<"b"#name#"ctr"#pm#" $bi",
                  (gBCCTR bo, crbitrc:$bi, 0)>;
  def : InstAlias<"b"#name#"ctrl"#pm#" $bi",
                  (gBCCTRL bo, crbitrc:$bi, 0)>;
}
|
|
|
|
// bt / bf families (including the CTR forms). BO is 12 for "t" and 4 for "f";
// the "-" suffix adds 2 and the "+" suffix adds 3.
defm : BranchSimpleMnemonic2<"t", "",  12>;
defm : BranchSimpleMnemonic2<"f", "",  4>;
defm : BranchSimpleMnemonic2<"t", "-", 14>;
defm : BranchSimpleMnemonic2<"f", "-", 6>;
defm : BranchSimpleMnemonic2<"t", "+", 15>;
defm : BranchSimpleMnemonic2<"f", "+", 7>;

// bdnzt / bdnzf / bdzt / bdzf families: no CTR forms and no hinted variants.
defm : BranchSimpleMnemonic1<"dnzt", "", 8>;
defm : BranchSimpleMnemonic1<"dnzf", "", 0>;
defm : BranchSimpleMnemonic1<"dzt",  "", 10>;
defm : BranchSimpleMnemonic1<"dzf",  "", 2>;
|
|
|
|
|
|
|
|
multiclass BranchExtendedMnemonicPM<string name, string pm, int bibo> {
|
|
|
|
def : InstAlias<"b"#name#pm#" $cc, $dst",
|
2013-06-11 01:18:29 +08:00
|
|
|
(BCC bibo, crrc:$cc, condbrtarget:$dst)>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#pm#" $dst",
|
2013-06-11 01:19:15 +08:00
|
|
|
(BCC bibo, CR0, condbrtarget:$dst)>;
|
|
|
|
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#"a"#pm#" $cc, $dst",
|
2013-06-24 19:03:33 +08:00
|
|
|
(BCCA bibo, crrc:$cc, abscondbrtarget:$dst)>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#"a"#pm#" $dst",
|
2013-06-24 19:03:33 +08:00
|
|
|
(BCCA bibo, CR0, abscondbrtarget:$dst)>;
|
|
|
|
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#"lr"#pm#" $cc",
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(BCCLR bibo, crrc:$cc)>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#"lr"#pm,
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(BCCLR bibo, CR0)>;
|
2013-06-11 01:19:15 +08:00
|
|
|
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#"ctr"#pm#" $cc",
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(BCCCTR bibo, crrc:$cc)>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#"ctr"#pm,
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(BCCCTR bibo, CR0)>;
|
2013-06-11 01:19:15 +08:00
|
|
|
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#"l"#pm#" $cc, $dst",
|
2013-06-24 19:02:19 +08:00
|
|
|
(BCCL bibo, crrc:$cc, condbrtarget:$dst)>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#"l"#pm#" $dst",
|
2013-06-24 19:02:19 +08:00
|
|
|
(BCCL bibo, CR0, condbrtarget:$dst)>;
|
|
|
|
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#"la"#pm#" $cc, $dst",
|
2013-06-24 19:03:33 +08:00
|
|
|
(BCCLA bibo, crrc:$cc, abscondbrtarget:$dst)>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#"la"#pm#" $dst",
|
2013-06-24 19:03:33 +08:00
|
|
|
(BCCLA bibo, CR0, abscondbrtarget:$dst)>;
|
|
|
|
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#"lrl"#pm#" $cc",
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(BCCLRL bibo, crrc:$cc)>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#"lrl"#pm,
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(BCCLRL bibo, CR0)>;
|
2013-06-24 19:01:55 +08:00
|
|
|
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#"ctrl"#pm#" $cc",
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(BCCCTRL bibo, crrc:$cc)>;
|
2013-06-25 00:52:04 +08:00
|
|
|
def : InstAlias<"b"#name#"ctrl"#pm,
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high latency and low
throughput (especially when other associated instructions are accounted
for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
(BCCCTRL bibo, CR0)>;
|
2013-06-11 01:18:29 +08:00
|
|
|
}
|
2013-06-25 00:52:04 +08:00
|
|
|
// Expands one branch condition name into the three BranchExtendedMnemonicPM
// instantiations below: an unsuffixed form that passes bibo through unchanged,
// a "-" form that passes bibo + 2, and a "+" form that passes bibo + 3.
// NOTE(review): the "-"/"+" suffixes and the +2/+3 offsets look like the
// static branch-prediction hint encodings; confirm against the ISA.
multiclass BranchExtendedMnemonic<string name, int bibo> {
|
|
|
|
defm : BranchExtendedMnemonicPM<name, "", bibo>;
|
|
|
|
defm : BranchExtendedMnemonicPM<name, "-", !add(bibo, 2)>;
|
|
|
|
defm : BranchExtendedMnemonicPM<name, "+", !add(bibo, 3)>;
|
|
|
|
}
|
2013-06-11 01:18:29 +08:00
|
|
|
// Instantiate the extended branch mnemonics once per condition name.
// Several names are synonyms that share a bibo value: un/so (108),
// ge/nl (4), le/ng (36) and nu/ns (100).
defm : BranchExtendedMnemonic<"lt", 12>;
|
|
|
|
defm : BranchExtendedMnemonic<"gt", 44>;
|
|
|
|
defm : BranchExtendedMnemonic<"eq", 76>;
|
|
|
|
defm : BranchExtendedMnemonic<"un", 108>;
|
|
|
|
defm : BranchExtendedMnemonic<"so", 108>;
|
|
|
|
defm : BranchExtendedMnemonic<"ge", 4>;
|
|
|
|
defm : BranchExtendedMnemonic<"nl", 4>;
|
|
|
|
defm : BranchExtendedMnemonic<"le", 36>;
|
|
|
|
defm : BranchExtendedMnemonic<"ng", 36>;
|
|
|
|
defm : BranchExtendedMnemonic<"ne", 68>;
|
|
|
|
defm : BranchExtendedMnemonic<"nu", 100>;
|
|
|
|
defm : BranchExtendedMnemonic<"ns", 100>;
|
2013-05-04 03:50:27 +08:00
|
|
|
|
[PowerPC] Support compare mnemonics with implied CR0
Just like for branch mnemonics (where support was recently added), the
assembler is supposed to support extended mnemonics for the compare
instructions where no condition register is specified explicitly
(and CR0 is assumed implicitly).
This patch adds support for those extended compare mnemonics.
Index: llvm-head/test/MC/PowerPC/ppc64-encoding-ext.s
===================================================================
--- llvm-head.orig/test/MC/PowerPC/ppc64-encoding-ext.s
+++ llvm-head/test/MC/PowerPC/ppc64-encoding-ext.s
@@ -449,21 +449,37 @@
# CHECK: cmpdi 2, 3, 128 # encoding: [0x2d,0x23,0x00,0x80]
cmpdi 2, 3, 128
+# CHECK: cmpdi 0, 3, 128 # encoding: [0x2c,0x23,0x00,0x80]
+ cmpdi 3, 128
# CHECK: cmpd 2, 3, 4 # encoding: [0x7d,0x23,0x20,0x00]
cmpd 2, 3, 4
+# CHECK: cmpd 0, 3, 4 # encoding: [0x7c,0x23,0x20,0x00]
+ cmpd 3, 4
# CHECK: cmpldi 2, 3, 128 # encoding: [0x29,0x23,0x00,0x80]
cmpldi 2, 3, 128
+# CHECK: cmpldi 0, 3, 128 # encoding: [0x28,0x23,0x00,0x80]
+ cmpldi 3, 128
# CHECK: cmpld 2, 3, 4 # encoding: [0x7d,0x23,0x20,0x40]
cmpld 2, 3, 4
+# CHECK: cmpld 0, 3, 4 # encoding: [0x7c,0x23,0x20,0x40]
+ cmpld 3, 4
# CHECK: cmpwi 2, 3, 128 # encoding: [0x2d,0x03,0x00,0x80]
cmpwi 2, 3, 128
+# CHECK: cmpwi 0, 3, 128 # encoding: [0x2c,0x03,0x00,0x80]
+ cmpwi 3, 128
# CHECK: cmpw 2, 3, 4 # encoding: [0x7d,0x03,0x20,0x00]
cmpw 2, 3, 4
+# CHECK: cmpw 0, 3, 4 # encoding: [0x7c,0x03,0x20,0x00]
+ cmpw 3, 4
# CHECK: cmplwi 2, 3, 128 # encoding: [0x29,0x03,0x00,0x80]
cmplwi 2, 3, 128
+# CHECK: cmplwi 0, 3, 128 # encoding: [0x28,0x03,0x00,0x80]
+ cmplwi 3, 128
# CHECK: cmplw 2, 3, 4 # encoding: [0x7d,0x03,0x20,0x40]
cmplw 2, 3, 4
+# CHECK: cmplw 0, 3, 4 # encoding: [0x7c,0x03,0x20,0x40]
+ cmplw 3, 4
# FIXME: Trap mnemonics
Index: llvm-head/lib/Target/PowerPC/PPCInstrInfo.td
===================================================================
--- llvm-head.orig/lib/Target/PowerPC/PPCInstrInfo.td
+++ llvm-head/lib/Target/PowerPC/PPCInstrInfo.td
@@ -2201,3 +2201,12 @@ defm : BranchExtendedMnemonic<"ne", 68>;
defm : BranchExtendedMnemonic<"nu", 100>;
defm : BranchExtendedMnemonic<"ns", 100>;
+def : InstAlias<"cmpwi $rA, $imm", (CMPWI CR0, gprc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmpw $rA, $rB", (CMPW CR0, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmplwi $rA, $imm", (CMPLWI CR0, gprc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmplw $rA, $rB", (CMPLW CR0, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpdi $rA, $imm", (CMPDI CR0, g8rc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmpd $rA, $rB", (CMPD CR0, g8rc:$rA, g8rc:$rB)>;
+def : InstAlias<"cmpldi $rA, $imm", (CMPLDI CR0, g8rc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmpld $rA, $rB", (CMPLD CR0, g8rc:$rA, g8rc:$rB)>;
+
llvm-svn: 184435
2013-06-21 00:15:12 +08:00
|
|
|
// Compare aliases that omit the condition-register operand in the assembly
// string; each one expands to the named compare instruction with CR0 supplied
// as its first operand.
// The cmpw*/cmplw* forms take gprc registers; cmpdi takes a g8rc register and
// an s16imm64 immediate.
def : InstAlias<"cmpwi $rA, $imm", (CMPWI CR0, gprc:$rA, s16imm:$imm)>;
|
|
|
|
def : InstAlias<"cmpw $rA, $rB", (CMPW CR0, gprc:$rA, gprc:$rB)>;
|
|
|
|
def : InstAlias<"cmplwi $rA, $imm", (CMPLWI CR0, gprc:$rA, u16imm:$imm)>;
|
|
|
|
def : InstAlias<"cmplw $rA, $rB", (CMPLW CR0, gprc:$rA, gprc:$rB)>;
|
2014-01-03 05:26:59 +08:00
|
|
|
def : InstAlias<"cmpdi $rA, $imm", (CMPDI CR0, g8rc:$rA, s16imm64:$imm)>;
|
[PowerPC] Support compare mnemonics with implied CR0
Just like for branch mnemonics (where support was recently added), the
assembler is supposed to support extended mnemonics for the compare
instructions where no condition register is specified explicitly
(and CR0 is assumed implicitly).
This patch adds support for those extended compare mnemonics.
Index: llvm-head/test/MC/PowerPC/ppc64-encoding-ext.s
===================================================================
--- llvm-head.orig/test/MC/PowerPC/ppc64-encoding-ext.s
+++ llvm-head/test/MC/PowerPC/ppc64-encoding-ext.s
@@ -449,21 +449,37 @@
# CHECK: cmpdi 2, 3, 128 # encoding: [0x2d,0x23,0x00,0x80]
cmpdi 2, 3, 128
+# CHECK: cmpdi 0, 3, 128 # encoding: [0x2c,0x23,0x00,0x80]
+ cmpdi 3, 128
# CHECK: cmpd 2, 3, 4 # encoding: [0x7d,0x23,0x20,0x00]
cmpd 2, 3, 4
+# CHECK: cmpd 0, 3, 4 # encoding: [0x7c,0x23,0x20,0x00]
+ cmpd 3, 4
# CHECK: cmpldi 2, 3, 128 # encoding: [0x29,0x23,0x00,0x80]
cmpldi 2, 3, 128
+# CHECK: cmpldi 0, 3, 128 # encoding: [0x28,0x23,0x00,0x80]
+ cmpldi 3, 128
# CHECK: cmpld 2, 3, 4 # encoding: [0x7d,0x23,0x20,0x40]
cmpld 2, 3, 4
+# CHECK: cmpld 0, 3, 4 # encoding: [0x7c,0x23,0x20,0x40]
+ cmpld 3, 4
# CHECK: cmpwi 2, 3, 128 # encoding: [0x2d,0x03,0x00,0x80]
cmpwi 2, 3, 128
+# CHECK: cmpwi 0, 3, 128 # encoding: [0x2c,0x03,0x00,0x80]
+ cmpwi 3, 128
# CHECK: cmpw 2, 3, 4 # encoding: [0x7d,0x03,0x20,0x00]
cmpw 2, 3, 4
+# CHECK: cmpw 0, 3, 4 # encoding: [0x7c,0x03,0x20,0x00]
+ cmpw 3, 4
# CHECK: cmplwi 2, 3, 128 # encoding: [0x29,0x03,0x00,0x80]
cmplwi 2, 3, 128
+# CHECK: cmplwi 0, 3, 128 # encoding: [0x28,0x03,0x00,0x80]
+ cmplwi 3, 128
# CHECK: cmplw 2, 3, 4 # encoding: [0x7d,0x03,0x20,0x40]
cmplw 2, 3, 4
+# CHECK: cmplw 0, 3, 4 # encoding: [0x7c,0x03,0x20,0x40]
+ cmplw 3, 4
# FIXME: Trap mnemonics
Index: llvm-head/lib/Target/PowerPC/PPCInstrInfo.td
===================================================================
--- llvm-head.orig/lib/Target/PowerPC/PPCInstrInfo.td
+++ llvm-head/lib/Target/PowerPC/PPCInstrInfo.td
@@ -2201,3 +2201,12 @@ defm : BranchExtendedMnemonic<"ne", 68>;
defm : BranchExtendedMnemonic<"nu", 100>;
defm : BranchExtendedMnemonic<"ns", 100>;
+def : InstAlias<"cmpwi $rA, $imm", (CMPWI CR0, gprc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmpw $rA, $rB", (CMPW CR0, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmplwi $rA, $imm", (CMPLWI CR0, gprc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmplw $rA, $rB", (CMPLW CR0, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpdi $rA, $imm", (CMPDI CR0, g8rc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmpd $rA, $rB", (CMPD CR0, g8rc:$rA, g8rc:$rB)>;
+def : InstAlias<"cmpldi $rA, $imm", (CMPLDI CR0, g8rc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmpld $rA, $rB", (CMPLD CR0, g8rc:$rA, g8rc:$rB)>;
+
llvm-svn: 184435
2013-06-21 00:15:12 +08:00
|
|
|
// Two-operand spellings of cmpd and cmpldi: the condition register is not
// written in the assembly string, and CR0 is supplied as the first operand of
// CMPD / CMPLDI. Both take g8rc registers; cmpldi takes a u16imm64 immediate.
def : InstAlias<"cmpd $rA, $rB", (CMPD CR0, g8rc:$rA, g8rc:$rB)>;
|
2014-01-03 05:26:59 +08:00
|
|
|
def : InstAlias<"cmpldi $rA, $imm", (CMPLDI CR0, g8rc:$rA, u16imm64:$imm)>;
|
[PowerPC] Support compare mnemonics with implied CR0
Just like for branch mnemonics (where support was recently added), the
assembler is supposed to support extended mnemonics for the compare
instructions where no condition register is specified explicitly
(and CR0 is assumed implicitly).
This patch adds support for those extended compare mnemonics.
Index: llvm-head/test/MC/PowerPC/ppc64-encoding-ext.s
===================================================================
--- llvm-head.orig/test/MC/PowerPC/ppc64-encoding-ext.s
+++ llvm-head/test/MC/PowerPC/ppc64-encoding-ext.s
@@ -449,21 +449,37 @@
# CHECK: cmpdi 2, 3, 128 # encoding: [0x2d,0x23,0x00,0x80]
cmpdi 2, 3, 128
+# CHECK: cmpdi 0, 3, 128 # encoding: [0x2c,0x23,0x00,0x80]
+ cmpdi 3, 128
# CHECK: cmpd 2, 3, 4 # encoding: [0x7d,0x23,0x20,0x00]
cmpd 2, 3, 4
+# CHECK: cmpd 0, 3, 4 # encoding: [0x7c,0x23,0x20,0x00]
+ cmpd 3, 4
# CHECK: cmpldi 2, 3, 128 # encoding: [0x29,0x23,0x00,0x80]
cmpldi 2, 3, 128
+# CHECK: cmpldi 0, 3, 128 # encoding: [0x28,0x23,0x00,0x80]
+ cmpldi 3, 128
# CHECK: cmpld 2, 3, 4 # encoding: [0x7d,0x23,0x20,0x40]
cmpld 2, 3, 4
+# CHECK: cmpld 0, 3, 4 # encoding: [0x7c,0x23,0x20,0x40]
+ cmpld 3, 4
# CHECK: cmpwi 2, 3, 128 # encoding: [0x2d,0x03,0x00,0x80]
cmpwi 2, 3, 128
+# CHECK: cmpwi 0, 3, 128 # encoding: [0x2c,0x03,0x00,0x80]
+ cmpwi 3, 128
# CHECK: cmpw 2, 3, 4 # encoding: [0x7d,0x03,0x20,0x00]
cmpw 2, 3, 4
+# CHECK: cmpw 0, 3, 4 # encoding: [0x7c,0x03,0x20,0x00]
+ cmpw 3, 4
# CHECK: cmplwi 2, 3, 128 # encoding: [0x29,0x03,0x00,0x80]
cmplwi 2, 3, 128
+# CHECK: cmplwi 0, 3, 128 # encoding: [0x28,0x03,0x00,0x80]
+ cmplwi 3, 128
# CHECK: cmplw 2, 3, 4 # encoding: [0x7d,0x03,0x20,0x40]
cmplw 2, 3, 4
+# CHECK: cmplw 0, 3, 4 # encoding: [0x7c,0x03,0x20,0x40]
+ cmplw 3, 4
# FIXME: Trap mnemonics
Index: llvm-head/lib/Target/PowerPC/PPCInstrInfo.td
===================================================================
--- llvm-head.orig/lib/Target/PowerPC/PPCInstrInfo.td
+++ llvm-head/lib/Target/PowerPC/PPCInstrInfo.td
@@ -2201,3 +2201,12 @@ defm : BranchExtendedMnemonic<"ne", 68>;
defm : BranchExtendedMnemonic<"nu", 100>;
defm : BranchExtendedMnemonic<"ns", 100>;
+def : InstAlias<"cmpwi $rA, $imm", (CMPWI CR0, gprc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmpw $rA, $rB", (CMPW CR0, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmplwi $rA, $imm", (CMPLWI CR0, gprc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmplw $rA, $rB", (CMPLW CR0, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpdi $rA, $imm", (CMPDI CR0, g8rc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmpd $rA, $rB", (CMPD CR0, g8rc:$rA, g8rc:$rB)>;
+def : InstAlias<"cmpldi $rA, $imm", (CMPLDI CR0, g8rc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmpld $rA, $rB", (CMPLD CR0, g8rc:$rA, g8rc:$rB)>;
+
llvm-svn: 184435
2013-06-21 00:15:12 +08:00
|
|
|
// Two-operand spelling of cmpld: CR0 is supplied as the first operand of CMPLD
// and both register operands are g8rc.
def : InstAlias<"cmpld $rA, $rB", (CMPLD CR0, g8rc:$rA, g8rc:$rB)>;
|
|
|
|
|
2013-07-08 22:49:37 +08:00
|
|
|
// Aliases for the four-operand spellings cmpi / cmp / cmpli / cmpl, where the
// second assembly operand is a literal 0 or 1 written into the alias string.
//   literal 0 -> CMPWI / CMPW / CMPLWI / CMPLW with gprc register operands
//   literal 1 -> CMPDI / CMPD / CMPLDI / CMPLD with g8rc register operands
// In every case $bf is passed through as the crrc condition-register operand.
// NOTE(review): the literal presumably corresponds to the ISA's L field
// (word vs. doubleword compare); confirm against the ISA.
def : InstAlias<"cmpi $bf, 0, $rA, $imm", (CMPWI crrc:$bf, gprc:$rA, s16imm:$imm)>;
|
|
|
|
def : InstAlias<"cmp $bf, 0, $rA, $rB", (CMPW crrc:$bf, gprc:$rA, gprc:$rB)>;
|
|
|
|
def : InstAlias<"cmpli $bf, 0, $rA, $imm", (CMPLWI crrc:$bf, gprc:$rA, u16imm:$imm)>;
|
|
|
|
def : InstAlias<"cmpl $bf, 0, $rA, $rB", (CMPLW crrc:$bf, gprc:$rA, gprc:$rB)>;
|
2014-01-03 05:26:59 +08:00
|
|
|
def : InstAlias<"cmpi $bf, 1, $rA, $imm", (CMPDI crrc:$bf, g8rc:$rA, s16imm64:$imm)>;
|
2013-07-08 22:49:37 +08:00
|
|
|
def : InstAlias<"cmp $bf, 1, $rA, $rB", (CMPD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
|
2014-01-03 05:26:59 +08:00
|
|
|
def : InstAlias<"cmpli $bf, 1, $rA, $imm", (CMPLDI crrc:$bf, g8rc:$rA, u16imm64:$imm)>;
|
2013-07-08 22:49:37 +08:00
|
|
|
def : InstAlias<"cmpl $bf, 1, $rA, $rB", (CMPLD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
|
|
|
|
|
2013-07-04 22:40:12 +08:00
|
|
|
// Defines the four trap aliases for one condition name. `to` is passed as the
// first operand of the underlying instruction in each alias:
//   td<name>i $rA, $imm -> TDI to, g8rc, s16imm
//   td<name>  $rA, $rB  -> TD  to, g8rc, g8rc
//   tw<name>i $rA, $imm -> TWI to, gprc, s16imm
//   tw<name>  $rA, $rB  -> TW  to, gprc, gprc
multiclass TrapExtendedMnemonic<string name, int to> {
|
|
|
|
def : InstAlias<"td"#name#"i $rA, $imm", (TDI to, g8rc:$rA, s16imm:$imm)>;
|
|
|
|
def : InstAlias<"td"#name#" $rA, $rB", (TD to, g8rc:$rA, g8rc:$rB)>;
|
|
|
|
def : InstAlias<"tw"#name#"i $rA, $imm", (TWI to, gprc:$rA, s16imm:$imm)>;
|
|
|
|
def : InstAlias<"tw"#name#" $rA, $rB", (TW to, gprc:$rA, gprc:$rB)>;
|
|
|
|
}
|
|
|
|
// Instantiate the trap aliases once per condition name, passing the integer
// that becomes the `to` operand. Several names are synonyms that share a
// value: ge/nl (12), le/ng (20), lge/lnl (5) and lle/lng (6).
defm : TrapExtendedMnemonic<"lt", 16>;
|
|
|
|
defm : TrapExtendedMnemonic<"le", 20>;
|
|
|
|
defm : TrapExtendedMnemonic<"eq", 4>;
|
|
|
|
defm : TrapExtendedMnemonic<"ge", 12>;
|
|
|
|
defm : TrapExtendedMnemonic<"gt", 8>;
|
|
|
|
defm : TrapExtendedMnemonic<"nl", 12>;
|
|
|
|
defm : TrapExtendedMnemonic<"ne", 24>;
|
|
|
|
defm : TrapExtendedMnemonic<"ng", 20>;
|
|
|
|
defm : TrapExtendedMnemonic<"llt", 2>;
|
|
|
|
defm : TrapExtendedMnemonic<"lle", 6>;
|
|
|
|
defm : TrapExtendedMnemonic<"lge", 5>;
|
|
|
|
defm : TrapExtendedMnemonic<"lgt", 1>;
|
|
|
|
defm : TrapExtendedMnemonic<"lnl", 5>;
|
|
|
|
defm : TrapExtendedMnemonic<"lng", 6>;
|
|
|
|
defm : TrapExtendedMnemonic<"u", 31>;
|
[Power] Improve the expansion of atomic loads/stores
Summary:
Atomic loads and stores of up to the native size (32 bits, or 64 for PPC64)
can be lowered to a simple load or store instruction (as the synchronization
is already handled by AtomicExpand, and the atomicity is guaranteed thanks to
the alignment requirements of atomic accesses). This is exactly what this patch
does. Previously, these were implemented by complex
load-linked/store-conditional loops, an obvious performance problem.
For example, this patch turns
```
define void @store_i8_unordered(i8* %mem) {
store atomic i8 42, i8* %mem unordered, align 1
ret void
}
```
from
```
_store_i8_unordered: ; @store_i8_unordered
; BB#0:
rlwinm r2, r3, 3, 27, 28
li r4, 42
xori r5, r2, 24
rlwinm r2, r3, 0, 0, 29
li r3, 255
slw r4, r4, r5
slw r3, r3, r5
and r4, r4, r3
LBB4_1: ; =>This Inner Loop Header: Depth=1
lwarx r5, 0, r2
andc r5, r5, r3
or r5, r4, r5
stwcx. r5, 0, r2
bne cr0, LBB4_1
; BB#2:
blr
```
into
```
_store_i8_unordered: ; @store_i8_unordered
; BB#0:
li r2, 42
stb r2, 0(r3)
blr
```
which looks like a pretty clear win to me.
Test Plan:
fixed the tests + new test for indexed accesses + make check-all
Reviewers: jfb, wschmidt, hfinkel
Subscribers: llvm-commits
Differential Revision: http://reviews.llvm.org/D5587
llvm-svn: 218922
2014-10-03 06:27:07 +08:00
|
|
|
|
|
|
|
// Atomic loads
// 8-, 16- and 32-bit atomic loads are selected directly to ordinary load
// instructions rather than load-linked/store-conditional loops: the required
// synchronization is already inserted by AtomicExpand, and atomicity follows
// from the alignment requirements of atomic accesses.
// Each width has two patterns: an iaddr address selects LBZ/LHZ/LWZ with a
// memri operand, and an xaddr address selects LBZX/LHZX/LWZX with a memrr
// operand.
def : Pat<(atomic_load_8 iaddr:$src), (LBZ memri:$src)>;
|
|
|
|
def : Pat<(atomic_load_16 iaddr:$src), (LHZ memri:$src)>;
|
|
|
|
def : Pat<(atomic_load_32 iaddr:$src), (LWZ memri:$src)>;
|
|
|
|
def : Pat<(atomic_load_8 xaddr:$src), (LBZX memrr:$src)>;
|
|
|
|
def : Pat<(atomic_load_16 xaddr:$src), (LHZX memrr:$src)>;
|
|
|
|
def : Pat<(atomic_load_32 xaddr:$src), (LWZX memrr:$src)>;
|
|
|
|
|
|
|
|
// Atomic stores
// 8-, 16- and 32-bit atomic stores are selected directly to ordinary store
// instructions rather than load-linked/store-conditional loops: the required
// synchronization is already inserted by AtomicExpand, and atomicity follows
// from the alignment requirements of atomic accesses.
// The stored value is always matched as an i32 in a gprc register, whatever
// the access width. An iaddr address selects STB/STH/STW with a memri operand,
// and an xaddr address selects STBX/STHX/STWX with a memrr operand.
def : Pat<(atomic_store_8 iaddr:$ptr, i32:$val), (STB gprc:$val, memri:$ptr)>;
|
|
|
|
def : Pat<(atomic_store_16 iaddr:$ptr, i32:$val), (STH gprc:$val, memri:$ptr)>;
|
|
|
|
def : Pat<(atomic_store_32 iaddr:$ptr, i32:$val), (STW gprc:$val, memri:$ptr)>;
|
|
|
|
def : Pat<(atomic_store_8 xaddr:$ptr, i32:$val), (STBX gprc:$val, memrr:$ptr)>;
|
|
|
|
def : Pat<(atomic_store_16 xaddr:$ptr, i32:$val), (STHX gprc:$val, memrr:$ptr)>;
|
|
|
|
def : Pat<(atomic_store_32 xaddr:$ptr, i32:$val), (STWX gprc:$val, memrr:$ptr)>;
|
2016-04-06 09:46:45 +08:00
|
|
|
|
|
|
|
// Everything in this block is guarded by the IsISA3_0 predicate. None of the
// definitions carries a selection pattern (the pattern lists are empty, or the
// record is an assembler pseudo).
let Predicates = [IsISA3_0] in {
|
|
|
|
|
|
|
|
// Copy-Paste Facility
|
|
|
|
// COPY is prefixed with 'CP' because of a name conflict in Target.td; PASTE is
|
|
|
|
// given the same prefix for naming consistency.
|
|
|
|
// "copy" is flagged as a load (mayLoad) since it has no pattern to infer it from.
let mayLoad = 1 in
|
|
|
|
def CP_COPY : X_L1_RA5_RB5<31, 774, "copy" , gprc, IIC_LdStCOPY, []>;
|
|
|
|
|
|
|
|
// "paste" is flagged as a store (mayStore).
let mayStore = 1 in
|
|
|
|
def CP_PASTE : X_L1_RA5_RB5<31, 902, "paste" , gprc, IIC_LdStPASTE, []>;
|
|
|
|
|
|
|
|
// Record form "paste.": same opcode pair (31, 902) as CP_PASTE, but marked
// isDOT and declared to define CR0.
let mayStore = 1, Defs = [CR0] in
|
|
|
|
def CP_PASTEo : X_L1_RA5_RB5<31, 902, "paste.", gprc, IIC_LdStPASTE, []>, isDOT;
|
|
|
|
|
|
|
|
// Assembler-only pseudos taking just the two GPR operands $rA and $rB.
// NOTE(review): presumably these are expanded to CP_COPY / CP_PASTE with a fixed
// third operand elsewhere in the backend -- confirm; not visible here.
def CP_COPYx : PPCAsmPseudo<"copy $rA, $rB" , (ins gprc:$rA, gprc:$rB)>;
|
|
|
|
def CP_PASTEx : PPCAsmPseudo<"paste $rA, $rB", (ins gprc:$rA, gprc:$rB)>;
|
|
|
|
def CP_COPY_FIRST : PPCAsmPseudo<"copy_first $rA, $rB",
|
|
|
|
(ins gprc:$rA, gprc:$rB)>;
|
|
|
|
def CP_PASTE_LAST : PPCAsmPseudo<"paste_last $rA, $rB",
|
|
|
|
(ins gprc:$rA, gprc:$rB)>;
|
|
|
|
// "cp_abort" takes no operands and produces no results.
def CP_ABORT : XForm_0<31, 838, (outs), (ins), "cp_abort", IIC_SprABORT, []>;
|
|
|
|
|
|
|
|
// Message Synchronize ("msgsync"): no operands, no results.
|
|
|
|
def MSGSYNC : XForm_0<31, 886, (outs), (ins), "msgsync", IIC_SprMSGSYNC, []>;
|
|
|
|
|
|
|
|
// Power-Saving Mode Instruction ("stop"): no operands, no results.
|
|
|
|
def STOP : XForm_0<19, 370, (outs), (ins), "stop", IIC_SprSTOP, []>;
|
|
|
|
|
|
|
|
} // IsISA3_0
|