2012-02-18 20:03:15 +08:00
|
|
|
//===-- PPCInstr64Bit.td - The PowerPC 64-bit Support ------*- tablegen -*-===//
|
|
|
|
//
|
2006-06-17 04:22:01 +08:00
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
2007-12-30 04:36:04 +08:00
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
2012-02-18 20:03:15 +08:00
|
|
|
//
|
2006-06-17 04:22:01 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This file describes the PowerPC 64-bit instructions. These patterns are used
|
|
|
|
// both when in ppc64 mode and when in "use 64-bit extensions in 32-bit" mode.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2006-06-21 05:23:06 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// 64-bit operands.
|
|
|
|
//
|
2006-06-27 07:53:10 +08:00
|
|
|
// 16-bit signed immediate operand held in an i64 value.
def s16imm64 : Operand<i64> {
  // Assembly side: how the operand is matched by the parser and printed.
  let ParserMatchClass = PPCS16ImmAsmOperand;
  let PrintMethod = "printS16ImmOperand";
  // Binary side: how the operand is encoded and decoded.
  let EncoderMethod = "getImm16Encoding";
  let DecoderMethod = "decodeSImmOperand<16>";
}
|
|
|
|
// 16-bit unsigned immediate operand held in an i64 value.
def u16imm64 : Operand<i64> {
  // Assembly side: how the operand is matched by the parser and printed.
  let ParserMatchClass = PPCU16ImmAsmOperand;
  let PrintMethod = "printU16ImmOperand";
  // Binary side: shares the 16-bit encoder with s16imm64, but decodes
  // through the unsigned-immediate decoder.
  let EncoderMethod = "getImm16Encoding";
  let DecoderMethod = "decodeUImmOperand<16>";
}
|
2013-06-26 21:49:53 +08:00
|
|
|
// Operand type used for addis/lis. For compatibility with the GNU assembler
// the assembler parser accepts immediates in the range -65536..65535; in
// every other respect (printing, encoding, decoding) the operand is treated
// as a 16-bit immediate.
def s17imm64 : Operand<i64> {
  // Only the parser match class differs from the 16-bit signed operand.
  let ParserMatchClass = PPCS17ImmAsmOperand;
  let PrintMethod = "printS16ImmOperand";
  let EncoderMethod = "getImm16Encoding";
  let DecoderMethod = "decodeSImmOperand<16>";
}
|
2012-09-06 03:22:27 +08:00
|
|
|
// Pointer-typed operand made of a single i64 immediate sub-operand ($imm).
def tocentry : Operand<iPTR> { let MIOperandInfo = (ops i64imm:$imm); }
|
2012-12-05 00:18:08 +08:00
|
|
|
// i64 operand with its own assembler match class and a dedicated encoder
// hook (getTLSRegEncoding).
def tlsreg : Operand<i64> {
  let ParserMatchClass = PPCTLSRegOperand;
  let EncoderMethod = "getTLSRegEncoding";
}
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
// Plain i64 operand with no custom hooks; it is the $sym sub-operand of the
// tlscall operand.
def tlsgd : Operand<i64>;
|
2013-07-03 05:31:04 +08:00
|
|
|
// Compound call operand: the call target ($func) paired with a tlsgd symbol
// ($sym). Printing and encoding both go through dedicated hooks.
def tlscall : Operand<i64> {
  let MIOperandInfo = (ops calltarget:$func, tlsgd:$sym);
  let PrintMethod = "printTLSCall";
  let EncoderMethod = "getTLSCallEncoding";
}
|
2006-06-21 05:23:06 +08:00
|
|
|
|
2006-06-21 07:18:58 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// 64-bit transformation functions.
|
|
|
|
//
|
|
|
|
|
|
|
|
def SHL64 : SDNodeXForm<imm, [{
  // Transformation function: 63 - imm, returned as an i32 immediate.
  const uint64_t Amt = N->getZExtValue();
  return getI32Imm(63 - Amt);
}]>;
|
2006-06-21 05:23:06 +08:00
|
|
|
|
2006-06-21 07:18:58 +08:00
|
|
|
def SRL64 : SDNodeXForm<imm, [{
  // Transformation function: 64 - imm, except that an immediate of 0 maps
  // to 0 rather than 64.
  const uint64_t Amt = N->getZExtValue();
  if (Amt == 0)
    return getI32Imm(0);
  return getI32Imm(64 - Amt);
}]>;
|
|
|
|
|
|
|
|
def HI32_48 : SDNodeXForm<imm, [{
  // Transformation function: move bits 32..47 of the immediate down into the
  // low bits; the cast to unsigned short drops everything above them.
  const uint64_t Imm = N->getZExtValue();
  const unsigned short Field = (unsigned short)(Imm >> 32);
  return getI32Imm(Field);
}]>;
|
|
|
|
|
|
|
|
def HI48_64 : SDNodeXForm<imm, [{
  // Transformation function: move bits 48..63 of the immediate down into the
  // low bits and return them as an i32 immediate.
  const uint64_t Imm = N->getZExtValue();
  const unsigned short Field = (unsigned short)(Imm >> 48);
  return getI32Imm(Field);
}]>;
|
2006-06-21 05:23:06 +08:00
|
|
|
|
2006-06-17 04:22:01 +08:00
|
|
|
|
2006-11-15 02:44:47 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Calls.
|
|
|
|
//
|
|
|
|
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
|
2013-03-26 18:53:03 +08:00
|
|
|
let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
|
PPC: Prep for if conversion of bctr[l]
This adds in-principle support for if-converting the bctr[l] instructions.
These instructions are used for indirect branching. It seems, however, that the
current if converter will never actually predicate these. To do so, it would
need the ability to hoist a few setup insts. out of the conditionally-executed
block. For example, code like this:
void foo(int a, int (*bar)()) { if (a != 0) bar(); }
becomes:
...
beq 0, .LBB0_2
std 2, 40(1)
mr 12, 4
ld 3, 0(4)
ld 11, 16(4)
ld 2, 8(4)
mtctr 3
bctrl
ld 2, 40(1)
.LBB0_2:
...
and it would be safe to do all of this unconditionally with a predicated
beqctrl instruction.
llvm-svn: 179156
2013-04-10 14:42:34 +08:00
|
|
|
let isBranch = 1, isIndirectBranch = 1, Uses = [CTR8] in {
|
2013-11-28 07:26:09 +08:00
|
|
|
def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
|
|
|
|
[]>,
|
2013-03-26 18:53:03 +08:00
|
|
|
Requires<[In64BitMode]>;
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
def BCCCTR8 : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
|
|
|
|
"b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB,
|
|
|
|
[]>,
|
|
|
|
Requires<[In64BitMode]>;
|
|
|
|
|
|
|
|
def BCCTR8 : XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi),
|
|
|
|
"bcctr 12, $bi, 0", IIC_BrB, []>,
|
|
|
|
Requires<[In64BitMode]>;
|
|
|
|
def BCCTR8n : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi),
|
|
|
|
"bcctr 4, $bi, 0", IIC_BrB, []>,
|
PPC: Prep for if conversion of bctr[l]
This adds in-principle support for if-converting the bctr[l] instructions.
These instructions are used for indirect branching. It seems, however, that the
current if converter will never actually predicate these. To do so, it would
need the ability to hoist a few setup insts. out of the conditionally-executed
block. For example, code like this:
void foo(int a, int (*bar)()) { if (a != 0) bar(); }
becomes:
...
beq 0, .LBB0_2
std 2, 40(1)
mr 12, 4
ld 3, 0(4)
ld 11, 16(4)
ld 2, 8(4)
mtctr 3
bctrl
ld 2, 40(1)
.LBB0_2:
...
and it would be safe to do all of this unconditionally with a predicated
beqctrl instruction.
llvm-svn: 179156
2013-04-10 14:42:34 +08:00
|
|
|
Requires<[In64BitMode]>;
|
|
|
|
}
|
2013-03-26 18:53:03 +08:00
|
|
|
}
|
|
|
|
|
2006-11-15 02:44:47 +08:00
|
|
|
let Defs = [LR8] in
|
2012-10-05 02:14:28 +08:00
|
|
|
def MovePCtoLR8 : Pseudo<(outs), (ins), "#MovePCtoLR8", []>,
|
2006-11-15 02:44:47 +08:00
|
|
|
PPC970_Unit_BRU;
|
|
|
|
|
2013-03-26 18:53:03 +08:00
|
|
|
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
|
|
|
|
let Defs = [CTR8], Uses = [CTR8] in {
|
|
|
|
def BDZ8 : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bdz $dst">;
|
|
|
|
def BDNZ8 : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
|
|
|
|
"bdnz $dst">;
|
|
|
|
}
|
2013-04-10 06:58:37 +08:00
|
|
|
|
|
|
|
let isReturn = 1, Defs = [CTR8], Uses = [CTR8, LR8, RM] in {
|
|
|
|
def BDZLR8 : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bdzlr", IIC_BrB, []>;
|
2013-04-10 06:58:37 +08:00
|
|
|
def BDNZLR8 : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bdnzlr", IIC_BrB, []>;
|
2013-04-10 06:58:37 +08:00
|
|
|
}
|
2013-03-26 18:53:03 +08:00
|
|
|
}
|
|
|
|
|
2013-04-10 06:58:37 +08:00
|
|
|
|
|
|
|
|
2012-03-07 00:41:49 +08:00
|
|
|
let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
|
2006-11-15 02:44:47 +08:00
|
|
|
// Convenient aliases for call instructions
|
2008-10-30 02:26:45 +08:00
|
|
|
let Uses = [RM] in {
|
2013-03-22 23:24:13 +08:00
|
|
|
def BL8 : IForm<18, 0, 1, (outs), (ins calltarget:$func),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bl $func", IIC_BrB, []>; // See Pat patterns below.
|
2006-11-15 02:44:47 +08:00
|
|
|
|
2013-07-03 05:31:59 +08:00
|
|
|
def BL8_TLS : IForm<18, 0, 1, (outs), (ins tlscall:$func),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bl $func", IIC_BrB, []>;
|
2013-07-03 05:31:59 +08:00
|
|
|
|
2013-06-24 19:03:33 +08:00
|
|
|
def BLA8 : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bla $func", IIC_BrB, [(PPCcall (i64 imm:$func))]>;
|
2013-03-22 23:24:13 +08:00
|
|
|
}
|
|
|
|
let Uses = [RM], isCodeGenOnly = 1 in {
|
|
|
|
def BL8_NOP : IForm_and_DForm_4_zero<18, 0, 1, 24,
|
2012-07-14 04:44:29 +08:00
|
|
|
(outs), (ins calltarget:$func),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bl $func\n\tnop", IIC_BrB, []>;
|
2012-03-31 22:45:15 +08:00
|
|
|
|
2013-07-03 05:31:04 +08:00
|
|
|
def BL8_NOP_TLS : IForm_and_DForm_4_zero<18, 0, 1, 24,
|
|
|
|
(outs), (ins tlscall:$func),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bl $func\n\tnop", IIC_BrB, []>;
|
2012-12-13 03:29:35 +08:00
|
|
|
|
2013-03-22 23:24:13 +08:00
|
|
|
def BLA8_NOP : IForm_and_DForm_4_zero<18, 1, 1, 24,
|
2013-06-24 19:03:33 +08:00
|
|
|
(outs), (ins abscalltarget:$func),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bla $func\n\tnop", IIC_BrB,
|
2013-03-22 23:24:13 +08:00
|
|
|
[(PPCcall_nop (i64 imm:$func))]>;
|
2008-10-30 02:26:45 +08:00
|
|
|
}
|
2013-03-22 23:24:13 +08:00
|
|
|
let Uses = [CTR8, RM] in {
|
|
|
|
def BCTRL8 : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"bctrl", IIC_BrB, [(PPCbctrl)]>,
|
2013-03-22 23:24:13 +08:00
|
|
|
Requires<[In64BitMode]>;
|
2013-04-18 01:19:05 +08:00
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
let isCodeGenOnly = 1 in {
|
|
|
|
def BCCCTRL8 : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
|
|
|
|
"b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB,
|
|
|
|
[]>,
|
|
|
|
Requires<[In64BitMode]>;
|
|
|
|
|
|
|
|
def BCCTRL8 : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi),
|
|
|
|
"bcctrl 12, $bi, 0", IIC_BrB, []>,
|
|
|
|
Requires<[In64BitMode]>;
|
|
|
|
def BCCTRL8n : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi),
|
|
|
|
"bcctrl 4, $bi, 0", IIC_BrB, []>,
|
|
|
|
Requires<[In64BitMode]>;
|
|
|
|
}
|
2008-10-24 04:41:28 +08:00
|
|
|
}
|
2007-02-25 13:34:32 +08:00
|
|
|
}
|
2014-12-24 06:29:40 +08:00
|
|
|
|
|
|
|
let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
|
|
|
|
Defs = [LR8, X2], Uses = [CTR8, RM], RST = 2 in {
|
|
|
|
def BCTRL8_LDinto_toc :
|
|
|
|
XLForm_2_ext_and_DSForm_1<19, 528, 20, 0, 1, 58, 0, (outs),
|
|
|
|
(ins memrix:$src),
|
|
|
|
"bctrl\n\tld 2, $src", IIC_BrB,
|
|
|
|
[(PPCbctrl_load_toc ixaddr:$src)]>,
|
|
|
|
Requires<[In64BitMode]>;
|
|
|
|
}
|
|
|
|
|
2013-04-12 10:18:09 +08:00
|
|
|
} // Interpretation64Bit
|
2007-02-25 13:34:32 +08:00
|
|
|
|
2013-12-18 07:05:18 +08:00
|
|
|
// FIXME: Duplicating this for the asm parser should be unnecessary, but the
|
|
|
|
// previous definition must be marked as CodeGen only to prevent decoding
|
|
|
|
// conflicts.
|
|
|
|
let Interpretation64Bit = 1, isAsmParserOnly = 1 in
|
|
|
|
let isCall = 1, PPC970_Unit = 7, Defs = [LR8], Uses = [RM] in
|
|
|
|
def BL8_TLS_ : IForm<18, 0, 1, (outs), (ins tlscall:$func),
|
|
|
|
"bl $func", IIC_BrB, []>;
|
|
|
|
|
2006-11-15 02:44:47 +08:00
|
|
|
// Calls
|
2013-03-22 23:24:13 +08:00
|
|
|
// Direct calls to a global address: plain call -> BL8, call followed by a
// nop -> BL8_NOP.
def : Pat<(PPCcall (i64 tglobaladdr:$dst)), (BL8 tglobaladdr:$dst)>;
def : Pat<(PPCcall_nop (i64 tglobaladdr:$dst)), (BL8_NOP tglobaladdr:$dst)>;

// The same two selections for calls to an external symbol.
def : Pat<(PPCcall (i64 texternalsym:$dst)), (BL8 texternalsym:$dst)>;
def : Pat<(PPCcall_nop (i64 texternalsym:$dst)), (BL8_NOP texternalsym:$dst)>;
|
2006-11-15 02:44:47 +08:00
|
|
|
|
[PowerPC] Replace foul hackery with real calls to __tls_get_addr
My original support for the general dynamic and local dynamic TLS
models contained some fairly obtuse hacks to generate calls to
__tls_get_addr when lowering a TargetGlobalAddress. Rather than
generating real calls, special GET_TLS_ADDR nodes were used to wrap
the calls and only reveal them at assembly time. I attempted to
provide correct parameter and return values by chaining CopyToReg and
CopyFromReg nodes onto the GET_TLS_ADDR nodes, but this was also not
fully correct. Problems were seen with two back-to-back stores to TLS
variables, where the call sequences ended up overlapping with unhappy
results. Additionally, since these weren't real calls, the proper
register side effects of a call were not recorded, so clobbered values
were kept live across the calls.
The proper thing to do is to lower these into calls in the first
place. This is relatively straightforward; see the changes to
PPCTargetLowering::LowerGlobalTLSAddress() in PPCISelLowering.cpp.
The changes here are standard call lowering, except that we need to
track the fact that these calls will require a relocation. This is
done by adding a machine operand flag of MO_TLSLD or MO_TLSGD to the
TargetGlobalAddress operand that appears earlier in the sequence.
The calls to LowerCallTo() eventually find their way to
LowerCall_64SVR4() or LowerCall_32SVR4(), which call FinishCall(),
which calls PrepareCall(). In PrepareCall(), we detect the calls to
__tls_get_addr and immediately snag the TargetGlobalTLSAddress with
the annotated relocation information. This becomes an extra operand
on the call following the callee, which is expected for nodes of type
tlscall. We change the call opcode to CALL_TLS for this case. Back
in FinishCall(), we change it again to CALL_NOP_TLS for 64-bit only,
since we require a TOC-restore nop following the call for the 64-bit
ABIs.
During selection, patterns in PPCInstrInfo.td and PPCInstr64Bit.td
convert the CALL_TLS nodes into BL_TLS nodes, and convert the
CALL_NOP_TLS nodes into BL8_NOP_TLS nodes. This replaces the code
removed from PPCAsmPrinter.cpp, as the BL_TLS or BL8_NOP_TLS
nodes can now be emitted normally using their patterns and the
associated printTLSCall print method.
Finally, as a result of these changes, all references to get-tls-addr
in its various guises are no longer used, so they have been removed.
There are existing TLS tests to verify the changes haven't messed
anything up. I've added one new test that verifies that the problem
with the original code has been fixed.
llvm-svn: 221703
2014-11-12 04:44:09 +08:00
|
|
|
// TLS call followed by a nop: the external-symbol callee and the TLS global
// address are both forwarded to BL8_NOP_TLS, which takes a tlscall operand.
def : Pat<
  (PPCcall_nop_tls texternalsym:$func, tglobaltlsaddr:$sym),
  (BL8_NOP_TLS texternalsym:$func, tglobaltlsaddr:$sym)>;
|
|
|
|
|
2008-07-12 10:23:19 +08:00
|
|
|
// Atomic operations
|
2009-10-30 02:10:34 +08:00
|
|
|
let usesCustomInserter = 1 in {
|
2011-04-05 01:07:09 +08:00
|
|
|
let Defs = [CR0] in {
|
2008-07-12 10:23:19 +08:00
|
|
|
def ATOMIC_LOAD_ADD_I64 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_ADD_I64",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$dst, (atomic_load_add_64 xoaddr:$ptr, i64:$incr))]>;
|
2008-08-26 06:34:37 +08:00
|
|
|
def ATOMIC_LOAD_SUB_I64 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_SUB_I64",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$dst, (atomic_load_sub_64 xoaddr:$ptr, i64:$incr))]>;
|
2008-08-26 06:34:37 +08:00
|
|
|
def ATOMIC_LOAD_OR_I64 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_OR_I64",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$dst, (atomic_load_or_64 xoaddr:$ptr, i64:$incr))]>;
|
2008-08-26 06:34:37 +08:00
|
|
|
def ATOMIC_LOAD_XOR_I64 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_XOR_I64",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$dst, (atomic_load_xor_64 xoaddr:$ptr, i64:$incr))]>;
|
2008-08-26 06:34:37 +08:00
|
|
|
// 64-bit atomic load-AND pseudo. It sits inside the enclosing
// `usesCustomInserter = 1` / `Defs = [CR0]` scopes, like the other
// ATOMIC_*_I64 pseudos. The asm string is only a "#"-prefixed placeholder;
// its spelling now matches the record name and its siblings (upper-case
// "I64" instead of the stray "i64").
def ATOMIC_LOAD_AND_I64 : Pseudo<
  (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_AND_I64",
  [(set i64:$dst, (atomic_load_and_64 xoaddr:$ptr, i64:$incr))]>;
|
2008-08-26 06:34:37 +08:00
|
|
|
def ATOMIC_LOAD_NAND_I64 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_NAND_I64",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$dst, (atomic_load_nand_64 xoaddr:$ptr, i64:$incr))]>;
|
2008-08-26 06:34:37 +08:00
|
|
|
|
2008-08-22 11:49:10 +08:00
|
|
|
def ATOMIC_CMP_SWAP_I64 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$old, g8rc:$new), "#ATOMIC_CMP_SWAP_I64",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$dst, (atomic_cmp_swap_64 xoaddr:$ptr, i64:$old, i64:$new))]>;
|
2008-08-26 06:34:37 +08:00
|
|
|
|
2008-08-26 05:09:52 +08:00
|
|
|
def ATOMIC_SWAP_I64 : Pseudo<
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$new), "#ATOMIC_SWAP_I64",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$dst, (atomic_swap_64 xoaddr:$ptr, i64:$new))]>;
|
2008-08-22 11:49:10 +08:00
|
|
|
}
|
2008-04-19 10:30:38 +08:00
|
|
|
}
|
|
|
|
|
2008-07-12 10:23:19 +08:00
|
|
|
// Instructions to support atomic operations
|
2013-04-27 00:53:15 +08:00
|
|
|
// ldarx: selected for PPClarx with an i64 result and an xoaddr address.
def LDARX : XForm_1<31, 84,
                    (outs g8rc:$rD), (ins memrr:$ptr),
                    "ldarx $rD, $ptr", IIC_LdStLDARX,
                    [(set i64:$rD, (PPClarx xoaddr:$ptr))]>;
|
2008-07-12 10:23:19 +08:00
|
|
|
|
|
|
|
// stdcx.: selected for PPCstcx with an i64 value. CR0 is listed as an
// implicit def, and the record carries the isDOT marker.
let Defs = [CR0] in
def STDCX : XForm_1<31, 214,
                    (outs), (ins g8rc:$rS, memrr:$dst),
                    "stdcx. $rS, $dst", IIC_LdStSTDCX,
                    [(PPCstcx i64:$rS, xoaddr:$dst)]>,
            isDOT;
|
|
|
|
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
|
2008-10-30 02:26:45 +08:00
|
|
|
// Tail-call-return pseudo taking a direct call target plus an i32 offset.
// It carries no selection pattern of its own; separate Pat records map
// PPCtc_return on global addresses / external symbols onto it.
let isCall = 1, isReturn = 1, isTerminator = 1, isBarrier = 1, Uses = [RM] in
def TCRETURNdi8 : Pseudo<(outs), (ins calltarget:$dst, i32imm:$offset),
                         "#TC_RETURNd8 $dst $offset", []>;
|
|
|
|
|
2008-10-30 02:26:45 +08:00
|
|
|
// Tail-call-return pseudo for an absolute (immediate) call target; selected
// directly from PPCtc_return with an i64 immediate callee.
let isCall = 1, isReturn = 1, isTerminator = 1, isBarrier = 1, Uses = [RM] in
def TCRETURNai8 : Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
                         "#TC_RETURNa8 $func $offset",
                         [(PPCtc_return (i64 imm:$func), imm:$offset)]>;
|
|
|
|
|
2008-10-30 02:26:45 +08:00
|
|
|
// Tail-call-return pseudo whose target is a CTRRC8 register operand. It has
// no inline pattern; a separate Pat maps PPCtc_return onto it.
let isCall = 1, isReturn = 1, isTerminator = 1, isBarrier = 1, Uses = [RM] in
def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset),
                         "#TC_RETURNr8 $dst $offset", []>;
|
|
|
|
|
|
|
|
let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
|
2013-03-26 18:53:03 +08:00
|
|
|
isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR8, RM] in
|
2013-11-28 07:26:09 +08:00
|
|
|
def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
|
|
|
|
[]>,
|
2013-03-26 18:53:03 +08:00
|
|
|
Requires<[In64BitMode]>;
|
2008-04-30 17:16:33 +08:00
|
|
|
|
|
|
|
// Tail call emitted as a plain "b" to a call target. It is flagged as a
// branch, a call and a return at once, and has no selection pattern.
let isCall = 1, isReturn = 1, isBranch = 1, isTerminator = 1, isBarrier = 1,
    hasCtrlDep = 1, PPC970_Unit = 7, Uses = [RM] in
def TAILB8 : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
                   "b $dst", IIC_BrB, []>;
|
|
|
|
|
|
|
|
// Tail call emitted as "ba" (branch absolute) to an absolute call target.
//
// The second IForm argument is now 1. The other I-form records in this file
// pair it with the mnemonic: BL8 ("bl") and TAILB8 ("b") pass 0, BLA8 ("bla")
// passes 1. The previous value of 0 gave this record the same arguments as
// TAILB8 even though its mnemonic is the absolute form.
// NOTE(review): IForm itself is declared in another file; the meaning of the
// second argument is inferred from those sibling records -- confirm against
// the IForm class before landing.
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
def TAILBA8 : IForm<18, 1, 0, (outs), (ins abscalltarget:$dst),
                    "ba $dst", IIC_BrB,
                    []>;
|
2013-04-12 10:18:09 +08:00
|
|
|
} // Interpretation64Bit
|
2013-03-26 18:57:16 +08:00
|
|
|
|
2008-04-30 17:16:33 +08:00
|
|
|
// Tail-call returns to a symbolic callee (global address or external
// symbol) select the direct-target pseudo TCRETURNdi8.
def : Pat<(PPCtc_return (i64 tglobaladdr:$dst), imm:$imm),
          (TCRETURNdi8 tglobaladdr:$dst, imm:$imm)>;
def : Pat<(PPCtc_return (i64 texternalsym:$dst), imm:$imm),
          (TCRETURNdi8 texternalsym:$dst, imm:$imm)>;

// Tail-call returns through a CTRRC8 register select TCRETURNri8.
def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm),
          (TCRETURNri8 CTRRC8:$dst, imm:$imm)>;
|
|
|
|
|
2012-06-08 23:38:21 +08:00
|
|
|
|
2013-03-28 11:38:08 +08:00
|
|
|
// 64-bit CR instructions
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
|
2014-11-26 08:46:26 +08:00
|
|
|
let hasSideEffects = 0 in {
|
2013-07-04 01:59:07 +08:00
|
|
|
def MTOCRF8: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins g8rc:$ST),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtocrf $FXM, $ST", IIC_BrMCRX>,
|
2013-07-04 01:59:07 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_CRU;
|
|
|
|
|
|
|
|
def MTCRF8 : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, g8rc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtcrf $FXM, $rS", IIC_BrMCRX>,
|
2011-12-07 14:34:06 +08:00
|
|
|
PPC970_MicroCode, PPC970_Unit_CRU;
|
|
|
|
|
2013-09-12 13:24:49 +08:00
|
|
|
let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking.
|
[PowerPC] Always use mfocrf if available
When accessing just a single CR register, it is always preferable to
use mfocrf instead of mfcr, if the former is available on the CPU.
Current code makes that distinction in many, but not all places
where a single CR register value is retrieved. One missing
location is PPCRegisterInfo::lowerCRSpilling.
To fix this and make this simpler in the future, this patch changes
the bulk of the back-end to always assume mfocrf is available and
simply generate it when needed.
On machines that actually do not support mfocrf, the instruction
is replaced by mfcr at the very end, in EmitInstruction.
This has the additional benefit that we no longer need the
MFCRpseud hack, since before EmitInstruction we always have
a MFOCRF instruction pattern, which already models data flow
as required.
The patch also adds the MFOCRF8 version of the instruction,
which was missing so far.
Except for the PPCRegisterInfo::lowerCRSpilling case, no change
in generated code intended.
llvm-svn: 185556
2013-07-04 01:05:42 +08:00
|
|
|
def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM),
|
2013-12-01 04:41:13 +08:00
|
|
|
"mfocrf $rT, $FXM", IIC_SprMFCRF>,
|
[PowerPC] Always use mfocrf if available
When accessing just a single CR register, it is always preferable to
use mfocrf instead of mfcr, if the former is available on the CPU.
Current code makes that distinction in many, but not all places
where a single CR register value is retrieved. One missing
location is PPCRegisterInfo::lowerCRSpilling.
To fix this and make this simpler in the future, this patch changes
the bulk of the back-end to always assume mfocrf is available and
simply generate it when needed.
On machines that actually do not support mfocrf, the instruction
is replaced by mfcr at the very end, in EmitInstruction.
This has the additional benefit that we no longer need the
MFCRpseud hack, since before EmitInstruction we always have
a MFOCRF instruction pattern, which already models data flow
as required.
The patch also adds the MFOCRF8 version of the instruction,
which was missing so far.
Except for the PPCRegisterInfo::lowerCRSpilling case, no change
in generated code intended.
llvm-svn: 185556
2013-07-04 01:05:42 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_CRU;
|
2013-04-07 22:33:13 +08:00
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mfcr $rT", IIC_SprMFCR>,
|
2011-12-07 14:34:06 +08:00
|
|
|
PPC970_MicroCode, PPC970_Unit_CRU;
|
2014-11-26 08:46:26 +08:00
|
|
|
} // hasSideEffects = 0
|
2008-04-30 17:16:33 +08:00
|
|
|
|
2013-03-26 18:57:16 +08:00
|
|
|
// 64-bit SjLj exception-handling pseudo-instructions.  Both records are
// Pseudos that take a single memr:$buf operand, are restricted to
// In64BitMode, and inherit hasSideEffects, isBarrier and usesCustomInserter
// from this enclosing 'let'.
let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
|
2013-07-17 13:35:44 +08:00
|
|
|
// The setjmp pseudo yields an i32 result in a gprc register and is modelled
// as defining CTR8.
let Defs = [CTR8] in
|
2013-04-27 00:53:15 +08:00
|
|
|
def EH_SjLj_SetJmp64 : Pseudo<(outs gprc:$dst), (ins memr:$buf),
|
2013-03-22 05:37:52 +08:00
|
|
|
"#EH_SJLJ_SETJMP64",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
|
2013-03-22 05:37:52 +08:00
|
|
|
Requires<[In64BitMode]>;
|
|
|
|
// The longjmp pseudo has no outputs and is additionally marked as a
// terminator.
let isTerminator = 1 in
|
|
|
|
def EH_SjLj_LongJmp64 : Pseudo<(outs), (ins memr:$buf),
|
|
|
|
"#EH_SJLJ_LONGJMP64",
|
|
|
|
[(PPCeh_sjlj_longjmp addr:$buf)]>,
|
|
|
|
Requires<[In64BitMode]>;
|
|
|
|
} // hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1
|
|
|
|
|
2006-11-15 02:44:47 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// 64-bit SPR manipulation instrs.
|
|
|
|
|
2008-10-24 04:41:28 +08:00
|
|
|
let Uses = [CTR8] in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def MFCTR8 : XFXForm_1_ext<31, 339, 9, (outs g8rc:$rT), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mfctr $rT", IIC_SprMFSPR>,
|
2006-11-15 02:44:47 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_FXU;
|
2008-10-24 04:41:28 +08:00
|
|
|
}
|
2013-03-26 03:05:30 +08:00
|
|
|
let Pattern = [(PPCmtctr i64:$rS)], Defs = [CTR8] in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtctr $rS", IIC_SprMTSPR>,
|
2006-11-15 02:44:47 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_FXU;
|
2006-06-28 02:36:44 +08:00
|
|
|
}
|
2013-12-18 07:05:18 +08:00
|
|
|
let hasSideEffects = 1, Defs = [CTR8] in {
|
Implement PPC counter loops as a late IR-level pass
The old PPCCTRLoops pass, like the Hexagon pass version from which it was
derived, could only handle some simple loops in canonical form. We cannot
directly adapt the new Hexagon hardware loops pass, however, because the
Hexagon pass contains a fundamental assumption that non-constant-trip-count
loops will contain a guard, and this is not always true (the result being that
incorrect negative counts can be generated). With this commit, we replace the
pass with a late IR-level pass which makes use of SE to calculate the
backedge-taken counts and safely generate the loop-count expressions (including
any necessary max() parts). This IR level pass inserts custom intrinsics that
are lowered into the desired decrement-and-branch instructions.
The most fragile part of this new implementation is that interfering uses of
the counter register must be detected on the IR level (and, on PPC, this also
includes any indirect branches in addition to function calls). Also, to make
all of this work, we need a variant of the mtctr instruction that is marked
as having side effects. Without this, machine-code level CSE, DCE, etc.
illegally transform the resulting code. Hopefully, this can be improved
in the future.
This new pass is smaller than the original (and much smaller than the new
Hexagon hardware loops pass), and can handle many additional cases correctly.
In addition, the preheader-creation code has been copied from LoopSimplify, and
after we decide on where it belongs, this code will be refactored so that it
can be explicitly shared (making this implementation even smaller).
The new test-case files ctrloop-{le,lt,ne}.ll have been adapted from tests for
the new Hexagon pass. There are a few classes of loops that this pass does not
transform (noted by FIXMEs in the files), but these deficiencies can be
addressed within the SE infrastructure (thus helping many other passes as well).
llvm-svn: 181927
2013-05-16 05:37:41 +08:00
|
|
|
let Pattern = [(int_ppc_mtctr i64:$rS)] in
|
2013-05-21 00:08:37 +08:00
|
|
|
def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtctr $rS", IIC_SprMTSPR>,
|
2013-05-21 00:08:37 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_FXU;
|
Implement PPC counter loops as a late IR-level pass
The old PPCCTRLoops pass, like the Hexagon pass version from which it was
derived, could only handle some simple loops in canonical form. We cannot
directly adapt the new Hexagon hardware loops pass, however, because the
Hexagon pass contains a fundamental assumption that non-constant-trip-count
loops will contain a guard, and this is not always true (the result being that
incorrect negative counts can be generated). With this commit, we replace the
pass with a late IR-level pass which makes use of SE to calculate the
backedge-taken counts and safely generate the loop-count expressions (including
any necessary max() parts). This IR level pass inserts custom intrinsics that
are lowered into the desired decrement-and-branch instructions.
The most fragile part of this new implementation is that interfering uses of
the counter register must be detected on the IR level (and, on PPC, this also
includes any indirect branches in addition to function calls). Also, to make
all of this work, we need a variant of the mtctr instruction that is marked
as having side effects. Without this, machine-code level CSE, DCE, etc.
illegally transform the resulting code. Hopefully, this can be improved
in the future.
This new pass is smaller than the original (and much smaller than the new
Hexagon hardware loops pass), and can handle many additional cases correctly.
In addition, the preheader-creation code has been copied from LoopSimplify, and
after we decide on where it belongs, this code will be refactored so that it
can be explicitly shared (making this implementation even smaller).
The new test-case files ctrloop-{le,lt,ne}.ll have been adapted from tests for
the new Hexagon pass. There are a few classes of loops that this pass does not
transform (noted by FIXMEs in the files), but these deficiencies can be
addressed within the SE infrastructure (thus helping many other passes as well).
llvm-svn: 181927
2013-05-16 05:37:41 +08:00
|
|
|
}
|
2006-06-28 02:18:41 +08:00
|
|
|
|
2013-12-18 07:05:18 +08:00
|
|
|
let Pattern = [(set i64:$rT, readcyclecounter)] in
|
2013-04-27 00:53:15 +08:00
|
|
|
def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs g8rc:$rT), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mfspr $rT, 268", IIC_SprMFTB>,
|
2012-08-04 22:10:46 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_FXU;
|
2012-08-08 01:04:20 +08:00
|
|
|
// Note that encoding mftb using mfspr is now the preferred form,
|
|
|
|
// and has been since at least ISA v2.03. The mftb instruction has
|
|
|
|
// now been phased out. Using mfspr, however, is known not to work on
|
|
|
|
// the POWER3.
|
2012-08-04 22:10:46 +08:00
|
|
|
|
2007-09-12 03:55:27 +08:00
|
|
|
let Defs = [X1], Uses = [X1] in
|
2013-04-27 00:53:15 +08:00
|
|
|
def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$result,
|
|
|
|
(PPCdynalloc i64:$negsize, iaddr:$fpsi))]>;
|
2006-11-17 06:43:37 +08:00
|
|
|
|
2008-10-24 04:41:28 +08:00
|
|
|
let Defs = [LR8] in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mtlr $rS", IIC_SprMTSPR>,
|
2006-11-15 02:44:47 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_FXU;
|
2008-10-24 04:41:28 +08:00
|
|
|
}
|
|
|
|
let Uses = [LR8] in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def MFLR8 : XFXForm_1_ext<31, 339, 8, (outs g8rc:$rT), (ins),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mflr $rT", IIC_SprMFSPR>,
|
2006-11-15 02:44:47 +08:00
|
|
|
PPC970_DGroup_First, PPC970_Unit_FXU;
|
2008-10-24 04:41:28 +08:00
|
|
|
}
|
2013-04-12 10:18:09 +08:00
|
|
|
} // Interpretation64Bit
|
2006-11-15 02:44:47 +08:00
|
|
|
|
2006-06-17 04:22:01 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Fixed point instructions.
|
|
|
|
//
|
|
|
|
|
|
|
|
let PPC970_Unit = 1 in { // FXU Operations.
|
2013-04-12 10:18:09 +08:00
|
|
|
let Interpretation64Bit = 1 in {
|
2014-11-26 08:46:26 +08:00
|
|
|
let hasSideEffects = 0 in {
|
2013-12-18 07:05:18 +08:00
|
|
|
let isCodeGenOnly = 1 in {
|
2006-06-17 04:22:01 +08:00
|
|
|
|
2012-08-28 10:10:33 +08:00
|
|
|
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
|
2013-05-24 06:48:06 +08:00
|
|
|
def LI8 : DForm_2_r0<14, (outs g8rc:$rD), (ins s16imm64:$imm),
|
2013-11-28 07:26:09 +08:00
|
|
|
"li $rD, $imm", IIC_IntSimple,
|
Change some PowerPC PatLeaf definitions to ImmLeaf for fast-isel.
Using PatLeaf rather than ImmLeaf when defining immediate predicates
prevents simple patterns using those predicates from being recognized
for fast instruction selection. This patch replaces the immSExt16
PatLeaf predicate with two ImmLeaf predicates, imm32SExt16 and
imm64SExt16, allowing a few more patterns to be recognized (ADDI,
ADDIC, MULLI, ADDI8, and ADDIC8). Using the new predicates does not
help for LI, LI8, SUBFIC, and SUBFIC8 because these are rejected for
other reasons, but I see no reason to retain the PatLeaf predicate.
No functional change intended, and thus no test cases yet. This is
preliminary work for enabling fast-isel support for PowerPC. When
that support is ready, we'll be able to test this function.
llvm-svn: 182510
2013-05-23 04:09:24 +08:00
|
|
|
[(set i64:$rD, imm64SExt16:$imm)]>;
|
2013-06-26 21:49:53 +08:00
|
|
|
def LIS8 : DForm_2_r0<15, (outs g8rc:$rD), (ins s17imm64:$imm),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lis $rD, $imm", IIC_IntSimple,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD, imm16ShiftedSExt:$imm)]>;
|
2012-08-28 10:10:33 +08:00
|
|
|
}
|
Add some 64-bit logical ops.
Split imm16Shifted into a sext/zext form for 64-bit support.
Add some patterns for immediate formation. For example, we now compile this:
static unsigned long long Y;
void test3() {
Y = 0xF0F00F00;
}
into:
_test3:
li r2, 3840
lis r3, ha16(_Y)
xoris r2, r2, 61680
std r2, lo16(_Y)(r3)
blr
GCC produces:
_test3:
li r0,0
lis r2,ha16(_Y)
ori r0,r0,61680
sldi r0,r0,16
ori r0,r0,3840
std r0,lo16(_Y)(r2)
blr
llvm-svn: 28883
2006-06-21 06:34:10 +08:00
|
|
|
|
|
|
|
// Logical ops.
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
defm NAND8: XForm_6r<31, 476, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"nand", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (not (and i64:$rS, i64:$rB)))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm AND8 : XForm_6r<31, 28, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"and", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (and i64:$rS, i64:$rB))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
} // isCommutable
|
2013-04-27 00:53:15 +08:00
|
|
|
defm ANDC8: XForm_6r<31, 60, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"andc", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (and i64:$rS, (not i64:$rB)))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
defm OR8 : XForm_6r<31, 444, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"or", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (or i64:$rS, i64:$rB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm NOR8 : XForm_6r<31, 124, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"nor", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (not (or i64:$rS, i64:$rB)))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
} // isCommutable
|
2013-04-27 00:53:15 +08:00
|
|
|
defm ORC8 : XForm_6r<31, 412, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"orc", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (or i64:$rS, (not i64:$rB)))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
// Commutable 64-bit XOR-family register-register ops: EQV8 matches
// not(xor rS, rB) and XOR8 matches xor rS, rB.
let isCommutable = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
defm EQV8 : XForm_6r<31, 284, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"eqv", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (not (xor i64:$rS, i64:$rB)))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm XOR8 : XForm_6r<31, 316, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"xor", "$rA, $rS, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (xor i64:$rS, i64:$rB))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
} // isCommutable
|
2006-06-21 07:11:59 +08:00
|
|
|
|
|
|
|
// Logical ops with immediate.
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CR0] in {
|
2014-01-03 05:26:59 +08:00
|
|
|
def ANDIo8 : DForm_4<28, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
|
2013-11-28 07:26:09 +08:00
|
|
|
"andi. $dst, $src1, $src2", IIC_IntGeneral,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$dst, (and i64:$src1, immZExt16:$src2))]>,
|
Add some 64-bit logical ops.
Split imm16Shifted into a sext/zext form for 64-bit support.
Add some patterns for immediate formation. For example, we now compile this:
static unsigned long long Y;
void test3() {
Y = 0xF0F00F00;
}
into:
_test3:
li r2, 3840
lis r3, ha16(_Y)
xoris r2, r2, 61680
std r2, lo16(_Y)(r3)
blr
GCC produces:
_test3:
li r0,0
lis r2,ha16(_Y)
ori r0,r0,61680
sldi r0,r0,16
ori r0,r0,3840
std r0,lo16(_Y)(r2)
blr
llvm-svn: 28883
2006-06-21 06:34:10 +08:00
|
|
|
isDOT;
|
2014-01-03 05:26:59 +08:00
|
|
|
def ANDISo8 : DForm_4<29, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
|
2013-11-28 07:26:09 +08:00
|
|
|
"andis. $dst, $src1, $src2", IIC_IntGeneral,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$dst, (and i64:$src1, imm16ShiftedZExt:$src2))]>,
|
Add some 64-bit logical ops.
Split imm16Shifted into a sext/zext form for 64-bit support.
Add some patterns for immediate formation. For example, we now compile this:
static unsigned long long Y;
void test3() {
Y = 0xF0F00F00;
}
into:
_test3:
li r2, 3840
lis r3, ha16(_Y)
xoris r2, r2, 61680
std r2, lo16(_Y)(r3)
blr
GCC produces:
_test3:
li r0,0
lis r2,ha16(_Y)
ori r0,r0,61680
sldi r0,r0,16
ori r0,r0,3840
std r0,lo16(_Y)(r2)
blr
llvm-svn: 28883
2006-06-21 06:34:10 +08:00
|
|
|
isDOT;
|
2013-04-13 02:17:57 +08:00
|
|
|
}
|
2014-01-03 05:26:59 +08:00
|
|
|
def ORI8 : DForm_4<24, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
|
2013-11-28 07:26:09 +08:00
|
|
|
"ori $dst, $src1, $src2", IIC_IntSimple,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$dst, (or i64:$src1, immZExt16:$src2))]>;
|
2014-01-03 05:26:59 +08:00
|
|
|
def ORIS8 : DForm_4<25, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
|
2013-11-28 07:26:09 +08:00
|
|
|
"oris $dst, $src1, $src2", IIC_IntSimple,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$dst, (or i64:$src1, imm16ShiftedZExt:$src2))]>;
|
2014-01-03 05:26:59 +08:00
|
|
|
def XORI8 : DForm_4<26, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
|
2013-11-28 07:26:09 +08:00
|
|
|
"xori $dst, $src1, $src2", IIC_IntSimple,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$dst, (xor i64:$src1, immZExt16:$src2))]>;
|
2014-01-03 05:26:59 +08:00
|
|
|
def XORIS8 : DForm_4<27, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
|
2013-11-28 07:26:09 +08:00
|
|
|
"xoris $dst, $src1, $src2", IIC_IntSimple,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$dst, (xor i64:$src1, imm16ShiftedZExt:$src2))]>;
|
Add some 64-bit logical ops.
Split imm16Shifted into a sext/zext form for 64-bit support.
Add some patterns for immediate formation. For example, we now compile this:
static unsigned long long Y;
void test3() {
Y = 0xF0F00F00;
}
into:
_test3:
li r2, 3840
lis r3, ha16(_Y)
xoris r2, r2, 61680
std r2, lo16(_Y)(r3)
blr
GCC produces:
_test3:
li r0,0
lis r2,ha16(_Y)
ori r0,r0,61680
sldi r0,r0,16
ori r0,r0,3840
std r0,lo16(_Y)(r2)
blr
llvm-svn: 28883
2006-06-21 06:34:10 +08:00
|
|
|
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm ADD8 : XOForm_1r<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"add", "$rT, $rA, $rB", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rT, (add i64:$rA, i64:$rB))]>;
|
2012-12-05 00:18:08 +08:00
|
|
|
// ADD8 has a special form: reg = ADD8(reg, sym@tls) for use by the
|
|
|
|
// initial-exec thread-local storage model.
|
2013-04-27 00:53:15 +08:00
|
|
|
def ADD8TLS : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"add $rT, $rA, $rB", IIC_IntSimple,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rT, (add i64:$rA, tglobaltlsaddr:$rB))]>;
|
2007-05-17 14:52:46 +08:00
|
|
|
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"addc", "$rT, $rA, $rB", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i64:$rT, (addc i64:$rA, i64:$rB))]>,
|
|
|
|
PPC970_DGroup_Cracked;
|
2014-03-24 23:07:28 +08:00
|
|
|
|
2013-04-13 02:17:57 +08:00
|
|
|
let Defs = [CARRY] in
|
2013-04-27 00:53:15 +08:00
|
|
|
def ADDIC8 : DForm_2<12, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
|
2013-11-28 07:26:09 +08:00
|
|
|
"addic $rD, $rA, $imm", IIC_IntGeneral,
|
Change some PowerPC PatLeaf definitions to ImmLeaf for fast-isel.
Using PatLeaf rather than ImmLeaf when defining immediate predicates
prevents simple patterns using those predicates from being recognized
for fast instruction selection. This patch replaces the immSExt16
PatLeaf predicate with two ImmLeaf predicates, imm32SExt16 and
imm64SExt16, allowing a few more patterns to be recognized (ADDI,
ADDIC, MULLI, ADDI8, and ADDIC8). Using the new predicates does not
help for LI, LI8, SUBFIC, and SUBFIC8 because these are rejected for
other reasons, but I see no reason to retain the PatLeaf predicate.
No functional change intended, and thus no test cases yet. This is
preliminary work for enabling fast-isel support for PowerPC. When
that support is ready, we'll be able to test this function.
llvm-svn: 182510
2013-05-23 04:09:24 +08:00
|
|
|
[(set i64:$rD, (addc i64:$rA, imm64SExt16:$imm))]>;
|
2013-05-24 06:48:06 +08:00
|
|
|
def ADDI8 : DForm_2<14, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s16imm64:$imm),
|
2013-11-28 07:26:09 +08:00
|
|
|
"addi $rD, $rA, $imm", IIC_IntSimple,
|
Change some PowerPC PatLeaf definitions to ImmLeaf for fast-isel.
Using PatLeaf rather than ImmLeaf when defining immediate predicates
prevents simple patterns using those predicates from being recognized
for fast instruction selection. This patch replaces the immSExt16
PatLeaf predicate with two ImmLeaf predicates, imm32SExt16 and
imm64SExt16, allowing a few more patterns to be recognized (ADDI,
ADDIC, MULLI, ADDI8, and ADDIC8). Using the new predicates does not
help for LI, LI8, SUBFIC, and SUBFIC8 because these are rejected for
other reasons, but I see no reason to retain the PatLeaf predicate.
No functional change intended, and thus no test cases yet. This is
preliminary work for enabling fast-isel support for PowerPC. When
that support is ready, we'll be able to test this function.
llvm-svn: 182510
2013-05-23 04:09:24 +08:00
|
|
|
[(set i64:$rD, (add i64:$rA, imm64SExt16:$imm))]>;
|
2013-06-26 21:49:53 +08:00
|
|
|
def ADDIS8 : DForm_2<15, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s17imm64:$imm),
|
2013-11-28 07:26:09 +08:00
|
|
|
"addis $rD, $rA, $imm", IIC_IntSimple,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD, (add i64:$rA, imm16ShiftedSExt:$imm))]>;
|
Add some 64-bit logical ops.
Split imm16Shifted into a sext/zext form for 64-bit support.
Add some patterns for immediate formation. For example, we now compile this:
static unsigned long long Y;
void test3() {
Y = 0xF0F00F00;
}
into:
_test3:
li r2, 3840
lis r3, ha16(_Y)
xoris r2, r2, 61680
std r2, lo16(_Y)(r3)
blr
GCC produces:
_test3:
li r0,0
lis r2,ha16(_Y)
ori r0,r0,61680
sldi r0,r0,16
ori r0,r0,3840
std r0,lo16(_Y)(r2)
blr
llvm-svn: 28883
2006-06-21 06:34:10 +08:00
|
|
|
|
2009-09-19 04:15:22 +08:00
|
|
|
let Defs = [CARRY] in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def SUBFIC8: DForm_2< 8, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
|
2013-11-28 07:26:09 +08:00
|
|
|
"subfic $rD, $rA, $imm", IIC_IntGeneral,
|
Change some PowerPC PatLeaf definitions to ImmLeaf for fast-isel.
Using PatLeaf rather than ImmLeaf when defining immediate predicates
prevents simple patterns using those predicates from being recognized
for fast instruction selection. This patch replaces the immSExt16
PatLeaf predicate with two ImmLeaf predicates, imm32SExt16 and
imm64SExt16, allowing a few more patterns to be recognized (ADDI,
ADDIC, MULLI, ADDI8, and ADDIC8). Using the new predicates does not
help for LI, LI8, SUBFIC, and SUBFIC8 because these are rejected for
other reasons, but I see no reason to retain the PatLeaf predicate.
No functional change intended, and thus no test cases yet. This is
preliminary work for enabling fast-isel support for PowerPC. When
that support is ready, we'll be able to test this function.
llvm-svn: 182510
2013-05-23 04:09:24 +08:00
|
|
|
[(set i64:$rD, (subc imm64SExt16:$imm, i64:$rA))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm SUBFC8 : XOForm_1r<31, 8, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"subfc", "$rT, $rA, $rB", IIC_IntGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rT, (subc i64:$rB, i64:$rA))]>,
|
|
|
|
PPC970_DGroup_Cracked;
|
|
|
|
}
|
2013-04-27 00:53:15 +08:00
|
|
|
defm SUBF8 : XOForm_1r<31, 40, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"subf", "$rT, $rA, $rB", IIC_IntGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rT, (sub i64:$rB, i64:$rA))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm NEG8 : XOForm_3r<31, 104, 0, (outs g8rc:$rT), (ins g8rc:$rA),
|
2013-11-28 07:26:09 +08:00
|
|
|
"neg", "$rT, $rA", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rT, (ineg i64:$rA))]>;
|
2013-04-13 02:17:57 +08:00
|
|
|
let Uses = [CARRY] in {
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm ADDE8 : XOForm_1rc<31, 138, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"adde", "$rT, $rA, $rB", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i64:$rT, (adde i64:$rA, i64:$rB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm ADDME8 : XOForm_3rc<31, 234, 0, (outs g8rc:$rT), (ins g8rc:$rA),
|
2013-11-28 07:26:09 +08:00
|
|
|
"addme", "$rT, $rA", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i64:$rT, (adde i64:$rA, -1))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm ADDZE8 : XOForm_3rc<31, 202, 0, (outs g8rc:$rT), (ins g8rc:$rA),
|
2013-11-28 07:26:09 +08:00
|
|
|
"addze", "$rT, $rA", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i64:$rT, (adde i64:$rA, 0))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm SUBFE8 : XOForm_1rc<31, 136, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"subfe", "$rT, $rA, $rB", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i64:$rT, (sube i64:$rB, i64:$rA))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm SUBFME8 : XOForm_3rc<31, 232, 0, (outs g8rc:$rT), (ins g8rc:$rA),
|
2013-11-28 07:26:09 +08:00
|
|
|
"subfme", "$rT, $rA", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i64:$rT, (sube -1, i64:$rA))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm SUBFZE8 : XOForm_3rc<31, 200, 0, (outs g8rc:$rT), (ins g8rc:$rA),
|
2013-11-28 07:26:09 +08:00
|
|
|
"subfze", "$rT, $rA", IIC_IntGeneral,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i64:$rT, (sube 0, i64:$rA))]>;
|
2013-04-12 10:18:09 +08:00
|
|
|
}
|
2013-12-18 07:05:18 +08:00
|
|
|
} // isCodeGenOnly
|
2013-04-12 10:18:09 +08:00
|
|
|
|
2013-12-18 07:05:18 +08:00
|
|
|
// FIXME: Duplicating this for the asm parser should be unnecessary, but the
|
|
|
|
// previous definition must be marked as CodeGen only to prevent decoding
|
|
|
|
// conflicts.
|
|
|
|
let isAsmParserOnly = 1 in
|
|
|
|
def ADD8TLS_ : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB),
|
|
|
|
"add $rT, $rA, $rB", IIC_IntSimple, []>;
|
2013-04-12 10:18:09 +08:00
|
|
|
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
defm MULHD : XOForm_1r<31, 73, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mulhd", "$rT, $rA, $rB", IIC_IntMulHW,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rT, (mulhs i64:$rA, i64:$rB))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm MULHDU : XOForm_1r<31, 9, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mulhdu", "$rT, $rA, $rB", IIC_IntMulHWU,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rT, (mulhu i64:$rA, i64:$rB))]>;
|
2014-03-24 23:07:28 +08:00
|
|
|
} // isCommutable
|
2013-04-12 10:18:09 +08:00
|
|
|
} // hasSideEffects = 0
|
|
|
|
} // Interpretation64Bit
|
2006-06-17 04:22:01 +08:00
|
|
|
|
2014-11-26 08:46:26 +08:00
|
|
|
// 64-bit compare instructions.  Each one writes a crrc result and is tagged
// isPPC64; none of the definitions below supplies a selection pattern.
//   CMPD / CMPLD   - compare two g8rc registers.
//   CMPDI / CMPLDI - compare a g8rc register with an s16imm64 / u16imm64
//                    immediate, respectively.
let isCompare = 1, hasSideEffects = 0 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def CMPD : XForm_16_ext<31, 0, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"cmpd $crD, $rA, $rB", IIC_IntCompare>, isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
def CMPLD : XForm_16_ext<31, 32, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"cmpld $crD, $rA, $rB", IIC_IntCompare>, isPPC64;
|
2014-01-03 05:26:59 +08:00
|
|
|
def CMPDI : DForm_5_ext<11, (outs crrc:$crD), (ins g8rc:$rA, s16imm64:$imm),
|
2013-11-28 07:26:09 +08:00
|
|
|
"cmpdi $crD, $rA, $imm", IIC_IntCompare>, isPPC64;
|
2014-01-03 05:26:59 +08:00
|
|
|
def CMPLDI : DForm_6_ext<10, (outs crrc:$dst), (ins g8rc:$src1, u16imm64:$src2),
|
2013-11-28 07:26:09 +08:00
|
|
|
"cmpldi $dst, $src1, $src2",
|
|
|
|
IIC_IntCompare>, isPPC64;
|
2013-04-15 10:37:46 +08:00
|
|
|
} // isCompare = 1, hasSideEffects = 0
|
2006-06-17 04:22:01 +08:00
|
|
|
|
2014-11-26 08:46:26 +08:00
|
|
|
let hasSideEffects = 0 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
defm SLD : XForm_6r<31, 27, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"sld", "$rA, $rS, $rB", IIC_IntRotateD,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (PPCshl i64:$rS, i32:$rB))]>, isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm SRD : XForm_6r<31, 539, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"srd", "$rA, $rS, $rB", IIC_IntRotateD,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (PPCsrl i64:$rS, i32:$rB))]>, isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm SRAD : XForm_6rc<31, 794, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"srad", "$rA, $rS, $rB", IIC_IntRotateD,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i64:$rA, (PPCsra i64:$rS, i32:$rB))]>, isPPC64;
|
2013-04-12 10:18:09 +08:00
|
|
|
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
defm EXTSB8 : XForm_11r<31, 954, (outs g8rc:$rA), (ins g8rc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"extsb", "$rA, $rS", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (sext_inreg i64:$rS, i8))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"extsh", "$rA, $rS", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (sext_inreg i64:$rS, i16))]>;
|
2014-12-13 07:59:36 +08:00
|
|
|
|
|
|
|
defm SLW8 : XForm_6r<31, 24, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
|
|
|
|
"slw", "$rA, $rS, $rB", IIC_IntGeneral, []>;
|
|
|
|
defm SRW8 : XForm_6r<31, 536, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
|
|
|
|
"srw", "$rA, $rS, $rB", IIC_IntGeneral, []>;
|
2013-04-12 10:18:09 +08:00
|
|
|
} // Interpretation64Bit
|
|
|
|
|
2013-08-27 03:42:51 +08:00
|
|
|
// For fast-isel:
|
|
|
|
let isCodeGenOnly = 1 in {
|
|
|
|
def EXTSB8_32_64 : XForm_11<31, 954, (outs g8rc:$rA), (ins gprc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"extsb $rA, $rS", IIC_IntSimple, []>, isPPC64;
|
2013-08-27 03:42:51 +08:00
|
|
|
def EXTSH8_32_64 : XForm_11<31, 922, (outs g8rc:$rA), (ins gprc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"extsh $rA, $rS", IIC_IntSimple, []>, isPPC64;
|
2013-08-27 03:42:51 +08:00
|
|
|
} // isCodeGenOnly for fast-isel
|
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
defm EXTSW : XForm_11r<31, 986, (outs g8rc:$rA), (ins g8rc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"extsw", "$rA, $rS", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (sext_inreg i64:$rS, i32))]>, isPPC64;
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"extsw", "$rA, $rS", IIC_IntSimple,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (sext i32:$rS))]>, isPPC64;
|
2006-06-17 04:22:01 +08:00
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
|
2013-11-28 07:26:09 +08:00
|
|
|
"sradi", "$rA, $rS, $SH", IIC_IntRotateDI,
|
2013-04-13 02:17:57 +08:00
|
|
|
[(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"cntlzd", "$rA, $rS", IIC_IntGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rA, (ctlz i64:$rS))]>;
|
2013-11-21 04:54:55 +08:00
|
|
|
def POPCNTD : XForm_11<31, 506, (outs g8rc:$rA), (ins g8rc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"popcntd $rA, $rS", IIC_IntGeneral,
|
2013-11-21 04:54:55 +08:00
|
|
|
[(set i64:$rA, (ctpop i64:$rS))]>;
|
2007-03-25 12:44:03 +08:00
|
|
|
|
2013-04-01 23:58:15 +08:00
|
|
|
// popcntw also does a population count on the high 32 bits (storing the
|
|
|
|
// results in the high 32 bits of the output). We'll ignore that here (which is
|
|
|
|
// safe because we never separately use the high part of the 64-bit registers).
|
2013-11-21 04:54:55 +08:00
|
|
|
def POPCNTW : XForm_11<31, 378, (outs gprc:$rA), (ins gprc:$rS),
|
2013-11-28 07:26:09 +08:00
|
|
|
"popcntw $rA, $rS", IIC_IntGeneral,
|
2013-11-21 04:54:55 +08:00
|
|
|
[(set i32:$rA, (ctpop i32:$rS))]>;
|
2013-04-12 10:18:09 +08:00
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
defm DIVD : XOForm_1r<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"divd", "$rT, $rA, $rB", IIC_IntDivD,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rT, (sdiv i64:$rA, i64:$rB))]>, isPPC64,
|
|
|
|
PPC970_DGroup_First, PPC970_DGroup_Cracked;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm DIVDU : XOForm_1r<31, 457, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"divdu", "$rT, $rA, $rB", IIC_IntDivD,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rT, (udiv i64:$rA, i64:$rB))]>, isPPC64,
|
|
|
|
PPC970_DGroup_First, PPC970_DGroup_Cracked;
|
2014-03-24 23:07:28 +08:00
|
|
|
let isCommutable = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
defm MULLD : XOForm_1r<31, 233, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mulld", "$rT, $rA, $rB", IIC_IntMulHD,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set i64:$rT, (mul i64:$rA, i64:$rB))]>, isPPC64;
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-08-07 01:03:03 +08:00
|
|
|
def MULLI8 : DForm_2<7, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
|
2013-11-28 07:26:09 +08:00
|
|
|
"mulli $rD, $rA, $imm", IIC_IntMulLI,
|
2013-08-07 01:03:03 +08:00
|
|
|
[(set i64:$rD, (mul i64:$rA, imm64SExt16:$imm))]>;
|
2013-04-12 10:18:09 +08:00
|
|
|
} // hasSideEffects = 0
|
2006-06-27 07:53:10 +08:00
|
|
|
|
2014-11-26 08:46:26 +08:00
|
|
|
let hasSideEffects = 0 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$rA),
|
|
|
|
(ins g8rc:$rSi, g8rc:$rS, u6imm:$SH, u6imm:$MBE),
|
2013-11-28 07:26:09 +08:00
|
|
|
"rldimi", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
|
2013-04-12 10:18:09 +08:00
|
|
|
[]>, isPPC64, RegConstraint<"$rSi = $rA">,
|
|
|
|
NoEncode<"$rSi">;
|
2006-06-17 04:22:01 +08:00
|
|
|
|
|
|
|
// Rotate instructions.
|
2013-04-26 23:39:12 +08:00
|
|
|
defm RLDCL : MDSForm_1r<30, 8,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE),
|
2013-11-28 07:26:09 +08:00
|
|
|
"rldcl", "$rA, $rS, $rB, $MBE", IIC_IntRotateD,
|
2013-04-12 10:18:09 +08:00
|
|
|
[]>, isPPC64;
|
2013-06-25 21:17:10 +08:00
|
|
|
defm RLDCR : MDSForm_1r<30, 9,
|
|
|
|
(outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE),
|
2013-11-28 07:26:09 +08:00
|
|
|
"rldcr", "$rA, $rS, $rB, $MBE", IIC_IntRotateD,
|
2013-06-25 21:17:10 +08:00
|
|
|
[]>, isPPC64;
|
2013-04-12 10:18:09 +08:00
|
|
|
defm RLDICL : MDForm_1r<30, 0,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
|
2013-11-28 07:26:09 +08:00
|
|
|
"rldicl", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
|
2013-04-12 10:18:09 +08:00
|
|
|
[]>, isPPC64;
|
2013-08-27 03:42:51 +08:00
|
|
|
// For fast-isel:
|
|
|
|
let isCodeGenOnly = 1 in
|
|
|
|
def RLDICL_32_64 : MDForm_1<30, 0,
|
|
|
|
(outs g8rc:$rA),
|
|
|
|
(ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
|
2013-11-28 07:26:09 +08:00
|
|
|
"rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
|
2013-08-27 03:42:51 +08:00
|
|
|
[]>, isPPC64;
|
|
|
|
// End fast-isel.
|
2013-04-12 10:18:09 +08:00
|
|
|
defm RLDICR : MDForm_1r<30, 1,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
|
2013-11-28 07:26:09 +08:00
|
|
|
"rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
|
2013-04-12 10:18:09 +08:00
|
|
|
[]>, isPPC64;
|
2013-06-25 21:17:10 +08:00
|
|
|
defm RLDIC : MDForm_1r<30, 2,
|
|
|
|
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
|
2013-11-28 07:26:09 +08:00
|
|
|
"rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
|
2013-06-25 21:17:10 +08:00
|
|
|
[]>, isPPC64;
|
2013-04-12 10:18:09 +08:00
|
|
|
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
defm RLWINM8 : MForm_2r<21, (outs g8rc:$rA),
|
|
|
|
(ins g8rc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
|
2013-11-28 07:26:09 +08:00
|
|
|
"rlwinm", "$rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[]>;
|
2011-12-07 14:34:06 +08:00
|
|
|
|
2014-12-13 07:59:36 +08:00
|
|
|
defm RLWNM8 : MForm_2r<23, (outs g8rc:$rA),
|
|
|
|
(ins g8rc:$rS, g8rc:$rB, u5imm:$MB, u5imm:$ME),
|
|
|
|
"rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral,
|
|
|
|
[]>;
|
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// RLWIMI can be commuted if the rotate amount is zero.
|
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
|
|
|
defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA),
|
|
|
|
(ins g8rc:$rSi, g8rc:$rS, u5imm:$SH, u5imm:$MB,
|
|
|
|
u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
|
|
|
|
IIC_IntRotate, []>, PPC970_DGroup_Cracked,
|
|
|
|
RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
|
|
|
|
|
2013-04-07 23:06:53 +08:00
|
|
|
let isSelect = 1 in
|
2012-11-14 03:14:19 +08:00
|
|
|
def ISEL8 : AForm_4<31, 15,
|
2013-04-27 00:53:15 +08:00
|
|
|
(outs g8rc:$rT), (ins g8rc_nox0:$rA, g8rc:$rB, crbitrc:$cond),
|
2013-11-28 07:26:09 +08:00
|
|
|
"isel $rT, $rA, $rB, $cond", IIC_IntGeneral,
|
2012-06-23 07:10:08 +08:00
|
|
|
[]>;
|
2013-04-12 10:18:09 +08:00
|
|
|
} // Interpretation64Bit
|
2014-11-26 08:46:26 +08:00
|
|
|
} // hasSideEffects = 0
|
2006-06-27 07:53:10 +08:00
|
|
|
} // End FXU Operations.
|
2006-06-17 04:22:01 +08:00
|
|
|
|
|
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Load/Store instructions.
|
|
|
|
//
|
|
|
|
|
|
|
|
|
2006-07-14 12:42:02 +08:00
|
|
|
// Sign extending loads.
|
2008-12-04 02:15:48 +08:00
|
|
|
let canFoldAsLoad = 1, PPC970_Unit = 2 in {
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
def LHA8: DForm_1<42, (outs g8rc:$rD), (ins memri:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lha $rD, $src", IIC_LdStLHA,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD, (sextloadi16 iaddr:$src))]>,
|
2006-07-14 12:42:02 +08:00
|
|
|
PPC970_DGroup_Cracked;
|
2013-04-27 00:53:15 +08:00
|
|
|
def LWA : DSForm_1<58, 2, (outs g8rc:$rD), (ins memrix:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lwa $rD, $src", IIC_LdStLWA,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
2013-03-19 07:00:58 +08:00
|
|
|
(aligned4sextloadi32 ixaddr:$src))]>, isPPC64,
|
2006-06-20 08:38:36 +08:00
|
|
|
PPC970_DGroup_Cracked;
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
def LHAX8: XForm_1<31, 343, (outs g8rc:$rD), (ins memrr:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lhax $rD, $src", IIC_LdStLHA,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD, (sextloadi16 xaddr:$src))]>,
|
2006-07-14 12:42:02 +08:00
|
|
|
PPC970_DGroup_Cracked;
|
2013-04-27 00:53:15 +08:00
|
|
|
def LWAX : XForm_1<31, 341, (outs g8rc:$rD), (ins memrr:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lwax $rD, $src", IIC_LdStLHA,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
|
2006-06-17 04:22:01 +08:00
|
|
|
PPC970_DGroup_Cracked;
|
2013-08-30 10:29:45 +08:00
|
|
|
// For fast-isel:
|
|
|
|
let isCodeGenOnly = 1, mayLoad = 1 in {
|
|
|
|
def LWA_32 : DSForm_1<58, 2, (outs gprc:$rD), (ins memrix:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lwa $rD, $src", IIC_LdStLWA, []>, isPPC64,
|
2013-08-30 10:29:45 +08:00
|
|
|
PPC970_DGroup_Cracked;
|
|
|
|
def LWAX_32 : XForm_1<31, 341, (outs gprc:$rD), (ins memrr:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lwax $rD, $src", IIC_LdStLHA, []>, isPPC64,
|
2013-08-30 10:29:45 +08:00
|
|
|
PPC970_DGroup_Cracked;
|
|
|
|
} // end fast-isel isCodeGenOnly
|
2006-07-14 12:42:02 +08:00
|
|
|
|
2006-11-11 07:58:45 +08:00
|
|
|
// Update forms.
|
2014-11-26 08:46:26 +08:00
|
|
|
let mayLoad = 1, hasSideEffects = 0 in {
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
def LHAU8 : DForm_1<43, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
|
2013-03-20 03:52:30 +08:00
|
|
|
(ins memri:$addr),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lhau $rD, $addr", IIC_LdStLHAU,
|
2013-03-20 03:52:30 +08:00
|
|
|
[]>, RegConstraint<"$addr.reg = $ea_result">,
|
2006-11-16 07:24:18 +08:00
|
|
|
NoEncode<"$ea_result">;
|
2006-11-11 07:58:45 +08:00
|
|
|
// NO LWAU!
|
|
|
|
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
2013-04-27 00:53:15 +08:00
|
|
|
def LHAUX8 : XForm_1<31, 375, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
|
2012-06-20 23:43:03 +08:00
|
|
|
(ins memrr:$addr),
|
2013-12-01 04:41:13 +08:00
|
|
|
"lhaux $rD, $addr", IIC_LdStLHAUX,
|
2013-03-22 22:59:13 +08:00
|
|
|
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
|
2012-06-20 23:43:03 +08:00
|
|
|
NoEncode<"$ea_result">;
|
2013-04-27 00:53:15 +08:00
|
|
|
def LWAUX : XForm_1<31, 373, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
|
2012-06-20 23:43:03 +08:00
|
|
|
(ins memrr:$addr),
|
2013-12-01 04:41:13 +08:00
|
|
|
"lwaux $rD, $addr", IIC_LdStLHAUX,
|
2013-03-22 22:59:13 +08:00
|
|
|
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
|
2012-06-20 23:43:03 +08:00
|
|
|
NoEncode<"$ea_result">, isPPC64;
|
2006-11-11 07:58:45 +08:00
|
|
|
}
|
2013-03-20 03:53:27 +08:00
|
|
|
}
|
2006-11-11 07:58:45 +08:00
|
|
|
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
|
2006-07-14 12:42:02 +08:00
|
|
|
// Zero extending loads.
|
2008-12-04 02:15:48 +08:00
|
|
|
let canFoldAsLoad = 1, PPC970_Unit = 2 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def LBZ8 : DForm_1<34, (outs g8rc:$rD), (ins memri:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lbz $rD, $src", IIC_LdStLoad,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD, (zextloadi8 iaddr:$src))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
def LHZ8 : DForm_1<40, (outs g8rc:$rD), (ins memri:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lhz $rD, $src", IIC_LdStLoad,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD, (zextloadi16 iaddr:$src))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
def LWZ8 : DForm_1<32, (outs g8rc:$rD), (ins memri:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lwz $rD, $src", IIC_LdStLoad,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD, (zextloadi32 iaddr:$src))]>, isPPC64;
|
2006-07-14 12:42:02 +08:00
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
def LBZX8 : XForm_1<31, 87, (outs g8rc:$rD), (ins memrr:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lbzx $rD, $src", IIC_LdStLoad,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD, (zextloadi8 xaddr:$src))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
def LHZX8 : XForm_1<31, 279, (outs g8rc:$rD), (ins memrr:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lhzx $rD, $src", IIC_LdStLoad,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD, (zextloadi16 xaddr:$src))]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
def LWZX8 : XForm_1<31, 23, (outs g8rc:$rD), (ins memrr:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lwzx $rD, $src", IIC_LdStLoad,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD, (zextloadi32 xaddr:$src))]>;
|
2006-11-11 07:58:45 +08:00
|
|
|
|
|
|
|
|
|
|
|
// Update forms.
|
2014-11-26 08:46:26 +08:00
|
|
|
let mayLoad = 1, hasSideEffects = 0 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def LBZU8 : DForm_1<35, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lbzu $rD, $addr", IIC_LdStLoadUpd,
|
2006-11-16 07:24:18 +08:00
|
|
|
[]>, RegConstraint<"$addr.reg = $ea_result">,
|
|
|
|
NoEncode<"$ea_result">;
|
2013-04-27 00:53:15 +08:00
|
|
|
def LHZU8 : DForm_1<41, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lhzu $rD, $addr", IIC_LdStLoadUpd,
|
2006-11-16 07:24:18 +08:00
|
|
|
[]>, RegConstraint<"$addr.reg = $ea_result">,
|
|
|
|
NoEncode<"$ea_result">;
|
2013-04-27 00:53:15 +08:00
|
|
|
def LWZU8 : DForm_1<33, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
|
2013-11-28 07:26:09 +08:00
|
|
|
"lwzu $rD, $addr", IIC_LdStLoadUpd,
|
2006-11-16 07:24:18 +08:00
|
|
|
[]>, RegConstraint<"$addr.reg = $ea_result">,
|
|
|
|
NoEncode<"$ea_result">;
|
2012-06-20 23:43:03 +08:00
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
def LBZUX8 : XForm_1<31, 119, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
|
2012-06-20 23:43:03 +08:00
|
|
|
(ins memrr:$addr),
|
2013-12-01 04:41:13 +08:00
|
|
|
"lbzux $rD, $addr", IIC_LdStLoadUpdX,
|
2013-03-22 22:59:13 +08:00
|
|
|
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
|
2012-06-20 23:43:03 +08:00
|
|
|
NoEncode<"$ea_result">;
|
2013-04-27 00:53:15 +08:00
|
|
|
def LHZUX8 : XForm_1<31, 311, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
|
2012-06-20 23:43:03 +08:00
|
|
|
(ins memrr:$addr),
|
2013-12-01 04:41:13 +08:00
|
|
|
"lhzux $rD, $addr", IIC_LdStLoadUpdX,
|
2013-03-22 22:59:13 +08:00
|
|
|
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
|
2012-06-20 23:43:03 +08:00
|
|
|
NoEncode<"$ea_result">;
|
2013-04-27 00:53:15 +08:00
|
|
|
def LWZUX8 : XForm_1<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
|
2012-06-20 23:43:03 +08:00
|
|
|
(ins memrr:$addr),
|
2013-12-01 04:41:13 +08:00
|
|
|
"lwzux $rD, $addr", IIC_LdStLoadUpdX,
|
2013-03-22 22:59:13 +08:00
|
|
|
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
|
2012-06-20 23:43:03 +08:00
|
|
|
NoEncode<"$ea_result">;
|
2006-11-11 07:58:45 +08:00
|
|
|
}
|
2008-12-03 10:30:17 +08:00
|
|
|
}
|
2013-04-12 10:18:09 +08:00
|
|
|
} // Interpretation64Bit
|
2006-07-14 12:42:02 +08:00
|
|
|
|
|
|
|
|
|
|
|
// Full 8-byte loads.
|
2008-12-04 02:15:48 +08:00
|
|
|
let canFoldAsLoad = 1, PPC970_Unit = 2 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def LD : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"ld $rD, $src", IIC_LdStLD,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD, (aligned4load ixaddr:$src))]>, isPPC64;
|
2014-10-31 18:33:14 +08:00
|
|
|
// The following four definitions are selected for small code model only.
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI incompatibility in passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
// Otherwise, we need to create two instructions to form a 32-bit offset,
|
|
|
|
// so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGISel::Select().
|
2013-04-27 00:53:15 +08:00
|
|
|
def LDtoc: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
|
2012-10-05 02:14:28 +08:00
|
|
|
"#LDtoc",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
|
|
|
(PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
def LDtocJTI: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
|
2012-10-05 02:14:28 +08:00
|
|
|
"#LDtocJTI",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
|
|
|
(PPCtoc_entry tjumptable:$disp, i64:$reg))]>, isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
def LDtocCPT: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
|
2012-10-05 02:14:28 +08:00
|
|
|
"#LDtocCPT",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
|
|
|
(PPCtoc_entry tconstpool:$disp, i64:$reg))]>, isPPC64;
|
2014-10-31 18:33:14 +08:00
|
|
|
// Small-code-model TOC load for a block address: matches PPCtoc_entry with a
// tblockaddress displacement and the register operand $reg, producing an i64
// result in $rD. The asm string is only a placeholder naming this pseudo; it
// previously read "#LDtocCPT" (copied from the constant-pool variant), which
// made the two pseudos indistinguishable wherever that string is shown.
def LDtocBA: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
                  "#LDtocBA",
                  [(set i64:$rD,
                     (PPCtoc_entry tblockaddress:$disp, i64:$reg))]>, isPPC64;
|
2012-02-25 01:54:01 +08:00
|
|
|
|
[PPC64] Add missing dependency on X2 to LDinto_toc.
The LDinto_toc pattern has been part of 64-bit PowerPC for a long
time, and represents loading from a memory location into the TOC
register (X2). However, this pattern doesn't explicitly record that
it modifies that register. This patch adds the missing dependency.
It was very surprising to me that this has never shown up as a problem
in the past, and that we only saw this problem recently in a single
scenario when building a self-hosted clang. It turns out that in most
cases we have another dependency present that keeps the LDinto_toc
instruction tied in place. LDinto_toc is used for TOC restore
following a call site, so this is a typical sequence:
BCTRL8 <regmask>, %CTR8<imp-use>, %RM<imp-use>, %X3<imp-use>, %X12<imp-use>, %X1<imp-def>, ...
LDinto_toc 24, %X1
ADJCALLSTACKUP 96, 0, %R1<imp-def>, %R1<imp-use>
Because the LDinto_toc is inserted prior to the ADJCALLSTACKUP, there
is a natural anti-dependency between the two that keeps it in place.
Therefore we don't usually see a problem. However, in one particular
case, one call is followed immediately by another call, and the second
call requires a parameter that is a TOC-relative address. This is the
code sequence:
BCTRL8 <regmask>, %CTR8<imp-use>, %RM<imp-use>, %X3<imp-use>, %X4<imp-use>, %X5<imp-use>, %X12<imp-use>, %X1<imp-def>, ...
LDinto_toc 24, %X1
ADJCALLSTACKUP 96, 0, %R1<imp-def>, %R1<imp-use>
ADJCALLSTACKDOWN 96, %R1<imp-def>, %R1<imp-use>
%vreg39<def> = ADDIStocHA %X2, <ga:@.str>; G8RC_and_G8RC_NOX0:%vreg39
%vreg40<def> = ADDItocL %vreg39<kill>, <ga:@.str>; G8RC:%vreg40 G8RC_and_G8RC_NOX0:%vreg39
Note that the back-to-back stack adjustments are the same size! The
back end is smart enough to recognize this and optimize them away:
BCTRL8 <regmask>, %CTR8<imp-use>, %RM<imp-use>, %X3<imp-use>, %X4<imp-use>, %X5<imp-use>, %X12<imp-use>, %X1<imp-def>, ...
LDinto_toc 24, %X1
%vreg39<def> = ADDIStocHA %X2, <ga:@.str>; G8RC_and_G8RC_NOX0:%vreg39
%vreg40<def> = ADDItocL %vreg39<kill>, <ga:@.str>; G8RC:%vreg40 G8RC_and_G8RC_NOX0:%vreg39
Now there is nothing to prevent the ADDIStocHA instruction from moving
ahead of the LDinto_toc instruction, and because of the longest-path
heuristic, this is what happens.
With the accompanying patch, %X2 is represented as an implicit def:
BCTRL8 <regmask>, %CTR8<imp-use>, %RM<imp-use>, %X3<imp-use>, %X4<imp-use>, %X5<imp-use>, %X12<imp-use>, %X1<imp-def>, ...
LDinto_toc 24, %X1, %X2<imp-def,dead>
ADJCALLSTACKUP 96, 0, %R1<imp-def,dead>, %R1<imp-use>
ADJCALLSTACKDOWN 96, %R1<imp-def,dead>, %R1<imp-use>
%vreg39<def> = ADDIStocHA %X2, <ga:@.str>; G8RC_and_G8RC_NOX0:%vreg39
%vreg40<def> = ADDItocL %vreg39<kill>, <ga:@.str>; G8RC:%vreg40 G8RC_and_G8RC_NOX0:%vreg39
So now when the two stack adjustments are removed, ADDIStocHA is
prevented from being moved above LDinto_toc.
I have not yet created a test case for this, because the original
failure occurs on a relatively large function that needs reduction.
However, this is a fairly serious bug, despite its infrequency, and I
wanted to get this patch onto the list as soon as possible so that it
can be considered for a 3.5 backport. I'll work on whittling down a
test case.
Have we missed the boat for 3.5 at this point?
Thanks,
Bill
llvm-svn: 215685
2014-08-15 09:25:26 +08:00
|
|
|
let hasSideEffects = 1, isCodeGenOnly = 1, RST = 2, Defs = [X2] in
|
[PowerPC] Simplify and improve loading into TOC register
During an indirect function call sequence on the 64-bit SVR4 ABI,
generated code must load and then restore the TOC register.
This does not use a regular LOAD instruction since the TOC
register r2 is marked as reserved. Instead, there are two
special instruction patterns:
let RST = 2, DS = 2 in
def LDinto_toc: DSForm_1a<58, 0, (outs), (ins g8rc:$reg),
"ld 2, 8($reg)", IIC_LdStLD,
[(PPCload_toc i64:$reg)]>, isPPC64;
let RST = 2, DS = 10, RA = 1 in
def LDtoc_restore : DSForm_1a<58, 0, (outs), (ins),
"ld 2, 40(1)", IIC_LdStLD,
[(PPCtoc_restore)]>, isPPC64;
Note that these not only restrict the destination of the
load to r2, but they also restrict the *source* of the
load to particular address combinations. The latter is
a problem when we want to support the ELFv2 ABI, since
there the TOC save slot is no longer at 40(1).
This patch replaces those two instructions with a single
instruction pattern that only hard-codes r2 as destination,
but supports generic addresses as source. This will allow
supporting the ELFv2 ABI, and also helps generate more
efficient code for calls to absolute addresses (allowing
simplification of the ppc64-calls.ll test case).
llvm-svn: 211193
2014-06-19 01:52:49 +08:00
|
|
|
def LDinto_toc: DSForm_1<58, 0, (outs), (ins memrix:$src),
|
|
|
|
"ld 2, $src", IIC_LdStLD,
|
|
|
|
[(PPCload_toc ixaddr:$src)]>, isPPC64;
|
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
def LDX : XForm_1<31, 21, (outs g8rc:$rD), (ins memrr:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"ldx $rD, $src", IIC_LdStLD,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD, (load xaddr:$src))]>, isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
def LDBRX : XForm_1<31, 532, (outs g8rc:$rD), (ins memrr:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"ldbrx $rD, $src", IIC_LdStLoad,
|
2013-03-29 03:25:55 +08:00
|
|
|
[(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64;
|
|
|
|
|
2014-11-26 08:46:26 +08:00
|
|
|
let mayLoad = 1, hasSideEffects = 0 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def LDU : DSForm_1<58, 1, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memrix:$addr),
|
2013-11-28 07:26:09 +08:00
|
|
|
"ldu $rD, $addr", IIC_LdStLDU,
|
2006-11-16 07:24:18 +08:00
|
|
|
[]>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
|
|
|
|
NoEncode<"$ea_result">;
|
2006-11-11 07:58:45 +08:00
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
def LDUX : XForm_1<31, 53, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
|
2012-06-20 23:43:03 +08:00
|
|
|
(ins memrr:$addr),
|
2013-12-01 04:41:13 +08:00
|
|
|
"ldux $rD, $addr", IIC_LdStLDUX,
|
2013-03-22 22:59:13 +08:00
|
|
|
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
|
2012-06-20 23:43:03 +08:00
|
|
|
NoEncode<"$ea_result">, isPPC64;
|
2006-06-17 04:22:01 +08:00
|
|
|
}
|
2013-04-07 14:30:47 +08:00
|
|
|
}
|
2006-07-14 12:42:02 +08:00
|
|
|
|
2009-12-18 21:00:15 +08:00
|
|
|
def : Pat<(PPCload ixaddr:$src),
|
|
|
|
(LD ixaddr:$src)>;
|
|
|
|
def : Pat<(PPCload xaddr:$src),
|
|
|
|
(LDX xaddr:$src)>;
|
|
|
|
|
2013-02-22 01:12:27 +08:00
|
|
|
// Support for medium and large code model.
|
2013-04-27 00:53:15 +08:00
|
|
|
// ADDIStocHA - Pseudo (asm string "#ADDIStocHA") producing an i64 in $rD from
// the PPCaddisTocHA node applied to a base register $reg (g8rc_nox0) and a
// tglobaladdr displacement $disp (tocentry operand). Predicated on isPPC64.
// Per the introducing commit message, this is the first half of the medium
// code model two-instruction sequence (the "addis ..., sym@toc@ha" part) and
// is followed by either LDtocL or ADDItocL; the pseudo is turned into a real
// instruction in PPCAsmPrinter::EmitInstruction - confirm against the current
// asm printer.
def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
"#ADDIStocHA",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
|
|
|
(PPCaddisTocHA i64:$reg, tglobaladdr:$disp))]>,
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
// LDtocL - Pseudo (asm string "#LDtocL") producing an i64 in $rD from the
// PPCldTocL node. Predicated on isPPC64.
// Operand order: the tocentry displacement $disp comes FIRST and the base
// register $reg (g8rc_nox0) second, in both the ins list and the pattern;
// this is the reverse of ADDIStocHA and ADDItocL, so keep the two lists in
// step when editing.
// Per the introducing commit message, this is the "ld ..., sym@toc@l(reg)"
// half of the medium code model addis/ld sequence - confirm against the
// current asm printer.
def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg),
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
"#LDtocL",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
|
|
|
(PPCldTocL tglobaladdr:$disp, i64:$reg))]>, isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
// ADDItocL - Pseudo (asm string "#ADDItocL") producing an i64 in $rD from the
// PPCaddiTocL node applied to a base register $reg (g8rc_nox0) and a
// tglobaladdr displacement $disp (tocentry operand). Predicated on isPPC64.
// Per the introducing commit message, this is the "addi ..., sym@toc@l" half
// of the medium code model addis/addi sequence, used where the address can be
// formed directly instead of being loaded from a TOC entry - confirm against
// the current asm printer.
def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI compatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
"#ADDItocL",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
|
|
|
(PPCaddiTocL i64:$reg, tglobaladdr:$disp))]>, isPPC64;
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section .toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI incompatibility with passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
llvm-svn: 168708
2012-11-28 01:35:46 +08:00
|
|
|
|
2012-12-05 00:18:08 +08:00
|
|
|
// Support for thread-local storage.
|
2013-05-24 06:48:06 +08:00
|
|
|
def ADDISgotTprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
"#ADDISgotTprelHA",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
|
|
|
(PPCaddisGotTprelHA i64:$reg,
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
tglobaltlsaddr:$disp))]>,
|
|
|
|
isPPC64;
|
2013-05-24 06:48:06 +08:00
|
|
|
def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg),
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
"#LDgotTprelL",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
|
|
|
(PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>,
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
llvm-svn: 170209
2012-12-15 01:02:38 +08:00
|
|
|
isPPC64;
|
2013-03-26 03:04:58 +08:00
|
|
|
def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g),
|
|
|
|
(ADD8TLS $in, tglobaltlsaddr:$g)>;
|
2013-05-24 06:48:06 +08:00
|
|
|
def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
"#ADDIStlsgdHA",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
|
|
|
(PPCaddisTlsgdHA i64:$reg, tglobaltlsaddr:$disp))]>,
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
isPPC64;
|
2013-05-24 06:48:06 +08:00
|
|
|
def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
"#ADDItlsgdL",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
|
|
|
(PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>,
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
llvm-svn: 169910
2012-12-12 04:30:11 +08:00
|
|
|
isPPC64;
|
2013-05-24 06:48:06 +08:00
|
|
|
def ADDIStlsldHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
|
2012-12-13 03:29:35 +08:00
|
|
|
"#ADDIStlsldHA",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
|
|
|
(PPCaddisTlsldHA i64:$reg, tglobaltlsaddr:$disp))]>,
|
2012-12-13 03:29:35 +08:00
|
|
|
isPPC64;
|
2013-05-24 06:48:06 +08:00
|
|
|
def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
|
2012-12-13 03:29:35 +08:00
|
|
|
"#ADDItlsldL",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
|
|
|
(PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>,
|
2012-12-13 03:29:35 +08:00
|
|
|
isPPC64;
|
2013-05-24 06:48:06 +08:00
|
|
|
def ADDISdtprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
|
2012-12-13 03:29:35 +08:00
|
|
|
"#ADDISdtprelHA",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
|
|
|
(PPCaddisDtprelHA i64:$reg,
|
2012-12-14 04:57:10 +08:00
|
|
|
tglobaltlsaddr:$disp))]>,
|
2012-12-13 03:29:35 +08:00
|
|
|
isPPC64;
|
2013-05-24 06:48:06 +08:00
|
|
|
def ADDIdtprelL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
|
2012-12-13 03:29:35 +08:00
|
|
|
"#ADDIdtprelL",
|
2013-03-26 03:05:30 +08:00
|
|
|
[(set i64:$rD,
|
|
|
|
(PPCaddiDtprelL i64:$reg, tglobaltlsaddr:$disp))]>,
|
2012-12-13 03:29:35 +08:00
|
|
|
isPPC64;
|
2012-12-05 00:18:08 +08:00
|
|
|
|
2008-01-06 13:53:26 +08:00
|
|
|
let PPC970_Unit = 2 in {
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
|
2006-07-14 12:42:02 +08:00
|
|
|
// Truncating stores.
|
2013-04-27 00:53:15 +08:00
|
|
|
def STB8 : DForm_1<38, (outs), (ins g8rc:$rS, memri:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"stb $rS, $src", IIC_LdStStore,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(truncstorei8 i64:$rS, iaddr:$src)]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
def STH8 : DForm_1<44, (outs), (ins g8rc:$rS, memri:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"sth $rS, $src", IIC_LdStStore,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(truncstorei16 i64:$rS, iaddr:$src)]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
def STW8 : DForm_1<36, (outs), (ins g8rc:$rS, memri:$src),
|
2013-11-28 07:26:09 +08:00
|
|
|
"stw $rS, $src", IIC_LdStStore,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(truncstorei32 i64:$rS, iaddr:$src)]>;
|
2013-04-27 00:53:15 +08:00
|
|
|
def STBX8 : XForm_8<31, 215, (outs), (ins g8rc:$rS, memrr:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
"stbx $rS, $dst", IIC_LdStStore,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(truncstorei8 i64:$rS, xaddr:$dst)]>,
|
2006-07-14 12:42:02 +08:00
|
|
|
PPC970_DGroup_Cracked;
|
2013-04-27 00:53:15 +08:00
|
|
|
def STHX8 : XForm_8<31, 407, (outs), (ins g8rc:$rS, memrr:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
"sthx $rS, $dst", IIC_LdStStore,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(truncstorei16 i64:$rS, xaddr:$dst)]>,
|
2006-07-14 12:42:02 +08:00
|
|
|
PPC970_DGroup_Cracked;
|
2013-04-27 00:53:15 +08:00
|
|
|
def STWX8 : XForm_8<31, 151, (outs), (ins g8rc:$rS, memrr:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
"stwx $rS, $dst", IIC_LdStStore,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(truncstorei32 i64:$rS, xaddr:$dst)]>,
|
2006-07-14 12:42:02 +08:00
|
|
|
PPC970_DGroup_Cracked;
|
2013-04-12 10:18:09 +08:00
|
|
|
} // Interpretation64Bit
|
|
|
|
|
2006-11-16 08:57:19 +08:00
|
|
|
// Normal 8-byte stores.
|
2013-04-27 00:53:15 +08:00
|
|
|
def STD : DSForm_1<62, 0, (outs), (ins g8rc:$rS, memrix:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
"std $rS, $dst", IIC_LdStSTD,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(aligned4store i64:$rS, ixaddr:$dst)]>, isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
def STDX : XForm_8<31, 149, (outs), (ins g8rc:$rS, memrr:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
"stdx $rS, $dst", IIC_LdStSTD,
|
2013-03-26 03:05:30 +08:00
|
|
|
[(store i64:$rS, xaddr:$dst)]>, isPPC64,
|
2006-11-16 08:57:19 +08:00
|
|
|
PPC970_DGroup_Cracked;
|
2013-04-27 00:53:15 +08:00
|
|
|
def STDBRX: XForm_8<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
"stdbrx $rS, $dst", IIC_LdStStore,
|
2013-03-29 03:25:55 +08:00
|
|
|
[(PPCstbrx i64:$rS, xoaddr:$dst, i64)]>, isPPC64,
|
|
|
|
PPC970_DGroup_Cracked;
|
2006-06-17 04:22:01 +08:00
|
|
|
}
|
|
|
|
|
2013-03-20 03:52:04 +08:00
|
|
|
// Stores with Update (pre-inc).
|
|
|
|
let PPC970_Unit = 2, mayStore = 1 in {
|
2013-12-18 07:05:18 +08:00
|
|
|
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
|
2013-04-27 00:53:15 +08:00
|
|
|
def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
"stbu $rS, $dst", IIC_LdStStoreUpd, []>,
|
2013-03-20 03:52:04 +08:00
|
|
|
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
|
2013-04-27 00:53:15 +08:00
|
|
|
def STHU8 : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
"sthu $rS, $dst", IIC_LdStStoreUpd, []>,
|
2013-03-20 03:52:04 +08:00
|
|
|
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
|
2013-04-27 00:53:15 +08:00
|
|
|
def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
"stwu $rS, $dst", IIC_LdStStoreUpd, []>,
|
2013-03-20 03:52:04 +08:00
|
|
|
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
|
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
def STBUX8: XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
"stbux $rS, $dst", IIC_LdStStoreUpd, []>,
|
2013-03-22 22:59:13 +08:00
|
|
|
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
|
2013-03-20 03:52:04 +08:00
|
|
|
PPC970_DGroup_Cracked;
|
2013-04-27 00:53:15 +08:00
|
|
|
def STHUX8: XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
"sthux $rS, $dst", IIC_LdStStoreUpd, []>,
|
2013-03-22 22:59:13 +08:00
|
|
|
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
|
2013-03-20 03:52:04 +08:00
|
|
|
PPC970_DGroup_Cracked;
|
2013-04-27 00:53:15 +08:00
|
|
|
def STWUX8: XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
|
2013-11-28 07:26:09 +08:00
|
|
|
"stwux $rS, $dst", IIC_LdStStoreUpd, []>,
|
2013-03-22 22:59:13 +08:00
|
|
|
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
|
2013-03-20 03:52:04 +08:00
|
|
|
PPC970_DGroup_Cracked;
|
2013-04-12 10:18:09 +08:00
|
|
|
} // Interpretation64Bit
|
|
|
|
|
2013-12-18 07:05:18 +08:00
|
|
|
// Store doubleword with update ("stdu"), DS-form with opcode 62 / XO 1.
// The extra output $ea_res is tied to the reg sub-operand of $dst through
// RegConstraint and is excluded from the encoding via NoEncode. No selection
// pattern is attached here (the pattern list is empty).
def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrix:$dst),
|
|
|
|
"stdu $rS, $dst", IIC_LdStSTDU, []>,
|
|
|
|
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">,
|
|
|
|
isPPC64;
|
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
def STDUX : XForm_8<31, 181, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
|
2013-12-01 04:41:13 +08:00
|
|
|
"stdux $rS, $dst", IIC_LdStSTDUX, []>,
|
2013-03-22 22:59:13 +08:00
|
|
|
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
|
2013-03-20 03:52:04 +08:00
|
|
|
PPC970_DGroup_Cracked, isPPC64;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Patterns to match the pre-inc stores. We can't put the patterns on
|
|
|
|
// the instruction definitions directly as ISel wants the address base
|
|
|
|
// and offset to be separate operands, not a single complex operand.
|
2013-03-26 03:04:58 +08:00
|
|
|
// Pre-inc truncating i8 store of an i64 value, base register plus immediate
// offset: select STBU8. The output lists the offset before the base register.
def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
|
|
|
|
(STBU8 $rS, iaddroff:$ptroff, $ptrreg)>;
|
|
|
|
// Pre-inc truncating i16 store of an i64 value, base register plus immediate
// offset: select STHU8. The output lists the offset before the base register.
def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
|
|
|
|
(STHU8 $rS, iaddroff:$ptroff, $ptrreg)>;
|
|
|
|
// Pre-inc truncating i32 store of an i64 value, base register plus immediate
// offset: select STWU8. The output lists the offset before the base register.
def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
|
|
|
|
(STWU8 $rS, iaddroff:$ptroff, $ptrreg)>;
|
|
|
|
// Pre-inc full i64 store, base register plus immediate offset: select STDU.
// NOTE(review): aligned4pre_store is defined outside this excerpt; presumably
// it restricts the match to offsets usable by the DS-form encoding - confirm.
def : Pat<(aligned4pre_store i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
|
|
|
|
(STDU $rS, iaddroff:$ptroff, $ptrreg)>;
|
|
|
|
|
|
|
|
// Pre-inc truncating i8 store of an i64 value with a register offset:
// select the indexed update form STBUX8 (base register, then offset register).
def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
|
|
|
|
(STBUX8 $rS, $ptrreg, $ptroff)>;
|
|
|
|
// Pre-inc truncating i16 store of an i64 value with a register offset:
// select the indexed update form STHUX8 (base register, then offset register).
def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
|
|
|
|
(STHUX8 $rS, $ptrreg, $ptroff)>;
|
|
|
|
// Pre-inc truncating i32 store of an i64 value with a register offset:
// select the indexed update form STWUX8 (base register, then offset register).
def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
|
|
|
|
(STWUX8 $rS, $ptrreg, $ptroff)>;
|
|
|
|
// Pre-inc full i64 store with a register offset: select the indexed update
// form STDUX (base register, then offset register).
def : Pat<(pre_store i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
|
|
|
|
(STDUX $rS, $ptrreg, $ptroff)>;
|
2006-06-17 04:22:01 +08:00
|
|
|
|
|
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Floating point instructions.
|
|
|
|
//
|
|
|
|
|
|
|
|
|
2014-11-26 08:46:26 +08:00
|
|
|
let PPC970_Unit = 3, hasSideEffects = 0,
|
2013-04-12 10:18:09 +08:00
|
|
|
Uses = [RM] in { // FPU Operations.
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FCFID : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fcfid", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (PPCfcfid f64:$frB))]>, isPPC64;
|
2013-09-26 12:11:24 +08:00
|
|
|
defm FCTID : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fctid", "$frD, $frB", IIC_FPGeneral,
|
2013-09-26 13:22:11 +08:00
|
|
|
[]>, isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fctidz", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64;
|
|
|
|
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FCFIDU : XForm_26r<63, 974, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fcfidu", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (PPCfcfidu f64:$frB))]>, isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FCFIDS : XForm_26r<59, 846, (outs f4rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fcfids", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$frD, (PPCfcfids f64:$frB))]>, isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FCFIDUS : XForm_26r<59, 974, (outs f4rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fcfidus", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f32:$frD, (PPCfcfidus f64:$frB))]>, isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FCTIDUZ : XForm_26r<63, 943, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fctiduz", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (PPCfctiduz f64:$frB))]>, isPPC64;
|
2013-04-27 00:53:15 +08:00
|
|
|
defm FCTIWUZ : XForm_26r<63, 143, (outs f8rc:$frD), (ins f8rc:$frB),
|
2013-11-28 07:26:09 +08:00
|
|
|
"fctiwuz", "$frD, $frB", IIC_FPGeneral,
|
2013-04-12 10:18:09 +08:00
|
|
|
[(set f64:$frD, (PPCfctiwuz f64:$frB))]>, isPPC64;
|
2006-06-17 04:22:01 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Instruction Patterns
|
|
|
|
//
|
Add some 64-bit logical ops.
Split imm16Shifted into a sext/zext form for 64-bit support.
Add some patterns for immediate formation. For example, we now compile this:
static unsigned long long Y;
void test3() {
Y = 0xF0F00F00;
}
into:
_test3:
li r2, 3840
lis r3, ha16(_Y)
xoris r2, r2, 61680
std r2, lo16(_Y)(r3)
blr
GCC produces:
_test3:
li r0,0
lis r2,ha16(_Y)
ori r0,r0,61680
sldi r0,r0,16
ori r0,r0,3840
std r0,lo16(_Y)(r2)
blr
llvm-svn: 28883
2006-06-21 06:34:10 +08:00
|
|
|
|
2006-06-17 04:22:01 +08:00
|
|
|
// Extensions and truncates to/from 32-bit regs.
|
2013-03-26 03:04:58 +08:00
|
|
|
// Zero-extend i32 to i64: insert $in as the sub_32 subregister of an
// otherwise undefined i64, then apply RLDICL with immediates 0 and 32.
// NOTE(review): presumably "rotate by 0, clear the upper 32 bits" - confirm
// against RLDICL's operand order, which is defined outside this excerpt.
def : Pat<(i64 (zext i32:$in)),
|
|
|
|
(RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32),
|
2012-06-10 06:10:19 +08:00
|
|
|
0, 32)>;
|
2013-03-26 03:04:58 +08:00
|
|
|
// Any-extend i32 to i64: only insert $in as the sub_32 subregister of an
// undefined (IMPLICIT_DEF) i64; this pattern applies no masking to the result.
def : Pat<(i64 (anyext i32:$in)),
|
|
|
|
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32)>;
|
|
|
|
// Truncate i64 to i32 by extracting the sub_32 subregister of $in.
def : Pat<(i32 (trunc i64:$in)),
|
|
|
|
(EXTRACT_SUBREG $in, sub_32)>;
|
2006-06-17 04:22:01 +08:00
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
llvm-svn: 202451
2014-02-28 08:27:01 +08:00
|
|
|
// Implement the 'not' operation with the NOR instruction.
|
|
|
|
// (we could use the default xori pattern, but nor has lower latency on some
|
|
|
|
// cores (such as the A2)).
|
|
|
|
// Output pattern fragment: expands to NOR8 with $in used as both source
// operands.
def i64not : OutPatFrag<(ops node:$in),
|
|
|
|
(NOR8 $in, $in)>;
|
|
|
|
// Select a 64-bit 'not' through the i64not output fragment.
def : Pat<(not i64:$in),
|
|
|
|
(i64not $in)>;
|
|
|
|
|
2006-07-14 12:42:02 +08:00
|
|
|
// Extending loads with i64 targets.
|
2006-10-10 04:57:25 +08:00
|
|
|
// Zero-extending i1 load from a register+immediate (iaddr) address: LBZ8.
def : Pat<(zextloadi1 iaddr:$src),
|
2006-07-14 12:42:02 +08:00
|
|
|
(LBZ8 iaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
// Zero-extending i1 load from a register+register (xaddr) address: LBZX8.
def : Pat<(zextloadi1 xaddr:$src),
|
2006-07-14 12:42:02 +08:00
|
|
|
(LBZX8 xaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
// Any-extending i1 load from a register+immediate (iaddr) address: LBZ8.
def : Pat<(extloadi1 iaddr:$src),
|
2006-07-14 12:42:02 +08:00
|
|
|
(LBZ8 iaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
// Any-extending i1 load from a register+register (xaddr) address: LBZX8.
def : Pat<(extloadi1 xaddr:$src),
|
2006-07-14 12:42:02 +08:00
|
|
|
(LBZX8 xaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
// Any-extending i8 load from a register+immediate (iaddr) address: LBZ8.
def : Pat<(extloadi8 iaddr:$src),
|
2006-07-14 12:42:02 +08:00
|
|
|
(LBZ8 iaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
// Any-extending i8 load from a register+register (xaddr) address: LBZX8.
def : Pat<(extloadi8 xaddr:$src),
|
2006-07-14 12:42:02 +08:00
|
|
|
(LBZX8 xaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
// Any-extending i16 load from a register+immediate (iaddr) address: LHZ8.
def : Pat<(extloadi16 iaddr:$src),
|
2006-07-14 12:42:02 +08:00
|
|
|
(LHZ8 iaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
// Any-extending i16 load from a register+register (xaddr) address: LHZX8.
def : Pat<(extloadi16 xaddr:$src),
|
2006-07-14 12:42:02 +08:00
|
|
|
(LHZX8 xaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
// Any-extending i32 load from a register+immediate (iaddr) address: LWZ8.
def : Pat<(extloadi32 iaddr:$src),
|
2006-07-14 12:42:02 +08:00
|
|
|
(LWZ8 iaddr:$src)>;
|
2006-10-10 04:57:25 +08:00
|
|
|
// Any-extending i32 load from a register+register (xaddr) address: LWZX8.
def : Pat<(extloadi32 xaddr:$src),
|
2006-07-14 12:42:02 +08:00
|
|
|
(LWZX8 xaddr:$src)>;
|
|
|
|
|
2008-03-08 04:18:24 +08:00
|
|
|
// Standard shifts. These are represented separately from the real shifts above
|
|
|
|
// so that we can distinguish between shifts that allow 6-bit and 7-bit shift
|
|
|
|
// amounts.
|
2013-03-26 03:04:58 +08:00
|
|
|
// Variable arithmetic right shift of an i64 by an i32 amount: select SRAD.
def : Pat<(sra i64:$rS, i32:$rB),
|
|
|
|
(SRAD $rS, $rB)>;
|
|
|
|
// Variable logical right shift of an i64 by an i32 amount: select SRD.
def : Pat<(srl i64:$rS, i32:$rB),
|
|
|
|
(SRD $rS, $rB)>;
|
|
|
|
// Variable left shift of an i64 by an i32 amount: select SLD.
def : Pat<(shl i64:$rS, i32:$rB),
|
|
|
|
(SLD $rS, $rB)>;
|
2008-03-08 04:18:24 +08:00
|
|
|
|
2006-06-17 04:22:01 +08:00
|
|
|
// SHL/SRL
|
2013-03-26 03:04:58 +08:00
|
|
|
// Left shift of an i64 by a constant: select RLDICR, passing the shift amount
// and a second immediate derived from it by the SHL64 transform.
// NOTE(review): SHL64 is defined outside this excerpt; presumably it yields
// the mask end (63 - imm) - confirm.
def : Pat<(shl i64:$in, (i32 imm:$imm)),
|
|
|
|
(RLDICR $in, imm:$imm, (SHL64 imm:$imm))>;
|
|
|
|
// Logical right shift of an i64 by a constant: select RLDICL, passing an
// immediate derived by the SRL64 transform followed by the shift amount.
// NOTE(review): SRL64 is defined outside this excerpt; presumably it yields
// the rotate count (64 - imm) - confirm.
def : Pat<(srl i64:$in, (i32 imm:$imm)),
|
|
|
|
(RLDICL $in, (SRL64 imm:$imm), imm:$imm)>;
|
2006-06-21 05:23:06 +08:00
|
|
|
|
2007-09-05 04:20:29 +08:00
|
|
|
// ROTL
|
2013-03-26 03:04:58 +08:00
|
|
|
// Rotate-left of an i64 by a register amount: select RLDCL with a final
// immediate of 0.
// NOTE(review): presumably mask-begin 0, i.e. no bits cleared - confirm.
def : Pat<(rotl i64:$in, i32:$sh),
|
|
|
|
(RLDCL $in, $sh, 0)>;
|
|
|
|
// Rotate-left of an i64 by a constant: select RLDICL with the rotate amount
// and a final immediate of 0.
// NOTE(review): presumably mask-begin 0, i.e. no bits cleared - confirm.
def : Pat<(rotl i64:$in, (i32 imm:$imm)),
|
|
|
|
(RLDICL $in, imm:$imm, 0)>;
|
2007-09-05 04:20:29 +08:00
|
|
|
|
2006-06-21 05:23:06 +08:00
|
|
|
// Hi and Lo for Darwin Global Addresses.
|
|
|
|
// Hi part of a global address with a constant-0 second operand: select LIS8.
def : Pat<(PPChi tglobaladdr:$in, 0), (LIS8 tglobaladdr:$in)>;
|
|
|
|
// Lo part of a global address with a constant-0 second operand: select LI8.
def : Pat<(PPClo tglobaladdr:$in, 0), (LI8 tglobaladdr:$in)>;
|
|
|
|
// Hi part of a constant-pool address with a constant-0 second operand: LIS8.
def : Pat<(PPChi tconstpool:$in , 0), (LIS8 tconstpool:$in)>;
|
|
|
|
// Lo part of a constant-pool address with a constant-0 second operand: LI8.
def : Pat<(PPClo tconstpool:$in , 0), (LI8 tconstpool:$in)>;
|
|
|
|
// Hi part of a jump-table address with a constant-0 second operand: LIS8.
def : Pat<(PPChi tjumptable:$in , 0), (LIS8 tjumptable:$in)>;
|
|
|
|
// Lo part of a jump-table address with a constant-0 second operand: LI8.
def : Pat<(PPClo tjumptable:$in , 0), (LI8 tjumptable:$in)>;
|
2009-11-05 05:31:18 +08:00
|
|
|
// Hi part of a block address with a constant-0 second operand: select LIS8.
def : Pat<(PPChi tblockaddress:$in, 0), (LIS8 tblockaddress:$in)>;
|
|
|
|
// Lo part of a block address with a constant-0 second operand: select LI8.
def : Pat<(PPClo tblockaddress:$in, 0), (LI8 tblockaddress:$in)>;
|
2013-03-26 03:04:58 +08:00
|
|
|
// Hi part of a TLS global address combined with register $in: select ADDIS8.
// The register operand comes first in the output, the symbol second.
def : Pat<(PPChi tglobaltlsaddr:$g, i64:$in),
|
|
|
|
(ADDIS8 $in, tglobaltlsaddr:$g)>;
|
|
|
|
// Lo part of a TLS global address combined with register $in: select ADDI8.
// The register operand comes first in the output, the symbol second.
def : Pat<(PPClo tglobaltlsaddr:$g, i64:$in),
|
2013-03-26 18:55:20 +08:00
|
|
|
(ADDI8 $in, tglobaltlsaddr:$g)>;
|
2013-03-26 03:04:58 +08:00
|
|
|
// Fold an add of $in and the Hi part of a global address into one ADDIS8.
def : Pat<(add i64:$in, (PPChi tglobaladdr:$g, 0)),
|
|
|
|
(ADDIS8 $in, tglobaladdr:$g)>;
|
|
|
|
// Fold an add of $in and the Hi part of a constant-pool address into one
// ADDIS8.
def : Pat<(add i64:$in, (PPChi tconstpool:$g, 0)),
|
|
|
|
(ADDIS8 $in, tconstpool:$g)>;
|
|
|
|
// Fold an add of $in and the Hi part of a jump-table address into one ADDIS8.
def : Pat<(add i64:$in, (PPChi tjumptable:$g, 0)),
|
|
|
|
(ADDIS8 $in, tjumptable:$g)>;
|
|
|
|
// Fold an add of $in and the Hi part of a block address into one ADDIS8.
def : Pat<(add i64:$in, (PPChi tblockaddress:$g, 0)),
|
|
|
|
(ADDIS8 $in, tblockaddress:$g)>;
|
2013-03-19 07:00:58 +08:00
|
|
|
|
|
|
|
// Patterns to match r+r indexed loads and stores for
|
|
|
|
// addresses without at least 4-byte alignment.
|
|
|
|
// Sign-extending 32-bit load to i64 matched by unaligned4sextloadi32 on an
// xoaddr address: select the indexed form LWAX.
def : Pat<(i64 (unaligned4sextloadi32 xoaddr:$src)),
|
|
|
|
(LWAX xoaddr:$src)>;
|
|
|
|
// i64 load matched by unaligned4load on an xoaddr address: select the
// indexed form LDX.
def : Pat<(i64 (unaligned4load xoaddr:$src)),
|
|
|
|
(LDX xoaddr:$src)>;
|
2013-03-26 03:04:58 +08:00
|
|
|
// i64 store matched by unaligned4store on an xoaddr address: select the
// indexed form STDX.
def : Pat<(unaligned4store i64:$rS, xoaddr:$dst),
|
|
|
|
(STDX $rS, xoaddr:$dst)>;
|
2013-03-19 07:00:58 +08:00
|
|
|
|
[Power] Improve the expansion of atomic loads/stores
Summary:
Atomic loads and stores of up to the native size (32 bits, or 64 for PPC64)
can be lowered to a simple load or store instruction (as the synchronization
is already handled by AtomicExpand, and the atomicity is guaranteed thanks to
the alignment requirements of atomic accesses). This is exactly what this patch
does. Previously, these were implemented by complex
load-linked/store-conditional loops, an obvious performance problem.
For example, this patch turns
```
define void @store_i8_unordered(i8* %mem) {
store atomic i8 42, i8* %mem unordered, align 1
ret void
}
```
from
```
_store_i8_unordered: ; @store_i8_unordered
; BB#0:
rlwinm r2, r3, 3, 27, 28
li r4, 42
xori r5, r2, 24
rlwinm r2, r3, 0, 0, 29
li r3, 255
slw r4, r4, r5
slw r3, r3, r5
and r4, r4, r3
LBB4_1: ; =>This Inner Loop Header: Depth=1
lwarx r5, 0, r2
andc r5, r5, r3
or r5, r4, r5
stwcx. r5, 0, r2
bne cr0, LBB4_1
; BB#2:
blr
```
into
```
_store_i8_unordered: ; @store_i8_unordered
; BB#0:
li r2, 42
stb r2, 0(r3)
blr
```
which looks like a pretty clear win to me.
Test Plan:
fixed the tests + new test for indexed accesses + make check-all
Reviewers: jfb, wschmidt, hfinkel
Subscribers: llvm-commits
Differential Revision: http://reviews.llvm.org/D5587
llvm-svn: 218922
2014-10-03 06:27:07 +08:00
|
|
|
// 64-bit atomic loads and stores
|
|
|
|
// 64-bit atomic load from an ixaddr address: select a plain LD.
// NOTE(review): assumes any required ordering is handled before selection
// (the introducing change mentions AtomicExpand) - confirm.
def : Pat<(atomic_load_64 ixaddr:$src), (LD memrix:$src)>;
|
|
|
|
// 64-bit atomic load from an xaddr (register+register) address: select LDX.
// NOTE(review): assumes any required ordering is handled before selection
// (the introducing change mentions AtomicExpand) - confirm.
def : Pat<(atomic_load_64 xaddr:$src), (LDX memrr:$src)>;
|
|
|
|
|
|
|
|
// 64-bit atomic store to an ixaddr address: select a plain STD. The input
// pattern lists (pointer, value); the output lists (value, pointer).
// NOTE(review): assumes any required ordering is handled before selection
// (the introducing change mentions AtomicExpand) - confirm.
def : Pat<(atomic_store_64 ixaddr:$ptr, i64:$val), (STD g8rc:$val, memrix:$ptr)>;
|
|
|
|
// 64-bit atomic store to an xaddr (register+register) address: select STDX.
// The input pattern lists (pointer, value); the output lists (value, pointer).
// NOTE(review): assumes any required ordering is handled before selection
// (the introducing change mentions AtomicExpand) - confirm.
def : Pat<(atomic_store_64 xaddr:$ptr, i64:$val), (STDX g8rc:$val, memrr:$ptr)>;
|