2005-10-24 03:52:42 +08:00
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
// Random ideas for the X86 backend.
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Add MUL2U and MUL2S nodes to represent a multiply that returns both the
|
|
|
|
Hi and Lo parts (combination of MUL and MULH[SU] into one node). Add this to
|
|
|
|
X86, & make the dag combiner produce it when needed. This will eliminate one
|
|
|
|
imul from the code generated for:
|
|
|
|
|
|
|
|
long long test(long long X, long long Y) { return X*Y; }
|
|
|
|
|
|
|
|
by using the EAX result from the mul. We should add a similar node for
|
|
|
|
DIVREM.
|
|
|
|
|
2005-12-02 08:11:20 +08:00
|
|
|
another case is:
|
|
|
|
|
|
|
|
long long test(int X, int Y) { return (long long)X*Y; }
|
|
|
|
|
|
|
|
... which should only be one imul instruction.
|
|
|
|
|
2005-10-24 03:52:42 +08:00
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
This should be one DIV/IDIV instruction, not a libcall:
|
|
|
|
|
|
|
|
unsigned test(unsigned long long X, unsigned Y) {
|
|
|
|
return X/Y;
|
|
|
|
}
|
|
|
|
|
|
|
|
This can be done trivially with a custom legalizer. What about overflow
|
|
|
|
though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
|
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Some targets (e.g. athlons) prefer freep to fstp ST(0):
|
|
|
|
http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
|
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
2006-01-13 06:54:21 +08:00
|
|
|
This should use fiadd on chips where it is profitable:
|
2005-10-24 03:52:42 +08:00
|
|
|
double foo(double P, int *I) { return P+*I; }
|
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
The FP stackifier needs to be global. Also, it should handle simple permutations
|
|
|
|
to reduce number of shuffle instructions, e.g. turning:
|
|
|
|
|
|
|
|
fld P -> fld Q
|
|
|
|
fld Q fld P
|
|
|
|
fxch
|
|
|
|
|
|
|
|
or:
|
|
|
|
|
|
|
|
fxch -> fucomi
|
|
|
|
fucomi jl X
|
|
|
|
jg X
|
|
|
|
|
2006-01-17 01:53:00 +08:00
|
|
|
Ideas:
|
|
|
|
http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
|
|
|
|
|
|
|
|
|
2005-10-24 03:52:42 +08:00
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Improvements to the multiply -> shift/add algorithm:
|
|
|
|
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
|
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Improve code like this (occurs fairly frequently, e.g. in LLVM):
|
|
|
|
long long foo(int x) { return 1LL << x; }
|
|
|
|
|
|
|
|
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
|
|
|
|
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
|
|
|
|
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
|
|
|
|
|
|
|
|
Another useful one would be ~0ULL >> X and ~0ULL << X.
|
|
|
|
|
2005-10-24 05:44:59 +08:00
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Should support emission of the bswap instruction, probably by adding a new
|
|
|
|
DAG node for byte swapping. Also useful on PPC which has byte-swapping loads.
|
|
|
|
|
2005-11-28 12:52:39 +08:00
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Compile this:
|
|
|
|
_Bool f(_Bool a) { return a!=1; }
|
|
|
|
|
|
|
|
into:
|
|
|
|
movzbl %dil, %eax
|
|
|
|
xorl $1, %eax
|
|
|
|
ret
|
2005-12-17 09:25:19 +08:00
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Some isel ideas:
|
|
|
|
|
|
|
|
1. Dynamic programming based approach when compile time is not an
|
|
|
|
issue.
|
|
|
|
2. Code duplication (addressing mode) during isel.
|
|
|
|
3. Other ideas from "Register-Sensitive Selection, Duplication, and
|
|
|
|
Sequencing of Instructions".
|
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Should we promote i16 to i32 to avoid partial register update stalls?
|
2005-12-17 14:54:43 +08:00
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Leave any_extend as pseudo instruction and hint to register
|
|
|
|
allocator. Delay codegen until post register allocation.
|
2006-01-13 06:54:21 +08:00
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Add a target specific hook to DAG combiner to handle SINT_TO_FP and
|
|
|
|
FP_TO_SINT when the source operand is already in memory.
|
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Check if load folding would add a cycle in the dag.
|
2006-01-13 09:20:42 +08:00
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Model X86 EFLAGS as a real register to avoid redundant cmp / test. e.g.
|
|
|
|
|
|
|
|
cmpl $1, %eax
|
|
|
|
setg %al
|
|
|
|
testb %al, %al # unnecessary
|
|
|
|
jne .BB7
|
2006-01-17 01:53:00 +08:00
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Count leading zeros and count trailing zeros:
|
|
|
|
|
|
|
|
int clz(int X) { return __builtin_clz(X); }
|
|
|
|
int ctz(int X) { return __builtin_ctz(X); }
|
|
|
|
|
|
|
|
$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
|
|
|
|
clz:
|
|
|
|
bsr %eax, DWORD PTR [%esp+4]
|
|
|
|
xor %eax, 31
|
|
|
|
ret
|
|
|
|
ctz:
|
|
|
|
bsf %eax, DWORD PTR [%esp+4]
|
|
|
|
ret
|
|
|
|
|
|
|
|
however, check that these are defined for 0 and 32. Our intrinsics are, GCC's
|
|
|
|
aren't.
|
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Use push/pop instructions in prolog/epilog sequences instead of stores off
|
|
|
|
ESP (certain code size win, perf win on some [which?] processors).
|
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Only use inc/neg/not instructions on processors where they are faster than
|
|
|
|
add/sub/xor. They are slower on the P4 due to only updating some processor
|
|
|
|
flags.
|
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Open code rint,floor,ceil,trunc:
|
|
|
|
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
|
|
|
|
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
|
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
|
|
|
|
|
2006-01-28 06:11:01 +08:00
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
2006-01-29 14:48:25 +08:00
|
|
|
For all targets, not just X86:
|
|
|
|
When llvm.memcpy, llvm.memset, or llvm.memmove are lowered, they should be
|
|
|
|
optimized to a few store instructions if the source is constant and the length
|
|
|
|
is smallish (< 8). This will greatly help some tests like Shootout/strcat.c
|
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
2006-01-28 06:11:01 +08:00
|
|
|
Solve this DAG isel folding deficiency:
|
|
|
|
|
|
|
|
int X, Y;
|
|
|
|
|
|
|
|
void fn1(void)
|
|
|
|
{
|
|
|
|
X = X | (Y << 3);
|
|
|
|
}
|
|
|
|
|
|
|
|
compiles to
|
|
|
|
|
|
|
|
fn1:
|
|
|
|
movl Y, %eax
|
|
|
|
shll $3, %eax
|
|
|
|
orl X, %eax
|
|
|
|
movl %eax, X
|
|
|
|
ret
|
|
|
|
|
|
|
|
The problem is the store's chain operand is not the load X but rather
|
2006-01-28 06:54:32 +08:00
|
|
|
a TokenFactor of the load X and load Y, which prevents the folding.
|
|
|
|
|
|
|
|
There are two ways to fix this:
|
|
|
|
|
|
|
|
1. The dag combiner can start using alias analysis to realize that y/x
|
|
|
|
don't alias, making the store to X not dependent on the load from Y.
|
|
|
|
2. The generated isel could be made smarter in the case it can't
|
|
|
|
disambiguate the pointers.
|
|
|
|
|
|
|
|
Number 1 is the preferred solution.
|
2006-01-29 17:08:15 +08:00
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
The instruction selector sometimes misses folding a load into a compare. The
|
|
|
|
pattern is written as (cmp reg, (load p)). Because the compare isn't
|
|
|
|
commutative, it is not matched with the load on both sides. The dag combiner
|
|
|
|
should be made smart enough to canonicalize the load into the RHS of a compare
|
|
|
|
when it can invert the result of the compare for free.
|
|
|
|
|
2006-01-29 17:14:47 +08:00
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
2006-01-29 17:42:20 +08:00
|
|
|
None of the SSE instructions are handled in X86RegisterInfo::foldMemoryOperand,
|
|
|
|
which prevents the spiller from folding spill code into the instructions.
|
|
|
|
|
|
|
|
This leads to code like this:
|
|
|
|
|
|
|
|
mov %eax, 8(%esp)
|
|
|
|
cvtsi2sd %eax, %xmm0
|
|
|
|
instead of:
|
|
|
|
cvtsi2sd 8(%esp), %xmm0
|
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
The instruction selector selects 'int X = 0' as 'mov Reg, 0', not 'xor Reg,Reg'.
|
|
|
|
This is bigger and slower.
|
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
LSR should be turned on for the X86 backend and tuned to take advantage of its
|
|
|
|
addressing modes.
|
|
|
|
|
2006-01-29 17:46:06 +08:00
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
|
|
|
|
other fast SSE modes.
|
2006-01-31 08:20:38 +08:00
|
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
2006-01-31 08:45:37 +08:00
|
|
|
Think about doing i64 math in SSE regs.
|
|
|
|
|
2006-01-31 10:10:06 +08:00
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
The DAG Isel doesn't fold the loads into the adds in this testcase. The
|
|
|
|
pattern selector does. This is because the chain value of the load gets
|
|
|
|
selected first, and the loads aren't checking to see if they are only used by
|
|
|
|
an add.
|
|
|
|
|
|
|
|
.ll:
|
|
|
|
|
|
|
|
int %test(int* %x, int* %y, int* %z) {
|
|
|
|
%X = load int* %x
|
|
|
|
%Y = load int* %y
|
|
|
|
%Z = load int* %z
|
|
|
|
%a = add int %X, %Y
|
|
|
|
%b = add int %a, %Z
|
|
|
|
ret int %b
|
|
|
|
}
|
|
|
|
|
|
|
|
dag isel:
|
|
|
|
|
|
|
|
_test:
|
|
|
|
movl 4(%esp), %eax
|
|
|
|
movl (%eax), %eax
|
|
|
|
movl 8(%esp), %ecx
|
|
|
|
movl (%ecx), %ecx
|
|
|
|
addl %ecx, %eax
|
|
|
|
movl 12(%esp), %ecx
|
|
|
|
movl (%ecx), %ecx
|
|
|
|
addl %ecx, %eax
|
|
|
|
ret
|
|
|
|
|
|
|
|
pattern isel:
|
|
|
|
|
|
|
|
_test:
|
|
|
|
movl 12(%esp), %ecx
|
|
|
|
movl 4(%esp), %edx
|
|
|
|
movl 8(%esp), %eax
|
|
|
|
movl (%eax), %eax
|
|
|
|
addl (%edx), %eax
|
|
|
|
addl (%ecx), %eax
|
|
|
|
ret
|
|
|
|
|
|
|
|
This is bad for register pressure, though the dag isel is producing a
|
|
|
|
better schedule. :)
|