llvm-project/llvm/test/CodeGen/X86/fast-isel-x86-64.ll

; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
; RUN: llc < %s -mattr=+avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"

; Make sure that fast-isel folds the immediate into the binop even though it
; is non-canonical: the constant 8 is written as the left-hand operand of the
; 'and'. The expected output contains an andl whose immediate operand is $8.
define i32 @test1(i32 %i) nounwind ssp {
  ; Constant on the left, variable on the right (the non-canonical order).
  %and = and i32 8, %i
  ret i32 %and
}

; CHECK: test1:
; CHECK: andl	$8, 


; rdar://9289512 - The load should fold into the compare.
; The argument is stored to a stack slot and reloaded; the reloaded value is
; used only by the icmp. The expected output stores %rdi to -8(%rsp) and then
; compares $42 directly against that same stack slot.
define void @test2(i64 %x) nounwind ssp {
entry:
  %x.addr = alloca i64, align 8
  store i64 %x, i64* %x.addr, align 8
  ; The only use of this load is the icmp below, which is what gets folded.
  %tmp = load i64* %x.addr, align 8
  %cmp = icmp sgt i64 %tmp, 42
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  ret void
; CHECK: test2:
; CHECK: movq	%rdi, -8(%rsp)
; CHECK: cmpq	$42, -8(%rsp)
}


; Converting the address of an external global to an integer should need only
; the GOT load: the result lands in %rax and ret must be the very next
; instruction after it.
@G = external global i32
define i64 @test3() nounwind {
  %A = ptrtoint i32* @G to i64
  ret i64 %A
; CHECK: test3:
; CHECK: movq _G@GOTPCREL(%rip), %rax
; CHECK-NEXT: ret
}


; rdar://9289558
; Loads one byte from an external global array at a variable index and
; zero-extends it to i32. The expected output fetches the array address from
; the GOT, then folds the index into the addressing mode of the zero-extending
; load; the load must directly follow the GOT access and ret must directly
; follow the load.
@rtx_length = external global [153 x i8]

define i32 @test4(i64 %idxprom9) nounwind {
  %arrayidx10 = getelementptr inbounds [153 x i8]* @rtx_length, i32 0, i64 %idxprom9
  %tmp11 = load i8* %arrayidx10, align 1
  %conv = zext i8 %tmp11 to i32
  ret i32 %conv

; CHECK: test4:
; CHECK: movq	_rtx_length@GOTPCREL(%rip), %rax
; CHECK-NEXT: movzbl	(%rax,%rdi), %eax
; CHECK-NEXT: ret
}


; PR3242 - Out of range shifts should not be folded by fastisel.
; The shift amount (50000) is far larger than the 32-bit operand width, so it
; is expected to be materialized into %ecx and applied as a variable shift by
; %cl rather than encoded as a shift immediate.
define void @test5(i32 %x, i32* %p) nounwind {
  %y = ashr i32 %x, 50000
  store i32 %y, i32* %p
  ret void

; CHECK: test5:
; CHECK: movl	$50000, %ecx
; CHECK: sarl	%cl, %edi
; CHECK: ret
}

; rdar://9289501 - fast isel should fold trivial multiplies to shifts.
; 64-bit case: multiplying by 8 is expected to become a left shift by 3
; applied to the incoming argument register %rdi.
define i64 @test6(i64 %x) nounwind ssp {
entry:
  %mul = mul nsw i64 %x, 8
  ret i64 %mul

; CHECK: test6:
; CHECK: shlq	$3, %rdi
}

; 32-bit variant of the multiply-to-shift test: multiplying an i32 by 8 is
; expected to become a left shift by 3 applied to %edi.
define i32 @test7(i32 %x) nounwind ssp {
entry:
  %mul = mul nsw i32 %x, 8
  ret i32 %mul
; CHECK: test7:
; CHECK: shll	$3, %edi
}


; rdar://9289507 - folding of immediates into 64-bit operations.
; Adding the small constant 7 to an i64 is expected to use the immediate form
; of addq on %rdi instead of first materializing the constant in a register.
define i64 @test8(i64 %x) nounwind ssp {
entry:
  %add = add nsw i64 %x, 7
  ret i64 %add

; CHECK: test8:
; CHECK: addq	$7, %rdi
}

; Multiply counterpart of test8: multiplying an i64 by the constant 7 is
; expected to use the three-operand immediate form of imulq, reading %rdi and
; writing the result to %rax.
define i64 @test9(i64 %x) nounwind ssp {
entry:
  ; The result is named %mul to match the operation (and test6/test7); value
  ; names are not part of the generated assembly, so the checks are unaffected.
  %mul = mul nsw i64 %x, 7
  ret i64 %mul
; CHECK: test9:
; CHECK: imulq	$7, %rdi, %rax
}

; rdar://9297011 - Don't reject udiv by a power of 2.
; An unsigned divide by 8 is expected to be selected as a logical right shift
; by 3.
define i32 @test10(i32 %X) nounwind {
  %Y = udiv i32 %X, 8
  ret i32 %Y
; CHECK: test10:
; CHECK: shrl	$3, 
}

; Signed counterpart of test10: a signed divide by 8 carrying the 'exact' flag
; is expected to be selected as an arithmetic right shift by 3.
define i32 @test11(i32 %X) nounwind {
  %Y = sdiv exact i32 %X, 8
  ret i32 %Y
; CHECK: test11:
; CHECK: sarl	$3, 
}


; rdar://9297006 - Trunc to bool.
; An i8 truncated to i1 is used directly as a branch condition. The expected
; sequence tests bit 0 with testb $1, branches with je to a local (L-prefixed)
; label, and then, on consecutive lines, materializes the i8 0 call argument
; as a 32-bit immediate move into %edi and issues the callq.
define void @test12(i8 %tmp) nounwind ssp noredzone {
entry:
  %tobool = trunc i8 %tmp to i1
  br i1 %tobool, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  ; Self-call with a constant i8 argument; only its argument setup is checked.
  call void @test12(i8 0) noredzone
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  ret void
; CHECK: test12:
; CHECK: testb	$1,
; CHECK-NEXT: je L
; CHECK-NEXT: movl $0, %edi
; CHECK-NEXT: callq
}

declare void @test13f(i1 %X)

; Passing a constant i1 argument: the value 0 is expected to be materialized
; as a 32-bit immediate move into %edi, with the callq as the very next
; instruction.
define void @test13() nounwind {
  call void @test13f(i1 0)
  ret void
; CHECK: test13:
; CHECK: movl $0, %edi
; CHECK-NEXT: callq
}


; rdar://9297003 - fast isel bails out on all functions taking bools
; A non-constant i1 produced by a trunc from i8 is passed as a zeroext
; argument. The expected output masks the value with andb $1 before the callq.
define void @test14(i8 %tmp) nounwind ssp noredzone {
entry:
  %tobool = trunc i8 %tmp to i1
  call void @test13f(i1 zeroext %tobool) noredzone
  ret void
; CHECK: test14:
; CHECK: andb	$1, 
; CHECK: callq
}

declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)

; rdar://9289488 - fast-isel shouldn't bail out on llvm.memcpy
; A fixed 4-byte, 4-byte-aligned, non-volatile memcpy is expected to be
; expanded inline: one 32-bit load from the source pointer (%rsi), one 32-bit
; store to the destination pointer (%rdi), then ret, on consecutive lines
; starting right after the function label.
define void @test15(i8* %a, i8* %b) nounwind {
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 4, i32 4, i1 false)
  ret void
; CHECK: test15:
; CHECK-NEXT: movl	(%rsi), %eax
; CHECK-NEXT: movl	%eax, (%rdi)
; CHECK-NEXT: ret
}

; Handling for varargs calls
; Two variadic calls are made: one passing an i32 and one passing a double.
; %al is expected to be 0 before the first call and 1 before the second,
; i.e. the number of vector registers each variadic call uses. Both runs
; expect a movabsq $1 in the second block; the default run then expects a
; cvtsi2sdq into %xmm0, while the +avx run (the checks using the AVX prefix)
; expects a vmovsd load into %xmm0 from an LCP-prefixed label.
declare void @test16callee(...) nounwind
define void @test16() nounwind {
; CHECK: test16:
; CHECK: movl $1, %edi
; CHECK: movb $0, %al
; CHECK: callq _test16callee
  call void (...)* @test16callee(i32 1)
  br label %block2

block2:
; CHECK: movabsq $1
; CHECK: cvtsi2sdq {{.*}} %xmm0
; CHECK: movb $1, %al
; CHECK: callq _test16callee

; AVX: movabsq $1
; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0
; AVX: movb $1, %al
; AVX: callq _test16callee
  call void (...)* @test16callee(double 1.000000e+00)
  ret void
}


declare void @foo() unnamed_addr ssp align 2

; Verify that we don't fold the load into the compare here.  That would move it
; w.r.t. the call.
; The load and the icmp both precede the call to @foo in the IR, while the
; branch that consumes the compare follows it. The expected output keeps the
; load from (%rdi) before the callq and performs the register compare against
; $5 afterwards, immediately followed by je.
define i32 @test17(i32 *%P) ssp nounwind {
entry:
  %tmp = load i32* %P
  %cmp = icmp ne i32 %tmp, 5
  ; The call sits between the load/compare and the branch that uses them.
  call void @foo()
  br i1 %cmp, label %if.then, label %if.else

if.then:                                          ; preds = %entry
  ret i32 1

if.else:                                          ; preds = %entry
  ret i32 2
; CHECK: test17:
; CHECK: movl	(%rdi), %eax
; CHECK: callq _foo
; CHECK: cmpl	$5, %eax
; CHECK-NEXT: je 
}

; Check that 0.0 is materialized using xorps
; Single-precision case: storing the float constant 0.0 should produce an
; xorps somewhere in the function body.
define void @test18(float* %p1) {
  store float 0.0, float* %p1
  ret void
; CHECK: test18:
; CHECK: xorps
}

; Without any type hints, doubles use the smaller xorps instead of xorpd.
; Double-precision counterpart of test18: storing the double constant 0.0 is
; also expected to produce xorps.
define void @test19(double* %p1) {
  store double 0.0, double* %p1
  ret void
; CHECK: test19:
; CHECK: xorps
}

; Check that we fast-isel sret
; A local %struct.a (three i64 fields) is allocated and its address is passed
; as the sret argument. The expected output forms that address with
; leaq (%rsp), %rdi before the callq to _test20sret.
%struct.a = type { i64, i64, i64 }
define void @test20() nounwind ssp {
entry:
  %tmp = alloca %struct.a, align 8
  call void @test20sret(%struct.a* sret %tmp)
  ret void
; CHECK: test20:
; CHECK: leaq (%rsp), %rdi
; CHECK: callq _test20sret
}
declare void @test20sret(%struct.a* sret)

; Check that -0.0 is not materialized using xor
; Negative zero is not an all-zero bit pattern, so no xor may appear between
; the function label and the movsd that loads the value from an LCPI-prefixed
; label.
define void @test21(double* %p1) {
  store double -0.0, double* %p1
  ret void
; CHECK: test21:
; CHECK-NOT: xor
; CHECK: movsd	LCPI
}

; Check that immediate arguments to a function
; do not cause massive spilling and are used
; as immediates just before the call.
; Four consecutive calls pass the constants 0 through 3; for each one the
; expected output has the movl of that constant into %edi followed by its own
; callq, in the same order as the calls appear in the IR.
define void @test22() nounwind {
entry:
  call void @foo22(i32 0)
  call void @foo22(i32 1)
  call void @foo22(i32 2)
  call void @foo22(i32 3)
  ret void
; CHECK: test22:
; CHECK: movl	$0, %edi
; CHECK: callq	_foo22
; CHECK: movl	$1, %edi
; CHECK: callq	_foo22
; CHECK: movl	$2, %edi
; CHECK: callq	_foo22
; CHECK: movl	$3, %edi
; CHECK: callq	_foo22
}

declare void @foo22(i32)

; PR13563
; A function taking an sret pointer makes an unrelated call and returns. The
; expected output copies the incoming sret pointer (%rdi) into %rax after the
; call and before ret.
define void @test23(i8* noalias sret %result) {
  ; Neither %a nor %b is used afterwards; only the call and return sequence
  ; is checked.
  %a = alloca i8
  %b = call i8* @foo23()
  ret void
; CHECK: test23:
; CHECK: call
; CHECK: movq  %rdi, %rax
; CHECK: ret
}

declare i8* @foo23()
Pad Short Functions for Intel Atom The current Intel Atom microarchitecture has a feature whereby when a function returns early then it is slightly faster to execute a sequence of NOP instructions to wait until the return address is ready, as opposed to simply stalling on the ret instruction until the return address is ready. When compiling for X86 Atom only, this patch will run a pass, called "X86PadShortFunction" which will add NOP instructions where less than four cycles elapse between function entry and return. It includes tests. This patch has been updated to address Nadav's review comments - Optimize only at >= O1 and don't do optimization if -Os is set - Stores MachineBasicBlock* instead of BBNum - Uses DenseMap instead of std::map - Fixes placement of braces Patch by Andy Zhang. llvm-svn: 171879 2013-01-09 02:27:24 +08:00			`; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s`
			`; RUN: llc < %s -mattr=+avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX`
fix rdar://9289583 - fast isel should handle non-canonical commutative binops allowing us to fold the immediate into the 'and' in this case: int test1(int i) { return 8&i; } llvm-svn: 129653 2011-04-17 09:16:47 +08:00
			`target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"`
			`target triple = "x86_64-apple-darwin10.0.0"`

			`; Make sure that fast-isel folds the immediate into the binop even though it`
			`; is non-canonical.`
			`define i32 @test1(i32 %i) nounwind ssp {`
			`%and = and i32 8, %i`
			`ret i32 %and`
			`}`

			`; CHECK: test1:`
			`; CHECK: andl $8,`
Fix rdar://9289512 - not folding load into compare at -O0 The basic issue here is that bottom-up isel is matching the branch and compare, and was failing to fold the load into the branch/compare combo. Fixing this (by allowing folding into any instruction of a sequence that is selected) allows us to produce things like: cmpb $0, 52(%rax) je LBB4_2 instead of: movb 52(%rax), %cl cmpb $0, %cl je LBB4_2 This makes the generated -O0 code run a bit faster, but also speeds up compile time by putting less pressure on the register allocator and generating less code. This was one of the biggest classes of missing load folding. Implementing this shrinks 176.gcc's c-decl.s (as a random example) by about 4% in (verbose-asm) line count. llvm-svn: 129656 2011-04-17 14:35:44 +08:00

Recommit the fix for rdar://9289512 with a couple tweaks to fix bugs exposed by the gcc dejagnu testsuite: 1. The load may actually be used by a dead instruction, which would cause an assert. 2. The load may not be used by the current chain of instructions, and we could move it past a side-effecting instruction. Change how we process uses to define the problem away. llvm-svn: 130018 2011-04-23 05:59:37 +08:00			`; rdar://9289512 - The load should fold into the compare.`
			`define void @test2(i64 %x) nounwind ssp {`
			`entry:`
			`%x.addr = alloca i64, align 8`
			`store i64 %x, i64* %x.addr, align 8`
			`%tmp = load i64* %x.addr, align 8`
			`%cmp = icmp sgt i64 %tmp, 42`
			`br i1 %cmp, label %if.then, label %if.end`

			`if.then: ; preds = %entry`
			`br label %if.end`

			`if.end: ; preds = %if.then, %entry`
			`ret void`
			`; CHECK: test2:`
			`; CHECK: movq %rdi, -8(%rsp)`
			`; CHECK: cmpq $42, -8(%rsp)`
			`}`




fix an oversight which caused us to compile the testcase (and other less trivial things) into a dummy lea. Before we generated: _test: ## @test movq _G@GOTPCREL(%rip), %rax leaq (%rax), %rax ret now we produce: _test: ## @test movq _G@GOTPCREL(%rip), %rax ret This is part of rdar://9289558 llvm-svn: 129662 2011-04-18 01:12:08 +08:00			`@G = external global i32`
			`define i64 @test3() nounwind {`
			`%A = ptrtoint i32* @G to i64`
			`ret i64 %A`
			`; CHECK: test3:`
			`; CHECK: movq _G@GOTPCREL(%rip), %rax`
			`; CHECK-NEXT: ret`
fix an x86 fast isel issue where we'd completely give up on folding an address when we have a global variable base and an index. Instead, just give up on folding the global variable. Before we'd generate: _test: ## @test ## BB#0: movq _rtx_length@GOTPCREL(%rip), %rax leaq (%rax), %rax addq %rdi, %rax movzbl (%rax), %eax ret now we generate: _test: ## @test ## BB#0: movq _rtx_length@GOTPCREL(%rip), %rax movzbl (%rax,%rdi), %eax ret The difference is even more significant when there is a scale involved. This fixes rdar://9289558 - total fail with addr mode formation at -O0/x86-64 llvm-svn: 129664 2011-04-18 01:47:38 +08:00			`}`



			`; rdar://9289558`
			`@rtx_length = external global [153 x i8]`

			`define i32 @test4(i64 %idxprom9) nounwind {`
			`%arrayidx10 = getelementptr inbounds [153 x i8]* @rtx_length, i32 0, i64 %idxprom9`
			`%tmp11 = load i8* %arrayidx10, align 1`
			`%conv = zext i8 %tmp11 to i32`
			`ret i32 %conv`

			`; CHECK: test4:`
			`; CHECK: movq _rtx_length@GOTPCREL(%rip), %rax`
			`; CHECK-NEXT: movzbl (%rax,%rdi), %eax`
			`; CHECK-NEXT: ret`
			`}`
1. merge fast-isel-shift-imm.ll into fast-isel-x86-64.ll 2. implement rdar://9289501 - fast isel should fold trivial multiplies to shifts 3. teach tblgen to handle shift immediates that are different sizes than the shifted operands, eliminating some code from the X86 fast isel backend. 4. Have FastISel::SelectBinaryOp use (the poorly named) FastEmit_ri_ function instead of FastEmit_ri to simplify code. llvm-svn: 129666 2011-04-18 04:23:29 +08:00

			`; PR3242 - Out of range shifts should not be folded by fastisel.`
			`define void @test5(i32 %x, i32* %p) nounwind {`
			`%y = ashr i32 %x, 50000`
			`store i32 %y, i32* %p`
			`ret void`

			`; CHECK: test5:`
			`; CHECK: movl $50000, %ecx`
			`; CHECK: sarl %cl, %edi`
			`; CHECK: ret`
			`}`

			`; rdar://9289501 - fast isel should fold trivial multiplies to shifts.`
			`define i64 @test6(i64 %x) nounwind ssp {`
			`entry:`
			`%mul = mul nsw i64 %x, 8`
			`ret i64 %mul`

			`; CHECK: test6:`
Disable expensive two-address optimizations at -O0. rdar://10453055 llvm-svn: 144806 2011-11-17 02:44:48 +08:00			`; CHECK: shlq $3, %rdi`
1. merge fast-isel-shift-imm.ll into fast-isel-x86-64.ll 2. implement rdar://9289501 - fast isel should fold trivial multiplies to shifts 3. teach tblgen to handle shift immediates that are different sizes than the shifted operands, eliminating some code from the X86 fast isel backend. 4. Have FastISel::SelectBinaryOp use (the poorly named) FastEmit_ri_ function instead of FastEmit_ri to simplify code. llvm-svn: 129666 2011-04-18 04:23:29 +08:00			`}`

			`define i32 @test7(i32 %x) nounwind ssp {`
			`entry:`
			`%mul = mul nsw i32 %x, 8`
			`ret i32 %mul`
			`; CHECK: test7:`
Disable expensive two-address optimizations at -O0. rdar://10453055 llvm-svn: 144806 2011-11-17 02:44:48 +08:00			`; CHECK: shll $3, %edi`
1. merge fast-isel-shift-imm.ll into fast-isel-x86-64.ll 2. implement rdar://9289501 - fast isel should fold trivial multiplies to shifts 3. teach tblgen to handle shift immediates that are different sizes than the shifted operands, eliminating some code from the X86 fast isel backend. 4. Have FastISel::SelectBinaryOp use (the poorly named) FastEmit_ri_ function instead of FastEmit_ri to simplify code. llvm-svn: 129666 2011-04-18 04:23:29 +08:00			`}`

Implement major new fastisel functionality: the matcher can now handle immediates with value constraints on them (when defined as ImmLeaf's). This is particularly important for X86-64, where almost all reg/imm instructions take a i64immSExt32 immediate operand, which has a value constraint. Before this patch we ended up iseling the examples into such amazing code as: movabsq $7, %rax imulq %rax, %rdi movq %rdi, %rax ret now we produce: imulq $7, %rdi, %rax ret This dramatically shrinks the generated code at -O0 on x86-64. llvm-svn: 129691 2011-04-18 14:22:33 +08:00
			`; rdar://9289507 - folding of immediates into 64-bit operations.`
			`define i64 @test8(i64 %x) nounwind ssp {`
			`entry:`
			`%add = add nsw i64 %x, 7`
			`ret i64 %add`

			`; CHECK: test8:`
			`; CHECK: addq $7, %rdi`
			`}`

			`define i64 @test9(i64 %x) nounwind ssp {`
			`entry:`
			`%add = mul nsw i64 %x, 7`
			`ret i64 %add`
			`; CHECK: test9:`
			`; CHECK: imulq $7, %rdi, %rax`
			`}`
fix rdar://9297011 - udiv by power of two causing fast-isel rejects llvm-svn: 129693 2011-04-18 14:55:51 +08:00
			`; rdar://9297011 - Don't reject udiv by a power of 2.`
			`define i32 @test10(i32 %X) nounwind {`
			`%Y = udiv i32 %X, 8`
			`ret i32 %Y`
			`; CHECK: test10:`
			`; CHECK: shrl $3,`
			`}`
while we're at it, handle 'sdiv exact' of a power of 2 also, this fixes a few rejects on c++ iterator loops. llvm-svn: 129694 2011-04-18 15:00:40 +08:00
			`define i32 @test11(i32 %X) nounwind {`
			`%Y = sdiv exact i32 %X, 8`
			`ret i32 %Y`
			`; CHECK: test11:`
			`; CHECK: sarl $3,`
			`}`

fix rdar://9297006 - fast isel bails out on trunc to i1 -> bools cry, a common cause of fast isel rejects on c++ code. llvm-svn: 129748 2011-04-19 12:22:17 +08:00
			`; rdar://9297006 - Trunc to bool.`
			`define void @test12(i8 %tmp) nounwind ssp noredzone {`
			`entry:`
			`%tobool = trunc i8 %tmp to i1`
			`br i1 %tobool, label %if.then, label %if.end`

			`if.then: ; preds = %entry`
			`call void @test12(i8 0) noredzone`
			`br label %if.end`

			`if.end: ; preds = %if.then, %entry`
			`ret void`
			`; CHECK: test12:`
			`; CHECK: testb $1,`
be layout aware, to produce: testb $1, %al je LBB0_2 ## BB#1: ## %if.then movb $0, %al instead of: testb $1, %al jne LBB0_1 jmp LBB0_2 LBB0_1: ## %if.then movb $0, %al how 'bout that. llvm-svn: 129749 2011-04-19 12:26:32 +08:00			`; CHECK-NEXT: je L`
Handle i1/i8/i16 constant integer arguments to calls by prepromoting them. Before we would bail out on i1 arguments all together, now we just bail on non-constant ones. Also, we used to emit extraneous code. e.g. test12 was: movb $0, %al movzbl %al, %edi callq _test12 and test13 was: movb $0, %al xorl %edi, %edi movb %al, 7(%rsp) callq _test13f Now we get: movl $0, %edi callq _test12 and: movl $0, %edi callq _test13f llvm-svn: 129751 2011-04-19 12:42:38 +08:00			`; CHECK-NEXT: movl $0, %edi`
			`; CHECK-NEXT: callq`
			`}`

			`declare void @test13f(i1 %X)`

			`define void @test13() nounwind {`
			`call void @test13f(i1 0)`
			`ret void`
			`; CHECK: test13:`
			`; CHECK: movl $0, %edi`
			`; CHECK-NEXT: callq`
fix rdar://9297006 - fast isel bails out on trunc to i1 -> bools cry, a common cause of fast isel rejects on c++ code. llvm-svn: 129748 2011-04-19 12:22:17 +08:00			`}`

Implement support for fast isel of calls of i1 arguments, even though they are illegal, when they are a truncate from something else. This eliminates fully half of all the fastisel rejections on a test c++ file I'm working with, which should make a substantial improvement for -O0 compile of c++ code. This fixed rdar://9297003 - fast isel bails out on all functions taking bools llvm-svn: 129752 2011-04-19 13:09:50 +08:00

			`; rdar://9297003 - fast isel bails out on all functions taking bools`
			`define void @test14(i8 %tmp) nounwind ssp noredzone {`
			`entry:`
			`%tobool = trunc i8 %tmp to i1`
			`call void @test13f(i1 zeroext %tobool) noredzone`
			`ret void`
			`; CHECK: test14:`
			`; CHECK: andb $1,`
			`; CHECK: callq`
			`}`

Implement support for x86 fastisel of small fixed-sized memcpys, which are generated en masse for C++ PODs. On my c++ test file, this cuts the fast isel rejects by 10x and shrinks the generated .s file by 5% llvm-svn: 129755 2011-04-19 13:52:03 +08:00			`declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)`

			`; rdar://9289488 - fast-isel shouldn't bail out on llvm.memcpy`
			`define void @test15(i8* %a, i8* %b) nounwind {`
			`call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 4, i32 4, i1 false)`
			`ret void`
			`; CHECK: test15:`
			`; CHECK-NEXT: movl (%rsi), %eax`
			`; CHECK-NEXT: movl %eax, (%rdi)`
			`; CHECK-NEXT: ret`
			`}`
Add support for FastISel'ing varargs calls. llvm-svn: 129765 2011-04-20 01:22:22 +08:00
			`; Handling for varargs calls`
			`declare void @test16callee(...) nounwind`
			`define void @test16() nounwind {`
			`; CHECK: test16:`
			`; CHECK: movl $1, %edi`
			`; CHECK: movb $0, %al`
			`; CHECK: callq _test16callee`
			`call void (...)* @test16callee(i32 1)`
			`br label %block2`

			`block2:`
			`; CHECK: movabsq $1`
			`; CHECK: cvtsi2sdq {{.*}} %xmm0`
			`; CHECK: movb $1, %al`
			`; CHECK: callq _test16callee`
Introduce 'UseSSEx' to force SSE legacy encoding - Add 'UseSSEx' to force SSE legacy insn not being selected when AVX is enabled. As the penalty of inter-mixing SSE and AVX instructions, we need prevent SSE legacy insn from being generated except explicitly specified through some intrinsics. For patterns supported by both SSE and AVX, so far, we force AVX insn will be tried first relying on AddedComplexity or position in td file. It's error-prone and introduces bugs accidentally. 'UseSSEx' is disabled when AVX is turned on. For SSE insns inherited by AVX, we need this predicate to force VEX encoding or SSE legacy encoding only. For insns not inherited by AVX, we still use the previous predicates, i.e. 'HasSSEx'. So far, these insns fall into the following categories: * SSE insns with MMX operands * SSE insns with GPR/MEM operands only (xFENCE, PREFETCH, CLFLUSH, CRC, and etc.) * SSE4A insns. * MMX insns. * x87 insns added by SSE. 2 test cases are modified: - test/CodeGen/X86/fast-isel-x86-64.ll AVX code generation is different from SSE one. 'vcvtsi2sdq' cannot be selected by fast-isel due to complicated pattern and fast-isel fallback to materialize it from constant pool. - test/CodeGen/X86/widen_load-1.ll AVX code generation is different from SSE one after fixing SSE/AVX inter-mixing. Exec-domain fixing prefers 'vmovapd' instead of 'vmovaps'. llvm-svn: 162919 2012-08-31 00:54:46 +08:00
			`; AVX: movabsq $1`
			`; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0`
			`; AVX: movb $1, %al`
			`; AVX: callq _test16callee`
Add support for FastISel'ing varargs calls. llvm-svn: 129765 2011-04-20 01:22:22 +08:00			`call void (...)* @test16callee(double 1.000000e+00)`
			`ret void`
			`}`
Recommit the fix for rdar://9289512 with a couple tweaks to fix bugs exposed by the gcc dejagnu testsuite: 1. The load may actually be used by a dead instruction, which would cause an assert. 2. The load may not be used by the current chain of instructions, and we could move it past a side-effecting instruction. Change how we process uses to define the problem away. llvm-svn: 130018 2011-04-23 05:59:37 +08:00

			`declare void @foo() unnamed_addr ssp align 2`

			`; Verify that we don't fold the load into the compare here. That would move it`
			`; w.r.t. the call.`
			`define i32 @test17(i32 *%P) ssp nounwind {`
			`entry:`
			`%tmp = load i32* %P`
			`%cmp = icmp ne i32 %tmp, 5`
			`call void @foo()`
			`br i1 %cmp, label %if.then, label %if.else`

			`if.then: ; preds = %entry`
			`ret i32 1`

			`if.else: ; preds = %entry`
			`ret i32 2`
			`; CHECK: test17:`
			`; CHECK: movl (%rdi), %eax`
			`; CHECK: callq _foo`
			`; CHECK: cmpl $5, %eax`
			`; CHECK-NEXT: je`
			`}`

Make X86::FsFLD0SS / FsFLD0SD real pseudo-instructions. Like V_SET0, these instructions are expanded by ExpandPostRA to xorps / vxorps so they can participate in execution domain swizzling. This also makes the AVX variants redundant. llvm-svn: 145440 2011-11-30 06:27:25 +08:00			`; Check that 0.0 is materialized using xorps`
Make the fast-isel code for literal 0.0 a bit shorter/faster, since 0.0 is common. rdar://problem/9303592 . llvm-svn: 130338 2011-04-28 06:41:55 +08:00			`define void @test18(float* %p1) {`
			`store float 0.0, float* %p1`
			`ret void`
			`; CHECK: test18:`
Make X86::FsFLD0SS / FsFLD0SD real pseudo-instructions. Like V_SET0, these instructions are expanded by ExpandPostRA to xorps / vxorps so they can participate in execution domain swizzling. This also makes the AVX variants redundant. llvm-svn: 145440 2011-11-30 06:27:25 +08:00			`; CHECK: xorps`
Make the fast-isel code for literal 0.0 a bit shorter/faster, since 0.0 is common. rdar://problem/9303592 . llvm-svn: 130338 2011-04-28 06:41:55 +08:00			`}`
Make X86::FsFLD0SS / FsFLD0SD real pseudo-instructions. Like V_SET0, these instructions are expanded by ExpandPostRA to xorps / vxorps so they can participate in execution domain swizzling. This also makes the AVX variants redundant. llvm-svn: 145440 2011-11-30 06:27:25 +08:00
			`; Without any type hints, doubles use the smaller xorps instead of xorpd.`
Make the fast-isel code for literal 0.0 a bit shorter/faster, since 0.0 is common. rdar://problem/9303592 . llvm-svn: 130338 2011-04-28 06:41:55 +08:00			`define void @test19(double* %p1) {`
			`store double 0.0, double* %p1`
			`ret void`
			`; CHECK: test19:`
Make X86::FsFLD0SS / FsFLD0SD real pseudo-instructions. Like V_SET0, these instructions are expanded by ExpandPostRA to xorps / vxorps so they can participate in execution domain swizzling. This also makes the AVX variants redundant. llvm-svn: 145440 2011-11-30 06:27:25 +08:00			`; CHECK: xorps`
Make the fast-isel code for literal 0.0 a bit shorter/faster, since 0.0 is common. rdar://problem/9303592 . llvm-svn: 130338 2011-04-28 06:41:55 +08:00			`}`
fast-isel sret. We actually don't need to do anything special on x86. :) rdar://problem/9303592 . llvm-svn: 130348 2011-04-28 07:58:52 +08:00
fast-isel sret calls, try 2. We actually do need to do something on x86-32. rdar://problem/9303592 . llvm-svn: 130429 2011-04-29 04:19:12 +08:00			`; Check that we fast-isel sret`
			`%struct.a = type { i64, i64, i64 }`
			`define void @test20() nounwind ssp {`
			`entry:`
			`%tmp = alloca %struct.a, align 8`
			`call void @test20sret(%struct.a* sret %tmp)`
			`ret void`
			`; CHECK: test20:`
			`; CHECK: leaq (%rsp), %rdi`
			`; CHECK: callq _test20sret`
			`}`
			`declare void @test20sret(%struct.a* sret)`

Make X86::FsFLD0SS / FsFLD0SD real pseudo-instructions. Like V_SET0, these instructions are expanded by ExpandPostRA to xorps / vxorps so they can participate in execution domain swizzling. This also makes the AVX variants redundant. llvm-svn: 145440 2011-11-30 06:27:25 +08:00			`; Check that -0.0 is not materialized using xor`
Fix a silly mistake in r130338. llvm-svn: 130360 2011-04-28 08:42:03 +08:00			`define void @test21(double* %p1) {`
			`store double -0.0, double* %p1`
			`ret void`
			`; CHECK: test21:`
Make X86::FsFLD0SS / FsFLD0SD real pseudo-instructions. Like V_SET0, these instructions are expanded by ExpandPostRA to xorps / vxorps so they can participate in execution domain swizzling. This also makes the AVX variants redundant. llvm-svn: 145440 2011-11-30 06:27:25 +08:00			`; CHECK-NOT: xor`
Fix a silly mistake in r130338. llvm-svn: 130360 2011-04-28 08:42:03 +08:00			`; CHECK: movsd LCPI`
FastISel: avoid function calls between the materialization of the constant and its use. llvm-svn: 137993 2011-08-19 06:06:10 +08:00			`}`

			`; Check that immediate arguments to a function`
			`; do not cause massive spilling and are used`
			`; as immediates just before the call.`
			`define void @test22() nounwind {`
			`entry:`
			`call void @foo22(i32 0)`
			`call void @foo22(i32 1)`
			`call void @foo22(i32 2)`
			`call void @foo22(i32 3)`
			`ret void`
			`; CHECK: test22:`
			`; CHECK: movl $0, %edi`
			`; CHECK: callq _foo22`
			`; CHECK: movl $1, %edi`
			`; CHECK: callq _foo22`
			`; CHECK: movl $2, %edi`
			`; CHECK: callq _foo22`
			`; CHECK: movl $3, %edi`
			`; CHECK: callq _foo22`
			`}`

			`declare void @foo22(i32)`
Make sure to put our sret argument into %rax on x86-64. Fixes PR13563! llvm-svn: 165063 2012-10-03 06:45:06 +08:00
			`; PR13563`
			`define void @test23(i8* noalias sret %result) {`
			`%a = alloca i8`
			`%b = call i8* @foo23()`
			`ret void`
			`; CHECK: test23:`
			`; CHECK: call`
			`; CHECK: movq %rdi, %rax`
			`; CHECK: ret`
			`}`

			`declare i8* @foo23()`