llvm-project/llvm/test/CodeGen/X86/fold-pcmpeqd-0.ll

; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah -regalloc=linearscan | FileCheck --check-prefix=I386 %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=X86-64 %s

; This testcase shouldn't need to spill the -1 value,
; so it should just use pcmpeqd to materialize an all-ones vector.
; For i386, cp load of -1 are folded.

; With -regalloc=greedy, the live range is split before spilling, so the first
; pcmpeq doesn't get folded as a constant pool load.

; I386-NOT: pcmpeqd
; I386: orps LCPI0_2, %xmm
; I386-NOT: pcmpeqd
; I386: orps LCPI0_2, %xmm

; X86-64: pcmpeqd
; X86-64-NOT: pcmpeqd

	%struct.__ImageExecInfo = type <{ <4 x i32>, <4 x float>, <2 x i64>, i8*, i8*, i8*, i32, i32, i32, i32, i32 }>
	%struct._cl_image_format_t = type <{ i32, i32, i32 }>
	%struct._image2d_t = type <{ i8*, %struct._cl_image_format_t, i32, i32, i32, i32, i32, i32 }>

define void @program_1(%struct._image2d_t* %dest, %struct._image2d_t* %t0, <4 x float> %p0, <4 x float> %p1, <4 x float> %p4, <4 x float> %p5, <4 x float> %p6) nounwind {
entry:
	%tmp3.i = load i32* null		; <i32> [#uses=1]
	%cmp = icmp sgt i32 %tmp3.i, 200		; <i1> [#uses=1]
	br i1 %cmp, label %forcond, label %ifthen

ifthen:		; preds = %entry
	ret void

forcond:		; preds = %entry
	%tmp3.i536 = load i32* null		; <i32> [#uses=1]
	%cmp12 = icmp slt i32 0, %tmp3.i536		; <i1> [#uses=1]
	br i1 %cmp12, label %forbody, label %afterfor

forbody:		; preds = %forcond
	%bitcast204.i313 = bitcast <4 x i32> zeroinitializer to <4 x float>		; <<4 x float>> [#uses=1]
	%mul233 = fmul <4 x float> %bitcast204.i313, zeroinitializer		; <<4 x float>> [#uses=1]
	%mul257 = fmul <4 x float> %mul233, zeroinitializer		; <<4 x float>> [#uses=1]
	%mul275 = fmul <4 x float> %mul257, zeroinitializer		; <<4 x float>> [#uses=1]
	%tmp51 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %mul275, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
	%bitcast198.i182 = bitcast <4 x float> zeroinitializer to <4 x i32>		; <<4 x i32>> [#uses=0]
	%bitcast204.i185 = bitcast <4 x i32> zeroinitializer to <4 x float>		; <<4 x float>> [#uses=1]
	%tmp69 = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> zeroinitializer) nounwind		; <<4 x i32>> [#uses=1]
	%tmp70 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp69) nounwind		; <<4 x float>> [#uses=1]
	%sub140.i78 = fsub <4 x float> zeroinitializer, %tmp70		; <<4 x float>> [#uses=2]
	%mul166.i86 = fmul <4 x float> zeroinitializer, %sub140.i78		; <<4 x float>> [#uses=1]
	%add167.i87 = fadd <4 x float> %mul166.i86, < float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000 >		; <<4 x float>> [#uses=1]
	%mul171.i88 = fmul <4 x float> %add167.i87, %sub140.i78		; <<4 x float>> [#uses=1]
	%add172.i89 = fadd <4 x float> %mul171.i88, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 >		; <<4 x float>> [#uses=1]
	%bitcast176.i90 = bitcast <4 x float> %add172.i89 to <4 x i32>		; <<4 x i32>> [#uses=1]
	%andnps178.i92 = and <4 x i32> %bitcast176.i90, zeroinitializer		; <<4 x i32>> [#uses=1]
	%bitcast179.i93 = bitcast <4 x i32> %andnps178.i92 to <4 x float>		; <<4 x float>> [#uses=1]
	%mul186.i96 = fmul <4 x float> %bitcast179.i93, zeroinitializer		; <<4 x float>> [#uses=1]
	%bitcast190.i98 = bitcast <4 x float> %mul186.i96 to <4 x i32>		; <<4 x i32>> [#uses=1]
	%andnps192.i100 = and <4 x i32> %bitcast190.i98, zeroinitializer		; <<4 x i32>> [#uses=1]
	%xorps.i102 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
	%orps203.i103 = or <4 x i32> %andnps192.i100, %xorps.i102		; <<4 x i32>> [#uses=1]
	%bitcast204.i104 = bitcast <4 x i32> %orps203.i103 to <4 x float>		; <<4 x float>> [#uses=1]
	%cmple.i = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> %tmp51, i8 2) nounwind		; <<4 x float>> [#uses=1]
	%tmp80 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
	%sub140.i = fsub <4 x float> zeroinitializer, %tmp80		; <<4 x float>> [#uses=1]
	%bitcast148.i = bitcast <4 x float> zeroinitializer to <4 x i32>		; <<4 x i32>> [#uses=1]
	%andnps150.i = and <4 x i32> %bitcast148.i, < i32 -2139095041, i32 -2139095041, i32 -2139095041, i32 -2139095041 >		; <<4 x i32>> [#uses=0]
	%mul171.i = fmul <4 x float> zeroinitializer, %sub140.i		; <<4 x float>> [#uses=1]
	%add172.i = fadd <4 x float> %mul171.i, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 >		; <<4 x float>> [#uses=1]
	%bitcast176.i = bitcast <4 x float> %add172.i to <4 x i32>		; <<4 x i32>> [#uses=1]
	%andnps178.i = and <4 x i32> %bitcast176.i, zeroinitializer		; <<4 x i32>> [#uses=1]
	%bitcast179.i = bitcast <4 x i32> %andnps178.i to <4 x float>		; <<4 x float>> [#uses=1]
	%mul186.i = fmul <4 x float> %bitcast179.i, zeroinitializer		; <<4 x float>> [#uses=1]
	%bitcast189.i = bitcast <4 x float> zeroinitializer to <4 x i32>		; <<4 x i32>> [#uses=0]
	%bitcast190.i = bitcast <4 x float> %mul186.i to <4 x i32>		; <<4 x i32>> [#uses=1]
	%andnps192.i = and <4 x i32> %bitcast190.i, zeroinitializer		; <<4 x i32>> [#uses=1]
	%bitcast198.i = bitcast <4 x float> %cmple.i to <4 x i32>		; <<4 x i32>> [#uses=1]
	%xorps.i = xor <4 x i32> %bitcast198.i, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
	%orps203.i = or <4 x i32> %andnps192.i, %xorps.i		; <<4 x i32>> [#uses=1]
	%bitcast204.i = bitcast <4 x i32> %orps203.i to <4 x float>		; <<4 x float>> [#uses=1]
	%mul307 = fmul <4 x float> %bitcast204.i185, zeroinitializer		; <<4 x float>> [#uses=1]
	%mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer		; <<4 x float>> [#uses=2]
	%mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer		; <<4 x float>> [#uses=1]
	%tmp82 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul307, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
	%bitcast11.i15 = bitcast <4 x float> %tmp82 to <4 x i32>		; <<4 x i32>> [#uses=1]
	%andnps.i17 = and <4 x i32> %bitcast11.i15, zeroinitializer		; <<4 x i32>> [#uses=1]
	%orps.i18 = or <4 x i32> %andnps.i17, zeroinitializer		; <<4 x i32>> [#uses=1]
	%bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float>		; <<4 x float>> [#uses=1]
	%tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
	%bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32>		; <<4 x i32>> [#uses=1]
	%bitcast6.i4 = bitcast <4 x float> zeroinitializer to <4 x i32>		; <<4 x i32>> [#uses=2]
	%andps.i5 = and <4 x i32> %bitcast.i3, %bitcast6.i4		; <<4 x i32>> [#uses=1]
	%bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32>		; <<4 x i32>> [#uses=1]
	%not.i7 = xor <4 x i32> %bitcast6.i4, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
	%andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7		; <<4 x i32>> [#uses=1]
	%orps.i9 = or <4 x i32> %andnps.i8, %andps.i5		; <<4 x i32>> [#uses=1]
	%bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float>		; <<4 x float>> [#uses=1]
	%bitcast.i = bitcast <4 x float> %mul313 to <4 x i32>		; <<4 x i32>> [#uses=1]
	%andps.i = and <4 x i32> %bitcast.i, zeroinitializer		; <<4 x i32>> [#uses=1]
	%orps.i = or <4 x i32> zeroinitializer, %andps.i		; <<4 x i32>> [#uses=1]
	%bitcast17.i = bitcast <4 x i32> %orps.i to <4 x float>		; <<4 x float>> [#uses=1]
	call void null(<4 x float> %bitcast17.i19, <4 x float> %bitcast17.i10, <4 x float> %bitcast17.i, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind
	unreachable

afterfor:		; preds = %forcond
	ret void
}

declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone

declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone

declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone

declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone

declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
Fix register-dependent X86 tests. llvm-svn: 128867 2011-04-05 08:32:44 +08:00			`; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah -regalloc=linearscan \| FileCheck --check-prefix=I386 %s`
			`; RUN: llc < %s -mtriple=x86_64-apple-darwin \| FileCheck --check-prefix=X86-64 %s`
Mark x86's V_SET0 and V_SETALLONES with isSimpleLoad, and teach X86's foldMemoryOperand how to "fold" them, by converting them into constant-pool loads. When they aren't folded, they use xorps/cmpeqd, but for example when register pressure is high, they may now be folded as memory operands, which reduces register pressure. Also, mark V_SET0 isAsCheapAsAMove so that two-address-elimination will remat it instead of copying zeros around (V_SETALLONES was already marked). llvm-svn: 60461 2008-12-03 13:21:24 +08:00
Fix some register-alias-related bugs in the post-RA scheduler liveness computation code. Also, avoid adding output-depenency edges when both defs are dead, which frequently happens with EFLAGS defs. Compute Depth and Height lazily, and always in terms of edge latency values. For the schedulers that don't care about latency, edge latencies are set to 1. Eliminate Cycle and CycleBound, and LatencyPriorityQueue's Latencies array. These are all subsumed by the Depth and Height fields. llvm-svn: 61073 2008-12-16 11:25:46 +08:00			`; This testcase shouldn't need to spill the -1 value,`
Mark x86's V_SET0 and V_SETALLONES with isSimpleLoad, and teach X86's foldMemoryOperand how to "fold" them, by converting them into constant-pool loads. When they aren't folded, they use xorps/cmpeqd, but for example when register pressure is high, they may now be folded as memory operands, which reduces register pressure. Also, mark V_SET0 isAsCheapAsAMove so that two-address-elimination will remat it instead of copying zeros around (V_SETALLONES was already marked). llvm-svn: 60461 2008-12-03 13:21:24 +08:00			`; so it should just use pcmpeqd to materialize an all-ones vector.`
Fix PR3457: Ignore control successors when looking for closest scheduled successor. A control successor doesn't read result(s) produced by the scheduling unit being evaluated. llvm-svn: 64210 2009-02-10 16:30:11 +08:00			`; For i386, cp load of -1 are folded.`
Mark x86's V_SET0 and V_SETALLONES with isSimpleLoad, and teach X86's foldMemoryOperand how to "fold" them, by converting them into constant-pool loads. When they aren't folded, they use xorps/cmpeqd, but for example when register pressure is high, they may now be folded as memory operands, which reduces register pressure. Also, mark V_SET0 isAsCheapAsAMove so that two-address-elimination will remat it instead of copying zeros around (V_SETALLONES was already marked). llvm-svn: 60461 2008-12-03 13:21:24 +08:00
Fix register-dependent X86 tests. llvm-svn: 128867 2011-04-05 08:32:44 +08:00			`; With -regalloc=greedy, the live range is split before spilling, so the first`
			`; pcmpeq doesn't get folded as a constant pool load.`

			`; I386-NOT: pcmpeqd`
			`; I386: orps LCPI0_2, %xmm`
			`; I386-NOT: pcmpeqd`
			`; I386: orps LCPI0_2, %xmm`

			`; X86-64: pcmpeqd`
			`; X86-64-NOT: pcmpeqd`

Mark x86's V_SET0 and V_SETALLONES with isSimpleLoad, and teach X86's foldMemoryOperand how to "fold" them, by converting them into constant-pool loads. When they aren't folded, they use xorps/cmpeqd, but for example when register pressure is high, they may now be folded as memory operands, which reduces register pressure. Also, mark V_SET0 isAsCheapAsAMove so that two-address-elimination will remat it instead of copying zeros around (V_SETALLONES was already marked). llvm-svn: 60461 2008-12-03 13:21:24 +08:00			`%struct.__ImageExecInfo = type <{ <4 x i32>, <4 x float>, <2 x i64>, i8, i8, i8*, i32, i32, i32, i32, i32 }>`
			`%struct._cl_image_format_t = type <{ i32, i32, i32 }>`
			`%struct._image2d_t = type <{ i8*, %struct._cl_image_format_t, i32, i32, i32, i32, i32, i32 }>`

			`define void @program_1(%struct._image2d_t* %dest, %struct._image2d_t* %t0, <4 x float> %p0, <4 x float> %p1, <4 x float> %p4, <4 x float> %p5, <4 x float> %p6) nounwind {`
			`entry:`
			`%tmp3.i = load i32* null ; <i32> [#uses=1]`
			`%cmp = icmp sgt i32 %tmp3.i, 200 ; <i1> [#uses=1]`
			`br i1 %cmp, label %forcond, label %ifthen`

			`ifthen: ; preds = %entry`
			`ret void`

			`forcond: ; preds = %entry`
			`%tmp3.i536 = load i32* null ; <i32> [#uses=1]`
			`%cmp12 = icmp slt i32 0, %tmp3.i536 ; <i1> [#uses=1]`
			`br i1 %cmp12, label %forbody, label %afterfor`

			`forbody: ; preds = %forcond`
			`%bitcast204.i313 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1]`
Split the Add, Sub, and Mul instruction opcodes into separate integer and floating-point opcodes, introducing FAdd, FSub, and FMul. For now, the AsmParser, BitcodeReader, and IRBuilder all preserve backwards compatability, and the Core LLVM APIs preserve backwards compatibility for IR producers. Most front-ends won't need to change immediately. This implements the first step of the plan outlined here: http://nondot.org/sabre/LLVMNotes/IntegerOverflow.txt llvm-svn: 72897 2009-06-05 06:49:04 +08:00			`%mul233 = fmul <4 x float> %bitcast204.i313, zeroinitializer ; <<4 x float>> [#uses=1]`
			`%mul257 = fmul <4 x float> %mul233, zeroinitializer ; <<4 x float>> [#uses=1]`
			`%mul275 = fmul <4 x float> %mul257, zeroinitializer ; <<4 x float>> [#uses=1]`
Mark x86's V_SET0 and V_SETALLONES with isSimpleLoad, and teach X86's foldMemoryOperand how to "fold" them, by converting them into constant-pool loads. When they aren't folded, they use xorps/cmpeqd, but for example when register pressure is high, they may now be folded as memory operands, which reduces register pressure. Also, mark V_SET0 isAsCheapAsAMove so that two-address-elimination will remat it instead of copying zeros around (V_SETALLONES was already marked). llvm-svn: 60461 2008-12-03 13:21:24 +08:00			`%tmp51 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %mul275, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]`
			`%bitcast198.i182 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0]`
			`%bitcast204.i185 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1]`
			`%tmp69 = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> zeroinitializer) nounwind ; <<4 x i32>> [#uses=1]`
			`%tmp70 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp69) nounwind ; <<4 x float>> [#uses=1]`
Split the Add, Sub, and Mul instruction opcodes into separate integer and floating-point opcodes, introducing FAdd, FSub, and FMul. For now, the AsmParser, BitcodeReader, and IRBuilder all preserve backwards compatability, and the Core LLVM APIs preserve backwards compatibility for IR producers. Most front-ends won't need to change immediately. This implements the first step of the plan outlined here: http://nondot.org/sabre/LLVMNotes/IntegerOverflow.txt llvm-svn: 72897 2009-06-05 06:49:04 +08:00			`%sub140.i78 = fsub <4 x float> zeroinitializer, %tmp70 ; <<4 x float>> [#uses=2]`
			`%mul166.i86 = fmul <4 x float> zeroinitializer, %sub140.i78 ; <<4 x float>> [#uses=1]`
			`%add167.i87 = fadd <4 x float> %mul166.i86, < float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000 > ; <<4 x float>> [#uses=1]`
			`%mul171.i88 = fmul <4 x float> %add167.i87, %sub140.i78 ; <<4 x float>> [#uses=1]`
			`%add172.i89 = fadd <4 x float> %mul171.i88, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1]`
Mark x86's V_SET0 and V_SETALLONES with isSimpleLoad, and teach X86's foldMemoryOperand how to "fold" them, by converting them into constant-pool loads. When they aren't folded, they use xorps/cmpeqd, but for example when register pressure is high, they may now be folded as memory operands, which reduces register pressure. Also, mark V_SET0 isAsCheapAsAMove so that two-address-elimination will remat it instead of copying zeros around (V_SETALLONES was already marked). llvm-svn: 60461 2008-12-03 13:21:24 +08:00			`%bitcast176.i90 = bitcast <4 x float> %add172.i89 to <4 x i32> ; <<4 x i32>> [#uses=1]`
			`%andnps178.i92 = and <4 x i32> %bitcast176.i90, zeroinitializer ; <<4 x i32>> [#uses=1]`
			`%bitcast179.i93 = bitcast <4 x i32> %andnps178.i92 to <4 x float> ; <<4 x float>> [#uses=1]`
Split the Add, Sub, and Mul instruction opcodes into separate integer and floating-point opcodes, introducing FAdd, FSub, and FMul. For now, the AsmParser, BitcodeReader, and IRBuilder all preserve backwards compatability, and the Core LLVM APIs preserve backwards compatibility for IR producers. Most front-ends won't need to change immediately. This implements the first step of the plan outlined here: http://nondot.org/sabre/LLVMNotes/IntegerOverflow.txt llvm-svn: 72897 2009-06-05 06:49:04 +08:00			`%mul186.i96 = fmul <4 x float> %bitcast179.i93, zeroinitializer ; <<4 x float>> [#uses=1]`
Mark x86's V_SET0 and V_SETALLONES with isSimpleLoad, and teach X86's foldMemoryOperand how to "fold" them, by converting them into constant-pool loads. When they aren't folded, they use xorps/cmpeqd, but for example when register pressure is high, they may now be folded as memory operands, which reduces register pressure. Also, mark V_SET0 isAsCheapAsAMove so that two-address-elimination will remat it instead of copying zeros around (V_SETALLONES was already marked). llvm-svn: 60461 2008-12-03 13:21:24 +08:00			`%bitcast190.i98 = bitcast <4 x float> %mul186.i96 to <4 x i32> ; <<4 x i32>> [#uses=1]`
			`%andnps192.i100 = and <4 x i32> %bitcast190.i98, zeroinitializer ; <<4 x i32>> [#uses=1]`
			`%xorps.i102 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]`
			`%orps203.i103 = or <4 x i32> %andnps192.i100, %xorps.i102 ; <<4 x i32>> [#uses=1]`
			`%bitcast204.i104 = bitcast <4 x i32> %orps203.i103 to <4 x float> ; <<4 x float>> [#uses=1]`
			`%cmple.i = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> %tmp51, i8 2) nounwind ; <<4 x float>> [#uses=1]`
			`%tmp80 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]`
Split the Add, Sub, and Mul instruction opcodes into separate integer and floating-point opcodes, introducing FAdd, FSub, and FMul. For now, the AsmParser, BitcodeReader, and IRBuilder all preserve backwards compatability, and the Core LLVM APIs preserve backwards compatibility for IR producers. Most front-ends won't need to change immediately. This implements the first step of the plan outlined here: http://nondot.org/sabre/LLVMNotes/IntegerOverflow.txt llvm-svn: 72897 2009-06-05 06:49:04 +08:00			`%sub140.i = fsub <4 x float> zeroinitializer, %tmp80 ; <<4 x float>> [#uses=1]`
Mark x86's V_SET0 and V_SETALLONES with isSimpleLoad, and teach X86's foldMemoryOperand how to "fold" them, by converting them into constant-pool loads. When they aren't folded, they use xorps/cmpeqd, but for example when register pressure is high, they may now be folded as memory operands, which reduces register pressure. Also, mark V_SET0 isAsCheapAsAMove so that two-address-elimination will remat it instead of copying zeros around (V_SETALLONES was already marked). llvm-svn: 60461 2008-12-03 13:21:24 +08:00			`%bitcast148.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1]`
			`%andnps150.i = and <4 x i32> %bitcast148.i, < i32 -2139095041, i32 -2139095041, i32 -2139095041, i32 -2139095041 > ; <<4 x i32>> [#uses=0]`
Split the Add, Sub, and Mul instruction opcodes into separate integer and floating-point opcodes, introducing FAdd, FSub, and FMul. For now, the AsmParser, BitcodeReader, and IRBuilder all preserve backwards compatability, and the Core LLVM APIs preserve backwards compatibility for IR producers. Most front-ends won't need to change immediately. This implements the first step of the plan outlined here: http://nondot.org/sabre/LLVMNotes/IntegerOverflow.txt llvm-svn: 72897 2009-06-05 06:49:04 +08:00			`%mul171.i = fmul <4 x float> zeroinitializer, %sub140.i ; <<4 x float>> [#uses=1]`
			`%add172.i = fadd <4 x float> %mul171.i, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1]`
Mark x86's V_SET0 and V_SETALLONES with isSimpleLoad, and teach X86's foldMemoryOperand how to "fold" them, by converting them into constant-pool loads. When they aren't folded, they use xorps/cmpeqd, but for example when register pressure is high, they may now be folded as memory operands, which reduces register pressure. Also, mark V_SET0 isAsCheapAsAMove so that two-address-elimination will remat it instead of copying zeros around (V_SETALLONES was already marked). llvm-svn: 60461 2008-12-03 13:21:24 +08:00			`%bitcast176.i = bitcast <4 x float> %add172.i to <4 x i32> ; <<4 x i32>> [#uses=1]`
			`%andnps178.i = and <4 x i32> %bitcast176.i, zeroinitializer ; <<4 x i32>> [#uses=1]`
			`%bitcast179.i = bitcast <4 x i32> %andnps178.i to <4 x float> ; <<4 x float>> [#uses=1]`
Split the Add, Sub, and Mul instruction opcodes into separate integer and floating-point opcodes, introducing FAdd, FSub, and FMul. For now, the AsmParser, BitcodeReader, and IRBuilder all preserve backwards compatability, and the Core LLVM APIs preserve backwards compatibility for IR producers. Most front-ends won't need to change immediately. This implements the first step of the plan outlined here: http://nondot.org/sabre/LLVMNotes/IntegerOverflow.txt llvm-svn: 72897 2009-06-05 06:49:04 +08:00			`%mul186.i = fmul <4 x float> %bitcast179.i, zeroinitializer ; <<4 x float>> [#uses=1]`
Mark x86's V_SET0 and V_SETALLONES with isSimpleLoad, and teach X86's foldMemoryOperand how to "fold" them, by converting them into constant-pool loads. When they aren't folded, they use xorps/cmpeqd, but for example when register pressure is high, they may now be folded as memory operands, which reduces register pressure. Also, mark V_SET0 isAsCheapAsAMove so that two-address-elimination will remat it instead of copying zeros around (V_SETALLONES was already marked). llvm-svn: 60461 2008-12-03 13:21:24 +08:00			`%bitcast189.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0]`
			`%bitcast190.i = bitcast <4 x float> %mul186.i to <4 x i32> ; <<4 x i32>> [#uses=1]`
			`%andnps192.i = and <4 x i32> %bitcast190.i, zeroinitializer ; <<4 x i32>> [#uses=1]`
			`%bitcast198.i = bitcast <4 x float> %cmple.i to <4 x i32> ; <<4 x i32>> [#uses=1]`
			`%xorps.i = xor <4 x i32> %bitcast198.i, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]`
			`%orps203.i = or <4 x i32> %andnps192.i, %xorps.i ; <<4 x i32>> [#uses=1]`
			`%bitcast204.i = bitcast <4 x i32> %orps203.i to <4 x float> ; <<4 x float>> [#uses=1]`
Split the Add, Sub, and Mul instruction opcodes into separate integer and floating-point opcodes, introducing FAdd, FSub, and FMul. For now, the AsmParser, BitcodeReader, and IRBuilder all preserve backwards compatability, and the Core LLVM APIs preserve backwards compatibility for IR producers. Most front-ends won't need to change immediately. This implements the first step of the plan outlined here: http://nondot.org/sabre/LLVMNotes/IntegerOverflow.txt llvm-svn: 72897 2009-06-05 06:49:04 +08:00			`%mul307 = fmul <4 x float> %bitcast204.i185, zeroinitializer ; <<4 x float>> [#uses=1]`
			`%mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer ; <<4 x float>> [#uses=2]`
			`%mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer ; <<4 x float>> [#uses=1]`
Mark x86's V_SET0 and V_SETALLONES with isSimpleLoad, and teach X86's foldMemoryOperand how to "fold" them, by converting them into constant-pool loads. When they aren't folded, they use xorps/cmpeqd, but for example when register pressure is high, they may now be folded as memory operands, which reduces register pressure. Also, mark V_SET0 isAsCheapAsAMove so that two-address-elimination will remat it instead of copying zeros around (V_SETALLONES was already marked). llvm-svn: 60461 2008-12-03 13:21:24 +08:00			`%tmp82 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul307, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]`
			`%bitcast11.i15 = bitcast <4 x float> %tmp82 to <4 x i32> ; <<4 x i32>> [#uses=1]`
			`%andnps.i17 = and <4 x i32> %bitcast11.i15, zeroinitializer ; <<4 x i32>> [#uses=1]`
			`%orps.i18 = or <4 x i32> %andnps.i17, zeroinitializer ; <<4 x i32>> [#uses=1]`
			`%bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float> ; <<4 x float>> [#uses=1]`
			`%tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]`
			`%bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32> ; <<4 x i32>> [#uses=1]`
			`%bitcast6.i4 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=2]`
			`%andps.i5 = and <4 x i32> %bitcast.i3, %bitcast6.i4 ; <<4 x i32>> [#uses=1]`
			`%bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32> ; <<4 x i32>> [#uses=1]`
			`%not.i7 = xor <4 x i32> %bitcast6.i4, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]`
			`%andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7 ; <<4 x i32>> [#uses=1]`
			`%orps.i9 = or <4 x i32> %andnps.i8, %andps.i5 ; <<4 x i32>> [#uses=1]`
			`%bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float> ; <<4 x float>> [#uses=1]`
			`%bitcast.i = bitcast <4 x float> %mul313 to <4 x i32> ; <<4 x i32>> [#uses=1]`
			`%andps.i = and <4 x i32> %bitcast.i, zeroinitializer ; <<4 x i32>> [#uses=1]`
			`%orps.i = or <4 x i32> zeroinitializer, %andps.i ; <<4 x i32>> [#uses=1]`
			`%bitcast17.i = bitcast <4 x i32> %orps.i to <4 x float> ; <<4 x float>> [#uses=1]`
			`call void null(<4 x float> %bitcast17.i19, <4 x float> %bitcast17.i10, <4 x float> %bitcast17.i, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind`
			`unreachable`

			`afterfor: ; preds = %forcond`
			`ret void`
			`}`

			`declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone`

			`declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone`

			`declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone`

			`declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone`

			`declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone`