llvm-project/llvm/test/CodeGen/X86/avx-vzeroupper.ll

; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck --check-prefix=FAST-YMM-ZMM %s
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck --check-prefix=BTVER2 %s

; FAST-YMM-ZMM-NOT: vzeroupper
; BTVER2-NOT: vzeroupper

; External function with no arguments returning i32; called from the
; while.cond loop of test03.
declare i32 @foo()
; External callee that takes and returns a 128-bit vector (<4 x float>).
declare <4 x float> @do_sse(<4 x float>)
; External callee that takes and returns a 256-bit vector (<8 x float>).
declare <8 x float> @do_avx(<8 x float>)
; AVX intrinsic used by test02 and test03 to obtain a <4 x float> from an
; <8 x float>; the i8 operand is the immediate selecting which half.
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
; 128-bit global that test01 loads from and stores to.
@x = common global <4 x float> zeroinitializer, align 16
; 256-bit global that test03 loads inside its for.body loop.
@g = common global <8 x float> zeroinitializer, align 32

;; Basic checking - don't emit any vzeroupper instruction.
;; Only 128-bit vectors are involved: a <4 x float> fadd followed by a call to
;; @do_sse, whose result is returned.

; Anchor on the function's own label so the checks below are confined to the
; assembly emitted for this function.
; CHECK-LABEL: _test00:
define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
  ; No vzeroupper is expected anywhere between the label and the return.
  ; CHECK-NOT: vzeroupper
  %add.i = fadd <4 x float> %a, %b
  %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
  ; CHECK: ret
  ret <4 x float> %call3
}

;; Check 256-bit parameter passing.
;; %c is an <8 x float> argument that is returned unchanged, while the body
;; makes two chained calls to @do_sse on 128-bit data loaded from / stored to @x.

; Anchor on the function's own label so the checks below are confined to the
; assembly emitted for this function.
; CHECK-LABEL: _test01:
define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp {
entry:
  %tmp = load <4 x float>, <4 x float>* @x, align 16
  ; A vzeroupper is expected immediately before the first call.
  ; CHECK: vzeroupper
  ; CHECK-NEXT: callq _do_sse
  %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
  store <4 x float> %call, <4 x float>* @x, align 16
  ; No further vzeroupper is expected between the first and second call.
  ; CHECK-NOT: vzeroupper
  ; CHECK: callq _do_sse
  %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
  store <4 x float> %call2, <4 x float>* @x, align 16
  ; CHECK: ret
  ret <8 x float> %c
}

;; Check that vzeroupper is emitted for tail calls.
;; The function performs a 256-bit fadd, narrows the result to <4 x float> via
;; the vextractf128 intrinsic (immediate 0), and tail-calls @do_sse with it.

; Anchor on the function's own label so the checks below are confined to the
; assembly emitted for this function.
; CHECK-LABEL: _test02:
define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind uwtable ssp {
entry:
  %add.i = fadd <8 x float> %a, %b
  %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
  ; The call is expected to be emitted as a jump, preceded by a vzeroupper.
  ; CHECK: vzeroupper
  ; CHECK: jmp _do_sse
  %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind
  ret <4 x float> %call3
}

;; Test the pass convergence and also that vzeroupper is only issued when necessary,
;; for this function it should be only once.
;; Control flow: while.cond repeatedly calls @foo until it returns 0; for.body
;; then iterates until its counter reaches 4. Each for.body iteration makes two
;; calls to @do_sse on 128-bit data, loads the 256-bit global @g, narrows it with
;; the vextractf128 intrinsic (immediate 1), and calls @do_sse a third time.

; Anchor on the function's own label so the checks below are confined to the
; assembly emitted for this function.
; CHECK-LABEL: _test03:
define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
  %add.i = fadd <4 x float> %a, %b
  br label %while.cond

while.cond:
  %call = tail call i32 @foo()
  %tobool = icmp eq i32 %call, 0
  br i1 %tobool, label %for.body, label %while.cond

for.body:
  ; From the first local block label up to the first @do_sse call, no
  ; vzeroupper is expected.
  ; CHECK: LBB
  ; CHECK-NOT: vzeroupper
  %i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ]
  %c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ]
  ; CHECK: callq _do_sse
  %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
  ; The second call must directly follow the first (nothing in between).
  ; CHECK-NEXT: callq _do_sse
  %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
  %tmp11 = load <8 x float>, <8 x float>* @g, align 32
  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
  ; After the 256-bit load/extract, a vzeroupper is expected immediately
  ; before the third call.
  ; CHECK: vzeroupper
  ; CHECK-NEXT: callq _do_sse
  %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
  %1 = add nsw i32 %i.018, 1
  %exitcond = icmp eq i32 %1, 4
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret <4 x float> %call14
}

;; Check that we also perform vzeroupper when we return from a function.
;; The two <4 x float> arguments are concatenated into an <8 x float>, passed to
;; @do_avx, and the first four lanes of the result are returned as <4 x float>.

; Anchor on the function's own label so the checks below are confined to the
; assembly emitted for this function.
; CHECK-LABEL: _test04:
define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
  %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; No vzeroupper is expected before the call, whose argument is 256 bits wide.
  ; CHECK-NOT: vzeroupper
  ; CHECK: callq _do_avx
  %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
  %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; A vzeroupper is expected between the call and the return.
  ; CHECK: vzeroupper
  ; CHECK: ret
  ret <4 x float> %shuf2
}
fixed to test only the feature, not the feature and a CPU llvm-svn: 231516 2015-03-07 04:58:15 +08:00			`; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx \| FileCheck %s`
[X86] Generate VZEROUPPER for Skylake-avx512. VZEROUPPER should not be issued on Knights Landing (KNL), but on Skylake-avx512 it should be. Differential Revision: https://reviews.llvm.org/D29874 llvm-svn: 296859 2017-03-03 17:03:24 +08:00			`; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx512f \| FileCheck %s`
			`; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-or-zmm-write \| FileCheck --check-prefix=FAST-YMM-ZMM %s`
Disable the vzeroupper insertion pass on PS4. Differential Revision: http://reviews.llvm.org/D16837 llvm-svn: 260764 2016-02-13 07:37:57 +08:00			`; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 \| FileCheck --check-prefix=BTVER2 %s`

[X86] Generate VZEROUPPER for Skylake-avx512. VZEROUPPER should not be issued on Knights Landing (KNL), but on Skylake-avx512 it should be. Differential Revision: https://reviews.llvm.org/D29874 llvm-svn: 296859 2017-03-03 17:03:24 +08:00			`; FAST-YMM-ZMM-NOT: vzeroupper`
Disable the vzeroupper insertion pass on PS4. Differential Revision: http://reviews.llvm.org/D16837 llvm-svn: 260764 2016-02-13 07:37:57 +08:00			`; BTVER2-NOT: vzeroupper`
Introduce a pass to insert vzeroupper instructions to avoid AVX to SSE transition penalty. The pass is enabled through the "x86-use-vzeroupper" llc command line option. This is only the first step (very naive and conservative one) to sketch out the idea, but proper DFA is coming next to allow smarter decisions. Comments and ideas now and in further commits will be very appreciated. llvm-svn: 138317 2011-08-23 09:14:17 +08:00
[X86] New and improved VZeroUpperInserter optimization. - Adds support for inserting vzerouppers before tail-calls. This is enabled implicitly by having MachineInstr::copyImplicitOps preserve regmask operands, which allows VZeroUpperInserter to see where tail-calls use vector registers. - Fixes a bug that caused the previous version of this optimization to miss some vzeroupper insertion points in loops. (Loops-with-vector-code that followed loops-without-vector-code were mistakenly overlooked by the previous version). - New algorithm never revisits instructions. Fixes <rdar://problem/16228798> llvm-svn: 204021 2014-03-17 09:22:54 +08:00			`declare i32 @foo()`
Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00			`declare <4 x float> @do_sse(<4 x float>)`
			`declare <8 x float> @do_avx(<8 x float>)`
			`declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone`
			`@x = common global <4 x float> zeroinitializer, align 16`
			`@g = common global <8 x float> zeroinitializer, align 32`

			`;; Basic checking - don't emit any vzeroupper instruction`
Introduce a pass to insert vzeroupper instructions to avoid AVX to SSE transition penalty. The pass is enabled through the "x86-use-vzeroupper" llc command line option. This is only the first step (very naive and conservative one) to sketch out the idea, but proper DFA is coming next to allow smarter decisions. Comments and ideas now and in further commits will be very appreciated. llvm-svn: 138317 2011-08-23 09:14:17 +08:00
			`; CHECK: _test00`
			`define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {`
			`entry:`
Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00			`; CHECK-NOT: vzeroupper`
Introduce a pass to insert vzeroupper instructions to avoid AVX to SSE transition penalty. The pass is enabled through the "x86-use-vzeroupper" llc command line option. This is only the first step (very naive and conservative one) to sketch out the idea, but proper DFA is coming next to allow smarter decisions. Comments and ideas now and in further commits will be very appreciated. llvm-svn: 138317 2011-08-23 09:14:17 +08:00			`%add.i = fadd <4 x float> %a, %b`
Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00			`%call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind`
			`; CHECK: ret`
			`ret <4 x float> %call3`
			`}`

			`;; Check parameter 256-bit parameter passing`

			`; CHECK: _test01`
			`define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp {`
			`entry:`
[opaque pointer type] Add textual IR support for explicit type parameter to load instruction Essentially the same as the GEP change in r230786. A similar migration script can be used to update test cases, though a few more test case improvements/changes were required this time around: (r229269-r229278) import fileinput import sys import re pat = re.compile(r"((?:=\|:\|^)\sload (?:atomic )?(?:volatile )?(.?))(\| addrspace\(\d+\) )\($\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$)") for line in sys.stdin: sys.stdout.write(re.sub(pat, r"\1, \2\3*\4", line)) Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7649 llvm-svn: 230794 2015-02-28 05:17:42 +08:00			`%tmp = load <4 x float>, <4 x float>* @x, align 16`
Introduce a pass to insert vzeroupper instructions to avoid AVX to SSE transition penalty. The pass is enabled through the "x86-use-vzeroupper" llc command line option. This is only the first step (very naive and conservative one) to sketch out the idea, but proper DFA is coming next to allow smarter decisions. Comments and ideas now and in further commits will be very appreciated. llvm-svn: 138317 2011-08-23 09:14:17 +08:00			`; CHECK: vzeroupper`
			`; CHECK-NEXT: callq _do_sse`
Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00			`%call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind`
			`store <4 x float> %call, <4 x float>* @x, align 16`
Introduce a pass to insert vzeroupper instructions to avoid AVX to SSE transition penalty. The pass is enabled through the "x86-use-vzeroupper" llc command line option. This is only the first step (very naive and conservative one) to sketch out the idea, but proper DFA is coming next to allow smarter decisions. Comments and ideas now and in further commits will be very appreciated. llvm-svn: 138317 2011-08-23 09:14:17 +08:00			`; CHECK-NOT: vzeroupper`
Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00			`; CHECK: callq _do_sse`
			`%call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind`
			`store <4 x float> %call2, <4 x float>* @x, align 16`
			`; CHECK: ret`
			`ret <8 x float> %c`
			`}`

[X86] New and improved VZeroUpperInserter optimization. - Adds support for inserting vzerouppers before tail-calls. This is enabled implicitly by having MachineInstr::copyImplicitOps preserve regmask operands, which allows VZeroUpperInserter to see where tail-calls use vector registers. - Fixes a bug that caused the previous version of this optimization to miss some vzeroupper insertion points in loops. (Loops-with-vector-code that followed loops-without-vector-code were mistakenly overlooked by the previous version). - New algorithm never revisits instructions. Fixes <rdar://problem/16228798> llvm-svn: 204021 2014-03-17 09:22:54 +08:00			`;; Check that vzeroupper is emitted for tail calls.`

			`; CHECK: _test02`
			`define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind uwtable ssp {`
			`entry:`
			`%add.i = fadd <8 x float> %a, %b`
			`%add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)`
			`; CHECK: vzeroupper`
			`; CHECK: jmp _do_sse`
			`%call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind`
			`ret <4 x float> %call3`
			`}`

Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00			`;; Test the pass convergence and also that vzeroupper is only issued when necessary,`
			`;; for this function it should be only once`

[X86] New and improved VZeroUpperInserter optimization. - Adds support for inserting vzerouppers before tail-calls. This is enabled implicitly by having MachineInstr::copyImplicitOps preserve regmask operands, which allows VZeroUpperInserter to see where tail-calls use vector registers. - Fixes a bug that caused the previous version of this optimization to miss some vzeroupper insertion points in loops. (Loops-with-vector-code that followed loops-without-vector-code were mistakenly overlooked by the previous version). - New algorithm never revisits instructions. Fixes <rdar://problem/16228798> llvm-svn: 204021 2014-03-17 09:22:54 +08:00			`; CHECK: _test03`
			`define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {`
Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00			`entry:`
			`%add.i = fadd <4 x float> %a, %b`
[X86] New and improved VZeroUpperInserter optimization. - Adds support for inserting vzerouppers before tail-calls. This is enabled implicitly by having MachineInstr::copyImplicitOps preserve regmask operands, which allows VZeroUpperInserter to see where tail-calls use vector registers. - Fixes a bug that caused the previous version of this optimization to miss some vzeroupper insertion points in loops. (Loops-with-vector-code that followed loops-without-vector-code were mistakenly overlooked by the previous version). - New algorithm never revisits instructions. Fixes <rdar://problem/16228798> llvm-svn: 204021 2014-03-17 09:22:54 +08:00			`br label %while.cond`
Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00
[X86] New and improved VZeroUpperInserter optimization. - Adds support for inserting vzerouppers before tail-calls. This is enabled implicitly by having MachineInstr::copyImplicitOps preserve regmask operands, which allows VZeroUpperInserter to see where tail-calls use vector registers. - Fixes a bug that caused the previous version of this optimization to miss some vzeroupper insertion points in loops. (Loops-with-vector-code that followed loops-without-vector-code were mistakenly overlooked by the previous version). - New algorithm never revisits instructions. Fixes <rdar://problem/16228798> llvm-svn: 204021 2014-03-17 09:22:54 +08:00			`while.cond:`
			`%call = tail call i32 @foo()`
			`%tobool = icmp eq i32 %call, 0`
			`br i1 %tobool, label %for.body, label %while.cond`

			`for.body:`
Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00			`; CHECK: LBB`
			`; CHECK-NOT: vzeroupper`
[X86] New and improved VZeroUpperInserter optimization. - Adds support for inserting vzerouppers before tail-calls. This is enabled implicitly by having MachineInstr::copyImplicitOps preserve regmask operands, which allows VZeroUpperInserter to see where tail-calls use vector registers. - Fixes a bug that caused the previous version of this optimization to miss some vzeroupper insertion points in loops. (Loops-with-vector-code that followed loops-without-vector-code were mistakenly overlooked by the previous version). - New algorithm never revisits instructions. Fixes <rdar://problem/16228798> llvm-svn: 204021 2014-03-17 09:22:54 +08:00			`%i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ]`
			`%c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ]`
Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00			`; CHECK: callq _do_sse`
			`%call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind`
			`; CHECK-NEXT: callq _do_sse`
			`%call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind`
[opaque pointer type] Add textual IR support for explicit type parameter to load instruction Essentially the same as the GEP change in r230786. A similar migration script can be used to update test cases, though a few more test case improvements/changes were required this time around: (r229269-r229278) import fileinput import sys import re pat = re.compile(r"((?:=\|:\|^)\sload (?:atomic )?(?:volatile )?(.?))(\| addrspace\(\d+\) )\($\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$)") for line in sys.stdin: sys.stdout.write(re.sub(pat, r"\1, \2\3*\4", line)) Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7649 llvm-svn: 230794 2015-02-28 05:17:42 +08:00			`%tmp11 = load <8 x float>, <8 x float>* @g, align 32`
Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00			`%0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind`
Introduce a pass to insert vzeroupper instructions to avoid AVX to SSE transition penalty. The pass is enabled through the "x86-use-vzeroupper" llc command line option. This is only the first step (very naive and conservative one) to sketch out the idea, but proper DFA is coming next to allow smarter decisions. Comments and ideas now and in further commits will be very appreciated. llvm-svn: 138317 2011-08-23 09:14:17 +08:00			`; CHECK: vzeroupper`
Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00			`; CHECK-NEXT: callq _do_sse`
			`%call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind`
			`%1 = add nsw i32 %i.018, 1`
			`%exitcond = icmp eq i32 %1, 4`
			`br i1 %exitcond, label %for.end, label %for.body`

[X86] New and improved VZeroUpperInserter optimization. - Adds support for inserting vzerouppers before tail-calls. This is enabled implicitly by having MachineInstr::copyImplicitOps preserve regmask operands, which allows VZeroUpperInserter to see where tail-calls use vector registers. - Fixes a bug that caused the previous version of this optimization to miss some vzeroupper insertion points in loops. (Loops-with-vector-code that followed loops-without-vector-code were mistakenly overlooked by the previous version). - New algorithm never revisits instructions. Fixes <rdar://problem/16228798> llvm-svn: 204021 2014-03-17 09:22:54 +08:00			`for.end:`
Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00			`ret <4 x float> %call14`
Introduce a pass to insert vzeroupper instructions to avoid AVX to SSE transition penalty. The pass is enabled through the "x86-use-vzeroupper" llc command line option. This is only the first step (very naive and conservative one) to sketch out the idea, but proper DFA is coming next to allow smarter decisions. Comments and ideas now and in further commits will be very appreciated. llvm-svn: 138317 2011-08-23 09:14:17 +08:00			`}`

Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00			`;; Check that we also perform vzeroupper when we return from a function.`

[X86] New and improved VZeroUpperInserter optimization. - Adds support for inserting vzerouppers before tail-calls. This is enabled implicitly by having MachineInstr::copyImplicitOps preserve regmask operands, which allows VZeroUpperInserter to see where tail-calls use vector registers. - Fixes a bug that caused the previous version of this optimization to miss some vzeroupper insertion points in loops. (Loops-with-vector-code that followed loops-without-vector-code were mistakenly overlooked by the previous version). - New algorithm never revisits instructions. Fixes <rdar://problem/16228798> llvm-svn: 204021 2014-03-17 09:22:54 +08:00			`; CHECK: _test04`
			`define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {`
Enhanced vzeroupper insertion pass that avoids inserting vzeroupper where it is unnecessary through local analysis. Patch from Bruno Cardoso Lopes, with some additional changes. I'm going to wait for any review comments and perform some additional testing before turning this on by default. llvm-svn: 143750 2011-11-05 07:46:11 +08:00			`entry:`
			`%shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>`
			`; CHECK-NOT: vzeroupper`
			`; CHECK: call`
			`%call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind`
			`%shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>`
			`; CHECK: vzeroupper`
			`; CHECK: ret`
			`ret <4 x float> %shuf2`
			`}`