llvm-project/llvm/test/CodeGen/X86/avx-vzeroupper.ll

109 lines
3.9 KiB
LLVM

; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck --check-prefix=FAST-YMM-ZMM %s
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck --check-prefix=BTVER2 %s
; FAST-YMM-ZMM-NOT: vzeroupper
; BTVER2-NOT: vzeroupper
declare i32 @foo()
declare <4 x float> @do_sse(<4 x float>)
declare <8 x float> @do_avx(<8 x float>)
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
@x = common global <4 x float> zeroinitializer, align 16
@g = common global <8 x float> zeroinitializer, align 32
;; Basic checking - don't emit any vzeroupper instruction
; CHECK: _test00
define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
; CHECK-NOT: vzeroupper
%add.i = fadd <4 x float> %a, %b
%call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
; CHECK: ret
ret <4 x float> %call3
}
;; Check parameter 256-bit parameter passing
; CHECK: _test01
define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp {
entry:
%tmp = load <4 x float>, <4 x float>* @x, align 16
; CHECK: vzeroupper
; CHECK-NEXT: callq _do_sse
%call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
store <4 x float> %call, <4 x float>* @x, align 16
; CHECK-NOT: vzeroupper
; CHECK: callq _do_sse
%call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
store <4 x float> %call2, <4 x float>* @x, align 16
; CHECK: ret
ret <8 x float> %c
}
;; Check that vzeroupper is emitted for tail calls.
; CHECK: _test02
define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind uwtable ssp {
entry:
%add.i = fadd <8 x float> %a, %b
%add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
; CHECK: vzeroupper
; CHECK: jmp _do_sse
%call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind
ret <4 x float> %call3
}
;; Test the pass convergence and also that vzeroupper is only issued when necessary,
;; for this function it should be only once
; CHECK: _test03
define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
%add.i = fadd <4 x float> %a, %b
br label %while.cond
while.cond:
%call = tail call i32 @foo()
%tobool = icmp eq i32 %call, 0
br i1 %tobool, label %for.body, label %while.cond
for.body:
; CHECK: LBB
; CHECK-NOT: vzeroupper
%i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ]
%c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ]
; CHECK: callq _do_sse
%call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
; CHECK-NEXT: callq _do_sse
%call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
%tmp11 = load <8 x float>, <8 x float>* @g, align 32
%0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
; CHECK: vzeroupper
; CHECK-NEXT: callq _do_sse
%call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
%1 = add nsw i32 %i.018, 1
%exitcond = icmp eq i32 %1, 4
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret <4 x float> %call14
}
;; Check that we also perform vzeroupper when we return from a function.
; CHECK: _test04
define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
%shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NOT: vzeroupper
; CHECK: call
%call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
%shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK: vzeroupper
; CHECK: ret
ret <4 x float> %shuf2
}