Automatically generate AVX512 test cases. NFC
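The CHECK lines below were not written by hand; they were regenerated with utils/update_llc_test_checks.py. As a minimal sketch of the kind of invocation involved, assuming a locally built llc (the exact command, build directory, and test paths are not recorded in this commit; the ones below are illustrative):

    # Rewrite the autogenerated CHECK lines of the listed test in place.
    # --llc-binary points the script at the llc used to run the RUN lines.
    utils/update_llc_test_checks.py --llc-binary build/bin/llc \
        llvm/test/CodeGen/X86/avx512-intel-ocl.ll

The script runs each test's RUN line, captures the assembly, and emits the CHECK/CHECK-NEXT lines seen in this diff, scrubbing stack offsets into patterns like {{[-0-9]+}}(%r{{[sb]}}p).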

llvm-svn: 369264
Amaury Sechet 2019-08-19 14:34:08 +00:00
parent 9d5e8a476f
commit 8130154115
9 changed files with 255 additions and 199 deletions


@@ -11,12 +11,12 @@ define void @bar__512(<16 x i32>* %var) #0 {
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: vmovups (%rdi), %zmm0
; CHECK-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill
-; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %zmm1
+; CHECK-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; CHECK-NEXT: vmovaps %zmm1, (%rdi)
; CHECK-NEXT: callq _Print__512
; CHECK-NEXT: vmovups (%rsp), %zmm0 ## 64-byte Reload
; CHECK-NEXT: callq _Print__512
-; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %zmm0
+; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; CHECK-NEXT: vmovaps %zmm0, (%rbx)
; CHECK-NEXT: addq $112, %rsp
; CHECK-NEXT: popq %rbx


@@ -9,7 +9,7 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>
define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss_load0(<4 x float>* %x0ptr, <4 x float> %x1, <4 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_load0:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm1 = (xmm0 * mem) + xmm1
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%x0 = load <4 x float>, <4 x float>* %x0ptr
@@ -20,7 +20,7 @@ define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss_load0(<4 x float>* %x0pt
define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss_load1(<4 x float> %x0, <4 x float>* %x1ptr, <4 x float> %x2){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_load1:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm1 = (xmm0 * mem) + xmm1
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%x1 = load <4 x float>, <4 x float>* %x1ptr
@@ -31,7 +31,7 @@ define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss_load1(<4 x float> %x0, <
define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd_load0(<2 x double>* %x0ptr, <2 x double> %x1, <2 x double> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd_load0:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmadd231sd (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vfmadd231sd {{.*#+}} xmm1 = (xmm0 * mem) + xmm1
; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: retq
%x0 = load <2 x double>, <2 x double>* %x0ptr
@@ -42,7 +42,7 @@ define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd_load0(<2 x double>* %x0
define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd_load1(<2 x double> %x0, <2 x double>* %x1ptr, <2 x double> %x2){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd_load1:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmadd231sd (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vfmadd231sd {{.*#+}} xmm1 = (xmm0 * mem) + xmm1
; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: retq
%x1 = load <2 x double>, <2 x double>* %x1ptr
@@ -53,7 +53,7 @@ define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd_load1(<2 x double> %x0,
define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss_load0(<4 x float>* %x0ptr, <4 x float> %x1, <4 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss_load0:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmsub231ss (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm1 = (xmm0 * mem) - xmm1
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%x0 = load <4 x float>, <4 x float>* %x0ptr
@@ -64,7 +64,7 @@ define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss_load0(<4 x float>* %x0pt
define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss_load1(<4 x float> %x0, <4 x float>* %x1ptr, <4 x float> %x2){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss_load1:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmsub231ss (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm1 = (xmm0 * mem) - xmm1
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%x1 = load <4 x float>, <4 x float>* %x1ptr
@@ -75,7 +75,7 @@ define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss_load1(<4 x float> %x0, <
define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd_load0(<2 x double>* %x0ptr, <2 x double> %x1, <2 x double> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd_load0:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vfmsub231sd {{.*#+}} xmm1 = (xmm0 * mem) - xmm1
; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: retq
%x0 = load <2 x double>, <2 x double>* %x0ptr
@@ -86,7 +86,7 @@ define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd_load0(<2 x double>* %x0
define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd_load1(<2 x double> %x0, <2 x double>* %x1ptr, <2 x double> %x2){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd_load1:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vfmsub231sd {{.*#+}} xmm1 = (xmm0 * mem) - xmm1
; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: retq
%x1 = load <2 x double>, <2 x double>* %x1ptr


@@ -1,10 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-;CHECK-LABEL: test
-;CHECK-NOT: dec
-;CHECK-NOT: enc
-;CHECK: ret
define i32 @test(i32 %a, i32 %b) {
+; CHECK-LABEL: test:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: leal -1(%rdi), %eax
+; CHECK-NEXT: addl $1, %esi
+; CHECK-NEXT: imull %esi, %eax
+; CHECK-NEXT: retq
%a1 = add i32 %a, -1
%b1 = add i32 %b, 1
%res = mul i32 %a1, %b1


@@ -94,12 +94,12 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-64, %esp
; X32-NEXT: subl $256, %esp ## imm = 0x100
-; X32-NEXT: vmovaps %zmm1, {{[0-9]+}}(%esp) ## 64-byte Spill
+; X32-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill
; X32-NEXT: vaddps %zmm1, %zmm0, %zmm0
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: calll _func_float16_ptr
-; X32-NEXT: vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0 ## 64-byte Folded Reload
+; X32-NEXT: vaddps {{[-0-9]+}}(%e{{[sb]}}p), %zmm0, %zmm0 ## 64-byte Folded Reload
; X32-NEXT: vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
@@ -186,52 +186,52 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
; WIN64-KNL-NEXT: pushq %rbp
; WIN64-KNL-NEXT: subq $1264, %rsp # imm = 0x4F0
; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
-; WIN64-KNL-NEXT: kmovw %k7, 1134(%rbp) # 2-byte Spill
-; WIN64-KNL-NEXT: kmovw %k6, 1132(%rbp) # 2-byte Spill
-; WIN64-KNL-NEXT: kmovw %k5, 1130(%rbp) # 2-byte Spill
-; WIN64-KNL-NEXT: kmovw %k4, 1128(%rbp) # 2-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm21, 1024(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm20, 960(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm19, 896(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm18, 832(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm17, 768(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm16, 704(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm15, 640(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm14, 576(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm13, 512(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm12, 448(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm11, 384(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm10, 320(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm9, 256(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm8, 192(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm7, 128(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT: vmovaps %zmm6, 64(%rbp) # 64-byte Spill
+; WIN64-KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; WIN64-KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; WIN64-KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; WIN64-KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT: vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT: andq $-64, %rsp
; WIN64-KNL-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp)
; WIN64-KNL-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
; WIN64-KNL-NEXT: callq func_float16
-; WIN64-KNL-NEXT: vmovaps 64(%rbp), %zmm6 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 128(%rbp), %zmm7 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 192(%rbp), %zmm8 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 256(%rbp), %zmm9 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 320(%rbp), %zmm10 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 384(%rbp), %zmm11 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 448(%rbp), %zmm12 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 512(%rbp), %zmm13 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 576(%rbp), %zmm14 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 640(%rbp), %zmm15 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 704(%rbp), %zmm16 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 768(%rbp), %zmm17 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 832(%rbp), %zmm18 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 896(%rbp), %zmm19 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 960(%rbp), %zmm20 # 64-byte Reload
-; WIN64-KNL-NEXT: vmovaps 1024(%rbp), %zmm21 # 64-byte Reload
-; WIN64-KNL-NEXT: kmovw 1128(%rbp), %k4 # 2-byte Reload
-; WIN64-KNL-NEXT: kmovw 1130(%rbp), %k5 # 2-byte Reload
-; WIN64-KNL-NEXT: kmovw 1132(%rbp), %k6 # 2-byte Reload
-; WIN64-KNL-NEXT: kmovw 1134(%rbp), %k7 # 2-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; WIN64-KNL-NEXT: leaq 1136(%rbp), %rsp
; WIN64-KNL-NEXT: popq %rbp
; WIN64-KNL-NEXT: retq
@@ -241,52 +241,52 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
; WIN64-SKX-NEXT: pushq %rbp
; WIN64-SKX-NEXT: subq $1264, %rsp # imm = 0x4F0
; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
-; WIN64-SKX-NEXT: kmovq %k7, 1128(%rbp) # 8-byte Spill
-; WIN64-SKX-NEXT: kmovq %k6, 1120(%rbp) # 8-byte Spill
-; WIN64-SKX-NEXT: kmovq %k5, 1112(%rbp) # 8-byte Spill
-; WIN64-SKX-NEXT: kmovq %k4, 1104(%rbp) # 8-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm21, 1024(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm20, 960(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm19, 896(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm18, 832(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm17, 768(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm16, 704(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm15, 640(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm14, 576(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm13, 512(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm12, 448(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm11, 384(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm10, 320(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm9, 256(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm8, 192(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm7, 128(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT: vmovaps %zmm6, 64(%rbp) # 64-byte Spill
+; WIN64-SKX-NEXT: kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-SKX-NEXT: kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-SKX-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-SKX-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT: vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT: andq $-64, %rsp
; WIN64-SKX-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp)
; WIN64-SKX-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
; WIN64-SKX-NEXT: callq func_float16
-; WIN64-SKX-NEXT: vmovaps 64(%rbp), %zmm6 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 128(%rbp), %zmm7 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 192(%rbp), %zmm8 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 256(%rbp), %zmm9 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 320(%rbp), %zmm10 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 384(%rbp), %zmm11 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 448(%rbp), %zmm12 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 512(%rbp), %zmm13 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 576(%rbp), %zmm14 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 640(%rbp), %zmm15 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 704(%rbp), %zmm16 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 768(%rbp), %zmm17 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 832(%rbp), %zmm18 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 896(%rbp), %zmm19 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 960(%rbp), %zmm20 # 64-byte Reload
-; WIN64-SKX-NEXT: vmovaps 1024(%rbp), %zmm21 # 64-byte Reload
-; WIN64-SKX-NEXT: kmovq 1104(%rbp), %k4 # 8-byte Reload
-; WIN64-SKX-NEXT: kmovq 1112(%rbp), %k5 # 8-byte Reload
-; WIN64-SKX-NEXT: kmovq 1120(%rbp), %k6 # 8-byte Reload
-; WIN64-SKX-NEXT: kmovq 1128(%rbp), %k7 # 8-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
+; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
+; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 8-byte Reload
+; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; WIN64-SKX-NEXT: leaq 1136(%rbp), %rsp
; WIN64-SKX-NEXT: popq %rbp
; WIN64-SKX-NEXT: retq
@@ -296,47 +296,47 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
; X64-KNL-NEXT: pushq %rsi
; X64-KNL-NEXT: pushq %rdi
; X64-KNL-NEXT: subq $1064, %rsp ## imm = 0x428
-; X64-KNL-NEXT: kmovw %k7, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; X64-KNL-NEXT: kmovw %k6, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; X64-KNL-NEXT: kmovw %k5, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; X64-KNL-NEXT: kmovw %k4, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; X64-KNL-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT: vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT: vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT: vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT: vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT: vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT: vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT: vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT: vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT: vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT: vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT: vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT: vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill
+; X64-KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; X64-KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; X64-KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; X64-KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; X64-KNL-NEXT: vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT: vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT: vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT: vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT: vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT: vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT: vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT: vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT: vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT: vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT: vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT: vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT: vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT: vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT: vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm16, (%rsp) ## 64-byte Spill
; X64-KNL-NEXT: callq _func_float16
; X64-KNL-NEXT: vmovups (%rsp), %zmm16 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload
-; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload
-; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k4 ## 2-byte Reload
-; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k5 ## 2-byte Reload
-; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k6 ## 2-byte Reload
-; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k7 ## 2-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload
+; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload
+; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
+; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
; X64-KNL-NEXT: addq $1064, %rsp ## imm = 0x428
; X64-KNL-NEXT: popq %rdi
; X64-KNL-NEXT: popq %rsi
@@ -347,47 +347,47 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
; X64-SKX-NEXT: pushq %rsi
; X64-SKX-NEXT: pushq %rdi
; X64-SKX-NEXT: subq $1064, %rsp ## imm = 0x428
-; X64-SKX-NEXT: kmovq %k7, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; X64-SKX-NEXT: kmovq %k6, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; X64-SKX-NEXT: kmovq %k5, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; X64-SKX-NEXT: kmovq %k4, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; X64-SKX-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT: vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT: vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT: vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT: vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT: vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT: vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT: vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT: vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT: vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT: vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT: vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT: vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill
+; X64-SKX-NEXT: kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-SKX-NEXT: kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-SKX-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-SKX-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-SKX-NEXT: vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT: vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT: vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT: vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT: vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT: vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT: vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT: vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT: vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT: vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT: vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT: vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT: vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT: vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT: vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm16, (%rsp) ## 64-byte Spill
; X64-SKX-NEXT: callq _func_float16
; X64-SKX-NEXT: vmovups (%rsp), %zmm16 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload
-; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload
-; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k4 ## 8-byte Reload
-; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k5 ## 8-byte Reload
-; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k6 ## 8-byte Reload
-; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k7 ## 8-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload
+; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload
+; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 8-byte Reload
+; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 8-byte Reload
+; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 8-byte Reload
+; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 8-byte Reload
; X64-SKX-NEXT: addq $1064, %rsp ## imm = 0x428
; X64-SKX-NEXT: popq %rdi
; X64-SKX-NEXT: popq %rsi


@@ -10,9 +10,9 @@ define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) {
; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; CHECK-NEXT: callq _f
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; CHECK-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
; CHECK-NEXT: vpmovm2d %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -32,10 +32,10 @@ define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
; CHECK-NEXT: korb %k1, %k0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; CHECK-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -55,10 +55,10 @@ define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; CHECK-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
; CHECK-NEXT: vpmovm2b %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -77,10 +77,10 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
; CHECK-NEXT: kord %k1, %k0, %k0
-; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; CHECK-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
-; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload
+; CHECK-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 4-byte Reload
; CHECK-NEXT: vpmovm2b %k0, %ymm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq


@@ -58,7 +58,7 @@ define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %a, <2 x do
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
; CHECK-NEXT: retq
%c.val = load double, double* %c
%cv0 = insertelement <2 x double> undef, double %c.val, i32 0


@@ -1,33 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw | FileCheck %s
define i32 @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x double> %CC, <8 x i64> %E, <8 x i64> %EE, <16 x i32> %F, <16 x i32> %FF, <32 x i16> %G, <32 x i16> %GG, <64 x i8> %H, <64 x i8> %HH, i32 * %loadptr) {
-; CHECK: vmovntps %z
+; CHECK-LABEL: f:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-64, %rsp
+; CHECK-NEXT: subq $64, %rsp
+; CHECK-NEXT: vmovdqa64 144(%rbp), %zmm8
+; CHECK-NEXT: vmovdqa64 16(%rbp), %zmm9
+; CHECK-NEXT: movl (%rsi), %eax
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vmovntps %zmm0, (%rdi)
+; CHECK-NEXT: vpaddq %zmm5, %zmm4, %zmm0
+; CHECK-NEXT: addl (%rsi), %eax
+; CHECK-NEXT: vmovntdq %zmm0, (%rdi)
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: addl (%rsi), %eax
+; CHECK-NEXT: vmovntpd %zmm0, (%rdi)
+; CHECK-NEXT: vpaddd %zmm7, %zmm6, %zmm0
+; CHECK-NEXT: addl (%rsi), %eax
+; CHECK-NEXT: vmovntdq %zmm0, (%rdi)
+; CHECK-NEXT: vpaddw 80(%rbp), %zmm9, %zmm0
+; CHECK-NEXT: addl (%rsi), %eax
+; CHECK-NEXT: vmovntdq %zmm0, (%rdi)
+; CHECK-NEXT: vpaddb 208(%rbp), %zmm8, %zmm0
+; CHECK-NEXT: addl (%rsi), %eax
+; CHECK-NEXT: vmovntdq %zmm0, (%rdi)
+; CHECK-NEXT: addl (%rsi), %eax
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%v0 = load i32, i32* %loadptr, align 1
%cast = bitcast i8* %B to <16 x float>*
%A2 = fadd <16 x float> %A, %AA
store <16 x float> %A2, <16 x float>* %cast, align 64, !nontemporal !0
%v1 = load i32, i32* %loadptr, align 1
-; CHECK: vmovntdq %z
%cast1 = bitcast i8* %B to <8 x i64>*
%E2 = add <8 x i64> %E, %EE
store <8 x i64> %E2, <8 x i64>* %cast1, align 64, !nontemporal !0
%v2 = load i32, i32* %loadptr, align 1
-; CHECK: vmovntpd %z
%cast2 = bitcast i8* %B to <8 x double>*
%C2 = fadd <8 x double> %C, %CC
store <8 x double> %C2, <8 x double>* %cast2, align 64, !nontemporal !0
%v3 = load i32, i32* %loadptr, align 1
-; CHECK: vmovntdq %z
%cast3 = bitcast i8* %B to <16 x i32>*
%F2 = add <16 x i32> %F, %FF
store <16 x i32> %F2, <16 x i32>* %cast3, align 64, !nontemporal !0
%v4 = load i32, i32* %loadptr, align 1
-; CHECK: vmovntdq %z
%cast4 = bitcast i8* %B to <32 x i16>*
%G2 = add <32 x i16> %G, %GG
store <32 x i16> %G2, <32 x i16>* %cast4, align 64, !nontemporal !0
%v5 = load i32, i32* %loadptr, align 1
-; CHECK: vmovntdq %z
%cast5 = bitcast i8* %B to <64 x i8>*
%H2 = add <64 x i8> %H, %HH
store <64 x i8> %H2, <64 x i8>* %cast5, align 64, !nontemporal !0


@@ -8,7 +8,7 @@ define <4 x float>@test_var_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %
; CHECK-LABEL: test_var_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 %mask, i32 4)
ret < 4 x float> %res
@@ -18,7 +18,7 @@ define <4 x float>@test_var_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float>
; CHECK-LABEL: test_var_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 %mask, i32 4)
ret < 4 x float> %res
@@ -67,7 +67,7 @@ define <4 x float>@test_const2_maskz(<4 x float> %v0, <4 x float> %v1, <4 x floa
define <4 x float>@test_const_allone_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const_allone_mask:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4)
ret < 4 x float> %res
@@ -76,7 +76,7 @@ define <4 x float>@test_const_allone_mask(<4 x float> %v0, <4 x float> %v1, <4 x
define <4 x float>@test_const_allone_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const_allone_maskz:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4)
ret < 4 x float> %res
@@ -85,7 +85,7 @@ define <4 x float>@test_const_allone_maskz(<4 x float> %v0, <4 x float> %v1, <4
define <4 x float>@test_const_3_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const_3_mask:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4)
ret < 4 x float> %res
@@ -94,7 +94,7 @@ define <4 x float>@test_const_3_mask(<4 x float> %v0, <4 x float> %v1, <4 x floa
define <4 x float>@test_const_3_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const_3_maskz:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4)
ret < 4 x float> %res


@@ -1,17 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s
define i32 @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x double> %CC, i32 %D, <4 x i64> %E, <4 x i64> %EE, i32* %loadptr) {
-; CHECK: vmovntps %ymm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
+; CHECK-LABEL: f256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl (%rdx), %eax ## encoding: [0x8b,0x02]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
+; CHECK-NEXT: vmovntps %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x07]
+; CHECK-NEXT: vpaddq %ymm5, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xd4,0xc5]
+; CHECK-NEXT: addl (%rdx), %eax ## encoding: [0x03,0x02]
+; CHECK-NEXT: vmovntdq %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x07]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3]
+; CHECK-NEXT: addl (%rdx), %eax ## encoding: [0x03,0x02]
+; CHECK-NEXT: vmovntpd %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x2b,0x07]
+; CHECK-NEXT: addl (%rdx), %eax ## encoding: [0x03,0x02]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%v0 = load i32, i32* %loadptr, align 1
%cast = bitcast i8* %B to <8 x float>*
%A2 = fadd <8 x float> %A, %AA
store <8 x float> %A2, <8 x float>* %cast, align 64, !nontemporal !0
-; CHECK: vmovntdq %ymm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
%v1 = load i32, i32* %loadptr, align 1
%cast1 = bitcast i8* %B to <4 x i64>*
%E2 = add <4 x i64> %E, %EE
store <4 x i64> %E2, <4 x i64>* %cast1, align 64, !nontemporal !0
-; CHECK: vmovntpd %ymm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
%v2 = load i32, i32* %loadptr, align 1
%cast2 = bitcast i8* %B to <4 x double>*
%C2 = fadd <4 x double> %C, %CC
@@ -24,17 +36,27 @@ define i32 @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x
}
define i32 @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x double> %CC, i32 %D, <2 x i64> %E, <2 x i64> %EE, i32* %loadptr) {
+; CHECK-LABEL: f128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl (%rdx), %eax ## encoding: [0x8b,0x02]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
+; CHECK-NEXT: vmovntps %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x07]
+; CHECK-NEXT: vpaddq %xmm5, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xd4,0xc5]
+; CHECK-NEXT: addl (%rdx), %eax ## encoding: [0x03,0x02]
+; CHECK-NEXT: vmovntdq %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe7,0x07]
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3]
+; CHECK-NEXT: addl (%rdx), %eax ## encoding: [0x03,0x02]
+; CHECK-NEXT: vmovntpd %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2b,0x07]
+; CHECK-NEXT: addl (%rdx), %eax ## encoding: [0x03,0x02]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%v0 = load i32, i32* %loadptr, align 1
-; CHECK: vmovntps %xmm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
%cast = bitcast i8* %B to <4 x float>*
%A2 = fadd <4 x float> %A, %AA
store <4 x float> %A2, <4 x float>* %cast, align 64, !nontemporal !0
-; CHECK: vmovntdq %xmm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
%v1 = load i32, i32* %loadptr, align 1
%cast1 = bitcast i8* %B to <2 x i64>*
%E2 = add <2 x i64> %E, %EE
store <2 x i64> %E2, <2 x i64>* %cast1, align 64, !nontemporal !0
-; CHECK: vmovntpd %xmm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
%v2 = load i32, i32* %loadptr, align 1
%cast2 = bitcast i8* %B to <2 x double>*
%C2 = fadd <2 x double> %C, %CC