From 81301541159280ae66eeeda32786c474ff76264c Mon Sep 17 00:00:00 2001
From: Amaury Sechet
Date: Mon, 19 Aug 2019 14:34:08 +0000
Subject: [PATCH] Automatically generate AVX512 test cases. NFC

llvm-svn: 369264
---
 llvm/test/CodeGen/X86/avx512-bugfix-25270.ll | 4 +-
 llvm/test/CodeGen/X86/avx512-fma-commute.ll | 16 +-
 llvm/test/CodeGen/X86/avx512-inc-dec.ll | 12 +-
 llvm/test/CodeGen/X86/avx512-intel-ocl.ll | 316 +++++++++---------
 llvm/test/CodeGen/X86/avx512-mask-spills.ll | 16 +-
 llvm/test/CodeGen/X86/avx512-memfold.ll | 2 +-
 llvm/test/CodeGen/X86/avx512-nontemporal.ll | 42 ++-
 llvm/test/CodeGen/X86/avx512-scalar_mask.ll | 12 +-
 llvm/test/CodeGen/X86/avx512vl-nontemporal.ll | 34 +-
 9 files changed, 255 insertions(+), 199 deletions(-)

diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll b/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll
index b48bf4fe2549..c3466df4d9ee 100644
--- a/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll
+++ b/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll
@@ -11,12 +11,12 @@ define void @bar__512(<16 x i32>* %var) #0 {
 ; CHECK-NEXT: movq %rdi, %rbx
 ; CHECK-NEXT: vmovups (%rdi), %zmm0
 ; CHECK-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill
-; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %zmm1
+; CHECK-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; CHECK-NEXT: vmovaps %zmm1, (%rdi)
 ; CHECK-NEXT: callq _Print__512
 ; CHECK-NEXT: vmovups (%rsp), %zmm0 ## 64-byte Reload
 ; CHECK-NEXT: callq _Print__512
-; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %zmm0
+; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; CHECK-NEXT: vmovaps %zmm0, (%rbx)
 ; CHECK-NEXT: addq $112, %rsp
 ; CHECK-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/avx512-fma-commute.ll b/llvm/test/CodeGen/X86/avx512-fma-commute.ll
index 194255179270..bf156b41a2db 100644
--- a/llvm/test/CodeGen/X86/avx512-fma-commute.ll
+++ b/llvm/test/CodeGen/X86/avx512-fma-commute.ll
@@ -9,7 +9,7 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>
 define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss_load0(<4 x float>* %x0ptr, <4 x float> %x1, <4 x float> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_load0:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm1 = (xmm0 * mem) + xmm1
 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %x0 = load <4 x float>, <4 x float>* %x0ptr
@@ -20,7 +20,7 @@ define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss_load0(<4 x float>* %x0pt
 define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss_load1(<4 x float> %x0, <4 x float>* %x1ptr, <4 x float> %x2){
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_load1:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm1 = (xmm0 * mem) + xmm1
 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %x1 = load <4 x float>, <4 x float>* %x1ptr
@@ -31,7 +31,7 @@ define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss_load1(<4 x float> %x0, <
 define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd_load0(<2 x double>* %x0ptr, <2 x double> %x1, <2 x double> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd_load0:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: vfmadd231sd (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vfmadd231sd {{.*#+}} xmm1 = (xmm0 * mem) + xmm1
 ; CHECK-NEXT: vmovapd %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %x0 = load <2 x double>, <2 x double>* %x0ptr
@@ -42,7 +42,7 @@ define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd_load0(<2 x double>* %x0
 define <2 x double>
@test_int_x86_avx512_mask3_vfmadd_sd_load1(<2 x double> %x0, <2 x double>* %x1ptr, <2 x double> %x2){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd_load1: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmadd231sd (%rdi), %xmm0, %xmm1 +; CHECK-NEXT: vfmadd231sd {{.*#+}} xmm1 = (xmm0 * mem) + xmm1 ; CHECK-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-NEXT: retq %x1 = load <2 x double>, <2 x double>* %x1ptr @@ -53,7 +53,7 @@ define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd_load1(<2 x double> %x0, define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss_load0(<4 x float>* %x0ptr, <4 x float> %x1, <4 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss_load0: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmsub231ss (%rdi), %xmm0, %xmm1 +; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm1 = (xmm0 * mem) - xmm1 ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq %x0 = load <4 x float>, <4 x float>* %x0ptr @@ -64,7 +64,7 @@ define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss_load0(<4 x float>* %x0pt define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss_load1(<4 x float> %x0, <4 x float>* %x1ptr, <4 x float> %x2){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss_load1: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmsub231ss (%rdi), %xmm0, %xmm1 +; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm1 = (xmm0 * mem) - xmm1 ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq %x1 = load <4 x float>, <4 x float>* %x1ptr @@ -75,7 +75,7 @@ define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss_load1(<4 x float> %x0, < define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd_load0(<2 x double>* %x0ptr, <2 x double> %x1, <2 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd_load0: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1 +; CHECK-NEXT: vfmsub231sd {{.*#+}} xmm1 = (xmm0 * mem) - xmm1 ; CHECK-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-NEXT: retq %x0 = load <2 x double>, <2 x double>* %x0ptr @@ -86,7 +86,7 @@ define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd_load0(<2 x double>* %x0 define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd_load1(<2 x double> %x0, <2 x double>* %x1ptr, <2 x double> %x2){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd_load1: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1 +; CHECK-NEXT: vfmsub231sd {{.*#+}} xmm1 = (xmm0 * mem) - xmm1 ; CHECK-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-NEXT: retq %x1 = load <2 x double>, <2 x double>* %x1ptr diff --git a/llvm/test/CodeGen/X86/avx512-inc-dec.ll b/llvm/test/CodeGen/X86/avx512-inc-dec.ll index 4fa4f27beb79..73a2d67f8e1d 100644 --- a/llvm/test/CodeGen/X86/avx512-inc-dec.ll +++ b/llvm/test/CodeGen/X86/avx512-inc-dec.ll @@ -1,10 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -;CHECK-LABEL: test -;CHECK-NOT: dec -;CHECK-NOT: enc -;CHECK: ret define i32 @test(i32 %a, i32 %b) { +; CHECK-LABEL: test: +; CHECK: ## %bb.0: +; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi +; CHECK-NEXT: leal -1(%rdi), %eax +; CHECK-NEXT: addl $1, %esi +; CHECK-NEXT: imull %esi, %eax +; CHECK-NEXT: retq %a1 = add i32 %a, -1 %b1 = add i32 %b, 1 %res = mul i32 %a1, %b1 diff --git a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll index 4b1b681b0554..751d610c2ca7 100644 --- a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll +++ b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll @@ -94,12 +94,12 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind { ; X32-NEXT: movl %esp, %ebp ; 
X32-NEXT: andl $-64, %esp ; X32-NEXT: subl $256, %esp ## imm = 0x100 -; X32-NEXT: vmovaps %zmm1, {{[0-9]+}}(%esp) ## 64-byte Spill +; X32-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill ; X32-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl %eax, (%esp) ; X32-NEXT: calll _func_float16_ptr -; X32-NEXT: vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0 ## 64-byte Folded Reload +; X32-NEXT: vaddps {{[-0-9]+}}(%e{{[sb]}}p), %zmm0, %zmm0 ## 64-byte Folded Reload ; X32-NEXT: vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0 ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -186,52 +186,52 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl ; WIN64-KNL-NEXT: pushq %rbp ; WIN64-KNL-NEXT: subq $1264, %rsp # imm = 0x4F0 ; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rbp -; WIN64-KNL-NEXT: kmovw %k7, 1134(%rbp) # 2-byte Spill -; WIN64-KNL-NEXT: kmovw %k6, 1132(%rbp) # 2-byte Spill -; WIN64-KNL-NEXT: kmovw %k5, 1130(%rbp) # 2-byte Spill -; WIN64-KNL-NEXT: kmovw %k4, 1128(%rbp) # 2-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm21, 1024(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm20, 960(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm19, 896(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm18, 832(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm17, 768(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm16, 704(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm15, 640(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm14, 576(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm13, 512(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm12, 448(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm11, 384(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm10, 320(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm9, 256(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm8, 192(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm7, 128(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm6, 64(%rbp) # 64-byte Spill +; WIN64-KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; WIN64-KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; WIN64-KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; WIN64-KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; WIN64-KNL-NEXT: andq $-64, %rsp 
; WIN64-KNL-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp) ; WIN64-KNL-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) ; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-KNL-NEXT: callq func_float16 -; WIN64-KNL-NEXT: vmovaps 64(%rbp), %zmm6 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 128(%rbp), %zmm7 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 192(%rbp), %zmm8 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 256(%rbp), %zmm9 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 320(%rbp), %zmm10 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 384(%rbp), %zmm11 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 448(%rbp), %zmm12 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 512(%rbp), %zmm13 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 576(%rbp), %zmm14 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 640(%rbp), %zmm15 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 704(%rbp), %zmm16 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 768(%rbp), %zmm17 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 832(%rbp), %zmm18 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 896(%rbp), %zmm19 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 960(%rbp), %zmm20 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 1024(%rbp), %zmm21 # 64-byte Reload -; WIN64-KNL-NEXT: kmovw 1128(%rbp), %k4 # 2-byte Reload -; WIN64-KNL-NEXT: kmovw 1130(%rbp), %k5 # 2-byte Reload -; WIN64-KNL-NEXT: kmovw 1132(%rbp), %k6 # 2-byte Reload -; WIN64-KNL-NEXT: kmovw 1134(%rbp), %k7 # 2-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; WIN64-KNL-NEXT: leaq 1136(%rbp), %rsp ; WIN64-KNL-NEXT: popq %rbp ; WIN64-KNL-NEXT: retq @@ -241,52 +241,52 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl ; WIN64-SKX-NEXT: pushq %rbp ; WIN64-SKX-NEXT: subq $1264, %rsp # imm = 0x4F0 ; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rbp -; WIN64-SKX-NEXT: kmovq %k7, 1128(%rbp) # 8-byte Spill -; WIN64-SKX-NEXT: kmovq %k6, 1120(%rbp) # 8-byte Spill -; WIN64-SKX-NEXT: kmovq %k5, 1112(%rbp) # 8-byte Spill -; WIN64-SKX-NEXT: kmovq %k4, 1104(%rbp) # 8-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm21, 1024(%rbp) # 64-byte 
Spill -; WIN64-SKX-NEXT: vmovaps %zmm20, 960(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm19, 896(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm18, 832(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm17, 768(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm16, 704(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm15, 640(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm14, 576(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm13, 512(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm12, 448(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm11, 384(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm10, 320(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm9, 256(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm8, 192(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm7, 128(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm6, 64(%rbp) # 64-byte Spill +; WIN64-SKX-NEXT: kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WIN64-SKX-NEXT: kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WIN64-SKX-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WIN64-SKX-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; WIN64-SKX-NEXT: andq $-64, %rsp ; WIN64-SKX-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp) ; WIN64-SKX-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) ; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-SKX-NEXT: callq func_float16 -; WIN64-SKX-NEXT: vmovaps 64(%rbp), %zmm6 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 128(%rbp), %zmm7 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 192(%rbp), %zmm8 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 256(%rbp), %zmm9 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 320(%rbp), %zmm10 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 384(%rbp), %zmm11 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 448(%rbp), %zmm12 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 512(%rbp), %zmm13 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 576(%rbp), %zmm14 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 640(%rbp), %zmm15 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 704(%rbp), %zmm16 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 768(%rbp), %zmm17 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 832(%rbp), %zmm18 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 896(%rbp), %zmm19 # 64-byte Reload -; WIN64-SKX-NEXT: 
vmovaps 960(%rbp), %zmm20 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 1024(%rbp), %zmm21 # 64-byte Reload -; WIN64-SKX-NEXT: kmovq 1104(%rbp), %k4 # 8-byte Reload -; WIN64-SKX-NEXT: kmovq 1112(%rbp), %k5 # 8-byte Reload -; WIN64-SKX-NEXT: kmovq 1120(%rbp), %k6 # 8-byte Reload -; WIN64-SKX-NEXT: kmovq 1128(%rbp), %k7 # 8-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload +; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload +; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 8-byte Reload +; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; WIN64-SKX-NEXT: leaq 1136(%rbp), %rsp ; WIN64-SKX-NEXT: popq %rbp ; WIN64-SKX-NEXT: retq @@ -296,47 +296,47 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl ; X64-KNL-NEXT: pushq %rsi ; X64-KNL-NEXT: pushq %rdi ; X64-KNL-NEXT: subq $1064, %rsp ## imm = 0x428 -; X64-KNL-NEXT: kmovw %k7, {{[0-9]+}}(%rsp) ## 2-byte Spill -; X64-KNL-NEXT: kmovw %k6, {{[0-9]+}}(%rsp) ## 2-byte Spill -; X64-KNL-NEXT: kmovw %k5, {{[0-9]+}}(%rsp) ## 2-byte Spill -; X64-KNL-NEXT: kmovw %k4, {{[0-9]+}}(%rsp) ## 2-byte Spill -; X64-KNL-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill +; X64-KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; X64-KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; X64-KNL-NEXT: kmovw 
%k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; X64-KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; X64-KNL-NEXT: vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill ; X64-KNL-NEXT: vmovups %zmm16, (%rsp) ## 64-byte Spill ; X64-KNL-NEXT: callq _func_float16 ; X64-KNL-NEXT: vmovups (%rsp), %zmm16 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload -; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k4 ## 2-byte Reload -; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k5 ## 2-byte Reload -; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k6 ## 2-byte Reload -; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k7 ## 2-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 
64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload +; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload +; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload ; X64-KNL-NEXT: addq $1064, %rsp ## imm = 0x428 ; X64-KNL-NEXT: popq %rdi ; X64-KNL-NEXT: popq %rsi @@ -347,47 +347,47 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl ; X64-SKX-NEXT: pushq %rsi ; X64-SKX-NEXT: pushq %rdi ; X64-SKX-NEXT: subq $1064, %rsp ## imm = 0x428 -; X64-SKX-NEXT: kmovq %k7, {{[0-9]+}}(%rsp) ## 8-byte Spill -; X64-SKX-NEXT: kmovq %k6, {{[0-9]+}}(%rsp) ## 8-byte Spill -; X64-SKX-NEXT: kmovq %k5, {{[0-9]+}}(%rsp) ## 8-byte Spill -; X64-SKX-NEXT: kmovq %k4, {{[0-9]+}}(%rsp) ## 8-byte Spill -; X64-SKX-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill +; X64-SKX-NEXT: kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-SKX-NEXT: kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-SKX-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-SKX-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-SKX-NEXT: vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill ; X64-SKX-NEXT: vmovups %zmm16, (%rsp) ## 64-byte Spill ; X64-SKX-NEXT: 
callq _func_float16 ; X64-SKX-NEXT: vmovups (%rsp), %zmm16 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload -; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k4 ## 8-byte Reload -; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k5 ## 8-byte Reload -; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k6 ## 8-byte Reload -; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k7 ## 8-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload +; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 8-byte Reload +; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 8-byte Reload +; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 8-byte Reload +; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 8-byte Reload ; X64-SKX-NEXT: addq $1064, %rsp ## imm = 0x428 ; X64-SKX-NEXT: popq %rdi ; X64-SKX-NEXT: popq %rsi diff --git a/llvm/test/CodeGen/X86/avx512-mask-spills.ll b/llvm/test/CodeGen/X86/avx512-mask-spills.ll index b9f483e997c4..44643f42e600 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-spills.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-spills.ll @@ -10,9 +10,9 @@ define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0 ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK-NEXT: callq _f -; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload +; CHECK-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), 
%k0 ## 2-byte Reload ; CHECK-NEXT: vpmovm2d %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -32,10 +32,10 @@ define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) { ; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0 ; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 ; CHECK-NEXT: korb %k1, %k0, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f -; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload +; CHECK-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -55,10 +55,10 @@ define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) { ; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f -; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload +; CHECK-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload ; CHECK-NEXT: vpmovm2b %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -77,10 +77,10 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) { ; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; CHECK-NEXT: kord %k1, %k0, %k0 -; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill +; CHECK-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f -; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload +; CHECK-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 4-byte Reload ; CHECK-NEXT: vpmovm2b %k0, %ymm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-memfold.ll b/llvm/test/CodeGen/X86/avx512-memfold.ll index d80891868d0b..6feb622a00de 100644 --- a/llvm/test/CodeGen/X86/avx512-memfold.ll +++ b/llvm/test/CodeGen/X86/avx512-memfold.ll @@ -58,7 +58,7 @@ define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %a, <2 x do ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + mem ; CHECK-NEXT: retq %c.val = load double, double* %c %cv0 = insertelement <2 x double> undef, double %c.val, i32 0 diff --git a/llvm/test/CodeGen/X86/avx512-nontemporal.ll b/llvm/test/CodeGen/X86/avx512-nontemporal.ll index 9bc8a8f97526..617b44f0be4a 100644 --- a/llvm/test/CodeGen/X86/avx512-nontemporal.ll +++ b/llvm/test/CodeGen/X86/avx512-nontemporal.ll @@ -1,33 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw | FileCheck %s define i32 @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x double> %CC, <8 x i64> %E, <8 x i64> %EE, <16 x i32> %F, <16 x i32> %FF, <32 x i16> %G, <32 x i16> %GG, <64 x i8> %H, <64 x i8> %HH, i32 * %loadptr) { -; CHECK: vmovntps %z +; CHECK-LABEL: f: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-64, %rsp +; CHECK-NEXT: subq $64, %rsp +; CHECK-NEXT: vmovdqa64 144(%rbp), %zmm8 +; CHECK-NEXT: vmovdqa64 16(%rbp), %zmm9 +; CHECK-NEXT: movl (%rsi), %eax +; CHECK-NEXT: 
vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmovntps %zmm0, (%rdi) +; CHECK-NEXT: vpaddq %zmm5, %zmm4, %zmm0 +; CHECK-NEXT: addl (%rsi), %eax +; CHECK-NEXT: vmovntdq %zmm0, (%rdi) +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm0 +; CHECK-NEXT: addl (%rsi), %eax +; CHECK-NEXT: vmovntpd %zmm0, (%rdi) +; CHECK-NEXT: vpaddd %zmm7, %zmm6, %zmm0 +; CHECK-NEXT: addl (%rsi), %eax +; CHECK-NEXT: vmovntdq %zmm0, (%rdi) +; CHECK-NEXT: vpaddw 80(%rbp), %zmm9, %zmm0 +; CHECK-NEXT: addl (%rsi), %eax +; CHECK-NEXT: vmovntdq %zmm0, (%rdi) +; CHECK-NEXT: vpaddb 208(%rbp), %zmm8, %zmm0 +; CHECK-NEXT: addl (%rsi), %eax +; CHECK-NEXT: vmovntdq %zmm0, (%rdi) +; CHECK-NEXT: addl (%rsi), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %v0 = load i32, i32* %loadptr, align 1 %cast = bitcast i8* %B to <16 x float>* %A2 = fadd <16 x float> %A, %AA store <16 x float> %A2, <16 x float>* %cast, align 64, !nontemporal !0 %v1 = load i32, i32* %loadptr, align 1 -; CHECK: vmovntdq %z %cast1 = bitcast i8* %B to <8 x i64>* %E2 = add <8 x i64> %E, %EE store <8 x i64> %E2, <8 x i64>* %cast1, align 64, !nontemporal !0 %v2 = load i32, i32* %loadptr, align 1 -; CHECK: vmovntpd %z %cast2 = bitcast i8* %B to <8 x double>* %C2 = fadd <8 x double> %C, %CC store <8 x double> %C2, <8 x double>* %cast2, align 64, !nontemporal !0 %v3 = load i32, i32* %loadptr, align 1 -; CHECK: vmovntdq %z %cast3 = bitcast i8* %B to <16 x i32>* %F2 = add <16 x i32> %F, %FF store <16 x i32> %F2, <16 x i32>* %cast3, align 64, !nontemporal !0 %v4 = load i32, i32* %loadptr, align 1 -; CHECK: vmovntdq %z %cast4 = bitcast i8* %B to <32 x i16>* %G2 = add <32 x i16> %G, %GG store <32 x i16> %G2, <32 x i16>* %cast4, align 64, !nontemporal !0 %v5 = load i32, i32* %loadptr, align 1 -; CHECK: vmovntdq %z %cast5 = bitcast i8* %B to <64 x i8>* %H2 = add <64 x i8> %H, %HH store <64 x i8> %H2, <64 x i8>* %cast5, align 64, !nontemporal !0 diff --git a/llvm/test/CodeGen/X86/avx512-scalar_mask.ll b/llvm/test/CodeGen/X86/avx512-scalar_mask.ll index 1a98bd958e3c..27468fecfde7 100644 --- a/llvm/test/CodeGen/X86/avx512-scalar_mask.ll +++ b/llvm/test/CodeGen/X86/avx512-scalar_mask.ll @@ -8,7 +8,7 @@ define <4 x float>@test_var_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> % ; CHECK-LABEL: test_var_mask: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 %mask, i32 4) ret < 4 x float> %res @@ -18,7 +18,7 @@ define <4 x float>@test_var_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> ; CHECK-LABEL: test_var_maskz: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 %mask, i32 4) ret < 4 x float> %res @@ -67,7 +67,7 @@ define <4 x float>@test_const2_maskz(<4 x float> %v0, <4 x float> %v1, <4 x floa define <4 x float>@test_const_allone_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) { ; CHECK-LABEL: test_const_allone_mask: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq %res = call <4 x float> 
@llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4) ret < 4 x float> %res @@ -76,7 +76,7 @@ define <4 x float>@test_const_allone_mask(<4 x float> %v0, <4 x float> %v1, <4 x define <4 x float>@test_const_allone_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) { ; CHECK-LABEL: test_const_allone_maskz: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4) ret < 4 x float> %res @@ -85,7 +85,7 @@ define <4 x float>@test_const_allone_maskz(<4 x float> %v0, <4 x float> %v1, <4 define <4 x float>@test_const_3_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) { ; CHECK-LABEL: test_const_3_mask: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4) ret < 4 x float> %res @@ -94,7 +94,7 @@ define <4 x float>@test_const_3_mask(<4 x float> %v0, <4 x float> %v1, <4 x floa define <4 x float>@test_const_3_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) { ; CHECK-LABEL: test_const_3_maskz: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4) ret < 4 x float> %res diff --git a/llvm/test/CodeGen/X86/avx512vl-nontemporal.ll b/llvm/test/CodeGen/X86/avx512vl-nontemporal.ll index 683cae69bcae..48cb47947d45 100644 --- a/llvm/test/CodeGen/X86/avx512vl-nontemporal.ll +++ b/llvm/test/CodeGen/X86/avx512vl-nontemporal.ll @@ -1,17 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s define i32 @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x double> %CC, i32 %D, <4 x i64> %E, <4 x i64> %EE, i32* %loadptr) { -; CHECK: vmovntps %ymm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5 +; CHECK-LABEL: f256: +; CHECK: ## %bb.0: +; CHECK-NEXT: movl (%rdx), %eax ## encoding: [0x8b,0x02] +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] +; CHECK-NEXT: vmovntps %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x07] +; CHECK-NEXT: vpaddq %ymm5, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xd4,0xc5] +; CHECK-NEXT: addl (%rdx), %eax ## encoding: [0x03,0x02] +; CHECK-NEXT: vmovntdq %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x07] +; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] +; CHECK-NEXT: addl (%rdx), %eax ## encoding: [0x03,0x02] +; CHECK-NEXT: vmovntpd %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x2b,0x07] +; CHECK-NEXT: addl (%rdx), %eax ## encoding: [0x03,0x02] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retq ## encoding: [0xc3] %v0 = load i32, i32* %loadptr, align 1 %cast = bitcast i8* %B to <8 x float>* %A2 = fadd <8 x float> %A, %AA store <8 x float> %A2, <8 x float>* %cast, align 64, !nontemporal !0 -; CHECK: vmovntdq %ymm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5 %v1 = 
load i32, i32* %loadptr, align 1 %cast1 = bitcast i8* %B to <4 x i64>* %E2 = add <4 x i64> %E, %EE store <4 x i64> %E2, <4 x i64>* %cast1, align 64, !nontemporal !0 -; CHECK: vmovntpd %ymm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5 %v2 = load i32, i32* %loadptr, align 1 %cast2 = bitcast i8* %B to <4 x double>* %C2 = fadd <4 x double> %C, %CC @@ -24,17 +36,27 @@ define i32 @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x } define i32 @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x double> %CC, i32 %D, <2 x i64> %E, <2 x i64> %EE, i32* %loadptr) { +; CHECK-LABEL: f128: +; CHECK: ## %bb.0: +; CHECK-NEXT: movl (%rdx), %eax ## encoding: [0x8b,0x02] +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] +; CHECK-NEXT: vmovntps %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x07] +; CHECK-NEXT: vpaddq %xmm5, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xd4,0xc5] +; CHECK-NEXT: addl (%rdx), %eax ## encoding: [0x03,0x02] +; CHECK-NEXT: vmovntdq %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe7,0x07] +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] +; CHECK-NEXT: addl (%rdx), %eax ## encoding: [0x03,0x02] +; CHECK-NEXT: vmovntpd %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2b,0x07] +; CHECK-NEXT: addl (%rdx), %eax ## encoding: [0x03,0x02] +; CHECK-NEXT: retq ## encoding: [0xc3] %v0 = load i32, i32* %loadptr, align 1 -; CHECK: vmovntps %xmm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5 %cast = bitcast i8* %B to <4 x float>* %A2 = fadd <4 x float> %A, %AA store <4 x float> %A2, <4 x float>* %cast, align 64, !nontemporal !0 -; CHECK: vmovntdq %xmm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5 %v1 = load i32, i32* %loadptr, align 1 %cast1 = bitcast i8* %B to <2 x i64>* %E2 = add <2 x i64> %E, %EE store <2 x i64> %E2, <2 x i64>* %cast1, align 64, !nontemporal !0 -; CHECK: vmovntpd %xmm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5 %v2 = load i32, i32* %loadptr, align 1 %cast2 = bitcast i8* %B to <2 x double>* %C2 = fadd <2 x double> %C, %CC