; Source: llvm-project/llvm/test/CodeGen/X86/avx512-mask-op.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512BW
; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512DQ
; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=X86
; mask16 - invert a 16-bit mask value passed in a scalar register.
; The i16 argument is reinterpreted as <16 x i1>, XORed with an all-ones
; vector (i.e. a bitwise NOT) and reinterpreted back to i16. Every run line
; expects plain scalar code (notl) with no mask-register instructions.
define i16 @mask16(i16 %x) {
; CHECK-LABEL: mask16:
; CHECK: ## %bb.0:
; CHECK-NEXT: notl %edi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
;
; X86-LABEL: mask16:
; X86: ## %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: notl %eax
; X86-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
%m0 = bitcast i16 %x to <16 x i1>
%m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <16 x i1> %m1 to i16
ret i16 %ret
}
; mask16_zext - invert a 16-bit mask and zero-extend the result to i32.
; Same NOT-via-XOR on <16 x i1> as mask16, followed by a zext of the i16.
; The x86-64 runs expect notl followed by movzwl; the i686 run expects a
; zero-extending load followed by xorl with 0xFFFF.
define i32 @mask16_zext(i16 %x) {
; CHECK-LABEL: mask16_zext:
; CHECK: ## %bb.0:
; CHECK-NEXT: notl %edi
; CHECK-NEXT: movzwl %di, %eax
; CHECK-NEXT: retq
;
; X86-LABEL: mask16_zext:
; X86: ## %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorl $65535, %eax ## imm = 0xFFFF
; X86-NEXT: retl
%m0 = bitcast i16 %x to <16 x i1>
%m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%m2 = bitcast <16 x i1> %m1 to i16
%ret = zext i16 %m2 to i32
ret i32 %ret
}
; mask8 - invert an 8-bit mask value passed in a scalar register.
; The i8 argument is reinterpreted as <8 x i1>, XORed with an all-ones vector
; and reinterpreted back to i8. Every run line expects a scalar notb.
define i8 @mask8(i8 %x) {
; CHECK-LABEL: mask8:
; CHECK: ## %bb.0:
; CHECK-NEXT: notb %dil
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
;
; X86-LABEL: mask8:
; X86: ## %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: notb %al
; X86-NEXT: retl
%m0 = bitcast i8 %x to <8 x i1>
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <8 x i1> %m1 to i8
ret i8 %ret
}
; mask8_zext - invert an 8-bit mask and zero-extend the result to i32.
; Same NOT-via-XOR on <8 x i1> as mask8, followed by a zext of the i8.
; Every run line expects notb followed by movzbl.
define i32 @mask8_zext(i8 %x) {
; CHECK-LABEL: mask8_zext:
; CHECK: ## %bb.0:
; CHECK-NEXT: notb %dil
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: retq
;
; X86-LABEL: mask8_zext:
; X86: ## %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: notb %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
%m0 = bitcast i8 %x to <8 x i1>
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%m2 = bitcast <8 x i1> %m1 to i8
%ret = zext i8 %m2 to i32
ret i32 %ret
}
; mask16_mem - invert a 16-bit mask in memory.
; Loads an i16 from %ptr, inverts it as <16 x i1> (XOR with all-ones) and
; stores the result back to the same address. Unlike the register variant
; mask16, every run line expects the mask-register sequence
; kmovw (load) / knotw / kmovw (store).
define void @mask16_mem(i16* %ptr) {
; CHECK-LABEL: mask16_mem:
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw (%rdi), %k0
; CHECK-NEXT: knotw %k0, %k0
; CHECK-NEXT: kmovw %k0, (%rdi)
; CHECK-NEXT: retq
;
; X86-LABEL: mask16_mem:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw (%eax), %k0
; X86-NEXT: knotw %k0, %k0
; X86-NEXT: kmovw %k0, (%eax)
; X86-NEXT: retl
%x = load i16, i16* %ptr, align 4
%m0 = bitcast i16 %x to <16 x i1>
%m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <16 x i1> %m1 to i16
store i16 %ret, i16* %ptr, align 4
ret void
}
; mask8_mem - invert an 8-bit mask in memory.
; Loads an i8 from %ptr, inverts it as <8 x i1> (XOR with all-ones) and stores
; the result back to the same address. The expected code depends on the
; feature set of the run line:
;  - runs without +avx512dq (the KNL and AVX512BW prefixes) expect a single
;    read-modify-write notb on memory;
;  - runs with +avx512dq (the SKX, AVX512DQ and i686 prefixes) expect the
;    mask-register sequence kmovb (load) / knotb / kmovb (store).
define void @mask8_mem(i8* %ptr) {
; KNL-LABEL: mask8_mem:
; KNL: ## %bb.0:
; KNL-NEXT: notb (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: mask8_mem:
; SKX: ## %bb.0:
; SKX-NEXT: kmovb (%rdi), %k0
; SKX-NEXT: knotb %k0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: mask8_mem:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: notb (%rdi)
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: mask8_mem:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: knotb %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: retq
;
; X86-LABEL: mask8_mem:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb (%eax), %k0
; X86-NEXT: knotb %k0, %k0
; X86-NEXT: kmovb %k0, (%eax)
; X86-NEXT: retl
%x = load i8, i8* %ptr, align 4
%m0 = bitcast i8 %x to <8 x i1>
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <8 x i1> %m1 to i8
store i8 %ret, i8* %ptr, align 4
ret void
}
; mand16 - combine two 16-bit masks passed in scalar registers.
; Both i16 arguments are reinterpreted as <16 x i1>; the result is
; (x & y) | (x ^ y), reinterpreted back to i16. Every run line expects the
; logic to be done with scalar andl / xorl / orl rather than in mask registers.
define i16 @mand16(i16 %x, i16 %y) {
; CHECK-LABEL: mand16:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: xorl %esi, %eax
; CHECK-NEXT: andl %esi, %edi
; CHECK-NEXT: orl %eax, %edi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
;
; X86-LABEL: mand16:
; X86: ## %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %edx
; X86-NEXT: andl %ecx, %edx
; X86-NEXT: xorl %ecx, %eax
; X86-NEXT: orl %edx, %eax
; X86-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
%ma = bitcast i16 %x to <16 x i1>
%mb = bitcast i16 %y to <16 x i1>
%mc = and <16 x i1> %ma, %mb
%md = xor <16 x i1> %ma, %mb
%me = or <16 x i1> %mc, %md
%ret = bitcast <16 x i1> %me to i16
ret i16 %ret
}
; mand16_mem - combine two 16-bit masks loaded from memory.
; Loads two <16 x i1> values through the pointer arguments and returns
; (a & b) | (a ^ b) as an i16. In contrast to the register variant mand16,
; every run line expects the masks to be loaded with kmovw and combined in
; mask registers with kandw / kxorw / korw. The final mask-to-GPR move is
; kmovw under the KNL and AVX512DQ prefixes and kmovd under the others.
define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) {
; KNL-LABEL: mand16_mem:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw (%rdi), %k0
; KNL-NEXT: kmovw (%rsi), %k1
; KNL-NEXT: kandw %k1, %k0, %k2
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: korw %k0, %k2, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $ax killed $ax killed $eax
; KNL-NEXT: retq
;
; SKX-LABEL: mand16_mem:
; SKX: ## %bb.0:
; SKX-NEXT: kmovw (%rdi), %k0
; SKX-NEXT: kmovw (%rsi), %k1
; SKX-NEXT: kandw %k1, %k0, %k2
; SKX-NEXT: kxorw %k1, %k0, %k0
; SKX-NEXT: korw %k0, %k2, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $ax killed $ax killed $eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: mand16_mem:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: kmovw (%rsi), %k1
; AVX512BW-NEXT: kandw %k1, %k0, %k2
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k2, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: ## kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: mand16_mem:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: kmovw (%rsi), %k1
; AVX512DQ-NEXT: kandw %k1, %k0, %k2
; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
; AVX512DQ-NEXT: korw %k0, %k2, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: ## kill: def $ax killed $ax killed $eax
; AVX512DQ-NEXT: retq
;
; X86-LABEL: mand16_mem:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw (%ecx), %k0
; X86-NEXT: kmovw (%eax), %k1
; X86-NEXT: kandw %k1, %k0, %k2
; X86-NEXT: kxorw %k1, %k0, %k0
; X86-NEXT: korw %k0, %k2, %k0
; X86-NEXT: kmovd %k0, %eax
; X86-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
%ma = load <16 x i1>, <16 x i1>* %x
%mb = load <16 x i1>, <16 x i1>* %y
%mc = and <16 x i1> %ma, %mb
%md = xor <16 x i1> %ma, %mb
%me = or <16 x i1> %mc, %md
%ret = bitcast <16 x i1> %me to i16
ret i16 %ret
}
; shuf_test1 - extract the upper half of a 16-bit mask.
; The i16 argument is reinterpreted as <16 x i1>, a shufflevector selects
; elements 8..15, and the resulting <8 x i1> is returned as an i8.
; The x86-64 runs expect the value to be moved into a mask register, shifted
; right by 8 with kshiftrw and moved back; the i686 run expects a single byte
; load from the stack argument with no mask-register use.
define i8 @shuf_test1(i16 %v) nounwind {
; KNL-LABEL: shuf_test1:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kshiftrw $8, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $al killed $al killed $eax
; KNL-NEXT: retq
;
; SKX-LABEL: shuf_test1:
; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kshiftrw $8, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $al killed $al killed $eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: shuf_test1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: ## kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuf_test1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: ## kill: def $al killed $al killed $eax
; AVX512DQ-NEXT: retq
;
; X86-LABEL: shuf_test1:
; X86: ## %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: retl
%v1 = bitcast i16 %v to <16 x i1>
%mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%mask1 = bitcast <8 x i1> %mask to i8
ret i8 %mask1
}
; zext_test1 - zero-extend one bit of a vector compare result to i32.
; Performs an unsigned greater-than compare of two <16 x i32> vectors,
; extracts element 5 of the <16 x i1> result and zero-extends it to i32.
; Every run line expects vpcmpnleud into a mask register, kshiftrw by 5 to
; bring the element to bit 0, a move to %eax and an andl with 1.
define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
; KNL-LABEL: zext_test1:
; KNL: ## %bb.0:
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; KNL-NEXT: kshiftrw $5, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: zext_test1:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; SKX-NEXT: kshiftrw $5, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: zext_test1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: andl $1, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: zext_test1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: andl $1, %eax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: zext_test1:
; X86: ## %bb.0:
; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; X86-NEXT: kshiftrw $5, %k0, %k0
; X86-NEXT: kmovd %k0, %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i32
ret i32 %res
}
; zext_test2 - zero-extend one bit of a vector compare result to i16.
; Same compare and element extraction as zext_test1 (unsigned greater-than on
; <16 x i32>, element 5), but the i1 is zero-extended to i16. The expected
; code is the same vpcmpnleud / kshiftrw $5 / move / andl $1 sequence, with an
; extra register-kill annotation narrowing %eax to %ax before the return.
define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
; KNL-LABEL: zext_test2:
; KNL: ## %bb.0:
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; KNL-NEXT: kshiftrw $5, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: ## kill: def $ax killed $ax killed $eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: zext_test2:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; SKX-NEXT: kshiftrw $5, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: ## kill: def $ax killed $ax killed $eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: zext_test2:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: andl $1, %eax
; AVX512BW-NEXT: ## kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: zext_test2:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: andl $1, %eax
; AVX512DQ-NEXT: ## kill: def $ax killed $ax killed $eax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: zext_test2:
; X86: ## %bb.0:
; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; X86-NEXT: kshiftrw $5, %k0, %k0
; X86-NEXT: kmovd %k0, %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i16
ret i16 %res
}
; zext_test3 - zero-extend one bit of a vector compare result to i8.
; Same compare and element extraction as zext_test1 (unsigned greater-than on
; <16 x i32>, element 5), but the i1 is zero-extended to i8. The expected
; code uses a byte-sized mask (andb $1, %al) instead of the 32-bit andl used
; by the i32 and i16 variants.
define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
; KNL-LABEL: zext_test3:
; KNL: ## %bb.0:
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; KNL-NEXT: kshiftrw $5, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
; KNL-NEXT: ## kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: zext_test3:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; SKX-NEXT: kshiftrw $5, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: ## kill: def $al killed $al killed $eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: zext_test3:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: andb $1, %al
; AVX512BW-NEXT: ## kill: def $al killed $al killed $eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: zext_test3:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: andb $1, %al
; AVX512DQ-NEXT: ## kill: def $al killed $al killed $eax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: zext_test3:
; X86: ## %bb.0:
; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; X86-NEXT: kshiftrw $5, %k0, %k0
; X86-NEXT: kmovd %k0, %eax
; X86-NEXT: andb $1, %al
; X86-NEXT: ## kill: def $al killed $al killed $eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i8
ret i8 %res
}
; conv1 - store and reload constant <8 x i1> masks.
; Stores an all-true <8 x i1> through %R, then stores a second constant mask
; (element 0 false, elements 1..7 true) into a stack slot, reloads it and
; returns it as an i8. Every run line expects both stores to be emitted as
; byte immediates ($-1 to the pointer, $-2 to the stack slot) and the return
; value to be materialized directly as $-2, with no mask-register use.
define i8 @conv1(<8 x i1>* %R) {
; CHECK-LABEL: conv1:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movb $-1, (%rdi)
; CHECK-NEXT: movb $-2, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $-2, %al
; CHECK-NEXT: retq
;
; X86-LABEL: conv1:
; X86: ## %bb.0: ## %entry
; X86-NEXT: subl $12, %esp
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb $-1, (%eax)
; X86-NEXT: movb $-2, (%esp)
; X86-NEXT: movb $-2, %al
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
entry:
store <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1>* %R
%maskPtr = alloca <8 x i1>
store <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1>* %maskPtr
%mask = load <8 x i1>, <8 x i1>* %maskPtr
%mask_convert = bitcast <8 x i1> %mask to i8
ret i8 %mask_convert
}
; test4 - signed compare of two <4 x i1> compare results.
; Computes (x > y) and (x1 > y1) as signed compares on <4 x i64>, compares the
; two <4 x i1> results with a signed greater-than, and sign-extends the
; <4 x i1> result to <4 x i32>. Every run line expects this to fold into a
; vpcmpleq whose result masks a following vpcmpgtq. Runs without +avx512vl
; (the KNL, AVX512BW and AVX512DQ prefixes) expect the operands to be widened
; to zmm registers; runs with +avx512vl operate on ymm registers directly.
define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) {
; KNL-LABEL: test4:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $ymm3 killed $ymm3 def $zmm3
; KNL-NEXT: ## kill: def $ymm2 killed $ymm2 def $zmm2
; KNL-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; KNL-NEXT: vpcmpleq %zmm1, %zmm0, %k1
; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 {%k1}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test4:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpleq %ymm1, %ymm0, %k1
; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1}
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test4:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: ## kill: def $ymm3 killed $ymm3 def $zmm3
; AVX512BW-NEXT: ## kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpcmpleq %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 {%k1}
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test4:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: ## kill: def $ymm3 killed $ymm3 def $zmm3
; AVX512DQ-NEXT: ## kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512DQ-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512DQ-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT: vpcmpleq %zmm1, %zmm0, %k1
; AVX512DQ-NEXT: vpcmpgtq %zmm3, %zmm2, %k0 {%k1}
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test4:
; X86: ## %bb.0:
; X86-NEXT: vpcmpleq %ymm1, %ymm0, %k1
; X86-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1}
; X86-NEXT: vpmovm2d %k0, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%x_gt_y = icmp sgt <4 x i64> %x, %y
%x1_gt_y1 = icmp sgt <4 x i64> %x1, %y1
%res = icmp sgt <4 x i1>%x_gt_y, %x1_gt_y1
%resse = sext <4 x i1>%res to <4 x i32>
ret <4 x i32> %resse
}
define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) {
; KNL-LABEL: test5:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $xmm3 killed $xmm3 def $zmm3
; KNL-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vpcmpleq %zmm3, %zmm2, %k1
; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 {%k1}
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test5:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpleq %xmm3, %xmm2, %k1
; SKX-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 {%k1}
; SKX-NEXT: vpmovm2q %k0, %xmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test5:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: ## kill: def $xmm3 killed $xmm3 def $zmm3
; AVX512BW-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpcmpleq %zmm3, %zmm2, %k1
; AVX512BW-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 {%k1}
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test5:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: ## kill: def $xmm3 killed $xmm3 def $zmm3
; AVX512DQ-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512DQ-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vpcmpleq %zmm3, %zmm2, %k1
; AVX512DQ-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 {%k1}
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test5:
; X86: ## %bb.0:
; X86-NEXT: vpcmpleq %xmm3, %xmm2, %k1
; X86-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT: vpmovm2q %k0, %xmm0
; X86-NEXT: retl
%x_gt_y = icmp slt <2 x i64> %x, %y
%x1_gt_y1 = icmp sgt <2 x i64> %x1, %y1
%res = icmp slt <2 x i1>%x_gt_y, %x1_gt_y1
%resse = sext <2 x i1>%res to <2 x i64>
ret <2 x i64> %resse
}
; test6 - ANDs a <16 x i1> mask argument with an alternating true/false
; constant (true in the even lanes), bitcasts the result to i16 and branches
; on whether it equals zero. Both successor blocks simply return void.
; NOTE(review): unlike the neighbouring functions, no autogenerated assertions
; are present for this function here; rerun utils/update_llc_test_checks.py
; to generate them rather than writing them by hand.
define void @test6(<16 x i1> %mask) {
allocas:
%a = and <16 x i1> %mask, <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
%b = bitcast <16 x i1> %a to i16
%c = icmp eq i16 %b, 0
br i1 %c, label %true, label %false
true:
ret void
false:
ret void
}
; test7 - ORs an <8 x i1> mask argument with an alternating true/false
; constant (true in the even lanes), bitcasts the result to i8 and branches on
; whether it equals zero. Both successor blocks simply return void, and the
; expected output contains no conditional jump. The alternating constant shows
; up in the expected output as the $85 (0x55) immediate of the orb.
define void @test7(<8 x i1> %mask) {
; KNL-LABEL: test7:
; KNL: ## %bb.0: ## %allocas
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: orb $85, %al
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test7:
; SKX: ## %bb.0: ## %allocas
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: orb $85, %al
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test7:
; AVX512BW: ## %bb.0: ## %allocas
; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: orb $85, %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test7:
; AVX512DQ: ## %bb.0: ## %allocas
; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: orb $85, %al
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test7:
; X86: ## %bb.0: ## %allocas
; X86-NEXT: vpsllw $15, %xmm0, %xmm0
; X86-NEXT: vpmovw2m %xmm0, %k0
; X86-NEXT: kmovd %k0, %eax
; X86-NEXT: orb $85, %al
; X86-NEXT: retl
; The entry block name "allocas" is echoed in the "## %allocas" comments of
; the expected output above, so it must not be renamed.
allocas:
%a= or <8 x i1> %mask, <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
%b = bitcast <8 x i1> %a to i8
%c = icmp eq i8 %b, 0
br i1 %c, label %true, label %false
true:
ret void
false:
ret void
}
; test8 - Picks one of two <16 x i32> compare masks with a scalar condition
; (%a1 > %b1, signed) and sign-extends the chosen <16 x i1> to <16 x i8>.
; The expected output lowers the select as a branch: one path computes the
; signed-greater-than-zero compare on %a, the other materializes an all-zero
; mask.
define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; KNL-LABEL: test8:
; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: jg LBB17_1
; KNL-NEXT: ## %bb.2:
; KNL-NEXT: kxorw %k0, %k0, %k1
; KNL-NEXT: jmp LBB17_3
; KNL-NEXT: LBB17_1:
; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; KNL-NEXT: LBB17_3:
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test8:
; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: jg LBB17_1
; SKX-NEXT: ## %bb.2:
; SKX-NEXT: kxorw %k0, %k0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB17_1:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test8:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: cmpl %esi, %edi
; AVX512BW-NEXT: jg LBB17_1
; AVX512BW-NEXT: ## %bb.2:
; AVX512BW-NEXT: kxorw %k0, %k0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB17_1:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test8:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: cmpl %esi, %edi
; AVX512DQ-NEXT: jg LBB17_1
; AVX512DQ-NEXT: ## %bb.2:
; AVX512DQ-NEXT: kxorw %k0, %k0, %k0
; AVX512DQ-NEXT: jmp LBB17_3
; AVX512DQ-NEXT: LBB17_1:
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; AVX512DQ-NEXT: LBB17_3:
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test8:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X86-NEXT: jg LBB17_1
; X86-NEXT: ## %bb.2:
; X86-NEXT: kxorw %k0, %k0, %k0
; X86-NEXT: vpmovm2b %k0, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
; X86-NEXT: LBB17_1:
; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; X86-NEXT: vpmovm2b %k0, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%cond = icmp sgt i32 %a1, %b1
%cmp1 = icmp sgt <16 x i32> %a, zeroinitializer
; An unsigned value is never below zero, so this mask is always all-false;
; the expected output builds it with a k-register xor instead of a compare.
%cmp2 = icmp ult <16 x i32> %b, zeroinitializer
%mix = select i1 %cond, <16 x i1> %cmp1, <16 x i1> %cmp2
%res = sext <16 x i1> %mix to <16 x i8>
ret <16 x i8> %res
}
; test9 - Selects between two <16 x i1> vector arguments using a scalar
; condition (%a1 > %b1, signed) and returns the chosen mask. The expected
; output lowers the select as a branch that picks the source register, then
; converts the chosen vector to a mask register and back.
define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
; KNL-LABEL: test9:
; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: jg LBB18_1
; KNL-NEXT: ## %bb.2:
; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
; KNL-NEXT: jmp LBB18_3
; KNL-NEXT: LBB18_1:
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: LBB18_3:
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test9:
; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: jg LBB18_1
; SKX-NEXT: ## %bb.2:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm0
; SKX-NEXT: jmp LBB18_3
; SKX-NEXT: LBB18_1:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: LBB18_3:
; SKX-NEXT: vpmovb2m %xmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test9:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: cmpl %esi, %edi
; AVX512BW-NEXT: jg LBB18_1
; AVX512BW-NEXT: ## %bb.2:
; AVX512BW-NEXT: vpsllw $7, %xmm1, %xmm0
; AVX512BW-NEXT: jmp LBB18_3
; AVX512BW-NEXT: LBB18_1:
; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX512BW-NEXT: LBB18_3:
; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test9:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: cmpl %esi, %edi
; AVX512DQ-NEXT: jg LBB18_1
; AVX512DQ-NEXT: ## %bb.2:
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm0
; AVX512DQ-NEXT: jmp LBB18_3
; AVX512DQ-NEXT: LBB18_1:
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: LBB18_3:
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test9:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X86-NEXT: jg LBB18_1
; X86-NEXT: ## %bb.2:
; X86-NEXT: vpsllw $7, %xmm1, %xmm0
; X86-NEXT: jmp LBB18_3
; X86-NEXT: LBB18_1:
; X86-NEXT: vpsllw $7, %xmm0, %xmm0
; X86-NEXT: LBB18_3:
; X86-NEXT: vpmovb2m %xmm0, %k0
; X86-NEXT: vpmovm2b %k0, %xmm0
; X86-NEXT: retl
%mask = icmp sgt i32 %a1, %b1
%c = select i1 %mask, <16 x i1>%a, <16 x i1>%b
ret <16 x i1>%c
}
; test10 - Same shape as test9 but for <8 x i1> arguments: selects between the
; two vector arguments using the scalar condition %a1 > %b1 (signed).
; NOTE(review): no autogenerated assertions are present for this function
; here; rerun utils/update_llc_test_checks.py to generate them rather than
; writing them by hand.
define <8 x i1> @test10(<8 x i1>%a, <8 x i1>%b, i32 %a1, i32 %b1) {
%mask = icmp sgt i32 %a1, %b1
%c = select i1 %mask, <8 x i1>%a, <8 x i1>%b
ret <8 x i1>%c
}
; test11 - Selects between two <4 x i1> vector arguments using a scalar
; condition (%a1 > %b1, signed) and returns the chosen mask. The expected
; output lowers the select as a branch that picks the source register; the
; runs without avx512vl operate on the full zmm register and then narrow back
; to xmm0.
define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) {
; KNL-LABEL: test11:
; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: jg LBB20_1
; KNL-NEXT: ## %bb.2:
; KNL-NEXT: vpslld $31, %xmm1, %xmm0
; KNL-NEXT: jmp LBB20_3
; KNL-NEXT: LBB20_1:
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: LBB20_3:
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test11:
; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: jg LBB20_1
; SKX-NEXT: ## %bb.2:
; SKX-NEXT: vpslld $31, %xmm1, %xmm0
; SKX-NEXT: jmp LBB20_3
; SKX-NEXT: LBB20_1:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: LBB20_3:
; SKX-NEXT: vpmovd2m %xmm0, %k0
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test11:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: cmpl %esi, %edi
; AVX512BW-NEXT: jg LBB20_1
; AVX512BW-NEXT: ## %bb.2:
; AVX512BW-NEXT: vpslld $31, %xmm1, %xmm0
; AVX512BW-NEXT: jmp LBB20_3
; AVX512BW-NEXT: LBB20_1:
; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512BW-NEXT: LBB20_3:
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test11:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: cmpl %esi, %edi
; AVX512DQ-NEXT: jg LBB20_1
; AVX512DQ-NEXT: ## %bb.2:
; AVX512DQ-NEXT: vpslld $31, %xmm1, %xmm0
; AVX512DQ-NEXT: jmp LBB20_3
; AVX512DQ-NEXT: LBB20_1:
; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512DQ-NEXT: LBB20_3:
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test11:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X86-NEXT: jg LBB20_1
; X86-NEXT: ## %bb.2:
; X86-NEXT: vpslld $31, %xmm1, %xmm0
; X86-NEXT: jmp LBB20_3
; X86-NEXT: LBB20_1:
; X86-NEXT: vpslld $31, %xmm0, %xmm0
; X86-NEXT: LBB20_3:
; X86-NEXT: vpmovd2m %xmm0, %k0
; X86-NEXT: vpmovm2d %k0, %xmm0
; X86-NEXT: retl
%mask = icmp sgt i32 %a1, %b1
%c = select i1 %mask, <4 x i1>%a, <4 x i1>%b
ret <4 x i1>%c
}
; test12 - Constant-folding test: bitcasts the constant 21845 (0x5555) to
; <16 x i1>, extracts element 0 and uses it as a select condition between %x
; and %y. Bit 0 of 0x5555 is set, and the expected output simply returns the
; first argument with no mask instructions.
define i32 @test12(i32 %x, i32 %y) {
; CHECK-LABEL: test12:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
;
; X86-LABEL: test12:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
%a = bitcast i16 21845 to <16 x i1>
%b = extractelement <16 x i1> %a, i32 0
%c = select i1 %b, i32 %x, i32 %y
ret i32 %c
}
; test13 - Constant-folding test like test12, but extracts element 3 of the
; <16 x i1> view of 21845 (0x5555). Bit 3 of 0x5555 is clear, and the expected
; 64-bit output returns the second argument (%esi) with no mask instructions.
define i32 @test13(i32 %x, i32 %y) {
; CHECK-LABEL: test13:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: retq
;
; X86-LABEL: test13:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
%a = bitcast i16 21845 to <16 x i1>
%b = extractelement <16 x i1> %a, i32 3
%c = select i1 %b, i32 %x, i32 %y
ret i32 %c
}
; Make sure we don't crash on a large vector.
; Same pattern as test13, but the constant is an i128 viewed as <128 x i1>.
; Element 3 is extracted and used as the select condition; the expected 64-bit
; output returns the first argument (%edi), i.e. the extract folds to true.
define i32 @test13_crash(i32 %x, i32 %y) {
; CHECK-LABEL: test13_crash:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
;
; X86-LABEL: test13_crash:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
%a = bitcast i128 2184568686868686868686868686 to <128 x i1>
%b = extractelement <128 x i1> %a, i32 3
%c = select i1 %b, i32 %x, i32 %y
ret i32 %c
}
; test14 - Constant-folding test: extracts element 2 of the <16 x i1> view of
; 21845 (0x5555) and inserts it at index 1 of the constant vector
; <true, false, false, true>. Bit 2 of 0x5555 is set, so the result is the
; constant <1, 1, 0, 1>, which the expected output loads directly.
define <4 x i1> @test14() {
; CHECK-LABEL: test14:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,0,1]
; CHECK-NEXT: retq
;
; X86-LABEL: test14:
; X86: ## %bb.0:
; X86-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,0,1]
; X86-NEXT: retl
%a = bitcast i16 21845 to <16 x i1>
%b = extractelement <16 x i1> %a, i32 2
%c = insertelement <4 x i1> <i1 true, i1 false, i1 false, i1 true>, i1 %b, i32 1
ret <4 x i1> %c
}
; test15 - Selects between two constant <16 x i1> masks (the bit patterns of
; 21845 = 0x5555 and of 1) using the scalar condition %x > %y (signed). The
; expected output performs the select in general-purpose registers with a cmov
; between the two immediates, then moves the result into a mask register and
; expands it to a byte vector.
define <16 x i1> @test15(i32 %x, i32 %y) {
; KNL-LABEL: test15:
; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: movl $21845, %eax ## imm = 0x5555
; KNL-NEXT: movl $1, %ecx
; KNL-NEXT: cmovgl %eax, %ecx
; KNL-NEXT: kmovw %ecx, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test15:
; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: movl $21845, %eax ## imm = 0x5555
; SKX-NEXT: movl $1, %ecx
; SKX-NEXT: cmovgl %eax, %ecx
; SKX-NEXT: kmovd %ecx, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test15:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: cmpl %esi, %edi
; AVX512BW-NEXT: movl $21845, %eax ## imm = 0x5555
; AVX512BW-NEXT: movl $1, %ecx
; AVX512BW-NEXT: cmovgl %eax, %ecx
; AVX512BW-NEXT: kmovd %ecx, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test15:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: cmpl %esi, %edi
; AVX512DQ-NEXT: movl $21845, %eax ## imm = 0x5555
; AVX512DQ-NEXT: movl $1, %ecx
; AVX512DQ-NEXT: cmovgl %eax, %ecx
; AVX512DQ-NEXT: kmovw %ecx, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test15:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl $21845, %eax ## imm = 0x5555
; X86-NEXT: movl $1, %ecx
; X86-NEXT: cmovgl %eax, %ecx
; X86-NEXT: kmovd %ecx, %k0
; X86-NEXT: vpmovm2b %k0, %xmm0
; X86-NEXT: retl
%a = bitcast i16 21845 to <16 x i1>
%b = bitcast i16 1 to <16 x i1>
%mask = icmp sgt i32 %x, %y
%c = select i1 %mask, <16 x i1> %a, <16 x i1> %b
ret <16 x i1> %c
}
; test16 - Reinterprets an i64 as <64 x i1>, forces element 5 to true with
; insertelement, and sign-extends the mask to <64 x i8>. In the expected
; output the runs with avx512bw handle the mask with 64-bit k-register
; operations, while the runs without it split the value into four 16-bit
; k-register pieces and assemble the result from two ymm halves.
define <64 x i8> @test16(i64 %x) {
;
; KNL-LABEL: test16:
; KNL: ## %bb.0:
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: movl %edi, %ecx
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: shrq $32, %rdi
; KNL-NEXT: shrq $48, %rax
; KNL-NEXT: shrl $16, %ecx
; KNL-NEXT: kmovw %ecx, %k1
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: kmovw %edi, %k3
; KNL-NEXT: movb $1, %al
; KNL-NEXT: kmovw %eax, %k4
; KNL-NEXT: kshiftrw $5, %k0, %k5
; KNL-NEXT: kxorw %k4, %k5, %k4
; KNL-NEXT: kshiftlw $15, %k4, %k4
; KNL-NEXT: kshiftrw $10, %k4, %k4
; KNL-NEXT: kxorw %k4, %k0, %k4
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test16:
; SKX: ## %bb.0:
; SKX-NEXT: kmovq %rdi, %k0
; SKX-NEXT: movb $1, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: kshiftrq $5, %k0, %k2
; SKX-NEXT: kxorq %k1, %k2, %k1
; SKX-NEXT: kshiftlq $63, %k1, %k1
; SKX-NEXT: kshiftrq $58, %k1, %k1
; SKX-NEXT: kxorq %k1, %k0, %k0
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test16:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k0
; AVX512BW-NEXT: movb $1, %al
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kshiftrq $5, %k0, %k2
; AVX512BW-NEXT: kxorq %k1, %k2, %k1
; AVX512BW-NEXT: kshiftlq $63, %k1, %k1
; AVX512BW-NEXT: kshiftrq $58, %k1, %k1
; AVX512BW-NEXT: kxorq %k1, %k0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test16:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: movq %rdi, %rax
; AVX512DQ-NEXT: movl %edi, %ecx
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: shrq $32, %rdi
; AVX512DQ-NEXT: shrq $48, %rax
; AVX512DQ-NEXT: shrl $16, %ecx
; AVX512DQ-NEXT: kmovw %ecx, %k1
; AVX512DQ-NEXT: kmovw %eax, %k2
; AVX512DQ-NEXT: kmovw %edi, %k3
; AVX512DQ-NEXT: movb $1, %al
; AVX512DQ-NEXT: kmovw %eax, %k4
; AVX512DQ-NEXT: kshiftrw $5, %k0, %k5
; AVX512DQ-NEXT: kxorw %k4, %k5, %k4
; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4
; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4
; AVX512DQ-NEXT: kxorw %k4, %k0, %k0
; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test16:
; X86: ## %bb.0:
; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0
; X86-NEXT: movb $1, %al
; X86-NEXT: kmovd %eax, %k1
; X86-NEXT: kshiftrq $5, %k0, %k2
; X86-NEXT: kxorq %k1, %k2, %k1
; X86-NEXT: kshiftlq $63, %k1, %k1
; X86-NEXT: kshiftrq $58, %k1, %k1
; X86-NEXT: kxorq %k1, %k0, %k0
; X86-NEXT: vpmovm2b %k0, %zmm0
; X86-NEXT: retl
%a = bitcast i64 %x to <64 x i1>
; Index 5 matches the shift-by-5 / shift-back sequence in the expected output.
%b = insertelement <64 x i1>%a, i1 true, i32 5
%c = sext <64 x i1>%b to <64 x i8>
ret <64 x i8>%c
}
; test17 - Variant of test16 where the bit inserted at element 5 of the
; <64 x i1> view of %x is not a constant but the result of the scalar compare
; %y > %z (signed). The mask is then sign-extended to <64 x i8>. The expected
; output computes the compare with cmpl/setg and moves it into a k register
; before merging it into the mask.
define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
;
; KNL-LABEL: test17:
; KNL: ## %bb.0:
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: movl %edi, %ecx
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: shrq $32, %rdi
; KNL-NEXT: shrq $48, %rax
; KNL-NEXT: shrl $16, %ecx
; KNL-NEXT: kmovw %ecx, %k1
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: kmovw %edi, %k3
; KNL-NEXT: cmpl %edx, %esi
; KNL-NEXT: setg %al
; KNL-NEXT: kshiftrw $5, %k0, %k4
; KNL-NEXT: kmovw %eax, %k5
; KNL-NEXT: kxorw %k5, %k4, %k4
; KNL-NEXT: kshiftlw $15, %k4, %k4
; KNL-NEXT: kshiftrw $10, %k4, %k4
; KNL-NEXT: kxorw %k4, %k0, %k4
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test17:
; SKX: ## %bb.0:
; SKX-NEXT: kmovq %rdi, %k0
; SKX-NEXT: cmpl %edx, %esi
; SKX-NEXT: setg %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: kshiftrq $5, %k0, %k2
; SKX-NEXT: kxorq %k1, %k2, %k1
; SKX-NEXT: kshiftlq $63, %k1, %k1
; SKX-NEXT: kshiftrq $58, %k1, %k1
; SKX-NEXT: kxorq %k1, %k0, %k0
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test17:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k0
; AVX512BW-NEXT: cmpl %edx, %esi
; AVX512BW-NEXT: setg %al
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kshiftrq $5, %k0, %k2
; AVX512BW-NEXT: kxorq %k1, %k2, %k1
; AVX512BW-NEXT: kshiftlq $63, %k1, %k1
; AVX512BW-NEXT: kshiftrq $58, %k1, %k1
; AVX512BW-NEXT: kxorq %k1, %k0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test17:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: movq %rdi, %rax
; AVX512DQ-NEXT: movl %edi, %ecx
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: shrq $32, %rdi
; AVX512DQ-NEXT: shrq $48, %rax
; AVX512DQ-NEXT: shrl $16, %ecx
; AVX512DQ-NEXT: kmovw %ecx, %k1
; AVX512DQ-NEXT: kmovw %eax, %k2
; AVX512DQ-NEXT: kmovw %edi, %k3
; AVX512DQ-NEXT: cmpl %edx, %esi
; AVX512DQ-NEXT: setg %al
; AVX512DQ-NEXT: kshiftrw $5, %k0, %k4
; AVX512DQ-NEXT: kmovw %eax, %k5
; AVX512DQ-NEXT: kxorw %k5, %k4, %k4
; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4
; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4
; AVX512DQ-NEXT: kxorw %k4, %k0, %k0
; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test17:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X86-NEXT: setg %al
; X86-NEXT: kmovd %eax, %k1
; X86-NEXT: kshiftrq $5, %k0, %k2
; X86-NEXT: kxorq %k1, %k2, %k1
; X86-NEXT: kshiftlq $63, %k1, %k1
; X86-NEXT: kshiftrq $58, %k1, %k1
; X86-NEXT: kxorq %k1, %k0, %k0
; X86-NEXT: vpmovm2b %k0, %zmm0
; X86-NEXT: retl
%a = bitcast i64 %x to <64 x i1>
%b = icmp sgt i32 %y, %z
; Index 5 matches the shift-by-5 / shift-back sequence in the expected output.
%c = insertelement <64 x i1>%a, i1 %b, i32 5
%d = sext <64 x i1>%c to <64 x i8>
ret <64 x i8>%d
}
; test18 takes the <8 x i1> view of %a and overwrites two of its lanes with
; lanes taken from the <16 x i1> view of %y: lane 7 receives lane 8 of %y and
; lane 6 receives lane 9 of %y.
; The autogenerated assertions below pin, for every RUN configuration, the
; mask-register sequence (shifts, xor and or on k registers) that llc emits
; for the two lane inserts.
define <8 x i1> @test18(i8 %a, i16 %y) {
; KNL-LABEL: test18:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: kmovw %esi, %k2
; KNL-NEXT: kshiftrw $8, %k2, %k0
; KNL-NEXT: kshiftrw $9, %k2, %k2
; KNL-NEXT: kshiftrw $6, %k1, %k3
; KNL-NEXT: kxorw %k2, %k3, %k2
; KNL-NEXT: kshiftlw $15, %k2, %k2
; KNL-NEXT: kshiftrw $9, %k2, %k2
; KNL-NEXT: kxorw %k2, %k1, %k1
; KNL-NEXT: kshiftlw $9, %k1, %k1
; KNL-NEXT: kshiftrw $9, %k1, %k1
; KNL-NEXT: kshiftlw $7, %k0, %k0
; KNL-NEXT: korw %k0, %k1, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test18:
; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: kmovd %esi, %k2
; SKX-NEXT: kshiftrw $8, %k2, %k0
; SKX-NEXT: kshiftrw $9, %k2, %k2
; SKX-NEXT: kshiftrb $6, %k1, %k3
; SKX-NEXT: kxorb %k2, %k3, %k2
; SKX-NEXT: kshiftlb $7, %k2, %k2
; SKX-NEXT: kshiftrb $1, %k2, %k2
; SKX-NEXT: kxorb %k2, %k1, %k1
; SKX-NEXT: kshiftlb $1, %k1, %k1
; SKX-NEXT: kshiftrb $1, %k1, %k1
; SKX-NEXT: kshiftlb $7, %k0, %k0
; SKX-NEXT: korb %k0, %k1, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test18:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: kmovd %esi, %k2
; AVX512BW-NEXT: kshiftrw $8, %k2, %k0
; AVX512BW-NEXT: kshiftrw $9, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k1, %k3
; AVX512BW-NEXT: kxorw %k2, %k3, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k2, %k2
; AVX512BW-NEXT: kxorw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $9, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
; AVX512BW-NEXT: kshiftlw $7, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test18:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: kmovw %esi, %k2
; AVX512DQ-NEXT: kshiftrw $8, %k2, %k0
; AVX512DQ-NEXT: kshiftrw $9, %k2, %k2
; AVX512DQ-NEXT: kshiftrb $6, %k1, %k3
; AVX512DQ-NEXT: kxorb %k2, %k3, %k2
; AVX512DQ-NEXT: kshiftlb $7, %k2, %k2
; AVX512DQ-NEXT: kshiftrb $1, %k2, %k2
; AVX512DQ-NEXT: kxorb %k2, %k1, %k1
; AVX512DQ-NEXT: kshiftlb $1, %k1, %k1
; AVX512DQ-NEXT: kshiftrb $1, %k1, %k1
; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0
; AVX512DQ-NEXT: korb %k0, %k1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test18:
; X86: ## %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: kshiftrw $9, %k1, %k2
; X86-NEXT: kshiftrw $8, %k1, %k1
; X86-NEXT: kshiftlb $7, %k1, %k1
; X86-NEXT: kshiftrb $6, %k0, %k3
; X86-NEXT: kxorb %k2, %k3, %k2
; X86-NEXT: kshiftlb $7, %k2, %k2
; X86-NEXT: kshiftrb $1, %k2, %k2
; X86-NEXT: kxorb %k2, %k0, %k0
; X86-NEXT: kshiftlb $1, %k0, %k0
; X86-NEXT: kshiftrb $1, %k0, %k0
; X86-NEXT: korb %k1, %k0, %k0
; X86-NEXT: vpmovm2w %k0, %xmm0
; X86-NEXT: retl
%b = bitcast i8 %a to <8 x i1> ; 8-lane mask view of %a
%b1 = bitcast i16 %y to <16 x i1> ; 16-lane mask view of %y
%el1 = extractelement <16 x i1>%b1, i32 8 ; lane 8 of the %y mask
%el2 = extractelement <16 x i1>%b1, i32 9 ; lane 9 of the %y mask
%c = insertelement <8 x i1>%b, i1 %el1, i32 7 ; lane 7 <- %el1
%d = insertelement <8 x i1>%c, i1 %el2, i32 6 ; lane 6 <- %el2
ret <8 x i1>%d
}
; test21 selects between %x and an all-zero vector per lane under a <32 x i1>
; mask, i.e. lanes whose mask bit is clear become zero.
; In the configurations that enable avx512bw the assertions expect the mask to
; be moved into a k register (vpsllw $7 + vpmovb2m) and applied with a single
; zero-masked vmovdqu16. In the configurations without avx512bw they expect
; the mask bytes to be widened to words (vpmovzxbw), turned into all-ones or
; all-zero words (vpsllw/vpsraw $15) and ANDed into each ymm half of %x.
define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
; KNL-LABEL: test21:
; KNL: ## %bb.0:
; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; KNL-NEXT: vpsllw $15, %ymm2, %ymm2
; KNL-NEXT: vpsraw $15, %ymm2, %ymm2
; KNL-NEXT: vpand %ymm0, %ymm2, %ymm0
; KNL-NEXT: vpsllw $15, %ymm3, %ymm2
; KNL-NEXT: vpsraw $15, %ymm2, %ymm2
; KNL-NEXT: vpand %ymm1, %ymm2, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: test21:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %ymm1, %ymm1
; SKX-NEXT: vpmovb2m %ymm1, %k1
; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test21:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $7, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test21:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512DQ-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512DQ-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: vpsllw $15, %ymm3, %ymm2
; AVX512DQ-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512DQ-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test21:
; X86: ## %bb.0:
; X86-NEXT: vpsllw $7, %ymm1, %ymm1
; X86-NEXT: vpmovb2m %ymm1, %k1
; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ; zero the lanes whose mask bit is clear
ret <32 x i16> %ret
}
; test22 stores a <4 x i1> argument straight to memory.
; Every configuration is expected to move each i1 into the sign bit of its
; 32-bit lane (vpslld $31) and turn the vector into a k mask (vptestmd, or
; vpmovd2m where avx512dq is enabled). Configurations with avx512dq then store
; the mask with kmovb; the others copy it to a GPR and store one byte with movb.
define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
; KNL-LABEL: test22:
; KNL: ## %bb.0:
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test22:
; SKX: ## %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vpmovd2m %xmm0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test22:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test22:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test22:
; X86: ## %bb.0:
; X86-NEXT: vpslld $31, %xmm0, %xmm0
; X86-NEXT: vpmovd2m %xmm0, %k0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb %k0, (%eax)
; X86-NEXT: retl
store <4 x i1> %a, <4 x i1>* %addr
ret void
}
; test23 stores a <2 x i1> argument straight to memory.
; Every configuration is expected to move each i1 into the sign bit of its
; 64-bit lane (vpsllq $63) and turn the vector into a k mask (vptestmq, or
; vpmovq2m where avx512dq is enabled). Configurations with avx512dq then store
; the mask with kmovb; the others copy it to a GPR and store one byte with movb.
define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
; KNL-LABEL: test23:
; KNL: ## %bb.0:
; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test23:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX-NEXT: vpmovq2m %xmm0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test23:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test23:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test23:
; X86: ## %bb.0:
; X86-NEXT: vpsllq $63, %xmm0, %xmm0
; X86-NEXT: vpmovq2m %xmm0, %k0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb %k0, (%eax)
; X86-NEXT: retl
store <2 x i1> %a, <2 x i1>* %addr
ret void
}
; store_v1i1 inverts a single-lane i1 vector (xor with <i1 1>) and stores the
; result through %ptr with 4-byte alignment.
; The assertions expect the inversion to be done on mask registers: kxnorw of
; a register with itself produces all ones, and kxorw against that flips the
; value. Configurations with avx512dq store the mask with kmovb; the others
; copy it to a GPR and store one byte with movb.
define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
; KNL-LABEL: store_v1i1:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kxnorw %k0, %k0, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rsi)
; KNL-NEXT: retq
;
; SKX-LABEL: store_v1i1:
; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: kxorw %k1, %k0, %k0
; SKX-NEXT: kmovb %k0, (%rsi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_v1i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kxnorw %k0, %k0, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_v1i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1
; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rsi)
; AVX512DQ-NEXT: retq
;
; X86-LABEL: store_v1i1:
; X86: ## %bb.0:
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kxnorw %k0, %k0, %k1
; X86-NEXT: kxorw %k1, %k0, %k0
; X86-NEXT: kmovb %k0, (%eax)
; X86-NEXT: retl
%x = xor <1 x i1> %c, <i1 1> ; logical NOT of the single lane
store <1 x i1> %x, <1 x i1>* %ptr, align 4
ret void
}
; store_v2i1 inverts both lanes of a <2 x i1> vector (xor with all ones) and
; stores the result through %ptr with 4-byte alignment.
; Every configuration is expected to move each i1 into the sign bit of its
; 64-bit lane (vpsllq $63), convert the vector to a k mask (vptestmq, or
; vpmovq2m where avx512dq is enabled) and invert it with knotw.
; Configurations with avx512dq store the mask with kmovb; the others copy it
; to a GPR and store one byte with movb.
define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
; KNL-LABEL: store_v2i1:
; KNL: ## %bb.0:
; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_v2i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX-NEXT: vpmovq2m %xmm0, %k0
; SKX-NEXT: knotw %k0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_v2i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512BW-NEXT: knotw %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_v2i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
; AVX512DQ-NEXT: knotw %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: store_v2i1:
; X86: ## %bb.0:
; X86-NEXT: vpsllq $63, %xmm0, %xmm0
; X86-NEXT: vpmovq2m %xmm0, %k0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: knotw %k0, %k0
; X86-NEXT: kmovb %k0, (%eax)
; X86-NEXT: retl
%x = xor <2 x i1> %c, <i1 1, i1 1>
store <2 x i1> %x, <2 x i1>* %ptr, align 4
ret void
}
define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
; KNL-LABEL: store_v4i1:
; KNL: ## %bb.0:
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_v4i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vpmovd2m %xmm0, %k0
; SKX-NEXT: knotw %k0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_v4i1:
; AVX512BW: ## %bb.0:
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512BW-NEXT: knotw %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_v4i1:
; AVX512DQ: ## %bb.0:
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512DQ-NEXT: knotw %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: store_v4i1:
; X86: ## %bb.0:
; X86-NEXT: vpslld $31, %xmm0, %xmm0
; X86-NEXT: vpmovd2m %xmm0, %k0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: knotw %k0, %k0
; X86-NEXT: kmovb %k0, (%eax)
; X86-NEXT: retl
%x = xor <4 x i1> %c, <i1 1, i1 1, i1 1, i1 1>
store <4 x i1> %x, <4 x i1>* %ptr, align 4
ret void
}
; Inverts an <8 x i1> mask (xor with all-ones) and stores the result through
; %ptr. The expected output performs the inversion in a mask register (knotb
; in the runs that enable DQ, knotw in the others) and then writes one byte.
define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
; KNL-LABEL: store_v8i1:
; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_v8i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k0
; SKX-NEXT: knotb %k0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_v8i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: knotw %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_v8i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
; AVX512DQ-NEXT: knotb %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: store_v8i1:
; X86: ## %bb.0:
; X86-NEXT: vpsllw $15, %xmm0, %xmm0
; X86-NEXT: vpmovw2m %xmm0, %k0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: knotb %k0, %k0
; X86-NEXT: kmovb %k0, (%eax)
; X86-NEXT: retl
; xor with an all-ones vector is the IR spelling of a bitwise NOT of the mask.
%x = xor <8 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
store <8 x i1> %x, <8 x i1>* %ptr, align 4
ret void
}
; Inverts a <16 x i1> mask (xor with all-ones) and stores the result through
; %ptr. All five runs expect knotw on the mask register followed by a kmovw
; store straight to memory; they differ only in how the mask is formed from
; the incoming vector argument.
define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
; KNL-LABEL: store_v16i1:
; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_v16i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k0
; SKX-NEXT: knotw %k0, %k0
; SKX-NEXT: kmovw %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_v16i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: knotw %k0, %k0
; AVX512BW-NEXT: kmovw %k0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_v16i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: knotw %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: store_v16i1:
; X86: ## %bb.0:
; X86-NEXT: vpsllw $7, %xmm0, %xmm0
; X86-NEXT: vpmovb2m %xmm0, %k0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: knotw %k0, %k0
; X86-NEXT: kmovw %k0, (%eax)
; X86-NEXT: retl
; xor with an all-ones vector is the IR spelling of a bitwise NOT of the mask.
%x = xor <16 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
store <16 x i1> %x, <16 x i1>* %ptr, align 4
ret void
}
; Scalar i1 test: toggles the i1 global @f1.v and passes the new value,
; zero-extended to i32, to @f2 in a tail call. The expected code handles the
; flag as a byte (movzbl / xorl $1 / movb). The 64-bit runs turn the call into
; a tail jump, while the i686 run passes the argument on the stack and uses a
; regular call. The C snippet that follows is the source this IR came from.
;void f2(int);
;void f1(int c)
;{
; static int v = 0;
; if (v == 0)
; v = 1;
; else
; v = 0;
; f2(v);
;}
; The function-local static 'v' from the snippet, represented here as an i1.
@f1.v = internal unnamed_addr global i1 false, align 4
define void @f1(i32 %c) {
; CHECK-LABEL: f1:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movzbl {{.*}}(%rip), %edi
; CHECK-NEXT: xorl $1, %edi
; CHECK-NEXT: movb %dil, {{.*}}(%rip)
; CHECK-NEXT: jmp _f2 ## TAILCALL
;
; X86-LABEL: f1:
; X86: ## %bb.0: ## %entry
; X86-NEXT: subl $12, %esp
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: movzbl _f1.v, %eax
; X86-NEXT: xorl $1, %eax
; X86-NEXT: movb %al, _f1.v
; X86-NEXT: movl %eax, (%esp)
; X86-NEXT: calll _f2
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
entry:
; Load the flag, flip it with xor true, and write it back.
%.b1 = load i1, i1* @f1.v, align 4
%not..b1 = xor i1 %.b1, true
store i1 %not..b1, i1* @f1.v, align 4
; Widen the flipped flag to the i32 that @f2 takes.
%0 = zext i1 %not..b1 to i32
tail call void @f2(i32 %0) #2
ret void
}
; External callee used by @f1; only declared in this file.
declare void @f2(i32) #1
; Truncates an i16 argument to i1 and stores that bit through %y. The expected
; code keeps only bit 0 (andl $1) and writes a single byte to memory.
define void @store_i16_i1(i16 %x, i1 *%y) {
; CHECK-LABEL: store_i16_i1:
; CHECK: ## %bb.0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: movb %dil, (%rsi)
; CHECK-NEXT: retq
;
; X86-LABEL: store_i16_i1:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: andl $1, %ecx
; X86-NEXT: movb %cl, (%eax)
; X86-NEXT: retl
%c = trunc i16 %x to i1
store i1 %c, i1* %y
ret void
}
; Truncates an i8 argument to i1 and stores that bit through %y. The expected
; code keeps only bit 0 (andl $1 in the 64-bit runs, andb $1 in the i686 run)
; and writes a single byte to memory.
define void @store_i8_i1(i8 %x, i1 *%y) {
; CHECK-LABEL: store_i8_i1:
; CHECK: ## %bb.0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: movb %dil, (%rsi)
; CHECK-NEXT: retq
;
; X86-LABEL: store_i8_i1:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: andb $1, %cl
; X86-NEXT: movb %cl, (%eax)
; X86-NEXT: retl
%c = trunc i8 %x to i1
store i1 %c, i1* %y
ret void
}
; Selects between %x and zero using a constant <32 x i1> condition. In the
; runs that enable BW the condition is expected to be materialized as the
; 32-bit immediate 0x59455495, moved into k1, and applied with a zero-masking
; vmovdqu16. In the runs without BW the expected code is two 256-bit vandps
; instructions with RIP-relative memory operands.
define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
; KNL-LABEL: test_build_vec_v32i1:
; KNL: ## %bb.0:
; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v32i1:
; SKX: ## %bb.0:
; SKX-NEXT: movl $1497715861, %eax ## imm = 0x59455495
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_build_vec_v32i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movl $1497715861, %eax ## imm = 0x59455495
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_build_vec_v32i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_build_vec_v32i1:
; X86: ## %bb.0:
; X86-NEXT: movl $1497715861, %eax ## imm = 0x59455495
; X86-NEXT: kmovd %eax, %k1
; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%ret = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <32 x i16> %x, <32 x i16> zeroinitializer
ret <32 x i16> %ret
}
; Selects between %x and zero using a constant <64 x i1> condition. In the
; runs that enable BW the whole select is expected to become one vpshufb that
; keeps the selected bytes in place and zeroes the rest. In the runs without
; BW the expected code is two 256-bit vandps instructions with RIP-relative
; memory operands.
define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
; KNL-LABEL: test_build_vec_v64i1:
; KNL: ## %bb.0:
; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v64i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_build_vec_v64i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_build_vec_v64i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_build_vec_v64i1:
; X86: ## %bb.0:
; X86-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero
; X86-NEXT: retl
%ret = select <64 x i1> <i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <64 x i8> %x, <64 x i8> zeroinitializer
ret <64 x i8> %ret
}
; Branches on whether an <8 x i1> compare result is all zero.
; %in is compared against two overlapping <8 x double> loads taken at element
; offsets 0 and 1 from %base. The second load is zeroed where the first compare
; is false, the two compare masks are ANDed, and the result is bitcast to i8
; and tested against zero. The AND is expected to fold into the write mask of
; the second vcmpltpd. Runs that enable DQ expect kortestb on the mask
; register; the other runs move the mask to a GPR and use testb.
; The block names L1 and L2 appear in the expected-output comments, so they
; must not be renamed.
define void @ktest_1(<8 x double> %in, double * %base) {
; KNL-LABEL: ktest_1:
; KNL: ## %bb.0:
; KNL-NEXT: vmovupd (%rdi), %zmm1
; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; KNL-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
; KNL-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb %al, %al
; KNL-NEXT: je LBB42_2
; KNL-NEXT: ## %bb.1: ## %L1
; KNL-NEXT: vmovapd %zmm0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB42_2: ## %L2
; KNL-NEXT: vmovapd %zmm0, 8(%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: ktest_1:
; SKX: ## %bb.0:
; SKX-NEXT: vmovupd (%rdi), %zmm1
; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
; SKX-NEXT: kortestb %k0, %k0
; SKX-NEXT: je LBB42_2
; SKX-NEXT: ## %bb.1: ## %L1
; SKX-NEXT: vmovapd %zmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB42_2: ## %L2
; SKX-NEXT: vmovapd %zmm0, 8(%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: ktest_1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vmovupd (%rdi), %zmm1
; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; AVX512BW-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testb %al, %al
; AVX512BW-NEXT: je LBB42_2
; AVX512BW-NEXT: ## %bb.1: ## %L1
; AVX512BW-NEXT: vmovapd %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB42_2: ## %L2
; AVX512BW-NEXT: vmovapd %zmm0, 8(%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: ktest_1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vmovupd (%rdi), %zmm1
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; AVX512DQ-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
; AVX512DQ-NEXT: kortestb %k0, %k0
; AVX512DQ-NEXT: je LBB42_2
; AVX512DQ-NEXT: ## %bb.1: ## %L1
; AVX512DQ-NEXT: vmovapd %zmm0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB42_2: ## %L2
; AVX512DQ-NEXT: vmovapd %zmm0, 8(%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: ktest_1:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovupd (%eax), %zmm1
; X86-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; X86-NEXT: vmovupd 8(%eax), %zmm1 {%k1} {z}
; X86-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
; X86-NEXT: kortestb %k0, %k0
; X86-NEXT: je LBB42_2
; X86-NEXT: ## %bb.1: ## %L1
; X86-NEXT: vmovapd %zmm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
; X86-NEXT: LBB42_2: ## %L2
; X86-NEXT: vmovapd %zmm0, 8(%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
; Two vector views of the same buffer, one double (8 bytes) apart.
%addr1 = getelementptr double, double * %base, i64 0
%addr2 = getelementptr double, double * %base, i64 1
%vaddr1 = bitcast double* %addr1 to <8 x double>*
%vaddr2 = bitcast double* %addr2 to <8 x double>*
%val1 = load <8 x double>, <8 x double> *%vaddr1, align 1
%val2 = load <8 x double>, <8 x double> *%vaddr2, align 1
; First compare; its result also zeroes the unselected lanes of %val2.
%sel1 = fcmp ogt <8 x double>%in, %val1
%val3 = select <8 x i1> %sel1, <8 x double> %val2, <8 x double> zeroinitializer
%sel2 = fcmp olt <8 x double> %in, %val3
; Combine both masks and branch on "no lane set".
%sel3 = and <8 x i1> %sel1, %sel2
%int_sel3 = bitcast <8 x i1> %sel3 to i8
%res = icmp eq i8 %int_sel3, zeroinitializer
br i1 %res, label %L2, label %L1
L1:
store <8 x double> %in, <8 x double>* %vaddr1
br label %End
L2:
store <8 x double> %in, <8 x double>* %vaddr2
br label %End
End:
ret void
}
; Branches on whether a <32 x i1> compare result is all zero.
; Same shape as the 8-lane double variant, but with <32 x float> operands (two
; zmm registers per value), loads one float (4 bytes) apart, and the two
; compare masks combined with OR instead of AND. In the runs that enable BW,
; the 16-bit halves are expected to be joined with kunpckwd and the OR and
; zero test done by a single kortestd. In the runs without BW, each half is
; ORed with korw, both are moved to GPRs and merged with shll $16 / orl, and
; the branch uses the flags set by that orl.
; The block names L1 and L2 appear in the expected-output comments, so they
; must not be renamed.
define void @ktest_2(<32 x float> %in, float * %base) {
;
; KNL-LABEL: ktest_2:
; KNL: ## %bb.0:
; KNL-NEXT: vmovups (%rdi), %zmm2
; KNL-NEXT: vmovups 64(%rdi), %zmm3
; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k1
; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k2
; KNL-NEXT: vmovups 4(%rdi), %zmm2 {%k2} {z}
; KNL-NEXT: vmovups 68(%rdi), %zmm3 {%k1} {z}
; KNL-NEXT: vcmpltps %zmm3, %zmm1, %k0
; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k3
; KNL-NEXT: korw %k3, %k2, %k2
; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: korw %k0, %k1, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: je LBB43_2
; KNL-NEXT: ## %bb.1: ## %L1
; KNL-NEXT: vmovaps %zmm0, (%rdi)
; KNL-NEXT: vmovaps %zmm1, 64(%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB43_2: ## %L2
; KNL-NEXT: vmovaps %zmm0, 4(%rdi)
; KNL-NEXT: vmovaps %zmm1, 68(%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: ktest_2:
; SKX: ## %bb.0:
; SKX-NEXT: vmovups (%rdi), %zmm2
; SKX-NEXT: vmovups 64(%rdi), %zmm3
; SKX-NEXT: vcmpltps %zmm0, %zmm2, %k1
; SKX-NEXT: vcmpltps %zmm1, %zmm3, %k2
; SKX-NEXT: kunpckwd %k1, %k2, %k0
; SKX-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z}
; SKX-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z}
; SKX-NEXT: vcmpltps %zmm3, %zmm0, %k1
; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2
; SKX-NEXT: kunpckwd %k1, %k2, %k1
; SKX-NEXT: kortestd %k1, %k0
; SKX-NEXT: je LBB43_2
; SKX-NEXT: ## %bb.1: ## %L1
; SKX-NEXT: vmovaps %zmm0, (%rdi)
; SKX-NEXT: vmovaps %zmm1, 64(%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB43_2: ## %L2
; SKX-NEXT: vmovaps %zmm0, 4(%rdi)
; SKX-NEXT: vmovaps %zmm1, 68(%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: ktest_2:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vmovups (%rdi), %zmm2
; AVX512BW-NEXT: vmovups 64(%rdi), %zmm3
; AVX512BW-NEXT: vcmpltps %zmm0, %zmm2, %k1
; AVX512BW-NEXT: vcmpltps %zmm1, %zmm3, %k2
; AVX512BW-NEXT: kunpckwd %k1, %k2, %k0
; AVX512BW-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z}
; AVX512BW-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z}
; AVX512BW-NEXT: vcmpltps %zmm3, %zmm0, %k1
; AVX512BW-NEXT: vcmpltps %zmm2, %zmm1, %k2
; AVX512BW-NEXT: kunpckwd %k1, %k2, %k1
; AVX512BW-NEXT: kortestd %k1, %k0
; AVX512BW-NEXT: je LBB43_2
; AVX512BW-NEXT: ## %bb.1: ## %L1
; AVX512BW-NEXT: vmovaps %zmm0, (%rdi)
; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB43_2: ## %L2
; AVX512BW-NEXT: vmovaps %zmm0, 4(%rdi)
; AVX512BW-NEXT: vmovaps %zmm1, 68(%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: ktest_2:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vmovups (%rdi), %zmm2
; AVX512DQ-NEXT: vmovups 64(%rdi), %zmm3
; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k1
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k2
; AVX512DQ-NEXT: vmovups 4(%rdi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT: vmovups 68(%rdi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vcmpltps %zmm3, %zmm1, %k0
; AVX512DQ-NEXT: vcmpltps %zmm2, %zmm0, %k3
; AVX512DQ-NEXT: korw %k3, %k2, %k2
; AVX512DQ-NEXT: kmovw %k2, %eax
; AVX512DQ-NEXT: korw %k0, %k1, %k0
; AVX512DQ-NEXT: kmovw %k0, %ecx
; AVX512DQ-NEXT: shll $16, %ecx
; AVX512DQ-NEXT: orl %eax, %ecx
; AVX512DQ-NEXT: je LBB43_2
; AVX512DQ-NEXT: ## %bb.1: ## %L1
; AVX512DQ-NEXT: vmovaps %zmm0, (%rdi)
; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB43_2: ## %L2
; AVX512DQ-NEXT: vmovaps %zmm0, 4(%rdi)
; AVX512DQ-NEXT: vmovaps %zmm1, 68(%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: ktest_2:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovups (%eax), %zmm2
; X86-NEXT: vmovups 64(%eax), %zmm3
; X86-NEXT: vcmpltps %zmm0, %zmm2, %k1
; X86-NEXT: vcmpltps %zmm1, %zmm3, %k2
; X86-NEXT: kunpckwd %k1, %k2, %k0
; X86-NEXT: vmovups 68(%eax), %zmm2 {%k2} {z}
; X86-NEXT: vmovups 4(%eax), %zmm3 {%k1} {z}
; X86-NEXT: vcmpltps %zmm3, %zmm0, %k1
; X86-NEXT: vcmpltps %zmm2, %zmm1, %k2
; X86-NEXT: kunpckwd %k1, %k2, %k1
; X86-NEXT: kortestd %k1, %k0
; X86-NEXT: je LBB43_2
; X86-NEXT: ## %bb.1: ## %L1
; X86-NEXT: vmovaps %zmm0, (%eax)
; X86-NEXT: vmovaps %zmm1, 64(%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
; X86-NEXT: LBB43_2: ## %L2
; X86-NEXT: vmovaps %zmm0, 4(%eax)
; X86-NEXT: vmovaps %zmm1, 68(%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
; Two vector views of the same buffer, one float (4 bytes) apart.
%addr1 = getelementptr float, float * %base, i64 0
%addr2 = getelementptr float, float * %base, i64 1
%vaddr1 = bitcast float* %addr1 to <32 x float>*
%vaddr2 = bitcast float* %addr2 to <32 x float>*
%val1 = load <32 x float>, <32 x float> *%vaddr1, align 1
%val2 = load <32 x float>, <32 x float> *%vaddr2, align 1
; First compare; its result also zeroes the unselected lanes of %val2.
%sel1 = fcmp ogt <32 x float>%in, %val1
%val3 = select <32 x i1> %sel1, <32 x float> %val2, <32 x float> zeroinitializer
%sel2 = fcmp olt <32 x float> %in, %val3
; Combine both masks with OR and branch on "no lane set".
%sel3 = or <32 x i1> %sel1, %sel2
%int_sel3 = bitcast <32 x i1> %sel3 to i32
%res = icmp eq i32 %int_sel3, zeroinitializer
br i1 %res, label %L2, label %L1
L1:
store <32 x float> %in, <32 x float>* %vaddr1
br label %End
L2:
store <32 x float> %in, <32 x float>* %vaddr2
br label %End
End:
ret void
}
; Loads an <8 x i1> from memory and sign-extends it to <8 x i64>.
; Runs that enable DQ expect the mask to be loaded directly with kmovb and
; expanded with vpmovm2q. The other runs load the byte into a GPR (movzbl),
; move it to k1, and build the all-ones lanes with a zero-masking
; vpternlogq $255.
define <8 x i64> @load_8i1(<8 x i1>* %a) {
; KNL-LABEL: load_8i1:
; KNL: ## %bb.0:
; KNL-NEXT: movzbl (%rdi), %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: load_8i1:
; SKX: ## %bb.0:
; SKX-NEXT: kmovb (%rdi), %k0
; SKX-NEXT: vpmovm2q %k0, %zmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: load_8i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movzbl (%rdi), %eax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: load_8i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: load_8i1:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb (%eax), %k0
; X86-NEXT: vpmovm2q %k0, %zmm0
; X86-NEXT: retl
%b = load <8 x i1>, <8 x i1>* %a
%c = sext <8 x i1> %b to <8 x i64>
ret <8 x i64> %c
}
; Loads a <16 x i1> from memory and sign-extends it to <16 x i32>.
; Every run expects the mask to be loaded directly with kmovw. Runs that
; enable DQ then expand it with vpmovm2d; the others use a zero-masking
; vpternlogd $255 to build the all-ones lanes.
define <16 x i32> @load_16i1(<16 x i1>* %a) {
; KNL-LABEL: load_16i1:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw (%rdi), %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: load_16i1:
; SKX: ## %bb.0:
; SKX-NEXT: kmovw (%rdi), %k0
; SKX-NEXT: vpmovm2d %k0, %zmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: load_16i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: load_16i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: load_16i1:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw (%eax), %k0
; X86-NEXT: vpmovm2d %k0, %zmm0
; X86-NEXT: retl
%b = load <16 x i1>, <16 x i1>* %a
%c = sext <16 x i1> %b to <16 x i32>
ret <16 x i32> %c
}
; Loads a <2 x i1> from memory and sign-extends it to <2 x i16>.
; The expected code reads one byte of mask and expands it into qword lanes
; (vpmovm2q, or a zero-masking vpternlogq $255 in the runs without DQ). The
; runs that enable VL operate on xmm directly; the others work on zmm, mark
; xmm0 as the live sub-register, and emit vzeroupper before returning.
define <2 x i16> @load_2i1(<2 x i1>* %a) {
; KNL-LABEL: load_2i1:
; KNL: ## %bb.0:
; KNL-NEXT: movzbl (%rdi), %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: load_2i1:
; SKX: ## %bb.0:
; SKX-NEXT: kmovb (%rdi), %k0
; SKX-NEXT: vpmovm2q %k0, %xmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: load_2i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movzbl (%rdi), %eax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: load_2i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: load_2i1:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb (%eax), %k0
; X86-NEXT: vpmovm2q %k0, %xmm0
; X86-NEXT: retl
%b = load <2 x i1>, <2 x i1>* %a
%c = sext <2 x i1> %b to <2 x i16>
ret <2 x i16> %c
}
; Loads a <4 x i1> from memory and sign-extends it to <4 x i16>.
; The expected code reads one byte of mask and expands it into dword lanes
; (vpmovm2d, or a zero-masking vpternlogd $255 in the runs without DQ). The
; runs that enable VL operate on xmm directly; the others work on zmm, mark
; xmm0 as the live sub-register, and emit vzeroupper before returning.
define <4 x i16> @load_4i1(<4 x i1>* %a) {
; KNL-LABEL: load_4i1:
; KNL: ## %bb.0:
; KNL-NEXT: movzbl (%rdi), %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: load_4i1:
; SKX: ## %bb.0:
; SKX-NEXT: kmovb (%rdi), %k0
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: load_4i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movzbl (%rdi), %eax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: load_4i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: load_4i1:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb (%eax), %k0
; X86-NEXT: vpmovm2d %k0, %xmm0
; X86-NEXT: retl
%b = load <4 x i1>, <4 x i1>* %a
%c = sext <4 x i1> %b to <4 x i16>
ret <4 x i16> %c
}
; Loads a <32 x i1> from memory and sign-extends it to <32 x i16>.
; Runs that enable BW expect a single kmovd load and a vpmovm2w expansion into
; one zmm. Runs without BW split the mask into two 16-bit loads (byte offsets
; 0 and 2), expand each to dword lanes, and truncate with vpmovdw to produce
; the two ymm halves of the result.
define <32 x i16> @load_32i1(<32 x i1>* %a) {
; KNL-LABEL: load_32i1:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw (%rdi), %k1
; KNL-NEXT: kmovw 2(%rdi), %k2
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vpmovdw %zmm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: load_32i1:
; SKX: ## %bb.0:
; SKX-NEXT: kmovd (%rdi), %k0
; SKX-NEXT: vpmovm2w %k0, %zmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: load_32i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: load_32i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: kmovw 2(%rdi), %k1
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1
; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1
; AVX512DQ-NEXT: retq
;
; X86-LABEL: load_32i1:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd (%eax), %k0
; X86-NEXT: vpmovm2w %k0, %zmm0
; X86-NEXT: retl
%b = load <32 x i1>, <32 x i1>* %a
%c = sext <32 x i1> %b to <32 x i16>
ret <32 x i16> %c
}
; Loads a <64 x i1> from memory and sign-extends it to <64 x i8>.
; Runs that enable BW expect a single kmovq load and a vpmovm2b expansion into
; one zmm. Runs without BW split the mask into four 16-bit loads (byte offsets
; 0, 2, 4 and 6), expand each to dword lanes, truncate with vpmovdb, and pair
; the xmm results with vinserti128 to form the two ymm halves of the result.
define <64 x i8> @load_64i1(<64 x i1>* %a) {
; KNL-LABEL: load_64i1:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw (%rdi), %k1
; KNL-NEXT: kmovw 2(%rdi), %k2
; KNL-NEXT: kmovw 4(%rdi), %k3
; KNL-NEXT: kmovw 6(%rdi), %k4
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k4} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: load_64i1:
; SKX: ## %bb.0:
; SKX-NEXT: kmovq (%rdi), %k0
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: load_64i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq (%rdi), %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: load_64i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: kmovw 2(%rdi), %k1
; AVX512DQ-NEXT: kmovw 4(%rdi), %k2
; AVX512DQ-NEXT: kmovw 6(%rdi), %k3
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vpmovm2d %k3, %zmm2
; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; X86-LABEL: load_64i1:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovq (%eax), %k0
; X86-NEXT: vpmovm2b %k0, %zmm0
; X86-NEXT: retl
%b = load <64 x i1>, <64 x i1>* %a
%c = sext <64 x i1> %b to <64 x i8>
ret <64 x i8> %c
}
; Store a <8 x i1> argument to memory.
; The SKX, AVX512DQ and i686 runs expect the mask register to be written
; directly with kmovb. The KNL and AVX512BW runs expect the mask to be moved to
; a GPR first and stored with a byte move (movb %al).
define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) {
; KNL-LABEL: store_8i1:
; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_8i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_8i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_8i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: store_8i1:
; X86: ## %bb.0:
; X86-NEXT: vpsllw $15, %xmm0, %xmm0
; X86-NEXT: vpmovw2m %xmm0, %k0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb %k0, (%eax)
; X86-NEXT: retl
store <8 x i1> %v, <8 x i1>* %a
ret void
}
; Truncate a <8 x i16> argument to <8 x i1> and store the result to memory.
; On the four 64-bit runs the expected instruction sequences are the same as
; those of store_8i1, which takes the <8 x i1> value directly as an argument.
define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) {
; KNL-LABEL: store_8i1_1:
; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_8i1_1:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_8i1_1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_8i1_1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: store_8i1_1:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpsllw $15, %xmm0, %xmm0
; X86-NEXT: vpmovw2m %xmm0, %k0
; X86-NEXT: kmovb %k0, (%eax)
; X86-NEXT: retl
%v1 = trunc <8 x i16> %v to <8 x i1>
store <8 x i1> %v1, <8 x i1>* %a
ret void
}
; Store a <16 x i1> argument to memory.
; Every run expects the final store to be a single kmovw of the mask register.
; The runs differ mainly in how the mask is produced from the vector argument:
; vpmovb2m where AVX512BW is enabled, a widen to 32-bit lanes otherwise.
define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) {
; KNL-LABEL: store_16i1:
; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_16i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k0
; SKX-NEXT: kmovw %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_16i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: kmovw %k0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_16i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: store_16i1:
; X86: ## %bb.0:
; X86-NEXT: vpsllw $7, %xmm0, %xmm0
; X86-NEXT: vpmovb2m %xmm0, %k0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %k0, (%eax)
; X86-NEXT: retl
store <16 x i1> %v, <16 x i1>* %a
ret void
}
; Store a <32 x i1> argument to memory.
; Runs that enable AVX512BW (SKX, AVX512BW and the i686 run) expect one kmovd
; store. The KNL and AVX512DQ runs expect the value to be split into two 16-bit
; masks that are stored with kmovw at offsets 0 and 2.
define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
; KNL-LABEL: store_32i1:
; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: kmovw %k1, 2(%rdi)
; KNL-NEXT: kmovw %k0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_32i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k0
; SKX-NEXT: kmovd %k0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_32i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_32i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: store_32i1:
; X86: ## %bb.0:
; X86-NEXT: vpsllw $7, %ymm0, %ymm0
; X86-NEXT: vpmovb2m %ymm0, %k0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd %k0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
store <32 x i1> %v, <32 x i1>* %a
ret void
}
; Truncate a <32 x i16> argument to <32 x i1> and store the result to memory.
; Runs that enable AVX512BW (SKX, AVX512BW and the i686 run) expect the mask to
; be formed with vpmovw2m on a zmm register and stored with one kmovd. The KNL
; and AVX512DQ runs expect each ymm half to be widened to 32-bit lanes and the
; two 16-bit masks to be stored with kmovw at offsets 0 and 2.
define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
; KNL-LABEL: store_32i1_1:
; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: vpmovsxwd %ymm1, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: kmovw %k1, 2(%rdi)
; KNL-NEXT: kmovw %k0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_32i1_1:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %zmm0, %zmm0
; SKX-NEXT: vpmovw2m %zmm0, %k0
; SKX-NEXT: kmovd %k0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_32i1_1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $15, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_32i1_1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: store_32i1_1:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpsllw $15, %zmm0, %zmm0
; X86-NEXT: vpmovw2m %zmm0, %k0
; X86-NEXT: kmovd %k0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%v1 = trunc <32 x i16> %v to <32 x i1>
store <32 x i1> %v1, <32 x i1>* %a
ret void
}
; Store a <64 x i1> argument to memory.
; Runs that enable AVX512BW (SKX, AVX512BW and the i686 run) expect the mask to
; be formed with vpmovb2m on a zmm register and stored with one kmovq. The KNL
; and AVX512DQ runs expect four 16-bit masks, built from xmm0 through xmm3, to
; be stored with kmovw at offsets 0, 2, 4 and 6.
define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
;
; KNL-LABEL: store_64i1:
; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: vpmovsxbd %xmm2, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
; KNL-NEXT: vpmovsxbd %xmm3, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k3
; KNL-NEXT: kmovw %k3, 6(%rdi)
; KNL-NEXT: kmovw %k2, 4(%rdi)
; KNL-NEXT: kmovw %k1, 2(%rdi)
; KNL-NEXT: kmovw %k0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_64i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: kmovq %k0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_64i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_64i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-NEXT: kmovw %k3, 6(%rdi)
; AVX512DQ-NEXT: kmovw %k2, 4(%rdi)
; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: store_64i1:
; X86: ## %bb.0:
; X86-NEXT: vpsllw $7, %zmm0, %zmm0
; X86-NEXT: vpmovb2m %zmm0, %k0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovq %k0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
store <64 x i1> %v, <64 x i1>* %a
ret void
}
; Compare a <16 x i32> argument against zero, keep the low 8 lanes of the
; result, bitcast them to i8, zero-extend to i32 and add the value to itself.
; Runs that enable AVX512DQ (SKX, AVX512DQ and the i686 run) expect kmovb with
; no separate zero-extension afterwards. The KNL and AVX512BW runs expect the
; mask to be moved with kmovw/kmovd followed by an explicit movzbl.
define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
; KNL-LABEL: test_bitcast_v8i1_zext:
; KNL: ## %bb.0:
; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movzbl %al, %eax
; KNL-NEXT: addl %eax, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_bitcast_v8i1_zext:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0
; SKX-NEXT: kmovb %k0, %eax
; SKX-NEXT: addl %eax, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_bitcast_v8i1_zext:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movzbl %al, %eax
; AVX512BW-NEXT: addl %eax, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_bitcast_v8i1_zext:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: kmovb %k0, %eax
; AVX512DQ-NEXT: addl %eax, %eax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_bitcast_v8i1_zext:
; X86: ## %bb.0:
; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0
; X86-NEXT: kmovb %k0, %eax
; X86-NEXT: addl %eax, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%v1 = icmp eq <16 x i32> %a, zeroinitializer
%mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%mask1 = bitcast <8 x i1> %mask to i8
%val = zext i8 %mask1 to i32
%val1 = add i32 %val, %val
ret i32 %val1
}
; Compare a <16 x i32> argument against zero, bitcast the <16 x i1> result to
; i16, zero-extend to i32 and add the value to itself.
; The four 64-bit runs share one set of assertions here. Every run expects
; kmovw into a 32-bit GPR with no separate zero-extension instruction.
define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
; CHECK-LABEL: test_bitcast_v16i1_zext:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: addl %eax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
;
; X86-LABEL: test_bitcast_v16i1_zext:
; X86: ## %bb.0:
; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: addl %eax, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%v1 = icmp eq <16 x i32> %a, zeroinitializer
%mask1 = bitcast <16 x i1> %v1 to i16
%val = zext i16 %mask1 to i32
%val1 = add i32 %val, %val
ret i32 %val1
}
; Add two <16 x i1> vectors whose operands are bitcast from i16 arguments.
; Every run expects the addition to be emitted as kxorw on mask registers. The
; runs differ only in how the operands get into mask registers (kmovw or kmovd
; from GPRs on 64-bit, kmovw loads from the stack on i686).
define i16 @test_v16i1_add(i16 %x, i16 %y) {
; KNL-LABEL: test_v16i1_add:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $ax killed $ax killed $eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_v16i1_add:
; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kxorw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $ax killed $ax killed $eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_v16i1_add:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: ## kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_v16i1_add:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kmovw %esi, %k1
; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: ## kill: def $ax killed $ax killed $eax
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_v16i1_add:
; X86: ## %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: kxorw %k1, %k0, %k0
; X86-NEXT: kmovd %k0, %eax
; X86-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
%m0 = bitcast i16 %x to <16 x i1>
%m1 = bitcast i16 %y to <16 x i1>
%m2 = add <16 x i1> %m0, %m1
%ret = bitcast <16 x i1> %m2 to i16
ret i16 %ret
}
; Subtract two <16 x i1> vectors whose operands are bitcast from i16 arguments.
; Every run expects the subtraction to be emitted as kxorw on mask registers,
; the same operation that is expected for the <16 x i1> add in test_v16i1_add.
define i16 @test_v16i1_sub(i16 %x, i16 %y) {
; KNL-LABEL: test_v16i1_sub:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $ax killed $ax killed $eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_v16i1_sub:
; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kxorw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $ax killed $ax killed $eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_v16i1_sub:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: ## kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_v16i1_sub:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kmovw %esi, %k1
; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: ## kill: def $ax killed $ax killed $eax
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_v16i1_sub:
; X86: ## %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: kxorw %k1, %k0, %k0
; X86-NEXT: kmovd %k0, %eax
; X86-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
%m0 = bitcast i16 %x to <16 x i1>
%m1 = bitcast i16 %y to <16 x i1>
%m2 = sub <16 x i1> %m0, %m1
%ret = bitcast <16 x i1> %m2 to i16
ret i16 %ret
}
; Multiply two <16 x i1> vectors whose operands are bitcast from i16 arguments.
; Every run expects the multiplication to be emitted as kandw on mask
; registers.
define i16 @test_v16i1_mul(i16 %x, i16 %y) {
; KNL-LABEL: test_v16i1_mul:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $ax killed $ax killed $eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_v16i1_mul:
; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kandw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $ax killed $ax killed $eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_v16i1_mul:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: ## kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_v16i1_mul:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kmovw %esi, %k1
; AVX512DQ-NEXT: kandw %k1, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: ## kill: def $ax killed $ax killed $eax
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_v16i1_mul:
; X86: ## %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: kandw %k1, %k0, %k0
; X86-NEXT: kmovd %k0, %eax
; X86-NEXT: ## kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
%m0 = bitcast i16 %x to <16 x i1>
%m1 = bitcast i16 %y to <16 x i1>
%m2 = mul <16 x i1> %m0, %m1
%ret = bitcast <16 x i1> %m2 to i16
ret i16 %ret
}
; Add two <8 x i1> vectors whose operands are bitcast from i8 arguments.
; Runs that enable AVX512DQ (SKX, AVX512DQ and the i686 run) expect the byte
; form kxorb. The KNL and AVX512BW runs expect the 16-bit form kxorw instead.
define i8 @test_v8i1_add(i8 %x, i8 %y) {
; KNL-LABEL: test_v8i1_add:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $al killed $al killed $eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_v8i1_add:
; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kxorb %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $al killed $al killed $eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_v8i1_add:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: ## kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_v8i1_add:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kmovw %esi, %k1
; AVX512DQ-NEXT: kxorb %k1, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: ## kill: def $al killed $al killed $eax
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_v8i1_add:
; X86: ## %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: kxorb %k1, %k0, %k0
; X86-NEXT: kmovd %k0, %eax
; X86-NEXT: ## kill: def $al killed $al killed $eax
; X86-NEXT: retl
%m0 = bitcast i8 %x to <8 x i1>
%m1 = bitcast i8 %y to <8 x i1>
%m2 = add <8 x i1> %m0, %m1
%ret = bitcast <8 x i1> %m2 to i8
ret i8 %ret
}
; Subtract two <8 x i1> vectors whose operands are bitcast from i8 arguments.
; Runs that enable AVX512DQ (SKX, AVX512DQ and the i686 run) expect the byte
; form kxorb. The KNL and AVX512BW runs expect the 16-bit form kxorw instead.
define i8 @test_v8i1_sub(i8 %x, i8 %y) {
; KNL-LABEL: test_v8i1_sub:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $al killed $al killed $eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_v8i1_sub:
; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kxorb %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $al killed $al killed $eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_v8i1_sub:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: ## kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_v8i1_sub:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kmovw %esi, %k1
; AVX512DQ-NEXT: kxorb %k1, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: ## kill: def $al killed $al killed $eax
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_v8i1_sub:
; X86: ## %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: kxorb %k1, %k0, %k0
; X86-NEXT: kmovd %k0, %eax
; X86-NEXT: ## kill: def $al killed $al killed $eax
; X86-NEXT: retl
%m0 = bitcast i8 %x to <8 x i1>
%m1 = bitcast i8 %y to <8 x i1>
%m2 = sub <8 x i1> %m0, %m1
%ret = bitcast <8 x i1> %m2 to i8
ret i8 %ret
}
; Multiply two <8 x i1> vectors whose operands are bitcast from i8 arguments.
; Runs that enable AVX512DQ (SKX, AVX512DQ and the i686 run) expect the byte
; form kandb. The KNL and AVX512BW runs expect the 16-bit form kandw instead.
define i8 @test_v8i1_mul(i8 %x, i8 %y) {
; KNL-LABEL: test_v8i1_mul:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $al killed $al killed $eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_v8i1_mul:
; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kandb %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $al killed $al killed $eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_v8i1_mul:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: ## kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_v8i1_mul:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kmovw %esi, %k1
; AVX512DQ-NEXT: kandb %k1, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: ## kill: def $al killed $al killed $eax
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_v8i1_mul:
; X86: ## %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: kandb %k1, %k0, %k0
; X86-NEXT: kmovd %k0, %eax
; X86-NEXT: ## kill: def $al killed $al killed $eax
; X86-NEXT: retl
%m0 = bitcast i8 %x to <8 x i1>
%m1 = bitcast i8 %y to <8 x i1>
%m2 = mul <8 x i1> %m0, %m1
%ret = bitcast <8 x i1> %m2 to i8
ret i8 %ret
}
; Make sure we don't emit a ktest for signed comparisons.
; The <16 x i1> mask (both inputs equal to zero, lane by lane) is bitcast to i16
; and compared with a signed greater-than against 0. Every run expects the mask
; to be moved into a GPR and tested with testw + jle; @foo is called only on the
; path where the comparison is false (label bb.1).
define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
; KNL-LABEL: ktest_signed:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rax
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0
; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testw %ax, %ax
; KNL-NEXT: jle LBB64_1
; KNL-NEXT: ## %bb.2: ## %bb.2
; KNL-NEXT: popq %rax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB64_1: ## %bb.1
; KNL-NEXT: vzeroupper
; KNL-NEXT: callq _foo
; KNL-NEXT: popq %rax
; KNL-NEXT: retq
;
; SKX-LABEL: ktest_signed:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rax
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0
; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testw %ax, %ax
; SKX-NEXT: jle LBB64_1
; SKX-NEXT: ## %bb.2: ## %bb.2
; SKX-NEXT: popq %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB64_1: ## %bb.1
; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _foo
; SKX-NEXT: popq %rax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: ktest_signed:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: pushq %rax
; AVX512BW-NEXT: .cfi_def_cfa_offset 16
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testw %ax, %ax
; AVX512BW-NEXT: jle LBB64_1
; AVX512BW-NEXT: ## %bb.2: ## %bb.2
; AVX512BW-NEXT: popq %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB64_1: ## %bb.1
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: callq _foo
; AVX512BW-NEXT: popq %rax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: ktest_signed:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: pushq %rax
; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: testw %ax, %ax
; AVX512DQ-NEXT: jle LBB64_1
; AVX512DQ-NEXT: ## %bb.2: ## %bb.2
; AVX512DQ-NEXT: popq %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB64_1: ## %bb.1
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: callq _foo
; AVX512DQ-NEXT: popq %rax
; AVX512DQ-NEXT: retq
;
; X86-LABEL: ktest_signed:
; X86: ## %bb.0:
; X86-NEXT: subl $12, %esp
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: vporq %zmm1, %zmm0, %zmm0
; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0
; X86-NEXT: kmovd %k0, %eax
; X86-NEXT: testw %ax, %ax
; X86-NEXT: jle LBB64_1
; X86-NEXT: ## %bb.2: ## %bb.2
; X86-NEXT: addl $12, %esp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
; X86-NEXT: LBB64_1: ## %bb.1
; X86-NEXT: vzeroupper
; X86-NEXT: calll _foo
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%a = icmp eq <16 x i32> %x, zeroinitializer
%b = icmp eq <16 x i32> %y, zeroinitializer
%c = and <16 x i1> %a, %b
%d = bitcast <16 x i1> %c to i16
%e = icmp sgt i16 %d, 0
br i1 %e, label %bb.2, label %bb.1
bb.1:
call void @foo()
br label %bb.2
bb.2:
ret void
}
; External function with no arguments and no result; it is called from the
; bb.1 blocks of ktest_signed and ktest_allones.
declare void @foo()
; Make sure we can use the C flag from kortest to check for all ones.
; The <16 x i1> mask (both inputs equal to zero, lane by lane) is bitcast to i16
; and compared for equality with -1. Every run expects kortestw of the mask with
; itself followed by jb, which skips the call to @foo when the mask is all ones.
define void @ktest_allones(<16 x i32> %x, <16 x i32> %y) {
; CHECK-LABEL: ktest_allones:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k0
; CHECK-NEXT: kortestw %k0, %k0
; CHECK-NEXT: jb LBB65_2
; CHECK-NEXT: ## %bb.1: ## %bb.1
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _foo
; CHECK-NEXT: LBB65_2: ## %bb.2
; CHECK-NEXT: popq %rax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
;
; X86-LABEL: ktest_allones:
; X86: ## %bb.0:
; X86-NEXT: subl $12, %esp
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: vporq %zmm1, %zmm0, %zmm0
; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0
; X86-NEXT: kortestw %k0, %k0
; X86-NEXT: jb LBB65_2
; X86-NEXT: ## %bb.1: ## %bb.1
; X86-NEXT: vzeroupper
; X86-NEXT: calll _foo
; X86-NEXT: LBB65_2: ## %bb.2
; X86-NEXT: addl $12, %esp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
%a = icmp eq <16 x i32> %x, zeroinitializer
%b = icmp eq <16 x i32> %y, zeroinitializer
%c = and <16 x i1> %a, %b
%d = bitcast <16 x i1> %c to i16
%e = icmp eq i16 %d, -1
br i1 %e, label %bb.2, label %bb.1
bb.1:
call void @foo()
br label %bb.2
bb.2:
ret void
}
; This is derived from an intrinsic test where v4i1 mask was created by _mm_cmp_epi32_mask, then it was passed to _mm512_mask_blend_epi32 which uses a v16i1 mask.
; The widening happens in the scalar domain between the intrinsics. The middle end optimized it to this.
; In the IR below, the <4 x i1> compare result is widened to <8 x i1> and then
; to <16 x i1> by two shufflevectors that fill the added lanes from constant
; vectors, and the result selects between %f and %e as <16 x i32>.
; The SKX and i686 runs (both enable AVX512VL) expect the compare to be done on
; xmm registers straight into %k1. The other runs expect the compare to be done
; on zmm registers, followed by a kshiftlw/kshiftrw $12 pair on the mask.
define <8 x i64> @mask_widening(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d, <8 x i64> %e, <8 x i64> %f) {
; KNL-LABEL: mask_widening:
; KNL: ## %bb.0: ## %entry
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; KNL-NEXT: kshiftlw $12, %k0, %k0
; KNL-NEXT: kshiftrw $12, %k0, %k1
; KNL-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: mask_widening:
; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
; SKX-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1}
; SKX-NEXT: retq
;
; AVX512BW-LABEL: mask_widening:
; AVX512BW: ## %bb.0: ## %entry
; AVX512BW-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kshiftlw $12, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k0, %k1
; AVX512BW-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: mask_widening:
; AVX512DQ: ## %bb.0: ## %entry
; AVX512DQ-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512DQ-NEXT: kshiftlw $12, %k0, %k0
; AVX512DQ-NEXT: kshiftrw $12, %k0, %k1
; AVX512DQ-NEXT: vpblendmd %zmm5, %zmm4, %zmm0 {%k1}
; AVX512DQ-NEXT: retq
;
; X86-LABEL: mask_widening:
; X86: ## %bb.0: ## %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
; X86-NEXT: vmovdqa64 8(%ebp), %zmm0
; X86-NEXT: vmovdqa32 72(%ebp), %zmm0 {%k1}
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
entry:
%0 = bitcast <2 x i64> %a to <4 x i32>
%1 = bitcast <2 x i64> %b to <4 x i32>
%2 = icmp eq <4 x i32> %0, %1
%3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <8 x i64> %f to <16 x i32>
%5 = bitcast <8 x i64> %e to <16 x i32>
%6 = shufflevector <8 x i1> %3, <8 x i1> <i1 false, i1 undef, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%7 = select <16 x i1> %6, <16 x i32> %4, <16 x i32> %5
%8 = bitcast <16 x i32> %7 to <8 x i64>
ret <8 x i64> %8
}
; Store a constant <64 x i1> vector to memory.
; The expected code writes the mask as an integer immediate without using mask
; registers: one 64-bit store (movabsq + movq) on the 64-bit runs, and two
; 32-bit immediate stores at offsets 4 and 0 on the i686 run.
define void @store_v64i1_constant(<64 x i1>* %R) {
; CHECK-LABEL: store_v64i1_constant:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movabsq $-2305843576149381123, %rax ## imm = 0xDFFFFF7BFFFFEFFD
; CHECK-NEXT: movq %rax, (%rdi)
; CHECK-NEXT: retq
;
; X86-LABEL: store_v64i1_constant:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl $-536871045, 4(%eax) ## imm = 0xDFFFFF7B
; X86-NEXT: movl $-4099, (%eax) ## imm = 0xEFFD
; X86-NEXT: retl
entry:
store <64 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <64 x i1>* %R
ret void
}
; Store the constant <2 x i1> vector <1, 0> to memory.
; Every run expects a single byte store of the immediate 1 (only bit 0 set,
; matching element 0 of the vector).
define void @store_v2i1_constant(<2 x i1>* %R) {
; CHECK-LABEL: store_v2i1_constant:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movb $1, (%rdi)
; CHECK-NEXT: retq
;
; X86-LABEL: store_v2i1_constant:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb $1, (%eax)
; X86-NEXT: retl
entry:
store <2 x i1> <i1 1, i1 0>, <2 x i1>* %R
ret void
}
; Store the constant <4 x i1> vector <1, 0, 1, 0> to memory.
; Every run expects a single byte store of the immediate 5 (bits 0 and 2 set,
; matching elements 0 and 2 of the vector).
define void @store_v4i1_constant(<4 x i1>* %R) {
; CHECK-LABEL: store_v4i1_constant:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movb $5, (%rdi)
; CHECK-NEXT: retq
;
; X86-LABEL: store_v4i1_constant:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb $5, (%eax)
; X86-NEXT: retl
entry:
store <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i1>* %R
ret void
}