[X86] Legalize v32i1 without BWI via splitting to v16i1 rather than the default of promoting to v32i8.

Summary:
For the most part its better to keep v32i1 as a mask type of a narrower width than trying to promote it to a ymm register.

I had to add some overrides to the methods that get the types for the calling convention so that we still use v32i8 for argument/return purposes.

There are still some regressions in here. I definitely saw some around shuffles. I think we probably should move vXi1 shuffle from lowering to a DAG combine where I think the extend and truncate we have to emit would be better combined.

I think we also need a DAG combine to remove trunc from (extract_vector_elt (trunc))

Overall this removes something like 13000 CHECK lines from lit tests.

Reviewers: zvi, RKSimon, delena, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D42031

llvm-svn: 323201
This commit is contained in:
Craig Topper 2018-01-23 14:25:39 +00:00
parent c2df6409c7
commit 76adcc86cd
16 changed files with 3597 additions and 16932 deletions

View File

@ -1720,6 +1720,9 @@ SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return TypeSplitVector;
if (ExperimentalVectorWideningLegalization &&
VT.getVectorNumElements() != 1 &&
VT.getVectorElementType().getSimpleVT() != MVT::i1)
@ -1728,6 +1731,26 @@ X86TargetLowering::getPreferredVectorAction(EVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
MVT X86TargetLowering::getRegisterTypeForCallingConv(MVT VT) const {
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return MVT::v32i8;
return TargetLowering::getRegisterTypeForCallingConv(VT);
}
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
EVT VT) const {
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return MVT::v32i8;
return TargetLowering::getRegisterTypeForCallingConv(Context, VT);
}
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
EVT VT) const {
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return 1;
return TargetLowering::getNumRegistersForCallingConv(Context, VT);
}
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {

View File

@ -1084,6 +1084,14 @@ namespace llvm {
/// \brief Customize the preferred legalization strategy for certain types.
LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
MVT getRegisterTypeForCallingConv(MVT VT) const override;
MVT getRegisterTypeForCallingConv(LLVMContext &Context,
EVT VT) const override;
unsigned getNumRegistersForCallingConv(LLVMContext &Context,
EVT VT) const override;
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool supportSwiftError() const override;

View File

@ -60,22 +60,16 @@ define <16 x i8> @avg_v16i8_maskz(<16 x i8> %a, <16 x i8> %b, i16 %mask) nounwin
define <32 x i8> @avg_v32i8_mask(<32 x i8> %a, <32 x i8> %b, <32 x i8> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_mask:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: kmovw %edi, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i8_mask:
@ -98,22 +92,16 @@ define <32 x i8> @avg_v32i8_mask(<32 x i8> %a, <32 x i8> %b, <32 x i8> %src, i32
define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_maskz:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: kmovw %edi, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i8_maskz:
@ -135,33 +123,30 @@ define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwin
define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_mask:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $64, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: shrq $32, %rdi
; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: movq %rdi, %rcx
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: movl %edi, %edx
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: shrq $32, %rax
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: kmovw %eax, %k3
; AVX512F-NEXT: kmovw %edx, %k4
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k4} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v64i8_mask:
@ -184,33 +169,30 @@ define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64
define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_maskz:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $64, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: shrq $32, %rdi
; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: movq %rdi, %rcx
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: movl %edi, %edx
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: shrq $32, %rax
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: kmovw %eax, %k3
; AVX512F-NEXT: kmovw %edx, %k4
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k4} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v64i8_maskz:
@ -340,29 +322,17 @@ define <16 x i16> @avg_v16i16_maskz(<16 x i16> %a, <16 x i16> %b, i16 %mask) nou
define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_mask:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm6, %xmm6
; AVX512F-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm7, %xmm7
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero,xmm7[8],zero,xmm7[9],zero,xmm7[10],zero,xmm7[11],zero,xmm7[12],zero,xmm7[13],zero,xmm7[14],zero,xmm7[15],zero
; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT: kmovw %edi, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i16_mask:
@ -385,29 +355,17 @@ define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src
define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_maskz:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm5, %xmm5
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT: kmovw %edi, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i16_maskz:

View File

@ -782,39 +782,20 @@ define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) {
define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> %y) {
; KNL-LABEL: test_insertelement_v32i1:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $32, %rsp
; KNL-NEXT: xorl %eax, %eax
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k1
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rsp)
; KNL-NEXT: movl (%rsp), %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0
; KNL-NEXT: kshiftrw $4, %k0, %k1
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: kxorw %k2, %k1, %k1
; KNL-NEXT: kshiftlw $15, %k1, %k1
; KNL-NEXT: kshiftrw $11, %k1, %k1
; KNL-NEXT: kxorw %k0, %k1, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: orl %ecx, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
@ -1014,7 +995,10 @@ define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) {
; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1
; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpextrb $2, %xmm0, %eax
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $2, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
@ -1041,7 +1025,10 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
; KNL-NEXT: vpxor %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpextrb $15, %xmm0, %eax
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
@ -1074,7 +1061,10 @@ define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
; KNL-NEXT: vpxor %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpextrb $15, %xmm0, %eax
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
@ -1717,26 +1707,25 @@ define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $96, %rsp
; KNL-NEXT: subq $64, %rsp
; KNL-NEXT: ## kill: def %esi killed %esi def %rsi
; KNL-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; KNL-NEXT: vpxor %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; KNL-NEXT: andl $31, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; KNL-NEXT: setne 32(%rsp,%rsi)
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: setne (%rsp,%rsi)
; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rsp)
; KNL-NEXT: movl (%rsp), %eax
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %ecx, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
@ -1782,7 +1771,7 @@ define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $192, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def %esi killed %esi def %rsi
; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0
@ -1792,30 +1781,32 @@ define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
; KNL-NEXT: andl $63, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; KNL-NEXT: setne 64(%rsp,%rsi)
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: setne (%rsp,%rsi)
; KNL-NEXT: vmovdqa (%rsp), %ymm0
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpmovsxbd %xmm0, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rsp)
; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; KNL-NEXT: movl (%rsp), %eax
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: movq %rbp, %rsp
@ -1863,7 +1854,7 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-128, %rsp
; KNL-NEXT: subq $384, %rsp ## imm = 0x180
; KNL-NEXT: subq $256, %rsp ## imm = 0x100
; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm0, %xmm0
@ -1977,56 +1968,60 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
; KNL-NEXT: setne 128(%rsp,%rax)
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2
; KNL-NEXT: vmovdqa %ymm2, (%rsp)
; KNL-NEXT: setne (%rsp,%rax)
; KNL-NEXT: vmovdqa (%rsp), %ymm2
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm4
; KNL-NEXT: vpmovsxbd %xmm4, %zmm4
; KNL-NEXT: vpmovsxbd %xmm2, %zmm4
; KNL-NEXT: vpslld $31, %zmm4, %zmm4
; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: vpmovsxbd %xmm3, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: vextracti128 $1, %ymm3, %xmm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: vpmovsxbd %xmm1, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vextracti128 $1, %ymm2, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: kmovw %k0, %esi
; KNL-NEXT: shll $16, %esi
; KNL-NEXT: orl %ecx, %esi
; KNL-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpmovsxbd %xmm2, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vextracti128 $1, %ymm3, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpmovsxbd %xmm3, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, (%rsp)
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: movl (%rsp), %ecx
; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edx
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: shll $16, %edx
; KNL-NEXT: orl %ecx, %edx
; KNL-NEXT: shlq $32, %rdx
; KNL-NEXT: orq %rcx, %rdx
; KNL-NEXT: orq %rsi, %rdx
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
@ -2178,7 +2173,7 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-128, %rsp
; KNL-NEXT: subq $384, %rsp ## imm = 0x180
; KNL-NEXT: subq $256, %rsp ## imm = 0x100
; KNL-NEXT: ## kill: def %esi killed %esi def %rsi
; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; KNL-NEXT: vpxor %ymm4, %ymm0, %ymm0
@ -2194,56 +2189,60 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; KNL-NEXT: setne 128(%rsp,%rsi)
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: setne (%rsp,%rsi)
; KNL-NEXT: vmovdqa (%rsp), %ymm2
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm4
; KNL-NEXT: vpmovsxbd %xmm4, %zmm4
; KNL-NEXT: vpmovsxbd %xmm2, %zmm4
; KNL-NEXT: vpslld $31, %zmm4, %zmm4
; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: vpmovsxbd %xmm3, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: vextracti128 $1, %ymm3, %xmm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: vpmovsxbd %xmm1, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vextracti128 $1, %ymm2, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: kmovw %k0, %esi
; KNL-NEXT: shll $16, %esi
; KNL-NEXT: orl %ecx, %esi
; KNL-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpmovsxbd %xmm2, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vextracti128 $1, %ymm3, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpmovsxbd %xmm3, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, (%rsp)
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: movl (%rsp), %ecx
; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edx
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: shll $16, %edx
; KNL-NEXT: orl %ecx, %edx
; KNL-NEXT: shlq $32, %rdx
; KNL-NEXT: orq %rcx, %rdx
; KNL-NEXT: orq %rsi, %rdx
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper

View File

@ -975,38 +975,32 @@ define <64 x i8> @test16(i64 %x) {
;
; KNL-LABEL: test16:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
; KNL-NEXT: movl %edi, (%rsp)
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: movl %edi, %ecx
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: shrq $32, %rdi
; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; KNL-NEXT: kmovw (%rsp), %k1
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; KNL-NEXT: shrq $48, %rax
; KNL-NEXT: shrl $16, %ecx
; KNL-NEXT: kmovw %ecx, %k1
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: kmovw %edi, %k3
; KNL-NEXT: movb $1, %al
; KNL-NEXT: kmovw %eax, %k4
; KNL-NEXT: kshiftrw $5, %k0, %k5
; KNL-NEXT: kxorw %k4, %k5, %k4
; KNL-NEXT: kshiftlw $15, %k4, %k4
; KNL-NEXT: kshiftrw $10, %k4, %k4
; KNL-NEXT: kxorw %k0, %k4, %k4
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: movl $1, %eax
; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test16:
@ -1037,38 +1031,32 @@ define <64 x i8> @test16(i64 %x) {
;
; AVX512DQ-LABEL: test16:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: pushq %rbp
; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
; AVX512DQ-NEXT: .cfi_offset %rbp, -16
; AVX512DQ-NEXT: movq %rsp, %rbp
; AVX512DQ-NEXT: .cfi_def_cfa_register %rbp
; AVX512DQ-NEXT: andq $-32, %rsp
; AVX512DQ-NEXT: subq $64, %rsp
; AVX512DQ-NEXT: movl %edi, (%rsp)
; AVX512DQ-NEXT: movq %rdi, %rax
; AVX512DQ-NEXT: movl %edi, %ecx
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: shrq $32, %rdi
; AVX512DQ-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT: kmovw (%rsp), %k0
; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: shrq $48, %rax
; AVX512DQ-NEXT: shrl $16, %ecx
; AVX512DQ-NEXT: kmovw %ecx, %k1
; AVX512DQ-NEXT: kmovw %eax, %k2
; AVX512DQ-NEXT: kmovw %edi, %k3
; AVX512DQ-NEXT: movb $1, %al
; AVX512DQ-NEXT: kmovw %eax, %k4
; AVX512DQ-NEXT: kshiftrw $5, %k0, %k5
; AVX512DQ-NEXT: kxorw %k4, %k5, %k4
; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4
; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4
; AVX512DQ-NEXT: kxorw %k0, %k4, %k0
; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: movl $1, %eax
; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k0
; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: movq %rbp, %rsp
; AVX512DQ-NEXT: popq %rbp
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
%a = bitcast i64 %x to <64 x i1>
%b = insertelement <64 x i1>%a, i1 true, i32 5
@ -1080,40 +1068,33 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
;
; KNL-LABEL: test17:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
; KNL-NEXT: movl %edi, (%rsp)
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: movl %edi, %ecx
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: shrq $32, %rdi
; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; KNL-NEXT: kmovw (%rsp), %k1
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: xorl %eax, %eax
; KNL-NEXT: shrq $48, %rax
; KNL-NEXT: shrl $16, %ecx
; KNL-NEXT: kmovw %ecx, %k1
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: kmovw %edi, %k3
; KNL-NEXT: cmpl %edx, %esi
; KNL-NEXT: setg %al
; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: kmovw %eax, %k4
; KNL-NEXT: kshiftrw $5, %k0, %k5
; KNL-NEXT: kxorw %k4, %k5, %k4
; KNL-NEXT: kshiftlw $15, %k4, %k4
; KNL-NEXT: kshiftrw $10, %k4, %k4
; KNL-NEXT: kxorw %k0, %k4, %k4
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test17:
@ -1146,40 +1127,33 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
;
; AVX512DQ-LABEL: test17:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: pushq %rbp
; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
; AVX512DQ-NEXT: .cfi_offset %rbp, -16
; AVX512DQ-NEXT: movq %rsp, %rbp
; AVX512DQ-NEXT: .cfi_def_cfa_register %rbp
; AVX512DQ-NEXT: andq $-32, %rsp
; AVX512DQ-NEXT: subq $64, %rsp
; AVX512DQ-NEXT: movl %edi, (%rsp)
; AVX512DQ-NEXT: movq %rdi, %rax
; AVX512DQ-NEXT: movl %edi, %ecx
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: shrq $32, %rdi
; AVX512DQ-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT: kmovw (%rsp), %k0
; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: xorl %eax, %eax
; AVX512DQ-NEXT: shrq $48, %rax
; AVX512DQ-NEXT: shrl $16, %ecx
; AVX512DQ-NEXT: kmovw %ecx, %k1
; AVX512DQ-NEXT: kmovw %eax, %k2
; AVX512DQ-NEXT: kmovw %edi, %k3
; AVX512DQ-NEXT: cmpl %edx, %esi
; AVX512DQ-NEXT: setg %al
; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k0
; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: kmovw %eax, %k4
; AVX512DQ-NEXT: kshiftrw $5, %k0, %k5
; AVX512DQ-NEXT: kxorw %k4, %k5, %k4
; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4
; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4
; AVX512DQ-NEXT: kxorw %k0, %k4, %k0
; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: movq %rbp, %rsp
; AVX512DQ-NEXT: popq %rbp
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
%a = bitcast i64 %x to <64 x i1>
%b = icmp sgt i32 %y, %z
@ -1815,51 +1789,29 @@ define void @ktest_2(<32 x float> %in, float * %base) {
;
; KNL-LABEL: ktest_2:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $32, %rsp
; KNL-NEXT: vmovups (%rdi), %zmm2
; KNL-NEXT: vmovups 64(%rdi), %zmm3
; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k1
; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k2
; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm3, %xmm3
; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z}
; KNL-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z}
; KNL-NEXT: vcmpltps %zmm5, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm5, %xmm5
; KNL-NEXT: vpor %xmm5, %xmm2, %xmm2
; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k1
; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm4, %xmm4
; KNL-NEXT: vpor %xmm4, %xmm3, %xmm3
; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
; KNL-NEXT: vpslld $31, %zmm3, %zmm3
; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, (%rsp)
; KNL-NEXT: cmpl $0, (%rsp)
; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k1
; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k2
; KNL-NEXT: vmovups 4(%rdi), %zmm2 {%k2} {z}
; KNL-NEXT: vmovups 68(%rdi), %zmm3 {%k1} {z}
; KNL-NEXT: vcmpltps %zmm3, %zmm1, %k0
; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k3
; KNL-NEXT: korw %k3, %k2, %k2
; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: korw %k0, %k1, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: je LBB42_2
; KNL-NEXT: ## %bb.1: ## %L1
; KNL-NEXT: vmovaps %zmm0, (%rdi)
; KNL-NEXT: vmovaps %zmm1, 64(%rdi)
; KNL-NEXT: jmp LBB42_3
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB42_2: ## %L2
; KNL-NEXT: vmovaps %zmm0, 4(%rdi)
; KNL-NEXT: vmovaps %zmm1, 68(%rdi)
; KNL-NEXT: LBB42_3: ## %End
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
@ -1917,51 +1869,29 @@ define void @ktest_2(<32 x float> %in, float * %base) {
;
; AVX512DQ-LABEL: ktest_2:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: pushq %rbp
; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
; AVX512DQ-NEXT: .cfi_offset %rbp, -16
; AVX512DQ-NEXT: movq %rsp, %rbp
; AVX512DQ-NEXT: .cfi_def_cfa_register %rbp
; AVX512DQ-NEXT: andq $-32, %rsp
; AVX512DQ-NEXT: subq $32, %rsp
; AVX512DQ-NEXT: vmovups (%rdi), %zmm2
; AVX512DQ-NEXT: vmovups 64(%rdi), %zmm3
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k2
; AVX512DQ-NEXT: vpmovm2d %k2, %zmm3
; AVX512DQ-NEXT: vpmovdb %zmm3, %xmm3
; AVX512DQ-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z}
; AVX512DQ-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vcmpltps %zmm5, %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5
; AVX512DQ-NEXT: vpmovdb %zmm5, %xmm5
; AVX512DQ-NEXT: vpor %xmm5, %xmm2, %xmm2
; AVX512DQ-NEXT: vcmpltps %zmm4, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4
; AVX512DQ-NEXT: vpmovdb %zmm4, %xmm4
; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3
; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0
; AVX512DQ-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512DQ-NEXT: kmovw %k0, (%rsp)
; AVX512DQ-NEXT: cmpl $0, (%rsp)
; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k1
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k2
; AVX512DQ-NEXT: vmovups 4(%rdi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT: vmovups 68(%rdi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vcmpltps %zmm3, %zmm1, %k0
; AVX512DQ-NEXT: vcmpltps %zmm2, %zmm0, %k3
; AVX512DQ-NEXT: korw %k3, %k2, %k2
; AVX512DQ-NEXT: kmovw %k2, %eax
; AVX512DQ-NEXT: korw %k0, %k1, %k0
; AVX512DQ-NEXT: kmovw %k0, %ecx
; AVX512DQ-NEXT: shll $16, %ecx
; AVX512DQ-NEXT: orl %eax, %ecx
; AVX512DQ-NEXT: je LBB42_2
; AVX512DQ-NEXT: ## %bb.1: ## %L1
; AVX512DQ-NEXT: vmovaps %zmm0, (%rdi)
; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rdi)
; AVX512DQ-NEXT: jmp LBB42_3
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB42_2: ## %L2
; AVX512DQ-NEXT: vmovaps %zmm0, 4(%rdi)
; AVX512DQ-NEXT: vmovaps %zmm1, 68(%rdi)
; AVX512DQ-NEXT: LBB42_3: ## %End
; AVX512DQ-NEXT: movq %rbp, %rsp
; AVX512DQ-NEXT: popq %rbp
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%addr1 = getelementptr float, float * %base, i64 0
@ -2334,14 +2264,14 @@ define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) {
define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
; KNL-LABEL: store_32i1:
; KNL: ## %bb.0:
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, 2(%rdi)
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: kmovw %k1, 2(%rdi)
; KNL-NEXT: kmovw %k0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
@ -2364,14 +2294,14 @@ define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
;
; AVX512DQ-LABEL: store_32i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512DQ-NEXT: kmovw %k0, 2(%rdi)
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@ -2383,12 +2313,12 @@ define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
; KNL-LABEL: store_32i1_1:
; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, 2(%rdi)
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: vpmovsxwd %ymm1, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: kmovw %k1, 2(%rdi)
; KNL-NEXT: kmovw %k0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
@ -2412,12 +2342,12 @@ define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
; AVX512DQ-LABEL: store_32i1_1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512DQ-NEXT: kmovw %k0, 2(%rdi)
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@ -2431,21 +2361,21 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
;
; KNL-LABEL: store_64i1:
; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
; KNL-NEXT: vpslld $31, %zmm3, %zmm3
; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0
; KNL-NEXT: kmovw %k0, 6(%rdi)
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, 4(%rdi)
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, 2(%rdi)
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: vpmovsxbd %xmm2, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
; KNL-NEXT: vpmovsxbd %xmm3, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k3
; KNL-NEXT: kmovw %k3, 6(%rdi)
; KNL-NEXT: kmovw %k2, 4(%rdi)
; KNL-NEXT: kmovw %k1, 2(%rdi)
; KNL-NEXT: kmovw %k0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
@ -2468,21 +2398,21 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
;
; AVX512DQ-LABEL: store_64i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3
; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0
; AVX512DQ-NEXT: kmovw %k0, 6(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512DQ-NEXT: kmovw %k0, 4(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512DQ-NEXT: kmovw %k0, 2(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k3
; AVX512DQ-NEXT: kmovw %k3, 6(%rdi)
; AVX512DQ-NEXT: kmovw %k2, 4(%rdi)
; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq

View File

@ -240,18 +240,18 @@ declare <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i3
define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
; AVX512F-LABEL: test_load_32f64:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm5
; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5
; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512F-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k1}
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k2}
; AVX512F-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k2}
; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: kshiftrw $8, %k2, %k2
; AVX512F-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k2}
; AVX512F-NEXT: kshiftrw $8, %k1, %k1
; AVX512F-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
; AVX512F-NEXT: kshiftrw $8, %k2, %k1
; AVX512F-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
; AVX512F-NEXT: vmovapd %zmm5, %zmm2
; AVX512F-NEXT: retq

View File

@ -206,21 +206,12 @@ define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; KNL-LABEL: test12_v32i32:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $32, %rsp
; KNL-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rsp)
; KNL-NEXT: movl (%rsp), %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vpcmpeqd %zmm3, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %ecx, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
@ -249,32 +240,28 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; KNL-LABEL: test12_v64i16:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
; KNL-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm1
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rsp)
; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: movl (%rsp), %ecx
; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;

File diff suppressed because it is too large Load Diff

View File

@ -411,28 +411,22 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
;
; AVX512F-LABEL: v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: .cfi_def_cfa_offset 16
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm1
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, (%rsp)
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k2}
; AVX512F-NEXT: kmovw %k0, %ecx
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: shll $16, %eax
; AVX512F-NEXT: orl %ecx, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;

View File

@ -280,38 +280,22 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
;
; AVX512F-LABEL: v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: .cfi_def_cfa_offset 16
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm4, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm5, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, (%rsp)
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm5, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm4, %ymm1
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k2}
; AVX512F-NEXT: kmovw %k0, %ecx
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: shll $16, %eax
; AVX512F-NEXT: orl %ecx, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@ -1332,43 +1316,40 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
;
; AVX512F-LABEL: v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: .cfi_def_cfa_offset 16
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $64, %rsp
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm5, %ymm2
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k4
; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm5, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpcmpgtb %ymm6, %ymm4, %ymm2
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, (%rsp)
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl (%rsp), %ecx
; AVX512F-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 {%k4}
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 {%k3}
; AVX512F-NEXT: kmovw %k0, %ecx
; AVX512F-NEXT: shll $16, %ecx
; AVX512F-NEXT: orl %eax, %ecx
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k2}
; AVX512F-NEXT: kmovw %k0, %edx
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: shll $16, %eax
; AVX512F-NEXT: orl %edx, %eax
; AVX512F-NEXT: shlq $32, %rax
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;

View File

@ -505,24 +505,15 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
;
; AVX512F-LABEL: ext_i32_32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: .cfi_def_cfa_offset 16
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
; AVX512F-NEXT: kmovw %edi, %k2
; AVX512F-NEXT: movl {{.*}}(%rip), %eax
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i32_32i8:
@ -792,27 +783,14 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
;
; AVX512F-LABEL: ext_i32_32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: .cfi_def_cfa_offset 16
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
; AVX512F-NEXT: kmovw %edi, %k2
; AVX512F-NEXT: movl {{.*}}(%rip), %eax
; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k2} {z}
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i32_32i16:
@ -950,33 +928,27 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
;
; AVX512F-LABEL: ext_i64_64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: .cfi_def_cfa_offset 16
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $64, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: shrq $32, %rdi
; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: movq %rdi, %rcx
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: movl %edi, %edx
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: shrq $32, %rax
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: kmovw %eax, %k3
; AVX512F-NEXT: kmovw %edx, %k4
; AVX512F-NEXT: movl {{.*}}(%rip), %eax
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k2} {z}
; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k4} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k3} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpbroadcastd %eax, %zmm2 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i64_64i8:

View File

@ -183,24 +183,16 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) {
;
; AVX512F-LABEL: v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: .cfi_def_cfa_offset 16
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: kmovw %k0, %ecx
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, (%rsp)
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: shll $16, %eax
; AVX512F-NEXT: orl %ecx, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;

View File

@ -51,24 +51,16 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) {
;
; AVX512F-LABEL: v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: .cfi_def_cfa_offset 16
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, (%rsp)
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: kmovw %k0, %ecx
; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: shll $16, %eax
; AVX512F-NEXT: orl %ecx, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@ -1006,35 +998,28 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
;
; AVX512F-LABEL: v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: .cfi_def_cfa_offset 16
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $64, %rsp
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm2
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, (%rsp)
; AVX512F-NEXT: kmovw %k0, %ecx
; AVX512F-NEXT: shll $16, %ecx
; AVX512F-NEXT: orl %eax, %ecx
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: kmovw %k0, %edx
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl (%rsp), %ecx
; AVX512F-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: shll $16, %eax
; AVX512F-NEXT: orl %edx, %eax
; AVX512F-NEXT: shlq $32, %rax
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;

View File

@ -132,22 +132,70 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX256VL: # %bb.0:
; AVX256VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX256VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX256VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX256VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX256VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX256VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
; AVX256VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX256VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256VL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k1
; AVX256VL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX256VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k2
; AVX256VL-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX256VL-NEXT: vptestmd %ymm0, %ymm0, %k3
; AVX256VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k3} {z}
; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm2 {%k2} {z}
; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2
; AVX256VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX256VL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX256VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
; AVX256VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,12,13,u,u,8,9,6,7,14,15,14,15,0,1,22,23,28,29,18,19,26,27,22,23,u,u,30,31,16,17]
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} {z}
; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2
; AVX256VL-NEXT: kshiftrw $8, %k1, %k1
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm3 {%k1} {z}
; AVX256VL-NEXT: vpmovdw %ymm3, %xmm3
; AVX256VL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX256VL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,1]
; AVX256VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
; AVX256VL-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm2
; AVX256VL-NEXT: vpslld $31, %ymm2, %ymm2
; AVX256VL-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX256VL-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1
; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0
; AVX256VL-NEXT: kunpckbw %k1, %k0, %k0
; AVX256VL-NEXT: kshiftrw $8, %k0, %k2
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
; AVX256VL-NEXT: vpacksswb %xmm0, %xmm1, %xmm1
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX256VL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX256VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX256VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX256VL-NEXT: retq
;
; AVX512NOBW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512NOBW: # %bb.0:
; AVX512NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512NOBW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX512NOBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX512NOBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512NOBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX512NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
; AVX512NOBW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512NOBW-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512NOBW-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512NOBW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512NOBW-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512NOBW-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512NOBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512NOBW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; AVX512NOBW-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOBW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512NOBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512NOBW-NEXT: retq
;
; AVX256VLBW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:

View File

@ -2257,23 +2257,25 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
;
; AVX512F-LABEL: test_cmp_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm3
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm2
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vmovdqa %xmm4, %xmm2
; AVX512F-NEXT: vmovdqa %xmm4, %xmm1
; AVX512F-NEXT: # kill: def %xmm2 killed %xmm2 killed %ymm2
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v64i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm3
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm2
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm2
; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm1
; AVX512DQ-NEXT: # kill: def %xmm2 killed %xmm2 killed %ymm2
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@ -2711,32 +2713,24 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
;
; AVX512F-LABEL: test_cmp_v32f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vcmpltps %zmm0, %zmm2, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vcmpltps %zmm1, %zmm3, %k1
; AVX512F-NEXT: vcmpltps %zmm0, %zmm2, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v32f32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k0
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v32f32:
@ -3262,32 +3256,24 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
;
; AVX512F-LABEL: test_cmp_v32i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpcmpgtd %zmm3, %zmm1, %k1
; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm0, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v32i32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpcmpgtd %zmm3, %zmm1, %k0
; AVX512DQ-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v32i32:
@ -6386,36 +6372,36 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
; AVX512F-LABEL: test_cmp_v128i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3
; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm4
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm4
; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0
; AVX512F-NEXT: kmovw %k0, 14(%rdi)
; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
; AVX512F-NEXT: kmovw %k0, 12(%rdi)
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
; AVX512F-NEXT: kmovw %k0, 10(%rdi)
; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, 8(%rdi)
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, 6(%rdi)
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, 4(%rdi)
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, 2(%rdi)
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k3
; AVX512F-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k4
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k5
; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k6
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k7
; AVX512F-NEXT: kmovw %k7, 14(%rdi)
; AVX512F-NEXT: kmovw %k6, 12(%rdi)
; AVX512F-NEXT: kmovw %k5, 10(%rdi)
; AVX512F-NEXT: kmovw %k4, 8(%rdi)
; AVX512F-NEXT: kmovw %k3, 6(%rdi)
; AVX512F-NEXT: kmovw %k2, 4(%rdi)
; AVX512F-NEXT: kmovw %k1, 2(%rdi)
; AVX512F-NEXT: kmovw %k0, (%rdi)
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vzeroupper
@ -6424,36 +6410,36 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
; AVX512DQ-LABEL: test_cmp_v128i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2
; AVX512DQ-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512DQ-NEXT: vpmovsxbd %xmm4, %zmm4
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm4
; AVX512DQ-NEXT: vptestmd %zmm4, %zmm4, %k0
; AVX512DQ-NEXT: kmovw %k0, 14(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0
; AVX512DQ-NEXT: kmovw %k0, 12(%rdi)
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0
; AVX512DQ-NEXT: kmovw %k0, 10(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512DQ-NEXT: kmovw %k0, 8(%rdi)
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512DQ-NEXT: kmovw %k0, 6(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512DQ-NEXT: kmovw %k0, 4(%rdi)
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512DQ-NEXT: kmovw %k0, 2(%rdi)
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512DQ-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k3
; AVX512DQ-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k4
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k5
; AVX512DQ-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k6
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k7
; AVX512DQ-NEXT: kmovw %k7, 14(%rdi)
; AVX512DQ-NEXT: kmovw %k6, 12(%rdi)
; AVX512DQ-NEXT: kmovw %k5, 10(%rdi)
; AVX512DQ-NEXT: kmovw %k4, 8(%rdi)
; AVX512DQ-NEXT: kmovw %k3, 6(%rdi)
; AVX512DQ-NEXT: kmovw %k2, 4(%rdi)
; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: movq %rdi, %rax
; AVX512DQ-NEXT: vzeroupper
@ -6910,40 +6896,32 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
;
; AVX512F-LABEL: test_cmp_v32f64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vcmpltpd %zmm0, %zmm4, %k0
; AVX512F-NEXT: vcmpltpd %zmm1, %zmm5, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vcmpltpd %zmm2, %zmm6, %k0
; AVX512F-NEXT: vcmpltpd %zmm3, %zmm7, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vcmpltpd %zmm0, %zmm4, %k0
; AVX512F-NEXT: vcmpltpd %zmm1, %zmm5, %k2
; AVX512F-NEXT: kunpckbw %k0, %k2, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v32f64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm4, %k0
; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm5, %k1
; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vcmpltpd %zmm2, %zmm6, %k0
; AVX512DQ-NEXT: vcmpltpd %zmm3, %zmm7, %k1
; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm4, %k1
; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm5, %k2
; AVX512DQ-NEXT: kunpckbw %k1, %k2, %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v32f64:
@ -7561,40 +7539,32 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
;
; AVX512F-LABEL: test_cmp_v32i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtq %zmm4, %zmm0, %k0
; AVX512F-NEXT: vpcmpgtq %zmm5, %zmm1, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpcmpgtq %zmm6, %zmm2, %k0
; AVX512F-NEXT: vpcmpgtq %zmm7, %zmm3, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpcmpgtq %zmm4, %zmm0, %k0
; AVX512F-NEXT: vpcmpgtq %zmm5, %zmm1, %k2
; AVX512F-NEXT: kunpckbw %k0, %k2, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v32i64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtq %zmm4, %zmm0, %k0
; AVX512DQ-NEXT: vpcmpgtq %zmm5, %zmm1, %k1
; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpcmpgtq %zmm6, %zmm2, %k0
; AVX512DQ-NEXT: vpcmpgtq %zmm7, %zmm3, %k1
; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0
; AVX512DQ-NEXT: vpcmpgtq %zmm4, %zmm0, %k1
; AVX512DQ-NEXT: vpcmpgtq %zmm5, %zmm1, %k2
; AVX512DQ-NEXT: kunpckbw %k1, %k2, %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v32i64:

View File

@ -214,20 +214,40 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512VL-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512VL-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
@ -250,25 +270,19 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
; AVX512F-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm0
; AVX512F-NEXT: vpandn %ymm4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512F-NEXT: vpandn %ymm5, %ymm1, %ymm2
; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm1, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
@ -276,25 +290,19 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_
; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm1, %ymm1
; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
; AVX512VL-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT: vpmovsxbw %xmm1, %ymm0
; AVX512VL-NEXT: vpandn %ymm4, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512VL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512VL-NEXT: vpandn %ymm5, %ymm1, %ymm2
; AVX512VL-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm1, %ymm0
; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
; AVX512VL-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
; AVX512VL-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
@ -318,11 +326,19 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm3
; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
; AVX512F-NEXT: retq
;
@ -330,11 +346,19 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm3
; AVX512VL-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
;
@ -360,24 +384,14 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512F-NEXT: vpcmpeqd %zmm6, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %zmm6, %zmm1, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
; AVX512F-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT: vpsllw $15, %ymm0, %ymm0
; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0
; AVX512F-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT: vpsllw $15, %ymm1, %ymm1
; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1
; AVX512F-NEXT: vpmovdw %zmm0, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
; AVX512F-NEXT: retq
;
@ -386,24 +400,14 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_
; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512VL-NEXT: vpcmpeqd %zmm6, %zmm0, %k1
; AVX512VL-NEXT: vpcmpeqd %zmm6, %zmm1, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
; AVX512VL-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
; AVX512VL-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VL-NEXT: vpsllw $15, %ymm0, %ymm0
; AVX512VL-NEXT: vpsraw $15, %ymm0, %ymm0
; AVX512VL-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VL-NEXT: vpsllw $15, %ymm1, %ymm1
; AVX512VL-NEXT: vpsraw $15, %ymm1, %ymm1
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
; AVX512VL-NEXT: retq
;
@ -433,18 +437,14 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512F-NEXT: vpcmpeqd %zmm4, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %zmm4, %zmm1, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
; AVX512F-NEXT: retq
;
@ -453,18 +453,14 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpcmpeqd %zmm4, %zmm0, %k1
; AVX512VL-NEXT: vpcmpeqd %zmm4, %zmm1, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
; AVX512VL-NEXT: retq
;
@ -852,65 +848,33 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
define i64 @shuf64i1_zero(i64 %a) {
; AVX512F-LABEL: shuf64i1_zero:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: .cfi_def_cfa_offset 16
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $96, %rsp
; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, (%rsp)
; AVX512F-NEXT: movl (%rsp), %ecx
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: kmovw %k0, %ecx
; AVX512F-NEXT: shll $16, %ecx
; AVX512F-NEXT: orl %eax, %ecx
; AVX512F-NEXT: movq %rcx, %rax
; AVX512F-NEXT: shlq $32, %rax
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuf64i1_zero:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %rbp
; AVX512VL-NEXT: .cfi_def_cfa_offset 16
; AVX512VL-NEXT: .cfi_offset %rbp, -16
; AVX512VL-NEXT: movq %rsp, %rbp
; AVX512VL-NEXT: .cfi_def_cfa_register %rbp
; AVX512VL-NEXT: andq $-32, %rsp
; AVX512VL-NEXT: subq $96, %rsp
; AVX512VL-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512VL-NEXT: kmovw %edi, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512VL-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512VL-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512VL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512VL-NEXT: kmovw %k0, (%rsp)
; AVX512VL-NEXT: movl (%rsp), %ecx
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: kmovw %k0, %ecx
; AVX512VL-NEXT: shll $16, %ecx
; AVX512VL-NEXT: orl %eax, %ecx
; AVX512VL-NEXT: movq %rcx, %rax
; AVX512VL-NEXT: shlq $32, %rax
; AVX512VL-NEXT: orq %rcx, %rax
; AVX512VL-NEXT: movq %rbp, %rsp
; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;