From 95aea7449493e6711a326a8a76b086f87e580619 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 17 Sep 2019 04:41:14 +0000 Subject: [PATCH] [X86] Split oversized vXi1 vector arguments and return values into scalars on avx512 targets. Previously we tried to split them into narrower v64i1 or v16i1 pieces that each got promoted to vXi8 and then passed in a zmm or xmm register. But this crashes when you need to pass more pieces than available registers reserved for argument passing. The scalarizing done here generates much longer and slower code, but is consistent with the behavior of avx2 and earlier targets for these types. Fixes PR43323. llvm-svn: 372069 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 30 + llvm/lib/Target/X86/X86ISelLowering.h | 4 + llvm/test/CodeGen/X86/avx512-ext.ll | 880 +++++++++++++++++- llvm/test/CodeGen/X86/avx512-mask-op.ll | 864 ++++++++++++++++- .../CodeGen/X86/vector-compare-results.ll | 102 +- 5 files changed, 1794 insertions(+), 86 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4402682d0ec7..184645895dcf 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1996,6 +1996,12 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { + // Break wide vXi1 vectors into scalars to match avx2 behavior. + if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && + ((VT.getVectorNumElements() > 32 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) + return MVT::i8; if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return MVT::v32i8; // FIXME: Should we just make these types legal and custom split operations? @@ -2008,6 +2014,12 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { + // Break wide vXi1 vectors into scalars to match avx2 behavior. + if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && + ((VT.getVectorNumElements() > 32 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) + return VT.getVectorNumElements(); if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return 1; // FIXME: Should we just make these types legal and custom split operations? @@ -2017,6 +2029,24 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } +unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( + LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, + unsigned &NumIntermediates, MVT &RegisterVT) const { + // Break wide vXi1 vectors into scalars to match avx2 behavior. + if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && + ((VT.getVectorNumElements() > 32 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) { + RegisterVT = MVT::i8; + IntermediateVT = MVT::i1; + NumIntermediates = VT.getVectorNumElements(); + return NumIntermediates; + } + + return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT, + NumIntermediates, RegisterVT); +} + EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext& Context, EVT VT) const { diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 61a47442a675..4470de2d3780 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1203,6 +1203,10 @@ namespace llvm { CallingConv::ID CC, EVT VT) const override; + unsigned getVectorTypeBreakdownForCallingConv( + LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, + unsigned &NumIntermediates, MVT &RegisterVT) const override; + bool isIntDivCheap(EVT VT, AttributeList Attr) const override; bool supportSwiftError() const override; diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index 1e484da2fffc..167a822ac7c7 100644 --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -1886,22 +1886,436 @@ define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-LABEL: test21: ; KNL: # %bb.0: -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero,xmm7[8],zero,xmm7[9],zero,xmm7[10],zero,xmm7[11],zero,xmm7[12],zero,xmm7[13],zero,xmm7[14],zero,xmm7[15],zero -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; KNL-NEXT: vpsllw $15, %ymm4, %ymm4 -; KNL-NEXT: vpsraw $15, %ymm4, %ymm4 -; KNL-NEXT: vpand %ymm0, %ymm4, %ymm0 -; KNL-NEXT: vpsllw $15, %ymm5, %ymm4 -; KNL-NEXT: vpsraw $15, %ymm4, %ymm4 +; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftrw $1, %k1, %k2 +; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: kxorw %k0, %k1, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k1 +; KNL-NEXT: kmovw %edx, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $13, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k1 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $12, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k1 +; KNL-NEXT: kmovw %r8d, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $11, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k1 +; KNL-NEXT: kmovw %r9d, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $10, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $9, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $8, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $7, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $6, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $5, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $4, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $3, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $2, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftrw $1, %k2, %k3 +; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $8, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $7, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $6, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $5, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $4, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $3, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $1, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kshiftrw $1, %k3, %k4 +; KNL-NEXT: kxorw %k0, %k4, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $13, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $12, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $11, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $9, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $8, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $7, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $6, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $5, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $4, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $3, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $2, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $1, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kshiftrw $1, %k4, %k5 +; KNL-NEXT: kxorw %k0, %k5, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: kxorw %k0, %k4, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $13, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $12, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $11, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $10, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $9, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $8, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $7, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $6, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $5, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $4, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $3, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $2, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $1, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: korw %k4, %k0, %k4 +; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k4} {z} +; KNL-NEXT: vpmovdw %zmm4, %ymm4 ; KNL-NEXT: vpand %ymm1, %ymm4, %ymm1 -; KNL-NEXT: vpsllw $15, %ymm6, %ymm4 -; KNL-NEXT: vpsraw $15, %ymm4, %ymm4 +; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k3} {z} +; KNL-NEXT: vpmovdw %zmm4, %ymm4 ; KNL-NEXT: vpand %ymm2, %ymm4, %ymm2 -; KNL-NEXT: vpsllw $15, %ymm7, %ymm4 -; KNL-NEXT: vpsraw $15, %ymm4, %ymm4 +; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} +; KNL-NEXT: vpmovdw %zmm4, %ymm4 ; KNL-NEXT: vpand %ymm3, %ymm4, %ymm3 +; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; KNL-NEXT: vpmovdw %zmm4, %ymm4 +; KNL-NEXT: vpand %ymm0, %ymm4, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test21: @@ -1915,22 +2329,436 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; ; AVX512DQNOBW-LABEL: test21: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero,xmm7[8],zero,xmm7[9],zero,xmm7[10],zero,xmm7[11],zero,xmm7[12],zero,xmm7[13],zero,xmm7[14],zero,xmm7[15],zero -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512DQNOBW-NEXT: vpsllw $15, %ymm4, %ymm4 -; AVX512DQNOBW-NEXT: vpsraw $15, %ymm4, %ymm4 -; AVX512DQNOBW-NEXT: vpand %ymm0, %ymm4, %ymm0 -; AVX512DQNOBW-NEXT: vpsllw $15, %ymm5, %ymm4 -; AVX512DQNOBW-NEXT: vpsraw $15, %ymm4, %ymm4 +; AVX512DQNOBW-NEXT: kmovw %esi, %k0 +; AVX512DQNOBW-NEXT: kmovw %edi, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k1, %k2 +; AVX512DQNOBW-NEXT: kxorw %k0, %k2, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kxorw %k0, %k1, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k0, %k1 +; AVX512DQNOBW-NEXT: kmovw %edx, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k1 +; AVX512DQNOBW-NEXT: kmovw %ecx, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k1 +; AVX512DQNOBW-NEXT: kmovw %r8d, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k1 +; AVX512DQNOBW-NEXT: kmovw %r9d, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k2, %k3 +; AVX512DQNOBW-NEXT: kxorw %k1, %k3, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k1, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k1, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k1, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k1, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k1, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k1, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k3, %k4 +; AVX512DQNOBW-NEXT: kxorw %k2, %k4, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k3, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k3, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k3, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k3, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k3, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k3, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k3, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k3, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k3, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k3, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k3, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k3, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k3, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k2, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k3, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: korw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k4, %k5 +; AVX512DQNOBW-NEXT: kxorw %k3, %k5, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k3, %k3 +; AVX512DQNOBW-NEXT: kxorw %k3, %k4, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k3, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k3, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k3, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k3, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k3, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k3, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k3, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k3, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k3, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k3, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k3, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k3, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k3, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k3, %k3 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: korw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: vpmovm2d %k3, %zmm4 +; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm1, %ymm4, %ymm1 -; AVX512DQNOBW-NEXT: vpsllw $15, %ymm6, %ymm4 -; AVX512DQNOBW-NEXT: vpsraw $15, %ymm4, %ymm4 +; AVX512DQNOBW-NEXT: vpmovm2d %k2, %zmm4 +; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm4, %ymm2 -; AVX512DQNOBW-NEXT: vpsllw $15, %ymm7, %ymm4 -; AVX512DQNOBW-NEXT: vpsraw $15, %ymm4, %ymm4 +; AVX512DQNOBW-NEXT: vpmovm2d %k1, %zmm4 +; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512DQNOBW-NEXT: vpand %ymm0, %ymm4, %ymm0 ; AVX512DQNOBW-NEXT: retq %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer ret <64 x i16> %ret diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index db878ff34de3..4b34adad14d4 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -2753,23 +2753,429 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; ; KNL-LABEL: store_64i1: ; KNL: ## %bb.0: -; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: vpmovsxbd %xmm1, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL-NEXT: vpmovsxbd %xmm2, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 -; KNL-NEXT: vpmovsxbd %xmm3, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k3 +; KNL-NEXT: kmovw %edx, %k0 +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kshiftrw $1, %k1, %k2 +; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: kxorw %k0, %k1, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k1 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $13, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k1 +; KNL-NEXT: kmovw %r8d, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $12, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k1 +; KNL-NEXT: kmovw %r9d, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $11, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $10, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $9, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $8, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $7, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $6, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $5, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $4, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $3, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $2, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftrw $1, %k2, %k3 +; KNL-NEXT: kxorw %k1, %k3, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $14, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $2, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $3, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $4, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $5, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $6, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $7, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $8, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $8, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $7, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $9, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $6, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $10, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $5, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $11, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $4, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $12, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $3, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $13, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $14, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $1, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kshiftrw $1, %k3, %k4 +; KNL-NEXT: kxorw %k2, %k4, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k3, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $13, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $3, %k2, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $12, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $4, %k2, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $11, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $5, %k2, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $6, %k2, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $9, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $7, %k2, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $8, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $8, %k2, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $7, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $6, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $5, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $4, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $3, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $2, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $1, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kshiftrw $1, %k2, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kshiftrw $1, %k4, %k5 +; KNL-NEXT: kxorw %k3, %k5, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k4, %k3 +; KNL-NEXT: kshiftrw $2, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $13, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $3, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $12, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $4, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $11, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $5, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $10, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $6, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $9, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $7, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $8, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $8, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $7, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $9, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $6, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $5, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $11, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $4, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $12, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $3, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $13, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $2, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $1, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $1, %k3, %k3 +; KNL-NEXT: kshiftrw $1, %k3, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: korw %k4, %k3, %k3 ; KNL-NEXT: kmovw %k3, 6(%rdi) ; KNL-NEXT: kmovw %k2, 4(%rdi) ; KNL-NEXT: kmovw %k1, 2(%rdi) ; KNL-NEXT: kmovw %k0, (%rdi) -; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: store_64i1: @@ -2790,23 +3196,429 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; ; AVX512DQ-LABEL: store_64i1: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 -; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm0 -; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm0 -; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 -; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm0 -; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 +; AVX512DQ-NEXT: kmovw %edx, %k0 +; AVX512DQ-NEXT: kmovw %esi, %k1 +; AVX512DQ-NEXT: kshiftrw $1, %k1, %k2 +; AVX512DQ-NEXT: kxorw %k0, %k2, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k0 +; AVX512DQ-NEXT: kxorw %k0, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512DQ-NEXT: kmovw %ecx, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $13, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k1 +; AVX512DQ-NEXT: kmovw %r8d, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $12, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512DQ-NEXT: kmovw %r9d, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k1 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $10, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k1 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $9, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k1 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $8, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k1 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $7, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k1 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k1 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $5, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k1 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $4, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $3, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k1 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $2, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k1 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $1, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftrw $1, %k2, %k3 +; AVX512DQ-NEXT: kxorw %k1, %k3, %k1 +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $14, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k1, %k2, %k1 +; AVX512DQ-NEXT: kshiftrw $2, %k1, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $13, %k2, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $3, %k1, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $12, %k2, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $4, %k1, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $11, %k2, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $5, %k1, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $10, %k2, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $9, %k2, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $7, %k1, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $8, %k2, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $7, %k2, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $9, %k1, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $6, %k2, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $10, %k1, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $5, %k2, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $4, %k2, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $12, %k1, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $3, %k2, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $13, %k1, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $2, %k2, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $14, %k1, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $1, %k2, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $1, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $1, %k1, %k1 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k1, %k1 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kshiftrw $1, %k3, %k4 +; AVX512DQ-NEXT: kxorw %k2, %k4, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $14, %k2, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k3, %k2 +; AVX512DQ-NEXT: kshiftrw $2, %k2, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $13, %k3, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $3, %k2, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $12, %k3, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $4, %k2, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $11, %k3, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $5, %k2, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $10, %k3, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $6, %k2, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $9, %k3, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $7, %k2, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $8, %k3, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $8, %k2, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $7, %k3, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $9, %k2, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $6, %k3, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $10, %k2, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $5, %k3, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $11, %k2, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $4, %k3, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $12, %k2, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $3, %k3, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $13, %k2, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $2, %k3, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $14, %k2, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $1, %k3, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $1, %k2, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: korw %k3, %k2, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kshiftrw $1, %k4, %k5 +; AVX512DQ-NEXT: kxorw %k3, %k5, %k3 +; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $14, %k3, %k3 +; AVX512DQ-NEXT: kxorw %k3, %k4, %k3 +; AVX512DQ-NEXT: kshiftrw $2, %k3, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $13, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $3, %k3, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $12, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $4, %k3, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $11, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $5, %k3, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $6, %k3, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $9, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $7, %k3, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $8, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $8, %k3, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $7, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $9, %k3, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $6, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $10, %k3, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $5, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $11, %k3, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $4, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $12, %k3, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $3, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $13, %k3, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $2, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $14, %k3, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $1, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 +; AVX512DQ-NEXT: kshiftlw $1, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $1, %k3, %k3 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: korw %k4, %k3, %k3 ; AVX512DQ-NEXT: kmovw %k3, 6(%rdi) ; AVX512DQ-NEXT: kmovw %k2, 4(%rdi) ; AVX512DQ-NEXT: kmovw %k1, 2(%rdi) ; AVX512DQ-NEXT: kmovw %k0, (%rdi) -; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; X86-LABEL: store_64i1: diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll index 2d9fb798267b..47a6ffb0aed5 100644 --- a/llvm/test/CodeGen/X86/vector-compare-results.ll +++ b/llvm/test/CodeGen/X86/vector-compare-results.ll @@ -827,27 +827,49 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind { ; ; AVX512F-LABEL: test_cmp_v64i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k3 +; AVX512F-NEXT: kmovw %k3, 6(%rdi) +; AVX512F-NEXT: kmovw %k2, 4(%rdi) +; AVX512F-NEXT: kmovw %k1, 2(%rdi) +; AVX512F-NEXT: kmovw %k0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v64i8: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: movq %rdi, %rax ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2 ; AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 +; AVX512DQ-NEXT: kmovw %k3, 6(%rdi) +; AVX512DQ-NEXT: kmovw %k2, 4(%rdi) +; AVX512DQ-NEXT: kmovw %k1, 2(%rdi) +; AVX512DQ-NEXT: kmovw %k0, (%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -1453,35 +1475,45 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind { ; ; AVX512F-LABEL: test_cmp_v64i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 +; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k3 +; AVX512F-NEXT: kmovw %k3, 6(%rdi) +; AVX512F-NEXT: kmovw %k2, 4(%rdi) +; AVX512F-NEXT: kmovw %k1, 2(%rdi) +; AVX512F-NEXT: kmovw %k0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v64i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: movq %rdi, %rax ; AVX512DQ-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm0 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm0 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 +; AVX512DQ-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm0 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 +; AVX512DQ-NEXT: kmovw %k3, 6(%rdi) +; AVX512DQ-NEXT: kmovw %k2, 4(%rdi) +; AVX512DQ-NEXT: kmovw %k1, 2(%rdi) +; AVX512DQ-NEXT: kmovw %k0, (%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -1674,10 +1706,12 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind { ; ; AVX512BW-LABEL: test_cmp_v128i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm1, %k0 -; AVX512BW-NEXT: vpcmpgtb %zmm2, %zmm0, %k1 -; AVX512BW-NEXT: vpmovm2b %k1, %zmm0 -; AVX512BW-NEXT: vpmovm2b %k0, %zmm1 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: vpcmpgtb %zmm2, %zmm0, %k0 +; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm1, %k1 +; AVX512BW-NEXT: kmovq %k1, 8(%rdi) +; AVX512BW-NEXT: kmovq %k0, (%rdi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = icmp sgt <128 x i8> %a0, %a1 ret <128 x i1> %1