[AVX512] Use vpternlog with an immediate of 0xff to create 512-bit all-ones vectors.

llvm-svn: 275045
Craig Topper 2016-07-11 05:36:48 +00:00
parent 8674849d6e
commit 516e14cd8e
15 changed files with 195 additions and 139 deletions
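
Why an immediate of 0xff yields all ones: VPTERNLOGD evaluates, for every bit of the destination, an arbitrary three-input boolean function of the corresponding bits of its three sources, and the 8-bit immediate is that function's truth table, indexed by the three source bits. With an immediate of 0xff every table entry is 1, so the result is all ones regardless of what the sources contain, which is why the expansion below can feed the instruction three undef registers and why the tests no longer need a rip-relative broadcast of an all-ones constant. A minimal C++ model of the per-lane semantics, purely illustrative and not LLVM code (the function name and index convention are assumptions):

    #include <cstdint>

    // Per-lane model of VPTERNLOGD: for each bit position, the three source
    // bits form a 3-bit index into the 8-entry truth table given by Imm.
    static uint32_t ternlogLane(uint32_t A, uint32_t B, uint32_t C, uint8_t Imm) {
      uint32_t Result = 0;
      for (int Bit = 0; Bit < 32; ++Bit) {
        unsigned Idx = (((A >> Bit) & 1) << 2) | (((B >> Bit) & 1) << 1) |
                       ((C >> Bit) & 1);
        Result |= uint32_t((Imm >> Idx) & 1) << Bit;
      }
      return Result;
    }

    // With Imm == 0xff every truth-table entry is 1, so
    // ternlogLane(X, Y, Z, 0xff) == 0xffffffff for any X, Y, Z -- the sources
    // may safely be left undefined when materializing an all-ones vector.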

@ -6554,11 +6554,11 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
// vpcmpeqd on 256-bit vectors.
if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
(VT == MVT::v8i32 && Subtarget.hasInt256()))
return Op;
if (!VT.is512BitVector())
return getOnesVector(VT, Subtarget, DAG, DL);
return getOnesVector(VT, Subtarget, DAG, DL);
}
return SDValue();

@ -416,6 +416,8 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in {
def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
[(set VR512:$dst, (v16i32 immAllZerosV))]>;
def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
[(set VR512:$dst, (v16i32 immAllOnesV))]>;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,

@ -5547,6 +5547,15 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
case X86::AVX2_SETALLONES:
return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
case X86::AVX512_512_SETALLONES: {
unsigned Reg = MIB->getOperand(0).getReg();
MIB->setDesc(get(X86::VPTERNLOGDZrri));
// VPTERNLOGD needs 3 register inputs and an immediate.
// 0xff will return 1s for any input.
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef)
.addReg(Reg, RegState::Undef).addImm(0xff);
return true;
}
case X86::TEST8ri_NOREX:
MI.setDesc(get(X86::TEST8ri));
return true;
@ -6231,6 +6240,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
else
switch (LoadMI.getOpcode()) {
case X86::AVX512_512_SET0:
case X86::AVX512_512_SETALLONES:
Alignment = 64;
break;
case X86::AVX2_SETALLONES:
@ -6281,6 +6291,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_128_SET0:
case X86::AVX512_256_SET0:
case X86::AVX512_512_SET0:
case X86::AVX512_512_SETALLONES:
case X86::FsFLD0SD:
case X86::FsFLD0SS: {
// Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
@ -6312,7 +6323,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Ty = Type::getFloatTy(MF.getFunction()->getContext());
else if (Opc == X86::FsFLD0SD)
Ty = Type::getDoubleTy(MF.getFunction()->getContext());
else if (Opc == X86::AVX512_512_SET0)
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
Opc == X86::AVX512_256_SET0)
@ -6320,7 +6331,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES);
bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
Opc == X86::AVX512_512_SETALLONES);
const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
Constant::getNullValue(Ty);
unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
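
Taken together, the two X86InstrInfo.cpp changes cover both uses of the new pseudo. expandPostRAPseudo rewrites AVX512_512_SETALLONES in place into VPTERNLOGD with the destination register tied to all three undef sources and an immediate of 0xff, which prints as vpternlogd $255, %zmm0, %zmm0, %zmm0 in the updated test checks below. foldMemoryOperandImpl treats the pseudo like the existing SET0/SETALLONES pseudos, so that under register pressure the constant can instead be folded into a 64-byte-aligned constant-pool load. A condensed sketch of the constant that path builds, restating the diff above (Ctx and MCP stand in for the surrounding function's LLVMContext and MachineConstantPool):

    // 512-bit all-ones constant, 64-byte aligned, emitted when the
    // AVX512_512_SETALLONES pseudo is folded as a memory operand.
    Type *Ty = VectorType::get(Type::getInt32Ty(Ctx), 16);
    const Constant *C = Constant::getAllOnesValue(Ty);
    unsigned CPI = MCP.getConstantPoolIndex(C, /*Alignment=*/64);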

@ -4,7 +4,8 @@
define <16 x i32> @test2(<16 x i32> %x) {
; CHECK-LABEL: test2:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = add <16 x i32><i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %x
ret <16 x i32>%res
@ -15,8 +16,8 @@ define <16 x float> @test3(<4 x float> %a) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vmovss %xmm0, %xmm2, %xmm0
; CHECK-NEXT: vmovss %xmm1, %xmm2, %xmm1
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; CHECK-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,0],xmm0[0,1]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1

@ -30,7 +30,8 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@ -52,7 +53,8 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_X32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_X32-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
; KNL_X32-NEXT: vpbroadcastd LCPI1_0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0
; KNL_X32-NEXT: retl
%c = and <16 x i1>%a, %b
@ -68,7 +70,8 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: retq
;
@ -91,7 +94,7 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
; KNL_X32-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
; KNL_X32-NEXT: vpbroadcastd LCPI2_1, %zmm0
; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0
; KNL_X32-NEXT: retl
@ -183,7 +186,8 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; KNL-NEXT: Ltmp1:
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: callq _func16xi1
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@ -212,7 +216,8 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; KNL_X32-NEXT: Ltmp1:
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; KNL_X32-NEXT: vpbroadcastd LCPI5_0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0
; KNL_X32-NEXT: calll _func16xi1
; KNL_X32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@ -286,7 +291,8 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
; KNL-NEXT: movb $85, %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: popq %rax
; KNL-NEXT: retq
@ -322,7 +328,7 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
; KNL_X32-NEXT: movb $85, %al
; KNL_X32-NEXT: kmovw %eax, %k1
; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
; KNL_X32-NEXT: vpbroadcastd LCPI7_1, %zmm0
; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0
; KNL_X32-NEXT: addl $12, %esp

@ -685,7 +685,8 @@ define <16 x float> @sitofp_16i1_float(<16 x i32> %a) {
; KNL: ## BB#0:
; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vcvtdq2ps %zmm0, %zmm0
; KNL-NEXT: retq
;
@ -748,7 +749,8 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
; KNL: ## BB#0:
; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0
; KNL-NEXT: retq

@ -1409,7 +1409,8 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
; KNL-NEXT: knotw %k0, %k1
; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: retq
;
@ -1465,7 +1466,8 @@ define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
; KNL-LABEL: sext_16i1_16i32:
; KNL: ## BB#0:
; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: sext_16i1_16i32:

@ -349,7 +349,8 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; KNL-NEXT: LBB17_1:
; KNL-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
; KNL-NEXT: LBB17_3:
; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@ -386,7 +387,8 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
; KNL-NEXT: LBB18_3:
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@ -475,7 +477,8 @@ define <16 x i1> @test15(i32 %x, i32 %y) {
; KNL-NEXT: movw $1, %cx
; KNL-NEXT: cmovgw %ax, %cx
; KNL-NEXT: kmovw %ecx, %k1
; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@ -512,25 +515,25 @@ define <64 x i8> @test16(i64 %x) {
; KNL-NEXT: movl %edi, (%rsp)
; KNL-NEXT: shrq $32, %rdi
; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; KNL-NEXT: movl {{.*}}(%rip), %eax
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: kmovw (%rsp), %k1
; KNL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; KNL-NEXT: movl $1, %ecx
; KNL-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; KNL-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z}
; KNL-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm2
; KNL-NEXT: movl $1, %eax
; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; KNL-NEXT: vpsllw $7, %ymm2, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
@ -570,30 +573,30 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
; KNL-NEXT: movl %edi, (%rsp)
; KNL-NEXT: shrq $32, %rdi
; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; KNL-NEXT: movl {{.*}}(%rip), %eax
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; KNL-NEXT: kmovw (%rsp), %k1
; KNL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; KNL-NEXT: xorl %ecx, %ecx
; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; KNL-NEXT: xorl %eax, %eax
; KNL-NEXT: cmpl %edx, %esi
; KNL-NEXT: setg %cl
; KNL-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; KNL-NEXT: setg %al
; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; KNL-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z}
; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
@ -628,7 +631,8 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kshiftlw $7, %k2, %k1
; KNL-NEXT: korw %k1, %k0, %k1
; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: retq
;
@ -1368,7 +1372,8 @@ define <8 x i64> @load_8i1(<8 x i1>* %a) {
; KNL: ## BB#0:
; KNL-NEXT: movzbl (%rdi), %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: load_8i1:
@ -1385,7 +1390,8 @@ define <16 x i32> @load_16i1(<16 x i1>* %a) {
; KNL-LABEL: load_16i1:
; KNL: ## BB#0:
; KNL-NEXT: kmovw (%rdi), %k1
; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: load_16i1:
@ -1403,7 +1409,8 @@ define <2 x i16> @load_2i1(<2 x i1>* %a) {
; KNL: ## BB#0:
; KNL-NEXT: movzbl (%rdi), %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
@ -1422,7 +1429,8 @@ define <4 x i16> @load_4i1(<4 x i1>* %a) {
; KNL: ## BB#0:
; KNL-NEXT: movzbl (%rdi), %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
@ -1441,11 +1449,11 @@ define <32 x i16> @load_32i1(<32 x i1>* %a) {
; KNL-LABEL: load_32i1:
; KNL: ## BB#0:
; KNL-NEXT: kmovw (%rdi), %k1
; KNL-NEXT: movl {{.*}}(%rip), %eax
; KNL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: kmovw 2(%rdi), %k1
; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdw %zmm1, %ymm1
; KNL-NEXT: retq
;
@ -1463,20 +1471,20 @@ define <64 x i8> @load_64i1(<64 x i1>* %a) {
; KNL-LABEL: load_64i1:
; KNL: ## BB#0:
; KNL-NEXT: kmovw (%rdi), %k1
; KNL-NEXT: movl {{.*}}(%rip), %eax
; KNL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: kmovw 2(%rdi), %k1
; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: kmovw 4(%rdi), %k1
; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: kmovw 6(%rdi), %k1
; KNL-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z}
; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; KNL-NEXT: kmovw 4(%rdi), %k1
; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
; KNL-NEXT: kmovw 6(%rdi), %k1
; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: load_64i1:

@ -218,7 +218,8 @@ define <16 x i32> @test_vbroadcast() {
; ALL: # BB#0: # %entry
; ALL-NEXT: vpxord %zmm0, %zmm0, %zmm0
; ALL-NEXT: vcmpunordps %zmm0, %zmm0, %k1
; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; ALL-NEXT: knotw %k1, %k1
; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; ALL-NEXT: retq

@ -865,7 +865,8 @@ define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1
; KNL-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1
; KNL-NEXT: kxnorw %k1, %k0, %k1
; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: retq
;
@ -889,7 +890,8 @@ define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32>
; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; KNL-NEXT: vpcmpgtd %zmm3, %zmm2, %k1
; KNL-NEXT: kxorw %k1, %k0, %k1
; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;

@ -2473,7 +2473,8 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x
; AVX512F-NEXT: ## BB#31: ## %cond.load43
; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: LBB50_32: ## %else44
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: retq
@ -5676,7 +5677,8 @@ define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i1
; AVX512F-NEXT: ## BB#15: ## %cond.load19
; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: LBB53_16: ## %else20
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k1} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpxor %xmm2, %xmm1, %xmm2
@ -6116,7 +6118,8 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16
; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: LBB54_32: ## %else44
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq

@ -706,7 +706,8 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; AVX512-LABEL: test_cmp_v8f64:
; AVX512: # BB#0:
; AVX512-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
%1 = fcmp ogt <8 x double> %a0, %a1
@ -767,7 +768,8 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; AVX512-LABEL: test_cmp_v16f32:
; AVX512: # BB#0:
; AVX512-NEXT: vcmpltps %zmm0, %zmm1, %k1
; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
%1 = fcmp ogt <16 x float> %a0, %a1
@ -890,7 +892,8 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX512-LABEL: test_cmp_v8i64:
; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
%1 = icmp sgt <8 x i64> %a0, %a1
@ -954,7 +957,8 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX512-LABEL: test_cmp_v16i32:
; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
%1 = icmp sgt <16 x i32> %a0, %a1

@ -785,7 +785,8 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
; AVX512: # BB#0: # %entry
; AVX512-NEXT: movzbl (%rdi), %eax
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
@ -966,7 +967,8 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
; AVX512: # BB#0: # %entry
; AVX512-NEXT: movzbl (%rdi), %eax
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: retq
@ -1162,7 +1164,8 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
; AVX512: # BB#0: # %entry
; AVX512-NEXT: movzbl (%rdi), %eax
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
@ -1455,7 +1458,8 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
; AVX512: # BB#0: # %entry
; AVX512-NEXT: movzbl (%rdi), %eax
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
;
@ -1848,7 +1852,8 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
; AVX512: # BB#0: # %entry
; AVX512-NEXT: movzbl (%rdi), %eax
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: retq
;
@ -2350,7 +2355,8 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
; AVX512-LABEL: load_sext_16i1_to_16i8:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: kmovw (%rdi), %k1
; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
@ -2887,7 +2893,8 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; AVX512-LABEL: load_sext_16i1_to_16i16:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: kmovw (%rdi), %k1
; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: retq
;
@ -3731,13 +3738,13 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
; AVX512-LABEL: load_sext_32i1_to_32i8:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: kmovw (%rdi), %k1
; AVX512-NEXT: movl {{.*}}(%rip), %eax
; AVX512-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: kmovw 2(%rdi), %k1
; AVX512-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z}
; AVX512-NEXT: vpmovdb %zmm1, %xmm1
; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: kmovw 2(%rdi), %k1
; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_32i1_to_32i8:

@ -358,9 +358,9 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
; CHECK-LABEL: combine_pshufb_identity_mask:
; CHECK: # BB#0:
; CHECK-NEXT: kmovq %rdi, %k1
; CHECK-NEXT: vmovdqu8 {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; CHECK-NEXT: vmovdqu8 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3
; CHECK-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0

@ -74,13 +74,13 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
; AVX512F-NEXT: movq {{.*}}(%rip), %rax
; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: retq
;
@ -105,14 +105,14 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
; AVX512F: # BB#0:
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
; AVX512F-NEXT: movl {{.*}}(%rip), %eax
; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vpslld $31, %zmm1, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z}
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
; AVX512F-NEXT: vpslld $31, %zmm2, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq
;
@ -163,13 +163,13 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: movq {{.*}}(%rip), %rax
; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z}
; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm1
; AVX512F-NEXT: vpbroadcastq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: retq
;
@ -192,7 +192,8 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
@ -224,7 +225,8 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
@ -252,7 +254,8 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
@ -284,7 +287,8 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
@ -316,14 +320,14 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: movq {{.*}}(%rip), %rax
; AVX512F-NEXT: movb $51, %cl
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpbroadcastq %rax, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: movb $51, %al
; AVX512F-NEXT: kmovw %eax, %k2
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z}
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
@ -355,11 +359,11 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
@ -371,7 +375,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
; VL_BW_DQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0
; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
@ -388,7 +392,8 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
@ -426,8 +431,9 @@ define i64 @shuf64i1_zero(i64 %a) {
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $96, %rsp
; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1