AVX-512: fixed algorithm of building vectors of i1 elements

Fixed extract/insert of i1 elements. A load or zero-extending load of i1 must now be followed by "and $1, %reg" to prevent loading garbage. Added a bunch of new tests. llvm-svn: 237793
2015-05-20 14:32:03 +00:00 · 2015-05-20 14:32:03 +00:00 · f61727d880
parent 69c6008e49
commit f61727d880
7 changed files with 281 additions and 90 deletions
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -1471,6 +1471,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
    setOperationAction(ISD::SELECT,             MVT::v32i1, Custom);
    setOperationAction(ISD::SELECT,             MVT::v64i1, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i1, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i1, Custom);

    for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
      const MVT VT = (MVT::SimpleValueType)i;
@ -1500,6 +1504,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4i1, Custom);
    setOperationAction(ISD::SELECT,             MVT::v2i1, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i1, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i1, Custom);

    setOperationAction(ISD::AND,                MVT::v8i32, Legal);
    setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
@ -5188,12 +5194,27 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
  return NV;
 }

+static SDValue ConvertI1VectorToInterger(SDValue Op, SelectionDAG &DAG) { // Packs a constant build-vector of 1-bit elements into one integer constant.
+  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
+         Op.getScalarValueSizeInBits() == 1 &&
+         "Can not convert non-constant vector");
+  uint64_t Immediate = 0;
+  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
+    SDValue In = Op.getOperand(idx);
+    if (In.getOpcode() != ISD::UNDEF) // undef operands are skipped, so their bit stays 0
+      Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; // operand idx's value is shifted to bit position idx
+  }
+  SDLoc dl(Op);
+  MVT VT =
+   MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8)); // integer type as wide as the vector, but never narrower than 8 bits
+  return DAG.getConstant(Immediate, dl, VT);
+}
 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
 SDValue
 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {

  MVT VT = Op.getSimpleValueType();
-  assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
+  assert((VT.getVectorElementType() == MVT::i1) &&
         "Unexpected type in LowerBUILD_VECTORvXi1!");

  SDLoc dl(Op);
@ -5209,62 +5230,69 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
  }

-  bool AllContants = true;
+  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+    SDValue Imm = ConvertI1VectorToInterger(Op, DAG);
+    if (Imm.getValueSizeInBits() == VT.getSizeInBits())
+      return DAG.getNode(ISD::BITCAST, dl, VT, Imm);
+    SDValue ExtVec = DAG.getNode(ISD::BITCAST, dl, MVT::v8i1, Imm);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+                        DAG.getIntPtrConstant(0, dl));
+  }
+
+  // Vector has one or more non-const elements
  uint64_t Immediate = 0;
-  int NonConstIdx = -1;
+  SmallVector<unsigned, 16> NonConstIdx;
  bool IsSplat = true;
-  unsigned NumNonConsts = 0;
-  unsigned NumConsts = 0;
+  bool HasConstElts = false;
+  int SplatIdx = -1;
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (In.getOpcode() == ISD::UNDEF)
      continue;
-    if (!isa<ConstantSDNode>(In)) {
-      AllContants = false;
-      NonConstIdx = idx;
-      NumNonConsts++;
-    } else {
-      NumConsts++;
-      if (cast<ConstantSDNode>(In)->getZExtValue())
-      Immediate |= (1ULL << idx);
+    if (!isa<ConstantSDNode>(In)) 
+      NonConstIdx.push_back(idx);
+    else {
+      Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+      HasConstElts = true;
    }
-    if (In != Op.getOperand(0))
+    if (SplatIdx == -1)
+      SplatIdx = idx;
+    else if (In != Op.getOperand(SplatIdx))
      IsSplat = false;
  }

-  if (AllContants) {
-    SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
-      DAG.getConstant(Immediate, dl, MVT::i16));
-    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
-                       DAG.getIntPtrConstant(0, dl));
+  // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
+  if (IsSplat)
+    return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
+                       DAG.getConstant(1, dl, VT),
+                       DAG.getConstant(0, dl, VT));
+
+  // insert elements one by one
+  SDValue DstVec;
+  SDValue Imm;
+  if (Immediate) {
+    MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
+    Imm = DAG.getConstant(Immediate, dl, ImmVT);
+  }
+  else if (HasConstElts)
+    Imm = DAG.getConstant(0, dl, VT);
+  else 
+    Imm = DAG.getUNDEF(VT);
+  if (Imm.getValueSizeInBits() == VT.getSizeInBits())
+    DstVec = DAG.getNode(ISD::BITCAST, dl, VT, Imm);
+  else {
+    SDValue ExtVec = DAG.getNode(ISD::BITCAST, dl, MVT::v8i1, Imm);
+    DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+                         DAG.getIntPtrConstant(0, dl));
  }

-  if (NumNonConsts == 1 && NonConstIdx != 0) {
-    SDValue DstVec;
-    if (NumConsts) {
-      SDValue VecAsImm = DAG.getConstant(Immediate, dl,
-                                         MVT::getIntegerVT(VT.getSizeInBits()));
-      DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
-    }
-    else
-      DstVec = DAG.getUNDEF(VT);
-    return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
-                       Op.getOperand(NonConstIdx),
-                       DAG.getIntPtrConstant(NonConstIdx, dl));
+  for (unsigned i = 0; i < NonConstIdx.size(); ++i) {
+    unsigned InsertIdx = NonConstIdx[i];
+    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+                         Op.getOperand(InsertIdx),
+                         DAG.getIntPtrConstant(InsertIdx, dl));
  }
-  if (!IsSplat && (NonConstIdx != 0))
-    llvm_unreachable("Unsupported BUILD_VECTOR operation");
-  MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
-  SDValue Select;
-  if (IsSplat)
-    Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
-                          DAG.getConstant(-1, dl, SelectVT),
-                          DAG.getConstant(0, dl, SelectVT));
-  else
-    Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
-                         DAG.getConstant((Immediate | 1), dl, SelectVT),
-                         DAG.getConstant(Immediate, dl, SelectVT));
-  return DAG.getNode(ISD::BITCAST, dl, VT, Select);
+  return DstVec;
 }

 /// \brief Return true if \p N implements a horizontal binop and return the
@ -10670,15 +10698,11 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
+  if (IdxVal)
+    EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+                           DAG.getConstant(IdxVal, dl, MVT::i8));
  if (Vec.getOpcode() == ISD::UNDEF)
-    return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
-                       DAG.getConstant(IdxVal, dl, MVT::i8));
-  const TargetRegisterClass* rc = getRegClassFor(VecVT);
-  unsigned MaxSift = rc->getSize()*8 - 1;
-  EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
-                    DAG.getConstant(MaxSift, dl, MVT::i8));
-  EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
-                    DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
+    return EltInVec;
  return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
 }

@ -13623,6 +13647,29 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    }
  }

+    if (VT.isVector() && VT.getScalarType() == MVT::i1) {
+      SDValue Op1Scalar;
+      if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
+        Op1Scalar = ConvertI1VectorToInterger(Op1, DAG);
+      else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
+        Op1Scalar = Op1.getOperand(0);
+      SDValue Op2Scalar;
+      if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
+        Op2Scalar = ConvertI1VectorToInterger(Op2, DAG);
+      else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
+        Op2Scalar = Op2.getOperand(0);
+      if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
+        SDValue newSelect = DAG.getNode(ISD::SELECT, DL, 
+                                        Op1Scalar.getValueType(),
+                                        Cond, Op1Scalar, Op2Scalar);
+        if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
+          return DAG.getNode(ISD::BITCAST, DL, VT, newSelect);
+        SDValue ExtVec = DAG.getNode(ISD::BITCAST, DL, MVT::v8i1, newSelect);
+        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
+                           DAG.getIntPtrConstant(0, DL));
+    }
+  }
+
  if (VT == MVT::v4i1 || VT == MVT::v2i1) {
    SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
    Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
@ -20728,7 +20775,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
    if (!InVec.hasOneUse())
      return SDValue();
    EVT BCVT = InVec.getOperand(0).getValueType();
-    if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
+    if (!BCVT.isVector() || 
+        BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
      return SDValue();
    InVec = InVec.getOperand(0);
  }
@ -20833,7 +20881,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
    return NewOp;

  SDValue InputVector = N->getOperand(0);
-
+  SDLoc dl(InputVector);
  // Detect mmx to i32 conversion through a v2i32 elt extract.
  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
      N->getValueType(0) == MVT::i32 &&
@ -20858,6 +20906,18 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                         MMXSrcOp.getOperand(0));
  }

+  EVT VT = N->getValueType(0);
+  
+  if (VT == MVT::i1 && dyn_cast<ConstantSDNode>(N->getOperand(1)) &&
+      InputVector.getOpcode() == ISD::BITCAST &&
+      dyn_cast<ConstantSDNode>(InputVector.getOperand(0))) {
+    uint64_t ExtractedElt =
+	  cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    uint64_t InputValue =
+	  cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
+    uint64_t Res = (InputValue >> ExtractedElt) & 1;
+    return DAG.getConstant(Res, dl, MVT::i1);
+  }
  // Only operate on vectors of 4 elements, where the alternative shuffling
  // gets to be more expensive.
  if (InputVector.getValueType() != MVT::v4i32)
@ -20903,7 +20963,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
  // otherwise bounce the vector off the cache.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Vals[4];
-  SDLoc dl(InputVector);

  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
    SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
@ -23606,6 +23665,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
+  SDLoc dl(N);

  // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
  // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
@ -23613,7 +23673,6 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
  // from AH (which we otherwise need to do contortions to access).
  if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
      N0.getValueType() == MVT::i8 && VT == MVT::i32) {
-    SDLoc dl(N);
    SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
    SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
                            N0.getOperand(0), N0.getOperand(1));
@ -23621,8 +23680,15 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
    return R.getValue(1);
  }

-  if (!DCI.isBeforeLegalizeOps())
+  if (!DCI.isBeforeLegalizeOps()) {
+    if (N0.getValueType() == MVT::i1) {
+      SDValue Zero = DAG.getConstant(0, dl, VT);
+      SDValue AllOnes =
+        DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, VT);
+      return DAG.getNode(ISD::SELECT, dl, VT, N0, AllOnes, Zero);
+    }
    return SDValue();
+  }

  if (!Subtarget->hasFp256())
    return SDValue();
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@ -1855,7 +1855,9 @@ let Predicates = [HasAVX512] in {
  def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
            (KMOVWmk addr:$dst, VK16:$src)>;
  def : Pat<(i1 (load addr:$src)),
-            (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>;
+            (COPY_TO_REGCLASS (AND16ri (i16 (SUBREG_TO_REG (i32 0),
+                                              (MOV8rm addr:$src), sub_8bit)),
+                                (i16 1)), VK1)>;
  def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
            (KMOVWkm addr:$src)>;
 }
@ -1920,13 +1922,13 @@ let Predicates = [HasAVX512, NoDQI] in {
  // GR from/to 8-bit mask without native support
  def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
            (COPY_TO_REGCLASS
-              (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
-              VK8)>;
+             (KMOVWkr (MOVZX32rr8 GR8 :$src)), VK8)>;
  def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
            (EXTRACT_SUBREG
              (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
              sub_8bit)>;
 }
+
 let Predicates = [HasAVX512] in {
  def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK16:$src, VK1)>;
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@ -1064,11 +1064,12 @@ defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>;
 defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;

 // zextload bool -> zextload byte
-def : Pat<(zextloadi8i1  addr:$src), (MOV8rm     addr:$src)>;
-def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
-def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(zextloadi8i1  addr:$src), (AND8ri (MOV8rm addr:$src), (i8 1))>;
+def : Pat<(zextloadi16i1 addr:$src), (AND16ri (MOVZX16rm8 addr:$src), (i16 1))>;
+def : Pat<(zextloadi32i1 addr:$src), (AND32ri (MOVZX32rm8 addr:$src), (i32 1))>;
 def : Pat<(zextloadi64i1 addr:$src),
-          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+          (SUBREG_TO_REG (i64 0),
+           (AND32ri (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>;

 // extload bool -> extload byte
 // When extloading from 16-bit and smaller memory locations into 64-bit
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@ -137,10 +137,12 @@ define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
 }

 ;CHECK-LABEL: test13
-;CHECK: cmpl
-;CHECK: sbbl
-;CHECK: orl $65532
-;CHECK: ret
+;CHECK: cmpl    %esi, %edi
+;CHECK: setb    %al
+;CHECK: andl    $1, %eax
+;CHECK: kmovw   %eax, %k0
+;CHECK: movw    $-4
+;CHECK: korw    
 define i16 @test13(i32 %a, i32 %b) {
  %cmp_res = icmp ult i32 %a, %b
  %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %cmp_res, i32 0
@ -167,19 +169,22 @@ define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
 }

 ;CHECK-LABEL: test15
-;CHECK: kshiftlw
-;CHECK: kmovw
-;CHECK: ret
+;CHECK: movb (%rdi), %al
+;CHECK: andb $1, %al
+;CHECK: movw    $-1, %ax
+;CHECK: cmovew
 define i16 @test15(i1 *%addr) {
-  %x = load i1 , i1 * %addr, align 128
+  %x = load i1 , i1 * %addr, align 1
  %x1 = insertelement <16 x i1> undef, i1 %x, i32 10
  %x2 = bitcast <16 x i1>%x1 to i16
  ret i16 %x2
 }

 ;CHECK-LABEL: test16
-;CHECK: kshiftlw
-;CHECK: kshiftrw
+;CHECK: movb (%rdi), %al
+;CHECK: andw $1, %ax
+;CHECK: kmovw
+;CHECK: kshiftlw        $10
 ;CHECK: korw
 ;CHECK: ret
 define i16 @test16(i1 *%addr, i16 %a) {
@ -191,11 +196,11 @@ define i16 @test16(i1 *%addr, i16 %a) {
 }

 ;CHECK-LABEL: test17
-;KNL: kshiftlw
-;KNL: kshiftrw
+;KNL: movb (%rdi), %al
+;KNL: andw $1, %ax
+;KNL: kshiftlw $4
 ;KNL: korw
-;SKX: kshiftlb
-;SKX: kshiftrb
+;SKX: kshiftlb $4
 ;SKX: korb
 ;CHECK: ret
 define i8 @test17(i1 *%addr, i8 %a) {
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@ -191,7 +191,7 @@ false:

 ; SKX-LABEL: test7
 ; SKX: vpmovw2m
-; SKX: kmovw   %eax, %k1
+; SKX: kmovb   %eax, %k1
 ; SKX: korb

 define void @test7(<8 x i1> %mask)  {
@ -282,3 +282,114 @@ define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) {
  ret <4 x i1>%c
 }

+; KNL-LABEL: test12
+; KNL: movl    %edi, %eax
+define i32 @test12(i32 %x, i32 %y)  {
+  %a = bitcast i16 21845 to <16 x i1>
+  %b = extractelement <16 x i1> %a, i32 0
+  %c = select i1 %b, i32 %x, i32 %y
+  ret i32 %c
+}
+
+; KNL-LABEL: test13
+; KNL: movl    %esi, %eax
+define i32 @test13(i32 %x, i32 %y)  {
+  %a = bitcast i16 21845 to <16 x i1>
+  %b = extractelement <16 x i1> %a, i32 3
+  %c = select i1 %b, i32 %x, i32 %y
+  ret i32 %c
+}
+
+; SKX-LABEL: test14
+; SKX: movb     $11, %al
+; SKX: kmovb    %eax, %k0
+; SKX: vpmovm2d %k0, %xmm0
+
+define <4 x i1> @test14()  {
+  %a = bitcast i16 21845 to <16 x i1>
+  %b = extractelement <16 x i1> %a, i32 2
+  %c = insertelement <4 x i1> <i1 true, i1 false, i1 false, i1 true>, i1 %b, i32 1
+  ret <4 x i1> %c
+}
+
+; KNL-LABEL: test15
+; KNL: cmovgw
+define <16 x i1> @test15(i32 %x, i32 %y)  {
+  %a = bitcast i16 21845 to <16 x i1>
+  %b = bitcast i16 1 to <16 x i1>
+  %mask = icmp sgt i32 %x, %y
+  %c = select i1 %mask, <16 x i1> %a, <16 x i1> %b
+  ret <16 x i1> %c
+}
+
+; SKX-LABEL: test16
+; SKX: kxnorw  %k1, %k1, %k1
+; SKX: kshiftrw        $15, %k1, %k1
+; SKX: kshiftlq        $5, %k1, %k1
+; SKX: korq    %k1, %k0, %k0
+; SKX: vpmovm2b        %k0, %zmm0
+define <64 x i8> @test16(i64 %x) {
+  %a = bitcast i64 %x to <64 x i1>
+  %b = insertelement <64 x i1>%a, i1 true, i32 5
+  %c = sext <64 x i1>%b to <64 x i8>
+  ret <64 x i8>%c
+}
+
+; SKX-LABEL: test17
+; SKX: setg    %al
+; SKX: andl    $1, %eax
+; SKX: kmovw   %eax, %k1
+; SKX: kshiftlq        $5, %k1, %k1
+; SKX: korq    %k1, %k0, %k0
+; SKX: vpmovm2b        %k0, %zmm0
+define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
+  %a = bitcast i64 %x to <64 x i1>
+  %b = icmp sgt i32 %y, %z
+  %c = insertelement <64 x i1>%a, i1 %b, i32 5
+  %d = sext <64 x i1>%c to <64 x i8>
+  ret <64 x i8>%d
+}
+
+; KNL-LABEL: test18
+define <8 x i1> @test18(i8 %a, i16 %y) {
+  %b = bitcast i8 %a to <8 x i1>
+  %b1 = bitcast i16 %y to <16 x i1>
+  %el1 = extractelement <16 x i1>%b1, i32 8
+  %el2 = extractelement <16 x i1>%b1, i32 9
+  %c = insertelement <8 x i1>%b, i1 %el1, i32 7
+  %d = insertelement <8 x i1>%c, i1 %el2, i32 6
+  ret <8 x i1>%d
+}
+
+; KNL-LABEL: test19
+; KNL: movzbl  %dil, %eax
+; KNL: kmovw   %eax, %k0
+; KNL: kshiftlw        $13, %k0, %k0
+; KNL: kshiftrw        $15, %k0, %k0
+; KNL: kmovw   %k0, %eax
+; KNL: andl    $1, %eax
+; KNL: testb   %al, %al
+
+define <8 x i1> @test19(i8 %a) {
+  %b = bitcast i8 %a to <8 x i1>
+  %c = shufflevector < 8 x i1>%b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
+  ret <8 x i1> %c
+}
+
+; KNL-LABEL: test20
+; KNL: movzbl  %dil, %eax
+; KNL: kmovw   %eax, %k0
+; KNL: kshiftlw        $13, %k0, %k1
+; KNL: kshiftrw        $15, %k1, %k1
+; KNL: kshiftlw        $12, %k0, %k0
+; KNL: kshiftrw        $15, %k0, %k0
+; KNL: kshiftlw        $4, %k0, %k0
+; KNL: kshiftlw        $1, %k1, %k2
+; KNL: korw    %k0, %k2, %k0
+; KNL: kshiftlw        $6, %k1, %k1
+; KNL: korw    %k1, %k0, %k1
+define <8 x i1> @test20(i8 %a, i16 %y) {
+  %b = bitcast i8 %a to <8 x i1>
+  %c = shufflevector < 8 x i1>%b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
+  ret <8 x i1> %c
+}
--- a/llvm/test/CodeGen/X86/avx512-select.ll
+++ b/llvm/test/CodeGen/X86/avx512-select.ll
@ -50,8 +50,10 @@ define <16 x double> @select04(<16 x double> %a, <16 x double> %b) {
 }

 ; CHECK-LABEL: select05
-; CHECK: kmovw   %esi, %k0
-; CHECK-NEXT: kmovw   %edi, %k1
+; CHECK: movzbl  %sil, %eax
+; CHECK: kmovw   %eax, %k0
+; CHECK: movzbl  %dil, %eax
+; CHECK: kmovw   %eax, %k1
 ; CHECK-NEXT: korw    %k1, %k0, %k0
 ; CHECK-NEXT: kmovw   %k0, %eax
 define i8 @select05(i8 %a.0, i8 %m) {
@ -63,8 +65,10 @@ define i8 @select05(i8 %a.0, i8 %m) {
 }

 ; CHECK-LABEL: select06
-; CHECK: kmovw   %esi, %k0
-; CHECK-NEXT: kmovw   %edi, %k1
+; CHECK: movzbl  %sil, %eax
+; CHECK: kmovw   %eax, %k0
+; CHECK: movzbl  %dil, %eax
+; CHECK: kmovw   %eax, %k1
 ; CHECK-NEXT: kandw    %k1, %k0, %k0
 ; CHECK-NEXT: kmovw   %k0, %eax
 define i8 @select06(i8 %a.0, i8 %m) {
@ -76,9 +80,12 @@ define i8 @select06(i8 %a.0, i8 %m) {
 }

 ; CHECK-LABEL: select07
-; CHECK-DAG:  kmovw   %edx, %k0
-; CHECK-DAG:  kmovw   %edi, %k1
-; CHECK-DAG:  kmovw   %esi, %k2
+; CHECK-DAG: movzbl  %dl, %eax
+; CHECK-DAG: kmovw   %eax, %k0
+; CHECK-DAG: movzbl  %dil, %eax
+; CHECK-DAG: kmovw   %eax, %k1
+; CHECK-DAG: movzbl  %sil, %eax
+; CHECK-DAG: kmovw   %eax, %k2
 ; CHECK: kandw %k0, %k1, %k1
 ; CHECK-NEXT: knotw    %k0, %k0
 ; CHECK-NEXT: kandw    %k0, %k2, %k0
--- a/llvm/test/CodeGen/X86/avx512-trunc-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-trunc-ext.ll
@ -156,10 +156,9 @@ define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) {
 }

 ; CHECK-LABEL: trunc_i32_to_i1
-; CHECK: testb
-; CHECK: setne
-; CKECK: orl
-; CHECK: ret
+; CHECK: movw    $-4, %ax
+; CHECK: kmovw   %eax, %k1
+; CHECK: korw
 define i16 @trunc_i32_to_i1(i32 %a) {
  %a_i = trunc i32 %a to i1
  %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %a_i, i32 0