[X86] Remove masking from avx512vbmi2 concat and shift by immediate intrinsics. Use select in IR instead.

llvm-svn: 334576
2018-06-13 07:19:21 +00:00 · 2018-06-13 07:19:21 +00:00 · 3829d258ee
parent 7252e2a8bd
commit 3829d258ee
10 changed files with 983 additions and 391 deletions
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@ -4739,101 +4739,83 @@ let TargetPrefix = "x86" in {

 // VBMI2 Concat & Shift
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_mask_vpshld_q_512 :
-        GCCBuiltin<"__builtin_ia32_vpshldq512_mask">,
+  def int_x86_avx512_vpshld_q_512 :
+        GCCBuiltin<"__builtin_ia32_vpshldq512">,
        Intrinsic<[llvm_v8i64_ty],
-                  [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_v8i64_ty,
-                   llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshld_q_256 :
-        GCCBuiltin<"__builtin_ia32_vpshldq256_mask">,
+                  [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshld_q_256 :
+        GCCBuiltin<"__builtin_ia32_vpshldq256">,
        Intrinsic<[llvm_v4i64_ty],
-                  [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_v4i64_ty,
-                   llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshld_q_128 :
-        GCCBuiltin<"__builtin_ia32_vpshldq128_mask">,
+                  [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshld_q_128 :
+        GCCBuiltin<"__builtin_ia32_vpshldq128">,
        Intrinsic<[llvm_v2i64_ty],
-                  [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_v2i64_ty,
-                   llvm_i8_ty], [IntrNoMem]>;
+                  [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;

-  def int_x86_avx512_mask_vpshld_d_512 :
-        GCCBuiltin<"__builtin_ia32_vpshldd512_mask">,
+  def int_x86_avx512_vpshld_d_512 :
+        GCCBuiltin<"__builtin_ia32_vpshldd512">,
        Intrinsic<[llvm_v16i32_ty],
-                  [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty, llvm_v16i32_ty,
-                   llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshld_d_256 :
-        GCCBuiltin<"__builtin_ia32_vpshldd256_mask">,
+                  [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshld_d_256 :
+        GCCBuiltin<"__builtin_ia32_vpshldd256">,
        Intrinsic<[llvm_v8i32_ty],
-                  [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_v8i32_ty,
-                   llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshld_d_128 :
-        GCCBuiltin<"__builtin_ia32_vpshldd128_mask">,
+                  [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshld_d_128 :
+        GCCBuiltin<"__builtin_ia32_vpshldd128">,
        Intrinsic<[llvm_v4i32_ty],
-                  [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty,
-                   llvm_i8_ty], [IntrNoMem]>;
+                  [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;

-  def int_x86_avx512_mask_vpshld_w_512 :
-        GCCBuiltin<"__builtin_ia32_vpshldw512_mask">,
+  def int_x86_avx512_vpshld_w_512 :
+        GCCBuiltin<"__builtin_ia32_vpshldw512">,
        Intrinsic<[llvm_v32i16_ty],
-                  [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_v32i16_ty,
-                   llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshld_w_256 :
-        GCCBuiltin<"__builtin_ia32_vpshldw256_mask">,
+                  [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshld_w_256 :
+        GCCBuiltin<"__builtin_ia32_vpshldw256">,
        Intrinsic<[llvm_v16i16_ty],
-                  [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty, llvm_v16i16_ty,
-                   llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshld_w_128 :
-        GCCBuiltin<"__builtin_ia32_vpshldw128_mask">,
+                  [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshld_w_128 :
+        GCCBuiltin<"__builtin_ia32_vpshldw128">,
        Intrinsic<[llvm_v8i16_ty],
-                  [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty, llvm_v8i16_ty,
-                   llvm_i8_ty], [IntrNoMem]>;
+                  [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;

-  def int_x86_avx512_mask_vpshrd_q_512 :
-        GCCBuiltin<"__builtin_ia32_vpshrdq512_mask">,
+  def int_x86_avx512_vpshrd_q_512 :
+        GCCBuiltin<"__builtin_ia32_vpshrdq512">,
        Intrinsic<[llvm_v8i64_ty],
-                  [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_v8i64_ty,
-                   llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshrd_q_256 :
-        GCCBuiltin<"__builtin_ia32_vpshrdq256_mask">,
+                  [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshrd_q_256 :
+        GCCBuiltin<"__builtin_ia32_vpshrdq256">,
        Intrinsic<[llvm_v4i64_ty],
-                  [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_v4i64_ty,
-                   llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshrd_q_128 :
-        GCCBuiltin<"__builtin_ia32_vpshrdq128_mask">,
+                  [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshrd_q_128 :
+        GCCBuiltin<"__builtin_ia32_vpshrdq128">,
        Intrinsic<[llvm_v2i64_ty],
-                  [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_v2i64_ty,
-                   llvm_i8_ty], [IntrNoMem]>;
+                  [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;

-  def int_x86_avx512_mask_vpshrd_d_512 :
-        GCCBuiltin<"__builtin_ia32_vpshrdd512_mask">,
+  def int_x86_avx512_vpshrd_d_512 :
+        GCCBuiltin<"__builtin_ia32_vpshrdd512">,
        Intrinsic<[llvm_v16i32_ty],
-                  [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty, llvm_v16i32_ty,
-                   llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshrd_d_256 :
-        GCCBuiltin<"__builtin_ia32_vpshrdd256_mask">,
+                  [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshrd_d_256 :
+        GCCBuiltin<"__builtin_ia32_vpshrdd256">,
        Intrinsic<[llvm_v8i32_ty],
-                  [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_v8i32_ty,
-                   llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshrd_d_128 :
-        GCCBuiltin<"__builtin_ia32_vpshrdd128_mask">,
+                  [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshrd_d_128 :
+        GCCBuiltin<"__builtin_ia32_vpshrdd128">,
        Intrinsic<[llvm_v4i32_ty],
-                  [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty,
-                   llvm_i8_ty], [IntrNoMem]>;
+                  [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;

-  def int_x86_avx512_mask_vpshrd_w_512 :
-        GCCBuiltin<"__builtin_ia32_vpshrdw512_mask">,
+  def int_x86_avx512_vpshrd_w_512 :
+        GCCBuiltin<"__builtin_ia32_vpshrdw512">,
        Intrinsic<[llvm_v32i16_ty],
-                  [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_v32i16_ty,
-                   llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshrd_w_256 :
-        GCCBuiltin<"__builtin_ia32_vpshrdw256_mask">,
+                  [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshrd_w_256 :
+        GCCBuiltin<"__builtin_ia32_vpshrdw256">,
        Intrinsic<[llvm_v16i16_ty],
-                  [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty, llvm_v16i16_ty,
-                   llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshrd_w_128 :
-        GCCBuiltin<"__builtin_ia32_vpshrdw128_mask">,
+                  [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshrd_w_128 :
+        GCCBuiltin<"__builtin_ia32_vpshrdw128">,
        Intrinsic<[llvm_v8i16_ty],
-                  [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty, llvm_v8i16_ty,
-                   llvm_i8_ty], [IntrNoMem]>;
+                  [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;

  def int_x86_avx512_mask_vpshldv_w_128 :
        GCCBuiltin<"__builtin_ia32_vpshldvw128_mask">,
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@ -263,6 +263,8 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
      Name.startswith("avx512.mask.vpdpwssds.") || // Added in 7.0
      Name.startswith("avx512.maskz.vpdpwssds.") || // Added in 7.0
      Name.startswith("avx512.mask.dbpsadbw.") || // Added in 7.0
+      Name.startswith("avx512.mask.vpshld.") || // Added in 7.0
+      Name.startswith("avx512.mask.vpshrd.") || // Added in 7.0
      Name.startswith("avx512.mask.add.p") || // Added in 7.0
      Name.startswith("avx512.mask.sub.p") || // Added in 7.0
      Name.startswith("avx512.mask.mul.p") || // Added in 7.0
@ -1258,6 +1260,48 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder,
      IID = Intrinsic::x86_avx512_dbpsadbw_512;
    else
      llvm_unreachable("Unexpected intrinsic");
+  } else if (Name.startswith("vpshld.")) {
+    if (VecWidth == 128 && Name[7] == 'q')
+      IID = Intrinsic::x86_avx512_vpshld_q_128;
+    else if (VecWidth == 128 && Name[7] == 'd')
+      IID = Intrinsic::x86_avx512_vpshld_d_128;
+    else if (VecWidth == 128 && Name[7] == 'w')
+      IID = Intrinsic::x86_avx512_vpshld_w_128;
+    else if (VecWidth == 256 && Name[7] == 'q')
+      IID = Intrinsic::x86_avx512_vpshld_q_256;
+    else if (VecWidth == 256 && Name[7] == 'd')
+      IID = Intrinsic::x86_avx512_vpshld_d_256;
+    else if (VecWidth == 256 && Name[7] == 'w')
+      IID = Intrinsic::x86_avx512_vpshld_w_256;
+    else if (VecWidth == 512 && Name[7] == 'q')
+      IID = Intrinsic::x86_avx512_vpshld_q_512;
+    else if (VecWidth == 512 && Name[7] == 'd')
+      IID = Intrinsic::x86_avx512_vpshld_d_512;
+    else if (VecWidth == 512 && Name[7] == 'w')
+      IID = Intrinsic::x86_avx512_vpshld_w_512;
+    else
+      llvm_unreachable("Unexpected intrinsic");
+  } else if (Name.startswith("vpshrd.")) {
+    if (VecWidth == 128 && Name[7] == 'q')
+      IID = Intrinsic::x86_avx512_vpshrd_q_128;
+    else if (VecWidth == 128 && Name[7] == 'd')
+      IID = Intrinsic::x86_avx512_vpshrd_d_128;
+    else if (VecWidth == 128 && Name[7] == 'w')
+      IID = Intrinsic::x86_avx512_vpshrd_w_128;
+    else if (VecWidth == 256 && Name[7] == 'q')
+      IID = Intrinsic::x86_avx512_vpshrd_q_256;
+    else if (VecWidth == 256 && Name[7] == 'd')
+      IID = Intrinsic::x86_avx512_vpshrd_d_256;
+    else if (VecWidth == 256 && Name[7] == 'w')
+      IID = Intrinsic::x86_avx512_vpshrd_w_256;
+    else if (VecWidth == 512 && Name[7] == 'q')
+      IID = Intrinsic::x86_avx512_vpshrd_q_512;
+    else if (VecWidth == 512 && Name[7] == 'd')
+      IID = Intrinsic::x86_avx512_vpshrd_d_512;
+    else if (VecWidth == 512 && Name[7] == 'w')
+      IID = Intrinsic::x86_avx512_vpshrd_w_512;
+    else
+      llvm_unreachable("Unexpected intrinsic");
  } else
    return false;

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -20488,7 +20488,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                              Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
-    case INTR_TYPE_3OP_IMM8_MASK:
    case INTR_TYPE_3OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
@ -20496,9 +20495,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);

-      if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
-        Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
-
      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have non-default rounding mode,
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@ -28,7 +28,7 @@ enum IntrinsicType : uint16_t {
  CVTPD2PS, CVTPD2PS_MASK,
  INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
  INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK,
-  INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_IMM8_MASK,
+  INTR_TYPE_3OP_MASK,
  FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
  FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
  IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK,
@ -960,15 +960,6 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
  X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB,
                     X86ISD::FNMSUB_RND),

-  X86_INTRINSIC_DATA(avx512_mask_vpshld_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_q_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_q_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_q_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_w_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_w_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_w_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
  X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
  X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
  X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_512, FMA_OP_MASK, X86ISD::VSHLDV, 0),
@ -978,15 +969,6 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
  X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
  X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
  X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_512, FMA_OP_MASK, X86ISD::VSHLDV, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
  X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_128, FMA_OP_MASK, X86ISD::VSHRDV, 0),
  X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_256, FMA_OP_MASK, X86ISD::VSHRDV, 0),
  X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_512, FMA_OP_MASK, X86ISD::VSHRDV, 0),
@ -1261,6 +1243,24 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
  X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_128 , IFMA_OP, X86ISD::VPMADD52L, 0),
  X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_256 , IFMA_OP, X86ISD::VPMADD52L, 0),
  X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_512 , IFMA_OP, X86ISD::VPMADD52L, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_d_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_d_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_d_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_q_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_q_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_q_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_w_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_w_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_w_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_d_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_d_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_d_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_q_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_q_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_q_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_w_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_w_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_w_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
  X86_INTRINSIC_DATA(bmi_bextr_32,         INTR_TYPE_2OP, X86ISD::BEXTR, 0),
  X86_INTRINSIC_DATA(bmi_bextr_64,         INTR_TYPE_2OP, X86ISD::BEXTR, 0),
  X86_INTRINSIC_DATA(fma_vfmadd_pd,        INTR_TYPE_3OP, ISD::FMA, 0),
--- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-fast-isel.ll
@ -317,11 +317,13 @@ define <8 x i64> @test_mm512_mask_shldi_epi64(<8 x i64> %__S, i8 zeroext %__U, <
 ; X64-NEXT:    vpshldq $127, %zmm2, %zmm1, %zmm0 {%k1}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 127, <8 x i64> %__S, i8 %__U)
-  ret <8 x i64> %0
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 127)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__S
+  ret <8 x i64> %2
 }

-declare <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64>, <8 x i64>, i32)

 define <8 x i64> @test_mm512_maskz_shldi_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_shldi_epi64:
@ -337,8 +339,10 @@ define <8 x i64> @test_mm512_maskz_shldi_epi64(i8 zeroext %__U, <8 x i64> %__A,
 ; X64-NEXT:    vpshldq $63, %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 63, <8 x i64> zeroinitializer, i8 %__U)
-  ret <8 x i64> %0
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 63)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
+  ret <8 x i64> %2
 }

 define <8 x i64> @test_mm512_shldi_epi64(<8 x i64> %__A, <8 x i64> %__B) {
@ -347,7 +351,7 @@ define <8 x i64> @test_mm512_shldi_epi64(<8 x i64> %__A, <8 x i64> %__B) {
 ; CHECK-NEXT:    vpshldq $31, %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
-  %0 = tail call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 31, <8 x i64> zeroinitializer, i8 -1)
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 31)
  ret <8 x i64> %0
 }

@ -366,13 +370,15 @@ define <8 x i64> @test_mm512_mask_shldi_epi32(<8 x i64> %__S, i16 zeroext %__U,
 entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
-  %2 = bitcast <8 x i64> %__S to <16 x i32>
-  %3 = tail call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %0, <16 x i32> %1, i32 127, <16 x i32> %2, i16 %__U)
-  %4 = bitcast <16 x i32> %3 to <8 x i64>
-  ret <8 x i64> %4
+  %2 = tail call <16 x i32> @llvm.x86.avx512.vpshld.d.512(<16 x i32> %0, <16 x i32> %1, i32 127)
+  %3 = bitcast <8 x i64> %__S to <16 x i32>
+  %4 = bitcast i16 %__U to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
+  %6 = bitcast <16 x i32> %5 to <8 x i64>
+  ret <8 x i64> %6
 }

-declare <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpshld.d.512(<16 x i32>, <16 x i32>, i32)

 define <8 x i64> @test_mm512_maskz_shldi_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_shldi_epi32:
@ -389,9 +395,11 @@ define <8 x i64> @test_mm512_maskz_shldi_epi32(i16 zeroext %__U, <8 x i64> %__A,
 entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
-  %2 = tail call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %0, <16 x i32> %1, i32 63, <16 x i32> zeroinitializer, i16 %__U)
-  %3 = bitcast <16 x i32> %2 to <8 x i64>
-  ret <8 x i64> %3
+  %2 = tail call <16 x i32> @llvm.x86.avx512.vpshld.d.512(<16 x i32> %0, <16 x i32> %1, i32 63)
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
+  %5 = bitcast <16 x i32> %4 to <8 x i64>
+  ret <8 x i64> %5
 }

 define <8 x i64> @test_mm512_shldi_epi32(<8 x i64> %__A, <8 x i64> %__B) {
@ -402,7 +410,7 @@ define <8 x i64> @test_mm512_shldi_epi32(<8 x i64> %__A, <8 x i64> %__B) {
 entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
-  %2 = tail call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %0, <16 x i32> %1, i32 31, <16 x i32> zeroinitializer, i16 -1)
+  %2 = tail call <16 x i32> @llvm.x86.avx512.vpshld.d.512(<16 x i32> %0, <16 x i32> %1, i32 31)
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
 }
@ -422,13 +430,15 @@ define <8 x i64> @test_mm512_mask_shldi_epi16(<8 x i64> %__S, i32 %__U, <8 x i64
 entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__B to <32 x i16>
-  %2 = bitcast <8 x i64> %__S to <32 x i16>
-  %3 = tail call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %0, <32 x i16> %1, i32 127, <32 x i16> %2, i32 %__U)
-  %4 = bitcast <32 x i16> %3 to <8 x i64>
-  ret <8 x i64> %4
+  %2 = tail call <32 x i16> @llvm.x86.avx512.vpshld.w.512(<32 x i16> %0, <32 x i16> %1, i32 127)
+  %3 = bitcast <8 x i64> %__S to <32 x i16>
+  %4 = bitcast i32 %__U to <32 x i1>
+  %5 = select <32 x i1> %4, <32 x i16> %2, <32 x i16> %3
+  %6 = bitcast <32 x i16> %5 to <8 x i64>
+  ret <8 x i64> %6
 }

-declare <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16>, <32 x i16>, i32, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.vpshld.w.512(<32 x i16>, <32 x i16>, i32)

 define <8 x i64> @test_mm512_maskz_shldi_epi16(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_shldi_epi16:
@ -445,9 +455,11 @@ define <8 x i64> @test_mm512_maskz_shldi_epi16(i32 %__U, <8 x i64> %__A, <8 x i6
 entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__B to <32 x i16>
-  %2 = tail call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %0, <32 x i16> %1, i32 63, <32 x i16> zeroinitializer, i32 %__U)
-  %3 = bitcast <32 x i16> %2 to <8 x i64>
-  ret <8 x i64> %3
+  %2 = tail call <32 x i16> @llvm.x86.avx512.vpshld.w.512(<32 x i16> %0, <32 x i16> %1, i32 63)
+  %3 = bitcast i32 %__U to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
+  %5 = bitcast <32 x i16> %4 to <8 x i64>
+  ret <8 x i64> %5
 }

 define <8 x i64> @test_mm512_shldi_epi16(<8 x i64> %__A, <8 x i64> %__B) {
@ -458,7 +470,7 @@ define <8 x i64> @test_mm512_shldi_epi16(<8 x i64> %__A, <8 x i64> %__B) {
 entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__B to <32 x i16>
-  %2 = tail call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %0, <32 x i16> %1, i32 31, <32 x i16> zeroinitializer, i32 -1)
+  %2 = tail call <32 x i16> @llvm.x86.avx512.vpshld.w.512(<32 x i16> %0, <32 x i16> %1, i32 31)
  %3 = bitcast <32 x i16> %2 to <8 x i64>
  ret <8 x i64> %3
 }
@ -477,11 +489,13 @@ define <8 x i64> @test_mm512_mask_shrdi_epi64(<8 x i64> %__S, i8 zeroext %__U, <
 ; X64-NEXT:    vpshrdq $127, %zmm2, %zmm1, %zmm0 {%k1}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 127, <8 x i64> %__S, i8 %__U)
-  ret <8 x i64> %0
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpshrd.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 127)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__S
+  ret <8 x i64> %2
 }

-declare <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpshrd.q.512(<8 x i64>, <8 x i64>, i32)

 define <8 x i64> @test_mm512_maskz_shrdi_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_shrdi_epi64:
@ -497,8 +511,10 @@ define <8 x i64> @test_mm512_maskz_shrdi_epi64(i8 zeroext %__U, <8 x i64> %__A,
 ; X64-NEXT:    vpshrdq $63, %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 63, <8 x i64> zeroinitializer, i8 %__U)
-  ret <8 x i64> %0
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpshrd.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 63)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
+  ret <8 x i64> %2
 }

 define <8 x i64> @test_mm512_shrdi_epi64(<8 x i64> %__A, <8 x i64> %__B) {
@ -507,7 +523,7 @@ define <8 x i64> @test_mm512_shrdi_epi64(<8 x i64> %__A, <8 x i64> %__B) {
 ; CHECK-NEXT:    vpshrdq $31, %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
-  %0 = tail call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 31, <8 x i64> zeroinitializer, i8 -1)
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpshrd.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 31)
  ret <8 x i64> %0
 }

@ -526,13 +542,15 @@ define <8 x i64> @test_mm512_mask_shrdi_epi32(<8 x i64> %__S, i16 zeroext %__U,
 entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
-  %2 = bitcast <8 x i64> %__S to <16 x i32>
-  %3 = tail call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %0, <16 x i32> %1, i32 127, <16 x i32> %2, i16 %__U)
-  %4 = bitcast <16 x i32> %3 to <8 x i64>
-  ret <8 x i64> %4
+  %2 = tail call <16 x i32> @llvm.x86.avx512.vpshrd.d.512(<16 x i32> %0, <16 x i32> %1, i32 127)
+  %3 = bitcast <8 x i64> %__S to <16 x i32>
+  %4 = bitcast i16 %__U to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
+  %6 = bitcast <16 x i32> %5 to <8 x i64>
+  ret <8 x i64> %6
 }

-declare <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpshrd.d.512(<16 x i32>, <16 x i32>, i32)

 define <8 x i64> @test_mm512_maskz_shrdi_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_shrdi_epi32:
@ -549,9 +567,11 @@ define <8 x i64> @test_mm512_maskz_shrdi_epi32(i16 zeroext %__U, <8 x i64> %__A,
 entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
-  %2 = tail call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %0, <16 x i32> %1, i32 63, <16 x i32> zeroinitializer, i16 %__U)
-  %3 = bitcast <16 x i32> %2 to <8 x i64>
-  ret <8 x i64> %3
+  %2 = tail call <16 x i32> @llvm.x86.avx512.vpshrd.d.512(<16 x i32> %0, <16 x i32> %1, i32 63)
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
+  %5 = bitcast <16 x i32> %4 to <8 x i64>
+  ret <8 x i64> %5
 }

 define <8 x i64> @test_mm512_shrdi_epi32(<8 x i64> %__A, <8 x i64> %__B) {
@ -562,7 +582,7 @@ define <8 x i64> @test_mm512_shrdi_epi32(<8 x i64> %__A, <8 x i64> %__B) {
 entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
-  %2 = tail call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %0, <16 x i32> %1, i32 31, <16 x i32> zeroinitializer, i16 -1)
+  %2 = tail call <16 x i32> @llvm.x86.avx512.vpshrd.d.512(<16 x i32> %0, <16 x i32> %1, i32 31)
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
 }
@ -582,13 +602,15 @@ define <8 x i64> @test_mm512_mask_shrdi_epi16(<8 x i64> %__S, i32 %__U, <8 x i64
 entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__B to <32 x i16>
-  %2 = bitcast <8 x i64> %__S to <32 x i16>
-  %3 = tail call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %0, <32 x i16> %1, i32 127, <32 x i16> %2, i32 %__U)
-  %4 = bitcast <32 x i16> %3 to <8 x i64>
-  ret <8 x i64> %4
+  %2 = tail call <32 x i16> @llvm.x86.avx512.vpshrd.w.512(<32 x i16> %0, <32 x i16> %1, i32 127)
+  %3 = bitcast <8 x i64> %__S to <32 x i16>
+  %4 = bitcast i32 %__U to <32 x i1>
+  %5 = select <32 x i1> %4, <32 x i16> %2, <32 x i16> %3
+  %6 = bitcast <32 x i16> %5 to <8 x i64>
+  ret <8 x i64> %6
 }

-declare <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16>, <32 x i16>, i32, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.vpshrd.w.512(<32 x i16>, <32 x i16>, i32)

 define <8 x i64> @test_mm512_maskz_shrdi_epi16(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_shrdi_epi16:
@ -605,9 +627,11 @@ define <8 x i64> @test_mm512_maskz_shrdi_epi16(i32 %__U, <8 x i64> %__A, <8 x i6
 entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__B to <32 x i16>
-  %2 = tail call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %0, <32 x i16> %1, i32 63, <32 x i16> zeroinitializer, i32 %__U)
-  %3 = bitcast <32 x i16> %2 to <8 x i64>
-  ret <8 x i64> %3
+  %2 = tail call <32 x i16> @llvm.x86.avx512.vpshrd.w.512(<32 x i16> %0, <32 x i16> %1, i32 63)
+  %3 = bitcast i32 %__U to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
+  %5 = bitcast <32 x i16> %4 to <8 x i64>
+  ret <8 x i64> %5
 }

 define <8 x i64> @test_mm512_shrdi_epi16(<8 x i64> %__A, <8 x i64> %__B) {
@ -618,7 +642,7 @@ define <8 x i64> @test_mm512_shrdi_epi16(<8 x i64> %__A, <8 x i64> %__B) {
 entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__B to <32 x i16>
-  %2 = tail call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %0, <32 x i16> %1, i32 31, <32 x i16> zeroinitializer, i32 -1)
+  %2 = tail call <32 x i16> @llvm.x86.avx512.vpshrd.w.512(<32 x i16> %0, <32 x i16> %1, i32 31)
  %3 = bitcast <32 x i16> %2 to <8 x i64>
  ret <8 x i64> %3
 }
--- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll
@ -279,3 +279,143 @@ define void @test_compress_store_b_512(i8* %addr, <64 x i8> %data) {
  call void @llvm.x86.avx512.mask.compress.store.b.512(i8* %addr, <64 x i8> %data, i64 -1)
  ret void
 }
+
+define <16 x i32>@test_int_x86_avx512_mask_vpshld_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x71,0xd9,0x16]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x71,0xd1,0x16]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x71,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x71,0xd1,0x16]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
+  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
+  %res2 = add <16 x i32> %res, %res1
+  ret <16 x i32> %res2
+}
+declare <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpshld_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshld_q_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x71,0xd9,0x16]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x71,0xd1,0x16]
+; X86-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x71,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x71,0xd1,0x16]
+; X64-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
+  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
+  %res2 = add <8 x i64> %res, %res1
+  ret <8 x i64> %res2
+}
+declare <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+
+define <32 x i16>@test_int_x86_avx512_mask_vpshld_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x3, i32 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshld_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x70,0xd9,0x16]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x70,0xd1,0x16]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x70,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x70,0xd1,0x16]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 %x4)
+  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 -1)
+  %res2 = add <32 x i16> %res, %res1
+  ret <32 x i16> %res2
+}
+declare <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16>, <32 x i16>, i32, <32 x i16>, i32)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpshrd_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x73,0xd9,0x16]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x73,0xd1,0x16]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x73,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x73,0xd1,0x16]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
+  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
+  %res2 = add <16 x i32> %res, %res1
+  ret <16 x i32> %res2
+}
+declare <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpshrd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshrd_q_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x73,0xd9,0x16]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x73,0xd1,0x16]
+; X86-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x73,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x73,0xd1,0x16]
+; X64-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
+  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
+  %res2 = add <8 x i64> %res, %res1
+  ret <8 x i64> %res2
+}
+declare <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+
+define <32 x i16>@test_int_x86_avx512_mask_vpshrd_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x3, i32 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshrd_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x72,0xd9,0x16]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x72,0xd1,0x16]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x72,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x72,0xd1,0x16]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 %x4)
+  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 -1)
+  %res2 = add <32 x i16> %res, %res1
+  ret <32 x i16> %res2
+}
+declare <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16>, <32 x i16>, i32, <32 x i16>, i32)
--- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll
@ -97,142 +97,154 @@ declare <64 x i8> @llvm.x86.avx512.mask.expand.b.512(<64 x i8> %data, <64 x i8>
 define <16 x i32>@test_int_x86_avx512_mask_vpshld_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x71,0xd9,0x16]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x71,0xd1,0x16]
-; X86-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x71,0xc1,0x16]
-; X86-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x71,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x71,0xd1,0x16]
-; X64-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x71,0xc1,0x16]
-; X64-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
-  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
-  %res2 = add <16 x i32> %res, %res1
+  %1 = call <16 x i32> @llvm.x86.avx512.vpshld.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22)
+  %2 = bitcast i16 %x4 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x3
+  %4 = call <16 x i32> @llvm.x86.avx512.vpshld.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22)
+  %res2 = add <16 x i32> %3, %4
  ret <16 x i32> %res2
 }
-declare <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpshld.d.512(<16 x i32>, <16 x i32>, i32)

 define <8 x i64>@test_int_x86_avx512_mask_vpshld_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_q_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x71,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x71,0xd1,0x16]
-; X86-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x71,0xc1,0x16]
-; X86-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X86-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x71,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x71,0xd1,0x16]
-; X64-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x71,0xc1,0x16]
-; X64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X64-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
-  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
-  %res2 = add <8 x i64> %res, %res1
+  %1 = call <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x3
+  %4 = call <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22)
+  %res2 = add <8 x i64> %3, %4
  ret <8 x i64> %res2
 }
-declare <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64>, <8 x i64>, i32)

 define <32 x i16>@test_int_x86_avx512_mask_vpshld_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x3, i32 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_w_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x70,0xd9,0x16]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x70,0xd1,0x16]
-; X86-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x70,0xc1,0x16]
-; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x70,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x70,0xd1,0x16]
-; X64-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x70,0xc1,0x16]
-; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 %x4)
-  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 -1)
-  %res2 = add <32 x i16> %res, %res1
+  %1 = call <32 x i16> @llvm.x86.avx512.vpshld.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22)
+  %2 = bitcast i32 %x4 to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x3
+  %4 = call <32 x i16> @llvm.x86.avx512.vpshld.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22)
+  %res2 = add <32 x i16> %3, %4
  ret <32 x i16> %res2
 }
-declare <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16>, <32 x i16>, i32, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.vpshld.w.512(<32 x i16>, <32 x i16>, i32)

 define <16 x i32>@test_int_x86_avx512_mask_vpshrd_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x73,0xd9,0x16]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x73,0xd1,0x16]
-; X86-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x73,0xc1,0x16]
-; X86-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x73,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x73,0xd1,0x16]
-; X64-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x73,0xc1,0x16]
-; X64-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
-  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
-  %res2 = add <16 x i32> %res, %res1
+  %1 = call <16 x i32> @llvm.x86.avx512.vpshrd.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22)
+  %2 = bitcast i16 %x4 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x3
+  %4 = call <16 x i32> @llvm.x86.avx512.vpshrd.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22)
+  %res2 = add <16 x i32> %3, %4
  ret <16 x i32> %res2
 }
-declare <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpshrd.d.512(<16 x i32>, <16 x i32>, i32)

 define <8 x i64>@test_int_x86_avx512_mask_vpshrd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_q_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x73,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x73,0xd1,0x16]
-; X86-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x73,0xc1,0x16]
-; X86-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X86-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x73,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x73,0xd1,0x16]
-; X64-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x73,0xc1,0x16]
-; X64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X64-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
-  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
-  %res2 = add <8 x i64> %res, %res1
+  %1 = call <8 x i64> @llvm.x86.avx512.vpshrd.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x3
+  %4 = call <8 x i64> @llvm.x86.avx512.vpshrd.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22)
+  %res2 = add <8 x i64> %3, %4
  ret <8 x i64> %res2
 }
-declare <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpshrd.q.512(<8 x i64>, <8 x i64>, i32)

 define <32 x i16>@test_int_x86_avx512_mask_vpshrd_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x3, i32 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_w_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x72,0xd9,0x16]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x72,0xd1,0x16]
-; X86-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x72,0xc1,0x16]
-; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x72,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x72,0xd1,0x16]
-; X64-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x72,0xc1,0x16]
-; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 %x4)
-  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 -1)
-  %res2 = add <32 x i16> %res, %res1
+  %1 = call <32 x i16> @llvm.x86.avx512.vpshrd.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22)
+  %2 = bitcast i32 %x4 to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x3
+  %4 = call <32 x i16> @llvm.x86.avx512.vpshrd.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22)
+  %res2 = add <32 x i16> %3, %4
  ret <32 x i16> %res2
 }
-declare <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16>, <32 x i16>, i32, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.vpshrd.w.512(<32 x i16>, <32 x i16>, i32)

 declare <16 x i32> @llvm.x86.avx512.mask.vpshrdv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 declare <16 x i32> @llvm.x86.avx512.maskz.vpshrdv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
--- a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-fast-isel.ll
@ -591,11 +591,14 @@ define <4 x i64> @test_mm256_mask_shldi_epi64(<4 x i64> %__S, i8 zeroext %__U, <
 ; X64-NEXT:    vpshldq $127, %ymm2, %ymm1, %ymm0 {%k1}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 127, <4 x i64> %__S, i8 %__U)
-  ret <4 x i64> %0
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpshld.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 127)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__S
+  ret <4 x i64> %2
 }

-declare <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.vpshld.q.256(<4 x i64>, <4 x i64>, i32)

 define <4 x i64> @test_mm256_maskz_shldi_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
 ; X86-LABEL: test_mm256_maskz_shldi_epi64:
@ -611,8 +614,11 @@ define <4 x i64> @test_mm256_maskz_shldi_epi64(i8 zeroext %__U, <4 x i64> %__A,
 ; X64-NEXT:    vpshldq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 63, <4 x i64> zeroinitializer, i8 %__U)
-  ret <4 x i64> %0
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpshld.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 63)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
+  ret <4 x i64> %2
 }

 define <4 x i64> @test_mm256_shldi_epi64(<4 x i64> %__A, <4 x i64> %__B) {
@ -621,7 +627,7 @@ define <4 x i64> @test_mm256_shldi_epi64(<4 x i64> %__A, <4 x i64> %__B) {
 ; CHECK-NEXT:    vpshldq $31, %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
-  %0 = tail call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 31, <4 x i64> zeroinitializer, i8 -1)
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpshld.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 31)
  ret <4 x i64> %0
 }

@ -639,11 +645,14 @@ define <2 x i64> @test_mm_mask_shldi_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x
 ; X64-NEXT:    vpshldq $127, %xmm2, %xmm1, %xmm0 {%k1}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 127, <2 x i64> %__S, i8 %__U)
-  ret <2 x i64> %0
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpshld.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 127)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__S
+  ret <2 x i64> %2
 }

-declare <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.vpshld.q.128(<2 x i64>, <2 x i64>, i32) #3

 define <2 x i64> @test_mm_maskz_shldi_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
 ; X86-LABEL: test_mm_maskz_shldi_epi64:
@ -659,8 +668,11 @@ define <2 x i64> @test_mm_maskz_shldi_epi64(i8 zeroext %__U, <2 x i64> %__A, <2
 ; X64-NEXT:    vpshldq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 63, <2 x i64> zeroinitializer, i8 %__U)
-  ret <2 x i64> %0
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpshld.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 63)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
+  ret <2 x i64> %2
 }

 define <2 x i64> @test_mm_shldi_epi64(<2 x i64> %__A, <2 x i64> %__B) {
@ -669,7 +681,7 @@ define <2 x i64> @test_mm_shldi_epi64(<2 x i64> %__A, <2 x i64> %__B) {
 ; CHECK-NEXT:    vpshldq $31, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
-  %0 = tail call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 31, <2 x i64> zeroinitializer, i8 -1)
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpshld.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 31)
  ret <2 x i64> %0
 }

@ -689,13 +701,15 @@ define <4 x i64> @test_mm256_mask_shldi_epi32(<4 x i64> %__S, i8 zeroext %__U, <
 entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
-  %2 = bitcast <4 x i64> %__S to <8 x i32>
-  %3 = tail call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %0, <8 x i32> %1, i32 127, <8 x i32> %2, i8 %__U)
-  %4 = bitcast <8 x i32> %3 to <4 x i64>
-  ret <4 x i64> %4
+  %2 = tail call <8 x i32> @llvm.x86.avx512.vpshld.d.256(<8 x i32> %0, <8 x i32> %1, i32 127)
+  %3 = bitcast <4 x i64> %__S to <8 x i32>
+  %4 = bitcast i8 %__U to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
+  %6 = bitcast <8 x i32> %5 to <4 x i64>
+  ret <4 x i64> %6
 }

-declare <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.vpshld.d.256(<8 x i32>, <8 x i32>, i32)

 define <4 x i64> @test_mm256_maskz_shldi_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
 ; X86-LABEL: test_mm256_maskz_shldi_epi32:
@ -713,9 +727,11 @@ define <4 x i64> @test_mm256_maskz_shldi_epi32(i8 zeroext %__U, <4 x i64> %__A,
 entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
-  %2 = tail call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %0, <8 x i32> %1, i32 63, <8 x i32> zeroinitializer, i8 %__U)
-  %3 = bitcast <8 x i32> %2 to <4 x i64>
-  ret <4 x i64> %3
+  %2 = tail call <8 x i32> @llvm.x86.avx512.vpshld.d.256(<8 x i32> %0, <8 x i32> %1, i32 63)
+  %3 = bitcast i8 %__U to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
+  %5 = bitcast <8 x i32> %4 to <4 x i64>
+  ret <4 x i64> %5
 }

 define <4 x i64> @test_mm256_shldi_epi32(<4 x i64> %__A, <4 x i64> %__B) {
@ -726,7 +742,7 @@ define <4 x i64> @test_mm256_shldi_epi32(<4 x i64> %__A, <4 x i64> %__B) {
 entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
-  %2 = tail call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %0, <8 x i32> %1, i32 31, <8 x i32> zeroinitializer, i8 -1)
+  %2 = tail call <8 x i32> @llvm.x86.avx512.vpshld.d.256(<8 x i32> %0, <8 x i32> %1, i32 31)
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
 }
@ -747,13 +763,16 @@ define <2 x i64> @test_mm_mask_shldi_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x
 entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
-  %2 = bitcast <2 x i64> %__S to <4 x i32>
-  %3 = tail call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %0, <4 x i32> %1, i32 127, <4 x i32> %2, i8 %__U)
-  %4 = bitcast <4 x i32> %3 to <2 x i64>
-  ret <2 x i64> %4
+  %2 = tail call <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32> %0, <4 x i32> %1, i32 127)
+  %3 = bitcast <2 x i64> %__S to <4 x i32>
+  %4 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> %3
+  %6 = bitcast <4 x i32> %5 to <2 x i64>
+  ret <2 x i64> %6
 }

-declare <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32>, <4 x i32>, i32)

 define <2 x i64> @test_mm_maskz_shldi_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
 ; X86-LABEL: test_mm_maskz_shldi_epi32:
@ -771,9 +790,12 @@ define <2 x i64> @test_mm_maskz_shldi_epi32(i8 zeroext %__U, <2 x i64> %__A, <2
 entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
-  %2 = tail call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %0, <4 x i32> %1, i32 63, <4 x i32> zeroinitializer, i8 %__U)
-  %3 = bitcast <4 x i32> %2 to <2 x i64>
-  ret <2 x i64> %3
+  %2 = tail call <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32> %0, <4 x i32> %1, i32 63)
+  %3 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> zeroinitializer
+  %5 = bitcast <4 x i32> %4 to <2 x i64>
+  ret <2 x i64> %5
 }

 define <2 x i64> @test_mm_shldi_epi32(<2 x i64> %__A, <2 x i64> %__B) {
@ -784,7 +806,7 @@ define <2 x i64> @test_mm_shldi_epi32(<2 x i64> %__A, <2 x i64> %__B) {
 entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
-  %2 = tail call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %0, <4 x i32> %1, i32 31, <4 x i32> zeroinitializer, i8 -1)
+  %2 = tail call <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32> %0, <4 x i32> %1, i32 31)
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
 }
@ -804,13 +826,15 @@ define <4 x i64> @test_mm256_mask_shldi_epi16(<4 x i64> %__S, i16 zeroext %__U,
 entry:
  %0 = bitcast <4 x i64> %__A to <16 x i16>
  %1 = bitcast <4 x i64> %__B to <16 x i16>
-  %2 = bitcast <4 x i64> %__S to <16 x i16>
-  %3 = tail call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %0, <16 x i16> %1, i32 127, <16 x i16> %2, i16 %__U)
-  %4 = bitcast <16 x i16> %3 to <4 x i64>
-  ret <4 x i64> %4
+  %2 = tail call <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16> %0, <16 x i16> %1, i32 127)
+  %3 = bitcast <4 x i64> %__S to <16 x i16>
+  %4 = bitcast i16 %__U to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i16> %2, <16 x i16> %3
+  %6 = bitcast <16 x i16> %5 to <4 x i64>
+  ret <4 x i64> %6
 }

-declare <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16>, <16 x i16>, i32, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16>, <16 x i16>, i32)

 define <4 x i64> @test_mm256_maskz_shldi_epi16(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
 ; X86-LABEL: test_mm256_maskz_shldi_epi16:
@ -827,9 +851,11 @@ define <4 x i64> @test_mm256_maskz_shldi_epi16(i16 zeroext %__U, <4 x i64> %__A,
 entry:
  %0 = bitcast <4 x i64> %__A to <16 x i16>
  %1 = bitcast <4 x i64> %__B to <16 x i16>
-  %2 = tail call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %0, <16 x i16> %1, i32 63, <16 x i16> zeroinitializer, i16 %__U)
-  %3 = bitcast <16 x i16> %2 to <4 x i64>
-  ret <4 x i64> %3
+  %2 = tail call <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16> %0, <16 x i16> %1, i32 63)
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
+  %5 = bitcast <16 x i16> %4 to <4 x i64>
+  ret <4 x i64> %5
 }

 define <4 x i64> @test_mm256_shldi_epi16(<4 x i64> %__A, <4 x i64> %__B) {
@ -840,7 +866,7 @@ define <4 x i64> @test_mm256_shldi_epi16(<4 x i64> %__A, <4 x i64> %__B) {
 entry:
  %0 = bitcast <4 x i64> %__A to <16 x i16>
  %1 = bitcast <4 x i64> %__B to <16 x i16>
-  %2 = tail call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %0, <16 x i16> %1, i32 31, <16 x i16> zeroinitializer, i16 -1)
+  %2 = tail call <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16> %0, <16 x i16> %1, i32 31)
  %3 = bitcast <16 x i16> %2 to <4 x i64>
  ret <4 x i64> %3
 }
@ -861,13 +887,15 @@ define <2 x i64> @test_mm_mask_shldi_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x
 entry:
  %0 = bitcast <2 x i64> %__A to <8 x i16>
  %1 = bitcast <2 x i64> %__B to <8 x i16>
-  %2 = bitcast <2 x i64> %__S to <8 x i16>
-  %3 = tail call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %0, <8 x i16> %1, i32 127, <8 x i16> %2, i8 %__U)
-  %4 = bitcast <8 x i16> %3 to <2 x i64>
-  ret <2 x i64> %4
+  %2 = tail call <8 x i16> @llvm.x86.avx512.vpshld.w.128(<8 x i16> %0, <8 x i16> %1, i32 127)
+  %3 = bitcast <2 x i64> %__S to <8 x i16>
+  %4 = bitcast i8 %__U to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x i16> %2, <8 x i16> %3
+  %6 = bitcast <8 x i16> %5 to <2 x i64>
+  ret <2 x i64> %6
 }

-declare <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16>, <8 x i16>, i32, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.avx512.vpshld.w.128(<8 x i16>, <8 x i16>, i32)

 define <2 x i64> @test_mm_maskz_shldi_epi16(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
 ; X86-LABEL: test_mm_maskz_shldi_epi16:
@ -885,9 +913,11 @@ define <2 x i64> @test_mm_maskz_shldi_epi16(i8 zeroext %__U, <2 x i64> %__A, <2
 entry:
  %0 = bitcast <2 x i64> %__A to <8 x i16>
  %1 = bitcast <2 x i64> %__B to <8 x i16>
-  %2 = tail call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %0, <8 x i16> %1, i32 63, <8 x i16> zeroinitializer, i8 %__U)
-  %3 = bitcast <8 x i16> %2 to <2 x i64>
-  ret <2 x i64> %3
+  %2 = tail call <8 x i16> @llvm.x86.avx512.vpshld.w.128(<8 x i16> %0, <8 x i16> %1, i32 63)
+  %3 = bitcast i8 %__U to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
+  %5 = bitcast <8 x i16> %4 to <2 x i64>
+  ret <2 x i64> %5
 }

 define <2 x i64> @test_mm_shldi_epi16(<2 x i64> %__A, <2 x i64> %__B) {
@ -898,7 +928,7 @@ define <2 x i64> @test_mm_shldi_epi16(<2 x i64> %__A, <2 x i64> %__B) {
 entry:
  %0 = bitcast <2 x i64> %__A to <8 x i16>
  %1 = bitcast <2 x i64> %__B to <8 x i16>
-  %2 = tail call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %0, <8 x i16> %1, i32 31, <8 x i16> zeroinitializer, i8 -1)
+  %2 = tail call <8 x i16> @llvm.x86.avx512.vpshld.w.128(<8 x i16> %0, <8 x i16> %1, i32 31)
  %3 = bitcast <8 x i16> %2 to <2 x i64>
  ret <2 x i64> %3
 }
@ -917,11 +947,14 @@ define <4 x i64> @test_mm256_mask_shrdi_epi64(<4 x i64> %__S, i8 zeroext %__U, <
 ; X64-NEXT:    vpshrdq $127, %ymm2, %ymm1, %ymm0 {%k1}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 127, <4 x i64> %__S, i8 %__U)
-  ret <4 x i64> %0
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpshrd.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 127)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__S
+  ret <4 x i64> %2
 }

-declare <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.vpshrd.q.256(<4 x i64>, <4 x i64>, i32)

 define <4 x i64> @test_mm256_maskz_shrdi_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
 ; X86-LABEL: test_mm256_maskz_shrdi_epi64:
@ -937,8 +970,11 @@ define <4 x i64> @test_mm256_maskz_shrdi_epi64(i8 zeroext %__U, <4 x i64> %__A,
 ; X64-NEXT:    vpshrdq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 63, <4 x i64> zeroinitializer, i8 %__U)
-  ret <4 x i64> %0
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpshrd.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 63)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
+  ret <4 x i64> %2
 }

 define <4 x i64> @test_mm256_shrdi_epi64(<4 x i64> %__A, <4 x i64> %__B) {
@ -947,7 +983,7 @@ define <4 x i64> @test_mm256_shrdi_epi64(<4 x i64> %__A, <4 x i64> %__B) {
 ; CHECK-NEXT:    vpshrdq $31, %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
-  %0 = tail call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 31, <4 x i64> zeroinitializer, i8 -1)
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpshrd.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 31)
  ret <4 x i64> %0
 }

@ -965,11 +1001,14 @@ define <2 x i64> @test_mm_mask_shrdi_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x
 ; X64-NEXT:    vpshrdq $127, %xmm2, %xmm1, %xmm0 {%k1}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 127, <2 x i64> %__S, i8 %__U)
-  ret <2 x i64> %0
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpshrd.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 127)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__S
+  ret <2 x i64> %2
 }

-declare <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.vpshrd.q.128(<2 x i64>, <2 x i64>, i32)

 define <2 x i64> @test_mm_maskz_shrdi_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
 ; X86-LABEL: test_mm_maskz_shrdi_epi64:
@ -985,8 +1024,11 @@ define <2 x i64> @test_mm_maskz_shrdi_epi64(i8 zeroext %__U, <2 x i64> %__A, <2
 ; X64-NEXT:    vpshrdq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 63, <2 x i64> zeroinitializer, i8 %__U)
-  ret <2 x i64> %0
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpshrd.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 63)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
+  ret <2 x i64> %2
 }

 define <2 x i64> @test_mm_shrdi_epi64(<2 x i64> %__A, <2 x i64> %__B) {
@ -995,7 +1037,7 @@ define <2 x i64> @test_mm_shrdi_epi64(<2 x i64> %__A, <2 x i64> %__B) {
 ; CHECK-NEXT:    vpshrdq $31, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
-  %0 = tail call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 31, <2 x i64> zeroinitializer, i8 -1)
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpshrd.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 31)
  ret <2 x i64> %0
 }

@ -1015,13 +1057,15 @@ define <4 x i64> @test_mm256_mask_shrdi_epi32(<4 x i64> %__S, i8 zeroext %__U, <
 entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
-  %2 = bitcast <4 x i64> %__S to <8 x i32>
-  %3 = tail call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %0, <8 x i32> %1, i32 127, <8 x i32> %2, i8 %__U)
-  %4 = bitcast <8 x i32> %3 to <4 x i64>
-  ret <4 x i64> %4
+  %2 = tail call <8 x i32> @llvm.x86.avx512.vpshrd.d.256(<8 x i32> %0, <8 x i32> %1, i32 127)
+  %3 = bitcast <4 x i64> %__S to <8 x i32>
+  %4 = bitcast i8 %__U to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
+  %6 = bitcast <8 x i32> %5 to <4 x i64>
+  ret <4 x i64> %6
 }

-declare <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.vpshrd.d.256(<8 x i32>, <8 x i32>, i32)

 define <4 x i64> @test_mm256_maskz_shrdi_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
 ; X86-LABEL: test_mm256_maskz_shrdi_epi32:
@ -1039,9 +1083,11 @@ define <4 x i64> @test_mm256_maskz_shrdi_epi32(i8 zeroext %__U, <4 x i64> %__A,
 entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
-  %2 = tail call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %0, <8 x i32> %1, i32 63, <8 x i32> zeroinitializer, i8 %__U)
-  %3 = bitcast <8 x i32> %2 to <4 x i64>
-  ret <4 x i64> %3
+  %2 = tail call <8 x i32> @llvm.x86.avx512.vpshrd.d.256(<8 x i32> %0, <8 x i32> %1, i32 63)
+  %3 = bitcast i8 %__U to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
+  %5 = bitcast <8 x i32> %4 to <4 x i64>
+  ret <4 x i64> %5
 }

 define <4 x i64> @test_mm256_shrdi_epi32(<4 x i64> %__A, <4 x i64> %__B) {
@ -1052,7 +1098,7 @@ define <4 x i64> @test_mm256_shrdi_epi32(<4 x i64> %__A, <4 x i64> %__B) {
 entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
-  %2 = tail call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %0, <8 x i32> %1, i32 31, <8 x i32> zeroinitializer, i8 -1)
+  %2 = tail call <8 x i32> @llvm.x86.avx512.vpshrd.d.256(<8 x i32> %0, <8 x i32> %1, i32 31)
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
 }
@ -1073,13 +1119,16 @@ define <2 x i64> @test_mm_mask_shrdi_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x
 entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
-  %2 = bitcast <2 x i64> %__S to <4 x i32>
-  %3 = tail call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %0, <4 x i32> %1, i32 127, <4 x i32> %2, i8 %__U)
-  %4 = bitcast <4 x i32> %3 to <2 x i64>
-  ret <2 x i64> %4
+  %2 = tail call <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32> %0, <4 x i32> %1, i32 127)
+  %3 = bitcast <2 x i64> %__S to <4 x i32>
+  %4 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> %3
+  %6 = bitcast <4 x i32> %5 to <2 x i64>
+  ret <2 x i64> %6
 }

-declare <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32>, <4 x i32>, i32)

 define <2 x i64> @test_mm_maskz_shrdi_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
 ; X86-LABEL: test_mm_maskz_shrdi_epi32:
@ -1097,9 +1146,12 @@ define <2 x i64> @test_mm_maskz_shrdi_epi32(i8 zeroext %__U, <2 x i64> %__A, <2
 entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
-  %2 = tail call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %0, <4 x i32> %1, i32 63, <4 x i32> zeroinitializer, i8 %__U)
-  %3 = bitcast <4 x i32> %2 to <2 x i64>
-  ret <2 x i64> %3
+  %2 = tail call <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32> %0, <4 x i32> %1, i32 63)
+  %3 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> zeroinitializer
+  %5 = bitcast <4 x i32> %4 to <2 x i64>
+  ret <2 x i64> %5
 }

 define <2 x i64> @test_mm_shrdi_epi32(<2 x i64> %__A, <2 x i64> %__B) {
@ -1110,7 +1162,7 @@ define <2 x i64> @test_mm_shrdi_epi32(<2 x i64> %__A, <2 x i64> %__B) {
 entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
-  %2 = tail call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %0, <4 x i32> %1, i32 31, <4 x i32> zeroinitializer, i8 -1)
+  %2 = tail call <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32> %0, <4 x i32> %1, i32 31)
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
 }
@ -1130,13 +1182,15 @@ define <4 x i64> @test_mm256_mask_shrdi_epi16(<4 x i64> %__S, i16 zeroext %__U,
 entry:
  %0 = bitcast <4 x i64> %__A to <16 x i16>
  %1 = bitcast <4 x i64> %__B to <16 x i16>
-  %2 = bitcast <4 x i64> %__S to <16 x i16>
-  %3 = tail call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %0, <16 x i16> %1, i32 127, <16 x i16> %2, i16 %__U)
-  %4 = bitcast <16 x i16> %3 to <4 x i64>
-  ret <4 x i64> %4
+  %2 = tail call <16 x i16> @llvm.x86.avx512.vpshrd.w.256(<16 x i16> %0, <16 x i16> %1, i32 127)
+  %3 = bitcast <4 x i64> %__S to <16 x i16>
+  %4 = bitcast i16 %__U to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i16> %2, <16 x i16> %3
+  %6 = bitcast <16 x i16> %5 to <4 x i64>
+  ret <4 x i64> %6
 }

-declare <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16>, <16 x i16>, i32, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx512.vpshrd.w.256(<16 x i16>, <16 x i16>, i32)

 define <4 x i64> @test_mm256_maskz_shrdi_epi16(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
 ; X86-LABEL: test_mm256_maskz_shrdi_epi16:
@ -1153,9 +1207,11 @@ define <4 x i64> @test_mm256_maskz_shrdi_epi16(i16 zeroext %__U, <4 x i64> %__A,
 entry:
  %0 = bitcast <4 x i64> %__A to <16 x i16>
  %1 = bitcast <4 x i64> %__B to <16 x i16>
-  %2 = tail call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %0, <16 x i16> %1, i32 63, <16 x i16> zeroinitializer, i16 %__U)
-  %3 = bitcast <16 x i16> %2 to <4 x i64>
-  ret <4 x i64> %3
+  %2 = tail call <16 x i16> @llvm.x86.avx512.vpshrd.w.256(<16 x i16> %0, <16 x i16> %1, i32 63)
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
+  %5 = bitcast <16 x i16> %4 to <4 x i64>
+  ret <4 x i64> %5
 }

 define <4 x i64> @test_mm256_shrdi_epi16(<4 x i64> %__A, <4 x i64> %__B) {
@ -1166,7 +1222,7 @@ define <4 x i64> @test_mm256_shrdi_epi16(<4 x i64> %__A, <4 x i64> %__B) {
 entry:
  %0 = bitcast <4 x i64> %__A to <16 x i16>
  %1 = bitcast <4 x i64> %__B to <16 x i16>
-  %2 = tail call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %0, <16 x i16> %1, i32 31, <16 x i16> zeroinitializer, i16 -1)
+  %2 = tail call <16 x i16> @llvm.x86.avx512.vpshrd.w.256(<16 x i16> %0, <16 x i16> %1, i32 31)
  %3 = bitcast <16 x i16> %2 to <4 x i64>
  ret <4 x i64> %3
 }
@ -1187,13 +1243,15 @@ define <2 x i64> @test_mm_mask_shrdi_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x
 entry:
  %0 = bitcast <2 x i64> %__A to <8 x i16>
  %1 = bitcast <2 x i64> %__B to <8 x i16>
-  %2 = bitcast <2 x i64> %__S to <8 x i16>
-  %3 = tail call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %0, <8 x i16> %1, i32 127, <8 x i16> %2, i8 %__U)
-  %4 = bitcast <8 x i16> %3 to <2 x i64>
-  ret <2 x i64> %4
+  %2 = tail call <8 x i16> @llvm.x86.avx512.vpshrd.w.128(<8 x i16> %0, <8 x i16> %1, i32 127)
+  %3 = bitcast <2 x i64> %__S to <8 x i16>
+  %4 = bitcast i8 %__U to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x i16> %2, <8 x i16> %3
+  %6 = bitcast <8 x i16> %5 to <2 x i64>
+  ret <2 x i64> %6
 }

-declare <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16>, <8 x i16>, i32, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.avx512.vpshrd.w.128(<8 x i16>, <8 x i16>, i32)

 define <2 x i64> @test_mm_maskz_shrdi_epi16(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
 ; X86-LABEL: test_mm_maskz_shrdi_epi16:
@ -1211,9 +1269,11 @@ define <2 x i64> @test_mm_maskz_shrdi_epi16(i8 zeroext %__U, <2 x i64> %__A, <2
 entry:
  %0 = bitcast <2 x i64> %__A to <8 x i16>
  %1 = bitcast <2 x i64> %__B to <8 x i16>
-  %2 = tail call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %0, <8 x i16> %1, i32 63, <8 x i16> zeroinitializer, i8 %__U)
-  %3 = bitcast <8 x i16> %2 to <2 x i64>
-  ret <2 x i64> %3
+  %2 = tail call <8 x i16> @llvm.x86.avx512.vpshrd.w.128(<8 x i16> %0, <8 x i16> %1, i32 63)
+  %3 = bitcast i8 %__U to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
+  %5 = bitcast <8 x i16> %4 to <2 x i64>
+  ret <2 x i64> %5
 }

 define <2 x i64> @test_mm_shrdi_epi16(<2 x i64> %__A, <2 x i64> %__B) {
@ -1224,7 +1284,7 @@ define <2 x i64> @test_mm_shrdi_epi16(<2 x i64> %__A, <2 x i64> %__B) {
 entry:
  %0 = bitcast <2 x i64> %__A to <8 x i16>
  %1 = bitcast <2 x i64> %__B to <8 x i16>
-  %2 = tail call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %0, <8 x i16> %1, i32 31, <8 x i16> zeroinitializer, i8 -1)
+  %2 = tail call <8 x i16> @llvm.x86.avx512.vpshrd.w.128(<8 x i16> %0, <8 x i16> %1, i32 31)
  %3 = bitcast <8 x i16> %2 to <2 x i64>
  ret <2 x i64> %3
 }
--- a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll
@ -368,3 +368,301 @@ define void @test_compress_store_b_256(i8* %addr, <32 x i8> %data) {
  call void @llvm.x86.avx512.mask.compress.store.b.256(i8* %addr, <32 x i8> %data, i32 -1)
  ret void
 }
+
+define <4 x i32>@test_int_x86_avx512_mask_vpshld_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x16]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xd1,0x16]
+; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xc1,0x16]
+; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xd1,0x16]
+; X64-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xc1,0x16]
+; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 -1)
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> zeroinitializer,i8 %x4)
+  %res3 = add <4 x i32> %res, %res1
+  %res4 = add <4 x i32> %res3, %res2
+  ret <4 x i32> %res4
+}
+declare <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_vpshld_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshldd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xd9,0x16]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpshldd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x71,0xd1,0x16]
+; X86-NEXT:    vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshldd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshldd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x71,0xd1,0x16]
+; X64-NEXT:    vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
+  %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
+  %res2 = add <8 x i32> %res, %res1
+  ret <8 x i32> %res2
+}
+declare <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vpshld_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshld_q_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshldq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xd9,0x16]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpshldq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x71,0xd1,0x16]
+; X86-NEXT:    vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshldq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshldq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x71,0xd1,0x16]
+; X64-NEXT:    vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4)
+  %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 -1)
+  %res2 = add <2 x i64> %res, %res1
+  ret <2 x i64> %res2
+}
+declare <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vpshld_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshld_q_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshldq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xd9,0x16]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpshldq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x71,0xd1,0x16]
+; X86-NEXT:    vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshldq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshldq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x71,0xd1,0x16]
+; X64-NEXT:    vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
+  %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
+  %res2 = add <4 x i64> %res, %res1
+  ret <4 x i64> %res2
+}
+declare <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_vpshld_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x3, i8 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshld_w_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshldw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xd9,0x16]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpshldw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x70,0xd1,0x16]
+; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshldw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshldw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x70,0xd1,0x16]
+; X64-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 %x4)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 -1)
+  %res2 = add <8 x i16> %res, %res1
+  ret <8 x i16> %res2
+}
+declare <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16>, <8 x i16>, i32, <8 x i16>, i8)
+
+define <16 x i16>@test_int_x86_avx512_mask_vpshld_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x3, i16 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshld_w_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshldw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xd9,0x16]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpshldw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x70,0xd1,0x16]
+; X86-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshldw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshldw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x70,0xd1,0x16]
+; X64-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 %x4)
+  %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 -1)
+  %res2 = add <16 x i16> %res, %res1
+  ret <16 x i16> %res2
+}
+declare <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16>, <16 x i16>, i32, <16 x i16>, i16)
+
+define <4 x i32>@test_int_x86_avx512_mask_vpshrd_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x16]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xd1,0x16]
+; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xc1,0x16]
+; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xd1,0x16]
+; X64-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xc1,0x16]
+; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 -1)
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> zeroinitializer,i8 %x4)
+  %res3 = add <4 x i32> %res, %res1
+  %res4 = add <4 x i32> %res3, %res2
+  ret <4 x i32> %res4
+}
+declare <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_vpshrd_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xd9,0x16]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x73,0xd1,0x16]
+; X86-NEXT:    vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x73,0xd1,0x16]
+; X64-NEXT:    vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
+  %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
+  %res2 = add <8 x i32> %res, %res1
+  ret <8 x i32> %res2
+}
+declare <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vpshrd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshrd_q_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xd9,0x16]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x73,0xd1,0x16]
+; X86-NEXT:    vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x73,0xd1,0x16]
+; X64-NEXT:    vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4)
+  %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 -1)
+  %res2 = add <2 x i64> %res, %res1
+  ret <2 x i64> %res2
+}
+declare <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vpshrd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshrd_q_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xd9,0x16]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x73,0xd1,0x16]
+; X86-NEXT:    vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x73,0xd1,0x16]
+; X64-NEXT:    vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
+  %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
+  %res2 = add <4 x i64> %res, %res1
+  ret <4 x i64> %res2
+}
+declare <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_vpshrd_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x3, i8 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshrd_w_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xd9,0x16]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x72,0xd1,0x16]
+; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x72,0xd1,0x16]
+; X64-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 %x4)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 -1)
+  %res2 = add <8 x i16> %res, %res1
+  ret <8 x i16> %res2
+}
+declare <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16>, <8 x i16>, i32, <8 x i16>, i8)
+
+define <16 x i16>@test_int_x86_avx512_mask_vpshrd_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x3, i16 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshrd_w_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xd9,0x16]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x72,0xd1,0x16]
+; X86-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x72,0xd1,0x16]
+; X64-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 %x4)
+  %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 -1)
+  %res2 = add <16 x i16> %res, %res1
+  ret <16 x i16> %res2
+}
+declare <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16>, <16 x i16>, i32, <16 x i16>, i16)
--- a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
@ -361,300 +361,336 @@ declare <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %data, <32 x i8
 define <4 x i32>@test_int_x86_avx512_mask_vpshld_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xd1,0x16]
-; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd9,0x16]
-; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc1,0x16]
-; X86-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3]
+; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xc1,0x16]
+; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
 ; X86-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_128:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd9,0x16]
 ; X64-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xd1,0x16]
-; X64-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc1,0x16]
-; X64-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3]
+; X64-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xc1,0x16]
+; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
 ; X64-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4)
-  %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 -1)
-  %res2 = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> zeroinitializer,i8 %x4)
-  %res3 = add <4 x i32> %res, %res1
-  %res4 = add <4 x i32> %res3, %res2
+  %1 = call <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract1, <4 x i32> %1, <4 x i32> %x3
+  %4 = call <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22)
+  %5 = call <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22)
+  %6 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %6, <8 x i1> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %7 = select <4 x i1> %extract, <4 x i32> %5, <4 x i32> zeroinitializer
+  %res3 = add <4 x i32> %3, %4
+  %res4 = add <4 x i32> %res3, %7
  ret <4 x i32> %res4
 }
-declare <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32>, <4 x i32>, i32)

 define <8 x i32>@test_int_x86_avx512_mask_vpshld_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshldd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshldd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x71,0xd1,0x16]
-; X86-NEXT:    vpshldd $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xc1,0x16]
-; X86-NEXT:    vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
+; X86-NEXT:    vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshldd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x71,0xd1,0x16]
-; X64-NEXT:    vpshldd $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xc1,0x16]
-; X64-NEXT:    vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
+; X64-NEXT:    vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
-  %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
-  %res2 = add <8 x i32> %res, %res1
+  %1 = call <8 x i32> @llvm.x86.avx512.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x3
+  %4 = call <8 x i32> @llvm.x86.avx512.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22)
+  %res2 = add <8 x i32> %3, %4
  ret <8 x i32> %res2
 }
-declare <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.vpshld.d.256(<8 x i32>, <8 x i32>, i32)

 define <2 x i64>@test_int_x86_avx512_mask_vpshld_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_q_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshldq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshldq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x71,0xd1,0x16]
-; X86-NEXT:    vpshldq $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xc1,0x16]
-; X86-NEXT:    vpaddq %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
+; X86-NEXT:    vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_128:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshldq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x71,0xd1,0x16]
-; X64-NEXT:    vpshldq $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xc1,0x16]
-; X64-NEXT:    vpaddq %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
+; X64-NEXT:    vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4)
-  %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 -1)
-  %res2 = add <2 x i64> %res, %res1
+  %1 = call <2 x i64> @llvm.x86.avx512.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x3
+  %4 = call <2 x i64> @llvm.x86.avx512.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22)
+  %res2 = add <2 x i64> %3, %4
  ret <2 x i64> %res2
 }
-declare <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.vpshld.q.128(<2 x i64>, <2 x i64>, i32)

 define <4 x i64>@test_int_x86_avx512_mask_vpshld_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_q_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshldq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshldq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x71,0xd1,0x16]
-; X86-NEXT:    vpshldq $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xc1,0x16]
-; X86-NEXT:    vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
+; X86-NEXT:    vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshldq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x71,0xd1,0x16]
-; X64-NEXT:    vpshldq $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xc1,0x16]
-; X64-NEXT:    vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
+; X64-NEXT:    vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
-  %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
-  %res2 = add <4 x i64> %res, %res1
+  %1 = call <4 x i64> @llvm.x86.avx512.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x3
+  %4 = call <4 x i64> @llvm.x86.avx512.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22)
+  %res2 = add <4 x i64> %3, %4
  ret <4 x i64> %res2
 }
-declare <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.vpshld.q.256(<4 x i64>, <4 x i64>, i32)

 define <8 x i16>@test_int_x86_avx512_mask_vpshld_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_w_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshldw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshldw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x70,0xd1,0x16]
-; X86-NEXT:    vpshldw $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xc1,0x16]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_128:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshldw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x70,0xd1,0x16]
-; X64-NEXT:    vpshldw $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xc1,0x16]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; X64-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 %x4)
-  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 -1)
-  %res2 = add <8 x i16> %res, %res1
+  %1 = call <8 x i16> @llvm.x86.avx512.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x3
+  %4 = call <8 x i16> @llvm.x86.avx512.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22)
+  %res2 = add <8 x i16> %3, %4
  ret <8 x i16> %res2
 }
-declare <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16>, <8 x i16>, i32, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.avx512.vpshld.w.128(<8 x i16>, <8 x i16>, i32)

 define <16 x i16>@test_int_x86_avx512_mask_vpshld_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x3, i16 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_w_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshldw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xd9,0x16]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpshldw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x70,0xd1,0x16]
-; X86-NEXT:    vpshldw $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xc1,0x16]
-; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
+; X86-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshldw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x70,0xd1,0x16]
-; X64-NEXT:    vpshldw $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xc1,0x16]
-; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
+; X64-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 %x4)
-  %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 -1)
-  %res2 = add <16 x i16> %res, %res1
+  %1 = call <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22)
+  %2 = bitcast i16 %x4 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x3
+  %4 = call <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22)
+  %res2 = add <16 x i16> %3, %4
  ret <16 x i16> %res2
 }
-declare <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16>, <16 x i16>, i32, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16>, <16 x i16>, i32)

 define <4 x i32>@test_int_x86_avx512_mask_vpshrd_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xd1,0x16]
-; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd9,0x16]
-; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc1,0x16]
-; X86-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3]
+; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xc1,0x16]
+; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
 ; X86-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_128:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd9,0x16]
 ; X64-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xd1,0x16]
-; X64-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc1,0x16]
-; X64-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3]
+; X64-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xc1,0x16]
+; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
 ; X64-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4)
-  %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 -1)
-  %res2 = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> zeroinitializer,i8 %x4)
-  %res3 = add <4 x i32> %res, %res1
-  %res4 = add <4 x i32> %res3, %res2
+  %1 = call <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract1, <4 x i32> %1, <4 x i32> %x3
+  %4 = call <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22)
+  %5 = call <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22)
+  %6 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %6, <8 x i1> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %7 = select <4 x i1> %extract, <4 x i32> %5, <4 x i32> zeroinitializer
+  %res3 = add <4 x i32> %3, %4
+  %res4 = add <4 x i32> %res3, %7
  ret <4 x i32> %res4
 }
-declare <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32>, <4 x i32>, i32)

 define <8 x i32>@test_int_x86_avx512_mask_vpshrd_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x73,0xd1,0x16]
-; X86-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xc1,0x16]
-; X86-NEXT:    vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
+; X86-NEXT:    vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x73,0xd1,0x16]
-; X64-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xc1,0x16]
-; X64-NEXT:    vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
+; X64-NEXT:    vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
-  %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
-  %res2 = add <8 x i32> %res, %res1
+  %1 = call <8 x i32> @llvm.x86.avx512.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x3
+  %4 = call <8 x i32> @llvm.x86.avx512.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22)
+  %res2 = add <8 x i32> %3, %4
  ret <8 x i32> %res2
 }
-declare <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.vpshrd.d.256(<8 x i32>, <8 x i32>, i32)

 define <2 x i64>@test_int_x86_avx512_mask_vpshrd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_q_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x73,0xd1,0x16]
-; X86-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xc1,0x16]
-; X86-NEXT:    vpaddq %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
+; X86-NEXT:    vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_128:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x73,0xd1,0x16]
-; X64-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xc1,0x16]
-; X64-NEXT:    vpaddq %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
+; X64-NEXT:    vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4)
-  %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 -1)
-  %res2 = add <2 x i64> %res, %res1
+  %1 = call <2 x i64> @llvm.x86.avx512.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x3
+  %4 = call <2 x i64> @llvm.x86.avx512.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22)
+  %res2 = add <2 x i64> %3, %4
  ret <2 x i64> %res2
 }
-declare <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.vpshrd.q.128(<2 x i64>, <2 x i64>, i32)

 define <4 x i64>@test_int_x86_avx512_mask_vpshrd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_q_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x73,0xd1,0x16]
-; X86-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xc1,0x16]
-; X86-NEXT:    vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
+; X86-NEXT:    vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x73,0xd1,0x16]
-; X64-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xc1,0x16]
-; X64-NEXT:    vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
+; X64-NEXT:    vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
-  %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
-  %res2 = add <4 x i64> %res, %res1
+  %1 = call <4 x i64> @llvm.x86.avx512.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x3
+  %4 = call <4 x i64> @llvm.x86.avx512.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22)
+  %res2 = add <4 x i64> %3, %4
  ret <4 x i64> %res2
 }
-declare <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.vpshrd.q.256(<4 x i64>, <4 x i64>, i32)

 define <8 x i16>@test_int_x86_avx512_mask_vpshrd_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_w_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x72,0xd1,0x16]
-; X86-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xc1,0x16]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_128:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x72,0xd1,0x16]
-; X64-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xc1,0x16]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; X64-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 %x4)
-  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 -1)
-  %res2 = add <8 x i16> %res, %res1
+  %1 = call <8 x i16> @llvm.x86.avx512.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x3
+  %4 = call <8 x i16> @llvm.x86.avx512.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22)
+  %res2 = add <8 x i16> %3, %4
  ret <8 x i16> %res2
 }
-declare <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16>, <8 x i16>, i32, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.avx512.vpshrd.w.128(<8 x i16>, <8 x i16>, i32)

 define <16 x i16>@test_int_x86_avx512_mask_vpshrd_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x3, i16 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_w_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xd9,0x16]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x72,0xd1,0x16]
-; X86-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xc1,0x16]
-; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
+; X86-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x72,0xd1,0x16]
-; X64-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xc1,0x16]
-; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
+; X64-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 %x4)
-  %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 -1)
-  %res2 = add <16 x i16> %res, %res1
+  %1 = call <16 x i16> @llvm.x86.avx512.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22)
+  %2 = bitcast i16 %x4 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x3
+  %4 = call <16 x i16> @llvm.x86.avx512.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22)
+  %res2 = add <16 x i16> %3, %4
  ret <16 x i16> %res2
 }
-declare <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16>, <16 x i16>, i32, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx512.vpshrd.w.256(<16 x i16>, <16 x i16>, i32)

 declare <8 x i32> @llvm.x86.avx512.mask.vpshrdv.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
 declare <8 x i32> @llvm.x86.avx512.maskz.vpshrdv.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)