diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 89c900f45f21..aac41cf981cf 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -4739,101 +4739,83 @@ let TargetPrefix = "x86" in {
 // VBMI2 Concat & Shift
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_mask_vpshld_q_512 :
-      GCCBuiltin<"__builtin_ia32_vpshldq512_mask">,
+  def int_x86_avx512_vpshld_q_512 :
+      GCCBuiltin<"__builtin_ia32_vpshldq512">,
       Intrinsic<[llvm_v8i64_ty],
-                [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_v8i64_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshld_q_256 :
-      GCCBuiltin<"__builtin_ia32_vpshldq256_mask">,
+                [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshld_q_256 :
+      GCCBuiltin<"__builtin_ia32_vpshldq256">,
       Intrinsic<[llvm_v4i64_ty],
-                [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_v4i64_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshld_q_128 :
-      GCCBuiltin<"__builtin_ia32_vpshldq128_mask">,
+                [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshld_q_128 :
+      GCCBuiltin<"__builtin_ia32_vpshldq128">,
       Intrinsic<[llvm_v2i64_ty],
-                [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_v2i64_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
+                [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
 
-  def int_x86_avx512_mask_vpshld_d_512 :
-      GCCBuiltin<"__builtin_ia32_vpshldd512_mask">,
+  def int_x86_avx512_vpshld_d_512 :
+      GCCBuiltin<"__builtin_ia32_vpshldd512">,
       Intrinsic<[llvm_v16i32_ty],
-                [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty, llvm_v16i32_ty,
-                 llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshld_d_256 :
-      GCCBuiltin<"__builtin_ia32_vpshldd256_mask">,
+                [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshld_d_256 :
+      GCCBuiltin<"__builtin_ia32_vpshldd256">,
       Intrinsic<[llvm_v8i32_ty],
-                [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_v8i32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshld_d_128 :
-      GCCBuiltin<"__builtin_ia32_vpshldd128_mask">,
+                [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshld_d_128 :
+      GCCBuiltin<"__builtin_ia32_vpshldd128">,
       Intrinsic<[llvm_v4i32_ty],
-                [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
+                [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
 
-  def int_x86_avx512_mask_vpshld_w_512 :
-      GCCBuiltin<"__builtin_ia32_vpshldw512_mask">,
+  def int_x86_avx512_vpshld_w_512 :
+      GCCBuiltin<"__builtin_ia32_vpshldw512">,
       Intrinsic<[llvm_v32i16_ty],
-                [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_v32i16_ty,
-                 llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshld_w_256 :
-      GCCBuiltin<"__builtin_ia32_vpshldw256_mask">,
+                [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshld_w_256 :
+      GCCBuiltin<"__builtin_ia32_vpshldw256">,
       Intrinsic<[llvm_v16i16_ty],
-                [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty, llvm_v16i16_ty,
-                 llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshld_w_128 :
-      GCCBuiltin<"__builtin_ia32_vpshldw128_mask">,
+                [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshld_w_128 :
+      GCCBuiltin<"__builtin_ia32_vpshldw128">,
       Intrinsic<[llvm_v8i16_ty],
-                [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty, llvm_v8i16_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
+                [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
 
-  def int_x86_avx512_mask_vpshrd_q_512 :
-      GCCBuiltin<"__builtin_ia32_vpshrdq512_mask">,
+  def int_x86_avx512_vpshrd_q_512 :
+      GCCBuiltin<"__builtin_ia32_vpshrdq512">,
       Intrinsic<[llvm_v8i64_ty],
-                [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_v8i64_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshrd_q_256 :
-      GCCBuiltin<"__builtin_ia32_vpshrdq256_mask">,
+                [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshrd_q_256 :
+      GCCBuiltin<"__builtin_ia32_vpshrdq256">,
       Intrinsic<[llvm_v4i64_ty],
-                [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_v4i64_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshrd_q_128 :
-      GCCBuiltin<"__builtin_ia32_vpshrdq128_mask">,
+                [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshrd_q_128 :
+      GCCBuiltin<"__builtin_ia32_vpshrdq128">,
       Intrinsic<[llvm_v2i64_ty],
-                [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_v2i64_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
+                [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
 
-  def int_x86_avx512_mask_vpshrd_d_512 :
-      GCCBuiltin<"__builtin_ia32_vpshrdd512_mask">,
+  def int_x86_avx512_vpshrd_d_512 :
+      GCCBuiltin<"__builtin_ia32_vpshrdd512">,
       Intrinsic<[llvm_v16i32_ty],
-                [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty, llvm_v16i32_ty,
-                 llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshrd_d_256 :
-      GCCBuiltin<"__builtin_ia32_vpshrdd256_mask">,
+                [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshrd_d_256 :
+      GCCBuiltin<"__builtin_ia32_vpshrdd256">,
       Intrinsic<[llvm_v8i32_ty],
-                [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_v8i32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshrd_d_128 :
-      GCCBuiltin<"__builtin_ia32_vpshrdd128_mask">,
+                [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshrd_d_128 :
+      GCCBuiltin<"__builtin_ia32_vpshrdd128">,
       Intrinsic<[llvm_v4i32_ty],
-                [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
+                [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
 
-  def int_x86_avx512_mask_vpshrd_w_512 :
-      GCCBuiltin<"__builtin_ia32_vpshrdw512_mask">,
+  def int_x86_avx512_vpshrd_w_512 :
+      GCCBuiltin<"__builtin_ia32_vpshrdw512">,
       Intrinsic<[llvm_v32i16_ty],
-                [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_v32i16_ty,
-                 llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshrd_w_256 :
-      GCCBuiltin<"__builtin_ia32_vpshrdw256_mask">,
+                [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshrd_w_256 :
+      GCCBuiltin<"__builtin_ia32_vpshrdw256">,
       Intrinsic<[llvm_v16i16_ty],
-                [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty, llvm_v16i16_ty,
-                 llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshrd_w_128 :
-      GCCBuiltin<"__builtin_ia32_vpshrdw128_mask">,
+                [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshrd_w_128 :
+      GCCBuiltin<"__builtin_ia32_vpshrdw128">,
       Intrinsic<[llvm_v8i16_ty],
-                [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty, llvm_v8i16_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
+                [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
 
   def int_x86_avx512_mask_vpshldv_w_128 :
       GCCBuiltin<"__builtin_ia32_vpshldvw128_mask">,
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 2bec93fc3c88..ee8a0c3b12b2 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -263,6 +263,8 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name.startswith("avx512.mask.vpdpwssds.") || // Added in 7.0
       Name.startswith("avx512.maskz.vpdpwssds.") || // Added in 7.0
       Name.startswith("avx512.mask.dbpsadbw.") || // Added in 7.0
+      Name.startswith("avx512.mask.vpshld.") || // Added in 7.0
+      Name.startswith("avx512.mask.vpshrd.") || // Added in 7.0
       Name.startswith("avx512.mask.add.p") || // Added in 7.0
       Name.startswith("avx512.mask.sub.p") || // Added in 7.0
       Name.startswith("avx512.mask.mul.p") || // Added in 7.0
@@ -1258,6 +1260,48 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder,
       IID = Intrinsic::x86_avx512_dbpsadbw_512;
     else
       llvm_unreachable("Unexpected intrinsic");
+  } else if (Name.startswith("vpshld.")) {
+    if (VecWidth == 128 && Name[7] == 'q')
+      IID = Intrinsic::x86_avx512_vpshld_q_128;
+    else if (VecWidth == 128 && Name[7] == 'd')
+      IID = Intrinsic::x86_avx512_vpshld_d_128;
+    else if (VecWidth == 128 && Name[7] == 'w')
+      IID = Intrinsic::x86_avx512_vpshld_w_128;
+    else if (VecWidth == 256 && Name[7] == 'q')
+      IID = Intrinsic::x86_avx512_vpshld_q_256;
+    else if (VecWidth == 256 && Name[7] == 'd')
+      IID = Intrinsic::x86_avx512_vpshld_d_256;
+    else if (VecWidth == 256 && Name[7] == 'w')
+      IID = Intrinsic::x86_avx512_vpshld_w_256;
+    else if (VecWidth == 512 && Name[7] == 'q')
+      IID = Intrinsic::x86_avx512_vpshld_q_512;
+    else if (VecWidth == 512 && Name[7] == 'd')
+      IID = Intrinsic::x86_avx512_vpshld_d_512;
+    else if (VecWidth == 512 && Name[7] == 'w')
+      IID = Intrinsic::x86_avx512_vpshld_w_512;
+    else
+      llvm_unreachable("Unexpected intrinsic");
+  } else if (Name.startswith("vpshrd.")) {
+    if (VecWidth == 128 && Name[7] == 'q')
+      IID = Intrinsic::x86_avx512_vpshrd_q_128;
+    else if (VecWidth == 128 && Name[7] == 'd')
+      IID = Intrinsic::x86_avx512_vpshrd_d_128;
+    else if (VecWidth == 128 && Name[7] == 'w')
+      IID = Intrinsic::x86_avx512_vpshrd_w_128;
+    else if (VecWidth == 256 && Name[7] == 'q')
+      IID = Intrinsic::x86_avx512_vpshrd_q_256;
+    else if (VecWidth == 256 && Name[7] == 'd')
+      IID = Intrinsic::x86_avx512_vpshrd_d_256;
+    else if (VecWidth == 256 && Name[7] == 'w')
+      IID = Intrinsic::x86_avx512_vpshrd_w_256;
+    else if (VecWidth == 512 && Name[7] == 'q')
+      IID = Intrinsic::x86_avx512_vpshrd_q_512;
+    else if (VecWidth == 512 && Name[7] == 'd')
+      IID = Intrinsic::x86_avx512_vpshrd_d_512;
+    else if (VecWidth == 512 && Name[7] == 'w')
+      IID = Intrinsic::x86_avx512_vpshrd_w_512;
+    else
+      llvm_unreachable("Unexpected intrinsic");
   } else
     return false;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d09355a1f4e2..0413f0f00b0d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20488,7 +20488,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                               Src2, Src3),
                                   Mask, PassThru, Subtarget, DAG);
     }
-    case INTR_TYPE_3OP_IMM8_MASK:
     case INTR_TYPE_3OP_MASK: {
       SDValue Src1 = Op.getOperand(1);
       SDValue Src2 = Op.getOperand(2);
@@ -20496,9 +20495,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
 
-      if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
-        Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
-
       // We specify 2 possible opcodes for intrinsics with rounding modes.
       // First, we check if the intrinsic may have non-default rounding mode,
       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
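The upgrade path above replaces the old masked intrinsics with the unmasked forms plus an explicit vector select. A minimal sketch of the before/after IR pattern, assuming illustrative value names (%a, %b, %src, %m) rather than ones taken from the tests below:

  ; Old bitcode: the passthru vector and the i8 mask ride along as intrinsic operands.
  %r = call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %a, <8 x i64> %b, i32 22, <8 x i64> %src, i8 %m)

  ; After AutoUpgrade: unmasked intrinsic, with the masking expressed as a select.
  %shift = call <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64> %a, <8 x i64> %b, i32 22)
  %mask = bitcast i8 %m to <8 x i1>
  %r = select <8 x i1> %mask, <8 x i64> %shift, <8 x i64> %src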
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 138ffb2d4e02..5bb521c00614 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -28,7 +28,7 @@ enum IntrinsicType : uint16_t {
   CVTPD2PS, CVTPD2PS_MASK,
   INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
   INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK,
-  INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_IMM8_MASK,
+  INTR_TYPE_3OP_MASK,
   FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
   FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
   IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK,
@@ -960,15 +960,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB,
                      X86ISD::FNMSUB_RND),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_q_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_q_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_q_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_w_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_w_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshld_w_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
   X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
   X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
   X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_512, FMA_OP_MASK, X86ISD::VSHLDV, 0),
@@ -978,15 +969,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
   X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
   X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_512, FMA_OP_MASK, X86ISD::VSHLDV, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
   X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_128, FMA_OP_MASK, X86ISD::VSHRDV, 0),
   X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_256, FMA_OP_MASK, X86ISD::VSHRDV, 0),
   X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_512, FMA_OP_MASK, X86ISD::VSHRDV, 0),
@@ -1261,6 +1243,24 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_128 , IFMA_OP, X86ISD::VPMADD52L, 0),
   X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_256 , IFMA_OP, X86ISD::VPMADD52L, 0),
   X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_512 , IFMA_OP, X86ISD::VPMADD52L, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_d_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_d_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_d_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_q_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_q_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_q_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_w_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_w_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshld_w_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHLD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_d_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_d_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_d_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_q_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_q_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_q_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_w_128, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_w_256, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
+  X86_INTRINSIC_DATA(avx512_vpshrd_w_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
   X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
   X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
   X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, ISD::FMA, 0),
diff --git a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-fast-isel.ll
index f0f9af5aa130..0a0837d26715 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-fast-isel.ll
@@ -317,11 +317,13 @@ define <8 x i64> @test_mm512_mask_shldi_epi64(<8 x i64> %__S, i8 zeroext %__U, <
 ; X64-NEXT:    vpshldq $127, %zmm2, %zmm1, %zmm0 {%k1}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 127, <8 x i64> %__S, i8 %__U)
-  ret <8 x i64> %0
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 127)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__S
+  ret <8 x i64> %2
 }
 
-declare <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64>, <8 x i64>, i32)
 
 define <8 x i64> @test_mm512_maskz_shldi_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_shldi_epi64:
@@ -337,8 +339,10 @@ define <8 x i64> @test_mm512_maskz_shldi_epi64(i8 zeroext %__U, <8 x i64> %__A,
 ; X64-NEXT:    vpshldq $63, %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 63, <8 x i64> zeroinitializer, i8 %__U)
-  ret <8 x i64> %0
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 63)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
+  ret <8 x i64> %2
 }
 
 define <8 x i64> @test_mm512_shldi_epi64(<8 x i64> %__A, <8 x i64> %__B) {
@@ -347,7 +351,7 @@ define <8 x i64> @test_mm512_shldi_epi64(<8 x i64> %__A, <8 x i64> %__B) {
 ; CHECK-NEXT:    vpshldq $31, %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
-  %0 = tail call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 31, <8 x i64> zeroinitializer, i8 -1)
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 31)
   ret <8 x i64> %0
 }
 
@@ -366,13 +370,15 @@ define <8 x i64> @test_mm512_mask_shldi_epi32(<8 x i64> %__S, i16 zeroext %__U,
 entry:
   %0 = bitcast <8 x i64> %__A to <16 x i32>
   %1 = bitcast <8 x i64> %__B to <16 x i32>
-  %2 = bitcast <8 x i64> %__S to <16 x i32>
-  %3 = tail call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %0, <16 x i32> %1, i32 127, <16 x i32> %2, i16 %__U)
-  %4 = bitcast <16 x i32> %3 to <8 x i64>
-  ret <8 x i64> %4
+  %2 = tail call <16 x i32> @llvm.x86.avx512.vpshld.d.512(<16 x i32> %0, <16 x i32> %1, i32 127)
+  %3 = bitcast <8 x i64> %__S to <16 x i32>
+  %4 = bitcast i16 %__U to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
+  %6 = bitcast <16 x i32> %5 to <8 x i64>
+  ret <8 x i64> %6
 }
 
-declare <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpshld.d.512(<16 x i32>, <16 x i32>, i32)
 
 define <8 x i64> @test_mm512_maskz_shldi_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_shldi_epi32:
@@ -389,9 +395,11 @@ define <8 x i64> @test_mm512_maskz_shldi_epi32(i16 zeroext %__U, <8 x i64> %__A,
 entry:
   %0 = bitcast <8 x i64> %__A to <16 x i32>
   %1 = bitcast <8 x i64> %__B to <16 x i32>
-  %2 = tail call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %0, <16 x i32> %1, i32 63, <16 x i32> zeroinitializer, i16 %__U)
-  %3 = bitcast <16 x i32> %2 to <8 x i64>
-  ret <8 x i64> %3
+  %2 = tail call <16 x i32> @llvm.x86.avx512.vpshld.d.512(<16 x i32> %0, <16 x i32> %1, i32 63)
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
+  %5 = bitcast <16 x i32> %4 to <8 x i64>
+  ret <8 x i64> %5
 }
 
 define <8 x i64> @test_mm512_shldi_epi32(<8 x i64> %__A, <8 x i64> %__B) {
@@ -402,7 +410,7 @@ define <8 x i64> @test_mm512_shldi_epi32(<8 x i64> %__A, <8 x i64> %__B) {
 entry:
   %0 = bitcast <8 x i64> %__A to <16 x i32>
   %1 = bitcast <8 x i64> %__B to <16 x i32>
-  %2 = tail call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %0, <16 x i32> %1, i32 31, <16 x i32> zeroinitializer, i16 -1)
+  %2 = tail call <16 x i32> @llvm.x86.avx512.vpshld.d.512(<16 x i32> %0, <16 x i32> %1, i32 31)
   %3 = bitcast <16 x i32> %2 to <8 x i64>
   ret <8 x i64> %3
 }
@@ -422,13 +430,15 @@ define <8 x i64> @test_mm512_mask_shldi_epi16(<8 x i64> %__S, i32 %__U, <8 x i64
 entry:
   %0 = bitcast <8 x i64> %__A to <32 x i16>
   %1 = bitcast <8 x i64> %__B to <32 x i16>
-  %2 = bitcast <8 x i64> %__S to <32 x i16>
-  %3 = tail call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %0, <32 x i16> %1, i32 127, <32 x i16> %2, i32 %__U)
-  %4 = bitcast <32 x i16> %3 to <8 x i64>
-  ret <8 x i64> %4
+  %2 = tail call <32 x i16> @llvm.x86.avx512.vpshld.w.512(<32 x i16> %0, <32 x i16> %1, i32 127)
+  %3 = bitcast <8 x i64> %__S to <32 x i16>
+  %4 = bitcast i32 %__U to <32 x i1>
+  %5 = select <32 x i1> %4, <32 x i16> %2, <32 x i16> %3
+  %6 = bitcast <32 x i16> %5 to <8 x i64>
+  ret <8 x i64> %6
 }
 
-declare <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16>, <32 x i16>, i32, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.vpshld.w.512(<32 x i16>, <32 x i16>, i32)
 
 define <8 x i64> @test_mm512_maskz_shldi_epi16(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_shldi_epi16:
@@ -445,9 +455,11 @@ define <8 x i64> @test_mm512_maskz_shldi_epi16(i32 %__U, <8 x i64> %__A, <8 x i6
 entry:
   %0 = bitcast <8 x i64> %__A to <32 x i16>
   %1 = bitcast <8 x i64> %__B to <32 x i16>
-  %2 = tail call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %0, <32 x i16> %1, i32 63, <32 x i16> zeroinitializer, i32 %__U)
-  %3 = bitcast <32 x i16> %2 to <8 x i64>
-  ret <8 x i64> %3
+  %2 = tail call <32 x i16> @llvm.x86.avx512.vpshld.w.512(<32 x i16> %0, <32 x i16> %1, i32 63)
+  %3 = bitcast i32 %__U to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
+  %5 = bitcast <32 x i16> %4 to <8 x i64>
+  ret <8 x i64> %5
 }
 
 define <8 x i64> @test_mm512_shldi_epi16(<8 x i64> %__A, <8 x i64> %__B) {
@@ -458,7 +470,7 @@ define <8 x i64> @test_mm512_shldi_epi16(<8 x i64> %__A, <8 x i64> %__B) {
 entry:
   %0 = bitcast <8 x i64> %__A to <32 x i16>
   %1 = bitcast <8 x i64> %__B to <32 x i16>
-  %2 = tail call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %0, <32 x i16> %1, i32 31, <32 x i16> zeroinitializer, i32 -1)
+  %2 = tail call <32 x i16> @llvm.x86.avx512.vpshld.w.512(<32 x i16> %0, <32 x i16> %1, i32 31)
   %3 = bitcast <32 x i16> %2 to <8 x i64>
   ret <8 x i64> %3
 }
@@ -477,11 +489,13 @@ define <8 x i64> @test_mm512_mask_shrdi_epi64(<8 x i64> %__S, i8 zeroext %__U, <
 ; X64-NEXT:    vpshrdq $127, %zmm2, %zmm1, %zmm0 {%k1}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 127, <8 x i64> %__S, i8 %__U)
-  ret <8 x i64> %0
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpshrd.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 127)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__S
+  ret <8 x i64> %2
 }
 
-declare <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpshrd.q.512(<8 x i64>, <8 x i64>, i32)
 
 define <8 x i64> @test_mm512_maskz_shrdi_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_shrdi_epi64:
@@ -497,8 +511,10 @@ define <8 x i64> @test_mm512_maskz_shrdi_epi64(i8 zeroext %__U, <8 x i64> %__A,
 ; X64-NEXT:    vpshrdq $63, %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 63, <8 x i64> zeroinitializer, i8 %__U)
-  ret <8 x i64> %0
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpshrd.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 63)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
+  ret <8 x i64> %2
 }
 
 define <8 x i64> @test_mm512_shrdi_epi64(<8 x i64> %__A, <8 x i64> %__B) {
@@ -507,7 +523,7 @@ define <8 x i64> @test_mm512_shrdi_epi64(<8 x i64> %__A, <8 x i64> %__B) {
 ; CHECK-NEXT:    vpshrdq $31, %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
-  %0 = tail call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 31, <8 x i64> zeroinitializer, i8 -1)
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpshrd.q.512(<8 x i64> %__A, <8 x i64> %__B, i32 31)
   ret <8 x i64> %0
 }
 
@@ -526,13 +542,15 @@ define <8 x i64> @test_mm512_mask_shrdi_epi32(<8 x i64> %__S, i16 zeroext %__U,
 entry:
   %0 = bitcast <8 x i64> %__A to <16 x i32>
   %1 = bitcast <8 x i64> %__B to <16 x i32>
-  %2 = bitcast <8 x i64> %__S to <16 x i32>
-  %3 = tail call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %0, <16 x i32> %1, i32 127, <16 x i32> %2, i16 %__U)
-  %4 = bitcast <16 x i32> %3 to <8 x i64>
-  ret <8 x i64> %4
+  %2 = tail call <16 x i32> @llvm.x86.avx512.vpshrd.d.512(<16 x i32> %0, <16 x i32> %1, i32 127)
+  %3 = bitcast <8 x i64> %__S to <16 x i32>
+  %4 = bitcast i16 %__U to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
+  %6 = bitcast <16 x i32> %5 to <8 x i64>
+  ret <8 x i64> %6
 }
 
-declare <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpshrd.d.512(<16 x i32>, <16 x i32>, i32)
 
 define <8 x i64> @test_mm512_maskz_shrdi_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_shrdi_epi32:
@@ -549,9 +567,11 @@ define <8 x i64> @test_mm512_maskz_shrdi_epi32(i16 zeroext %__U, <8 x i64> %__A,
 entry:
   %0 = bitcast <8 x i64> %__A to <16 x i32>
   %1 = bitcast <8 x i64> %__B to <16 x i32>
-  %2 = tail call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %0, <16 x i32> %1, i32 63, <16 x i32> zeroinitializer, i16 %__U)
-  %3 = bitcast <16 x i32> %2 to <8 x i64>
-  ret <8 x i64> %3
+  %2 = tail call <16 x i32> @llvm.x86.avx512.vpshrd.d.512(<16 x i32> %0, <16 x i32> %1, i32 63)
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
+  %5 = bitcast <16 x i32> %4 to <8 x i64>
+  ret <8 x i64> %5
 }
 
 define <8 x i64> @test_mm512_shrdi_epi32(<8 x i64> %__A, <8 x i64> %__B) {
@@ -562,7 +582,7 @@ define <8 x i64> @test_mm512_shrdi_epi32(<8 x i64> %__A, <8 x i64> %__B) {
 entry:
   %0 = bitcast <8 x i64> %__A to <16 x i32>
   %1 = bitcast <8 x i64> %__B to <16 x i32>
-  %2 = tail call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %0, <16 x i32> %1, i32 31, <16 x i32> zeroinitializer, i16 -1)
+  %2 = tail call <16 x i32> @llvm.x86.avx512.vpshrd.d.512(<16 x i32> %0, <16 x i32> %1, i32 31)
   %3 = bitcast <16 x i32> %2 to <8 x i64>
   ret <8 x i64> %3
 }
@@ -582,13 +602,15 @@ define <8 x i64> @test_mm512_mask_shrdi_epi16(<8 x i64> %__S, i32 %__U, <8 x i64
 entry:
   %0 = bitcast <8 x i64> %__A to <32 x i16>
   %1 = bitcast <8 x i64> %__B to <32 x i16>
-  %2 = bitcast <8 x i64> %__S to <32 x i16>
-  %3 = tail call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %0, <32 x i16> %1, i32 127, <32 x i16> %2, i32 %__U)
-  %4 = bitcast <32 x i16> %3 to <8 x i64>
-  ret <8 x i64> %4
+  %2 = tail call <32 x i16> @llvm.x86.avx512.vpshrd.w.512(<32 x i16> %0, <32 x i16> %1, i32 127)
+  %3 = bitcast <8 x i64> %__S to <32 x i16>
+  %4 = bitcast i32 %__U to <32 x i1>
+  %5 = select <32 x i1> %4, <32 x i16> %2, <32 x i16> %3
+  %6 = bitcast <32 x i16> %5 to <8 x i64>
+  ret <8 x i64> %6
 }
 
-declare <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16>, <32 x i16>, i32, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.vpshrd.w.512(<32 x i16>, <32 x i16>, i32)
 
 define <8 x i64> @test_mm512_maskz_shrdi_epi16(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X86-LABEL: test_mm512_maskz_shrdi_epi16:
@@ -605,9 +627,11 @@ define <8 x i64> @test_mm512_maskz_shrdi_epi16(i32 %__U, <8 x i64> %__A, <8 x i6
 entry:
   %0 = bitcast <8 x i64> %__A to <32 x i16>
   %1 = bitcast <8 x i64> %__B to <32 x i16>
-  %2 = tail call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %0, <32 x i16> %1, i32 63, <32 x i16> zeroinitializer, i32 %__U)
-  %3 = bitcast <32 x i16> %2 to <8 x i64>
-  ret <8 x i64> %3
+  %2 = tail call <32 x i16> @llvm.x86.avx512.vpshrd.w.512(<32 x i16> %0, <32 x i16> %1, i32 63)
+  %3 = bitcast i32 %__U to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
+  %5 = bitcast <32 x i16> %4 to <8 x i64>
+  ret <8 x i64> %5
 }
 
 define <8 x i64> @test_mm512_shrdi_epi16(<8 x i64> %__A, <8 x i64> %__B) {
@@ -618,7 +642,7 @@ define <8 x i64> @test_mm512_shrdi_epi16(<8 x i64> %__A, <8 x i64> %__B) {
 entry:
   %0 = bitcast <8 x i64> %__A to <32 x i16>
   %1 = bitcast <8 x i64> %__B to <32 x i16>
-  %2 = tail call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %0, <32 x i16> %1, i32 31, <32 x i16> zeroinitializer, i32 -1)
+  %2 = tail call <32 x i16> @llvm.x86.avx512.vpshrd.w.512(<32 x i16> %0, <32 x i16> %1, i32 31)
   %3 = bitcast <32 x i16> %2 to <8 x i64>
   ret <8 x i64> %3
 }
diff --git a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll
index e1b112924185..319008f4f29a 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll
@@ -279,3 +279,143 @@ define void @test_compress_store_b_512(i8* %addr, <64 x i8> %data) {
   call void @llvm.x86.avx512.mask.compress.store.b.512(i8* %addr, <64 x i8> %data, i64 -1)
   ret void
 }
+
+define <16 x i32>@test_int_x86_avx512_mask_vpshld_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x71,0xd9,0x16]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x71,0xd1,0x16]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x71,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x71,0xd1,0x16]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
+  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
+  %res2 = add <16 x i32> %res, %res1
+  ret <16 x i32> %res2
+}
+declare <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpshld_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshld_q_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x71,0xd9,0x16]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x71,0xd1,0x16]
+; X86-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x71,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x71,0xd1,0x16]
+; X64-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
+  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
+  %res2 = add <8 x i64> %res, %res1
+  ret <8 x i64> %res2
+}
+declare <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+
+define <32 x i16>@test_int_x86_avx512_mask_vpshld_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x3, i32 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshld_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x70,0xd9,0x16]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x70,0xd1,0x16]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x70,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x70,0xd1,0x16]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 %x4)
+  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 -1)
+  %res2 = add <32 x i16> %res, %res1
+  ret <32 x i16> %res2
+}
+declare <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16>, <32 x i16>, i32, <32 x i16>, i32)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpshrd_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x73,0xd9,0x16]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x73,0xd1,0x16]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x73,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x73,0xd1,0x16]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
+  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
+  %res2 = add <16 x i32> %res, %res1
+  ret <16 x i32> %res2
+}
+declare <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpshrd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshrd_q_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x73,0xd9,0x16]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x73,0xd1,0x16]
+; X86-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x73,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x73,0xd1,0x16]
+; X64-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
+  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
+  %res2 = add <8 x i64> %res, %res1
+  ret <8 x i64> %res2
+}
+declare <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+
+define <32 x i16>@test_int_x86_avx512_mask_vpshrd_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x3, i32 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vpshrd_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x72,0xd9,0x16]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x72,0xd1,0x16]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x72,0xd9,0x16]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x72,0xd1,0x16]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 %x4)
+  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 -1)
+  %res2 = add <32 x i16> %res, %res1
+  ret <32 x i16> %res2
+}
+declare <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16>, <32 x i16>, i32, <32 x i16>, i32)
diff --git a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll
index 5dbc27f59c90..fd98c16dab6f 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll
@@ -97,142 +97,154 @@ declare <64 x i8> @llvm.x86.avx512.mask.expand.b.512(<64 x i8> %data, <64 x i8>
 
 define <16 x i32>@test_int_x86_avx512_mask_vpshld_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x71,0xd9,0x16]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x71,0xd1,0x16]
-; X86-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x71,0xc1,0x16]
-; X86-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x71,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x71,0xd1,0x16]
-; X64-NEXT:    vpshldd $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x71,0xc1,0x16]
-; X64-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
-  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
-  %res2 = add <16 x i32> %res, %res1
+  %1 = call <16 x i32> @llvm.x86.avx512.vpshld.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22)
+  %2 = bitcast i16 %x4 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x3
+  %4 = call <16 x i32> @llvm.x86.avx512.vpshld.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22)
+  %res2 = add <16 x i32> %3, %4
   ret <16 x i32> %res2
 }
-declare <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpshld.d.512(<16 x i32>, <16 x i32>, i32)
 
 define <8 x i64>@test_int_x86_avx512_mask_vpshld_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_q_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x71,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x71,0xd1,0x16]
-; X86-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x71,0xc1,0x16]
-; X86-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X86-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x71,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x71,0xd1,0x16]
-; X64-NEXT:    vpshldq $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x71,0xc1,0x16]
-; X64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X64-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
-  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
-  %res2 = add <8 x i64> %res, %res1
+  %1 = call <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x3
+  %4 = call <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22)
+  %res2 = add <8 x i64> %3, %4
   ret <8 x i64> %res2
 }
-declare <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpshld.q.512(<8 x i64>, <8 x i64>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_vpshld_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x3, i32 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_w_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x70,0xd9,0x16]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x70,0xd1,0x16]
-; X86-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x70,0xc1,0x16]
-; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x70,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x70,0xd1,0x16]
-; X64-NEXT:    vpshldw $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x70,0xc1,0x16]
-; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 %x4)
-  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 -1)
-  %res2 = add <32 x i16> %res, %res1
+  %1 = call <32 x i16> @llvm.x86.avx512.vpshld.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22)
+  %2 = bitcast i32 %x4 to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x3
+  %4 = call <32 x i16> @llvm.x86.avx512.vpshld.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22)
+  %res2 = add <32 x i16> %3, %4
   ret <32 x i16> %res2
 }
-declare <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16>, <32 x i16>, i32, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.vpshld.w.512(<32 x i16>, <32 x i16>, i32)
 
 define <16 x i32>@test_int_x86_avx512_mask_vpshrd_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x73,0xd9,0x16]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x73,0xd1,0x16]
-; X86-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x73,0xc1,0x16]
-; X86-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x73,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x73,0xd1,0x16]
-; X64-NEXT:    vpshrdd $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x73,0xc1,0x16]
-; X64-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
-  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
-  %res2 = add <16 x i32> %res, %res1
+  %1 = call <16 x i32> @llvm.x86.avx512.vpshrd.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22)
+  %2 = bitcast i16 %x4 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x3
+  %4 = call <16 x i32> @llvm.x86.avx512.vpshrd.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22)
+  %res2 = add <16 x i32> %3, %4
   ret <16 x i32> %res2
 }
-declare <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpshrd.d.512(<16 x i32>, <16 x i32>, i32)
 
 define <8 x i64>@test_int_x86_avx512_mask_vpshrd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_q_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x73,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x73,0xd1,0x16]
-; X86-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x73,0xc1,0x16]
-; X86-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X86-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x73,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x73,0xd1,0x16]
-; X64-NEXT:    vpshrdq $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x73,0xc1,0x16]
-; X64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X64-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
-  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
-  %res2 = add <8 x i64> %res, %res1
+  %1 = call <8 x i64> @llvm.x86.avx512.vpshrd.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x3
+  %4 = call <8 x i64> @llvm.x86.avx512.vpshrd.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22)
+  %res2 = add <8 x i64> %3, %4
   ret <8 x i64> %res2
 }
-declare <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpshrd.q.512(<8 x i64>, <8 x i64>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_vpshrd_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x3, i32 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_w_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x72,0xd9,0x16]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x72,0xd1,0x16]
-; X86-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x72,0xc1,0x16]
-; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x72,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x72,0xd1,0x16]
-; X64-NEXT:    vpshrdw $22, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x72,0xc1,0x16]
-; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 %x4)
-  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 -1)
-  %res2 = add <32 x i16> %res, %res1
+  %1 = call <32 x i16> @llvm.x86.avx512.vpshrd.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22)
+  %2 = bitcast i32 %x4 to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x3
+  %4 = call <32 x i16> @llvm.x86.avx512.vpshrd.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22)
+  %res2 = add <32 x i16> %3, %4
   ret <32 x i16> %res2
 }
-declare <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16>, <32 x i16>, i32, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.vpshrd.w.512(<32 x i16>, <32 x i16>, i32)
 
 declare <16 x i32> @llvm.x86.avx512.mask.vpshrdv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 declare <16 x i32> @llvm.x86.avx512.maskz.vpshrdv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
diff --git a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-fast-isel.ll
index 963609542213..392ea0b21f55 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-fast-isel.ll
@@ -591,11 +591,14 @@ define <4 x i64> @test_mm256_mask_shldi_epi64(<4 x i64> %__S, i8 zeroext %__U, <
 ; X64-NEXT:    vpshldq $127, %ymm2, %ymm1, %ymm0 {%k1}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 127, <4 x i64> %__S, i8 %__U)
-  ret <4 x i64> %0
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpshld.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 127)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__S
+  ret <4 x i64> %2
 }
 
-declare <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.vpshld.q.256(<4 x i64>, <4 x i64>, i32)
 
 define <4 x i64> @test_mm256_maskz_shldi_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
 ; X86-LABEL: test_mm256_maskz_shldi_epi64:
@@ -611,8 +614,11 @@ define <4 x i64> @test_mm256_maskz_shldi_epi64(i8 zeroext %__U, <4 x i64> %__A,
 ; X64-NEXT:    vpshldq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 63, <4 x i64> zeroinitializer, i8 %__U)
-  ret <4 x i64> %0
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpshld.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 63)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
+  ret <4 x i64> %2
 }
 
 define <4 x i64> @test_mm256_shldi_epi64(<4 x i64> %__A, <4 x i64> %__B) {
@@ -621,7 +627,7 @@ define <4 x i64> @test_mm256_shldi_epi64(<4 x i64> %__A, <4 x i64> %__B) {
 ; CHECK-NEXT:    vpshldq $31, %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
-  %0 = tail call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 31, <4 x i64> zeroinitializer, i8 -1)
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpshld.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 31)
   ret <4 x i64> %0
 }
 
@@ -639,11 +645,14 @@ define <2 x i64> @test_mm_mask_shldi_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x
 ; X64-NEXT:    vpshldq $127, %xmm2, %xmm1, %xmm0 {%k1}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 127, <2 x i64> %__S, i8 %__U)
-  ret <2 x i64> %0
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpshld.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 127)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__S
+  ret <2 x i64> %2
 }
 
-declare <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.vpshld.q.128(<2 x i64>, <2 x i64>, i32) #3
 
 define <2 x i64> @test_mm_maskz_shldi_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
 ; X86-LABEL: test_mm_maskz_shldi_epi64:
@@ -659,8 +668,11 @@ define <2 x i64> @test_mm_maskz_shldi_epi64(i8 zeroext %__U, <2 x i64> %__A, <2
 ; X64-NEXT:    vpshldq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
-  %0 = tail call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 63, <2 x i64> zeroinitializer, i8 %__U)
-  ret <2 x i64> %0
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpshld.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 63)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
+  ret <2 x i64> %2
 }
 
 define <2 x i64> @test_mm_shldi_epi64(<2 x i64> %__A, <2 x i64> %__B) {
@@ -669,7 +681,7 @@ define <2 x i64> @test_mm_shldi_epi64(<2 x i64> %__A, <2 x i64> %__B) {
 ; CHECK-NEXT:    vpshldq $31, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
-  %0 = tail call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 31, <2 x i64> zeroinitializer, i8 -1)
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpshld.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 31)
   ret <2 x i64> %0
 }
 
@@ -689,13 +701,15 @@ define <4 x i64> @test_mm256_mask_shldi_epi32(<4 x i64> %__S, i8 zeroext %__U, <
 entry:
   %0 = bitcast <4 x i64> %__A to <8 x i32>
   %1 = bitcast <4 x i64> %__B to <8 x i32>
-  %2 = bitcast <4 x i64> %__S to <8 x i32>
-  %3 = tail call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %0, <8 x i32> %1, i32 127, <8 x i32> %2, i8 %__U)
-  %4 = bitcast <8 x i32> %3 to <4 x i64>
-  ret <4 x i64> %4
+  %2 = tail call <8 x i32> @llvm.x86.avx512.vpshld.d.256(<8 x i32> %0, <8 x i32> %1, i32 127)
+  %3 = bitcast <4 x i64> %__S to <8 x i32>
+  %4 = bitcast i8 %__U to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
+  %6 = bitcast <8 x i32> %5 to <4 x i64>
+  ret <4 x i64> %6
 }
 
-declare <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.vpshld.d.256(<8 x i32>, <8 x i32>, i32)
 
 define <4 x i64> @test_mm256_maskz_shldi_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
 ; X86-LABEL: test_mm256_maskz_shldi_epi32:
@@ -713,9 +727,11 @@ define <4 x i64> @test_mm256_maskz_shldi_epi32(i8 zeroext %__U, <4 x i64> %__A,
 entry:
   %0 = bitcast <4 x i64> %__A to <8 x i32>
   %1 = bitcast <4 x i64> %__B to <8 x i32>
-  %2 = tail call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %0, <8 x i32> %1, i32 63, <8 x i32> zeroinitializer, i8 %__U)
-  %3 = bitcast <8 x i32> %2 to <4 x i64>
-  ret <4 x i64> %3
+  %2 = tail call <8 x i32> @llvm.x86.avx512.vpshld.d.256(<8 x i32> %0, <8 x i32> %1, i32 63)
+  %3 = bitcast i8 %__U to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
+  %5 = bitcast <8 x i32> %4 to <4 x i64>
+  ret <4 x i64> %5
 }
 
 define <4 x i64> @test_mm256_shldi_epi32(<4 x i64> %__A, <4 x i64> %__B) {
@@ -726,7 +742,7 @@ define <4 x i64> @test_mm256_shldi_epi32(<4 x i64> %__A, <4 x i64> %__B) {
 entry:
   %0 = bitcast <4 x i64> %__A to <8 x i32>
   %1 = bitcast <4 x i64> %__B to <8 x i32>
-  %2 = tail call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %0, <8 x i32> %1, i32 31, <8 x i32> zeroinitializer, i8 -1)
+  %2 = tail call <8 x i32> @llvm.x86.avx512.vpshld.d.256(<8 x i32> %0, <8 x i32> %1, i32 31)
   %3 = bitcast <8 x i32> %2 to <4 x i64>
   ret <4 x i64> %3
 }
@@ -747,13 +763,16 @@ define <2 x i64> @test_mm_mask_shldi_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x
 entry:
   %0 = bitcast <2 x i64> %__A to <4 x i32>
   %1 = bitcast <2 x i64> %__B to <4 x i32>
-  %2 = bitcast <2 x i64> %__S to <4 x i32>
-  %3 = tail call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %0, <4 x i32> %1, i32 127, <4 x i32> %2, i8 %__U)
-  %4 = bitcast <4 x i32> %3 to <2 x i64>
-  ret <2 x i64> %4
+  %2 = tail call <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32> %0, <4 x i32> %1, i32 127)
+  %3 = bitcast <2 x i64> %__S to <4 x i32>
+  %4 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> %3
+  %6 = bitcast <4 x i32> %5 to <2 x i64>
+  ret <2 x i64> %6
 }
 
-declare <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32>, <4 x i32>, i32)
 
 define <2 x i64> @test_mm_maskz_shldi_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
 ; X86-LABEL: test_mm_maskz_shldi_epi32:
@@ -771,9 +790,12 @@ define <2 x i64> @test_mm_maskz_shldi_epi32(i8 zeroext %__U, <2 x i64> %__A, <2
 entry:
   %0 = bitcast <2 x i64> %__A to <4 x i32>
   %1 = bitcast <2 x i64> %__B to <4 x i32>
-  %2 = tail call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %0, <4 x i32> %1, i32 63, <4 x i32> zeroinitializer, i8 %__U)
-  %3 = bitcast <4 x i32> %2 to <2 x i64>
-  ret <2 x i64> %3
+  %2 = tail call <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32> %0, <4 x i32> %1, i32 63)
+  %3 = bitcast i8 %__U to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> zeroinitializer
+  %5 = bitcast <4 x i32> %4 to <2 x i64>
+  ret <2 x i64> %5
 }
 
 define <2 x i64> @test_mm_shldi_epi32(<2 x i64> %__A, <2 x i64> %__B) {
@@ -784,7 +806,7 @@ define <2 x i64> @test_mm_shldi_epi32(<2 x i64> %__A, <2 x i64> %__B) {
 entry:
   %0 = bitcast <2 x i64> %__A to <4 x i32>
   %1 = bitcast <2 x i64> %__B to <4 x i32>
-  %2 = tail call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %0, <4 x i32> %1, i32 31, <4 x i32> zeroinitializer, i8 -1)
+  %2 = tail call <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32> %0, <4 x i32> %1, i32 31)
   %3 = bitcast <4 x i32> %2 to <2 x i64>
   ret <2 x i64> %3
 }
@@ -804,13 +826,15 @@ define <4 x i64> @test_mm256_mask_shldi_epi16(<4 x i64> %__S, i16 zeroext %__U,
 entry:
   %0 = bitcast <4 x i64> %__A to <16 x i16>
   %1 = bitcast <4 x i64> %__B to <16 x i16>
-  %2 = bitcast <4 x i64> %__S to <16 x i16>
-  %3 = tail call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %0, <16 x i16> %1, i32 127, <16 x i16> %2, i16 %__U)
-  %4 = bitcast <16 x i16> %3 to <4 x i64>
-  ret <4 x i64> %4
+  %2 = tail call <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16> %0, <16 x i16> %1, i32 127)
+  %3 = bitcast <4 x i64> %__S to <16 x i16>
+  %4 = bitcast i16 %__U to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i16> %2, <16 x i16> %3
+  %6 = bitcast <16 x i16> %5 to <4 x i64>
+  ret <4 x i64> %6
 }
 
-declare <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16>, <16 x i16>, i32, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16>, <16 x i16>, i32)
 
 define <4 x i64> @test_mm256_maskz_shldi_epi16(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
 ; X86-LABEL: test_mm256_maskz_shldi_epi16:
@@ -827,9 +851,11 @@ define <4 x i64> @test_mm256_maskz_shldi_epi16(i16 zeroext %__U, <4 x i64> %__A,
 entry:
   %0 = bitcast <4 x i64> %__A to <16 x i16>
   %1 = bitcast <4 x i64> %__B to <16 x i16>
-  %2 = tail call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %0, <16 x i16> %1, i32 63, <16 x i16> zeroinitializer, i16 %__U)
-  %3 = bitcast <16 x i16> %2 to <4 x i64>
-  ret <4 x i64> %3
+  %2 = tail call <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16> %0, <16 x i16> %1, i32 63)
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
+  %5 = bitcast <16 x i16> %4 to <4 x i64>
+  ret <4 x i64> %5
 }
 
 define <4 x i64> @test_mm256_shldi_epi16(<4 x i64> %__A, <4 x i64> %__B) {
@@ -840,7 +866,7 @@ define <4 x i64> @test_mm256_shldi_epi16(<4 x i64> %__A, <4 x i64> %__B) {
 entry:
   %0 = bitcast <4 x i64> %__A to <16 x i16>
   %1 = bitcast <4 x i64> %__B to <16 x i16>
-  %2 = tail call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %0, <16 x i16> %1, i32 31, <16 x i16> zeroinitializer, i16 -1)
+  %2 = tail call <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16> %0, <16 x i16> %1, i32 31)
   %3 = bitcast <16 x i16> %2 to <4 x i64>
   ret <4 x i64> %3
 }
@@ -861,13 +887,15 @@ define <2 x i64> @test_mm_mask_shldi_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x
 entry:
   %0 = bitcast <2 x i64> %__A to <8 x i16>
   %1 = bitcast <2 x i64> %__B to <8 x i16>
-  %2 = bitcast <2 x i64> %__S to <8 x i16>
-  %3 = tail call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %0, <8 x i16> %1, i32 127, <8 x i16> %2, i8 %__U)
-  %4 = bitcast <8 x i16> %3 to <2 x i64>
-  ret <2 x i64> %4
+  %2 = tail call <8 x i16> @llvm.x86.avx512.vpshld.w.128(<8 x i16> %0, <8 x i16> %1, i32 127)
+  %3 = bitcast <2 x i64> %__S to <8 x i16>
+  %4 = bitcast i8 %__U to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x i16> %2, <8 x i16> %3
+ %6 = bitcast <8 x i16> %5 to <2 x i64> + ret <2 x i64> %6 } -declare <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16>, <8 x i16>, i32, <8 x i16>, i8) +declare <8 x i16> @llvm.x86.avx512.vpshld.w.128(<8 x i16>, <8 x i16>, i32) define <2 x i64> @test_mm_maskz_shldi_epi16(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { ; X86-LABEL: test_mm_maskz_shldi_epi16: @@ -885,9 +913,11 @@ define <2 x i64> @test_mm_maskz_shldi_epi16(i8 zeroext %__U, <2 x i64> %__A, <2 entry: %0 = bitcast <2 x i64> %__A to <8 x i16> %1 = bitcast <2 x i64> %__B to <8 x i16> - %2 = tail call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %0, <8 x i16> %1, i32 63, <8 x i16> zeroinitializer, i8 %__U) - %3 = bitcast <8 x i16> %2 to <2 x i64> - ret <2 x i64> %3 + %2 = tail call <8 x i16> @llvm.x86.avx512.vpshld.w.128(<8 x i16> %0, <8 x i16> %1, i32 63) + %3 = bitcast i8 %__U to <8 x i1> + %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer + %5 = bitcast <8 x i16> %4 to <2 x i64> + ret <2 x i64> %5 } define <2 x i64> @test_mm_shldi_epi16(<2 x i64> %__A, <2 x i64> %__B) { @@ -898,7 +928,7 @@ define <2 x i64> @test_mm_shldi_epi16(<2 x i64> %__A, <2 x i64> %__B) { entry: %0 = bitcast <2 x i64> %__A to <8 x i16> %1 = bitcast <2 x i64> %__B to <8 x i16> - %2 = tail call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %0, <8 x i16> %1, i32 31, <8 x i16> zeroinitializer, i8 -1) + %2 = tail call <8 x i16> @llvm.x86.avx512.vpshld.w.128(<8 x i16> %0, <8 x i16> %1, i32 31) %3 = bitcast <8 x i16> %2 to <2 x i64> ret <2 x i64> %3 } @@ -917,11 +947,14 @@ define <4 x i64> @test_mm256_mask_shrdi_epi64(<4 x i64> %__S, i8 zeroext %__U, < ; X64-NEXT: vpshrdq $127, %ymm2, %ymm1, %ymm0 {%k1} ; X64-NEXT: retq entry: - %0 = tail call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 127, <4 x i64> %__S, i8 %__U) - ret <4 x i64> %0 + %0 = tail call <4 x i64> @llvm.x86.avx512.vpshrd.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 127) + %1 = bitcast i8 %__U to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__S + ret <4 x i64> %2 } -declare <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8) +declare <4 x i64> @llvm.x86.avx512.vpshrd.q.256(<4 x i64>, <4 x i64>, i32) define <4 x i64> @test_mm256_maskz_shrdi_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { ; X86-LABEL: test_mm256_maskz_shrdi_epi64: @@ -937,8 +970,11 @@ define <4 x i64> @test_mm256_maskz_shrdi_epi64(i8 zeroext %__U, <4 x i64> %__A, ; X64-NEXT: vpshrdq $63, %ymm1, %ymm0, %ymm0 {%k1} {z} ; X64-NEXT: retq entry: - %0 = tail call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 63, <4 x i64> zeroinitializer, i8 %__U) - ret <4 x i64> %0 + %0 = tail call <4 x i64> @llvm.x86.avx512.vpshrd.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 63) + %1 = bitcast i8 %__U to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer + ret <4 x i64> %2 } define <4 x i64> @test_mm256_shrdi_epi64(<4 x i64> %__A, <4 x i64> %__B) { @@ -947,7 +983,7 @@ define <4 x i64> @test_mm256_shrdi_epi64(<4 x i64> %__A, <4 x i64> %__B) { ; CHECK-NEXT: vpshrdq $31, %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} entry: - %0 = tail call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %__A, <4 x i64> %__B, i32 31, <4 x i64> zeroinitializer, i8 -1) + %0 = tail call <4 x i64> @llvm.x86.avx512.vpshrd.q.256(<4 x i64> 
%__A, <4 x i64> %__B, i32 31) ret <4 x i64> %0 } @@ -965,11 +1001,14 @@ define <2 x i64> @test_mm_mask_shrdi_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x ; X64-NEXT: vpshrdq $127, %xmm2, %xmm1, %xmm0 {%k1} ; X64-NEXT: retq entry: - %0 = tail call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 127, <2 x i64> %__S, i8 %__U) - ret <2 x i64> %0 + %0 = tail call <2 x i64> @llvm.x86.avx512.vpshrd.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 127) + %1 = bitcast i8 %__U to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__S + ret <2 x i64> %2 } -declare <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8) +declare <2 x i64> @llvm.x86.avx512.vpshrd.q.128(<2 x i64>, <2 x i64>, i32) define <2 x i64> @test_mm_maskz_shrdi_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { ; X86-LABEL: test_mm_maskz_shrdi_epi64: @@ -985,8 +1024,11 @@ define <2 x i64> @test_mm_maskz_shrdi_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 ; X64-NEXT: vpshrdq $63, %xmm1, %xmm0, %xmm0 {%k1} {z} ; X64-NEXT: retq entry: - %0 = tail call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 63, <2 x i64> zeroinitializer, i8 %__U) - ret <2 x i64> %0 + %0 = tail call <2 x i64> @llvm.x86.avx512.vpshrd.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 63) + %1 = bitcast i8 %__U to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer + ret <2 x i64> %2 } define <2 x i64> @test_mm_shrdi_epi64(<2 x i64> %__A, <2 x i64> %__B) { @@ -995,7 +1037,7 @@ define <2 x i64> @test_mm_shrdi_epi64(<2 x i64> %__A, <2 x i64> %__B) { ; CHECK-NEXT: vpshrdq $31, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} entry: - %0 = tail call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 31, <2 x i64> zeroinitializer, i8 -1) + %0 = tail call <2 x i64> @llvm.x86.avx512.vpshrd.q.128(<2 x i64> %__A, <2 x i64> %__B, i32 31) ret <2 x i64> %0 } @@ -1015,13 +1057,15 @@ define <4 x i64> @test_mm256_mask_shrdi_epi32(<4 x i64> %__S, i8 zeroext %__U, < entry: %0 = bitcast <4 x i64> %__A to <8 x i32> %1 = bitcast <4 x i64> %__B to <8 x i32> - %2 = bitcast <4 x i64> %__S to <8 x i32> - %3 = tail call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %0, <8 x i32> %1, i32 127, <8 x i32> %2, i8 %__U) - %4 = bitcast <8 x i32> %3 to <4 x i64> - ret <4 x i64> %4 + %2 = tail call <8 x i32> @llvm.x86.avx512.vpshrd.d.256(<8 x i32> %0, <8 x i32> %1, i32 127) + %3 = bitcast <4 x i64> %__S to <8 x i32> + %4 = bitcast i8 %__U to <8 x i1> + %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3 + %6 = bitcast <8 x i32> %5 to <4 x i64> + ret <4 x i64> %6 } -declare <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8) +declare <8 x i32> @llvm.x86.avx512.vpshrd.d.256(<8 x i32>, <8 x i32>, i32) define <4 x i64> @test_mm256_maskz_shrdi_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { ; X86-LABEL: test_mm256_maskz_shrdi_epi32: @@ -1039,9 +1083,11 @@ define <4 x i64> @test_mm256_maskz_shrdi_epi32(i8 zeroext %__U, <4 x i64> %__A, entry: %0 = bitcast <4 x i64> %__A to <8 x i32> %1 = bitcast <4 x i64> %__B to <8 x i32> - %2 = tail call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %0, <8 x i32> %1, i32 63, <8 x i32> zeroinitializer, i8 %__U) - %3 = bitcast <8 x i32> %2 to <4 x i64> - ret <4 x i64> %3 + %2 = tail call <8 x i32> 
@llvm.x86.avx512.vpshrd.d.256(<8 x i32> %0, <8 x i32> %1, i32 63) + %3 = bitcast i8 %__U to <8 x i1> + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer + %5 = bitcast <8 x i32> %4 to <4 x i64> + ret <4 x i64> %5 } define <4 x i64> @test_mm256_shrdi_epi32(<4 x i64> %__A, <4 x i64> %__B) { @@ -1052,7 +1098,7 @@ define <4 x i64> @test_mm256_shrdi_epi32(<4 x i64> %__A, <4 x i64> %__B) { entry: %0 = bitcast <4 x i64> %__A to <8 x i32> %1 = bitcast <4 x i64> %__B to <8 x i32> - %2 = tail call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %0, <8 x i32> %1, i32 31, <8 x i32> zeroinitializer, i8 -1) + %2 = tail call <8 x i32> @llvm.x86.avx512.vpshrd.d.256(<8 x i32> %0, <8 x i32> %1, i32 31) %3 = bitcast <8 x i32> %2 to <4 x i64> ret <4 x i64> %3 } @@ -1073,13 +1119,16 @@ define <2 x i64> @test_mm_mask_shrdi_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x entry: %0 = bitcast <2 x i64> %__A to <4 x i32> %1 = bitcast <2 x i64> %__B to <4 x i32> - %2 = bitcast <2 x i64> %__S to <4 x i32> - %3 = tail call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %0, <4 x i32> %1, i32 127, <4 x i32> %2, i8 %__U) - %4 = bitcast <4 x i32> %3 to <2 x i64> - ret <2 x i64> %4 + %2 = tail call <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32> %0, <4 x i32> %1, i32 127) + %3 = bitcast <2 x i64> %__S to <4 x i32> + %4 = bitcast i8 %__U to <8 x i1> + %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %5 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> %3 + %6 = bitcast <4 x i32> %5 to <2 x i64> + ret <2 x i64> %6 } -declare <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8) +declare <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32>, <4 x i32>, i32) define <2 x i64> @test_mm_maskz_shrdi_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { ; X86-LABEL: test_mm_maskz_shrdi_epi32: @@ -1097,9 +1146,12 @@ define <2 x i64> @test_mm_maskz_shrdi_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 entry: %0 = bitcast <2 x i64> %__A to <4 x i32> %1 = bitcast <2 x i64> %__B to <4 x i32> - %2 = tail call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %0, <4 x i32> %1, i32 63, <4 x i32> zeroinitializer, i8 %__U) - %3 = bitcast <4 x i32> %2 to <2 x i64> - ret <2 x i64> %3 + %2 = tail call <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32> %0, <4 x i32> %1, i32 63) + %3 = bitcast i8 %__U to <8 x i1> + %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %4 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> zeroinitializer + %5 = bitcast <4 x i32> %4 to <2 x i64> + ret <2 x i64> %5 } define <2 x i64> @test_mm_shrdi_epi32(<2 x i64> %__A, <2 x i64> %__B) { @@ -1110,7 +1162,7 @@ define <2 x i64> @test_mm_shrdi_epi32(<2 x i64> %__A, <2 x i64> %__B) { entry: %0 = bitcast <2 x i64> %__A to <4 x i32> %1 = bitcast <2 x i64> %__B to <4 x i32> - %2 = tail call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %0, <4 x i32> %1, i32 31, <4 x i32> zeroinitializer, i8 -1) + %2 = tail call <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32> %0, <4 x i32> %1, i32 31) %3 = bitcast <4 x i32> %2 to <2 x i64> ret <2 x i64> %3 } @@ -1130,13 +1182,15 @@ define <4 x i64> @test_mm256_mask_shrdi_epi16(<4 x i64> %__S, i16 zeroext %__U, entry: %0 = bitcast <4 x i64> %__A to <16 x i16> %1 = bitcast <4 x i64> %__B to <16 x i16> - %2 = bitcast <4 x i64> %__S to <16 x i16> - %3 = tail call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %0, <16 x i16> %1, i32 127, <16 x i16> %2, i16 %__U) - %4 = bitcast <16 x i16> %3 to <4 x i64> - ret <4 x i64> %4 + %2 = tail 
call <16 x i16> @llvm.x86.avx512.vpshrd.w.256(<16 x i16> %0, <16 x i16> %1, i32 127) + %3 = bitcast <4 x i64> %__S to <16 x i16> + %4 = bitcast i16 %__U to <16 x i1> + %5 = select <16 x i1> %4, <16 x i16> %2, <16 x i16> %3 + %6 = bitcast <16 x i16> %5 to <4 x i64> + ret <4 x i64> %6 } -declare <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16>, <16 x i16>, i32, <16 x i16>, i16) +declare <16 x i16> @llvm.x86.avx512.vpshrd.w.256(<16 x i16>, <16 x i16>, i32) define <4 x i64> @test_mm256_maskz_shrdi_epi16(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { ; X86-LABEL: test_mm256_maskz_shrdi_epi16: @@ -1153,9 +1207,11 @@ define <4 x i64> @test_mm256_maskz_shrdi_epi16(i16 zeroext %__U, <4 x i64> %__A, entry: %0 = bitcast <4 x i64> %__A to <16 x i16> %1 = bitcast <4 x i64> %__B to <16 x i16> - %2 = tail call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %0, <16 x i16> %1, i32 63, <16 x i16> zeroinitializer, i16 %__U) - %3 = bitcast <16 x i16> %2 to <4 x i64> - ret <4 x i64> %3 + %2 = tail call <16 x i16> @llvm.x86.avx512.vpshrd.w.256(<16 x i16> %0, <16 x i16> %1, i32 63) + %3 = bitcast i16 %__U to <16 x i1> + %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer + %5 = bitcast <16 x i16> %4 to <4 x i64> + ret <4 x i64> %5 } define <4 x i64> @test_mm256_shrdi_epi16(<4 x i64> %__A, <4 x i64> %__B) { @@ -1166,7 +1222,7 @@ define <4 x i64> @test_mm256_shrdi_epi16(<4 x i64> %__A, <4 x i64> %__B) { entry: %0 = bitcast <4 x i64> %__A to <16 x i16> %1 = bitcast <4 x i64> %__B to <16 x i16> - %2 = tail call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %0, <16 x i16> %1, i32 31, <16 x i16> zeroinitializer, i16 -1) + %2 = tail call <16 x i16> @llvm.x86.avx512.vpshrd.w.256(<16 x i16> %0, <16 x i16> %1, i32 31) %3 = bitcast <16 x i16> %2 to <4 x i64> ret <4 x i64> %3 } @@ -1187,13 +1243,15 @@ define <2 x i64> @test_mm_mask_shrdi_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x entry: %0 = bitcast <2 x i64> %__A to <8 x i16> %1 = bitcast <2 x i64> %__B to <8 x i16> - %2 = bitcast <2 x i64> %__S to <8 x i16> - %3 = tail call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %0, <8 x i16> %1, i32 127, <8 x i16> %2, i8 %__U) - %4 = bitcast <8 x i16> %3 to <2 x i64> - ret <2 x i64> %4 + %2 = tail call <8 x i16> @llvm.x86.avx512.vpshrd.w.128(<8 x i16> %0, <8 x i16> %1, i32 127) + %3 = bitcast <2 x i64> %__S to <8 x i16> + %4 = bitcast i8 %__U to <8 x i1> + %5 = select <8 x i1> %4, <8 x i16> %2, <8 x i16> %3 + %6 = bitcast <8 x i16> %5 to <2 x i64> + ret <2 x i64> %6 } -declare <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16>, <8 x i16>, i32, <8 x i16>, i8) +declare <8 x i16> @llvm.x86.avx512.vpshrd.w.128(<8 x i16>, <8 x i16>, i32) define <2 x i64> @test_mm_maskz_shrdi_epi16(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { ; X86-LABEL: test_mm_maskz_shrdi_epi16: @@ -1211,9 +1269,11 @@ define <2 x i64> @test_mm_maskz_shrdi_epi16(i8 zeroext %__U, <2 x i64> %__A, <2 entry: %0 = bitcast <2 x i64> %__A to <8 x i16> %1 = bitcast <2 x i64> %__B to <8 x i16> - %2 = tail call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %0, <8 x i16> %1, i32 63, <8 x i16> zeroinitializer, i8 %__U) - %3 = bitcast <8 x i16> %2 to <2 x i64> - ret <2 x i64> %3 + %2 = tail call <8 x i16> @llvm.x86.avx512.vpshrd.w.128(<8 x i16> %0, <8 x i16> %1, i32 63) + %3 = bitcast i8 %__U to <8 x i1> + %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer + %5 = bitcast <8 x i16> %4 to <2 x i64> + ret <2 x i64> %5 } define <2 x i64> @test_mm_shrdi_epi16(<2 x i64> %__A, <2 x i64> %__B) { 
@@ -1224,7 +1284,7 @@ define <2 x i64> @test_mm_shrdi_epi16(<2 x i64> %__A, <2 x i64> %__B) { entry: %0 = bitcast <2 x i64> %__A to <8 x i16> %1 = bitcast <2 x i64> %__B to <8 x i16> - %2 = tail call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %0, <8 x i16> %1, i32 31, <8 x i16> zeroinitializer, i8 -1) + %2 = tail call <8 x i16> @llvm.x86.avx512.vpshrd.w.128(<8 x i16> %0, <8 x i16> %1, i32 31) %3 = bitcast <8 x i16> %2 to <2 x i64> ret <2 x i64> %3 } diff --git a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll index d26f1648e111..072f96276f65 100644 --- a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll @@ -368,3 +368,301 @@ define void @test_compress_store_b_256(i8* %addr, <32 x i8> %data) { call void @llvm.x86.avx512.mask.compress.store.b.256(i8* %addr, <32 x i8> %data, i32 -1) ret void } + +define <4 x i32>@test_int_x86_avx512_mask_vpshld_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_128: +; X86: # %bb.0: +; X86-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x16] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xd1,0x16] +; X86-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xc1,0x16] +; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_128: +; X64: # %bb.0: +; X64-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x16] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xd1,0x16] +; X64-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xc1,0x16] +; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 -1) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> zeroinitializer,i8 %x4) + %res3 = add <4 x i32> %res, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} +declare <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_vpshld_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_256: +; X86: # %bb.0: +; X86-NEXT: vpshldd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xd9,0x16] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpshldd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: 
[0x62,0xf3,0x7d,0x29,0x71,0xd1,0x16] +; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_256: +; X64: # %bb.0: +; X64-NEXT: vpshldd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xd9,0x16] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpshldd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x71,0xd1,0x16] +; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} +declare <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_vpshld_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_mask_vpshld_q_128: +; X86: # %bb.0: +; X86-NEXT: vpshldq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xd9,0x16] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpshldq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x71,0xd1,0x16] +; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_128: +; X64: # %bb.0: +; X64-NEXT: vpshldq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xd9,0x16] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpshldq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x71,0xd1,0x16] +; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} +declare <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_vpshld_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_mask_vpshld_q_256: +; X86: # %bb.0: +; X86-NEXT: vpshldq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xd9,0x16] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpshldq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x71,0xd1,0x16] +; X86-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_256: +; X64: # %bb.0: +; X64-NEXT: vpshldq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xd9,0x16] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpshldq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x71,0xd1,0x16] +; 
X64-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} +declare <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_vpshld_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x3, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_mask_vpshld_w_128: +; X86: # %bb.0: +; X86-NEXT: vpshldw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xd9,0x16] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpshldw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x70,0xd1,0x16] +; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_128: +; X64: # %bb.0: +; X64-NEXT: vpshldw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xd9,0x16] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpshldw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x70,0xd1,0x16] +; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 %x4) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 -1) + %res2 = add <8 x i16> %res, %res1 + ret <8 x i16> %res2 +} +declare <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16>, <8 x i16>, i32, <8 x i16>, i8) + +define <16 x i16>@test_int_x86_avx512_mask_vpshld_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x3, i16 %x4) { +; X86-LABEL: test_int_x86_avx512_mask_vpshld_w_256: +; X86: # %bb.0: +; X86-NEXT: vpshldw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xd9,0x16] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpshldw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x70,0xd1,0x16] +; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_256: +; X64: # %bb.0: +; X64-NEXT: vpshldw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xd9,0x16] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpshldw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x70,0xd1,0x16] +; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 %x4) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 -1) + %res2 = add <16 x i16> %res, %res1 + ret <16 x i16> %res2 +} +declare <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16>, <16 x i16>, i32, <16 x i16>, i16) + +define <4 x 
i32>@test_int_x86_avx512_mask_vpshrd_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_128: +; X86: # %bb.0: +; X86-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x16] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xd1,0x16] +; X86-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xc1,0x16] +; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_128: +; X64: # %bb.0: +; X64-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x16] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xd1,0x16] +; X64-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xc1,0x16] +; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 -1) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> zeroinitializer,i8 %x4) + %res3 = add <4 x i32> %res, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} +declare <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_vpshrd_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_256: +; X86: # %bb.0: +; X86-NEXT: vpshrdd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xd9,0x16] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpshrdd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x73,0xd1,0x16] +; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_256: +; X64: # %bb.0: +; X64-NEXT: vpshrdd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xd9,0x16] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpshrdd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x73,0xd1,0x16] +; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} +declare <8 x i32> 
@llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_vpshrd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_mask_vpshrd_q_128: +; X86: # %bb.0: +; X86-NEXT: vpshrdq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xd9,0x16] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpshrdq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x73,0xd1,0x16] +; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_128: +; X64: # %bb.0: +; X64-NEXT: vpshrdq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xd9,0x16] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpshrdq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x73,0xd1,0x16] +; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} +declare <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_vpshrd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_mask_vpshrd_q_256: +; X86: # %bb.0: +; X86-NEXT: vpshrdq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xd9,0x16] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpshrdq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x73,0xd1,0x16] +; X86-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_256: +; X64: # %bb.0: +; X64-NEXT: vpshrdq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xd9,0x16] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpshrdq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x73,0xd1,0x16] +; X64-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} +declare <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_vpshrd_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x3, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_mask_vpshrd_w_128: +; X86: # %bb.0: +; X86-NEXT: vpshrdw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xd9,0x16] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; 
X86-NEXT: vpshrdw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x72,0xd1,0x16] +; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_128: +; X64: # %bb.0: +; X64-NEXT: vpshrdw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xd9,0x16] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpshrdw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x72,0xd1,0x16] +; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 %x4) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 -1) + %res2 = add <8 x i16> %res, %res1 + ret <8 x i16> %res2 +} +declare <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16>, <8 x i16>, i32, <8 x i16>, i8) + +define <16 x i16>@test_int_x86_avx512_mask_vpshrd_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x3, i16 %x4) { +; X86-LABEL: test_int_x86_avx512_mask_vpshrd_w_256: +; X86: # %bb.0: +; X86-NEXT: vpshrdw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xd9,0x16] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpshrdw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x72,0xd1,0x16] +; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_256: +; X64: # %bb.0: +; X64-NEXT: vpshrdw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xd9,0x16] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpshrdw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x72,0xd1,0x16] +; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 %x4) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 -1) + %res2 = add <16 x i16> %res, %res1 + ret <16 x i16> %res2 +} +declare <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16>, <16 x i16>, i32, <16 x i16>, i16) diff --git a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll index c5bff9a01502..341adb92e1d6 100644 --- a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll @@ -361,300 +361,336 @@ declare <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %data, <32 x i8 define <4 x i32>@test_int_x86_avx512_mask_vpshld_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_128: ; X86: # %bb.0: +; X86-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x16] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xd1,0x16] -; X86-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: 
[0x62,0xf3,0x7d,0x89,0x71,0xd9,0x16] -; X86-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc1,0x16] -; X86-NEXT: vpaddd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] +; X86-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xc1,0x16] +; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_128: ; X64: # %bb.0: +; X64-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x16] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd9,0x16] ; X64-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xd1,0x16] -; X64-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc1,0x16] -; X64-NEXT: vpaddd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] +; X64-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xc1,0x16] +; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 -1) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> zeroinitializer,i8 %x4) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 + %1 = call <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22) + %2 = bitcast i8 %x4 to <8 x i1> + %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract1, <4 x i32> %1, <4 x i32> %x3 + %4 = call <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22) + %5 = call <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22) + %6 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %6, <8 x i1> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %7 = select <4 x i1> %extract, <4 x i32> %5, <4 x i32> zeroinitializer + %res3 = add <4 x i32> %3, %4 + %res4 = add <4 x i32> %res3, %7 ret <4 x i32> %res4 } -declare <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8) +declare <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32>, <4 x i32>, i32) define <8 x i32>@test_int_x86_avx512_mask_vpshld_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_256: ; X86: # %bb.0: +; X86-NEXT: vpshldd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xd9,0x16] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpshldd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x71,0xd1,0x16] -; X86-NEXT: vpshldd $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xc1,0x16] -; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] +; X86-NEXT: 
vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_256: ; X64: # %bb.0: +; X64-NEXT: vpshldd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xd9,0x16] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshldd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x71,0xd1,0x16] -; X64-NEXT: vpshldd $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xc1,0x16] -; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] +; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1) - %res2 = add <8 x i32> %res, %res1 + %1 = call <8 x i32> @llvm.x86.avx512.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22) + %2 = bitcast i8 %x4 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x3 + %4 = call <8 x i32> @llvm.x86.avx512.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22) + %res2 = add <8 x i32> %3, %4 ret <8 x i32> %res2 } -declare <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8) +declare <8 x i32> @llvm.x86.avx512.vpshld.d.256(<8 x i32>, <8 x i32>, i32) define <2 x i64>@test_int_x86_avx512_mask_vpshld_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_vpshld_q_128: ; X86: # %bb.0: +; X86-NEXT: vpshldq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xd9,0x16] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpshldq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x71,0xd1,0x16] -; X86-NEXT: vpshldq $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xc1,0x16] -; X86-NEXT: vpaddq %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] +; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_128: ; X64: # %bb.0: +; X64-NEXT: vpshldq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xd9,0x16] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshldq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x71,0xd1,0x16] -; X64-NEXT: vpshldq $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xc1,0x16] -; X64-NEXT: vpaddq %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] +; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 -1) - %res2 = add <2 x i64> %res, %res1 + %1 = call <2 x i64> @llvm.x86.avx512.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22) + %2 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1> + %3 = select <2 x i1> %extract, <2 x i64> %1, <2 
x i64> %x3 + %4 = call <2 x i64> @llvm.x86.avx512.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22) + %res2 = add <2 x i64> %3, %4 ret <2 x i64> %res2 } -declare <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8) +declare <2 x i64> @llvm.x86.avx512.vpshld.q.128(<2 x i64>, <2 x i64>, i32) define <4 x i64>@test_int_x86_avx512_mask_vpshld_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_vpshld_q_256: ; X86: # %bb.0: +; X86-NEXT: vpshldq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xd9,0x16] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpshldq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x71,0xd1,0x16] -; X86-NEXT: vpshldq $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xc1,0x16] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_256: ; X64: # %bb.0: +; X64-NEXT: vpshldq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xd9,0x16] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshldq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x71,0xd1,0x16] -; X64-NEXT: vpshldq $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xc1,0x16] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1) - %res2 = add <4 x i64> %res, %res1 + %1 = call <4 x i64> @llvm.x86.avx512.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22) + %2 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x3 + %4 = call <4 x i64> @llvm.x86.avx512.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22) + %res2 = add <4 x i64> %3, %4 ret <4 x i64> %res2 } -declare <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8) +declare <4 x i64> @llvm.x86.avx512.vpshld.q.256(<4 x i64>, <4 x i64>, i32) define <8 x i16>@test_int_x86_avx512_mask_vpshld_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_vpshld_w_128: ; X86: # %bb.0: +; X86-NEXT: vpshldw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xd9,0x16] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpshldw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x70,0xd1,0x16] -; X86-NEXT: vpshldw $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xc1,0x16] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] +; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: 
test_int_x86_avx512_mask_vpshld_w_128: ; X64: # %bb.0: +; X64-NEXT: vpshldw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xd9,0x16] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshldw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x70,0xd1,0x16] -; X64-NEXT: vpshldw $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xc1,0x16] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] +; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 %x4) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 -1) - %res2 = add <8 x i16> %res, %res1 + %1 = call <8 x i16> @llvm.x86.avx512.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22) + %2 = bitcast i8 %x4 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x3 + %4 = call <8 x i16> @llvm.x86.avx512.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22) + %res2 = add <8 x i16> %3, %4 ret <8 x i16> %res2 } -declare <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16>, <8 x i16>, i32, <8 x i16>, i8) +declare <8 x i16> @llvm.x86.avx512.vpshld.w.128(<8 x i16>, <8 x i16>, i32) define <16 x i16>@test_int_x86_avx512_mask_vpshld_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_vpshld_w_256: ; X86: # %bb.0: +; X86-NEXT: vpshldw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xd9,0x16] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpshldw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x70,0xd1,0x16] -; X86-NEXT: vpshldw $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xc1,0x16] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_256: ; X64: # %bb.0: +; X64-NEXT: vpshldw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xd9,0x16] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshldw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x70,0xd1,0x16] -; X64-NEXT: vpshldw $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xc1,0x16] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 %x4) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 -1) - %res2 = add <16 x i16> %res, %res1 + %1 = call <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22) + %2 = bitcast i16 %x4 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x3 + %4 = call <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22) + %res2 = add <16 x i16> %3, %4 ret <16 x i16> %res2 } -declare <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16>, <16 x i16>, 
i32, <16 x i16>, i16) +declare <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16>, <16 x i16>, i32) define <4 x i32>@test_int_x86_avx512_mask_vpshrd_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_128: ; X86: # %bb.0: +; X86-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x16] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xd1,0x16] -; X86-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd9,0x16] -; X86-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc1,0x16] -; X86-NEXT: vpaddd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] +; X86-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xc1,0x16] +; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_128: ; X64: # %bb.0: +; X64-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x16] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd9,0x16] ; X64-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xd1,0x16] -; X64-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc1,0x16] -; X64-NEXT: vpaddd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] +; X64-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xc1,0x16] +; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 -1) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> zeroinitializer,i8 %x4) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 + %1 = call <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22) + %2 = bitcast i8 %x4 to <8 x i1> + %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract1, <4 x i32> %1, <4 x i32> %x3 + %4 = call <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22) + %5 = call <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22) + %6 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %6, <8 x i1> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %7 = select <4 x i1> %extract, <4 x i32> %5, <4 x i32> zeroinitializer + %res3 = add <4 x i32> %3, %4 + %res4 = add <4 x i32> %res3, %7 ret <4 x i32> %res4 } -declare <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8) +declare <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32>, <4 x i32>, i32) define <8 x 
i32>@test_int_x86_avx512_mask_vpshrd_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x73,0xd1,0x16]
-; X86-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xc1,0x16]
-; X86-NEXT:    vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
+; X86-NEXT:    vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x73,0xd1,0x16]
-; X64-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xc1,0x16]
-; X64-NEXT:    vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
+; X64-NEXT:    vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
-  %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
-  %res2 = add <8 x i32> %res, %res1
+  %1 = call <8 x i32> @llvm.x86.avx512.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x3
+  %4 = call <8 x i32> @llvm.x86.avx512.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22)
+  %res2 = add <8 x i32> %3, %4
   ret <8 x i32> %res2
 }
-declare <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.vpshrd.d.256(<8 x i32>, <8 x i32>, i32)
 
 define <2 x i64>@test_int_x86_avx512_mask_vpshrd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_q_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x73,0xd1,0x16]
-; X86-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xc1,0x16]
-; X86-NEXT:    vpaddq %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
+; X86-NEXT:    vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_128:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x73,0xd1,0x16]
-; X64-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xc1,0x16]
-; X64-NEXT:    vpaddq %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
+; X64-NEXT:    vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4)
-  %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 -1)
-  %res2 = add <2 x i64> %res, %res1
+  %1 = call <2 x i64> @llvm.x86.avx512.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x3
+  %4 = call <2 x i64> @llvm.x86.avx512.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22)
+  %res2 = add <2 x i64> %3, %4
   ret <2 x i64> %res2
 }
-declare <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.vpshrd.q.128(<2 x i64>, <2 x i64>, i32)
 
 define <4 x i64>@test_int_x86_avx512_mask_vpshrd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_q_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x73,0xd1,0x16]
-; X86-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xc1,0x16]
-; X86-NEXT:    vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
+; X86-NEXT:    vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x73,0xd1,0x16]
-; X64-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xc1,0x16]
-; X64-NEXT:    vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
+; X64-NEXT:    vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
-  %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
-  %res2 = add <4 x i64> %res, %res1
+  %1 = call <4 x i64> @llvm.x86.avx512.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x3
+  %4 = call <4 x i64> @llvm.x86.avx512.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22)
+  %res2 = add <4 x i64> %3, %4
   ret <4 x i64> %res2
 }
-declare <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.vpshrd.q.256(<4 x i64>, <4 x i64>, i32)
 
 define <8 x i16>@test_int_x86_avx512_mask_vpshrd_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_w_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xd9,0x16]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x72,0xd1,0x16]
-; X86-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xc1,0x16]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_128:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x72,0xd1,0x16]
-; X64-NEXT:    vpshrdw $22, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xc1,0x16]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; X64-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 %x4)
-  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 -1)
-  %res2 = add <8 x i16> %res, %res1
+  %1 = call <8 x i16> @llvm.x86.avx512.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x3
+  %4 = call <8 x i16> @llvm.x86.avx512.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22)
+  %res2 = add <8 x i16> %3, %4
   ret <8 x i16> %res2
 }
-declare <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16>, <8 x i16>, i32, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.avx512.vpshrd.w.128(<8 x i16>, <8 x i16>, i32)
 
 define <16 x i16>@test_int_x86_avx512_mask_vpshrd_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x3, i16 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_w_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xd9,0x16]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x72,0xd1,0x16]
-; X86-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xc1,0x16]
-; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
+; X86-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xd9,0x16]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x72,0xd1,0x16]
-; X64-NEXT:    vpshrdw $22, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xc1,0x16]
-; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
+; X64-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 %x4)
-  %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 -1)
-  %res2 = add <16 x i16> %res, %res1
+  %1 = call <16 x i16> @llvm.x86.avx512.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22)
+  %2 = bitcast i16 %x4 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x3
+  %4 = call <16 x i16> @llvm.x86.avx512.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22)
+  %res2 = add <16 x i16> %3, %4
   ret <16 x i16> %res2
 }
-declare <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16>, <16 x i16>, i32, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx512.vpshrd.w.256(<16 x i16>, <16 x i16>, i32)
 
 declare <8 x i32> @llvm.x86.avx512.mask.vpshrdv.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
 declare <8 x i32> @llvm.x86.avx512.maskz.vpshrdv.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
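The test IR above shows the shape the auto-upgrade now produces for the removed masked vpshld/vpshrd intrinsics: a call to the new unmasked intrinsic, a bitcast of the integer mask to <N x i1> (with a shufflevector extracting the low lanes when the data vector has fewer than 8 elements), and a select against the passthru operand. Below is a minimal C++ sketch of that rewrite for reviewers following along; the helper names (maskToVector, upgradeMaskedConcatShift) are illustrative, not the in-tree AutoUpgrade.cpp helpers, and it assumes the LLVM-7-era IRBuilder API.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Rebuild the integer mask operand as an <NumElts x i1> vector, mirroring
// the "bitcast + shufflevector" pattern in the upgraded test IR above.
static Value *maskToVector(IRBuilder<> &Builder, Value *Mask,
                           unsigned NumElts) {
  unsigned MaskBits = cast<IntegerType>(Mask->getType())->getBitWidth();
  Value *Vec = Builder.CreateBitCast(
      Mask, VectorType::get(Builder.getInt1Ty(), MaskBits));
  if (NumElts < MaskBits) {
    // e.g. i8 mask with <2 x i64> data: keep only lanes 0..NumElts-1.
    SmallVector<uint32_t, 8> Idxs;
    for (unsigned i = 0; i != NumElts; ++i)
      Idxs.push_back(i);
    Vec = Builder.CreateShuffleVector(Vec, Vec, Idxs);
  }
  return Vec;
}

// Upgrade one call to a removed llvm.x86.avx512.mask.vpshld/vpshrd
// intrinsic. Old operands: (a, b, imm, passthru, mask); the new intrinsic
// takes only (a, b, imm), so the masking becomes an explicit select.
static Value *upgradeMaskedConcatShift(IRBuilder<> &Builder, CallInst &CI,
                                       Intrinsic::ID UnmaskedID) {
  Value *NewCall = Builder.CreateCall(
      Intrinsic::getDeclaration(CI.getModule(), UnmaskedID),
      {CI.getArgOperand(0), CI.getArgOperand(1), CI.getArgOperand(2)});
  unsigned NumElts = CI.getType()->getVectorNumElements();
  Value *MaskVec = maskToVector(Builder, CI.getArgOperand(4), NumElts);
  return Builder.CreateSelect(MaskVec, NewCall, CI.getArgOperand(3));
}

For the q.128 case this yields exactly the sequence checked above: a bitcast of the i8 mask to <8 x i1>, a two-lane shufflevector, and a select feeding the add.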