[X86] Add ISD node for masked version of CVTPS2PH.

The 128-bit input produces 64 bits of output and fills the upper 64 bits with 0. The mask only applies to the lower elements, so we can't represent this with a vselect the way we normally do.

This also avoids the need for a special X86ISD::SELECT when AVX512BW isn't enabled, since vselect on v8i16 isn't legal there.

Fixes another instruction for PR34877.

llvm-svn: 350994
Craig Topper 2019-01-12 08:05:12 +00:00
parent 61aa940074
commit 33b2cf50e3
7 changed files with 76 additions and 39 deletions
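Before the diff itself, a small, self-contained illustration of the masking behaviour described in the commit message, written at the C intrinsics level rather than against the SelectionDAG node (this is not part of the commit). The intrinsic spellings _mm_mask_cvtps_ph, _mm_maskz_cvtps_ph and _cvtss_sh from <immintrin.h> and the build flags are assumptions; something like clang++ -O2 -mavx512f -mavx512vl -mf16c should work on an AVX512VL-capable machine. Only the low four lanes of the v8i16 result are under mask control; the upper 64 bits are always zero, which is what the new X86ISD::MCVTPS2PH node is meant to model.

// Reference sketch only: intrinsic names, flags and values here are
// illustrative assumptions, not part of the commit.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  const float in[4] = {1.0f, 2.5f, -0.5f, 65504.0f};
  const uint16_t passthru[8] = {9, 9, 9, 9, 9, 9, 9, 9};
  const __mmask8 k = 0x5; // mask bits cover only the 4 source elements

  __m128 src = _mm_loadu_ps(in);
  __m128i pt = _mm_loadu_si128(reinterpret_cast<const __m128i *>(passthru));

  // Merge- and zero-masked conversions. Both leave the upper 64 bits of the
  // 128-bit result zero, so the masking cannot be expressed as a plain
  // vselect over all eight v8i16 lanes.
  __m128i merged = _mm_mask_cvtps_ph(pt, k, src, _MM_FROUND_CUR_DIRECTION);
  __m128i zeroed = _mm_maskz_cvtps_ph(k, src, _MM_FROUND_CUR_DIRECTION);

  uint16_t m[8], z[8];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(m), merged);
  _mm_storeu_si128(reinterpret_cast<__m128i *>(z), zeroed);

  for (int i = 0; i < 8; ++i) {
    // Scalar model: lanes 0..3 follow the mask bit, lanes 4..7 are always 0.
    uint16_t expect_m = 0, expect_z = 0;
    if (i < 4) {
      uint16_t conv = _cvtss_sh(in[i], _MM_FROUND_CUR_DIRECTION);
      expect_m = (k & (1 << i)) ? conv : passthru[i];
      expect_z = (k & (1 << i)) ? conv : 0;
    }
    std::printf("lane %d: merged=%u (expect %u)  zeroed=%u (expect %u)\n", i,
                unsigned(m[i]), unsigned(expect_m), unsigned(z[i]),
                unsigned(expect_z));
  }
  return 0;
}

Run on suitable hardware, lanes 4..7 should print as zero in both results regardless of the mask value.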


@@ -21383,12 +21383,6 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
   case X86ISD::VPSHUFBITQMB:
   case X86ISD::VFPCLASS:
     return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
-  case X86ISD::CVTPS2PH:
-    // We can't use ISD::VSELECT here because it is not always "Legal"
-    // for the destination type. For example vpmovqb require only AVX512
-    // and vselect that can operate on byte element type require BWI
-    OpcodeSelect = X86ISD::SELECT;
-    break;
   }
   if (PreservedSrc.isUndef())
     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
@@ -22068,9 +22062,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       SDValue Mask = Op.getOperand(3);

       if (isAllOnesConstant(Mask))
-        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl,
-                                                Op.getValueType(), Src),
-                                    Mask, PassThru, Subtarget, DAG);
+        return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);

       MVT SrcVT = Src.getSimpleValueType();
       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
@@ -22078,6 +22070,22 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
                          Mask);
    }
+    case CVTPS2PH_MASK: {
+      SDValue Src = Op.getOperand(1);
+      SDValue Rnd = Op.getOperand(2);
+      SDValue PassThru = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+
+      if (isAllOnesConstant(Mask))
+        return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
+
+      MVT SrcVT = Src.getSimpleValueType();
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
+      Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+      return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
+                         PassThru, Mask);
+    }
    default:
      break;
    }
@@ -27365,6 +27373,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
   case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
   case X86ISD::CVTPS2PH:              return "X86ISD::CVTPS2PH";
+  case X86ISD::MCVTPS2PH:             return "X86ISD::MCVTPS2PH";
   case X86ISD::CVTPH2PS:              return "X86ISD::CVTPH2PS";
   case X86ISD::CVTPH2PS_RND:          return "X86ISD::CVTPH2PS_RND";
   case X86ISD::CVTP2SI:               return "X86ISD::CVTP2SI";


@@ -556,6 +556,10 @@ namespace llvm {
       // Conversions between float and half-float.
       CVTPS2PH, CVTPH2PS, CVTPH2PS_RND,

+      // Masked version of above.
+      // SRC, RND, PASSTHRU, MASK
+      MCVTPS2PH,
+
       // Galois Field Arithmetic Instructions
       GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB,


@@ -8726,12 +8726,28 @@ let Predicates = [HasVLX] in {
 multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                            X86MemOperand x86memop, SchedWrite RR, SchedWrite MR> {
-  defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
-                            (ins _src.RC:$src1, i32u8imm:$src2),
-                            "vcvtps2ph", "$src2, $src1", "$src1, $src2",
-                            (X86cvtps2ph (_src.VT _src.RC:$src1),
-                                         (i32 imm:$src2)), 0, 0>,
-                            AVX512AIi8Base, Sched<[RR]>;
+  let ExeDomain = GenericDomain in {
+  def rr : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+             (ins _src.RC:$src1, i32u8imm:$src2),
+             "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+             [(set _dest.RC:$dst,
+                   (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2)))]>,
+             Sched<[RR]>;
+  let Constraints = "$src0 = $dst" in
+  def rrk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+             (ins _dest.RC:$src0, _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+             "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+             [(set _dest.RC:$dst,
+                   (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2),
+                                 _dest.RC:$src0, _src.KRCWM:$mask))]>,
+             Sched<[RR]>, EVEX_K;
+  def rrkz : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+             (ins _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+             "vcvtps2ph\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}",
+             [(set _dest.RC:$dst,
+                   (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2),
+                                 _dest.ImmAllZerosV, _src.KRCWM:$mask))]>,
+             Sched<[RR]>, EVEX_KZ;
   let hasSideEffects = 0, mayStore = 1 in {
     def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
                (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
@@ -8743,6 +8759,7 @@ multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                EVEX_K, Sched<[MR]>, NotMemoryFoldable;
   }
+  }
 }

 multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                                SchedWrite Sched> {


@@ -595,6 +595,13 @@ def X86cvtps2ph   : SDNode<"X86ISD::CVTPS2PH",
                            SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
                                                 SDTCVecEltisVT<1, f32>,
                                                 SDTCisVT<2, i32>]> >;
+def X86mcvtps2ph  : SDNode<"X86ISD::MCVTPS2PH",
+                           SDTypeProfile<1, 4, [SDTCVecEltisVT<0, i16>,
+                                                SDTCVecEltisVT<1, f32>,
+                                                SDTCisVT<2, i32>,
+                                                SDTCisSameAs<0, 3>,
+                                                SDTCVecEltisVT<4, i1>,
+                                                SDTCisSameNumEltsAs<1, 4>]> >;
 def X86vfpextRnd  : SDNode<"X86ISD::VFPEXT_RND",
                            SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
                                                 SDTCVecEltisVT<1, f32>,


@@ -32,7 +32,7 @@ enum IntrinsicType : uint16_t {
   IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK,
   INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK,
   COMPRESS_EXPAND_IN_REG,
-  TRUNCATE_TO_REG,
+  TRUNCATE_TO_REG, CVTPS2PH_MASK,
   TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
   FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
   FIXUPIMMS_MASKZ, GATHER_AVX2,
@@ -838,12 +838,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                      X86ISD::CVTPH2PS, 0),
   X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK,
                      X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_RND),
-  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK,
-                     X86ISD::CVTPS2PH, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK,
-                     X86ISD::CVTPS2PH, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK,
-                     X86ISD::CVTPS2PH, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, CVTPS2PH_MASK,
+                     X86ISD::CVTPS2PH, X86ISD::MCVTPS2PH),
+  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, CVTPS2PH_MASK,
+                     X86ISD::CVTPS2PH, X86ISD::MCVTPS2PH),
+  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, CVTPS2PH_MASK,
+                     X86ISD::CVTPS2PH, X86ISD::MCVTPS2PH),
   X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_128, CMP_MASK,
                      X86ISD::VPSHUFBITQMB, 0),


@@ -990,8 +990,8 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16
 ; CHECK-LABEL: test_x86_vcvtps2ph_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; CHECK-NEXT:    vcvtps2ph $2, %zmm0, (%rsi)
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0


@@ -4306,21 +4306,21 @@ define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %s
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
+; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x02]
 ; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
-; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc2,0x02]
-; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
-; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02]
-; X86-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
+; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_x86_vcvtps2ph_128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
+; X64-NEXT:    vcvtps2ph $2, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x02]
 ; X64-NEXT:    vcvtps2ph $2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
-; X64-NEXT:    vcvtps2ph $2, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc2,0x02]
-; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
-; X64-NEXT:    vcvtps2ph $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02]
-; X64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
+; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
 %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
 %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask)
@@ -4337,22 +4337,22 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
+; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x02]
 ; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
-; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc2,0x02]
-; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
-; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02]
-; X86-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
+; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_x86_vcvtps2ph_256:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
+; X64-NEXT:    vcvtps2ph $2, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x02]
 ; X64-NEXT:    vcvtps2ph $2, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
-; X64-NEXT:    vcvtps2ph $2, %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc2,0x02]
-; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
-; X64-NEXT:    vcvtps2ph $2, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02]
-; X64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
+; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
 %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)