From 22c82af5c85e33508fee35d1785e801bc8136acc Mon Sep 17 00:00:00 2001
From: Mikhail Dvoretckii
Date: Thu, 21 Jun 2018 14:16:45 +0000
Subject: [PATCH] [x86] Lower some trunc + shuffle patterns to vpmov[q|d][b|w]

This should help in lowering the following four intrinsics:
 _mm256_cvtepi32_epi8
 _mm256_cvtepi64_epi16
 _mm256_cvtepi64_epi8
 _mm512_cvtepi64_epi8

Differential Revision: https://reviews.llvm.org/D46957

llvm-svn: 335238
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 125 +++++++++++++++++-
 llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll |  81 ++++--------
 llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll |   3 +-
 3 files changed, 146 insertions(+), 63 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 033a343089d3..314a944858b7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4822,10 +4822,24 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
   return true;
 }
 
+/// Return true if Val falls within the specified range (L, H].
+static bool isInRange(int Val, int Low, int Hi) {
+  return (Val >= Low && Val < Hi);
+}
+
+/// Return true if the value of any element in Mask falls within the specified
+/// range (L, H].
+static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
+  for (int M : Mask)
+    if (isInRange(M, Low, Hi))
+      return true;
+  return false;
+}
+
 /// Return true if Val is undef or if its value falls within the
 /// specified range (L, H].
 static bool isUndefOrInRange(int Val, int Low, int Hi) {
-  return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
+  return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
 }
 
 /// Return true if every element in Mask is undef or if its value
@@ -4841,7 +4855,7 @@ static bool isUndefOrInRange(ArrayRef<int> Mask,
 /// Return true if Val is undef, zero or if its value falls within the
 /// specified range (L, H].
 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
-  return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
+  return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
 }
 
 /// Return true if every element in Mask is undef, zero or if its value
@@ -4854,11 +4868,11 @@ static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
 }
 
 /// Return true if every element in Mask, beginning
-/// from position Pos and ending in Pos+Size, falls within the specified
-/// sequential range (Low, Low+Size]. or is undef.
-static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
-                                       unsigned Pos, unsigned Size, int Low) {
-  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+/// from position Pos and ending in Pos + Size, falls within the specified
+/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
+                                       unsigned Size, int Low, int Step = 1) {
+  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
     if (!isUndefOrEqual(Mask[i], Low))
       return false;
   return true;
@@ -9390,6 +9404,99 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
   return SDValue();
 }
 
+static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
+                                      int Delta) {
+  int Size = (int)Mask.size();
+  int Split = Size / Delta;
+  int TruncatedVectorStart = SwappedOps ? Size : 0;
+
+  // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
+  if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
+    return false;
+
+  // The rest of the mask should not refer to the truncated vector's elements.
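+  // For example, with SwappedOps == false and Delta == 2, an 8-element mask
+  // such as <0, 2, 4, 6, 12, 13, 14, 15> matches: the first Split elements
+  // select every second element of the truncated vector, and the remaining
+  // elements only refer to the all-zeros operand or are undef.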
+  if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
+                   TruncatedVectorStart + Size))
+    return false;
+
+  return true;
+}
+
+// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
+//
+// An example is the following:
+//
+// t0: ch = EntryToken
+// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
+// t25: v4i32 = truncate t2
+// t41: v8i16 = bitcast t25
+// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
+//              Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
+// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
+// t18: v2i64 = bitcast t51
+//
+// Without avx512vl, this is lowered to:
+//
+// vpmovqd %zmm0, %ymm0
+// vpshufb {{.*#+}} xmm0 =
+// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+//
+// But when avx512vl is available, one can just use a single vpmovdw
+// instruction.
+static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG,
+                                           const X86Subtarget &Subtarget) {
+  if (VT != MVT::v16i8 && VT != MVT::v8i16)
+    return SDValue();
+
+  if (Mask.size() != VT.getVectorNumElements())
+    return SDValue();
+
+  bool SwappedOps = false;
+
+  if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
+    if (!ISD::isBuildVectorAllZeros(V1.getNode()))
+      return SDValue();
+
+    std::swap(V1, V2);
+    SwappedOps = true;
+  }
+
+  // Look for:
+  //
+  // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
+  // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
+  //
+  // and similar ones.
+  if (V1.getOpcode() != ISD::BITCAST)
+    return SDValue();
+  if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
+    return SDValue();
+
+  SDValue Src = V1.getOperand(0).getOperand(0);
+  MVT SrcVT = Src.getSimpleValueType();
+
+  // The vptrunc** instructions truncating 128 bit and 256 bit vectors
+  // are only available with avx512vl.
+  if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
+    return SDValue();
+
+  // Down Convert Word to Byte is only available with avx512bw. The case with
+  // 256-bit output doesn't contain a shuffle and is therefore not handled here.
+  if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
+      !Subtarget.hasBWI())
+    return SDValue();
+
+  // The first half/quarter of the mask should refer to every second/fourth
+  // element of the vector truncated and bitcasted.
+  if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
+      !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
+    return SDValue();
+
+  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
+}
+
 // X86 has dedicated pack instructions that can handle specific truncation
 // operations: PACKSS and PACKUS.
 static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
@@ -14923,6 +15030,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
   if (canonicalizeShuffleMaskWithCommute(Mask))
     return DAG.getCommutedVectorShuffle(*SVOp);
 
+  if (SDValue V =
+          lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
+    return V;
+
   // For each vector width, delegate to a specialized lowering routine.
   if (VT.is128BitVector())
     return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 4576ef5a32b5..152742c80163 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -511,8 +511,7 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
 ;
 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
@@ -526,15 +525,13 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
 ;
 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 ;
 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
 ; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512VBMIVL-NEXT: vzeroupper
 ; AVX512VBMIVL-NEXT: retq
 %truncated.vec = trunc <8 x i32> %vec to <8 x i8>
@@ -573,8 +570,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nou
 ;
 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
@@ -588,15 +584,13 @@ define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nou
 ;
 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 ;
 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
 ; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512VBMIVL-NEXT: vzeroupper
 ; AVX512VBMIVL-NEXT: retq
 %truncated = trunc <8 x i32> %vec to <8 x i8>
@@ -636,8 +630,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nou
 ;
 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
@@ -651,15 +644,13 @@ define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nou
 ;
 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 ;
 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
 ; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512VBMIVL-NEXT: vzeroupper
 ; AVX512VBMIVL-NEXT: retq
 %truncated = trunc <8 x i32> %vec to <8 x i16>
@@ -698,8 +689,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
 ;
 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
@@ -713,15 +703,13 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
 ;
 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 ;
 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
 ; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512VBMIVL-NEXT: vzeroupper
 ; AVX512VBMIVL-NEXT: retq
 %truncated = trunc <8 x i32> %vec to <8 x i8>
@@ -766,8 +754,7 @@ define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
 ;
 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
@@ -781,15 +768,13 @@ define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
 ;
 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 ;
 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
 ; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
 ; AVX512VBMIVL-NEXT: vzeroupper
 ; AVX512VBMIVL-NEXT: retq
 %truncated = trunc <4 x i64> %vec to <4 x i16>
@@ -833,8 +818,7 @@ define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) no
 ;
 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
@@ -848,15 +832,13 @@ define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) no
 ;
 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 ;
 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
 ; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
 ; AVX512VBMIVL-NEXT: vzeroupper
 ; AVX512VBMIVL-NEXT: retq
 %truncated = trunc <4 x i64> %vec to <4 x i16>
@@ -901,8 +883,7 @@ define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) no
 ;
 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
@@ -916,15 +897,13 @@ define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) no
 ;
 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 ;
 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
 ; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
 ; AVX512VBMIVL-NEXT: vzeroupper
 ; AVX512VBMIVL-NEXT: retq
 %truncated = trunc <4 x i64> %vec to <4 x i32>
@@ -968,8 +947,7 @@ define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind {
 ;
 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
@@ -983,15 +961,13 @@ define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind {
 ;
 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 ;
 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
 ; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
 ; AVX512VBMIVL-NEXT: vzeroupper
 ; AVX512VBMIVL-NEXT: retq
 %truncated = trunc <4 x i64> %vec to <4 x i16>
@@ -1034,8 +1010,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
 ;
 ; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
+; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
@@ -1049,15 +1024,13 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
 ;
 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
+; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 ;
 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
 ; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
+; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0
 ; AVX512VBMIVL-NEXT: vzeroupper
 ; AVX512VBMIVL-NEXT: retq
 %truncated = trunc <4 x i64> %vec to <4 x i8>
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index d242f8582a48..7c100cb165e3 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -945,8 +945,7 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
 define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
 ; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 %truncated = trunc <8 x i64> %vec to <8 x i8>
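
An illustrative reproducer in the spirit of the tests above (hypothetical function name, not part of the commit): with avx512vl, this IR now selects a single vpmovqw, where the deleted checks show the previous vpmovqd + vpshufb sequence.

define <8 x i16> @trunc_v4i64_to_v4i16_example(<4 x i64> %vec) nounwind {
  %truncated = trunc <4 x i64> %vec to <4 x i16>
  %widened = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer,
                           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %widened
}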