diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dae29807acf0..a3f61b5ee352 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -937,6 +937,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
 
+    if (ExperimentalVectorWideningLegalization) {
+      setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
+      setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
+      setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
+      setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
+      setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
+      setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
+    }
+
     // In the customized shift lowering, the legal v4i32/v2i64 cases
     // in AVX2 will be recognized.
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
@@ -17917,6 +17926,10 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
          "Invalid TRUNCATE operation");
 
+  // If called by the type legalizer, just return.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
+    return SDValue();
+
   if (VT.getVectorElementType() == MVT::i1)
     return LowerTruncateVecI1(Op, DAG, Subtarget);
 
@@ -26200,6 +26213,57 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     Results.push_back(V);
     return;
   }
+  case ISD::TRUNCATE: {
+    MVT VT = N->getSimpleValueType(0);
+    if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
+      return;
+
+    // The generic legalizer will try to widen the input type to the same
+    // number of elements as the widened result type. But this isn't always
+    // the best strategy, so do some custom legalization to avoid it here.
+    MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
+    SDValue In = N->getOperand(0);
+    EVT InVT = In.getValueType();
+
+    unsigned InBits = InVT.getSizeInBits();
+    if (128 % InBits == 0) {
+      // 128-bit and smaller inputs should avoid the truncate altogether and
+      // just use a build_vector that will become a shuffle.
+      // TODO: Widen and use a shuffle directly?
+      MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
+      EVT EltVT = VT.getVectorElementType();
+      unsigned WidenNumElts = WidenVT.getVectorNumElements();
+      SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
+      // Use the original element count so we don't do more scalar ops than
+      // necessary.
+      unsigned MinElts = VT.getVectorNumElements();
+      for (unsigned i = 0; i < MinElts; ++i) {
+        SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
+                                  DAG.getIntPtrConstant(i, dl));
+        Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
+      }
+      Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
+      return;
+    }
+    // With AVX512 there are some cases that can use a target-specific
+    // truncate node to go from 256/512 bits down to less than 128 bits, with
+    // zeros in the upper elements of the 128-bit result.
+    if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
+      // We can use VTRUNC directly for 256 bits with VLX or for any 512 bits.
+      if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
+        Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
+        return;
+      }
+      // There's one case we can widen to 512 bits and use VTRUNC.
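+      // (v4i64 -> v4i8: concatenating the input with undef gives a legal
+      // v8i64 that VTRUNC can narrow directly to the widened v16i8 result.)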
+      if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
+        In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
+                         DAG.getUNDEF(MVT::v4i64));
+        Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
+        return;
+      }
+    }
+    return;
+  }
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT: {
     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
diff --git a/llvm/test/CodeGen/X86/avx512-cvt-widen.ll b/llvm/test/CodeGen/X86/avx512-cvt-widen.ll
index 0ea613184ab3..a4d9a8272a38 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt-widen.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt-widen.ll
@@ -1645,9 +1645,8 @@ define <2 x double> @sbto2f64(<2 x double> %a) {
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
-; NOVL-NEXT:    vpmovqd %zmm0, %ymm0
+; NOVL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; NOVL-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; NOVL-NEXT:    vzeroupper
 ; NOVL-NEXT:    retq
 ;
 ; VLDQ-LABEL: sbto2f64:
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll
index 67901e992f4e..fae69ea5d80e 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll
@@ -75,27 +75,22 @@ define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512F-LABEL: trunc_v8i16_to_v8i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v8i16_to_v8i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_v8i16_to_v8i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8:
@@ -171,9 +166,8 @@ define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
 ; AVX512F-LABEL: trunc_v4i32_to_v4i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v4i32_to_v4i16:
@@ -185,9 +179,8 @@ define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
 ; AVX512BW-LABEL: trunc_v4i32_to_v4i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_v4i32_to_v4i16:
@@ -233,35 +226,16 @@ define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
 ; SSE-NEXT:    movq %xmm0, (%rsi)
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: trunc_v2i64_to_v2i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX1-NEXT:    vmovlps %xmm0, (%rsi)
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i32:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT:    vmovlps %xmm0, (%rsi)
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: trunc_v2i64_to_v2i32:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT:    vmovlps %xmm0, (%rsi)
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
+; AVX-LABEL: trunc_v2i64_to_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX-NEXT:    vmovlps %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_v2i64_to_v2i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512F-NEXT:    vmovlps %xmm0, (%rsi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v2i64_to_v2i32:
@@ -272,10 +246,8 @@ define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
 ;
 ; AVX512BW-LABEL: trunc_v2i64_to_v2i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512BW-NEXT:    vmovlps %xmm0, (%rsi)
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32:
@@ -353,9 +325,8 @@ define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
 ; AVX512F-LABEL: trunc_v4i32_to_v4i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v4i32_to_v4i8:
@@ -367,9 +338,8 @@ define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
 ; AVX512BW-LABEL: trunc_v4i32_to_v4i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_v4i32_to_v4i8:
@@ -477,10 +447,9 @@ define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
 ;
 ; AVX512F-LABEL: trunc_v2i64_to_v2i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v2i64_to_v2i16:
@@ -492,9 +461,8 @@ define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
 ; AVX512BW-LABEL: trunc_v2i64_to_v2i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_v2i64_to_v2i16:
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll
index 7ac59a667c1c..940f475080f1 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll
@@ -381,13 +381,42 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VBMIVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
   %truncated.vec = trunc <8 x i32> %vec to <8 x i8>
   %bc = bitcast <8 x i8> %truncated.vec to i64
   %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
@@ -536,13 +565,42 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VBMIVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
   %truncated = trunc <8 x i32> %vec to <8 x i8>
   %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i8> %result
@@ -586,13 +644,42 @@ define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
-; AVX512-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512VBMIVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
   %truncated = trunc <4 x i64> %vec to <4 x i16>
   %bc = bitcast <4 x i16> %truncated to i64
   %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
@@ -764,13 +851,42 @@ define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind {
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
-; AVX512-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512VBMIVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
   %truncated = trunc <4 x i64> %vec to <4 x i16>
   %result = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %result
@@ -801,17 +917,47 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovqb %ymm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovqb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vpmovqb %ymm0, %xmm0
+; AVX512VBMIVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMIVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
   %truncated = trunc <4 x i64> %vec to <4 x i8>
   %result = shufflevector <4 x i8> %truncated, <4 x i8> zeroinitializer, <16 x i32>
   ret <16 x i8> %result
@@ -1025,13 +1171,10 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 ;
 ; AVX512F-LABEL: trunc_v4i64_to_v4i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
@@ -1043,13 +1186,10 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 ;
 ; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
index a9e45243381f..e78d0e050a85 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
@@ -911,118 +911,12 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
 }
 
 define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
-; AVX512F-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %r8
-; AVX512F-NEXT:    vmovq %xmm1, %r9
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %r10
-; AVX512F-NEXT:    vmovq %xmm1, %rsi
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rdi
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512F-NEXT:    vmovq %xmm0, %rdx
-; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %r8
-; AVX512VL-NEXT:    vmovq %xmm1, %r9
-; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %r10
-; AVX512VL-NEXT:    vmovq %xmm1, %rsi
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rdi
-; AVX512VL-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512VL-NEXT:    vmovq %xmm0, %rdx
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VBMI-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512VBMI-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VBMI-NEXT:    vzeroupper
-; AVX512VBMI-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,8,16,24,32,40,48,56,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMIVL-NEXT:    vpermi2b %ymm1, %ymm0, %ymm2
-; AVX512VBMIVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm2[0],zero
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
+; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %truncated = trunc <8 x i64> %vec to <8 x i8>
   %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i8> %result
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll
index 82904ef49fa2..2b456dde5240 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll
@@ -1735,99 +1735,14 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc_packus_v8i64_v8i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vmovq %xmm0, %rcx
-; AVX512F-NEXT:    vmovd %ecx, %xmm1
-; AVX512F-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vmovq %xmm2, %rax
-; AVX512F-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512F-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512F-NEXT:    vmovq %xmm2, %rax
-; AVX512F-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512F-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_packus_v8i64_v8i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %xmm0, %rcx
-; AVX512VL-NEXT:    vmovd %ecx, %xmm1
-; AVX512VL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm2, %rax
-; AVX512VL-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512VL-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm2, %rax
-; AVX512VL-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512VL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_packus_v8i64_v8i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_packus_v8i64_v8i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc_packus_v8i64_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
   %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
   %3 = icmp sgt <8 x i64> %2, zeroinitializer
@@ -3022,7 +2937,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
 ; AVX512VL-NEXT:    vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
@@ -3041,7 +2956,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
 ; AVX512BWVL-NEXT:    vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll
index 11ed156316ab..d25d52549b4b 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll
@@ -1633,95 +1633,13 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<8 x i64> %a0) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc_ssat_v8i64_v8i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vmovq %xmm0, %rcx
-; AVX512F-NEXT:    vmovd %ecx, %xmm1
-; AVX512F-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vmovq %xmm2, %rax
-; AVX512F-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512F-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512F-NEXT:    vmovq %xmm2, %rax
-; AVX512F-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512F-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_ssat_v8i64_v8i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %xmm0, %rcx
-; AVX512VL-NEXT:    vmovd %ecx, %xmm1
-; AVX512VL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm2, %rax
-; AVX512VL-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512VL-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm2, %rax
-; AVX512VL-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512VL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_ssat_v8i64_v8i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_ssat_v8i64_v8i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc_ssat_v8i64_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = icmp slt <8 x i64> %a0, <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
   %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
   %3 = icmp sgt <8 x i64> %2, <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
@@ -2975,7 +2893,7 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
@@ -2993,7 +2911,7 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) {
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %1 = icmp slt <8 x i32> %a0, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll
index 3dfa844ff829..9dd97b33069d 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll
@@ -1216,91 +1216,12 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc_usat_v8i64_v8i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vmovq %xmm0, %rcx
-; AVX512F-NEXT:    vmovd %ecx, %xmm1
-; AVX512F-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vmovq %xmm2, %rax
-; AVX512F-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512F-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512F-NEXT:    vmovq %xmm2, %rax
-; AVX512F-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512F-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_usat_v8i64_v8i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %xmm0, %rcx
-; AVX512VL-NEXT:    vmovd %ecx, %xmm1
-; AVX512VL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm2, %rax
-; AVX512VL-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512VL-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm2, %rax
-; AVX512VL-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512VL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_usat_v8i64_v8i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_usat_v8i64_v8i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc_usat_v8i64_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = icmp ult <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
   %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
   %3 = trunc <8 x i64> %2 to <8 x i8>
@@ -2110,7 +2031,7 @@ define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) {
 ; AVX512VL-LABEL: trunc_usat_v8i32_v8i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
@@ -2125,7 +2046,7 @@ define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) {
 ; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8:
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %1 = icmp ult <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
diff --git a/llvm/test/CodeGen/X86/vector-trunc-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-widen.ll
index e05bdf753737..a15b2aa6779e 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-widen.ll
@@ -1562,15 +1562,41 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
-; AVX512-LABEL: trunc2x4i64_8i16:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: trunc2x4i64_8i16:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc2x4i64_8i16:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT:    vpmovqw %ymm1, %xmm1
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc2x4i64_8i16:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT:    vpmovqw %zmm1, %xmm1
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc2x4i64_8i16:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vpmovqw %ymm1, %xmm1
+; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
 entry:
   %0 = trunc <4 x i64> %a to <4 x i16>
   %1 = trunc <4 x i64> %b to <4 x i16>
@@ -1584,71 +1610,15 @@ define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) {
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: trunc2x2i64_4i32:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT:    retq
+; AVX-LABEL: trunc2x2i64_4i32:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: trunc2x2i64_4i32:
-; AVX2-SLOW:       # %bb.0: # %entry
-; AVX2-SLOW-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: trunc2x2i64_4i32:
-; AVX2-FAST:       # %bb.0: # %entry
-; AVX2-FAST-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-LABEL: trunc2x2i64_4i32:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc2x2i64_4i32:
-; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT:    vpmovqd %ymm1, %xmm1
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc2x2i64_4i32:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc2x2i64_4i32:
-; AVX512BWVL:       # %bb.0: # %entry
-; AVX512BWVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512BWVL-NEXT:    vpmovqd %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vpmovqd %ymm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc2x2i64_4i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512-NEXT:    retq
 entry:
   %0 = trunc <2 x i64> %a to <2 x i32>
   %1 = trunc <2 x i64> %b to <2 x i32>
@@ -1663,60 +1633,17 @@ define i64 @trunc2i64_i64(<2 x i64> %inval) {
 ; SSE-NEXT:    movq %xmm0, %rax
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: trunc2i64_i64:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    retq
+; AVX-LABEL: trunc2i64_i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: trunc2i64_i64:
-; AVX2-SLOW:       # %bb.0: # %entry
-; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT:    vmovq %xmm0, %rax
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: trunc2i64_i64:
-; AVX2-FAST:       # %bb.0: # %entry
-; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT:    vmovq %xmm0, %rax
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-LABEL: trunc2i64_i64:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc2i64_i64:
-; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc2i64_i64:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc2i64_i64:
-; AVX512BWVL:       # %bb.0: # %entry
-; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512BWVL-NEXT:    vpmovqd %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc2i64_i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    retq
 entry:
   %0 = trunc <2 x i64> %inval to <2 x i32>
   %1 = bitcast <2 x i32> %0 to i64
@@ -1751,61 +1678,21 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: trunc2x4i32_8i16:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    retq
+; AVX-LABEL: trunc2x4i32_8i16:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
 ;
-; AVX2-LABEL: trunc2x4i32_8i16:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: trunc2x4i32_8i16:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc2x4i32_8i16:
-; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT:    vpmovdw %ymm1, %xmm1
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc2x4i32_8i16:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovdw %zmm1, %ymm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc2x4i32_8i16:
-; AVX512BWVL:       # %bb.0: # %entry
-; AVX512BWVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vpmovdw %ymm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc2x4i32_8i16:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT:    retq
 entry:
   %0 = trunc <4 x i32> %a to <4 x i16>
   %1 = trunc <4 x i32> %b to <4 x i16>
@@ -1841,37 +1728,11 @@ define i64 @trunc4i32_i64(<4 x i32> %inval) {
 ; AVX-NEXT:    vmovq %xmm0, %rax
 ; AVX-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc4i32_i64:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc4i32_i64:
-; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc4i32_i64:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc4i32_i64:
-; AVX512BWVL:       # %bb.0: # %entry
-; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc4i32_i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    retq
 entry:
   %0 = trunc <4 x i32> %inval to <4 x i16>
   %1 = bitcast <4 x i16> %0 to i64
@@ -1906,54 +1767,18 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
 ; AVX-LABEL: trunc2x8i16_16i8:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc2x8i16_16i8:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc2x8i16_16i8:
-; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc2x8i16_16i8:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc2x8i16_16i8:
-; AVX512BWVL:       # %bb.0: # %entry
-; AVX512BWVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vpmovwb %ymm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc2x8i16_16i8:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT:    retq
 entry:
   %0 = trunc <8 x i16> %a to <8 x i8>
   %1 = trunc <8 x i16> %b to <8 x i8>
@@ -1988,39 +1813,11 @@ define i64 @trunc8i16_i64(<8 x i16> %inval) {
 ; AVX-NEXT:    vmovq %xmm0, %rax
 ; AVX-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc8i16_i64:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc8i16_i64:
-; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc8i16_i64:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc8i16_i64:
-; AVX512BWVL:       # %bb.0: # %entry
-; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc8i16_i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    retq
 entry:
   %0 = trunc <8 x i16> %inval to <8 x i8>
   %1 = bitcast <8 x i8> %0 to i64