diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3351f1321c85..0c1f9e6cffae 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31833,6 +31833,83 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
+/// the codegen.
+/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
+static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
+                                          const X86Subtarget &Subtarget,
+                                          SDLoc &DL) {
+  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
+  SDValue Src = N->getOperand(0);
+  unsigned Opcode = Src.getOpcode();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  EVT VT = N->getValueType(0);
+  EVT SrcVT = Src.getValueType();
+
+  auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
+    // TODO: Add extra cases where we can truncate both inputs for the
+    // cost of one (or none).
+    // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
+    if (Op0 == Op1)
+      return true;
+
+    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
+    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
+    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
+           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
+  };
+
+  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
+    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
+    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
+  };
+
+  // Don't combine if the operation has other uses.
+  if (!N->isOnlyUserOf(Src.getNode()))
+    return SDValue();
+
+  // Only support vector truncation for now.
+  // TODO: i64 scalar math would benefit as well.
+  if (!VT.isVector())
+    return SDValue();
+
+  // In most cases it's only worth pre-truncating if we're only facing the cost
+  // of one truncation.
+  // i.e. if one of the inputs will constant fold or the input is repeated.
+  switch (Opcode) {
+  case ISD::AND:
+  case ISD::XOR:
+  case ISD::OR: {
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
+        IsRepeatedOpOrOneUseConstant(Op0, Op1))
+      return TruncateArithmetic(Op0, Op1);
+    break;
+  }
+
+  case ISD::MUL:
+    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) -
+    // it's better to truncate if we have the chance.
+    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
+        !TLI.isOperationLegal(Opcode, SrcVT))
+      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
+    LLVM_FALLTHROUGH;
+  case ISD::ADD: {
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    if (TLI.isOperationLegal(Opcode, VT) &&
+        IsRepeatedOpOrOneUseConstant(Op0, Op1))
+      return TruncateArithmetic(Op0, Op1);
+    break;
+  }
+  }
+
+  return SDValue();
+}
+
 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
 static SDValue
 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
@@ -32019,6 +32096,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
   SDValue Src = N->getOperand(0);
   SDLoc DL(N);
 
+  // Attempt to pre-truncate inputs to arithmetic ops instead.
+  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
+    return V;
+
   // Try to detect AVG pattern first.
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; diff --git a/llvm/test/CodeGen/X86/avx512-any_extend_load.ll b/llvm/test/CodeGen/X86/avx512-any_extend_load.ll index 656b618eff55..87f8cc9a418e 100644 --- a/llvm/test/CodeGen/X86/avx512-any_extend_load.ll +++ b/llvm/test/CodeGen/X86/avx512-any_extend_load.ll @@ -22,10 +22,8 @@ define void @any_extend_load_v8i64(<8 x i8> * %ptr) { define void @any_extend_load_v8i32(<8 x i8> * %ptr) { ; KNL-LABEL: any_extend_load_v8i32: ; KNL: # BB#0: -; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; KNL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; KNL-NEXT: vmovq %xmm0, (%rdi) ; KNL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll index 8898551a9764..da92bdb55d7c 100644 --- a/llvm/test/CodeGen/X86/i64-to-float.ll +++ b/llvm/test/CodeGen/X86/i64-to-float.ll @@ -71,34 +71,32 @@ define <2 x double> @mask_uitofp_2i64_2f64(<2 x i64> %a) nounwind { define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind { ; X32-SSE-LABEL: mask_sitofp_4i64_4f32: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; X32-SSE-NEXT: retl ; ; X32-AVX-LABEL: mask_sitofp_4i64_4f32: ; X32-AVX: # BB#0: -; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0 ; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-AVX-NEXT: vzeroupper ; X32-AVX-NEXT: retl ; ; X64-SSE-LABEL: mask_sitofp_4i64_4f32: ; X64-SSE: # BB#0: -; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1 -; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mask_sitofp_4i64_4f32: ; X64-AVX: # BB#0: -; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq @@ -110,34 +108,32 @@ define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind { define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind { ; X32-SSE-LABEL: mask_uitofp_4i64_4f32: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; X32-SSE-NEXT: retl ; ; X32-AVX-LABEL: mask_uitofp_4i64_4f32: ; X32-AVX: # BB#0: -; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-AVX-NEXT: 
vandps {{\.LCPI.*}}, %xmm0, %xmm0 ; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-AVX-NEXT: vzeroupper ; X32-AVX-NEXT: retl ; ; X64-SSE-LABEL: mask_uitofp_4i64_4f32: ; X64-SSE: # BB#0: -; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1 -; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mask_uitofp_4i64_4f32: ; X64-AVX: # BB#0: -; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll index 7664b25d38ad..f828ed0ba6e7 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -419,40 +419,31 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: paddq %xmm2, %xmm0 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v4i64_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v4i64_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = add <4 x i64> %a0, %2 = trunc <4 x i64> %1 to <4 x i32> @@ -462,52 +453,39 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: paddq %xmm0, %xmm4 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm1 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm2 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: paddw {{.*}}(%rip), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -515,14 +493,14 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; 
AVX512-LABEL: trunc_add_const_v8i64_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = add <8 x i64> %a0, %2 = trunc <8 x i64> %1 to <8 x i16> @@ -532,41 +510,38 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = add <8 x i32> %a0, %2 = trunc <8 x i32> %1 to <8 x i16> @@ -576,17 +551,6 @@ define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v16i64_v16i8: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: paddq %xmm8, %xmm0 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm1 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm2 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm3 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm4 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm5 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm6 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -603,50 +567,37 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v16i64_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm4 -; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = 
zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 -; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 -; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -666,37 +617,35 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_add_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: 
vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = add <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> @@ -706,10 +655,6 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v16i32_v16i8: ; SSE: # BB#0: -; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 -; SSE-NEXT: paddd {{.*}}(%rip), %xmm2 -; SSE-NEXT: paddd {{.*}}(%rip), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -718,31 +663,27 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v16i32_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -752,13 +693,14 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v16i32_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: 
vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = add <16 x i32> %a0, %2 = trunc <16 x i32> %1 to <16 x i8> @@ -768,56 +710,54 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v16i16_v16i8: ; SSE: # BB#0: -; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddw {{.*}}(%rip), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v16i16_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v16i16_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_add_const_v16i16_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = add <16 x i16> %a0, %2 = trunc <16 x i16> %1 to <16 x i8> @@ -1676,69 +1616,39 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX1-LABEL: trunc_mul_v4i64_v4i32: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: 
vpaddq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_mul_v4i64_v4i32: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512F-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512F-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX512F-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v4i64_v4i32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32: @@ -1757,46 +1667,17 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE-LABEL: trunc_mul_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm0, %xmm9 -; SSE-NEXT: paddq %xmm8, %xmm9 -; SSE-NEXT: psllq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm1, %xmm4 -; SSE-NEXT: paddq %xmm8, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm5, %xmm1 -; SSE-NEXT: paddq %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm2, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, 
%xmm5 -; SSE-NEXT: pmuludq %xmm6, %xmm2 -; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm3, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm7, %xmm3 -; SSE-NEXT: paddq %xmm5, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -1808,111 +1689,68 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: pmullw %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_mul_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5 -; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 -; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2 -; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 -; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6 -; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] -; AVX1-NEXT: vpackusdw %xmm0, 
%xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5 -; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5 -; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 -; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4 -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4 -; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_mul_v8i64_v8i16: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512F-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512F-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512F-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512F-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512F-NEXT: vpmovqw 
%zmm0, %xmm0 +; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v8i64_v8i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16: @@ -2186,104 +2024,60 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX2-LABEL: trunc_mul_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm8 -; AVX2-NEXT: vpmuludq %ymm5, %ymm8, %ymm8 -; AVX2-NEXT: vpsrlq $32, %ymm5, %ymm9 -; AVX2-NEXT: vpmuludq %ymm9, %ymm1, %ymm9 -; AVX2-NEXT: vpaddq %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8 -; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm8, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm5 -; AVX2-NEXT: vpmuludq %ymm4, %ymm5, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm4, %ymm8 -; AVX2-NEXT: vpmuludq %ymm8, %ymm0, %ymm8 -; AVX2-NEXT: vpaddq %ymm5, %ymm8, %ymm5 -; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5 -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4 -; AVX2-NEXT: vpmuludq %ymm7, %ymm4, %ymm4 -; AVX2-NEXT: vpsrlq $32, %ymm7, %ymm5 -; AVX2-NEXT: vpmuludq %ymm5, %ymm3, %ymm5 -; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 -; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4 -; AVX2-NEXT: vpmuludq %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpsrlq $32, %ymm6, %ymm5 -; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm5 -; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 -; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vpmulld %xmm7, %xmm3, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpmulld %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] ; AVX2-NEXT: 
vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmulld %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmulld %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_mul_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm4 -; AVX512F-NEXT: vpmuludq %zmm3, %zmm4, %zmm4 -; AVX512F-NEXT: vpsrlq $32, %zmm3, %zmm5 -; AVX512F-NEXT: vpmuludq %zmm5, %zmm1, %zmm5 -; AVX512F-NEXT: vpaddq %zmm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpsllq $32, %zmm4, %zmm4 -; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddq %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm3 -; AVX512F-NEXT: vpmuludq %zmm2, %zmm3, %zmm3 -; AVX512F-NEXT: vpsrlq $32, %zmm2, %zmm4 -; AVX512F-NEXT: vpmuludq %zmm4, %zmm0, %zmm4 -; AVX512F-NEXT: vpaddq %zmm3, %zmm4, %zmm3 -; AVX512F-NEXT: vpsllq $32, %zmm3, %zmm3 -; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm3, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm3, %ymm3 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm4, %zmm4 -; AVX512BW-NEXT: vpsrlq $32, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmuludq %zmm5, %zmm1, %zmm5 -; AVX512BW-NEXT: vpaddq %zmm4, %zmm5, %zmm4 -; AVX512BW-NEXT: vpsllq $32, %zmm4, %zmm4 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddq %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm3, %zmm3 -; AVX512BW-NEXT: vpsrlq $32, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmuludq %zmm4, %zmm0, %zmm4 -; AVX512BW-NEXT: vpaddq %zmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vpsllq $32, %zmm3, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -2479,70 +2273,25 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3] -; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: 
vmovq %rax, %xmm2 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3] -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3] -; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3] -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddq %ymm0, %ymm2, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3] -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = mul <4 x i64> %a0, %2 = trunc <4 x i64> %1 to <4 x i32> ret <4 x i32> %2 @@ -2551,36 +2300,6 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psrlq $32, %xmm0 -; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddq %xmm5, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: pmuludq %xmm4, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm4, %xmm2 -; SSE-NEXT: psllq $32, %xmm2 -; 
SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [6,7] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm4, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: paddq %xmm5, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -2592,64 +2311,28 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3] -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5] -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 -; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7] -; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7] -; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3] -; 
AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -2657,37 +2340,15 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_mul_const_v8i64_v8i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm0 -; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm0, %zmm2, %zmm0 -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_mul_const_v8i64_v8i16: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_mul_const_v8i64_v8i16: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_mul_const_v8i64_v8i16: +; AVX512: # BB#0: +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = mul <8 x i64> %a0, %2 = trunc <8 x i64> %1 to <8 x i16> ret <8 x i16> %2 @@ -2696,55 +2357,38 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pmuludq %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE-NEXT: pmuludq %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pmuludq %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_mul_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_mul_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = mul <8 x i32> %a0, %2 = trunc <8 x i32> %1 to <8 x i16> @@ -2907,34 +2551,12 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX2-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,6,7] -; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm5, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3] -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15] -; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 -; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,10,11] -; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -2943,8 +2565,10 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; 
AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -2955,50 +2579,30 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15] -; AVX512F-NEXT: vpmuludq %zmm2, %zmm1, %zmm3 -; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm1 -; AVX512F-NEXT: vpmuludq %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpsllq $32, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddq %zmm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm3 -; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm0 -; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsllq $32, %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddq %zmm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: retq @@ -3073,15 +2677,15 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX2-LABEL: trunc_mul_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vzeroupper @@ -3547,36 +3151,31 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin define <4 x i32> 
@trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: andps {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v4i64_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v4i64_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = and <4 x i64> %a0, %2 = trunc <4 x i64> %1 to <4 x i32> @@ -3586,30 +3185,23 @@ define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: andpd {{.*}}(%rip), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] @@ -3620,13 
+3212,12 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -3634,14 +3225,14 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v8i64_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = and <8 x i64> %a0, %2 = trunc <8 x i64> %1 to <8 x i16> @@ -3651,40 +3242,38 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = and <8 x i32> %a0, %2 = trunc <8 x i32> %1 to <8 x i16> @@ -3694,41 +3283,27 @@ define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v16i64_v16i8: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm8 = 
zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE-NEXT: pand {{.*}}(%rip), %xmm5 -; SSE-NEXT: pand {{.*}}(%rip), %xmm6 -; SSE-NEXT: pand {{.*}}(%rip), %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 ; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: packuswb %xmm5, %xmm4 ; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v16i64_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 @@ -3749,15 +3324,12 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -3777,37 +3349,35 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_and_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; 
AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = and <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> @@ -3817,10 +3387,6 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v16i32_v16i8: ; SSE: # BB#0: -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE-NEXT: pand {{.*}}(%rip), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -3829,12 +3395,11 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v16i32_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 @@ -3845,13 +3410,12 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -3861,13 +3425,14 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v16i32_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = and <16 x i32> %a0, %2 = trunc <16 x i32> %1 to <16 x i8> @@ -3877,55 +3442,54 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v16i16_v16i8: ; SSE: # BB#0: -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v16i16_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v16i16_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_and_const_v16i16_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = and <16 x i16> %a0, %2 = trunc <16 x i16> %1 to <16 x i8> @@ -4323,36 +3887,31 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: xorps {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = xor <4 x i64> %a0, %2 = trunc <4 x i64> %1 to <4 x i32> @@ -4362,30 +3921,23 @@ define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd 
%rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: pxor %xmm0, %xmm4 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm2 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: xorpd {{.*}}(%rip), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] @@ -4396,13 +3948,12 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -4410,14 +3961,14 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = xor <8 x i64> %a0, %2 = trunc <8 x i64> %1 to <8 x i16> @@ -4427,40 +3978,38 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), 
%xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = xor <8 x i32> %a0, %2 = trunc <8 x i32> %1 to <8 x i16> @@ -4470,17 +4019,6 @@ define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v16i64_v16i8: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: pxor %xmm8, %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm2 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm3 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm4 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm5 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm6 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -4497,14 +4035,11 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 @@ -4525,15 +4060,12 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: 
vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -4553,37 +4085,35 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = xor <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> @@ -4593,10 +4123,6 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v16i32_v16i8: ; SSE: # BB#0: -; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm2 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -4605,12 +4131,11 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 @@ -4621,13 +4146,12 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} 
ymm1 = ymm1[0,2,2,3] @@ -4637,13 +4161,14 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = xor <16 x i32> %a0, %2 = trunc <16 x i32> %1 to <16 x i8> @@ -4653,55 +4178,54 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v16i16_v16i8: ; SSE: # BB#0: -; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = xor <16 x i16> %a0, %2 = trunc <16 x i16> %1 to <16 x i8> @@ -5099,36 +4623,31 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: orps {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: orps {{.*}}(%rip), %xmm0 ; SSE-NEXT: 
retq ; ; AVX1-LABEL: trunc_or_const_v4i64_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v4i64_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = or <4 x i64> %a0, %2 = trunc <4 x i64> %1 to <4 x i32> @@ -5138,30 +4657,23 @@ define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 -; SSE-NEXT: por {{.*}}(%rip), %xmm2 -; SSE-NEXT: por {{.*}}(%rip), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: orpd {{.*}}(%rip), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] @@ -5172,13 +4684,12 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpor 
{{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -5186,14 +4697,14 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v8i64_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = or <8 x i64> %a0, %2 = trunc <8 x i64> %1 to <8 x i16> @@ -5203,40 +4714,38 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = or <8 x i32> %a0, %2 = trunc <8 x i32> %1 to <8 x i16> @@ -5246,17 +4755,6 @@ define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v16i64_v16i8: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 -; SSE-NEXT: por {{.*}}(%rip), %xmm2 -; SSE-NEXT: por {{.*}}(%rip), %xmm3 -; SSE-NEXT: por {{.*}}(%rip), %xmm4 -; SSE-NEXT: por {{.*}}(%rip), %xmm5 -; SSE-NEXT: por {{.*}}(%rip), %xmm6 -; SSE-NEXT: por {{.*}}(%rip), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -5273,14 
+4771,11 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v16i64_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 @@ -5301,15 +4796,12 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -5329,37 +4821,35 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_or_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = or <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> @@ -5369,10 +4859,6 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v16i32_v16i8: ; SSE: # BB#0: -; SSE-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 -; SSE-NEXT: por {{.*}}(%rip), %xmm2 -; SSE-NEXT: por {{.*}}(%rip), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -5381,12 +4867,11 @@ define <16 x i8> 
@trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v16i32_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 @@ -5397,13 +4882,12 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -5413,13 +4897,14 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v16i32_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpord {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = or <16 x i32> %a0, %2 = trunc <16 x i32> %1 to <16 x i8> @@ -5429,55 +4914,54 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v16i16_v16i8: ; SSE: # BB#0: -; SSE-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v16i16_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v16i16_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_or_const_v16i16_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: 
trunc_or_const_v16i16_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = or <16 x i16> %a0, %2 = trunc <16 x i16> %1 to <16 x i8> @@ -5492,99 +4976,65 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi ; SSE-LABEL: mul_add_const_v4i64_v4i32: ; SSE: # BB#0: ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrad $31, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrad $31, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm3, %xmm5 +; SSE-NEXT: pmuludq %xmm2, %xmm5 ; SSE-NEXT: paddq %xmm4, %xmm5 ; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm2 ; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrlq $32, %xmm1 +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 +; SSE-NEXT: paddq %xmm1, %xmm4 ; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pmuludq %xmm3, %xmm0 ; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: mul_add_const_v4i64_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 -; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2] +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: mul_add_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX2-NEXT: vpmovsxdq 
%xmm1, %ymm1 -; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: mul_add_const_v4i64_v4i32: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX512F-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX512F-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_add_const_v4i64_v4i32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX512BW-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX512BW-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: mul_add_const_v4i64_v4i32: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = sext <4 x i32> %a0 to <4 x i64> %2 = sext <4 x i32> %a1 to <4 x i64> @@ -5597,91 +5047,67 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { ; SSE-LABEL: mul_add_self_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: psrad $31, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: psrad $31, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrad $31, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: psrlq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm0, %xmm5 +; SSE-NEXT: paddq %xmm4, %xmm5 +; 
SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: paddq %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: pmuludq %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm3, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm3, %xmm2 -; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 +; SSE-NEXT: paddq %xmm0, %xmm4 ; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm0, %xmm1 -; SSE-NEXT: paddq %xmm4, %xmm1 -; SSE-NEXT: paddq %xmm1, %xmm1 -; SSE-NEXT: paddq %xmm2, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: paddq %xmm4, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] +; SSE-NEXT: paddd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: mul_add_self_v4i64_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 -; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2] +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: mul_add_self_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: mul_add_self_v4i64_v4i32: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX512F-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX512F-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_add_self_v4i64_v4i32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX512BW-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX512BW-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddq %ymm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: mul_add_self_v4i64_v4i32: @@ -5689,9 +5115,8 @@ define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwin ; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxdq %xmm1, %ymm1 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd 
%zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = sext <4 x i32> %a0 to <4 x i64> %2 = sext <4 x i32> %a1 to <4 x i64>