From fad5bdaf952bdc43fde97a45333356224cf65f87 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Tue, 15 Jan 2019 16:11:05 +0000
Subject: [PATCH] [DAGCombiner] reduce buildvec of zexted extracted element to
 shuffle

The motivating case for this is shown in the first regression test: we
were transferring the value to a scalar register and back rather than
just zero-extending in the vector unit with 'vpmovzxdq'.

That is a special case of the more general pattern handled here. In all
tests, we avoid the vector-scalar-vector moves in favor of vector ops.

We aren't producing optimal shuffle code in some cases though, so the
patch is limited in scope to reduce regressions.

Differential Revision: https://reviews.llvm.org/D56281

llvm-svn: 351198
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  75 +++++
 llvm/test/CodeGen/X86/buildvec-extract.ll     | 316 +++++++++---------
 2 files changed, 236 insertions(+), 155 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b1770920f069..2ffaa9054ff9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16195,6 +16195,78 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
   return Shuffle;
 }
 
+static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) {
+  assert(BV->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
+
+  // First, determine where the build vector is not undef.
+  // TODO: We could extend this to handle zero elements as well as undefs.
+  int NumBVOps = BV->getNumOperands();
+  int ZextElt = -1;
+  for (int i = 0; i != NumBVOps; ++i) {
+    SDValue Op = BV->getOperand(i);
+    if (Op.isUndef())
+      continue;
+    if (ZextElt == -1)
+      ZextElt = i;
+    else
+      return SDValue();
+  }
+  // Bail out if there's no non-undef element.
+  if (ZextElt == -1)
+    return SDValue();
+
+  // The build vector contains some number of undef elements and exactly
+  // one other element. That other element must be a zero-extended scalar
+  // extracted from a vector at a constant index to turn this into a shuffle.
+  // TODO: This could be enhanced to allow ANY_EXTEND as well as ZERO_EXTEND.
+  SDValue Zext = BV->getOperand(ZextElt);
+  if (Zext.getOpcode() != ISD::ZERO_EXTEND || !Zext.hasOneUse() ||
+      Zext.getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      !isa<ConstantSDNode>(Zext.getOperand(0).getOperand(1)))
+    return SDValue();
+
+  // The zero-extend must be a multiple of the source size.
+  SDValue Extract = Zext.getOperand(0);
+  unsigned DestSize = Zext.getValueSizeInBits();
+  unsigned SrcSize = Extract.getValueSizeInBits();
+  if (DestSize % SrcSize != 0)
+    return SDValue();
+
+  // Create a shuffle mask that will combine the extracted element with zeros
+  // and undefs.
+  int ZextRatio = DestSize / SrcSize;
+  int NumMaskElts = NumBVOps * ZextRatio;
+  SmallVector<int, 32> ShufMask(NumMaskElts, -1);
+  for (int i = 0; i != NumMaskElts; ++i) {
+    if (i / ZextRatio == ZextElt) {
+      // The low bits of the (potentially translated) extracted element map to
+      // the source vector. The high bits map to zero. We will use a zero vector
+      // as the 2nd source operand of the shuffle, so use the 1st element of
+      // that vector (mask value is number-of-elements) for the high bits.
+      if (i % ZextRatio == 0)
+        ShufMask[i] = Extract.getConstantOperandVal(1);
+      else
+        ShufMask[i] = NumMaskElts;
+    }
+
+    // Undef elements of the build vector remain undef because we initialize
+    // the shuffle mask with -1.
+  }
+
+  // Turn this into a shuffle with zero if that's legal.
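+  // For example (mirroring the first regression test below): a v2i64
+  // build_vector of (zext (extractelt v4i32 X, 0) to i64) and undef has
+  // ZextRatio = 2 and NumMaskElts = 4, so ShufMask becomes <0, 4, -1, -1>:
+  // element 0 of X, element 0 of the zero vector, then two undef lanes.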
+  EVT VecVT = Extract.getOperand(0).getValueType();
+  if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(ShufMask, VecVT))
+    return SDValue();
+
+  // buildvec undef, ..., (zext (extractelt V, IndexC)), undef... -->
+  // bitcast (shuffle V, ZeroVec, VectorMask)
+  SDLoc DL(BV);
+  SDValue ZeroVec = DAG.getConstant(0, DL, VecVT);
+  SDValue Shuf = DAG.getVectorShuffle(VecVT, DL, Extract.getOperand(0), ZeroVec,
+                                      ShufMask);
+  return DAG.getBitcast(BV->getValueType(0), Shuf);
+}
+
 // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
 // operations. If the types of the vectors we're extracting from allow it,
 // turn this into a vector_shuffle node.
@@ -16206,6 +16278,9 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   if (!isTypeLegal(VT))
     return SDValue();
 
+  if (SDValue V = reduceBuildVecToShuffleWithZero(N, DAG))
+    return V;
+
   // May only combine to shuffle after legalize if shuffle is legal.
   if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT))
     return SDValue();
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index 15386907e24f..267eec4bdab4 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -4,16 +4,20 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=ANY,AVX

 define <2 x i64> @extract0_i32_zext_insert0_i64_undef(<4 x i32> %x) {
-; SSE-LABEL: extract0_i32_zext_insert0_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: extract0_i32_zext_insert0_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract0_i32_zext_insert0_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: extract0_i32_zext_insert0_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX-NEXT: retq
   %e = extractelement <4 x i32> %x, i32 0
   %z = zext i32 %e to i64
@@ -40,23 +44,14 @@ define <2 x i64> @extract0_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 }

 define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) {
-; SSE2-LABEL: extract1_i32_zext_insert0_i64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: extract1_i32_zext_insert0_i64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $1, %xmm0, %eax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: extract1_i32_zext_insert0_i64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: extract1_i32_zext_insert0_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vextractps $1, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %e = extractelement <4 x i32> %x, i32 1
   %z = zext i32 %e to i64
@@ -90,23 +85,16 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 }

 define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) {
-; SSE2-LABEL: extract2_i32_zext_insert0_i64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: extract2_i32_zext_insert0_i64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $2, %xmm0, %eax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: extract2_i32_zext_insert0_i64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: extract2_i32_zext_insert0_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vextractps $2, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX-NEXT: retq
   %e = extractelement <4 x i32> %x, i32 2
   %z = zext i32 %e to i64
@@ -140,23 +128,14 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 }

 define <2 x i64> @extract3_i32_zext_insert0_i64_undef(<4 x i32> %x) {
-; SSE2-LABEL: extract3_i32_zext_insert0_i64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: extract3_i32_zext_insert0_i64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $3, %xmm0, %eax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: extract3_i32_zext_insert0_i64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: extract3_i32_zext_insert0_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vextractps $3, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT: retq
   %e = extractelement <4 x i32> %x, i32 3
   %z = zext i32 %e to i64
@@ -190,18 +169,25 @@ define <2 x i64> @extract3_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 }

 define <2 x i64> @extract0_i32_zext_insert1_i64_undef(<4 x i32> %x) {
-; SSE-LABEL: extract0_i32_zext_insert1_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT: retq
+; SSE2-LABEL: extract0_i32_zext_insert1_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract0_i32_zext_insert1_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: extract0_i32_zext_insert1_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX-NEXT: retq
   %e = extractelement <4 x i32> %x, i32 0
   %z = zext i32 %e to i64
@@ -232,24 +218,18 @@ define <2 x i64> @extract0_i32_zext_insert1_i64_zero(<4 x i32> %x) {
 define <2 x i64> @extract1_i32_zext_insert1_i64_undef(<4 x i32> %x) {
 ; SSE2-LABEL: extract1_i32_zext_insert1_i64_undef:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: extract1_i32_zext_insert1_i64_undef:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $1, %xmm0, %eax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: extract1_i32_zext_insert1_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vextractps $1, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX-NEXT: retq
   %e = extractelement <4 x i32> %x, i32 1
   %z = zext i32 %e to i64
@@ -288,24 +268,19 @@ define <2 x i64> @extract1_i32_zext_insert1_i64_zero(<4 x i32> %x) {
 define <2 x i64> @extract2_i32_zext_insert1_i64_undef(<4 x i32> %x) {
 ; SSE2-LABEL: extract2_i32_zext_insert1_i64_undef:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: extract2_i32_zext_insert1_i64_undef:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $2, %xmm0, %eax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: extract2_i32_zext_insert1_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vextractps $2, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX-NEXT: retq
   %e = extractelement <4 x i32> %x, i32 2
   %z = zext i32 %e to i64
@@ -342,26 +317,14 @@ define <2 x i64> @extract2_i32_zext_insert1_i64_zero(<4 x i32> %x) {
 }

 define <2 x i64> @extract3_i32_zext_insert1_i64_undef(<4 x i32> %x) {
-; SSE2-LABEL: extract3_i32_zext_insert1_i64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: extract3_i32_zext_insert1_i64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $3, %xmm0, %eax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE41-NEXT: retq
+; SSE-LABEL: extract3_i32_zext_insert1_i64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: extract3_i32_zext_insert1_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vextractps $3, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %e = extractelement <4 x i32> %x, i32 3
   %z = zext i32 %e to i64
@@ -398,16 +361,21 @@ define <2 x i64> @extract3_i32_zext_insert1_i64_zero(<4 x i32> %x) {
 }

 define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract0_i16_zext_insert0_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $0, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: extract0_i16_zext_insert0_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract0_i16_zext_insert0_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: extract0_i16_zext_insert0_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $0, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX-NEXT: retq
   %e = extractelement <8 x i16> %x, i32 0
   %z = zext i16 %e to i64
@@ -434,16 +402,25 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
 }

 define <2 x i64> @extract1_i16_zext_insert0_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract1_i16_zext_insert0_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: extract1_i16_zext_insert0_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,0,1,4,5,6,7]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract1_i16_zext_insert0_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: extract1_i16_zext_insert0_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX-NEXT: retq
   %e = extractelement <8 x i16> %x, i32 1
   %z = zext i16 %e to i64
@@ -470,16 +447,26 @@ define <2 x i64> @extract1_i16_zext_insert0_i64_zero(<8 x i16> %x) {
 }

 define <2 x i64> @extract2_i16_zext_insert0_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract2_i16_zext_insert0_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: extract2_i16_zext_insert0_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract2_i16_zext_insert0_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: extract2_i16_zext_insert0_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX-NEXT: retq
   %e = extractelement <8 x i16> %x, i32 2
   %z = zext i16 %e to i64
@@ -508,14 +495,12 @@ define <2 x i64> @extract2_i16_zext_insert0_i64_zero(<8 x i16> %x) {
 define <2 x i64> @extract3_i16_zext_insert0_i64_undef(<8 x i16> %x) {
 ; SSE-LABEL: extract3_i16_zext_insert0_i64_undef:
 ; SSE: # %bb.0:
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: psrlq $48, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: extract3_i16_zext_insert0_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %e = extractelement <8 x i16> %x, i32 3
   %z = zext i16 %e to i64
@@ -542,18 +527,24 @@ define <2 x i64> @extract3_i16_zext_insert0_i64_zero(<8 x i16> %x) {
 }

 define <2 x i64> @extract0_i16_zext_insert1_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract0_i16_zext_insert1_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $0, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT: retq
+; SSE2-LABEL: extract0_i16_zext_insert1_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract0_i16_zext_insert1_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: extract0_i16_zext_insert1_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $0, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
 ; AVX-NEXT: retq
   %e = extractelement <8 x i16> %x, i32 0
   %z = zext i16 %e to i64
@@ -582,18 +573,21 @@ define <2 x i64> @extract0_i16_zext_insert1_i64_zero(<8 x i16> %x) {
 }

 define <2 x i64> @extract1_i16_zext_insert1_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract1_i16_zext_insert1_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT: retq
+; SSE2-LABEL: extract1_i16_zext_insert1_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract1_i16_zext_insert1_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: extract1_i16_zext_insert1_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX-NEXT: retq
   %e = extractelement <8 x i16> %x, i32 1
   %z = zext i16 %e to i64
@@ -622,18 +616,24 @@ define <2 x i64> @extract1_i16_zext_insert1_i64_zero(<8 x i16> %x) {
 }

 define <2 x i64> @extract2_i16_zext_insert1_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract2_i16_zext_insert1_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT: retq
+; SSE2-LABEL: extract2_i16_zext_insert1_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract2_i16_zext_insert1_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: extract2_i16_zext_insert1_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
 ; AVX-NEXT: retq
   %e = extractelement <8 x i16> %x, i32 2
   %z = zext i16 %e to i64
@@ -662,18 +662,24 @@ define <2 x i64> @extract2_i16_zext_insert1_i64_zero(<8 x i16> %x) {
 }

 define <2 x i64> @extract3_i16_zext_insert1_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract3_i16_zext_insert1_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT: retq
+; SSE2-LABEL: extract3_i16_zext_insert1_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract3_i16_zext_insert1_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: extract3_i16_zext_insert1_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
 ; AVX-NEXT: retq
   %e = extractelement <8 x i16> %x, i32 3
   %z = zext i16 %e to i64