diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6aa42fba4eb0..155df1577f45 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7462,16 +7462,18 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
     }
 
     // Peek through trunc/aext/zext.
-    // TODO: handle elements smaller than VT.
     // TODO: aext shouldn't require SM_SentinelZero padding.
     // TODO: handle shift of scalars.
+    unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
     while (Scl.getOpcode() == ISD::TRUNCATE ||
            Scl.getOpcode() == ISD::ANY_EXTEND ||
            Scl.getOpcode() == ISD::ZERO_EXTEND) {
       Scl = Scl.getOperand(0);
-      if (Scl.getScalarValueSizeInBits() < NumBitsPerElt)
-        return false;
+      if (MinBitsPerElt > Scl.getScalarValueSizeInBits())
+        MinBitsPerElt = Scl.getScalarValueSizeInBits();
     }
+    if ((MinBitsPerElt % 8) != 0)
+      return false;
 
     // Attempt to find the source vector the scalar was extracted from.
     SDValue SrcExtract;
@@ -7486,31 +7488,30 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
 
     SDValue SrcVec = SrcExtract.getOperand(0);
     EVT SrcVT = SrcVec.getValueType();
-    unsigned NumSrcElts = SrcVT.getVectorNumElements();
-    unsigned NumZeros =
-        std::max((NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1, 0);
-
-    if ((NumSrcElts % NumElts) != 0)
+    if (!SrcVT.getScalarType().isByteSized())
       return false;
 
-    unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
-    if (NumSrcElts <= SrcIdx)
-      return false;
+    unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
+    unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
+    unsigned DstByte = DstIdx * NumBytesPerElt;
 
+    // Create 'identity' byte level shuffle mask and then add inserted bytes.
     if (Opcode == ISD::SCALAR_TO_VECTOR) {
       Ops.push_back(SrcVec);
-      Mask.append(NumSrcElts, SM_SentinelUndef);
+      Mask.append(NumSizeInBytes, SM_SentinelUndef);
     } else {
       Ops.push_back(SrcVec);
       Ops.push_back(N.getOperand(0));
-      for (int i = 0; i != (int)NumSrcElts; ++i)
-        Mask.push_back(NumSrcElts + i);
+      for (int i = 0; i != (int)NumSizeInBytes; ++i)
+        Mask.push_back(NumSizeInBytes + i);
     }
 
-    int Scale = NumSrcElts / NumElts;
-    Mask[Scale * DstIdx] = SrcIdx;
-    for (int i = 0; i != (int)NumZeros; ++i)
-      Mask[(Scale * DstIdx) + i + 1] = SM_SentinelZero;
+    unsigned MinBytesPerElts = MinBitsPerElt / 8;
+    MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
+    for (unsigned i = 0; i != MinBytesPerElts; ++i)
+      Mask[DstByte + i] = SrcByte + i;
+    for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
+      Mask[DstByte + i] = SM_SentinelZero;
     return true;
   }
   case X86ISD::PACKSS:
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index 218701250e43..863ab4dee123 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -293,24 +293,19 @@ define <2 x i64> @extract2_i32_zext_insert1_i64_undef(<4 x i32> %x) {
 define <2 x i64> @extract2_i32_zext_insert1_i64_zero(<4 x i32> %x) {
 ; SSE2-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movq %rax, %xmm0
-; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $2, %xmm0, %eax
-; SSE41-NEXT:    movq %rax, %xmm0
-; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vextractps $2, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 2
   %z = zext i32 %e to i64
@@ -386,16 +381,22 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
 }
 
 define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
-; SSE-LABEL: extract0_i16_zext_insert0_i64_zero:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $0, %xmm0, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: extract0_i16_zext_insert0_i64_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: extract0_i16_zext_insert0_i64_zero:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract0_i16_zext_insert0_i64_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; AVX-NEXT:    retq
   %e = extractelement <8 x i16> %x, i32 0
   %z = zext i16 %e to i64
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index 3add65914b58..9fb78491b608 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -21,10 +21,7 @@ define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
 ; SSE41-LABEL: foo:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE41-NEXT:    pextrb $8, %xmm0, %eax
-; SSE41-NEXT:    pextrb $4, %xmm0, %ecx
-; SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
-; SSE41-NEXT:    pinsrb $2, %eax, %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE41-NEXT:    movl $255, %eax
 ; SSE41-NEXT:    pinsrb $3, %eax, %xmm0
 ; SSE41-NEXT:    movd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll
index 085560c1a504..26e07d86bfc3 100644
--- a/llvm/test/CodeGen/X86/extract-concat.ll
+++ b/llvm/test/CodeGen/X86/extract-concat.ll
@@ -24,10 +24,7 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
 ; SSE42-LABEL: foo:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE42-NEXT:    pextrb $8, %xmm0, %eax
-; SSE42-NEXT:    pextrb $4, %xmm0, %ecx
-; SSE42-NEXT:    pinsrb $1, %ecx, %xmm0
-; SSE42-NEXT:    pinsrb $2, %eax, %xmm0
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE42-NEXT:    movl $255, %eax
 ; SSE42-NEXT:    pinsrb $3, %eax, %xmm0
 ; SSE42-NEXT:    movd %xmm0, (%rdi)
@@ -36,10 +33,7 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
 ; AVX-LABEL: foo:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    movl $255, %eax
 ; AVX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, (%rdi)
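The X86ISelLowering.cpp hunks above rework the insert(extract()) matching to operate on bytes: build an identity byte mask over the destination vector, copy MinBitsPerElt/8 bytes of the extracted source element into the inserted element's byte range, and mark the remaining bytes of that element SM_SentinelZero to model the zero-extension. The following is a minimal standalone C++ sketch of that mask construction, illustrative only: the buildInsertMask helper, the local sentinel constants and the driver values are made up for this note and are not LLVM's getFauxShuffleMask API.

#include <algorithm>
#include <cstdio>
#include <vector>

// Local stand-ins for LLVM's shuffle mask sentinels (illustration only).
enum : int { SM_SentinelUndef = -1, SM_SentinelZero = -2 };

// Byte-level mask for insert(zext(extract(Src, SrcIdx)), Dst, DstIdx).
// NumSizeInBytes: destination vector size in bytes.
// NumBytesPerElt: destination element size in bytes.
// SrcByte: first byte of the extracted element inside the source vector.
// MinBytesPerElt: number of bytes actually carried by the scalar.
static std::vector<int> buildInsertMask(unsigned NumSizeInBytes,
                                        unsigned NumBytesPerElt,
                                        unsigned DstIdx, unsigned SrcByte,
                                        unsigned MinBytesPerElt) {
  std::vector<int> Mask;
  // Identity mask referencing the destination operand's bytes
  // (operand 1 of the two-input shuffle, hence the NumSizeInBytes offset).
  for (unsigned i = 0; i != NumSizeInBytes; ++i)
    Mask.push_back(NumSizeInBytes + i);
  unsigned DstByte = DstIdx * NumBytesPerElt;
  MinBytesPerElt = std::min(MinBytesPerElt, NumBytesPerElt);
  for (unsigned i = 0; i != MinBytesPerElt; ++i)
    Mask[DstByte + i] = SrcByte + i;       // copied source bytes
  for (unsigned i = MinBytesPerElt; i < NumBytesPerElt; ++i)
    Mask[DstByte + i] = SM_SentinelZero;   // zero-extension padding
  return Mask;
}

int main() {
  // v2i64 destination (16 bytes): insert at element 1 the zext of an i16
  // taken from byte 0 of the source vector -> 2 copied bytes, 6 zero bytes.
  for (int M : buildInsertMask(16, 8, 1, 0, 2))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}

Under these assumptions the printed mask is "16 ... 23 0 1 -2 -2 -2 -2 -2 -2": element 0 keeps the destination bytes, element 1 takes two source bytes plus zero padding, which is the kind of mask the updated buildvec-extract.ll expectations above lower to a single blend or pshufb instead of the old extract/scalar/shift sequences.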
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index c80ff1e8ee33..6b42178c6719 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3028,40 +3028,109 @@ define void @PR43024() {
 }
 
 define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
-; SSE-LABEL: PR45604:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rsi), %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    movzwl %ax, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movl $11, %eax
-; SSE-NEXT:    pinsrw $2, %eax, %xmm0
-; SSE-NEXT:    pextrw $1, %xmm1, %ecx
-; SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; SSE-NEXT:    pinsrw $6, %eax, %xmm0
-; SSE-NEXT:    pextrw $2, %xmm1, %ecx
-; SSE-NEXT:    movd %ecx, %xmm2
-; SSE-NEXT:    pinsrw $2, %eax, %xmm2
-; SSE-NEXT:    pextrw $3, %xmm1, %ecx
-; SSE-NEXT:    pinsrw $4, %ecx, %xmm2
-; SSE-NEXT:    pinsrw $6, %eax, %xmm2
-; SSE-NEXT:    pextrw $4, %xmm1, %ecx
-; SSE-NEXT:    movd %ecx, %xmm3
-; SSE-NEXT:    pinsrw $2, %eax, %xmm3
-; SSE-NEXT:    pextrw $5, %xmm1, %ecx
-; SSE-NEXT:    pinsrw $4, %ecx, %xmm3
-; SSE-NEXT:    pinsrw $6, %eax, %xmm3
-; SSE-NEXT:    pextrw $6, %xmm1, %ecx
-; SSE-NEXT:    movd %ecx, %xmm4
-; SSE-NEXT:    pinsrw $2, %eax, %xmm4
-; SSE-NEXT:    pextrw $7, %xmm1, %ecx
-; SSE-NEXT:    pinsrw $4, %ecx, %xmm4
-; SSE-NEXT:    pinsrw $6, %eax, %xmm4
-; SSE-NEXT:    movdqa %xmm4, 48(%rdi)
-; SSE-NEXT:    movdqa %xmm3, 32(%rdi)
-; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
-; SSE-NEXT:    movdqa %xmm0, (%rdi)
-; SSE-NEXT:    retq
+; SSE2-LABEL: PR45604:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rsi), %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movl $11, %eax
+; SSE2-NEXT:    pinsrw $2, %eax, %xmm0
+; SSE2-NEXT:    pextrw $1, %xmm1, %ecx
+; SSE2-NEXT:    pinsrw $4, %ecx, %xmm0
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm0
+; SSE2-NEXT:    pextrw $2, %xmm1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    pinsrw $2, %eax, %xmm2
+; SSE2-NEXT:    pextrw $3, %xmm1, %ecx
+; SSE2-NEXT:    pinsrw $4, %ecx, %xmm2
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm2
+; SSE2-NEXT:    pextrw $4, %xmm1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    pinsrw $2, %eax, %xmm3
+; SSE2-NEXT:    pextrw $5, %xmm1, %ecx
+; SSE2-NEXT:    pinsrw $4, %ecx, %xmm3
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm3
+; SSE2-NEXT:    pextrw $6, %xmm1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm4
+; SSE2-NEXT:    pinsrw $2, %eax, %xmm4
+; SSE2-NEXT:    pextrw $7, %xmm1, %ecx
+; SSE2-NEXT:    pinsrw $4, %ecx, %xmm4
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, 48(%rdi)
+; SSE2-NEXT:    movdqa %xmm3, 32(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: PR45604:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa (%rsi), %xmm1
+; SSSE3-NEXT:    movd %xmm1, %eax
+; SSSE3-NEXT:    movzwl %ax, %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    movl $11, %eax
+; SSSE3-NEXT:    pinsrw $2, %eax, %xmm0
+; SSSE3-NEXT:    pextrw $1, %xmm1, %ecx
+; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm0
+; SSSE3-NEXT:    pinsrw $6, %eax, %xmm0
+; SSSE3-NEXT:    pextrw $2, %xmm1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    pinsrw $2, %eax, %xmm2
+; SSSE3-NEXT:    pextrw $3, %xmm1, %ecx
+; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm2
+; SSSE3-NEXT:    pinsrw $6, %eax, %xmm2
+; SSSE3-NEXT:    pextrw $4, %xmm1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm3
+; SSSE3-NEXT:    pinsrw $2, %eax, %xmm3
+; SSSE3-NEXT:    pextrw $5, %xmm1, %ecx
+; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm3
+; SSSE3-NEXT:    pinsrw $6, %eax, %xmm3
+; SSSE3-NEXT:    pextrw $6, %xmm1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm4
+; SSSE3-NEXT:    pinsrw $2, %eax, %xmm4
+; SSSE3-NEXT:    pextrw $7, %xmm1, %ecx
+; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm4
+; SSSE3-NEXT:    pinsrw $6, %eax, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, 48(%rdi)
+; SSSE3-NEXT:    movdqa %xmm3, 32(%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: PR45604:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rsi), %xmm1
+; SSE41-NEXT:    pextrw $2, %xmm1, %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    movl $11, %eax
+; SSE41-NEXT:    pinsrw $2, %eax, %xmm0
+; SSE41-NEXT:    pextrw $3, %xmm1, %ecx
+; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
+; SSE41-NEXT:    pinsrw $6, %eax, %xmm0
+; SSE41-NEXT:    pextrw $4, %xmm1, %ecx
+; SSE41-NEXT:    movd %ecx, %xmm2
+; SSE41-NEXT:    pinsrw $2, %eax, %xmm2
+; SSE41-NEXT:    pextrw $5, %xmm1, %ecx
+; SSE41-NEXT:    pinsrw $4, %ecx, %xmm2
+; SSE41-NEXT:    pinsrw $6, %eax, %xmm2
+; SSE41-NEXT:    pextrw $6, %xmm1, %ecx
+; SSE41-NEXT:    movd %ecx, %xmm3
+; SSE41-NEXT:    pinsrw $2, %eax, %xmm3
+; SSE41-NEXT:    pextrw $7, %xmm1, %ecx
+; SSE41-NEXT:    pinsrw $4, %ecx, %xmm3
+; SSE41-NEXT:    pinsrw $6, %eax, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6,7]
+; SSE41-NEXT:    pinsrw $2, %eax, %xmm4
+; SSE41-NEXT:    pextrw $1, %xmm1, %ecx
+; SSE41-NEXT:    pinsrw $4, %ecx, %xmm4
+; SSE41-NEXT:    pinsrw $6, %eax, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, (%rdi)
+; SSE41-NEXT:    movdqa %xmm3, 48(%rdi)
+; SSE41-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
+; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: PR45604:
 ; AVX1:       # %bb.0: