From acbc5ede9916a22b06341647d94e5dff51af32a2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 26 Apr 2020 15:31:01 +0100
Subject: [PATCH] [X86][SSE] getFauxShuffle - support
 insert(truncate/extend(extract(vec0,c0)),vec1,c1) shuffle patterns at the
 byte level

Followup to the PR45604 fix at rGe71dd7c011a3 where we disabled most of
these cases.

By creating the shuffle at the byte level we can handle any
extension/truncation as long as we track how small the scalar got and
assume that the upper bytes will need to be zero.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  36 ++---
 llvm/test/CodeGen/X86/buildvec-extract.ll     |  35 ++---
 llvm/test/CodeGen/X86/buildvec-insertvec.ll   |   5 +-
 llvm/test/CodeGen/X86/extract-concat.ll       |  10 +-
 .../CodeGen/X86/vector-shuffle-combining.ll   | 137 +++++++++++++-----
 5 files changed, 142 insertions(+), 81 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6aa42fba4eb0..155df1577f45 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7462,16 +7462,18 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
     }
 
     // Peek through trunc/aext/zext.
-    // TODO: handle elements smaller than VT.
     // TODO: aext shouldn't require SM_SentinelZero padding.
     // TODO: handle shift of scalars.
+    unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
     while (Scl.getOpcode() == ISD::TRUNCATE ||
            Scl.getOpcode() == ISD::ANY_EXTEND ||
            Scl.getOpcode() == ISD::ZERO_EXTEND) {
       Scl = Scl.getOperand(0);
-      if (Scl.getScalarValueSizeInBits() < NumBitsPerElt)
-        return false;
+      if (MinBitsPerElt > Scl.getScalarValueSizeInBits())
+        MinBitsPerElt = Scl.getScalarValueSizeInBits();
     }
+    if ((MinBitsPerElt % 8) != 0)
+      return false;
 
     // Attempt to find the source vector the scalar was extracted from.
     SDValue SrcExtract;
@@ -7486,31 +7488,29 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
 
     SDValue SrcVec = SrcExtract.getOperand(0);
     EVT SrcVT = SrcVec.getValueType();
-    unsigned NumSrcElts = SrcVT.getVectorNumElements();
-    unsigned NumZeros =
-        std::max((NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1, 0);
-
-    if ((NumSrcElts % NumElts) != 0)
+    if (!SrcVT.getScalarType().isByteSized())
       return false;
-
     unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
-    if (NumSrcElts <= SrcIdx)
-      return false;
+    unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
+    unsigned DstByte = DstIdx * NumBytesPerElt;
 
+    // Create 'identity' byte level shuffle mask and then add inserted bytes.
     if (Opcode == ISD::SCALAR_TO_VECTOR) {
       Ops.push_back(SrcVec);
-      Mask.append(NumSrcElts, SM_SentinelUndef);
+      Mask.append(NumSizeInBytes, SM_SentinelUndef);
     } else {
       Ops.push_back(SrcVec);
       Ops.push_back(N.getOperand(0));
-      for (int i = 0; i != (int)NumSrcElts; ++i)
-        Mask.push_back(NumSrcElts + i);
+      for (int i = 0; i != (int)NumSizeInBytes; ++i)
+        Mask.push_back(NumSizeInBytes + i);
     }
 
-    int Scale = NumSrcElts / NumElts;
-    Mask[Scale * DstIdx] = SrcIdx;
-    for (int i = 0; i != (int)NumZeros; ++i)
-      Mask[(Scale * DstIdx) + i + 1] = SM_SentinelZero;
+    unsigned MinBytesPerElts = MinBitsPerElt / 8;
+    MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
+    for (unsigned i = 0; i != MinBytesPerElts; ++i)
+      Mask[DstByte + i] = SrcByte + i;
+    for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
+      Mask[DstByte + i] = SM_SentinelZero;
     return true;
   }
   case X86ISD::PACKSS:
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index 218701250e43..863ab4dee123 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -293,24 +293,19 @@ define <2 x i64> @extract2_i32_zext_insert1_i64_undef(<4 x i32> %x) {
 define <2 x i64> @extract2_i32_zext_insert1_i64_zero(<4 x i32> %x) {
 ; SSE2-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movq %rax, %xmm0
-; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $2, %xmm0, %eax
-; SSE41-NEXT:    movq %rax, %xmm0
-; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vextractps $2, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 2
   %z = zext i32 %e to i64
@@ -386,16 +381,22 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
 }
 
 define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
-; SSE-LABEL: extract0_i16_zext_insert0_i64_zero:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $0, %xmm0, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: extract0_i16_zext_insert0_i64_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: extract0_i16_zext_insert0_i64_zero:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract0_i16_zext_insert0_i64_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; AVX-NEXT:    retq
   %e = extractelement <8 x i16> %x, i32 0
   %z = zext i16 %e to i64
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index 3add65914b58..9fb78491b608 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -21,10 +21,7 @@ define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
 ; SSE41-LABEL: foo:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE41-NEXT:    pextrb $8, %xmm0, %eax
-; SSE41-NEXT:    pextrb $4, %xmm0, %ecx
-; SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
-; SSE41-NEXT:    pinsrb $2, %eax, %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE41-NEXT:    movl $255, %eax
 ; SSE41-NEXT:    pinsrb $3, %eax, %xmm0
 ; SSE41-NEXT:    movd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll
index 085560c1a504..26e07d86bfc3 100644
--- a/llvm/test/CodeGen/X86/extract-concat.ll
+++ b/llvm/test/CodeGen/X86/extract-concat.ll
@@ -24,10 +24,7 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
 ; SSE42-LABEL: foo:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE42-NEXT:    pextrb $8, %xmm0, %eax
-; SSE42-NEXT:    pextrb $4, %xmm0, %ecx
-; SSE42-NEXT:    pinsrb $1, %ecx, %xmm0
-; SSE42-NEXT:    pinsrb $2, %eax, %xmm0
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE42-NEXT:    movl $255, %eax
 ; SSE42-NEXT:    pinsrb $3, %eax, %xmm0
 ; SSE42-NEXT:    movd %xmm0, (%rdi)
@@ -36,10 +33,7 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
 ; AVX-LABEL: foo:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    movl $255, %eax
 ; AVX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index c80ff1e8ee33..6b42178c6719 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3028,40 +3028,109 @@ define void @PR43024() {
 }
 
 define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
-; SSE-LABEL: PR45604:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rsi), %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    movzwl %ax, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movl $11, %eax
-; SSE-NEXT:    pinsrw $2, %eax, %xmm0
-; SSE-NEXT:    pextrw $1, %xmm1, %ecx
-; SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; SSE-NEXT:    pinsrw $6, %eax, %xmm0
-; SSE-NEXT:    pextrw $2, %xmm1, %ecx
-; SSE-NEXT:    movd %ecx, %xmm2
-; SSE-NEXT:    pinsrw $2, %eax, %xmm2
-; SSE-NEXT:    pextrw $3, %xmm1, %ecx
-; SSE-NEXT:    pinsrw $4, %ecx, %xmm2
-; SSE-NEXT:    pinsrw $6, %eax, %xmm2
-; SSE-NEXT:    pextrw $4, %xmm1, %ecx
-; SSE-NEXT:    movd %ecx, %xmm3
-; SSE-NEXT:    pinsrw $2, %eax, %xmm3
-; SSE-NEXT:    pextrw $5, %xmm1, %ecx
-; SSE-NEXT:    pinsrw $4, %ecx, %xmm3
-; SSE-NEXT:    pinsrw $6, %eax, %xmm3
-; SSE-NEXT:    pextrw $6, %xmm1, %ecx
-; SSE-NEXT:    movd %ecx, %xmm4
-; SSE-NEXT:    pinsrw $2, %eax, %xmm4
-; SSE-NEXT:    pextrw $7, %xmm1, %ecx
-; SSE-NEXT:    pinsrw $4, %ecx, %xmm4
-; SSE-NEXT:    pinsrw $6, %eax, %xmm4
-; SSE-NEXT:    movdqa %xmm4, 48(%rdi)
-; SSE-NEXT:    movdqa %xmm3, 32(%rdi)
-; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
-; SSE-NEXT:    movdqa %xmm0, (%rdi)
-; SSE-NEXT:    retq
+; SSE2-LABEL: PR45604:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rsi), %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movl $11, %eax
+; SSE2-NEXT:    pinsrw $2, %eax, %xmm0
+; SSE2-NEXT:    pextrw $1, %xmm1, %ecx
+; SSE2-NEXT:    pinsrw $4, %ecx, %xmm0
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm0
+; SSE2-NEXT:    pextrw $2, %xmm1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    pinsrw $2, %eax, %xmm2
+; SSE2-NEXT:    pextrw $3, %xmm1, %ecx
+; SSE2-NEXT:    pinsrw $4, %ecx, %xmm2
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm2
+; SSE2-NEXT:    pextrw $4, %xmm1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    pinsrw $2, %eax, %xmm3
+; SSE2-NEXT:    pextrw $5, %xmm1, %ecx
+; SSE2-NEXT:    pinsrw $4, %ecx, %xmm3
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm3
+; SSE2-NEXT:    pextrw $6, %xmm1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm4
+; SSE2-NEXT:    pinsrw $2, %eax, %xmm4
+; SSE2-NEXT:    pextrw $7, %xmm1, %ecx
+; SSE2-NEXT:    pinsrw $4, %ecx, %xmm4
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, 48(%rdi)
+; SSE2-NEXT:    movdqa %xmm3, 32(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: PR45604:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa (%rsi), %xmm1
+; SSSE3-NEXT:    movd %xmm1, %eax
+; SSSE3-NEXT:    movzwl %ax, %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    movl $11, %eax
+; SSSE3-NEXT:    pinsrw $2, %eax, %xmm0
+; SSSE3-NEXT:    pextrw $1, %xmm1, %ecx
+; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm0
+; SSSE3-NEXT:    pinsrw $6, %eax, %xmm0
+; SSSE3-NEXT:    pextrw $2, %xmm1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    pinsrw $2, %eax, %xmm2
+; SSSE3-NEXT:    pextrw $3, %xmm1, %ecx
+; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm2
+; SSSE3-NEXT:    pinsrw $6, %eax, %xmm2
+; SSSE3-NEXT:    pextrw $4, %xmm1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm3
+; SSSE3-NEXT:    pinsrw $2, %eax, %xmm3
+; SSSE3-NEXT:    pextrw $5, %xmm1, %ecx
+; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm3
+; SSSE3-NEXT:    pinsrw $6, %eax, %xmm3
+; SSSE3-NEXT:    pextrw $6, %xmm1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm4
+; SSSE3-NEXT:    pinsrw $2, %eax, %xmm4
+; SSSE3-NEXT:    pextrw $7, %xmm1, %ecx
+; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm4
+; SSSE3-NEXT:    pinsrw $6, %eax, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, 48(%rdi)
+; SSSE3-NEXT:    movdqa %xmm3, 32(%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: PR45604:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rsi), %xmm1
+; SSE41-NEXT:    pextrw $2, %xmm1, %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    movl $11, %eax
+; SSE41-NEXT:    pinsrw $2, %eax, %xmm0
+; SSE41-NEXT:    pextrw $3, %xmm1, %ecx
+; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
+; SSE41-NEXT:    pinsrw $6, %eax, %xmm0
+; SSE41-NEXT:    pextrw $4, %xmm1, %ecx
+; SSE41-NEXT:    movd %ecx, %xmm2
+; SSE41-NEXT:    pinsrw $2, %eax, %xmm2
+; SSE41-NEXT:    pextrw $5, %xmm1, %ecx
+; SSE41-NEXT:    pinsrw $4, %ecx, %xmm2
+; SSE41-NEXT:    pinsrw $6, %eax, %xmm2
+; SSE41-NEXT:    pextrw $6, %xmm1, %ecx
+; SSE41-NEXT:    movd %ecx, %xmm3
+; SSE41-NEXT:    pinsrw $2, %eax, %xmm3
+; SSE41-NEXT:    pextrw $7, %xmm1, %ecx
+; SSE41-NEXT:    pinsrw $4, %ecx, %xmm3
+; SSE41-NEXT:    pinsrw $6, %eax, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6,7]
+; SSE41-NEXT:    pinsrw $2, %eax, %xmm4
+; SSE41-NEXT:    pextrw $1, %xmm1, %ecx
+; SSE41-NEXT:    pinsrw $4, %ecx, %xmm4
+; SSE41-NEXT:    pinsrw $6, %eax, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, (%rdi)
+; SSE41-NEXT:    movdqa %xmm3, 48(%rdi)
+; SSE41-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
+; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: PR45604:
 ; AVX1:       # %bb.0:
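
For reference, the mask construction the patch performs can be shown in isolation. The sketch below is not part of the patch: buildByteInsertMask, its parameters and the small driver are illustrative names only, while the sentinel values mirror SM_SentinelUndef/SM_SentinelZero from X86ISelLowering.cpp. It builds the per-byte mask for insert(truncate/extend(extract(vec0,c0)),vec1,c1): an identity mask over the destination vector, the surviving low bytes of the extracted scalar copied into the destination element, and the remaining bytes of that element forced to zero.

// Standalone sketch (not part of the patch) of the byte-level mask that
// getFauxShuffleMask now builds for insert(trunc/ext(extract(vec0,c0)),vec1,c1).
#include <algorithm>
#include <cstdio>
#include <vector>

static constexpr int SM_SentinelUndef = -1; // byte value is don't-care
static constexpr int SM_SentinelZero = -2;  // byte must be zero

// All widths are in bytes except MinBitsPerElt, which tracks how narrow the
// scalar became while peeking through the truncate/extend chain.
std::vector<int> buildByteInsertMask(unsigned NumSizeInBytes,  // whole vector
                                     unsigned NumBytesPerElt,  // dest element
                                     unsigned SrcBytesPerElt,  // source element
                                     unsigned DstIdx, unsigned SrcIdx,
                                     unsigned MinBitsPerElt, bool BaseIsUndef) {
  // 'Identity' mask: either all-undef (SCALAR_TO_VECTOR) or the bytes of the
  // insertion base, which the patch models as operand 1 (offset NumSizeInBytes).
  std::vector<int> Mask(NumSizeInBytes);
  for (unsigned i = 0; i != NumSizeInBytes; ++i)
    Mask[i] = BaseIsUndef ? SM_SentinelUndef : (int)(NumSizeInBytes + i);

  unsigned SrcByte = SrcIdx * SrcBytesPerElt;
  unsigned DstByte = DstIdx * NumBytesPerElt;

  // Copy the low bytes that survived the truncate/extend chain...
  unsigned MinBytesPerElts = std::min(MinBitsPerElt / 8, NumBytesPerElt);
  for (unsigned i = 0; i != MinBytesPerElts; ++i)
    Mask[DstByte + i] = (int)(SrcByte + i);
  // ...and assume the remaining bytes of the destination element are zero.
  for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
    Mask[DstByte + i] = SM_SentinelZero;
  return Mask;
}

int main() {
  // Corresponds to extract2_i32_zext_insert1_i64_zero above: element 2 of a
  // v4i32 is zero-extended to i64 and inserted at element 1 of a v2i64.
  std::vector<int> M = buildByteInsertMask(/*NumSizeInBytes=*/16,
                                           /*NumBytesPerElt=*/8,
                                           /*SrcBytesPerElt=*/4,
                                           /*DstIdx=*/1, /*SrcIdx=*/2,
                                           /*MinBitsPerElt=*/32,
                                           /*BaseIsUndef=*/false);
  for (int B : M)
    printf("%d ", B); // 16..23 for the untouched low i64, then 8 9 10 11 -2 -2 -2 -2
  printf("\n");
  return 0;
}

In that test the insertion base is all zeroes, so the combiner can fold this mask down to the andps/blendps sequences seen in the updated SSE2/SSE41/AVX checks.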