[X86][SSE] getFauxShuffle - support insert(truncate/extend(extract(vec0,c0)),vec1,c1) shuffle patterns at the byte level

Follow-up to the PR45604 fix at rGe71dd7c011a3, where we disabled most of these cases.

By creating the shuffle mask at the byte level we can handle any extension/truncation, as long as we track how small the scalar became and assume that the upper bytes will need to be zero.
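
To illustrate the idea, here is a minimal standalone sketch of the byte-level mask construction (the helper name buildByteInsertMask and the demo in main are invented for illustration; the real logic lives in getFauxShuffleMask below):

#include <algorithm>
#include <cstdio>
#include <vector>

// Stand-in for SM_SentinelZero: "this byte must be zero".
static const int SentinelZero = -2;

// Sketch of the mask built for insert(ext/trunc(extract(Src, SrcIdx)), Base, DstIdx).
// MinBitsPerElt is the narrowest width the scalar passed through on its way to the
// insert, so only MinBitsPerElt/8 bytes are live; the rest of the element is zeroed.
std::vector<int> buildByteInsertMask(unsigned NumSizeInBytes, unsigned NumBytesPerElt,
                                     unsigned SrcBytesPerElt, unsigned SrcIdx,
                                     unsigned DstIdx, unsigned MinBitsPerElt) {
  std::vector<int> Mask(NumSizeInBytes);
  // Identity mask selecting every byte of the base vector (second shuffle operand).
  for (unsigned i = 0; i != NumSizeInBytes; ++i)
    Mask[i] = NumSizeInBytes + i;
  unsigned SrcByte = SrcIdx * SrcBytesPerElt;
  unsigned DstByte = DstIdx * NumBytesPerElt;
  unsigned MinBytesPerElts = std::min(MinBitsPerElt / 8, NumBytesPerElt);
  // Live low bytes come from the extracted element; the upper bytes must be zero.
  for (unsigned i = 0; i != MinBytesPerElts; ++i)
    Mask[DstByte + i] = SrcByte + i;
  for (unsigned i = MinBytesPerElts; i != NumBytesPerElt; ++i)
    Mask[DstByte + i] = SentinelZero;
  return Mask;
}

int main() {
  // insertelement <2 x i64> %base, i64 (zext of i16 extracted from element 3), i64 1:
  // 16 bytes total, 8 bytes per v2i64 element, 2 bytes per v8i16 source element.
  for (int M : buildByteInsertMask(16, 8, 2, /*SrcIdx=*/3, /*DstIdx=*/1, /*MinBits=*/16))
    std::printf("%d ", M); // 16..23 from %base, then 6 7 -2 -2 -2 -2 -2 -2
  std::printf("\n");
}

The '*_zero' test diffs below show the payoff: chains of pextr/pinsr collapse into blends against zero or a single pshufb.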

Commit: acbc5ede99 (parent 33f043cc9f)
Author: Simon Pilgrim
Date:   2020-04-26 15:31:01 +01:00

5 changed files with 142 additions and 81 deletions

@@ -7462,16 +7462,18 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
}
// Peek through trunc/aext/zext.
// TODO: handle elements smaller than VT.
// TODO: aext shouldn't require SM_SentinelZero padding.
// TODO: handle shift of scalars.
unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
while (Scl.getOpcode() == ISD::TRUNCATE ||
Scl.getOpcode() == ISD::ANY_EXTEND ||
Scl.getOpcode() == ISD::ZERO_EXTEND) {
Scl = Scl.getOperand(0);
if (Scl.getScalarValueSizeInBits() < NumBitsPerElt)
return false;
if (MinBitsPerElt > Scl.getScalarValueSizeInBits())
MinBitsPerElt = Scl.getScalarValueSizeInBits();
}
if ((MinBitsPerElt % 8) != 0)
return false;
// Attempt to find the source vector the scalar was extracted from.
SDValue SrcExtract;
@@ -7486,31 +7488,29 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SDValue SrcVec = SrcExtract.getOperand(0);
EVT SrcVT = SrcVec.getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
unsigned NumZeros =
std::max<int>((NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1, 0);
if ((NumSrcElts % NumElts) != 0)
if (!SrcVT.getScalarType().isByteSized())
return false;
unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
if (NumSrcElts <= SrcIdx)
return false;
unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
unsigned DstByte = DstIdx * NumBytesPerElt;
// Create 'identity' byte level shuffle mask and then add inserted bytes.
if (Opcode == ISD::SCALAR_TO_VECTOR) {
Ops.push_back(SrcVec);
Mask.append(NumSrcElts, SM_SentinelUndef);
Mask.append(NumSizeInBytes, SM_SentinelUndef);
} else {
Ops.push_back(SrcVec);
Ops.push_back(N.getOperand(0));
for (int i = 0; i != (int)NumSrcElts; ++i)
Mask.push_back(NumSrcElts + i);
for (int i = 0; i != (int)NumSizeInBytes; ++i)
Mask.push_back(NumSizeInBytes + i);
}
int Scale = NumSrcElts / NumElts;
Mask[Scale * DstIdx] = SrcIdx;
for (int i = 0; i != (int)NumZeros; ++i)
Mask[(Scale * DstIdx) + i + 1] = SM_SentinelZero;
unsigned MinBytesPerElts = MinBitsPerElt / 8;
MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
for (unsigned i = 0; i != MinBytesPerElts; ++i)
Mask[DstByte + i] = SrcByte + i;
for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
Mask[DstByte + i] = SM_SentinelZero;
return true;
}
case X86ISD::PACKSS:
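
As a concrete example of the new bookkeeping, take the extract0_i16_zext_insert0_i64_zero test below: the scalar passes through zext i16 -> i64, so MinBitsPerElt ends up as 16 and MinBytesPerElts as 2. With SrcIdx and DstIdx both 0, bytes 0-1 of the v2i64 result are taken from the v8i16 source, bytes 2-7 become SM_SentinelZero, and bytes 8-15 keep the other insert operand; since the test inserts into a zero vector (the _zero variant), the whole pattern lowers to a single blend against zero on SSE41/AVX.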

@@ -293,24 +293,19 @@ define <2 x i64> @extract2_i32_zext_insert1_i64_undef(<4 x i32> %x) {
define <2 x i64> @extract2_i32_zext_insert1_i64_zero(<4 x i32> %x) {
; SSE2-LABEL: extract2_i32_zext_insert1_i64_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract2_i32_zext_insert1_i64_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: extractps $2, %xmm0, %eax
; SSE41-NEXT: movq %rax, %xmm0
; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: extract2_i32_zext_insert1_i64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vextractps $2, %xmm0, %eax
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 2
%z = zext i32 %e to i64
@@ -386,16 +381,22 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
}
define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE-LABEL: extract0_i16_zext_insert0_i64_zero:
; SSE: # %bb.0:
; SSE-NEXT: pextrw $0, %xmm0, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: retq
; SSE2-LABEL: extract0_i16_zext_insert0_i64_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract0_i16_zext_insert0_i64_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: extract0_i16_zext_insert0_i64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpextrw $0, %xmm0, %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 0
%z = zext i16 %e to i64

@@ -21,10 +21,7 @@ define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
; SSE41-LABEL: foo:
; SSE41: # %bb.0:
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
; SSE41-NEXT: pextrb $8, %xmm0, %eax
; SSE41-NEXT: pextrb $4, %xmm0, %ecx
; SSE41-NEXT: pinsrb $1, %ecx, %xmm0
; SSE41-NEXT: pinsrb $2, %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: movl $255, %eax
; SSE41-NEXT: pinsrb $3, %eax, %xmm0
; SSE41-NEXT: movd %xmm0, (%rdi)

@@ -24,10 +24,7 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
; SSE42-LABEL: foo:
; SSE42: # %bb.0:
; SSE42-NEXT: cvttps2dq %xmm0, %xmm0
; SSE42-NEXT: pextrb $8, %xmm0, %eax
; SSE42-NEXT: pextrb $4, %xmm0, %ecx
; SSE42-NEXT: pinsrb $1, %ecx, %xmm0
; SSE42-NEXT: pinsrb $2, %eax, %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movl $255, %eax
; SSE42-NEXT: pinsrb $3, %eax, %xmm0
; SSE42-NEXT: movd %xmm0, (%rdi)
@@ -36,10 +33,7 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
; AVX-LABEL: foo:
; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vpextrb $8, %xmm0, %eax
; AVX-NEXT: vpextrb $4, %xmm0, %ecx
; AVX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: movl $255, %eax
; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rdi)

@@ -3028,40 +3028,109 @@ define void @PR43024() {
}
define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
; SSE-LABEL: PR45604:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rsi), %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: movzwl %ax, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movl $11, %eax
; SSE-NEXT: pinsrw $2, %eax, %xmm0
; SSE-NEXT: pextrw $1, %xmm1, %ecx
; SSE-NEXT: pinsrw $4, %ecx, %xmm0
; SSE-NEXT: pinsrw $6, %eax, %xmm0
; SSE-NEXT: pextrw $2, %xmm1, %ecx
; SSE-NEXT: movd %ecx, %xmm2
; SSE-NEXT: pinsrw $2, %eax, %xmm2
; SSE-NEXT: pextrw $3, %xmm1, %ecx
; SSE-NEXT: pinsrw $4, %ecx, %xmm2
; SSE-NEXT: pinsrw $6, %eax, %xmm2
; SSE-NEXT: pextrw $4, %xmm1, %ecx
; SSE-NEXT: movd %ecx, %xmm3
; SSE-NEXT: pinsrw $2, %eax, %xmm3
; SSE-NEXT: pextrw $5, %xmm1, %ecx
; SSE-NEXT: pinsrw $4, %ecx, %xmm3
; SSE-NEXT: pinsrw $6, %eax, %xmm3
; SSE-NEXT: pextrw $6, %xmm1, %ecx
; SSE-NEXT: movd %ecx, %xmm4
; SSE-NEXT: pinsrw $2, %eax, %xmm4
; SSE-NEXT: pextrw $7, %xmm1, %ecx
; SSE-NEXT: pinsrw $4, %ecx, %xmm4
; SSE-NEXT: pinsrw $6, %eax, %xmm4
; SSE-NEXT: movdqa %xmm4, 48(%rdi)
; SSE-NEXT: movdqa %xmm3, 32(%rdi)
; SSE-NEXT: movdqa %xmm2, 16(%rdi)
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: retq
; SSE2-LABEL: PR45604:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movl $11, %eax
; SSE2-NEXT: pinsrw $2, %eax, %xmm0
; SSE2-NEXT: pextrw $1, %xmm1, %ecx
; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
; SSE2-NEXT: pinsrw $6, %eax, %xmm0
; SSE2-NEXT: pextrw $2, %xmm1, %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: pinsrw $2, %eax, %xmm2
; SSE2-NEXT: pextrw $3, %xmm1, %ecx
; SSE2-NEXT: pinsrw $4, %ecx, %xmm2
; SSE2-NEXT: pinsrw $6, %eax, %xmm2
; SSE2-NEXT: pextrw $4, %xmm1, %ecx
; SSE2-NEXT: movd %ecx, %xmm3
; SSE2-NEXT: pinsrw $2, %eax, %xmm3
; SSE2-NEXT: pextrw $5, %xmm1, %ecx
; SSE2-NEXT: pinsrw $4, %ecx, %xmm3
; SSE2-NEXT: pinsrw $6, %eax, %xmm3
; SSE2-NEXT: pextrw $6, %xmm1, %ecx
; SSE2-NEXT: movd %ecx, %xmm4
; SSE2-NEXT: pinsrw $2, %eax, %xmm4
; SSE2-NEXT: pextrw $7, %xmm1, %ecx
; SSE2-NEXT: pinsrw $4, %ecx, %xmm4
; SSE2-NEXT: pinsrw $6, %eax, %xmm4
; SSE2-NEXT: movdqa %xmm4, 48(%rdi)
; SSE2-NEXT: movdqa %xmm3, 32(%rdi)
; SSE2-NEXT: movdqa %xmm2, 16(%rdi)
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR45604:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa (%rsi), %xmm1
; SSSE3-NEXT: movd %xmm1, %eax
; SSSE3-NEXT: movzwl %ax, %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: movl $11, %eax
; SSSE3-NEXT: pinsrw $2, %eax, %xmm0
; SSSE3-NEXT: pextrw $1, %xmm1, %ecx
; SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
; SSSE3-NEXT: pextrw $2, %xmm1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: pinsrw $2, %eax, %xmm2
; SSSE3-NEXT: pextrw $3, %xmm1, %ecx
; SSSE3-NEXT: pinsrw $4, %ecx, %xmm2
; SSSE3-NEXT: pinsrw $6, %eax, %xmm2
; SSSE3-NEXT: pextrw $4, %xmm1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm3
; SSSE3-NEXT: pinsrw $2, %eax, %xmm3
; SSSE3-NEXT: pextrw $5, %xmm1, %ecx
; SSSE3-NEXT: pinsrw $4, %ecx, %xmm3
; SSSE3-NEXT: pinsrw $6, %eax, %xmm3
; SSSE3-NEXT: pextrw $6, %xmm1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm4
; SSSE3-NEXT: pinsrw $2, %eax, %xmm4
; SSSE3-NEXT: pextrw $7, %xmm1, %ecx
; SSSE3-NEXT: pinsrw $4, %ecx, %xmm4
; SSSE3-NEXT: pinsrw $6, %eax, %xmm4
; SSSE3-NEXT: movdqa %xmm4, 48(%rdi)
; SSSE3-NEXT: movdqa %xmm3, 32(%rdi)
; SSSE3-NEXT: movdqa %xmm2, 16(%rdi)
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR45604:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rsi), %xmm1
; SSE41-NEXT: pextrw $2, %xmm1, %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: movl $11, %eax
; SSE41-NEXT: pinsrw $2, %eax, %xmm0
; SSE41-NEXT: pextrw $3, %xmm1, %ecx
; SSE41-NEXT: pinsrw $4, %ecx, %xmm0
; SSE41-NEXT: pinsrw $6, %eax, %xmm0
; SSE41-NEXT: pextrw $4, %xmm1, %ecx
; SSE41-NEXT: movd %ecx, %xmm2
; SSE41-NEXT: pinsrw $2, %eax, %xmm2
; SSE41-NEXT: pextrw $5, %xmm1, %ecx
; SSE41-NEXT: pinsrw $4, %ecx, %xmm2
; SSE41-NEXT: pinsrw $6, %eax, %xmm2
; SSE41-NEXT: pextrw $6, %xmm1, %ecx
; SSE41-NEXT: movd %ecx, %xmm3
; SSE41-NEXT: pinsrw $2, %eax, %xmm3
; SSE41-NEXT: pextrw $7, %xmm1, %ecx
; SSE41-NEXT: pinsrw $4, %ecx, %xmm3
; SSE41-NEXT: pinsrw $6, %eax, %xmm3
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6,7]
; SSE41-NEXT: pinsrw $2, %eax, %xmm4
; SSE41-NEXT: pextrw $1, %xmm1, %ecx
; SSE41-NEXT: pinsrw $4, %ecx, %xmm4
; SSE41-NEXT: pinsrw $6, %eax, %xmm4
; SSE41-NEXT: movdqa %xmm4, (%rdi)
; SSE41-NEXT: movdqa %xmm3, 48(%rdi)
; SSE41-NEXT: movdqa %xmm2, 32(%rdi)
; SSE41-NEXT: movdqa %xmm0, 16(%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: PR45604:
; AVX1: # %bb.0: