[X86] Improve `matchBinaryShuffle()`'s `BLEND` lowering with per-element all-zero/all-ones knowledge

We can use `OR` instead of `BLEND` if either the element we are not picking is zero (or masked away);
or the element we are picking overwhelms (e.g. it's all-ones) whatever the element we are not picking:
https://alive2.llvm.org/ce/z/RKejao

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D109726
This commit is contained in:
Roman Lebedev 2021-09-17 19:07:01 +03:00
parent df1ab7de38
commit 358df06f4e
No known key found for this signature in database
GPG Key ID: 083C3EBB4A1689E0
2 changed files with 64 additions and 37 deletions

View File

@ -36226,12 +36226,58 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
IsBlend = false;
break;
}
if (IsBlend &&
DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
Shuffle = ISD::OR;
SrcVT = DstVT = MaskVT.changeTypeToInteger();
return true;
if (IsBlend) {
if (DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
Shuffle = ISD::OR;
SrcVT = DstVT = MaskVT.changeTypeToInteger();
return true;
}
if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
// FIXME: handle mismatched sizes?
// TODO: investigate if `ISD::OR` handling in
// `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
auto computeKnownBitsElementWise = [&DAG](SDValue V) {
unsigned NumElts = V.getValueType().getVectorNumElements();
KnownBits Known(NumElts);
for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
if (PeepholeKnown.isZero())
Known.Zero.setBit(EltIdx);
if (PeepholeKnown.isAllOnes())
Known.One.setBit(EltIdx);
}
return Known;
};
KnownBits V1Known = computeKnownBitsElementWise(V1);
KnownBits V2Known = computeKnownBitsElementWise(V2);
for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
if (M == SM_SentinelZero) {
IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
continue;
}
if (M == (int)i) {
IsBlend &= V2Known.Zero[i] || V1Known.One[i];
continue;
}
if (M == (int)(i + NumMaskElts)) {
IsBlend &= V1Known.Zero[i] || V2Known.One[i];
continue;
}
llvm_unreachable("will not get here.");
}
if (IsBlend) {
Shuffle = ISD::OR;
SrcVT = DstVT = MaskVT.changeTypeToInteger();
return true;
}
}
}
}

View File

@ -280,11 +280,8 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) {
;
; AVX1-LABEL: insert_v16i16_x12345x789ABCDEx:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
@ -315,26 +312,22 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) {
; SSE2-LABEL: insert_v16i8_x123456789ABCDEx:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movl $255, %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v16i8_x123456789ABCDEx:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: movl $255, %eax
; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: pandn %xmm2, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; SSE3-NEXT: por %xmm1, %xmm2
; SSE3-NEXT: por %xmm2, %xmm0
; SSE3-NEXT: retq
;
@ -344,7 +337,7 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) {
; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
@ -372,41 +365,31 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
; SSE2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movl $255, %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE3-NEXT: pand %xmm2, %xmm0
; SSE3-NEXT: movl $255, %eax
; SSE3-NEXT: movd %eax, %xmm3
; SSE3-NEXT: pandn %xmm3, %xmm2
; SSE3-NEXT: por %xmm2, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
; SSE3-NEXT: pand %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm3, %xmm4
; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
; SSE3-NEXT: por %xmm4, %xmm0
; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE3-NEXT: por %xmm4, %xmm2
; SSE3-NEXT: por %xmm2, %xmm0
; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
; SSE3-NEXT: por %xmm4, %xmm3
; SSE3-NEXT: por %xmm3, %xmm1
; SSE3-NEXT: pand %xmm2, %xmm1
; SSE3-NEXT: por %xmm4, %xmm1
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
@ -415,15 +398,13 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSSE3-NEXT: por %xmm0, %xmm2
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[u]
; SSSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0],zero
; SSSE3-NEXT: por %xmm0, %xmm3
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;