[X86] Improve `matchBinaryShuffle()`'s `BLEND` lowering with per-element all-zero/all-ones knowledge
We can use `OR` instead of `BLEND` if either the element we are not picking is zero (or masked away), or the element we are picking overwhelms (e.g. it is all-ones) whatever the element we are not picking: https://alive2.llvm.org/ce/z/RKejao

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D109726
parent df1ab7de38
commit 358df06f4e
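The identity behind the change: per lane, a blend picks either `a[i]` or `b[i]`; if the lane that is not picked is known zero, or the lane that is picked is known all-ones, the pick equals `a[i] | b[i]`. A minimal standalone C++ sketch of this lane-wise equivalence (illustrative values only, not LLVM code):

#include <cassert>
#include <cstdint>

// blend: per lane, pick b[i] where PickB[i] is true, else a[i].
// The OR rewrite is legal in a lane when the element we are NOT
// picking is zero, or the element we ARE picking is all-ones.
int main() {
  // Lane values chosen so each lane satisfies one of the two conditions.
  uint16_t a[4] = {0x1234, 0x0000, 0xFFFF, 0xABCD};
  uint16_t b[4] = {0x0000, 0x5678, 0x4242, 0xFFFF};
  bool PickB[4] = {false, true, false, true};

  for (int i = 0; i != 4; ++i) {
    uint16_t Blend = PickB[i] ? b[i] : a[i];
    uint16_t NotPicked = PickB[i] ? a[i] : b[i];
    // Zero unpicked lane, or all-ones picked lane: OR gives the blend.
    if (NotPicked == 0 || Blend == 0xFFFF)
      assert(Blend == (uint16_t)(a[i] | b[i]));
  }
  return 0;
}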
@@ -36226,12 +36226,58 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
       IsBlend = false;
       break;
     }
-    if (IsBlend &&
-        DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
-        DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
-      Shuffle = ISD::OR;
-      SrcVT = DstVT = MaskVT.changeTypeToInteger();
-      return true;
-    }
+    if (IsBlend) {
+      if (DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
+          DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
+        Shuffle = ISD::OR;
+        SrcVT = DstVT = MaskVT.changeTypeToInteger();
+        return true;
+      }
+      if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
+        // FIXME: handle mismatched sizes?
+        // TODO: investigate if `ISD::OR` handling in
+        // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
+        auto computeKnownBitsElementWise = [&DAG](SDValue V) {
+          unsigned NumElts = V.getValueType().getVectorNumElements();
+          KnownBits Known(NumElts);
+          for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
+            APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
+            KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
+            if (PeepholeKnown.isZero())
+              Known.Zero.setBit(EltIdx);
+            if (PeepholeKnown.isAllOnes())
+              Known.One.setBit(EltIdx);
+          }
+          return Known;
+        };
+
+        KnownBits V1Known = computeKnownBitsElementWise(V1);
+        KnownBits V2Known = computeKnownBitsElementWise(V2);
+
+        for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
+          int M = Mask[i];
+          if (M == SM_SentinelUndef)
+            continue;
+          if (M == SM_SentinelZero) {
+            IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
+            continue;
+          }
+          if (M == (int)i) {
+            IsBlend &= V2Known.Zero[i] || V1Known.One[i];
+            continue;
+          }
+          if (M == (int)(i + NumMaskElts)) {
+            IsBlend &= V1Known.Zero[i] || V2Known.One[i];
+            continue;
+          }
+          llvm_unreachable("will not get here.");
+        }
+        if (IsBlend) {
+          Shuffle = ISD::OR;
+          SrcVT = DstVT = MaskVT.changeTypeToInteger();
+          return true;
+        }
+      }
+    }
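A standalone sketch of the per-element legality check added above, using plain C++ containers and hypothetical `kSentinelUndef`/`kSentinelZero` stand-ins for `SM_SentinelUndef`/`SM_SentinelZero` (this mirrors, but is not, the LLVM implementation):

#include <vector>

// Illustrative stand-ins for LLVM's shuffle mask sentinels.
constexpr int kSentinelUndef = -1; // lane result is undefined
constexpr int kSentinelZero = -2;  // lane result must be zero

// Per-lane knowledge: lane is known all-zero / known all-ones.
struct EltKnown {
  std::vector<bool> Zero, One;
};

// Returns true if the blend described by Mask over two NumElts-wide
// operands can be lowered to a plain OR, mirroring the patch's loop:
//  - a zeroed lane needs both inputs known zero there;
//  - a lane taken from V1 needs V2 zero there, or V1 all-ones there;
//  - a lane taken from V2 needs V1 zero there, or V2 all-ones there.
bool canLowerBlendToOr(const std::vector<int> &Mask, int NumElts,
                       const EltKnown &V1Known, const EltKnown &V2Known) {
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == kSentinelUndef)
      continue;
    if (M == kSentinelZero) {
      if (!(V1Known.Zero[i] && V2Known.Zero[i]))
        return false;
      continue;
    }
    if (M == i) { // picking V1's lane i
      if (!(V2Known.Zero[i] || V1Known.One[i]))
        return false;
      continue;
    }
    // Otherwise picking V2's lane (M == i + NumElts in the patch).
    if (!(V1Known.Zero[i] || V2Known.One[i]))
      return false;
  }
  return true;
}

int main() {
  // One lane, taken from V2; legal because V1 is known zero there.
  EltKnown V1{{true}, {false}}, V2{{false}, {false}};
  std::vector<int> Mask = {1}; // NumElts = 1, lane 0 from V2
  return canLowerBlendToOr(Mask, 1, V1, V2) ? 0 : 1;
}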
@@ -280,11 +280,8 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) {
;
; AVX1-LABEL: insert_v16i16_x12345x789ABCDEx:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
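Reading the `x` positions of the test name as the lanes that receive the all-ones element (0, 6 and 15), the inserts can be folded into OR with a constant mask, which is what the improved lowering exploits. A scalar C++ sketch of that equivalence (illustrative, not generated code):

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  // Inserting all-ones into lanes 0, 6 and 15 of a 16-lane vector is
  // the same as ORing with a constant that is all-ones in those lanes.
  uint16_t v[16], expected[16];
  for (int i = 0; i != 16; ++i)
    v[i] = expected[i] = (uint16_t)(i * 0x1111);
  for (int lane : {0, 6, 15})
    expected[lane] = 0xFFFF; // the "insert -1" result

  for (int i = 0; i != 16; ++i) {
    uint16_t mask = (i == 0 || i == 6 || i == 15) ? 0xFFFF : 0x0000;
    assert((uint16_t)(v[i] | mask) == expected[i]); // OR reproduces the inserts
  }
  return 0;
}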
@@ -315,26 +312,22 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) {
; SSE2-LABEL: insert_v16i8_x123456789ABCDEx:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    movl $255, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    pandn %xmm2, %xmm1
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: insert_v16i8_x123456789ABCDEx:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    movl $255, %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    pandn %xmm2, %xmm1
; SSE3-NEXT:    por %xmm1, %xmm0
; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; SSE3-NEXT:    por %xmm1, %xmm2
; SSE3-NEXT:    por %xmm2, %xmm0
; SSE3-NEXT:    retq
;
@@ -344,7 +337,7 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) {
; SSSE3-NEXT:    movd %eax, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
; SSSE3-NEXT:    por %xmm2, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
@@ -372,41 +365,31 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
; SSE2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    movl $255, %eax
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    pandn %xmm3, %xmm2
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    retq
;
; SSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE3-NEXT:    pand %xmm2, %xmm0
; SSE3-NEXT:    movl $255, %eax
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    pandn %xmm3, %xmm2
; SSE3-NEXT:    por %xmm2, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
; SSE3-NEXT:    pand %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm3, %xmm4
; SSE3-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
; SSE3-NEXT:    por %xmm4, %xmm0
; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE3-NEXT:    por %xmm4, %xmm2
; SSE3-NEXT:    por %xmm2, %xmm0
; SSE3-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
; SSE3-NEXT:    por %xmm4, %xmm3
; SSE3-NEXT:    por %xmm3, %xmm1
; SSE3-NEXT:    pand %xmm2, %xmm1
; SSE3-NEXT:    por %xmm4, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
@@ -415,15 +398,13 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
; SSSE3-NEXT:    movd %eax, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSSE3-NEXT:    por %xmm0, %xmm2
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[u]
; SSSE3-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0],zero
; SSSE3-NEXT:    por %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
; SSSE3-NEXT:    por %xmm0, %xmm1
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;