[X86][SSE] Add computeKnownBits/ComputeNumSignBits support for PACKSS/PACKUS instructions.

Pull out the getPackDemandedElts demanded-elts remapping helper from SimplifyDemandedVectorEltsForTargetNode and use it in computeKnownBitsForTargetNode/ComputeNumSignBitsForTargetNode as well.

llvm-svn: 347303
This commit is contained in:
Simon Pilgrim 2018-11-20 13:23:37 +00:00
parent ed7e2fda18
commit ee8b96f253
3 changed files with 72 additions and 66 deletions

View File

@ -5932,6 +5932,31 @@ static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
}
}
// Distribute the demanded elements of a PACKSS/PACKUS result between the two
// source operands of the pack.
//
// VT is the result type and supplies the number of 128-bit lanes;
// DemandedElts carries one bit per result element. Within every lane the
// first half of the result elements is taken from the LHS operand and the
// second half from the RHS operand, so each operand contributes half as many
// elements as the result holds. On return DemandedLHS and DemandedRHS are
// masks of that halved width with a bit set for every source element that
// feeds a demanded result element.
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
                                APInt &DemandedLHS, APInt &DemandedRHS) {
  const int LaneCount = VT.getSizeInBits() / 128;
  const int DstElts = DemandedElts.getBitWidth();
  const int SrcElts = DstElts / 2;
  const int DstEltsPerLane = DstElts / LaneCount;
  const int SrcEltsPerLane = SrcElts / LaneCount;

  // Start from "nothing demanded" for both operands.
  DemandedLHS = APInt::getNullValue(SrcElts);
  DemandedRHS = APInt::getNullValue(SrcElts);

  // Walk the lanes, tracking the first result/source element of each lane
  // as running offsets.
  int DstBase = 0;
  int SrcBase = 0;
  for (int Lane = 0; Lane != LaneCount; ++Lane) {
    for (int Off = 0; Off != SrcEltsPerLane; ++Off) {
      const int LoDstIdx = DstBase + Off;
      const int HiDstIdx = LoDstIdx + SrcEltsPerLane;
      const int SrcIdx = SrcBase + Off;

      // Low half of the lane comes from the LHS operand.
      if (DemandedElts[LoDstIdx])
        DemandedLHS.setBit(SrcIdx);
      // High half of the lane comes from the RHS operand.
      if (DemandedElts[HiDstIdx])
        DemandedRHS.setBit(SrcIdx);
    }
    DstBase += DstEltsPerLane;
    SrcBase += SrcEltsPerLane;
  }
}
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
@ -29938,12 +29963,24 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
case X86ISD::PACKUS: {
// PACKUS is just a truncation if the upper half is zero.
// TODO: Add DemandedElts support.
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
Known.One = APInt::getAllOnesValue(BitWidth * 2);
Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
KnownBits Known2;
DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
DAG.computeKnownBits(Op.getOperand(1), Known2, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
if (!!DemandedLHS) {
DAG.computeKnownBits(Op.getOperand(0), Known2, DemandedLHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
if (!!DemandedRHS) {
DAG.computeKnownBits(Op.getOperand(1), Known2, DemandedRHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
if (Known.countMinLeadingZeros() < BitWidth)
Known.resetAll();
Known = Known.trunc(BitWidth);
@ -30039,10 +30076,16 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
case X86ISD::PACKSS: {
// PACKSS is just a truncation if the sign bits extend to the packed size.
// TODO: Add DemandedElts support.
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
DemandedRHS);
unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
if (!!DemandedLHS)
Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
if (!!DemandedRHS)
Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
unsigned Tmp = std::min(Tmp0, Tmp1);
if (Tmp > (SrcBits - VTBits))
return Tmp - (SrcBits - VTBits);
@ -32226,24 +32269,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
int NumLanes = VT.getSizeInBits() / 128;
int NumInnerElts = NumElts / 2;
int NumEltsPerLane = NumElts / NumLanes;
int NumInnerEltsPerLane = NumInnerElts / NumLanes;
// Map DemandedElts to the packed operands.
APInt DemandedLHS = APInt::getNullValue(NumInnerElts);
APInt DemandedRHS = APInt::getNullValue(NumInnerElts);
for (int Lane = 0; Lane != NumLanes; ++Lane) {
for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
int OuterIdx = (Lane * NumEltsPerLane) + Elt;
int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
if (DemandedElts[OuterIdx])
DemandedLHS.setBit(InnerIdx);
if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
DemandedRHS.setBit(InnerIdx);
}
}
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef,

View File

@ -237,25 +237,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr_zero0(<4 x i64> %x) {
define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero1:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrlq $51, %xmm2
; SSE-NEXT: psrlq $50, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $49, %xmm2
; SSE-NEXT: psrlq $48, %xmm0
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: packusdw %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $27, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrld $25, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $26, %xmm1
; SSE-NEXT: psrld $24, %xmm0
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero1:

View File

@ -1681,17 +1681,17 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
;
; SSE41-LABEL: psubus_8i64_max:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movdqa %xmm0, %xmm10
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991]
; SSE41-NEXT: movdqa %xmm8, %xmm7
; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
; SSE41-NEXT: movdqa %xmm9, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE41-NEXT: pand %xmm9, %xmm5
; SSE41-NEXT: pand %xmm10, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535]
@ -1699,12 +1699,12 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm11
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
; SSE41-NEXT: movdqa %xmm8, %xmm4
; SSE41-NEXT: movdqa %xmm9, %xmm4
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE41-NEXT: pand %xmm9, %xmm5
; SSE41-NEXT: pand %xmm10, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm4
@ -1712,21 +1712,21 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: packusdw %xmm11, %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
; SSE41-NEXT: movdqa %xmm8, %xmm3
; SSE41-NEXT: movdqa %xmm9, %xmm3
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE41-NEXT: pand %xmm9, %xmm5
; SSE41-NEXT: pand %xmm10, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: pxor %xmm1, %xmm6
; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: movdqa %xmm9, %xmm0
; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm8, %xmm6
; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE41-NEXT: pand %xmm2, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
@ -1734,11 +1734,8 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
; SSE41-NEXT: packusdw %xmm3, %xmm7
; SSE41-NEXT: packusdw %xmm4, %xmm7
; SSE41-NEXT: psubusw %xmm7, %xmm10
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
; SSE41-NEXT: packusdw %xmm10, %xmm0
; SSE41-NEXT: psubusw %xmm7, %xmm8
; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: psubus_8i64_max: