[X86][SSE] Add computeKnownBits/ComputeNumSignBits support for PACKSS/PACKUS instructions.

Pull out the getPackDemandedElts demanded-elts remapping helper from computeKnownBitsForTargetNode and use it in computeKnownBits/ComputeNumSignBits.

llvm-svn: 347303
parent ed7e2fda18
commit ee8b96f253
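For readers new to the PACK nodes: within each 128-bit lane, PACKSS/PACKUS narrow the elements of both operands to half their width with signed/unsigned saturation, placing operand 0's results in the low half of the lane and operand 1's in the high half. That per-lane layout is what the new helper below relies on. A minimal scalar sketch of one PACKUSWB-style lane (standalone model, not LLVM code; the element values are made up for illustration):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Standalone model of one 128-bit PACKUS lane: 8 x i16 from each operand are
// clamped to [0, 255] and concatenated as 16 x i8 (LHS low half, RHS high).
static void packusLane(const int16_t Lhs[8], const int16_t Rhs[8],
                       uint8_t Out[16]) {
  for (int I = 0; I != 8; ++I) {
    Out[I] = (uint8_t)std::clamp<int16_t>(Lhs[I], 0, 255);
    Out[I + 8] = (uint8_t)std::clamp<int16_t>(Rhs[I], 0, 255);
  }
}

int main() {
  int16_t Lhs[8] = {-1, 0, 5, 300, 127, 128, 255, 256};
  int16_t Rhs[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint8_t Out[16];
  packusLane(Lhs, Rhs, Out);
  for (uint8_t B : Out)
    printf("%u ", B); // 0 0 5 255 127 128 255 255 1 2 3 4 5 6 7 8
  printf("\n");
}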
@@ -5932,6 +5932,31 @@ static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
   }
 }
 
+// Split the demanded elts of a PACKSS/PACKUS node between its operands.
+static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
+                                APInt &DemandedLHS, APInt &DemandedRHS) {
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumElts = DemandedElts.getBitWidth();
+  int NumInnerElts = NumElts / 2;
+  int NumEltsPerLane = NumElts / NumLanes;
+  int NumInnerEltsPerLane = NumInnerElts / NumLanes;
+
+  DemandedLHS = APInt::getNullValue(NumInnerElts);
+  DemandedRHS = APInt::getNullValue(NumInnerElts);
+
+  // Map DemandedElts to the packed operands.
+  for (int Lane = 0; Lane != NumLanes; ++Lane) {
+    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
+      int OuterIdx = (Lane * NumEltsPerLane) + Elt;
+      int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
+      if (DemandedElts[OuterIdx])
+        DemandedLHS.setBit(InnerIdx);
+      if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
+        DemandedRHS.setBit(InnerIdx);
+    }
+  }
+}
+
 /// Calculates the shuffle mask corresponding to the target-specific opcode.
 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
 /// operands in \p Ops, and returns true.
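The helper above is the heart of the patch: a demanded element of the pack result lives either in the low half of its 128-bit lane (and therefore comes from operand 0) or in the high half (operand 1). Here is a hedged standalone mirror of the same index math, using std::bitset in place of APInt for an assumed 256-bit v32i8 pack of two v16i16 operands; the concrete element indices are just for illustration:

#include <bitset>
#include <cstdio>

// Standalone mirror of getPackDemandedElts' index math (std::bitset instead
// of APInt), for a 256-bit pack: 32 x i8 result from two 16 x i16 operands.
int main() {
  const int NumLanes = 2, NumElts = 32;
  const int NumInnerElts = NumElts / 2;                    // 16 elts per operand
  const int NumEltsPerLane = NumElts / NumLanes;           // 16
  const int NumInnerEltsPerLane = NumInnerElts / NumLanes; // 8

  std::bitset<NumElts> DemandedElts;
  DemandedElts.set(20); // lane 1, low half of the lane  -> comes from LHS
  DemandedElts.set(28); // lane 1, high half of the lane -> comes from RHS

  std::bitset<NumInnerElts> DemandedLHS, DemandedRHS;
  for (int Lane = 0; Lane != NumLanes; ++Lane)
    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
      int OuterIdx = (Lane * NumEltsPerLane) + Elt;
      int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
      if (DemandedElts[OuterIdx])
        DemandedLHS.set(InnerIdx);
      if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
        DemandedRHS.set(InnerIdx);
    }

  // Both demanded output elements map to element 12 of their operand.
  printf("LHS[12]=%d RHS[12]=%d\n", (int)DemandedLHS[12], (int)DemandedRHS[12]);
}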
@@ -29938,12 +29963,24 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   }
   case X86ISD::PACKUS: {
     // PACKUS is just a truncation if the upper half is zero.
-    // TODO: Add DemandedElts support.
+    APInt DemandedLHS, DemandedRHS;
+    getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
+
+    Known.One = APInt::getAllOnesValue(BitWidth * 2);
+    Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
+
     KnownBits Known2;
-    DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
-    DAG.computeKnownBits(Op.getOperand(1), Known2, Depth + 1);
-    Known.One &= Known2.One;
-    Known.Zero &= Known2.Zero;
+    if (!!DemandedLHS) {
+      DAG.computeKnownBits(Op.getOperand(0), Known2, DemandedLHS, Depth + 1);
+      Known.One &= Known2.One;
+      Known.Zero &= Known2.Zero;
+    }
+    if (!!DemandedRHS) {
+      DAG.computeKnownBits(Op.getOperand(1), Known2, DemandedRHS, Depth + 1);
+      Known.One &= Known2.One;
+      Known.Zero &= Known2.Zero;
+    }
+
     if (Known.countMinLeadingZeros() < BitWidth)
       Known.resetAll();
     Known = Known.trunc(BitWidth);
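The PACKUS case now starts Known from the "everything known" state, intersects it with the known bits of whichever operands are actually demanded, and only keeps the result if the top BitWidth bits of the wider source elements are all known zero; otherwise PACKUS may saturate and nothing can be concluded. A scalar sketch of that check on a single 16-to-8-bit element, with made-up masks (standalone model, not LLVM code):

#include <cstdint>
#include <cstdio>

// One 16-bit source element being packed to 8 bits. Zero/One are the bits
// known to be 0/1, in the style of KnownBits.
struct Known16 {
  uint16_t Zero, One;
};

int main() {
  // Assumed inputs: both operands have their upper byte known zero, bit 1
  // known one and bit 0 known zero (e.g. results of a zero-extend + shift).
  Known16 Lhs = {0xFF01, 0x0002};
  Known16 Rhs = {0xFF01, 0x0002};

  // Start from "all bits known" and intersect with each demanded operand,
  // as the X86ISD::PACKUS case does.
  Known16 K = {0xFFFF, 0xFFFF};
  K.Zero &= Lhs.Zero; K.One &= Lhs.One;
  K.Zero &= Rhs.Zero; K.One &= Rhs.One;

  // PACKUS is only a truncation if the top 8 bits are all known zero;
  // otherwise the saturation may fire and everything is reset.
  if ((K.Zero & 0xFF00) == 0xFF00)
    printf("truncation: known-zero=0x%02x known-one=0x%02x\n",
           K.Zero & 0xFFu, K.One & 0xFFu); // 0x01 / 0x02
  else
    printf("upper half not known zero -> resetAll()\n");
}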
@@ -30039,10 +30076,16 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
 
   case X86ISD::PACKSS: {
     // PACKSS is just a truncation if the sign bits extend to the packed size.
-    // TODO: Add DemandedElts support.
+    APInt DemandedLHS, DemandedRHS;
+    getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
+                        DemandedRHS);
+
     unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
-    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
-    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
+    unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
+    if (!!DemandedLHS)
+      Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
+    if (!!DemandedRHS)
+      Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
     unsigned Tmp = std::min(Tmp0, Tmp1);
     if (Tmp > (SrcBits - VTBits))
       return Tmp - (SrcBits - VTBits);
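For PACKSS the analogous condition is on sign bits: each source element has SrcBits bits and is packed down to VTBits, so if every demanded source element has more than SrcBits - VTBits sign bits the saturation cannot change the value and the packed element keeps Tmp - (SrcBits - VTBits) of them. A scalar example with SrcBits = 16 and VTBits = 8 (standalone model, not LLVM code):

#include <cstdint>
#include <cstdio>

// Number of leading bits equal to the sign bit (including the sign bit) of a
// 16-bit value, the scalar analogue of ComputeNumSignBits.
static unsigned numSignBits16(int16_t V) {
  uint16_t U = (uint16_t)V;
  unsigned Sign = (U >> 15) & 1;
  unsigned N = 1;
  while (N < 16 && (((U >> (15 - N)) & 1) == Sign))
    ++N;
  return N;
}

int main() {
  int16_t V = -3;                   // 0xFFFD: 14 sign bits
  unsigned Tmp = numSignBits16(V);
  // SrcBits = 16, VTBits = 8. Since Tmp > 16 - 8, PACKSS cannot saturate V,
  // so packing is a plain truncation and keeps Tmp - (16 - 8) sign bits.
  int8_t Packed = (int8_t)V;        // -3 == 0b11111101: 6 sign bits
  printf("Tmp=%u Packed=%d kept=%u\n", Tmp, Packed, Tmp - 8u);
}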
@@ -32226,24 +32269,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
   }
   case X86ISD::PACKSS:
   case X86ISD::PACKUS: {
-    int NumLanes = VT.getSizeInBits() / 128;
-    int NumInnerElts = NumElts / 2;
-    int NumEltsPerLane = NumElts / NumLanes;
-    int NumInnerEltsPerLane = NumInnerElts / NumLanes;
-
-    // Map DemandedElts to the packed operands.
-    APInt DemandedLHS = APInt::getNullValue(NumInnerElts);
-    APInt DemandedRHS = APInt::getNullValue(NumInnerElts);
-    for (int Lane = 0; Lane != NumLanes; ++Lane) {
-      for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
-        int OuterIdx = (Lane * NumEltsPerLane) + Elt;
-        int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
-        if (DemandedElts[OuterIdx])
-          DemandedLHS.setBit(InnerIdx);
-        if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
-          DemandedRHS.setBit(InnerIdx);
-      }
-    }
+    APInt DemandedLHS, DemandedRHS;
+    getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
 
     APInt SrcUndef, SrcZero;
     if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef,
@@ -237,25 +237,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr_zero0(<4 x i64> %x) {
 define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) {
 ; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero1:
 ; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrlq $51, %xmm2
-; SSE-NEXT: psrlq $50, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrlq $49, %xmm2
-; SSE-NEXT: psrlq $48, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT: packusdw %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $27, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrld $25, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $26, %xmm1
-; SSE-NEXT: psrld $24, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE-NEXT: xorps %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero1:
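The first test diff shows the immediate payoff: the removed SSE sequence shifts each 64-bit element right by 48 to 51, packs the results down to 32-bit elements, and then shifts those right by another 24 to 27, so with PACKUS now understood by computeKnownBits the whole sequence folds to zero (the surviving xorps). A quick scalar check of that arithmetic (assumed shift pairings; a sketch, not the test itself):

#include <cassert>
#include <cstdint>

// After >>48..51 a 64-bit value has at most 16 nonzero bits, so truncating
// to 32 bits and shifting right by another 24..27 always produces zero.
int main() {
  const uint64_t Samples[] = {0, ~0ull, 0x123456789abcdef0ull, 1ull << 63};
  for (uint64_t X : Samples)
    for (unsigned S1 = 48; S1 <= 51; ++S1)
      for (unsigned S2 = 24; S2 <= 27; ++S2)
        assert(((uint32_t)(X >> S1) >> S2) == 0);
  return 0;
}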
@@ -1681,17 +1681,17 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ;
 ; SSE41-LABEL: psubus_8i64_max:
 ; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: movdqa %xmm0, %xmm10
+; SSE41-NEXT: movdqa %xmm0, %xmm8
 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT: movdqa %xmm4, %xmm0
 ; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT: movdqa %xmm8, %xmm7
+; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
+; SSE41-NEXT: movdqa %xmm9, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm9, %xmm5
+; SSE41-NEXT: pand %xmm10, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
 ; SSE41-NEXT: por %xmm5, %xmm0
 ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535]
@@ -1699,12 +1699,12 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm11
 ; SSE41-NEXT: movdqa %xmm3, %xmm0
 ; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm8, %xmm4
+; SSE41-NEXT: movdqa %xmm9, %xmm4
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm9, %xmm5
+; SSE41-NEXT: pand %xmm10, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSE41-NEXT: por %xmm5, %xmm0
 ; SSE41-NEXT: movapd %xmm7, %xmm4
@@ -1712,21 +1712,21 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ; SSE41-NEXT: packusdw %xmm11, %xmm4
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm9, %xmm3
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm9, %xmm5
+; SSE41-NEXT: pand %xmm10, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
 ; SSE41-NEXT: por %xmm5, %xmm0
 ; SSE41-NEXT: movapd %xmm7, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT: pxor %xmm1, %xmm6
-; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
 ; SSE41-NEXT: pand %xmm2, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
@@ -1734,11 +1734,8 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
 ; SSE41-NEXT: packusdw %xmm3, %xmm7
 ; SSE41-NEXT: packusdw %xmm4, %xmm7
-; SSE41-NEXT: psubusw %xmm7, %xmm10
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
-; SSE41-NEXT: packusdw %xmm10, %xmm0
+; SSE41-NEXT: psubusw %xmm7, %xmm8
+; SSE41-NEXT: movdqa %xmm8, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: psubus_8i64_max:
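In the psubus test the substantive change is the last hunk: the old output zero-extended the psubusw result to 32-bit elements and immediately re-packed it with packusdw before returning. With PACKUS now visible to the known-bits analysis, that round trip is apparently recognized as redundant (the zero-extended elements have their upper 16 bits known zero, so the pack is a plain truncation and trunc(zext(x)) == x); the earlier hunks are just the resulting register renumbering. A scalar model of why the round trip is an identity (standalone sketch, not LLVM code):

#include <cstdint>
#include <cstdio>

// psubusw on one lane: unsigned saturating 16-bit subtract.
static uint16_t psubusw(uint16_t A, uint16_t B) {
  return A > B ? (uint16_t)(A - B) : 0;
}

// packusdw on one lane: clamp a 32-bit value to [0, 0xFFFF].
static uint16_t packusdwLane(uint32_t V) {
  return V > 0xFFFF ? 0xFFFF : (uint16_t)V;
}

int main() {
  // Spot check: zero-extending a psubusw result and packing it back down
  // never changes it, so the old pmovzxwd/punpckhwd/packusdw sequence after
  // the subtraction did no work.
  for (uint32_t A = 0; A <= 0xFFFF; A += 0x101)
    for (uint32_t B = 0; B <= 0xFFFF; B += 0xFF) {
      uint16_t Sub = psubusw((uint16_t)A, (uint16_t)B);
      if (packusdwLane((uint32_t)Sub) != Sub)
        return 1; // never reached
    }
  printf("zext + packusdw is the identity on psubusw results\n");
  return 0;
}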