[X86][SSE] Add SimplifyDemandedVectorElts support for PACKSS/PACKUS instructions.

As discussed on rL347240.

llvm-svn: 347299
This commit is contained in:
Simon Pilgrim 2018-11-20 11:09:46 +00:00
parent 991e316126
commit 7198506ba8
4 changed files with 39 additions and 12 deletions

View File

@ -32232,6 +32232,36 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
break;
}
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
int NumLanes = VT.getSizeInBits() / 128;
int NumInnerElts = NumElts / 2;
int NumEltsPerLane = NumElts / NumLanes;
int NumInnerEltsPerLane = NumInnerElts / NumLanes;
// Map DemandedElts to the packed operands.
APInt DemandedLHS = APInt::getNullValue(NumInnerElts);
APInt DemandedRHS = APInt::getNullValue(NumInnerElts);
for (int Lane = 0; Lane != NumLanes; ++Lane) {
for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
int OuterIdx = (Lane * NumEltsPerLane) + Elt;
int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
if (DemandedElts[OuterIdx])
DemandedLHS.setBit(InnerIdx);
if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
DemandedRHS.setBit(InnerIdx);
}
}
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef,
SrcZero, TLO, Depth + 1))
return true;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef,
SrcZero, TLO, Depth + 1))
return true;
break;
}
case X86ISD::VBROADCAST: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();

View File

@ -677,7 +677,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE41-NEXT: psllw $1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
@ -697,7 +697,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
@ -714,8 +714,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpackuswb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper

View File

@ -350,8 +350,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3
; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
@ -366,8 +365,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpackssdw %xmm0, %xmm2, %xmm2
; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]

View File

@ -37,15 +37,15 @@ define <16 x i8> @blend_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8
define <8 x i16> @blend_packusdw_packuswb(<4 x i32> %a0, <4 x i32> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; SSE41-LABEL: blend_packusdw_packuswb:
; SSE41: # %bb.0:
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: packusdw %xmm0, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: blend_packusdw_packuswb:
; AVX: # %bb.0:
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm3, %xmm2, %xmm1
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm0, %xmm2, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)