[X86][SSE] Add SimplifyDemandedVectorElts support for PACKSS/PACKUS instructions.

As discussed on rL347240.

llvm-svn: 347299
parent 991e316126
commit 7198506ba8
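Background on the change: PACKSS/PACKUS saturate and concatenate two integer vectors into one vector of half-width elements, and within each 128-bit lane the first half of the result comes from operand 0 while the second half comes from operand 1. The new SimplifyDemandedVectorEltsForTargetNode case below translates a demanded-elements mask on the pack result into one mask per operand and recurses on each operand, so an operand that contributes no demanded elements can be simplified away, as the codegen test updates further down show.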
@@ -32232,6 +32232,36 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
       return true;
     break;
   }
+  case X86ISD::PACKSS:
+  case X86ISD::PACKUS: {
+    int NumLanes = VT.getSizeInBits() / 128;
+    int NumInnerElts = NumElts / 2;
+    int NumEltsPerLane = NumElts / NumLanes;
+    int NumInnerEltsPerLane = NumInnerElts / NumLanes;
+
+    // Map DemandedElts to the packed operands.
+    APInt DemandedLHS = APInt::getNullValue(NumInnerElts);
+    APInt DemandedRHS = APInt::getNullValue(NumInnerElts);
+    for (int Lane = 0; Lane != NumLanes; ++Lane) {
+      for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
+        int OuterIdx = (Lane * NumEltsPerLane) + Elt;
+        int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
+        if (DemandedElts[OuterIdx])
+          DemandedLHS.setBit(InnerIdx);
+        if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
+          DemandedRHS.setBit(InnerIdx);
+      }
+    }
+
+    APInt SrcUndef, SrcZero;
+    if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef,
+                                   SrcZero, TLO, Depth + 1))
+      return true;
+    if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef,
+                                   SrcZero, TLO, Depth + 1))
+      return true;
+    break;
+  }
   case X86ISD::VBROADCAST: {
     SDValue Src = Op.getOperand(0);
     MVT SrcVT = Src.getSimpleValueType();
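To make the mapping concrete: for a 256-bit PACKUSWB (two v16i16 operands packed to a v32i8 result) NumLanes = 2 and NumInnerEltsPerLane = 8, so result bytes 0-7 and 16-23 come from operand 0 while bytes 8-15 and 24-31 come from operand 1. The sketch below is not part of the patch: packDemandedOps is a hypothetical stand-in that replays the same index arithmetic with std::bitset in place of llvm::APInt, and it compiles standalone.

// Self-contained illustration only; packDemandedOps is a hypothetical helper,
// not LLVM API. Splits the demanded-element mask of a PACK result into masks
// for its two operands, lane by lane.
#include <bitset>
#include <cstdio>
#include <string>

template <size_t NumElts>
void packDemandedOps(const std::bitset<NumElts> &DemandedElts, int NumLanes,
                     std::bitset<NumElts / 2> &DemandedLHS,
                     std::bitset<NumElts / 2> &DemandedRHS) {
  int NumInnerElts = NumElts / 2;          // elements per pack operand
  int NumEltsPerLane = NumElts / NumLanes; // result elts per 128-bit lane
  int NumInnerEltsPerLane = NumInnerElts / NumLanes;
  for (int Lane = 0; Lane != NumLanes; ++Lane)
    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
      int OuterIdx = (Lane * NumEltsPerLane) + Elt;
      int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
      // The first half of each result lane comes from operand 0 ...
      if (DemandedElts[OuterIdx])
        DemandedLHS[InnerIdx] = true;
      // ... and the second half of each result lane from operand 1.
      if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
        DemandedRHS[InnerIdx] = true;
    }
}

int main() {
  // 256-bit PACKUSWB: two v16i16 operands -> one v32i8 result, 2 lanes.
  std::bitset<32> Demanded;
  for (int i = 0; i != 8; ++i)
    Demanded[i] = true; // demand the low 8 bytes of lane 0
  Demanded[20] = true;  // plus one byte from the first half of lane 1
  std::bitset<16> LHS, RHS;
  packDemandedOps<32>(Demanded, /*NumLanes=*/2, LHS, RHS);
  // Prints LHS=0001000011111111 (operand 0 elements 0-7 and 12) and
  // RHS=0000000000000000: operand 1 is never demanded here, so a combiner
  // in this position would be free to replace it with undef.
  std::printf("LHS=%s\nRHS=%s\n", LHS.to_string().c_str(),
              RHS.to_string().c_str());
}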
@@ -677,7 +677,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; SSE41-NEXT: psllw $1, %xmm2
 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
 ; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: packuswb %xmm0, %xmm2
 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
@@ -697,7 +697,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: retq
@@ -714,8 +714,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpackuswb %xmm0, %xmm1, %xmm1
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vzeroupper
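In @combine_vec_udiv_nonuniform4 only byte 0 of the packed, multiplied value survives the final pblendvb, so the high-half operand of the trailing packuswb is no longer demanded: the SSE41 and AVX1 checks merely stop tying up a live register for it, while AVX2 also drops the vextracti128 that used to materialize that operand.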
@@ -350,8 +350,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
 ; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm2
 ; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3
 ; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
@@ -366,8 +365,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
 ; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpackssdw %xmm0, %xmm2, %xmm2
 ; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
 ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
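In @fptoui_2f64_to_4i32 only the low lanes of the vcmpltpd mask are demanded by the vpackssdw that narrows it, so on both AVX1 and AVX2 the vextractf128 of the upper 128-bit half is now dead and the pack's second operand degrades to an arbitrary register.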
@@ -37,15 +37,15 @@ define <16 x i8> @blend_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8
 define <8 x i16> @blend_packusdw_packuswb(<4 x i32> %a0, <4 x i32> %a1, <8 x i16> %a2, <8 x i16> %a3) {
 ; SSE41-LABEL: blend_packusdw_packuswb:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: packusdw %xmm0, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm2
 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: blend_packusdw_packuswb:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm0, %xmm2, %xmm1
 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT: retq
 %p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
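The pack-blend test exercises the combine most directly: punpcklqdq keeps only the low 64 bits of each pack result, which come entirely from the first pack operand, so %xmm1 and %xmm3 drop out of the packusdw/packuswb pair in both the SSE41 and AVX checks.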