[X86][XOP] Add SimplifyDemandedVectorElts handling for xop shifts

Noticed while investigating how to improve funnel shift codegen
This commit is contained in:
Simon Pilgrim 2022-01-12 12:17:06 +00:00
parent 13362abf3d
commit c2426fdcae
3 changed files with 20 additions and 6 deletions

View File

@ -40082,6 +40082,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
break;
}
case X86ISD::VPSHA:
case X86ISD::VPSHL: {
APInt LHSUndef, LHSZero;
APInt RHSUndef, RHSZero;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
Depth + 1))
return true;
if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
Depth + 1))
return true;
KnownZero = LHSZero;
break;
}
case X86ISD::KSHIFTL: {
SDValue Src = Op.getOperand(0);
auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));

View File

@ -675,10 +675,11 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
;
; XOP-LABEL: combine_vec_udiv_nonuniform4:
; XOP: # %bb.0:
; XOP-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15]
; XOP-NEXT: movl $171, %eax
; XOP-NEXT: vmovd %eax, %xmm1
; XOP-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; XOP-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsrlw $8, %xmm1, %xmm1
; XOP-NEXT: movl $249, %eax
; XOP-NEXT: vmovd %eax, %xmm2
; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1

View File

@ -4,7 +4,6 @@
define <16 x i8> @demandedelts_vpshab(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: demandedelts_vpshab:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT: vpshab %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm0
@ -18,7 +17,6 @@ define <16 x i8> @demandedelts_vpshab(<16 x i8> %a0, <16 x i8> %a1) {
define <4 x i32> @demandedelts_vpshld(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: demandedelts_vpshld:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; CHECK-NEXT: vpshld %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: retq