[X86] fold (and (mul x, c1), c2) -> (mul x, (and c1, c2)) iff c2 is all/no bits mask

Noticed on D128216 - if we're zeroing out vector elements of a mul/mulh result, see if we can merge the and-mask into the mul by simply multiplying the masked-off elements by zero.

Ideally we'd make this generic (similar to the existing foldSelectWithIdentityConstant?), but these cases appear very late, after the constants have been lowered to constant-pool loads.
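As a quick sanity check of the per-lane identity the fold relies on, here is a standalone C++ sketch (not part of the patch; mulLo16/mulHiU16 are illustrative models of one i16 MUL/MULHU lane, not LLVM or SSE APIs). Multiplying by (and c1, c2) gives the same lane as masking the multiply result, because each c2 lane is either all bits or no bits:

  #include <cassert>
  #include <cstdint>

  // Low and unsigned-high halves of a 16x16->32 multiply, one vector lane.
  static uint16_t mulLo16(uint16_t x, uint16_t c) {
    return uint16_t((uint32_t(x) * c) & 0xFFFF);
  }
  static uint16_t mulHiU16(uint16_t x, uint16_t c) {
    return uint16_t((uint32_t(x) * c) >> 16);
  }

  int main() {
    // c2 lanes are restricted to all-bits or no-bits masks.
    const uint16_t Masks[] = {0x0000, 0xFFFF};
    for (uint32_t x = 0; x <= 0xFFFF; x += 251)     // sample the input space
      for (uint32_t c1 = 0; c1 <= 0xFFFF; c1 += 257)
        for (uint16_t c2 : Masks) {
          // (and (mul x, c1), c2) == (mul x, (and c1, c2))
          assert((mulLo16(x, c1) & c2) == mulLo16(x, c1 & c2));
          assert((mulHiU16(x, c1) & c2) == mulHiU16(x, c1 & c2));
        }
    return 0;
  }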
Author: Simon Pilgrim
Date:   2022-06-21 15:08:39 +01:00
commit ac4cb1775b
parent beb8580544
6 changed files with 32 additions and 36 deletions

@@ -47651,6 +47651,20 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
     return R;
 
+  // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
+  // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
+  // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
+  if (VT.isVector() && getTargetConstantFromNode(N1)) {
+    unsigned Opc0 = N0.getOpcode();
+    if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
+        getTargetConstantFromNode(N0.getOperand(1)) &&
+        DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
+        N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
+      SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
+      return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
+    }
+  }
+
   // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
   // avoids slow variable shift (moving shift amount to ECX etc.)
   if (isOneConstant(N1) && N0->hasOneUse()) {
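One subtlety in the guard above: DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() is what restricts c2 to all/no-bits masks - a lane has as many sign bits as its width only when every bit matches the sign bit, i.e. the lane is 0x0000 or 0xFFFF for i16. A minimal scalar model of that property (illustrative C++, not the LLVM implementation):

  #include <cassert>
  #include <cstdint>

  // Counts how many leading bits of a 16-bit value equal its sign bit,
  // mirroring what ComputeNumSignBits reports for one vector lane.
  static unsigned numSignBits16(uint16_t V) {
    unsigned N = 1;
    unsigned Sign = (V >> 15) & 1;
    while (N < 16 && (((V >> (15 - N)) & 1) == Sign))
      ++N;
    return N;
  }

  int main() {
    for (uint32_t V = 0; V <= 0xFFFF; ++V) {
      bool AllOrNoBits = (V == 0x0000 || V == 0xFFFF);
      // 16 sign bits <=> the lane is an all/no-bits mask.
      assert((numSignBits16(uint16_t(V)) == 16) == AllOrNoBits);
    }
    return 0;
  }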

@@ -510,11 +510,9 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT: paddw %xmm1, %xmm0
 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,0]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: combine_vec_udiv_nonuniform:
@@ -697,12 +695,10 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE2-NEXT: paddw %xmm1, %xmm2
 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm2, %xmm1
 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
 ; SSE2-NEXT: por %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
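Each of the test deltas in this commit follows the same shape: the old SSE2 lowering built a select-with-zero blend around the multiply (movdqa+pandn for the pass-through lanes, pand+por to merge in the masked mulh), while the new lowering folds the mask into the pmulhuw constant, so the pand and the extra movdqa drop out. A scalar sketch of the per-lane equivalence (mulHiU16 again models one pmulhuw lane; an illustration, not SSE intrinsics):

  #include <cassert>
  #include <cstdint>

  // One pmulhuw lane: unsigned high 16 bits of a 16x16 multiply.
  static uint16_t mulHiU16(uint16_t x, uint16_t c) {
    return uint16_t((uint32_t(x) * c) >> 16);
  }

  int main() {
    const uint16_t c1 = 0x8421; // arbitrary multiplier lane
    const uint16_t Masks[] = {0x0000, 0xFFFF};
    for (uint32_t xi = 0; xi <= 0xFFFF; ++xi) {
      uint16_t x = uint16_t(xi);
      for (uint16_t mask : Masks) {
        // Old: blend the masked mulh with the pass-through lanes
        // (pand+pandn+por around pmulhuw).
        uint16_t oldLane = (mulHiU16(x, c1) & mask) | (x & uint16_t(~mask));
        // New: mask folded into the multiplier constant (pandn+por only).
        uint16_t newLane = mulHiU16(x, c1 & mask) | (x & uint16_t(~mask));
        assert(oldLane == newLane);
      }
    }
    return 0;
  }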

@@ -1759,14 +1759,12 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; SSE2-LABEL: constant_funnnel_v8i16:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
 ; SSE2-NEXT: psllw $1, %xmm0
 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
 ; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: constant_funnnel_v8i16:
@@ -1853,14 +1851,12 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; X86-SSE2-LABEL: constant_funnnel_v8i16:
 ; X86-SSE2: # %bb.0:
 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
 ; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: pand %xmm1, %xmm2
 ; X86-SSE2-NEXT: psllw $1, %xmm0
 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT: por %xmm3, %xmm0
 ; X86-SSE2-NEXT: por %xmm2, %xmm0
+; X86-SSE2-NEXT: por %xmm1, %xmm0
 ; X86-SSE2-NEXT: retl
   %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
   ret <8 x i16> %res

@@ -812,11 +812,9 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
 ; SSE2-LABEL: test_v4i16_v4i8:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE2-NEXT: paddw %xmm0, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm0

@@ -1166,11 +1166,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v8i16:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: constant_shift_v8i16:
@@ -1220,11 +1218,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; X86-SSE-LABEL: constant_shift_v8i16:
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: pandn %xmm0, %xmm2
+; X86-SSE-NEXT: pandn %xmm0, %xmm1
 ; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT: pand %xmm1, %xmm0
-; X86-SSE-NEXT: por %xmm2, %xmm0
+; X86-SSE-NEXT: por %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
   %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
   ret <8 x i16> %shift

@@ -1483,11 +1483,9 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v4i16:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: constant_shift_v4i16:
@@ -1537,11 +1535,9 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
 ; X86-SSE-LABEL: constant_shift_v4i16:
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: pandn %xmm0, %xmm2
+; X86-SSE-NEXT: pandn %xmm0, %xmm1
 ; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT: pand %xmm1, %xmm0
-; X86-SSE-NEXT: por %xmm2, %xmm0
+; X86-SSE-NEXT: por %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
   %shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
   ret <4 x i16> %shift