[DAGCombine] Basic combines for AVG nodes.

This adds very basic combines for AVG nodes, mostly for constant folding
and handling degenerate (zero) cases. The code performs mostly the same
transforms as visitMULHS, adjusted for AVG nodes.

Constant folding extends the operands to a higher bitwidth, adds them, and
drops the lowest bit of the result. For undef operands, `avg undef, x` is
folded to `x`. There is also a transform turning `avgfloor x, 0` into
`shr x, 1`.

Differential Revision: https://reviews.llvm.org/D119559
Author: David Green
Date:   2022-02-14 11:18:35 +00:00
Parent: a87d3ba61c
Commit: 03380c70ed

4 changed files with 86 additions and 37 deletions
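
For intuition, the scalar semantics the four AVG nodes implement can be
sketched in plain C++ (illustrative reference code, not part of the patch;
the function names are made up):

#include <cstdint>

// AVGFLOORU / AVGFLOORS: (a + b) >> 1, computed one bit wider so the
// addition cannot wrap.
uint8_t avgflooru8(uint8_t a, uint8_t b) {
  return uint8_t((uint16_t(a) + uint16_t(b)) >> 1);
}
int8_t avgfloors8(int8_t a, int8_t b) {
  // Arithmetic right shift on a negative value (guaranteed since C++20).
  return int8_t((int16_t(a) + int16_t(b)) >> 1);
}

// AVGCEILU / AVGCEILS: (a + b + 1) >> 1, rounding up instead of down.
uint8_t avgceilu8(uint8_t a, uint8_t b) {
  return uint8_t((uint16_t(a) + uint16_t(b) + 1) >> 1);
}
int8_t avgceils8(int8_t a, int8_t b) {
  return int8_t((int16_t(a) + int16_t(b) + 1) >> 1);
}

This makes the folds below easy to check: avgfloor(x, 0) is (x + 0) >> 1,
a plain shift (logical for the unsigned node, arithmetic for the signed
one), while avgceil(x, 0) is (x + 1) >> 1 and gets no such fold.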

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

@@ -426,6 +426,7 @@ namespace {
     SDValue visitREM(SDNode *N);
     SDValue visitMULHU(SDNode *N);
     SDValue visitMULHS(SDNode *N);
+    SDValue visitAVG(SDNode *N);
     SDValue visitSMUL_LOHI(SDNode *N);
     SDValue visitUMUL_LOHI(SDNode *N);
     SDValue visitMULO(SDNode *N);
@@ -1635,6 +1636,10 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::UREM:               return visitREM(N);
   case ISD::MULHU:              return visitMULHU(N);
   case ISD::MULHS:              return visitMULHS(N);
+  case ISD::AVGFLOORS:
+  case ISD::AVGFLOORU:
+  case ISD::AVGCEILS:
+  case ISD::AVGCEILU:           return visitAVG(N);
   case ISD::SMUL_LOHI:          return visitSMUL_LOHI(N);
   case ISD::UMUL_LOHI:          return visitUMUL_LOHI(N);
   case ISD::SMULO:
@@ -4654,6 +4659,46 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitAVG(SDNode *N) {
+  unsigned Opcode = N->getOpcode();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // fold (avg c1, c2)
+  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+    return C;
+
+  // canonicalize constant to RHS.
+  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+    return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
+
+  if (VT.isVector()) {
+    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+      return FoldedVOp;
+
+    // fold (avgfloor x, 0) -> x >> 1
+    if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
+      if (Opcode == ISD::AVGFLOORS)
+        return DAG.getNode(ISD::SRA, DL, VT, N0, DAG.getConstant(1, DL, VT));
+      if (Opcode == ISD::AVGFLOORU)
+        return DAG.getNode(ISD::SRL, DL, VT, N0, DAG.getConstant(1, DL, VT));
+    }
+  }
+
+  // fold (avg x, undef) -> x
+  if (N0.isUndef())
+    return N1;
+  if (N1.isUndef())
+    return N0;
+
+  // TODO If we use avg for scalars anywhere, we can add (avgfl x, 0) -> x >> 1
+
+  return SDValue();
+}
+
 /// Perform optimizations common to nodes that compute two values. LoOp and HiOp
 /// give the opcodes for the two computations that are being performed. Return
 /// true if a simplification was made.
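
A throwaway exhaustive check of the shift fold in the vector branch above,
and of why SRA rather than SRL is the right opcode for the signed case (a
sketch assuming the usual arithmetic right shift on negative values, which
C++20 guarantees):

#include <cassert>
#include <cstdint>

int main() {
  for (int v = -128; v <= 127; ++v) {
    int8_t x = int8_t(v);
    // avgfloors(x, 0) computed the wide way: (sext(x) + 0) >> 1.
    int8_t ref = int8_t(int16_t(x) >> 1);
    int8_t sra = int8_t(x >> 1);             // what the AVGFLOORS fold emits
    assert(ref == sra);
    uint8_t srl = uint8_t(uint8_t(x) >> 1);  // what the AVGFLOORU fold emits
    if (x < 0)
      assert(uint8_t(sra) != srl);  // a logical shift is wrong when signed
  }
  return 0;
}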

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

@@ -5274,6 +5274,30 @@ static llvm::Optional<APInt> FoldValue(unsigned Opcode, const APInt &C1,
     APInt C2Ext = C2.zext(FullWidth);
     return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
   }
+  case ISD::AVGFLOORS: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.sext(FullWidth);
+    APInt C2Ext = C2.sext(FullWidth);
+    return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGFLOORU: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.zext(FullWidth);
+    APInt C2Ext = C2.zext(FullWidth);
+    return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGCEILS: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.sext(FullWidth);
+    APInt C2Ext = C2.sext(FullWidth);
+    return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGCEILU: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.zext(FullWidth);
+    APInt C2Ext = C2.zext(FullWidth);
+    return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+  }
   }
   return llvm::None;
 }
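
As a worked restatement of the constant-folding trick above (a sketch
against llvm::APInt; avgCeilU is a hypothetical helper, not an LLVM API):

#include "llvm/ADT/APInt.h"
using llvm::APInt;

// One extra bit means the addition can never lose a carry, and
// extractBits(BitWidth, 1) takes BitWidth bits starting at bit 1, i.e. a
// shift right by one combined with truncation back to the original width.
APInt avgCeilU(const APInt &C1, const APInt &C2) {
  unsigned FullWidth = C1.getBitWidth() + 1;
  APInt C1Ext = C1.zext(FullWidth);
  APInt C2Ext = C2.zext(FullWidth);
  return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
}

For example, with i16 splats of 65534 and 65535 this gives
(65534 + 65535 + 1) >> 1 = 65535, which is why rhaddu_i_const_bothhigh
below now materializes an all-ones vector directly.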

(AArch64 codegen test; file path not shown in this view)

@@ -135,8 +135,7 @@ define <8 x i16> @haddu_i_const_lhs(<8 x i16> %src1) {
 define <8 x i16> @haddu_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_const_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ushr v0.8h, v0.8h, #1
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -145,9 +144,7 @@ define <8 x i16> @haddu_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @haddu_i_const_both() {
 ; CHECK-LABEL: haddu_i_const_both:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.8h, #1
-; CHECK-NEXT: movi v1.8h, #3
-; CHECK-NEXT: uhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v0.8h, #2
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -156,18 +153,16 @@ define <8 x i16> @haddu_i_const_both() {
 define <8 x i16> @haddu_i_const_bothhigh() {
 ; CHECK-LABEL: haddu_i_const_bothhigh:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0xffffffffffffffff
-; CHECK-NEXT: mvni v1.8h, #1
-; CHECK-NEXT: uhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: mvni v0.8h, #1
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>, <8 x i16> <i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @haddu_i_undef(<8 x i16> %src1) {
+define <8 x i16> @haddu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_undef:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: uhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -312,8 +307,7 @@ define <8 x i16> @hadds_i_const_lhs(<8 x i16> %src1) {
 define <8 x i16> @hadds_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_const_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: sshr v0.8h, v0.8h, #1
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -322,9 +316,7 @@ define <8 x i16> @hadds_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @hadds_i_const_both() {
 ; CHECK-LABEL: hadds_i_const_both:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.8h, #1
-; CHECK-NEXT: movi v1.8h, #3
-; CHECK-NEXT: shadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v0.8h, #2
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -334,18 +326,16 @@ define <8 x i16> @hadds_i_const_bothhigh() {
 ; CHECK-LABEL: hadds_i_const_bothhigh:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #32766
-; CHECK-NEXT: mvni v0.8h, #128, lsl #8
-; CHECK-NEXT: dup v1.8h, w8
-; CHECK-NEXT: shadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: dup v0.8h, w8
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>, <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @hadds_i_undef(<8 x i16> %src1) {
+define <8 x i16> @hadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_undef:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: shadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -508,9 +498,7 @@ define <8 x i16> @rhaddu_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @rhaddu_i_const_both() {
 ; CHECK-LABEL: rhaddu_i_const_both:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.8h, #1
-; CHECK-NEXT: movi v1.8h, #3
-; CHECK-NEXT: urhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v0.8h, #2
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -520,17 +508,15 @@ define <8 x i16> @rhaddu_i_const_bothhigh() {
 ; CHECK-LABEL: rhaddu_i_const_bothhigh:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi v0.2d, #0xffffffffffffffff
-; CHECK-NEXT: mvni v1.8h, #1
-; CHECK-NEXT: urhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>, <8 x i16> <i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @rhaddu_i_undef(<8 x i16> %src1) {
+define <8 x i16> @rhaddu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: rhaddu_i_undef:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: urhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -693,9 +679,7 @@ define <8 x i16> @rhadds_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @rhadds_i_const_both() {
 ; CHECK-LABEL: rhadds_i_const_both:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.8h, #1
-; CHECK-NEXT: movi v1.8h, #3
-; CHECK-NEXT: srhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v0.8h, #2
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -704,19 +688,16 @@ define <8 x i16> @rhadds_i_const_both() {
 define <8 x i16> @rhadds_i_const_bothhigh() {
 ; CHECK-LABEL: rhadds_i_const_bothhigh:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #32766
 ; CHECK-NEXT: mvni v0.8h, #128, lsl #8
-; CHECK-NEXT: dup v1.8h, w8
-; CHECK-NEXT: srhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>, <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @rhadds_i_undef(<8 x i16> %src1) {
+define <8 x i16> @rhadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: rhadds_i_undef:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: srhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result

(X86 codegen test; file path not shown in this view)

@@ -7,10 +7,9 @@ define void @f() nounwind {
 ; CHECK-NEXT: calll L0$pb
 ; CHECK-NEXT: L0$pb:
 ; CHECK-NEXT: popl %eax
-; CHECK-NEXT: pxor %xmm0, %xmm0
 ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
 ; CHECK-NEXT: psllw {{\.?LCPI[0-9]+_[0-9]+}}-L0$pb(%eax), %xmm1
-; CHECK-NEXT: pavgw {{\.?LCPI[0-9]+_[0-9]+}}-L0$pb(%eax), %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [21183,21183,21183,21183,21183,21183,21183,21183]
 ; CHECK-NEXT: paddsw %xmm0, %xmm0
 ; CHECK-NEXT: paddw %xmm1, %xmm0
 ; CHECK-NEXT: .p2align 4, 0x90