From 03380c70ed548224e2fd8280bc92f69d356d258c Mon Sep 17 00:00:00 2001
From: David Green
Date: Mon, 14 Feb 2022 11:18:35 +0000
Subject: [PATCH] [DAGCombine] Basic combines for AVG nodes.

This adds very basic combines for AVG nodes, mostly for constant folding
and handling degenerate (zero) cases. The code performs mostly the same
transforms as visitMULHS, adjusted for AVG nodes. Constant folding is
performed at one bit wider than the element type, dropping the lowest bit
of the widened sum. For undef operands, `avg undef, x` is transformed to
x. There is also a transform from `avgfloor x, 0` to `shr x, 1`.

Differential Revision: https://reviews.llvm.org/D119559
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 45 ++++++++++++++++
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 24 +++++++++
 llvm/test/CodeGen/AArch64/hadd-combine.ll     | 51 ++++++-------------
 llvm/test/CodeGen/X86/pic-load-remat.ll       |  3 +-
 4 files changed, 86 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8646fabb1262..4f42815065b7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -426,6 +426,7 @@ namespace {
     SDValue visitREM(SDNode *N);
     SDValue visitMULHU(SDNode *N);
     SDValue visitMULHS(SDNode *N);
+    SDValue visitAVG(SDNode *N);
     SDValue visitSMUL_LOHI(SDNode *N);
     SDValue visitUMUL_LOHI(SDNode *N);
     SDValue visitMULO(SDNode *N);
@@ -1635,6 +1636,10 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::UREM:               return visitREM(N);
   case ISD::MULHU:              return visitMULHU(N);
   case ISD::MULHS:              return visitMULHS(N);
+  case ISD::AVGFLOORS:
+  case ISD::AVGFLOORU:
+  case ISD::AVGCEILS:
+  case ISD::AVGCEILU:           return visitAVG(N);
   case ISD::SMUL_LOHI:          return visitSMUL_LOHI(N);
   case ISD::UMUL_LOHI:          return visitUMUL_LOHI(N);
   case ISD::SMULO:
@@ -4654,6 +4659,46 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitAVG(SDNode *N) {
+  unsigned Opcode = N->getOpcode();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // fold (avg c1, c2)
+  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+    return C;
+
+  // canonicalize constant to RHS.
+  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+    return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
+
+  if (VT.isVector()) {
+    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+      return FoldedVOp;
+
+    // fold (avgfloor x, 0) -> x >> 1
+    if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
+      if (Opcode == ISD::AVGFLOORS)
+        return DAG.getNode(ISD::SRA, DL, VT, N0, DAG.getConstant(1, DL, VT));
+      if (Opcode == ISD::AVGFLOORU)
+        return DAG.getNode(ISD::SRL, DL, VT, N0, DAG.getConstant(1, DL, VT));
+    }
+  }
+
+  // fold (avg x, undef) -> x
+  if (N0.isUndef())
+    return N1;
+  if (N1.isUndef())
+    return N0;
+
+  // TODO If we use avg for scalars anywhere, we can add (avgfl x, 0) -> x >> 1
+
+  return SDValue();
+}
+
 /// Perform optimizations common to nodes that compute two values. LoOp and HiOp
 /// give the opcodes for the two computations that are being performed. Return
 /// true if a simplification was made.
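As a quick reference for what the new combines compute, here is an editorial
scalar sketch (not part of the patch; the avgflooru/avgceilu/avgfloors/avgceils
helpers below are illustrative stand-ins for 16-bit vector lanes, not LLVM
APIs). Each AVG opcode is an addition performed one bit wider than the element
type, with the low bit of the sum dropped; the CEIL variants add one before the
shift.

#include <cassert>
#include <cstdint>

// Widen to 32 bits so the 16-bit addition cannot overflow, then drop the
// low bit of the sum (FLOOR) or round it up first (CEIL).
static uint16_t avgflooru(uint16_t a, uint16_t b) {
  return static_cast<uint16_t>((uint32_t(a) + uint32_t(b)) >> 1);
}
static uint16_t avgceilu(uint16_t a, uint16_t b) {
  return static_cast<uint16_t>((uint32_t(a) + uint32_t(b) + 1) >> 1);
}
static int16_t avgfloors(int16_t a, int16_t b) {
  // Arithmetic shift of the sign-extended sum (well defined since C++20,
  // arithmetic in practice on mainstream compilers before that).
  return static_cast<int16_t>((int32_t(a) + int32_t(b)) >> 1);
}
static int16_t avgceils(int16_t a, int16_t b) {
  return static_cast<int16_t>((int32_t(a) + int32_t(b) + 1) >> 1);
}

int main() {
  // avgfloor x, 0 is a plain shift right by one, matching the new vector fold.
  assert(avgflooru(200, 0) == 200 >> 1);
  assert(avgfloors(-7, 0) == -7 >> 1);
  // The extra bit of width keeps the intermediate sum from wrapping.
  assert(avgceilu(0xFFFF, 0xFFFE) == 0xFFFF);
  return 0;
}

Widening by a single bit is enough because the sum of two N-bit values always
fits in N+1 bits, which is why the FoldValue cases below also extend by one bit.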
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 9693cede668c..b3a929050d7c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5274,6 +5274,30 @@ static llvm::Optional<APInt> FoldValue(unsigned Opcode, const APInt &C1,
     APInt C2Ext = C2.zext(FullWidth);
     return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
   }
+  case ISD::AVGFLOORS: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.sext(FullWidth);
+    APInt C2Ext = C2.sext(FullWidth);
+    return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGFLOORU: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.zext(FullWidth);
+    APInt C2Ext = C2.zext(FullWidth);
+    return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGCEILS: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.sext(FullWidth);
+    APInt C2Ext = C2.sext(FullWidth);
+    return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGCEILU: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.zext(FullWidth);
+    APInt C2Ext = C2.zext(FullWidth);
+    return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+  }
   }
   return llvm::None;
 }
diff --git a/llvm/test/CodeGen/AArch64/hadd-combine.ll b/llvm/test/CodeGen/AArch64/hadd-combine.ll
index 3de09ebcfa20..6c2809ce2070 100644
--- a/llvm/test/CodeGen/AArch64/hadd-combine.ll
+++ b/llvm/test/CodeGen/AArch64/hadd-combine.ll
@@ -135,8 +135,7 @@ define <8 x i16> @haddu_i_const_lhs(<8 x i16> %src1) {
 define <8 x i16> @haddu_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_const_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #1
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> , <8 x i16> %src1)
   ret <8 x i16> %result
@@ -145,9 +144,7 @@ define <8 x i16> @haddu_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @haddu_i_const_both() {
 ; CHECK-LABEL: haddu_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    uhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
@@ -156,18 +153,16 @@ define <8 x i16> @haddu_i_const_both() {
 define <8 x i16> @haddu_i_const_bothhigh() {
 ; CHECK-LABEL: haddu_i_const_bothhigh:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
-; CHECK-NEXT:    mvni v1.8h, #1
-; CHECK-NEXT:    uhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    mvni v0.8h, #1
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
 }
 
-define <8 x i16> @haddu_i_undef(<8 x i16> %src1) {
+define <8 x i16> @haddu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -312,8 +307,7 @@ define <8 x i16> @hadds_i_const_lhs(<8 x i16> %src1) {
 define <8 x i16> @hadds_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_const_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    sshr v0.8h, v0.8h, #1
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> , <8 x i16> %src1)
   ret <8 x i16> %result
@@ -322,9 +316,7 @@ define <8 x i16> @hadds_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @hadds_i_const_both() {
 ; CHECK-LABEL: hadds_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    shadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
@@ -334,18 +326,16 @@ define <8 x i16> @hadds_i_const_bothhigh() {
 ; CHECK-LABEL: hadds_i_const_bothhigh:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #32766
-; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    shadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
 }
 
-define <8 x i16> @hadds_i_undef(<8 x i16> %src1) {
+define <8 x i16> @hadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    shadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -508,9 +498,7 @@ define <8 x i16> @rhaddu_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @rhaddu_i_const_both() {
 ; CHECK-LABEL: rhaddu_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    urhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
@@ -520,17 +508,15 @@ define <8 x i16> @rhaddu_i_const_bothhigh() {
 ; CHECK-LABEL: rhaddu_i_const_bothhigh:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
-; CHECK-NEXT:    mvni v1.8h, #1
-; CHECK-NEXT:    urhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
 }
 
-define <8 x i16> @rhaddu_i_undef(<8 x i16> %src1) {
+define <8 x i16> @rhaddu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: rhaddu_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    urhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -693,9 +679,7 @@ define <8 x i16> @rhadds_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @rhadds_i_const_both() {
 ; CHECK-LABEL: rhadds_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    srhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
@@ -704,19 +688,16 @@ define <8 x i16> @rhadds_i_const_both() {
 define <8 x i16> @rhadds_i_const_bothhigh() {
 ; CHECK-LABEL: rhadds_i_const_bothhigh:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32766
 ; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    srhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
 }
 
-define <8 x i16> @rhadds_i_undef(<8 x i16> %src1) {
+define <8 x i16> @rhadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: rhadds_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    srhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
diff --git a/llvm/test/CodeGen/X86/pic-load-remat.ll b/llvm/test/CodeGen/X86/pic-load-remat.ll
index 8596e5699b28..0b3dc41e083b 100644
--- a/llvm/test/CodeGen/X86/pic-load-remat.ll
+++ b/llvm/test/CodeGen/X86/pic-load-remat.ll
@@ -7,10 +7,9 @@ define void @f() nounwind {
 ; CHECK-NEXT:    calll L0$pb
 ; CHECK-NEXT:  L0$pb:
 ; CHECK-NEXT:    popl %eax
-; CHECK-NEXT:    pxor %xmm0, %xmm0
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
 ; CHECK-NEXT:    psllw {{\.?LCPI[0-9]+_[0-9]+}}-L0$pb(%eax), %xmm1
-; CHECK-NEXT:    pavgw {{\.?LCPI[0-9]+_[0-9]+}}-L0$pb(%eax), %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [21183,21183,21183,21183,21183,21183,21183,21183]
 ; CHECK-NEXT:    paddsw %xmm0, %xmm0
 ; CHECK-NEXT:    paddw %xmm1, %xmm0
 ; CHECK-NEXT:    .p2align 4, 0x90
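The folded constants in the hadd-combine.ll CHECK lines above can be
cross-checked against the widened add-then-shift in FoldValue. The following
static_asserts (editorial, not part of the patch) restate that arithmetic using
the operand values visible in the removed CHECK lines:

// Cross-check of the folded AArch64 test constants (editorial, not in the patch).
static_assert((1 + 3) >> 1 == 2, "haddu/hadds_i_const_both -> movi v0.8h, #2");
static_assert((1 + 3 + 1) >> 1 == 2, "rhaddu/rhadds_i_const_both -> movi v0.8h, #2");
static_assert((0xfffe + 0xffff) >> 1 == 0xfffe, "haddu_i_const_bothhigh -> mvni v0.8h, #1");
static_assert((32766 + 32767) >> 1 == 32766, "hadds_i_const_bothhigh -> dup of #32766");
static_assert((0xfffe + 0xffff + 1) >> 1 == 0xffff, "rhaddu_i_const_bothhigh -> all-ones");
static_assert((32766 + 32767 + 1) >> 1 == 32767, "rhadds_i_const_bothhigh -> #0x7fff");

The pic-load-remat.ll change follows the same rule: pavgw is a rounding
(ceiling) unsigned average, so averaging a constant-pool vector with the zero
previously materialized into %xmm0 now constant-folds, leaving a plain movdqa
of the folded splat.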