From f8dfbea3249e8304811b1d991e4d6bb362a9b6d6 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 23 Aug 2022 17:45:59 -0400
Subject: [PATCH] [SDAG] expand more is-power-of-2 patterns that use popcount

(ctpop x) == 1 --> (x != 0) && ((x & x-1) == 0)

Adjust the legality check to avoid poor codegen on AArch64: for
this pattern, keep the CTPOP only when it is legal, not when it is
merely custom-lowered. We probably only want to use popcount on
this pattern when it is a single instruction.

Fixes #57225

Differential Revision: https://reviews.llvm.org/D132237
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 22 +++--
 llvm/test/CodeGen/AArch64/arm64-popcnt.ll     | 85 ++++++-------------
 2 files changed, 34 insertions(+), 73 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 77a3578b2319..2fdd79b3eda3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4003,14 +4003,14 @@ static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT,
   EVT CTVT = CTPOP.getValueType();
   SDValue CTOp = CTPOP.getOperand(0);
 
-  // If this is a vector CTPOP, keep the CTPOP if it is legal.
-  // TODO: Should we check if CTPOP is legal(or custom) for scalars?
-  if (VT.isVector() && TLI.isOperationLegal(ISD::CTPOP, CTVT))
-    return SDValue();
-
+  // Expand a power-of-2-or-zero comparison based on ctpop:
   // (ctpop x) u< 2 -> (x & x-1) == 0
   // (ctpop x) u> 1 -> (x & x-1) != 0
   if (Cond == ISD::SETULT || Cond == ISD::SETUGT) {
+    // Keep the CTPOP if it is a legal vector op.
+    if (CTVT.isVector() && TLI.isOperationLegal(ISD::CTPOP, CTVT))
+      return SDValue();
+
     unsigned CostLimit = TLI.getCustomCtpopCost(CTVT, Cond);
     if (C1.ugt(CostLimit + (Cond == ISD::SETULT)))
       return SDValue();
@@ -4029,16 +4029,14 @@ static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT,
     return DAG.getSetCC(dl, VT, Result, DAG.getConstant(0, dl, CTVT), CC);
   }
 
-  // If ctpop is not supported, expand a power-of-2 comparison based on it.
+  // Expand a power-of-2 comparison based on ctpop:
+  // (ctpop x) == 1 --> (x != 0) && ((x & x-1) == 0)
+  // (ctpop x) != 1 --> (x == 0) || ((x & x-1) != 0)
   if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && C1 == 1) {
-    // For scalars, keep CTPOP if it is legal or custom.
-    if (!VT.isVector() && TLI.isOperationLegalOrCustom(ISD::CTPOP, CTVT))
+    // Keep the CTPOP if it is legal.
+    if (TLI.isOperationLegal(ISD::CTPOP, CTVT))
       return SDValue();
-    // This is based on X86's custom lowering for CTPOP which produces more
-    // instructions than the expansion here.
 
-    // (ctpop x) == 1 --> (x != 0) && ((x & x-1) == 0)
-    // (ctpop x) != 1 --> (x == 0) || ((x & x-1) != 0)
     SDValue Zero = DAG.getConstant(0, dl, CTVT);
     SDValue NegOne = DAG.getAllOnesConstant(dl, CTVT);
     assert(CTVT.isInteger());
diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
index 7e8e23243972..872829e950dc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -168,30 +168,18 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat {
 define i32 @ctpop_eq_one(i64 %x) nounwind readnone {
 ; CHECK-LABEL: ctpop_eq_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d0, x0
-; CHECK-NEXT:    cnt.8b v0, v0
-; CHECK-NEXT:    uaddlv.8b h0, v0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    sub x8, x0, #1
+; CHECK-NEXT:    tst x0, x8
+; CHECK-NEXT:    ccmp x0, #0, #4, eq
+; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NONEON-LABEL: ctpop_eq_one:
 ; CHECK-NONEON:       // %bb.0:
-; CHECK-NONEON-NEXT:    lsr x9, x0, #1
-; CHECK-NONEON-NEXT:    mov x8, #72340172838076673
-; CHECK-NONEON-NEXT:    and x9, x9, #0x5555555555555555
-; CHECK-NONEON-NEXT:    sub x9, x0, x9
-; CHECK-NONEON-NEXT:    lsr x10, x9, #2
-; CHECK-NONEON-NEXT:    and x9, x9, #0x3333333333333333
-; CHECK-NONEON-NEXT:    and x10, x10, #0x3333333333333333
-; CHECK-NONEON-NEXT:    add x9, x9, x10
-; CHECK-NONEON-NEXT:    add x9, x9, x9, lsr #4
-; CHECK-NONEON-NEXT:    and x9, x9, #0xf0f0f0f0f0f0f0f
-; CHECK-NONEON-NEXT:    mul x8, x9, x8
-; CHECK-NONEON-NEXT:    lsr x8, x8, #56
-; CHECK-NONEON-NEXT:    cmp x8, #1
-; CHECK-NONEON-NEXT:    cset w0, eq
+; CHECK-NONEON-NEXT:    sub x8, x0, #1
+; CHECK-NONEON-NEXT:    tst x0, x8
+; CHECK-NONEON-NEXT:    ccmp x0, #0, #4, eq
+; CHECK-NONEON-NEXT:    cset w0, ne
 ; CHECK-NONEON-NEXT:    ret
   %count = tail call i64 @llvm.ctpop.i64(i64 %x)
   %cmp = icmp eq i64 %count, 1
@@ -202,30 +190,18 @@ define i32 @ctpop_eq_one(i64 %x) nounwind readnone {
 define i32 @ctpop_ne_one(i64 %x) nounwind readnone {
 ; CHECK-LABEL: ctpop_ne_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d0, x0
-; CHECK-NEXT:    cnt.8b v0, v0
-; CHECK-NEXT:    uaddlv.8b h0, v0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    sub x8, x0, #1
+; CHECK-NEXT:    tst x0, x8
+; CHECK-NEXT:    ccmp x0, #0, #4, eq
+; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NONEON-LABEL: ctpop_ne_one:
 ; CHECK-NONEON:       // %bb.0:
-; CHECK-NONEON-NEXT:    lsr x9, x0, #1
-; CHECK-NONEON-NEXT:    mov x8, #72340172838076673
-; CHECK-NONEON-NEXT:    and x9, x9, #0x5555555555555555
-; CHECK-NONEON-NEXT:    sub x9, x0, x9
-; CHECK-NONEON-NEXT:    lsr x10, x9, #2
-; CHECK-NONEON-NEXT:    and x9, x9, #0x3333333333333333
-; CHECK-NONEON-NEXT:    and x10, x10, #0x3333333333333333
-; CHECK-NONEON-NEXT:    add x9, x9, x10
-; CHECK-NONEON-NEXT:    add x9, x9, x9, lsr #4
-; CHECK-NONEON-NEXT:    and x9, x9, #0xf0f0f0f0f0f0f0f
-; CHECK-NONEON-NEXT:    mul x8, x9, x8
-; CHECK-NONEON-NEXT:    lsr x8, x8, #56
-; CHECK-NONEON-NEXT:    cmp x8, #1
-; CHECK-NONEON-NEXT:    cset w0, ne
+; CHECK-NONEON-NEXT:    sub x8, x0, #1
+; CHECK-NONEON-NEXT:    tst x0, x8
+; CHECK-NONEON-NEXT:    ccmp x0, #0, #4, eq
+; CHECK-NONEON-NEXT:    cset w0, eq
 ; CHECK-NONEON-NEXT:    ret
   %count = tail call i64 @llvm.ctpop.i64(i64 %x)
   %cmp = icmp ne i64 %count, 1
@@ -236,31 +212,18 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone {
 define i1 @ctpop32_ne_one(i32 %x) nounwind readnone {
 ; CHECK-LABEL: ctpop32_ne_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    cnt.8b v0, v0
-; CHECK-NEXT:    uaddlv.8b h0, v0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    cmp w8, #1
-; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    sub w8, w0, #1
+; CHECK-NEXT:    tst w0, w8
+; CHECK-NEXT:    ccmp w0, #0, #4, eq
+; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NONEON-LABEL: ctpop32_ne_one:
 ; CHECK-NONEON:       // %bb.0:
-; CHECK-NONEON-NEXT:    lsr w9, w0, #1
-; CHECK-NONEON-NEXT:    mov w8, #16843009
-; CHECK-NONEON-NEXT:    and w9, w9, #0x55555555
-; CHECK-NONEON-NEXT:    sub w9, w0, w9
-; CHECK-NONEON-NEXT:    lsr w10, w9, #2
-; CHECK-NONEON-NEXT:    and w9, w9, #0x33333333
-; CHECK-NONEON-NEXT:    and w10, w10, #0x33333333
-; CHECK-NONEON-NEXT:    add w9, w9, w10
-; CHECK-NONEON-NEXT:    add w9, w9, w9, lsr #4
-; CHECK-NONEON-NEXT:    and w9, w9, #0xf0f0f0f
-; CHECK-NONEON-NEXT:    mul w8, w9, w8
-; CHECK-NONEON-NEXT:    lsr w8, w8, #24
-; CHECK-NONEON-NEXT:    cmp w8, #1
-; CHECK-NONEON-NEXT:    cset w0, ne
+; CHECK-NONEON-NEXT:    sub w8, w0, #1
+; CHECK-NONEON-NEXT:    tst w0, w8
+; CHECK-NONEON-NEXT:    ccmp w0, #0, #4, eq
+; CHECK-NONEON-NEXT:    cset w0, eq
 ; CHECK-NONEON-NEXT:    ret
   %count = tail call i32 @llvm.ctpop.i32(i32 %x)
   %cmp = icmp ne i32 %count, 1