From f8dfbea3249e8304811b1d991e4d6bb362a9b6d6 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 23 Aug 2022 17:45:59 -0400 Subject: [PATCH] [SDAG] expand more is-power-of-2 patterns that use popcount (ctpop x) == 1 --> (x != 0) && ((x & x-1) == 0) Adjust the legality check to avoid the poor codegen on AArch64. We probably only want to use popcount on this pattern when it is a single instruction. fixes #57225 Differential Revision: https://reviews.llvm.org/D132237 --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 22 +++-- llvm/test/CodeGen/AArch64/arm64-popcnt.ll | 85 ++++++------------- 2 files changed, 34 insertions(+), 73 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 77a3578b2319..2fdd79b3eda3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4003,14 +4003,14 @@ static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT, EVT CTVT = CTPOP.getValueType(); SDValue CTOp = CTPOP.getOperand(0); - // If this is a vector CTPOP, keep the CTPOP if it is legal. - // TODO: Should we check if CTPOP is legal(or custom) for scalars? - if (VT.isVector() && TLI.isOperationLegal(ISD::CTPOP, CTVT)) - return SDValue(); - + // Expand a power-of-2-or-zero comparison based on ctpop: // (ctpop x) u< 2 -> (x & x-1) == 0 // (ctpop x) u> 1 -> (x & x-1) != 0 if (Cond == ISD::SETULT || Cond == ISD::SETUGT) { + // Keep the CTPOP if it is a legal vector op. + if (CTVT.isVector() && TLI.isOperationLegal(ISD::CTPOP, CTVT)) + return SDValue(); + unsigned CostLimit = TLI.getCustomCtpopCost(CTVT, Cond); if (C1.ugt(CostLimit + (Cond == ISD::SETULT))) return SDValue(); @@ -4029,16 +4029,14 @@ static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT, return DAG.getSetCC(dl, VT, Result, DAG.getConstant(0, dl, CTVT), CC); } - // If ctpop is not supported, expand a power-of-2 comparison based on it. + // Expand a power-of-2 comparison based on ctpop: + // (ctpop x) == 1 --> (x != 0) && ((x & x-1) == 0) + // (ctpop x) != 1 --> (x == 0) || ((x & x-1) != 0) if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && C1 == 1) { - // For scalars, keep CTPOP if it is legal or custom. - if (!VT.isVector() && TLI.isOperationLegalOrCustom(ISD::CTPOP, CTVT)) + // Keep the CTPOP if it is legal. + if (TLI.isOperationLegal(ISD::CTPOP, CTVT)) return SDValue(); - // This is based on X86's custom lowering for CTPOP which produces more - // instructions than the expansion here. - // (ctpop x) == 1 --> (x != 0) && ((x & x-1) == 0) - // (ctpop x) != 1 --> (x == 0) || ((x & x-1) != 0) SDValue Zero = DAG.getConstant(0, dl, CTVT); SDValue NegOne = DAG.getAllOnesConstant(dl, CTVT); assert(CTVT.isInteger()); diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll index 7e8e23243972..872829e950dc 100644 --- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll @@ -168,30 +168,18 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat { define i32 @ctpop_eq_one(i64 %x) nounwind readnone { ; CHECK-LABEL: ctpop_eq_one: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, x0 -; CHECK-NEXT: cnt.8b v0, v0 -; CHECK-NEXT: uaddlv.8b h0, v0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: sub x8, x0, #1 +; CHECK-NEXT: tst x0, x8 +; CHECK-NEXT: ccmp x0, #0, #4, eq +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret ; ; CHECK-NONEON-LABEL: ctpop_eq_one: ; CHECK-NONEON: // %bb.0: -; CHECK-NONEON-NEXT: lsr x9, x0, #1 -; CHECK-NONEON-NEXT: mov x8, #72340172838076673 -; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 -; CHECK-NONEON-NEXT: sub x9, x0, x9 -; CHECK-NONEON-NEXT: lsr x10, x9, #2 -; CHECK-NONEON-NEXT: and x9, x9, #0x3333333333333333 -; CHECK-NONEON-NEXT: and x10, x10, #0x3333333333333333 -; CHECK-NONEON-NEXT: add x9, x9, x10 -; CHECK-NONEON-NEXT: add x9, x9, x9, lsr #4 -; CHECK-NONEON-NEXT: and x9, x9, #0xf0f0f0f0f0f0f0f -; CHECK-NONEON-NEXT: mul x8, x9, x8 -; CHECK-NONEON-NEXT: lsr x8, x8, #56 -; CHECK-NONEON-NEXT: cmp x8, #1 -; CHECK-NONEON-NEXT: cset w0, eq +; CHECK-NONEON-NEXT: sub x8, x0, #1 +; CHECK-NONEON-NEXT: tst x0, x8 +; CHECK-NONEON-NEXT: ccmp x0, #0, #4, eq +; CHECK-NONEON-NEXT: cset w0, ne ; CHECK-NONEON-NEXT: ret %count = tail call i64 @llvm.ctpop.i64(i64 %x) %cmp = icmp eq i64 %count, 1 @@ -202,30 +190,18 @@ define i32 @ctpop_eq_one(i64 %x) nounwind readnone { define i32 @ctpop_ne_one(i64 %x) nounwind readnone { ; CHECK-LABEL: ctpop_ne_one: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, x0 -; CHECK-NEXT: cnt.8b v0, v0 -; CHECK-NEXT: uaddlv.8b h0, v0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: sub x8, x0, #1 +; CHECK-NEXT: tst x0, x8 +; CHECK-NEXT: ccmp x0, #0, #4, eq +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret ; ; CHECK-NONEON-LABEL: ctpop_ne_one: ; CHECK-NONEON: // %bb.0: -; CHECK-NONEON-NEXT: lsr x9, x0, #1 -; CHECK-NONEON-NEXT: mov x8, #72340172838076673 -; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 -; CHECK-NONEON-NEXT: sub x9, x0, x9 -; CHECK-NONEON-NEXT: lsr x10, x9, #2 -; CHECK-NONEON-NEXT: and x9, x9, #0x3333333333333333 -; CHECK-NONEON-NEXT: and x10, x10, #0x3333333333333333 -; CHECK-NONEON-NEXT: add x9, x9, x10 -; CHECK-NONEON-NEXT: add x9, x9, x9, lsr #4 -; CHECK-NONEON-NEXT: and x9, x9, #0xf0f0f0f0f0f0f0f -; CHECK-NONEON-NEXT: mul x8, x9, x8 -; CHECK-NONEON-NEXT: lsr x8, x8, #56 -; CHECK-NONEON-NEXT: cmp x8, #1 -; CHECK-NONEON-NEXT: cset w0, ne +; CHECK-NONEON-NEXT: sub x8, x0, #1 +; CHECK-NONEON-NEXT: tst x0, x8 +; CHECK-NONEON-NEXT: ccmp x0, #0, #4, eq +; CHECK-NONEON-NEXT: cset w0, eq ; CHECK-NONEON-NEXT: ret %count = tail call i64 @llvm.ctpop.i64(i64 %x) %cmp = icmp ne i64 %count, 1 @@ -236,31 +212,18 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone { define i1 @ctpop32_ne_one(i32 %x) nounwind readnone { ; CHECK-LABEL: ctpop32_ne_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: cnt.8b v0, v0 -; CHECK-NEXT: uaddlv.8b h0, v0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: sub w8, w0, #1 +; CHECK-NEXT: tst w0, w8 +; CHECK-NEXT: ccmp w0, #0, #4, eq +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret ; ; CHECK-NONEON-LABEL: ctpop32_ne_one: ; CHECK-NONEON: // %bb.0: -; CHECK-NONEON-NEXT: lsr w9, w0, #1 -; CHECK-NONEON-NEXT: mov w8, #16843009 -; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 -; CHECK-NONEON-NEXT: sub w9, w0, w9 -; CHECK-NONEON-NEXT: lsr w10, w9, #2 -; CHECK-NONEON-NEXT: and w9, w9, #0x33333333 -; CHECK-NONEON-NEXT: and w10, w10, #0x33333333 -; CHECK-NONEON-NEXT: add w9, w9, w10 -; CHECK-NONEON-NEXT: add w9, w9, w9, lsr #4 -; CHECK-NONEON-NEXT: and w9, w9, #0xf0f0f0f -; CHECK-NONEON-NEXT: mul w8, w9, w8 -; CHECK-NONEON-NEXT: lsr w8, w8, #24 -; CHECK-NONEON-NEXT: cmp w8, #1 -; CHECK-NONEON-NEXT: cset w0, ne +; CHECK-NONEON-NEXT: sub w8, w0, #1 +; CHECK-NONEON-NEXT: tst w0, w8 +; CHECK-NONEON-NEXT: ccmp w0, #0, #4, eq +; CHECK-NONEON-NEXT: cset w0, eq ; CHECK-NONEON-NEXT: ret %count = tail call i32 @llvm.ctpop.i32(i32 %x) %cmp = icmp ne i32 %count, 1