[RISCV][WebAssembly][TargetLowering] Allow expandCTLZ/expandCTTZ to rely on CTPOP expansion for vectors.

Our fallback expansion for CTLZ/CTTZ relies on CTPOP. If CTPOP isn't legal or custom for a vector type we would scalarize the CTLZ/CTTZ. This is different than CTPOP itself which would use a vector expansion. This patch teaches expandCTLZ/CTTZ to rely on the vector CTPOP expansion instead of scalarizing. To do this I had to add additional checks to make sure the operations used by CTPOP expansions are all supported. Some of the operations were already needed for the CTLZ/CTTZ expansion. This is a huge improvement to the RISCV which doesn't have a scalar ctlz or cttz in the base ISA. For WebAssembly, I've added Custom lowering to keep the scalarizing behavior. I've also extended the scalarizing to CTPOP. Differential Revision: https://reviews.llvm.org/D111919
2021-10-19 16:10:02 -07:00 · 2021-10-19 16:10:02 -07:00 · fe1f0de003
parent 60e19f6752
commit fe1f0de003
5 changed files with 2259 additions and 19686 deletions
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@ -6980,6 +6980,17 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
  return SDValue();
 }

+// Only expand vector types if we have the appropriate vector bit operations.
+static bool canExpandVectorCTPOP(const TargetLowering &TLI, EVT VT) {
+  assert(VT.isVector() && "Expected vector type");
+  unsigned Len = VT.getScalarSizeInBits();
+  return TLI.isOperationLegalOrCustom(ISD::ADD, VT) &&
+         TLI.isOperationLegalOrCustom(ISD::SUB, VT) &&
+         TLI.isOperationLegalOrCustom(ISD::SRL, VT) &&
+         (Len == 8 || TLI.isOperationLegalOrCustom(ISD::MUL, VT)) &&
+         TLI.isOperationLegalOrCustomOrPromote(ISD::AND, VT);
+}
+
 bool TargetLowering::expandCTPOP(SDNode *Node, SDValue &Result,
                                 SelectionDAG &DAG) const {
  SDLoc dl(Node);
@ -6994,11 +7005,7 @@ bool TargetLowering::expandCTPOP(SDNode *Node, SDValue &Result,
    return false;

  // Only expand vector types if we have the appropriate vector bit operations.
-  if (VT.isVector() && (!isOperationLegalOrCustom(ISD::ADD, VT) ||
-                        !isOperationLegalOrCustom(ISD::SUB, VT) ||
-                        !isOperationLegalOrCustom(ISD::SRL, VT) ||
-                        (Len != 8 && !isOperationLegalOrCustom(ISD::MUL, VT)) ||
-                        !isOperationLegalOrCustomOrPromote(ISD::AND, VT)))
+  if (VT.isVector() && !canExpandVectorCTPOP(*this, VT))
    return false;

  // This is the "best" algorithm from
@ -7068,8 +7075,10 @@ bool TargetLowering::expandCTLZ(SDNode *Node, SDValue &Result,
  }

  // Only expand vector types if we have the appropriate vector bit operations.
+  // This includes the operations needed to expand CTPOP if it isn't supported.
  if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
-                        !isOperationLegalOrCustom(ISD::CTPOP, VT) ||
+                        (!isOperationLegalOrCustom(ISD::CTPOP, VT) &&
+                         !canExpandVectorCTPOP(*this, VT)) ||
                        !isOperationLegalOrCustom(ISD::SRL, VT) ||
                        !isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
    return false;
@ -7120,9 +7129,11 @@ bool TargetLowering::expandCTTZ(SDNode *Node, SDValue &Result,
  }

  // Only expand vector types if we have the appropriate vector bit operations.
+  // This includes the operations needed to expand CTPOP if it isn't supported.
  if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
                        (!isOperationLegalOrCustom(ISD::CTPOP, VT) &&
-                         !isOperationLegalOrCustom(ISD::CTLZ, VT)) ||
+                         !isOperationLegalOrCustom(ISD::CTLZ, VT) &&
+                         !canExpandVectorCTPOP(*this, VT)) ||
                        !isOperationLegalOrCustom(ISD::SUB, VT) ||
                        !isOperationLegalOrCustomOrPromote(ISD::AND, VT) ||
                        !isOperationLegalOrCustomOrPromote(ISD::XOR, VT)))
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@ -215,8 +215,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
      setOperationAction(ISD::SELECT_CC, T, Expand);

    // Expand integer operations supported for scalars but not SIMD
-    for (auto Op : {ISD::CTLZ, ISD::CTTZ, ISD::CTPOP, ISD::SDIV, ISD::UDIV,
-                    ISD::SREM, ISD::UREM, ISD::ROTL, ISD::ROTR})
+    for (auto Op :
+         {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::ROTL, ISD::ROTR})
      for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64})
        setOperationAction(Op, T, Expand);

@ -225,8 +225,15 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
      for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
        setOperationAction(Op, T, Legal);

-    // And we have popcnt for i8x16
+    // And we have popcnt for i8x16. It can be used to expand ctlz/cttz.
    setOperationAction(ISD::CTPOP, MVT::v16i8, Legal);
+    setOperationAction(ISD::CTLZ, MVT::v16i8, Expand);
+    setOperationAction(ISD::CTTZ, MVT::v16i8, Expand);
+
+    // Custom lower bit counting operations for other types to scalarize them.
+    for (auto Op : {ISD::CTLZ, ISD::CTTZ, ISD::CTPOP})
+      for (auto T : {MVT::v8i16, MVT::v4i32, MVT::v2i64})
+        setOperationAction(Op, T, Custom);

    // Expand float operations supported for scalars but not SIMD
    for (auto Op : {ISD::FCOPYSIGN, ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
@ -1405,6 +1412,10 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
    return LowerLoad(Op, DAG);
  case ISD::STORE:
    return LowerStore(Op, DAG);
+  case ISD::CTPOP:
+  case ISD::CTLZ:
+  case ISD::CTTZ:
+    return DAG.UnrollVectorOp(Op.getNode());
  }
 }

--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
--- a/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll
@ -120,8 +120,7 @@ define <8 x i16> @cttz_v8i16_undef(<8 x i16> %x) {
 }

 ; CHECK-LABEL: ctpop_v8i16:
-; Note: expansion does not use i32.popcnt
-; CHECK: v128.and
+; CHECK: i32.popcnt
 declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
 define <8 x i16> @ctpop_v8i16(<8 x i16> %x) {
  %v = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %x)
@ -209,8 +208,7 @@ define <4 x i32> @cttz_v4i32_undef(<4 x i32> %x) {
 }

 ; CHECK-LABEL: ctpop_v4i32:
-; Note: expansion does not use i32.popcnt
-; CHECK: v128.and
+; CHECK: i32.popcnt
 declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
 define <4 x i32> @ctpop_v4i32(<4 x i32> %x) {
  %v = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x)
@ -298,8 +296,7 @@ define <2 x i64> @cttz_v2i64_undef(<2 x i64> %x) {
 }

 ; CHECK-LABEL: ctpop_v2i64:
-; Note: expansion does not use i64.popcnt
-; CHECK: v128.and
+; CHECK: i64.popcnt
 declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
 define <2 x i64> @ctpop_v2i64(<2 x i64> %x) {
  %v = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x)