forked from OSchip/llvm-project
[AArch64] Use neon instructions for i64/i128 ISD::PARITY calculation
As noticed on D129765 and reported in issue #56531, AArch64 targets can use the NEON ctpop + add-reduce instructions to speed up scalar ctpop operations, but we fail to do this for parity calculations. I'm not sure where the cutoff should be for specific CPUs, but i64 (plus the i128 special case) shows a definite reduction in instruction count. i32 is about the same (though scalar <-> NEON transfers are probably more costly?), and sub-i32 promotion looks to be a definite regression compared to the parity expansion optimized for those widths. Differential Revision: https://reviews.llvm.org/D130246
This commit is contained in:
parent
8f0ba6c405
commit
939cf9b1be
|
@ -521,6 +521,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
|
|||
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
|
||||
setOperationAction(ISD::CTPOP, MVT::i128, Custom);
|
||||
|
||||
setOperationAction(ISD::PARITY, MVT::i64, Custom);
|
||||
setOperationAction(ISD::PARITY, MVT::i128, Custom);
|
||||
|
||||
setOperationAction(ISD::ABS, MVT::i32, Custom);
|
||||
setOperationAction(ISD::ABS, MVT::i64, Custom);
|
||||
|
||||
|
@ -5463,7 +5466,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
|
|||
case ISD::SRA_PARTS:
|
||||
return LowerShiftParts(Op, DAG);
|
||||
case ISD::CTPOP:
|
||||
return LowerCTPOP(Op, DAG);
|
||||
case ISD::PARITY:
|
||||
return LowerCTPOP_PARITY(Op, DAG);
|
||||
case ISD::FCOPYSIGN:
|
||||
return LowerFCOPYSIGN(Op, DAG);
|
||||
case ISD::OR:
|
||||
|
@ -7783,7 +7787,8 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
|
|||
return BitCast(VT, BSP, DAG);
|
||||
}
|
||||
|
||||
SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
|
||||
SelectionDAG &DAG) const {
|
||||
if (DAG.getMachineFunction().getFunction().hasFnAttribute(
|
||||
Attribute::NoImplicitFloat))
|
||||
return SDValue();
|
||||
|
@ -7791,6 +7796,8 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
|
|||
if (!Subtarget->hasNEON())
|
||||
return SDValue();
|
||||
|
||||
bool IsParity = Op.getOpcode() == ISD::PARITY;
|
||||
|
||||
// While there is no integer popcount instruction, it can
|
||||
// be more efficiently lowered to the following sequence that uses
|
||||
// AdvSIMD registers/instructions as long as the copies to/from
|
||||
|
@ -7813,6 +7820,10 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
|
|||
ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
|
||||
DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
|
||||
|
||||
if (IsParity)
|
||||
UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
|
||||
DAG.getConstant(1, DL, MVT::i32));
|
||||
|
||||
if (VT == MVT::i64)
|
||||
UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
|
||||
return UaddLV;
|
||||
|
@ -7824,9 +7835,15 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
|
|||
ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
|
||||
DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
|
||||
|
||||
if (IsParity)
|
||||
UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
|
||||
DAG.getConstant(1, DL, MVT::i32));
|
||||
|
||||
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
|
||||
}
|
||||
|
||||
assert(!IsParity && "ISD::PARITY of vector types not supported");
|
||||
|
||||
if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
|
||||
return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
|
||||
|
||||
|
@ -20058,7 +20075,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
|
|||
return;
|
||||
|
||||
case ISD::CTPOP:
|
||||
if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG))
|
||||
case ISD::PARITY:
|
||||
if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
|
||||
Results.push_back(Result);
|
||||
return;
|
||||
case AArch64ISD::SADDV:
|
||||
|
|
|
@ -1002,7 +1002,7 @@ private:
|
|||
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerCTPOP_PARITY(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerBitreverse(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
|
|
@ -77,13 +77,11 @@ define i32 @parity_32(i32 %x) {
|
|||
define i64 @parity_64(i64 %x) {
|
||||
; CHECK-LABEL: parity_64:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: eor x8, x0, x0, lsr #32
|
||||
; CHECK-NEXT: eor x8, x8, x8, lsr #16
|
||||
; CHECK-NEXT: eor x8, x8, x8, lsr #8
|
||||
; CHECK-NEXT: eor x8, x8, x8, lsr #4
|
||||
; CHECK-NEXT: eor x8, x8, x8, lsr #2
|
||||
; CHECK-NEXT: eor w8, w8, w8, lsr #1
|
||||
; CHECK-NEXT: and x0, x8, #0x1
|
||||
; CHECK-NEXT: fmov d0, x0
|
||||
; CHECK-NEXT: cnt v0.8b, v0.8b
|
||||
; CHECK-NEXT: uaddlv h0, v0.8b
|
||||
; CHECK-NEXT: fmov w8, s0
|
||||
; CHECK-NEXT: and w0, w8, #0x1
|
||||
; CHECK-NEXT: ret
|
||||
%1 = tail call i64 @llvm.ctpop.i64(i64 %x)
|
||||
%2 = and i64 %1, 1
|
||||
|
@ -93,15 +91,13 @@ define i64 @parity_64(i64 %x) {
|
|||
define i128 @parity_128(i128 %x) {
|
||||
; CHECK-LABEL: parity_128:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: eor x8, x0, x1
|
||||
; CHECK-NEXT: fmov d0, x0
|
||||
; CHECK-NEXT: mov v0.d[1], x1
|
||||
; CHECK-NEXT: mov x1, xzr
|
||||
; CHECK-NEXT: eor x8, x8, x8, lsr #32
|
||||
; CHECK-NEXT: eor x8, x8, x8, lsr #16
|
||||
; CHECK-NEXT: eor x8, x8, x8, lsr #8
|
||||
; CHECK-NEXT: eor x8, x8, x8, lsr #4
|
||||
; CHECK-NEXT: eor x8, x8, x8, lsr #2
|
||||
; CHECK-NEXT: eor w8, w8, w8, lsr #1
|
||||
; CHECK-NEXT: and x0, x8, #0x1
|
||||
; CHECK-NEXT: cnt v0.16b, v0.16b
|
||||
; CHECK-NEXT: uaddlv h0, v0.16b
|
||||
; CHECK-NEXT: fmov w8, s0
|
||||
; CHECK-NEXT: and w0, w8, #0x1
|
||||
; CHECK-NEXT: ret
|
||||
%1 = tail call i128 @llvm.ctpop.i128(i128 %x)
|
||||
%2 = and i128 %1, 1
|
||||
|
@ -111,12 +107,10 @@ define i128 @parity_128(i128 %x) {
|
|||
define i32 @parity_64_trunc(i64 %x) {
|
||||
; CHECK-LABEL: parity_64_trunc:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: eor x8, x0, x0, lsr #32
|
||||
; CHECK-NEXT: eor x8, x8, x8, lsr #16
|
||||
; CHECK-NEXT: eor x8, x8, x8, lsr #8
|
||||
; CHECK-NEXT: eor x8, x8, x8, lsr #4
|
||||
; CHECK-NEXT: eor x8, x8, x8, lsr #2
|
||||
; CHECK-NEXT: eor w8, w8, w8, lsr #1
|
||||
; CHECK-NEXT: fmov d0, x0
|
||||
; CHECK-NEXT: cnt v0.8b, v0.8b
|
||||
; CHECK-NEXT: uaddlv h0, v0.8b
|
||||
; CHECK-NEXT: fmov w8, s0
|
||||
; CHECK-NEXT: and w0, w8, #0x1
|
||||
; CHECK-NEXT: ret
|
||||
%1 = tail call i64 @llvm.ctpop.i64(i64 %x)
|
||||
|
|
Loading…
Reference in New Issue