From acfc025a7232052f594719e8e9447368a3b21018 Mon Sep 17 00:00:00 2001 From: "chenglin.bi" Date: Mon, 18 Apr 2022 10:35:09 +0800 Subject: [PATCH] Revert "[Arch64][SelectionDAG] Add target-specific implementation of srem" This reverts commit 9d9eddd3dde46751a5c415b7e5e475b4feb76600. --- llvm/include/llvm/CodeGen/TargetLowering.h | 8 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 57 ++------------- .../CodeGen/SelectionDAG/TargetLowering.cpp | 11 --- .../Target/AArch64/AArch64ISelLowering.cpp | 54 --------------- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 - llvm/test/CodeGen/AArch64/srem-pow2.ll | 48 +++++++------ llvm/test/CodeGen/AArch64/srem-seteq.ll | 21 +++--- llvm/test/CodeGen/AArch64/srem-vector-lkk.ll | 69 ++++++++++--------- 8 files changed, 83 insertions(+), 187 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 4f09b9a6da57..dfc0af04b759 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4479,14 +4479,6 @@ public: SelectionDAG &DAG, SmallVectorImpl &Created) const; - /// Targets may override this function to provide custom SREM lowering for - /// power-of-2 denominators. If the target returns an empty SDValue, LLVM - /// assumes SREM is expensive and replaces it with a series of other integer - /// operations. - virtual SDValue BuildSREMPow2(SDNode *N, const APInt &Divisor, - SelectionDAG &DAG, - SmallVectorImpl &Created) const; - /// Indicate whether this target prefers to combine FDIVs with the same /// divisor. If the transform should never be done, return zero. If the /// transform should be done, return the minimum number of divisor uses diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ee0c34235203..c3d30ad435ef 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -569,8 +569,6 @@ namespace { SDValue BuildSDIV(SDNode *N); SDValue BuildSDIVPow2(SDNode *N); SDValue BuildUDIV(SDNode *N); - SDValue BuildSREMPow2(SDNode *N); - SDValue buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N); SDValue BuildLogBase2(SDValue V, const SDLoc &DL); SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags); SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags); @@ -4322,7 +4320,12 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { return SDValue(); } -static bool isDivisorPowerOfTwo(SDValue Divisor) { +SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + EVT CCVT = getSetCCResultType(VT); + unsigned BitWidth = VT.getScalarSizeInBits(); + // Helper for determining whether a value is a power-2 constant scalar or a // vector of such elements. auto IsPowerOfTwo = [](ConstantSDNode *C) { @@ -4335,20 +4338,11 @@ static bool isDivisorPowerOfTwo(SDValue Divisor) { return false; }; - return ISD::matchUnaryPredicate(Divisor, IsPowerOfTwo); -} - -SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) { - SDLoc DL(N); - EVT VT = N->getValueType(0); - EVT CCVT = getSetCCResultType(VT); - unsigned BitWidth = VT.getScalarSizeInBits(); - // fold (sdiv X, pow2) -> simple ops after legalize // FIXME: We check for the exact bit here because the generic lowering gives // better results in that case. The target-specific lowering should learn how // to handle exact sdivs efficiently. - if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1)) { + if (!N->getFlags().hasExact() && ISD::matchUnaryPredicate(N1, IsPowerOfTwo)) { // Target-specific implementation of sdiv x, pow2. if (SDValue Res = BuildSDIVPow2(N)) return Res; @@ -4504,16 +4498,6 @@ SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) { return SDValue(); } -SDValue DAGCombiner::buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N) { - if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1) && - !DAG.doesNodeExist(ISD::SDIV, N->getVTList(), {N0, N1})) { - // Target-specific implementation of srem x, pow2. - if (SDValue Res = BuildSREMPow2(N)) - return Res; - } - return SDValue(); -} - // handles ISD::SREM and ISD::UREM SDValue DAGCombiner::visitREM(SDNode *N) { unsigned Opcode = N->getOpcode(); @@ -4574,12 +4558,6 @@ SDValue DAGCombiner::visitREM(SDNode *N) { // combine will not return a DIVREM. Regardless, checking cheapness here // makes sense since the simplification results in fatter code. if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) { - if (isSigned) { - // check if we can build faster implementation for srem - SDValue OptimizedRem = buildOptimizedSREM(N0, N1, N); - if (OptimizedRem.getNode()) - return OptimizedRem; - } SDValue OptimizedDiv = isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N); if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != N) { @@ -23898,27 +23876,6 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) { return SDValue(); } -/// Given an ISD::SREM node expressing a remainder by constant power of 2, -/// return a DAG expression that will generate the same value. -SDValue DAGCombiner::BuildSREMPow2(SDNode *N) { - ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); - if (!C) - return SDValue(); - - // Avoid division by zero. - if (C->isZero()) - return SDValue(); - - SmallVector Built; - if (SDValue S = TLI.BuildSREMPow2(N, C->getAPIntValue(), DAG, Built)) { - for (SDNode *N : Built) - AddToWorklist(N); - return S; - } - - return SDValue(); -} - /// Determines the LogBase2 value for a non-null input value using the /// transform: LogBase2(V) = (EltBits - 1) - ctlz(V). SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 1182e60f37a6..bf26ad5a628d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5560,17 +5560,6 @@ SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, return SDValue(); } -SDValue -TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor, - SelectionDAG &DAG, - SmallVectorImpl &Created) const { - AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.isIntDivCheap(N->getValueType(0), Attr)) - return SDValue(N, 0); // Lower SREM as SREM - return SDValue(); -} - /// Given an ISD::SDIV node expressing a divide by constant, /// return a DAG expression to select that will generate the same value by /// multiplying by a magic number. diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ba4b19c21b62..cde9433e647d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13567,60 +13567,6 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); } -SDValue -AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor, - SelectionDAG &DAG, - SmallVectorImpl &Created) const { - AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); - if (isIntDivCheap(N->getValueType(0), Attr)) - return SDValue(N, 0); // Lower SREM as SREM - - EVT VT = N->getValueType(0); - - // For scalable and fixed types, mark them as cheap so we can handle it much - // later. This allows us to handle larger than legal types. - if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors()) - return SDValue(N, 0); - - // fold (srem X, pow2) - if ((VT != MVT::i32 && VT != MVT::i64) || - !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) - return SDValue(); - - unsigned Lg2 = Divisor.countTrailingZeros(); - if (Lg2 == 0) - return SDValue(); - - SDLoc DL(N); - SDValue N0 = N->getOperand(0); - SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue CCVal, CSNeg; - if (Lg2 == 1) { - SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); - SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne); - CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp); - - Created.push_back(Cmp.getNode()); - Created.push_back(And.getNode()); - } else { - SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC); - SDVTList VTs = DAG.getVTList(VT, MVT::i32); - - SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0); - SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne); - SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne); - CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal, - Negs.getValue(1)); - - Created.push_back(Negs.getNode()); - Created.push_back(AndPos.getNode()); - Created.push_back(AndNeg.getNode()); - } - - return CSNeg; -} - static bool IsSVECntIntrinsic(SDValue S) { switch(getIntrinsicID(S.getNode())) { default: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index f1b4c0ade210..e971d30d8c87 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1043,8 +1043,6 @@ private: SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const override; - SDValue BuildSREMPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, - SmallVectorImpl &Created) const override; SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override; diff --git a/llvm/test/CodeGen/AArch64/srem-pow2.ll b/llvm/test/CodeGen/AArch64/srem-pow2.ll index b825088c8b5d..419a9ac9a91e 100644 --- a/llvm/test/CodeGen/AArch64/srem-pow2.ll +++ b/llvm/test/CodeGen/AArch64/srem-pow2.ll @@ -43,9 +43,10 @@ define i16 @fold_srem_2_i16(i16 %x) { define i32 @fold_srem_2_i64(i32 %x) { ; CHECK-LABEL: fold_srem_2_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0x1 ; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: cneg w0, w8, ge +; CHECK-NEXT: cinc w8, w0, lt +; CHECK-NEXT: and w8, w8, #0xfffffffe +; CHECK-NEXT: sub w0, w0, w8 ; CHECK-NEXT: ret %1 = srem i32 %x, 2 ret i32 %1 @@ -54,9 +55,10 @@ define i32 @fold_srem_2_i64(i32 %x) { define i64 @fold_srem_2_i32(i64 %x) { ; CHECK-LABEL: fold_srem_2_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: and x8, x0, #0x1 ; CHECK-NEXT: cmp x0, #0 -; CHECK-NEXT: cneg x0, x8, ge +; CHECK-NEXT: cinc x8, x0, lt +; CHECK-NEXT: and x8, x8, #0xfffffffffffffffe +; CHECK-NEXT: sub x0, x0, x8 ; CHECK-NEXT: ret %1 = srem i64 %x, 2 ret i64 %1 @@ -78,10 +80,11 @@ define i16 @fold_srem_pow2_i16(i16 %x) { define i32 @fold_srem_pow2_i32(i32 %x) { ; CHECK-LABEL: fold_srem_pow2_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: negs w8, w0 -; CHECK-NEXT: and w9, w0, #0x3f -; CHECK-NEXT: and w8, w8, #0x3f -; CHECK-NEXT: csneg w0, w9, w8, mi +; CHECK-NEXT: add w8, w0, #63 +; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: and w8, w8, #0xffffffc0 +; CHECK-NEXT: sub w0, w0, w8 ; CHECK-NEXT: ret %1 = srem i32 %x, 64 ret i32 %1 @@ -90,10 +93,11 @@ define i32 @fold_srem_pow2_i32(i32 %x) { define i64 @fold_srem_pow2_i64(i64 %x) { ; CHECK-LABEL: fold_srem_pow2_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: negs x8, x0 -; CHECK-NEXT: and x9, x0, #0x3f -; CHECK-NEXT: and x8, x8, #0x3f -; CHECK-NEXT: csneg x0, x9, x8, mi +; CHECK-NEXT: add x8, x0, #63 +; CHECK-NEXT: cmp x0, #0 +; CHECK-NEXT: csel x8, x8, x0, lt +; CHECK-NEXT: and x8, x8, #0xffffffffffffffc0 +; CHECK-NEXT: sub x0, x0, x8 ; CHECK-NEXT: ret %1 = srem i64 %x, 64 ret i64 %1 @@ -115,10 +119,12 @@ define i16 @fold_srem_smax_i16(i16 %x) { define i32 @fold_srem_smax_i32(i32 %x) { ; CHECK-LABEL: fold_srem_smax_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: negs w8, w0 -; CHECK-NEXT: and w9, w0, #0x7fffffff -; CHECK-NEXT: and w8, w8, #0x7fffffff -; CHECK-NEXT: csneg w0, w9, w8, mi +; CHECK-NEXT: mov w8, #2147483647 +; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: add w8, w0, w8 +; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: and w8, w8, #0x80000000 +; CHECK-NEXT: add w0, w0, w8 ; CHECK-NEXT: ret %1 = srem i32 %x, 2147483648 ret i32 %1 @@ -127,10 +133,12 @@ define i32 @fold_srem_smax_i32(i32 %x) { define i64 @fold_srem_smax_i64(i64 %x) { ; CHECK-LABEL: fold_srem_smax_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: negs x8, x0 -; CHECK-NEXT: and x9, x0, #0x7fffffffffffffff -; CHECK-NEXT: and x8, x8, #0x7fffffffffffffff -; CHECK-NEXT: csneg x0, x9, x8, mi +; CHECK-NEXT: mov x8, #9223372036854775807 +; CHECK-NEXT: cmp x0, #0 +; CHECK-NEXT: add x8, x0, x8 +; CHECK-NEXT: csel x8, x8, x0, lt +; CHECK-NEXT: and x8, x8, #0x8000000000000000 +; CHECK-NEXT: add x0, x0, x8 ; CHECK-NEXT: ret %1 = srem i64 %x, -9223372036854775808 ret i64 %1 diff --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll index 5192de1a0e88..01876cd02363 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -234,11 +234,11 @@ define i32 @test_srem_one(i32 %X) nounwind { define i32 @test_srem_pow2(i32 %X) nounwind { ; CHECK-LABEL: test_srem_pow2: ; CHECK: // %bb.0: -; CHECK-NEXT: negs w8, w0 -; CHECK-NEXT: and w9, w0, #0xf -; CHECK-NEXT: and w8, w8, #0xf -; CHECK-NEXT: csneg w8, w9, w8, mi -; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: add w8, w0, #15 +; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: and w8, w8, #0xfffffff0 +; CHECK-NEXT: cmp w0, w8 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 16 @@ -251,11 +251,12 @@ define i32 @test_srem_pow2(i32 %X) nounwind { define i32 @test_srem_int_min(i32 %X) nounwind { ; CHECK-LABEL: test_srem_int_min: ; CHECK: // %bb.0: -; CHECK-NEXT: negs w8, w0 -; CHECK-NEXT: and w9, w0, #0x7fffffff -; CHECK-NEXT: and w8, w8, #0x7fffffff -; CHECK-NEXT: csneg w8, w9, w8, mi -; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: mov w8, #2147483647 +; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: add w8, w0, w8 +; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: and w8, w8, #0x80000000 +; CHECK-NEXT: cmn w0, w8 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 2147483648 diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll index 5c7b76f54f85..1f27542cd784 100644 --- a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll @@ -159,33 +159,36 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; CHECK-NEXT: smov w9, v0.h[1] ; CHECK-NEXT: smov w10, v0.h[0] ; CHECK-NEXT: mov w8, #37253 -; CHECK-NEXT: smov w12, v0.h[2] ; CHECK-NEXT: movk w8, #44150, lsl #16 -; CHECK-NEXT: negs w11, w9 -; CHECK-NEXT: and w9, w9, #0x1f -; CHECK-NEXT: and w11, w11, #0x1f -; CHECK-NEXT: csneg w9, w9, w11, mi -; CHECK-NEXT: negs w11, w10 -; CHECK-NEXT: and w10, w10, #0x3f -; CHECK-NEXT: and w11, w11, #0x3f -; CHECK-NEXT: csneg w10, w10, w11, mi -; CHECK-NEXT: smov w11, v0.h[3] -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: negs w10, w12 -; CHECK-NEXT: smull x8, w11, w8 -; CHECK-NEXT: and w10, w10, #0x7 +; CHECK-NEXT: add w11, w9, #31 +; CHECK-NEXT: cmp w9, #0 +; CHECK-NEXT: add w12, w10, #63 +; CHECK-NEXT: csel w11, w11, w9, lt +; CHECK-NEXT: cmp w10, #0 +; CHECK-NEXT: and w11, w11, #0xffffffe0 +; CHECK-NEXT: csel w12, w12, w10, lt +; CHECK-NEXT: sub w9, w9, w11 +; CHECK-NEXT: and w12, w12, #0xffffffc0 +; CHECK-NEXT: sub w10, w10, w12 +; CHECK-NEXT: smov w12, v0.h[3] +; CHECK-NEXT: fmov s1, w10 +; CHECK-NEXT: smov w10, v0.h[2] +; CHECK-NEXT: smull x8, w12, w8 +; CHECK-NEXT: mov v1.h[1], w9 ; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: and w9, w12, #0x7 -; CHECK-NEXT: add w8, w8, w11 -; CHECK-NEXT: csneg w9, w9, w10, mi +; CHECK-NEXT: add w9, w10, #7 +; CHECK-NEXT: cmp w10, #0 +; CHECK-NEXT: csel w9, w9, w10, lt +; CHECK-NEXT: add w8, w8, w12 +; CHECK-NEXT: and w9, w9, #0xfffffff8 +; CHECK-NEXT: sub w9, w10, w9 ; CHECK-NEXT: asr w10, w8, #6 ; CHECK-NEXT: add w8, w10, w8, lsr #31 ; CHECK-NEXT: mov w10, #95 -; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: msub w8, w8, w10, w11 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov v1.h[2], w9 +; CHECK-NEXT: msub w8, w8, w10, w12 +; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -242,25 +245,27 @@ define <4 x i16> @dont_fold_srem_i16_smax(<4 x i16> %x) { ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w8, v0.h[2] ; CHECK-NEXT: mov w9, #17097 -; CHECK-NEXT: movk w9, #45590, lsl #16 ; CHECK-NEXT: smov w10, v0.h[1] +; CHECK-NEXT: movk w9, #45590, lsl #16 +; CHECK-NEXT: mov w11, #32767 ; CHECK-NEXT: smov w12, v0.h[3] ; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: mov w11, #23 ; CHECK-NEXT: smull x9, w8, w9 +; CHECK-NEXT: add w11, w10, w11 +; CHECK-NEXT: cmp w10, #0 ; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: csel w11, w11, w10, lt ; CHECK-NEXT: add w9, w9, w8 +; CHECK-NEXT: and w11, w11, #0xffff8000 ; CHECK-NEXT: asr w13, w9, #4 +; CHECK-NEXT: sub w10, w10, w11 +; CHECK-NEXT: mov w11, #47143 ; CHECK-NEXT: add w9, w13, w9, lsr #31 -; CHECK-NEXT: negs w13, w10 -; CHECK-NEXT: and w10, w10, #0x7fff -; CHECK-NEXT: and w13, w13, #0x7fff -; CHECK-NEXT: csneg w10, w10, w13, mi -; CHECK-NEXT: mov w13, #47143 -; CHECK-NEXT: movk w13, #24749, lsl #16 -; CHECK-NEXT: msub w8, w9, w11, w8 -; CHECK-NEXT: smull x9, w12, w13 +; CHECK-NEXT: mov w13, #23 +; CHECK-NEXT: movk w11, #24749, lsl #16 ; CHECK-NEXT: mov v1.h[1], w10 +; CHECK-NEXT: msub w8, w9, w13, w8 +; CHECK-NEXT: smull x9, w12, w11 ; CHECK-NEXT: lsr x10, x9, #63 ; CHECK-NEXT: asr x9, x9, #43 ; CHECK-NEXT: add w9, w9, w10