From acfc025a7232052f594719e8e9447368a3b21018 Mon Sep 17 00:00:00 2001
From: "chenglin.bi" <chenglin.bi@cixcomputing.com>
Date: Mon, 18 Apr 2022 10:35:09 +0800
Subject: [PATCH] Revert "[Arch64][SelectionDAG] Add target-specific
 implementation of srem"

This reverts commit 9d9eddd3dde46751a5c415b7e5e475b4feb76600.
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  8 ---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 57 ++-------------
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 11 ---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 54 ---------------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  2 -
 llvm/test/CodeGen/AArch64/srem-pow2.ll        | 48 +++++++------
 llvm/test/CodeGen/AArch64/srem-seteq.ll       | 21 +++---
 llvm/test/CodeGen/AArch64/srem-vector-lkk.ll  | 69 ++++++++++---------
 8 files changed, 83 insertions(+), 187 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 4f09b9a6da57..dfc0af04b759 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4479,14 +4479,6 @@ public:
                                 SelectionDAG &DAG,
                                 SmallVectorImpl<SDNode *> &Created) const;
 
-  /// Targets may override this function to provide custom SREM lowering for
-  /// power-of-2 denominators.  If the target returns an empty SDValue, LLVM
-  /// assumes SREM is expensive and replaces it with a series of other integer
-  /// operations.
-  virtual SDValue BuildSREMPow2(SDNode *N, const APInt &Divisor,
-                                SelectionDAG &DAG,
-                                SmallVectorImpl<SDNode *> &Created) const;
-
   /// Indicate whether this target prefers to combine FDIVs with the same
   /// divisor. If the transform should never be done, return zero. If the
   /// transform should be done, return the minimum number of divisor uses
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ee0c34235203..c3d30ad435ef 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -569,8 +569,6 @@ namespace {
     SDValue BuildSDIV(SDNode *N);
     SDValue BuildSDIVPow2(SDNode *N);
     SDValue BuildUDIV(SDNode *N);
-    SDValue BuildSREMPow2(SDNode *N);
-    SDValue buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N);
     SDValue BuildLogBase2(SDValue V, const SDLoc &DL);
     SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags);
     SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
@@ -4322,7 +4320,12 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
   return SDValue();
 }
 
-static bool isDivisorPowerOfTwo(SDValue Divisor) {
+SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  EVT CCVT = getSetCCResultType(VT);
+  unsigned BitWidth = VT.getScalarSizeInBits();
+
   // Helper for determining whether a value is a power-2 constant scalar or a
   // vector of such elements.
   auto IsPowerOfTwo = [](ConstantSDNode *C) {
@@ -4335,20 +4338,11 @@ static bool isDivisorPowerOfTwo(SDValue Divisor) {
     return false;
   };
 
-  return ISD::matchUnaryPredicate(Divisor, IsPowerOfTwo);
-}
-
-SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
-  EVT CCVT = getSetCCResultType(VT);
-  unsigned BitWidth = VT.getScalarSizeInBits();
-
   // fold (sdiv X, pow2) -> simple ops after legalize
   // FIXME: We check for the exact bit here because the generic lowering gives
   // better results in that case. The target-specific lowering should learn how
   // to handle exact sdivs efficiently.
-  if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1)) {
+  if (!N->getFlags().hasExact() && ISD::matchUnaryPredicate(N1, IsPowerOfTwo)) {
     // Target-specific implementation of sdiv x, pow2.
     if (SDValue Res = BuildSDIVPow2(N))
       return Res;
@@ -4504,16 +4498,6 @@ SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) {
   return SDValue();
 }
 
-SDValue DAGCombiner::buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N) {
-  if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1) &&
-      !DAG.doesNodeExist(ISD::SDIV, N->getVTList(), {N0, N1})) {
-    // Target-specific implementation of srem x, pow2.
-    if (SDValue Res = BuildSREMPow2(N))
-      return Res;
-  }
-  return SDValue();
-}
-
 // handles ISD::SREM and ISD::UREM
 SDValue DAGCombiner::visitREM(SDNode *N) {
   unsigned Opcode = N->getOpcode();
@@ -4574,12 +4558,6 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
   // combine will not return a DIVREM.  Regardless, checking cheapness here
   // makes sense since the simplification results in fatter code.
   if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) {
-    if (isSigned) {
-      // check if we can build faster implementation for srem
-      SDValue OptimizedRem = buildOptimizedSREM(N0, N1, N);
-      if (OptimizedRem.getNode())
-        return OptimizedRem;
-    }
     SDValue OptimizedDiv =
         isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N);
     if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != N) {
@@ -23898,27 +23876,6 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) {
   return SDValue();
 }
 
-/// Given an ISD::SREM node expressing a remainder by constant power of 2,
-/// return a DAG expression that will generate the same value.
-SDValue DAGCombiner::BuildSREMPow2(SDNode *N) {
-  ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
-  if (!C)
-    return SDValue();
-
-  // Avoid division by zero.
-  if (C->isZero())
-    return SDValue();
-
-  SmallVector<SDNode *, 8> Built;
-  if (SDValue S = TLI.BuildSREMPow2(N, C->getAPIntValue(), DAG, Built)) {
-    for (SDNode *N : Built)
-      AddToWorklist(N);
-    return S;
-  }
-
-  return SDValue();
-}
-
 /// Determines the LogBase2 value for a non-null input value using the
 /// transform: LogBase2(V) = (EltBits - 1) - ctlz(V).
 SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1182e60f37a6..bf26ad5a628d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5560,17 +5560,6 @@ SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
   return SDValue();
 }
 
-SDValue
-TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
-                              SelectionDAG &DAG,
-                              SmallVectorImpl<SDNode *> &Created) const {
-  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (TLI.isIntDivCheap(N->getValueType(0), Attr))
-    return SDValue(N, 0); // Lower SREM as SREM
-  return SDValue();
-}
-
 /// Given an ISD::SDIV node expressing a divide by constant,
 /// return a DAG expression to select that will generate the same value by
 /// multiplying by a magic number.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ba4b19c21b62..cde9433e647d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13567,60 +13567,6 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
   return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
 }
 
-SDValue
-AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
-                                     SelectionDAG &DAG,
-                                     SmallVectorImpl<SDNode *> &Created) const {
-  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
-  if (isIntDivCheap(N->getValueType(0), Attr))
-    return SDValue(N, 0); // Lower SREM as SREM
-
-  EVT VT = N->getValueType(0);
-
-  // For scalable and fixed types, mark them as cheap so we can handle it much
-  // later. This allows us to handle larger than legal types.
-  if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
-    return SDValue(N, 0);
-
-  // fold (srem X, pow2)
-  if ((VT != MVT::i32 && VT != MVT::i64) ||
-      !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
-    return SDValue();
-
-  unsigned Lg2 = Divisor.countTrailingZeros();
-  if (Lg2 == 0)
-    return SDValue();
-
-  SDLoc DL(N);
-  SDValue N0 = N->getOperand(0);
-  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
-  SDValue Zero = DAG.getConstant(0, DL, VT);
-  SDValue CCVal, CSNeg;
-  if (Lg2 == 1) {
-    SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
-    SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
-    CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
-
-    Created.push_back(Cmp.getNode());
-    Created.push_back(And.getNode());
-  } else {
-    SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
-    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
-
-    SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
-    SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
-    SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
-    CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
-                        Negs.getValue(1));
-
-    Created.push_back(Negs.getNode());
-    Created.push_back(AndPos.getNode());
-    Created.push_back(AndNeg.getNode());
-  }
-
-  return CSNeg;
-}
-
 static bool IsSVECntIntrinsic(SDValue S) {
   switch(getIntrinsicID(S.getNode())) {
   default:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index f1b4c0ade210..e971d30d8c87 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1043,8 +1043,6 @@ private:
 
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         SmallVectorImpl<SDNode *> &Created) const override;
-  SDValue BuildSREMPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
-                        SmallVectorImpl<SDNode *> &Created) const override;
   SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                           int &ExtraSteps, bool &UseOneConst,
                           bool Reciprocal) const override;
diff --git a/llvm/test/CodeGen/AArch64/srem-pow2.ll b/llvm/test/CodeGen/AArch64/srem-pow2.ll
index b825088c8b5d..419a9ac9a91e 100644
--- a/llvm/test/CodeGen/AArch64/srem-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/srem-pow2.ll
@@ -43,9 +43,10 @@ define i16 @fold_srem_2_i16(i16 %x) {
 define i32 @fold_srem_2_i64(i32 %x) {
 ; CHECK-LABEL: fold_srem_2_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cneg w0, w8, ge
+; CHECK-NEXT:    cinc w8, w0, lt
+; CHECK-NEXT:    and w8, w8, #0xfffffffe
+; CHECK-NEXT:    sub w0, w0, w8
 ; CHECK-NEXT:    ret
   %1 = srem i32 %x, 2
   ret i32 %1
@@ -54,9 +55,10 @@ define i32 @fold_srem_2_i64(i32 %x) {
 define i64 @fold_srem_2_i32(i64 %x) {
 ; CHECK-LABEL: fold_srem_2_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x8, x0, #0x1
 ; CHECK-NEXT:    cmp x0, #0
-; CHECK-NEXT:    cneg x0, x8, ge
+; CHECK-NEXT:    cinc x8, x0, lt
+; CHECK-NEXT:    and x8, x8, #0xfffffffffffffffe
+; CHECK-NEXT:    sub x0, x0, x8
 ; CHECK-NEXT:    ret
   %1 = srem i64 %x, 2
   ret i64 %1
@@ -78,10 +80,11 @@ define i16 @fold_srem_pow2_i16(i16 %x) {
 define i32 @fold_srem_pow2_i32(i32 %x) {
 ; CHECK-LABEL: fold_srem_pow2_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    negs w8, w0
-; CHECK-NEXT:    and w9, w0, #0x3f
-; CHECK-NEXT:    and w8, w8, #0x3f
-; CHECK-NEXT:    csneg w0, w9, w8, mi
+; CHECK-NEXT:    add w8, w0, #63
+; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    csel w8, w8, w0, lt
+; CHECK-NEXT:    and w8, w8, #0xffffffc0
+; CHECK-NEXT:    sub w0, w0, w8
 ; CHECK-NEXT:    ret
   %1 = srem i32 %x, 64
   ret i32 %1
@@ -90,10 +93,11 @@ define i32 @fold_srem_pow2_i32(i32 %x) {
 define i64 @fold_srem_pow2_i64(i64 %x) {
 ; CHECK-LABEL: fold_srem_pow2_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    negs x8, x0
-; CHECK-NEXT:    and x9, x0, #0x3f
-; CHECK-NEXT:    and x8, x8, #0x3f
-; CHECK-NEXT:    csneg x0, x9, x8, mi
+; CHECK-NEXT:    add x8, x0, #63
+; CHECK-NEXT:    cmp x0, #0
+; CHECK-NEXT:    csel x8, x8, x0, lt
+; CHECK-NEXT:    and x8, x8, #0xffffffffffffffc0
+; CHECK-NEXT:    sub x0, x0, x8
 ; CHECK-NEXT:    ret
   %1 = srem i64 %x, 64
   ret i64 %1
@@ -115,10 +119,12 @@ define i16 @fold_srem_smax_i16(i16 %x) {
 define i32 @fold_srem_smax_i32(i32 %x) {
 ; CHECK-LABEL: fold_srem_smax_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    negs w8, w0
-; CHECK-NEXT:    and w9, w0, #0x7fffffff
-; CHECK-NEXT:    and w8, w8, #0x7fffffff
-; CHECK-NEXT:    csneg w0, w9, w8, mi
+; CHECK-NEXT:    mov w8, #2147483647
+; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    add w8, w0, w8
+; CHECK-NEXT:    csel w8, w8, w0, lt
+; CHECK-NEXT:    and w8, w8, #0x80000000
+; CHECK-NEXT:    add w0, w0, w8
 ; CHECK-NEXT:    ret
   %1 = srem i32 %x, 2147483648
   ret i32 %1
@@ -127,10 +133,12 @@ define i32 @fold_srem_smax_i32(i32 %x) {
 define i64 @fold_srem_smax_i64(i64 %x) {
 ; CHECK-LABEL: fold_srem_smax_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    negs x8, x0
-; CHECK-NEXT:    and x9, x0, #0x7fffffffffffffff
-; CHECK-NEXT:    and x8, x8, #0x7fffffffffffffff
-; CHECK-NEXT:    csneg x0, x9, x8, mi
+; CHECK-NEXT:    mov x8, #9223372036854775807
+; CHECK-NEXT:    cmp x0, #0
+; CHECK-NEXT:    add x8, x0, x8
+; CHECK-NEXT:    csel x8, x8, x0, lt
+; CHECK-NEXT:    and x8, x8, #0x8000000000000000
+; CHECK-NEXT:    add x0, x0, x8
 ; CHECK-NEXT:    ret
   %1 = srem i64 %x, -9223372036854775808
   ret i64 %1
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll
index 5192de1a0e88..01876cd02363 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll
@@ -234,11 +234,11 @@ define i32 @test_srem_one(i32 %X) nounwind {
 define i32 @test_srem_pow2(i32 %X) nounwind {
 ; CHECK-LABEL: test_srem_pow2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    negs w8, w0
-; CHECK-NEXT:    and w9, w0, #0xf
-; CHECK-NEXT:    and w8, w8, #0xf
-; CHECK-NEXT:    csneg w8, w9, w8, mi
-; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    add w8, w0, #15
+; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    csel w8, w8, w0, lt
+; CHECK-NEXT:    and w8, w8, #0xfffffff0
+; CHECK-NEXT:    cmp w0, w8
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %srem = srem i32 %X, 16
@@ -251,11 +251,12 @@ define i32 @test_srem_pow2(i32 %X) nounwind {
 define i32 @test_srem_int_min(i32 %X) nounwind {
 ; CHECK-LABEL: test_srem_int_min:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    negs w8, w0
-; CHECK-NEXT:    and w9, w0, #0x7fffffff
-; CHECK-NEXT:    and w8, w8, #0x7fffffff
-; CHECK-NEXT:    csneg w8, w9, w8, mi
-; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    mov w8, #2147483647
+; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    add w8, w0, w8
+; CHECK-NEXT:    csel w8, w8, w0, lt
+; CHECK-NEXT:    and w8, w8, #0x80000000
+; CHECK-NEXT:    cmn w0, w8
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %srem = srem i32 %X, 2147483648
diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
index 5c7b76f54f85..1f27542cd784 100644
--- a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
@@ -159,33 +159,36 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
 ; CHECK-NEXT:    smov w9, v0.h[1]
 ; CHECK-NEXT:    smov w10, v0.h[0]
 ; CHECK-NEXT:    mov w8, #37253
-; CHECK-NEXT:    smov w12, v0.h[2]
 ; CHECK-NEXT:    movk w8, #44150, lsl #16
-; CHECK-NEXT:    negs w11, w9
-; CHECK-NEXT:    and w9, w9, #0x1f
-; CHECK-NEXT:    and w11, w11, #0x1f
-; CHECK-NEXT:    csneg w9, w9, w11, mi
-; CHECK-NEXT:    negs w11, w10
-; CHECK-NEXT:    and w10, w10, #0x3f
-; CHECK-NEXT:    and w11, w11, #0x3f
-; CHECK-NEXT:    csneg w10, w10, w11, mi
-; CHECK-NEXT:    smov w11, v0.h[3]
-; CHECK-NEXT:    fmov s0, w10
-; CHECK-NEXT:    negs w10, w12
-; CHECK-NEXT:    smull x8, w11, w8
-; CHECK-NEXT:    and w10, w10, #0x7
+; CHECK-NEXT:    add w11, w9, #31
+; CHECK-NEXT:    cmp w9, #0
+; CHECK-NEXT:    add w12, w10, #63
+; CHECK-NEXT:    csel w11, w11, w9, lt
+; CHECK-NEXT:    cmp w10, #0
+; CHECK-NEXT:    and w11, w11, #0xffffffe0
+; CHECK-NEXT:    csel w12, w12, w10, lt
+; CHECK-NEXT:    sub w9, w9, w11
+; CHECK-NEXT:    and w12, w12, #0xffffffc0
+; CHECK-NEXT:    sub w10, w10, w12
+; CHECK-NEXT:    smov w12, v0.h[3]
+; CHECK-NEXT:    fmov s1, w10
+; CHECK-NEXT:    smov w10, v0.h[2]
+; CHECK-NEXT:    smull x8, w12, w8
+; CHECK-NEXT:    mov v1.h[1], w9
 ; CHECK-NEXT:    lsr x8, x8, #32
-; CHECK-NEXT:    mov v0.h[1], w9
-; CHECK-NEXT:    and w9, w12, #0x7
-; CHECK-NEXT:    add w8, w8, w11
-; CHECK-NEXT:    csneg w9, w9, w10, mi
+; CHECK-NEXT:    add w9, w10, #7
+; CHECK-NEXT:    cmp w10, #0
+; CHECK-NEXT:    csel w9, w9, w10, lt
+; CHECK-NEXT:    add w8, w8, w12
+; CHECK-NEXT:    and w9, w9, #0xfffffff8
+; CHECK-NEXT:    sub w9, w10, w9
 ; CHECK-NEXT:    asr w10, w8, #6
 ; CHECK-NEXT:    add w8, w10, w8, lsr #31
 ; CHECK-NEXT:    mov w10, #95
-; CHECK-NEXT:    mov v0.h[2], w9
-; CHECK-NEXT:    msub w8, w8, w10, w11
-; CHECK-NEXT:    mov v0.h[3], w8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    mov v1.h[2], w9
+; CHECK-NEXT:    msub w8, w8, w10, w12
+; CHECK-NEXT:    mov v1.h[3], w8
+; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
   ret <4 x i16> %1
@@ -242,25 +245,27 @@ define <4 x i16> @dont_fold_srem_i16_smax(<4 x i16> %x) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    smov w8, v0.h[2]
 ; CHECK-NEXT:    mov w9, #17097
-; CHECK-NEXT:    movk w9, #45590, lsl #16
 ; CHECK-NEXT:    smov w10, v0.h[1]
+; CHECK-NEXT:    movk w9, #45590, lsl #16
+; CHECK-NEXT:    mov w11, #32767
 ; CHECK-NEXT:    smov w12, v0.h[3]
 ; CHECK-NEXT:    movi d1, #0000000000000000
-; CHECK-NEXT:    mov w11, #23
 ; CHECK-NEXT:    smull x9, w8, w9
+; CHECK-NEXT:    add w11, w10, w11
+; CHECK-NEXT:    cmp w10, #0
 ; CHECK-NEXT:    lsr x9, x9, #32
+; CHECK-NEXT:    csel w11, w11, w10, lt
 ; CHECK-NEXT:    add w9, w9, w8
+; CHECK-NEXT:    and w11, w11, #0xffff8000
 ; CHECK-NEXT:    asr w13, w9, #4
+; CHECK-NEXT:    sub w10, w10, w11
+; CHECK-NEXT:    mov w11, #47143
 ; CHECK-NEXT:    add w9, w13, w9, lsr #31
-; CHECK-NEXT:    negs w13, w10
-; CHECK-NEXT:    and w10, w10, #0x7fff
-; CHECK-NEXT:    and w13, w13, #0x7fff
-; CHECK-NEXT:    csneg w10, w10, w13, mi
-; CHECK-NEXT:    mov w13, #47143
-; CHECK-NEXT:    movk w13, #24749, lsl #16
-; CHECK-NEXT:    msub w8, w9, w11, w8
-; CHECK-NEXT:    smull x9, w12, w13
+; CHECK-NEXT:    mov w13, #23
+; CHECK-NEXT:    movk w11, #24749, lsl #16
 ; CHECK-NEXT:    mov v1.h[1], w10
+; CHECK-NEXT:    msub w8, w9, w13, w8
+; CHECK-NEXT:    smull x9, w12, w11
 ; CHECK-NEXT:    lsr x10, x9, #63
 ; CHECK-NEXT:    asr x9, x9, #43
 ; CHECK-NEXT:    add w9, w9, w10