[AArch64] Optimise min/max lowering in ISel

Differential Revision: https://reviews.llvm.org/D106561
2021-07-22 16:21:48 +01:00 · 2021-07-22 16:21:48 +01:00 · b01417d3c5
parent 9988ab3989
commit b01417d3c5
7 changed files with 107 additions and 129 deletions
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -1040,6 +1040,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
    setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
+    for (auto VT : {MVT::v1i64, MVT::v2i64}) {
+      setOperationAction(ISD::UMAX, VT, Custom);
+      setOperationAction(ISD::SMAX, VT, Custom);
+      setOperationAction(ISD::UMIN, VT, Custom);
+      setOperationAction(ISD::SMIN, VT, Custom);
+    }

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
@ -4825,17 +4831,10 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
  case ISD::UDIV:
    return LowerDIV(Op, DAG);
  case ISD::SMIN:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
-                               /*OverrideNEON=*/true);
  case ISD::UMIN:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
-                               /*OverrideNEON=*/true);
  case ISD::SMAX:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
-                               /*OverrideNEON=*/true);
  case ISD::UMAX:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
-                               /*OverrideNEON=*/true);
+    return LowerMinMax(Op, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:
@ -7131,6 +7130,56 @@ SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
 }

+/// Custom lowering for ISD::SMAX, ISD::SMIN, ISD::UMAX and ISD::UMIN.
+///
+/// Scalable vector types, and fixed-length types for which
+/// useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true) holds, are handed
+/// to LowerToPredicatedOp with the matching AArch64ISD::*_PRED opcode. Every
+/// other type is rewritten as a setcc of the two operands feeding a select
+/// between those same operands.
+SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  const unsigned MinMaxOpc = Op.getOpcode();
+  const EVT ResultVT = Op.getValueType();
+
+  // SVE path: one predicated node per min/max flavour.
+  if (ResultVT.isScalableVector() ||
+      useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true)) {
+    switch (MinMaxOpc) {
+    case ISD::SMAX:
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
+                                 /*OverrideNEON=*/true);
+    case ISD::SMIN:
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
+                                 /*OverrideNEON=*/true);
+    case ISD::UMAX:
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
+                                 /*OverrideNEON=*/true);
+    case ISD::UMIN:
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
+                                 /*OverrideNEON=*/true);
+    default:
+      llvm_unreachable("Wrong instruction");
+    }
+  }
+
+  // Compare+select path: choose the condition code passed to the setcc whose
+  // result drives the select (operand 0 is the "true" value).
+  ISD::CondCode KeepLHS;
+  switch (MinMaxOpc) {
+  case ISD::SMAX:
+    KeepLHS = ISD::SETGT;
+    break;
+  case ISD::SMIN:
+    KeepLHS = ISD::SETLT;
+    break;
+  case ISD::UMAX:
+    KeepLHS = ISD::SETUGT;
+    break;
+  case ISD::UMIN:
+    KeepLHS = ISD::SETULT;
+    break;
+  default:
+    llvm_unreachable("Wrong instruction");
+  }
+
+  const SDLoc DL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue KeepMask = DAG.getSetCC(DL, ResultVT, LHS, RHS, KeepLHS);
+  return DAG.getSelect(DL, ResultVT, KeepMask, LHS, RHS);
+}
+
 SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
                                               SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@ -966,6 +966,7 @@ private:
  SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerBitreverse(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@ -220,19 +220,15 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::umin:
-  case Intrinsic::umax: {
-    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
-    // umin(x,y) -> sub(x,usubsat(x,y))
-    // umax(x,y) -> add(x,usubsat(y,x))
-    if (LT.second == MVT::v2i64)
-      return LT.first * 2;
-    LLVM_FALLTHROUGH;
-  }
+  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                        MVT::v8i16, MVT::v2i32, MVT::v4i32};
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
+    // v2i64 types get converted to cmp+bif, hence the cost of 2.
+    if (LT.second == MVT::v2i64)
+      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
--- a/llvm/test/Analysis/CostModel/AArch64/min-max.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/min-max.ll
@ -96,8 +96,8 @@ define void @reduce_smin() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %V4i64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
  %V1i8 = call <1 x i8> @llvm.smin.v1i8(<1 x i8> undef, <1 x i8> undef)
@ -135,8 +135,8 @@ define void @reduce_smax() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %V4i64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
  %V1i8 = call <1 x i8> @llvm.smax.v1i8(<1 x i8> undef, <1 x i8> undef)
--- a/llvm/test/CodeGen/AArch64/min-max.ll
+++ b/llvm/test/CodeGen/AArch64/min-max.ll
@ -185,13 +185,8 @@ declare <1 x i64> @llvm.smax.v1i64(<1 x i64> %a, <1 x i64> %b) readnone
 define <1 x i64> @smax1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-ISEL-LABEL: smax1i64:
 ; CHECK-ISEL:       // %bb.0:
-; CHECK-ISEL-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-ISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-ISEL-NEXT:    fmov x8, d1
-; CHECK-ISEL-NEXT:    fmov x9, d0
-; CHECK-ISEL-NEXT:    cmp x9, x8
-; CHECK-ISEL-NEXT:    csel x8, x9, x8, gt
-; CHECK-ISEL-NEXT:    fmov d0, x8
+; CHECK-ISEL-NEXT:    cmgt d2, d0, d1
+; CHECK-ISEL-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-ISEL-NEXT:    ret
 ;
 ; CHECK-GLOBAL-LABEL: smax1i64:
@ -210,16 +205,8 @@ declare <2 x i64> @llvm.smax.v2i64(<2 x i64> %a, <2 x i64> %b) readnone
 define <2 x i64> @smax2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-ISEL-LABEL: smax2i64:
 ; CHECK-ISEL:       // %bb.0:
-; CHECK-ISEL-NEXT:    mov x8, v1.d[1]
-; CHECK-ISEL-NEXT:    mov x9, v0.d[1]
-; CHECK-ISEL-NEXT:    fmov x10, d1
-; CHECK-ISEL-NEXT:    fmov x11, d0
-; CHECK-ISEL-NEXT:    cmp x9, x8
-; CHECK-ISEL-NEXT:    csel x8, x9, x8, gt
-; CHECK-ISEL-NEXT:    cmp x11, x10
-; CHECK-ISEL-NEXT:    csel x9, x11, x10, gt
-; CHECK-ISEL-NEXT:    fmov d0, x9
-; CHECK-ISEL-NEXT:    mov v0.d[1], x8
+; CHECK-ISEL-NEXT:    cmgt v2.2d, v0.2d, v1.2d
+; CHECK-ISEL-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-ISEL-NEXT:    ret
 ;
 ; CHECK-GLOBAL-LABEL: smax2i64:
@ -238,26 +225,10 @@ declare <4 x i64> @llvm.smax.v4i64(<4 x i64> %a, <4 x i64> %b) readnone
 define void @smax4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %p) {
 ; CHECK-ISEL-LABEL: smax4i64:
 ; CHECK-ISEL:       // %bb.0:
-; CHECK-ISEL-NEXT:    mov x8, v2.d[1]
-; CHECK-ISEL-NEXT:    mov x9, v0.d[1]
-; CHECK-ISEL-NEXT:    fmov x10, d2
-; CHECK-ISEL-NEXT:    fmov x11, d0
-; CHECK-ISEL-NEXT:    cmp x9, x8
-; CHECK-ISEL-NEXT:    csel x8, x9, x8, gt
-; CHECK-ISEL-NEXT:    cmp x11, x10
-; CHECK-ISEL-NEXT:    mov x9, v3.d[1]
-; CHECK-ISEL-NEXT:    csel x10, x11, x10, gt
-; CHECK-ISEL-NEXT:    mov x11, v1.d[1]
-; CHECK-ISEL-NEXT:    cmp x11, x9
-; CHECK-ISEL-NEXT:    fmov d0, x10
-; CHECK-ISEL-NEXT:    fmov x10, d3
-; CHECK-ISEL-NEXT:    csel x9, x11, x9, gt
-; CHECK-ISEL-NEXT:    fmov x11, d1
-; CHECK-ISEL-NEXT:    cmp x11, x10
-; CHECK-ISEL-NEXT:    csel x10, x11, x10, gt
-; CHECK-ISEL-NEXT:    fmov d1, x10
-; CHECK-ISEL-NEXT:    mov v0.d[1], x8
-; CHECK-ISEL-NEXT:    mov v1.d[1], x9
+; CHECK-ISEL-NEXT:    cmgt v4.2d, v0.2d, v2.2d
+; CHECK-ISEL-NEXT:    cmgt v5.2d, v1.2d, v3.2d
+; CHECK-ISEL-NEXT:    bif v0.16b, v2.16b, v4.16b
+; CHECK-ISEL-NEXT:    bif v1.16b, v3.16b, v5.16b
 ; CHECK-ISEL-NEXT:    stp q0, q1, [x0]
 ; CHECK-ISEL-NEXT:    ret
 ;
@ -457,13 +428,8 @@ declare <1 x i64> @llvm.umax.v1i64(<1 x i64> %a, <1 x i64> %b) readnone
 define <1 x i64> @umax1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-ISEL-LABEL: umax1i64:
 ; CHECK-ISEL:       // %bb.0:
-; CHECK-ISEL-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-ISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-ISEL-NEXT:    fmov x8, d1
-; CHECK-ISEL-NEXT:    fmov x9, d0
-; CHECK-ISEL-NEXT:    cmp x9, x8
-; CHECK-ISEL-NEXT:    csel x8, x9, x8, hi
-; CHECK-ISEL-NEXT:    fmov d0, x8
+; CHECK-ISEL-NEXT:    cmhi d2, d0, d1
+; CHECK-ISEL-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-ISEL-NEXT:    ret
 ;
 ; CHECK-GLOBAL-LABEL: umax1i64:
@ -482,8 +448,8 @@ declare <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> %b) readnone
 define <2 x i64> @umax2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-ISEL-LABEL: umax2i64:
 ; CHECK-ISEL:       // %bb.0:
-; CHECK-ISEL-NEXT:    uqsub v1.2d, v1.2d, v0.2d
-; CHECK-ISEL-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-ISEL-NEXT:    cmhi v2.2d, v0.2d, v1.2d
+; CHECK-ISEL-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-ISEL-NEXT:    ret
 ;
 ; CHECK-GLOBAL-LABEL: umax2i64:
@ -502,10 +468,10 @@ declare <4 x i64> @llvm.umax.v4i64(<4 x i64> %a, <4 x i64> %b) readnone
 define void @umax4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %p) {
 ; CHECK-ISEL-LABEL: umax4i64:
 ; CHECK-ISEL:       // %bb.0:
-; CHECK-ISEL-NEXT:    uqsub v2.2d, v2.2d, v0.2d
-; CHECK-ISEL-NEXT:    uqsub v3.2d, v3.2d, v1.2d
-; CHECK-ISEL-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-ISEL-NEXT:    add v1.2d, v1.2d, v3.2d
+; CHECK-ISEL-NEXT:    cmhi v4.2d, v0.2d, v2.2d
+; CHECK-ISEL-NEXT:    cmhi v5.2d, v1.2d, v3.2d
+; CHECK-ISEL-NEXT:    bif v0.16b, v2.16b, v4.16b
+; CHECK-ISEL-NEXT:    bif v1.16b, v3.16b, v5.16b
 ; CHECK-ISEL-NEXT:    stp q0, q1, [x0]
 ; CHECK-ISEL-NEXT:    ret
 ;
@ -705,13 +671,8 @@ declare <1 x i64> @llvm.smin.v1i64(<1 x i64> %a, <1 x i64> %b) readnone
 define <1 x i64> @smin1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-ISEL-LABEL: smin1i64:
 ; CHECK-ISEL:       // %bb.0:
-; CHECK-ISEL-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-ISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-ISEL-NEXT:    fmov x8, d1
-; CHECK-ISEL-NEXT:    fmov x9, d0
-; CHECK-ISEL-NEXT:    cmp x9, x8
-; CHECK-ISEL-NEXT:    csel x8, x9, x8, lt
-; CHECK-ISEL-NEXT:    fmov d0, x8
+; CHECK-ISEL-NEXT:    cmgt d2, d1, d0
+; CHECK-ISEL-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-ISEL-NEXT:    ret
 ;
 ; CHECK-GLOBAL-LABEL: smin1i64:
@ -730,16 +691,8 @@ declare <2 x i64> @llvm.smin.v2i64(<2 x i64> %a, <2 x i64> %b) readnone
 define <2 x i64> @smin2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-ISEL-LABEL: smin2i64:
 ; CHECK-ISEL:       // %bb.0:
-; CHECK-ISEL-NEXT:    mov x8, v1.d[1]
-; CHECK-ISEL-NEXT:    mov x9, v0.d[1]
-; CHECK-ISEL-NEXT:    fmov x10, d1
-; CHECK-ISEL-NEXT:    fmov x11, d0
-; CHECK-ISEL-NEXT:    cmp x9, x8
-; CHECK-ISEL-NEXT:    csel x8, x9, x8, lt
-; CHECK-ISEL-NEXT:    cmp x11, x10
-; CHECK-ISEL-NEXT:    csel x9, x11, x10, lt
-; CHECK-ISEL-NEXT:    fmov d0, x9
-; CHECK-ISEL-NEXT:    mov v0.d[1], x8
+; CHECK-ISEL-NEXT:    cmgt v2.2d, v1.2d, v0.2d
+; CHECK-ISEL-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-ISEL-NEXT:    ret
 ;
 ; CHECK-GLOBAL-LABEL: smin2i64:
@ -758,26 +711,10 @@ declare <4 x i64> @llvm.smin.v4i64(<4 x i64> %a, <4 x i64> %b) readnone
 define void @smin4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %p) {
 ; CHECK-ISEL-LABEL: smin4i64:
 ; CHECK-ISEL:       // %bb.0:
-; CHECK-ISEL-NEXT:    mov x8, v2.d[1]
-; CHECK-ISEL-NEXT:    mov x9, v0.d[1]
-; CHECK-ISEL-NEXT:    fmov x10, d2
-; CHECK-ISEL-NEXT:    fmov x11, d0
-; CHECK-ISEL-NEXT:    cmp x9, x8
-; CHECK-ISEL-NEXT:    csel x8, x9, x8, lt
-; CHECK-ISEL-NEXT:    cmp x11, x10
-; CHECK-ISEL-NEXT:    mov x9, v3.d[1]
-; CHECK-ISEL-NEXT:    csel x10, x11, x10, lt
-; CHECK-ISEL-NEXT:    mov x11, v1.d[1]
-; CHECK-ISEL-NEXT:    cmp x11, x9
-; CHECK-ISEL-NEXT:    fmov d0, x10
-; CHECK-ISEL-NEXT:    fmov x10, d3
-; CHECK-ISEL-NEXT:    csel x9, x11, x9, lt
-; CHECK-ISEL-NEXT:    fmov x11, d1
-; CHECK-ISEL-NEXT:    cmp x11, x10
-; CHECK-ISEL-NEXT:    csel x10, x11, x10, lt
-; CHECK-ISEL-NEXT:    fmov d1, x10
-; CHECK-ISEL-NEXT:    mov v0.d[1], x8
-; CHECK-ISEL-NEXT:    mov v1.d[1], x9
+; CHECK-ISEL-NEXT:    cmgt v4.2d, v2.2d, v0.2d
+; CHECK-ISEL-NEXT:    cmgt v5.2d, v3.2d, v1.2d
+; CHECK-ISEL-NEXT:    bif v0.16b, v2.16b, v4.16b
+; CHECK-ISEL-NEXT:    bif v1.16b, v3.16b, v5.16b
 ; CHECK-ISEL-NEXT:    stp q0, q1, [x0]
 ; CHECK-ISEL-NEXT:    ret
 ;
@ -977,13 +914,8 @@ declare <1 x i64> @llvm.umin.v1i64(<1 x i64> %a, <1 x i64> %b) readnone
 define <1 x i64> @umin1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-ISEL-LABEL: umin1i64:
 ; CHECK-ISEL:       // %bb.0:
-; CHECK-ISEL-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-ISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-ISEL-NEXT:    fmov x8, d1
-; CHECK-ISEL-NEXT:    fmov x9, d0
-; CHECK-ISEL-NEXT:    cmp x9, x8
-; CHECK-ISEL-NEXT:    csel x8, x9, x8, lo
-; CHECK-ISEL-NEXT:    fmov d0, x8
+; CHECK-ISEL-NEXT:    cmhi d2, d1, d0
+; CHECK-ISEL-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-ISEL-NEXT:    ret
 ;
 ; CHECK-GLOBAL-LABEL: umin1i64:
@ -1002,8 +934,8 @@ declare <2 x i64> @llvm.umin.v2i64(<2 x i64> %a, <2 x i64> %b) readnone
 define <2 x i64> @umin2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-ISEL-LABEL: umin2i64:
 ; CHECK-ISEL:       // %bb.0:
-; CHECK-ISEL-NEXT:    uqsub v1.2d, v0.2d, v1.2d
-; CHECK-ISEL-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-ISEL-NEXT:    cmhi v2.2d, v1.2d, v0.2d
+; CHECK-ISEL-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-ISEL-NEXT:    ret
 ;
 ; CHECK-GLOBAL-LABEL: umin2i64:
@ -1022,10 +954,10 @@ declare <4 x i64> @llvm.umin.v4i64(<4 x i64> %a, <4 x i64> %b) readnone
 define void @umin4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %p) {
 ; CHECK-ISEL-LABEL: umin4i64:
 ; CHECK-ISEL:       // %bb.0:
-; CHECK-ISEL-NEXT:    uqsub v2.2d, v0.2d, v2.2d
-; CHECK-ISEL-NEXT:    uqsub v3.2d, v1.2d, v3.2d
-; CHECK-ISEL-NEXT:    sub v0.2d, v0.2d, v2.2d
-; CHECK-ISEL-NEXT:    sub v1.2d, v1.2d, v3.2d
+; CHECK-ISEL-NEXT:    cmhi v4.2d, v2.2d, v0.2d
+; CHECK-ISEL-NEXT:    cmhi v5.2d, v3.2d, v1.2d
+; CHECK-ISEL-NEXT:    bif v0.16b, v2.16b, v4.16b
+; CHECK-ISEL-NEXT:    bif v1.16b, v3.16b, v5.16b
 ; CHECK-ISEL-NEXT:    stp q0, q1, [x0]
 ; CHECK-ISEL-NEXT:    ret
 ;
--- a/llvm/test/CodeGen/AArch64/minmax.ll
+++ b/llvm/test/CodeGen/AArch64/minmax.ll
@ -160,10 +160,10 @@ define <2 x i64> @t14(<2 x i64> %a, <2 x i64> %b) {
 define <4 x i64> @t15(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK-LABEL: t15:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmhs v4.2d, v3.2d, v1.2d
-; CHECK-NEXT:    cmhs v5.2d, v2.2d, v0.2d
-; CHECK-NEXT:    bif v0.16b, v2.16b, v5.16b
-; CHECK-NEXT:    bif v1.16b, v3.16b, v4.16b
+; CHECK-NEXT:    cmhi v4.2d, v2.2d, v0.2d
+; CHECK-NEXT:    cmhi v5.2d, v3.2d, v1.2d
+; CHECK-NEXT:    bif v0.16b, v2.16b, v4.16b
+; CHECK-NEXT:    bif v1.16b, v3.16b, v5.16b
 ; CHECK-NEXT:    ret
  %t1 = icmp ule <4 x i64> %a, %b
  %t2 = select <4 x i1> %t1, <4 x i64> %a, <4 x i64> %b
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@ -87,11 +87,11 @@ define i128 @test_v1i128(<1 x i128> %a) nounwind {
 define i64 @test_v2i64(<2 x i64> %a) nounwind {
 ; CHECK-LABEL: test_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:   mov     x8, v0.d[1]
-; CHECK-NEXT:   fmov    x9, d0
-; CHECK-NEXT:   cmp     x9, x8
-; CHECK-NEXT:   csel    x0, x9, x8, hi
-; CHECK-NEXT:   ret
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    cmhi d2, d0, d1
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
  %b = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
  ret i64 %b
 }