[AArch64] Lower fpto*i.sat intrinsics.

AArch64's fcvt* instructions implement the saturating behaviour that the fpto*i.sat intrinsics require, in cases where the destination width matches the saturation width. Lowering them removes a lot of unnecessary generated code. Only scalar lowerings are supported for now. Differential Revision: https://reviews.llvm.org/D102353
2021-04-27 10:12:11 +01:00 · 2021-04-27 10:12:11 +01:00 · 900c898994
parent 5b614eb4ea
commit 900c898994
9 changed files with 1332 additions and 1849 deletions
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -471,6 +471,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);

+  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
+
  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
@ -876,6 +881,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);

+  // TODO: Do the same for FP_TO_*INT_SAT.
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::FP_TO_UINT);
  setTargetDAGCombine(ISD::FDIV);
@ -3292,6 +3298,44 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
  return SDValue();
 }

+SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  // AArch64 FP-to-int conversions saturate to the destination register size, so
+  // saturating conversions of matching width need no extra clamping code.
+  SDValue SrcVal = Op.getOperand(0);
+
+  EVT SrcVT = SrcVal.getValueType();
+  EVT DstVT = Op.getValueType();
+
+  EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+  uint64_t SatWidth = SatVT.getScalarSizeInBits();
+  uint64_t DstWidth = DstVT.getScalarSizeInBits();
+  assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
+
+  // TODO: Support lowering of NEON and SVE conversions.
+  if (SrcVT.isVector())
+    return SDValue();
+
+  // TODO: Saturate to SatWidth explicitly; bail out on a mismatch for now.
+  if (SatWidth != DstWidth)
+    return SDValue();
+
+  // In the absence of FP16 support, promote f16 to f32, like LowerFP_TO_INT().
+  if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16())
+    return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
+                       DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal),
+                       Op.getOperand(1));
+
+  // Cases that we can emit directly: the node is returned unchanged.
+  if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
+       (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
+      (DstVT == MVT::i64 || DstVT == MVT::i32))
+    return Op;
+
+  // For all other cases, fall back on the expanded form.
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
@ -4553,6 +4597,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
  case ISD::STRICT_FP_TO_SINT:
  case ISD::STRICT_FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
+  case ISD::FP_TO_SINT_SAT:
+  case ISD::FP_TO_UINT_SAT:
+    return LowerFP_TO_INT_SAT(Op, DAG);
  case ISD::FSINCOS:
    return LowerFSINCOS(Op, DAG);
  case ISD::FLT_ROUNDS_:
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@ -960,6 +960,7 @@ private:
  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@ -3692,6 +3692,25 @@ defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
 defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>;
 defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;

+// FCVTZS/FCVTZU saturate out-of-range inputs, so they implement fp_to_*int_sat.
+multiclass FPToIntegerSatPats<SDNode to_int_sat, string INST> {
+  def : Pat<(i32 (to_int_sat f16:$Rn, i32)),
+            (!cast<Instruction>(INST # UWHr) f16:$Rn)>;
+  def : Pat<(i32 (to_int_sat f32:$Rn, i32)),
+            (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
+  def : Pat<(i32 (to_int_sat f64:$Rn, i32)),
+            (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
+  def : Pat<(i64 (to_int_sat f16:$Rn, i64)),
+            (!cast<Instruction>(INST # UXHr) f16:$Rn)>;
+  def : Pat<(i64 (to_int_sat f32:$Rn, i64)),
+            (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
+  def : Pat<(i64 (to_int_sat f64:$Rn, i64)),
+            (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+}
+
+defm : FPToIntegerSatPats<fp_to_sint_sat, "FCVTZS">;
+defm : FPToIntegerSatPats<fp_to_uint_sat, "FCVTZU">;
+
 multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
  def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>;
  def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # UXHr) $Rn)>;
@ -3717,7 +3736,7 @@ multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
 defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">;
 defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">;

-multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> {
+multiclass FPToIntegerPats<SDNode to_int, SDNode to_int_sat, SDNode round, string INST> {
  def : Pat<(i32 (to_int (round f32:$Rn))),
            (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
  def : Pat<(i64 (to_int (round f32:$Rn))),
@ -3726,16 +3745,32 @@ multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> {
            (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
  def : Pat<(i64 (to_int (round f64:$Rn))),
            (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+
+  // These instructions saturate like fp_to_[su]int_sat.
+  def : Pat<(i32 (to_int_sat (round f16:$Rn), i32)),
+            (!cast<Instruction>(INST # UWHr) f16:$Rn)>;
+  def : Pat<(i64 (to_int_sat (round f16:$Rn), i64)),
+            (!cast<Instruction>(INST # UXHr) f16:$Rn)>;
+  def : Pat<(i32 (to_int_sat (round f32:$Rn), i32)),
+            (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
+  def : Pat<(i64 (to_int_sat (round f32:$Rn), i64)),
+            (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
+  def : Pat<(i32 (to_int_sat (round f64:$Rn), i32)),
+            (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
+  def : Pat<(i64 (to_int_sat (round f64:$Rn), i64)),
+            (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
 }

-defm : FPToIntegerPats<fp_to_sint, fceil,  "FCVTPS">;
-defm : FPToIntegerPats<fp_to_uint, fceil,  "FCVTPU">;
-defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">;
-defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">;
-defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">;
-defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">;
-defm : FPToIntegerPats<fp_to_sint, fround, "FCVTAS">;
-defm : FPToIntegerPats<fp_to_uint, fround, "FCVTAU">;
+defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fceil,  "FCVTPS">;
+defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fceil,  "FCVTPU">;
+defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, ffloor, "FCVTMS">;
+defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, ffloor, "FCVTMU">;
+defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, ftrunc, "FCVTZS">;
+defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, ftrunc, "FCVTZU">;
+defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fround, "FCVTAS">;
+defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fround, "FCVTAU">;
+
+

 let Predicates = [HasFullFP16] in {
  def : Pat<(i32 (lround f16:$Rn)),
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll
@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP16

 ;
 ; 32-bit float to signed integer
@ -106,19 +107,7 @@ define i19 @test_signed_i19_f32(float %f) nounwind {
 define i32 @test_signed_i32_f32(float %f) nounwind {
 ; CHECK-LABEL: test_signed_i32_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #-822083584
-; CHECK-NEXT:    mov w11, #1325400063
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcvtzs w8, s0
-; CHECK-NEXT:    mov w10, #-2147483648
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    fmov s1, w11
-; CHECK-NEXT:    mov w12, #2147483647
-; CHECK-NEXT:    csel w8, w10, w8, lt
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    csel w8, w12, w8, gt
-; CHECK-NEXT:    fcmp s0, s0
-; CHECK-NEXT:    csel w0, wzr, w8, vs
+; CHECK-NEXT:    fcvtzs w0, s0
 ; CHECK-NEXT:    ret
    %x = call i32 @llvm.fptosi.sat.i32.f32(float %f)
    ret i32 %x
@ -148,19 +137,7 @@ define i50 @test_signed_i50_f32(float %f) nounwind {
 define i64 @test_signed_i64_f32(float %f) nounwind {
 ; CHECK-LABEL: test_signed_i64_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #-553648128
-; CHECK-NEXT:    mov w11, #1593835519
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcvtzs x8, s0
-; CHECK-NEXT:    mov x10, #-9223372036854775808
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    fmov s1, w11
-; CHECK-NEXT:    mov x12, #9223372036854775807
-; CHECK-NEXT:    csel x8, x10, x8, lt
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    csel x8, x12, x8, gt
-; CHECK-NEXT:    fcmp s0, s0
-; CHECK-NEXT:    csel x0, xzr, x8, vs
+; CHECK-NEXT:    fcvtzs x0, s0
 ; CHECK-NEXT:    ret
    %x = call i64 @llvm.fptosi.sat.i64.f32(float %f)
    ret i64 %x
@ -330,16 +307,7 @@ define i19 @test_signed_i19_f64(double %f) nounwind {
 define i32 @test_signed_i32_f64(double %f) nounwind {
 ; CHECK-LABEL: test_signed_i32_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-4476578029606273024
-; CHECK-NEXT:    mov x9, #281474972516352
-; CHECK-NEXT:    movk x9, #16863, lsl #48
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    fmaxnm d1, d0, d1
-; CHECK-NEXT:    fmov d2, x9
-; CHECK-NEXT:    fminnm d1, d1, d2
-; CHECK-NEXT:    fcvtzs w8, d1
-; CHECK-NEXT:    fcmp d0, d0
-; CHECK-NEXT:    csel w0, wzr, w8, vs
+; CHECK-NEXT:    fcvtzs w0, d0
 ; CHECK-NEXT:    ret
    %x = call i32 @llvm.fptosi.sat.i32.f64(double %f)
    ret i32 %x
@ -366,19 +334,7 @@ define i50 @test_signed_i50_f64(double %f) nounwind {
 define i64 @test_signed_i64_f64(double %f) nounwind {
 ; CHECK-LABEL: test_signed_i64_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, #-4332462841530417152
-; CHECK-NEXT:    mov x11, #4890909195324358655
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    fcvtzs x8, d0
-; CHECK-NEXT:    mov x10, #-9223372036854775808
-; CHECK-NEXT:    fcmp d0, d1
-; CHECK-NEXT:    fmov d1, x11
-; CHECK-NEXT:    mov x12, #9223372036854775807
-; CHECK-NEXT:    csel x8, x10, x8, lt
-; CHECK-NEXT:    fcmp d0, d1
-; CHECK-NEXT:    csel x8, x12, x8, gt
-; CHECK-NEXT:    fcmp d0, d0
-; CHECK-NEXT:    csel x0, xzr, x8, vs
+; CHECK-NEXT:    fcvtzs x0, d0
 ; CHECK-NEXT:    ret
    %x = call i64 @llvm.fptosi.sat.i64.f64(double %f)
    ret i64 %x
@ -550,23 +506,16 @@ define i19 @test_signed_i19_f16(half %f) nounwind {
 }

 define i32 @test_signed_i32_f16(half %f) nounwind {
-; CHECK-LABEL: test_signed_i32_f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-822083584
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    mov w8, #1325400063
-; CHECK-NEXT:    mov w9, #-2147483648
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    fcvtzs w8, s0
-; CHECK-NEXT:    csel w8, w9, w8, lt
-; CHECK-NEXT:    mov w9, #2147483647
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    csel w8, w9, w8, gt
-; CHECK-NEXT:    fcmp s0, s0
-; CHECK-NEXT:    csel w0, wzr, w8, vs
-; CHECK-NEXT:    ret
+; CHECK-CVT-LABEL: test_signed_i32_f16:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzs w0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: test_signed_i32_f16:
+; CHECK-FP16:       // %bb.0:
+; CHECK-FP16-NEXT:    fcvtzs w0, h0
+; CHECK-FP16-NEXT:    ret
    %x = call i32 @llvm.fptosi.sat.i32.f16(half %f)
    ret i32 %x
 }
@ -594,23 +543,16 @@ define i50 @test_signed_i50_f16(half %f) nounwind {
 }

 define i64 @test_signed_i64_f16(half %f) nounwind {
-; CHECK-LABEL: test_signed_i64_f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-553648128
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    mov w8, #1593835519
-; CHECK-NEXT:    mov x9, #-9223372036854775808
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    fcvtzs x8, s0
-; CHECK-NEXT:    csel x8, x9, x8, lt
-; CHECK-NEXT:    mov x9, #9223372036854775807
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    csel x8, x9, x8, gt
-; CHECK-NEXT:    fcmp s0, s0
-; CHECK-NEXT:    csel x0, xzr, x8, vs
-; CHECK-NEXT:    ret
+; CHECK-CVT-LABEL: test_signed_i64_f16:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzs x0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: test_signed_i64_f16:
+; CHECK-FP16:       // %bb.0:
+; CHECK-FP16-NEXT:    fcvtzs x0, h0
+; CHECK-FP16-NEXT:    ret
    %x = call i64 @llvm.fptosi.sat.i64.f16(half %f)
    ret i64 %x
 }
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll
@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP16

 ;
 ; 32-bit float to unsigned integer
@ -92,13 +93,7 @@ define i19 @test_unsigned_i19_f32(float %f) nounwind {
 define i32 @test_unsigned_i32_f32(float %f) nounwind {
 ; CHECK-LABEL: test_unsigned_i32_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #1333788671
-; CHECK-NEXT:    fcvtzu w8, s0
-; CHECK-NEXT:    fcmp s0, #0.0
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    csel w8, wzr, w8, lt
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    csinv w0, w8, wzr, le
+; CHECK-NEXT:    fcvtzu w0, s0
 ; CHECK-NEXT:    ret
    %x = call i32 @llvm.fptoui.sat.i32.f32(float %f)
    ret i32 %x
@ -123,13 +118,7 @@ define i50 @test_unsigned_i50_f32(float %f) nounwind {
 define i64 @test_unsigned_i64_f32(float %f) nounwind {
 ; CHECK-LABEL: test_unsigned_i64_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #1602224127
-; CHECK-NEXT:    fcvtzu x8, s0
-; CHECK-NEXT:    fcmp s0, #0.0
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    csel x8, xzr, x8, lt
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    csinv x0, x8, xzr, le
+; CHECK-NEXT:    fcvtzu x0, s0
 ; CHECK-NEXT:    ret
    %x = call i64 @llvm.fptoui.sat.i64.f32(float %f)
    ret i64 %x
@ -272,12 +261,6 @@ define i19 @test_unsigned_i19_f64(double %f) nounwind {
 define i32 @test_unsigned_i32_f64(double %f) nounwind {
 ; CHECK-LABEL: test_unsigned_i32_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #281474974613504
-; CHECK-NEXT:    movi d1, #0000000000000000
-; CHECK-NEXT:    movk x8, #16879, lsl #48
-; CHECK-NEXT:    fmaxnm d0, d0, d1
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    fminnm d0, d0, d1
 ; CHECK-NEXT:    fcvtzu w0, d0
 ; CHECK-NEXT:    ret
    %x = call i32 @llvm.fptoui.sat.i32.f64(double %f)
@ -302,13 +285,7 @@ define i50 @test_unsigned_i50_f64(double %f) nounwind {
 define i64 @test_unsigned_i64_f64(double %f) nounwind {
 ; CHECK-LABEL: test_unsigned_i64_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, #4895412794951729151
-; CHECK-NEXT:    fcvtzu x8, d0
-; CHECK-NEXT:    fcmp d0, #0.0
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    csel x8, xzr, x8, lt
-; CHECK-NEXT:    fcmp d0, d1
-; CHECK-NEXT:    csinv x0, x8, xzr, le
+; CHECK-NEXT:    fcvtzu x0, d0
 ; CHECK-NEXT:    ret
    %x = call i64 @llvm.fptoui.sat.i64.f64(double %f)
    ret i64 %x
@ -453,17 +430,16 @@ define i19 @test_unsigned_i19_f16(half %f) nounwind {
 }

 define i32 @test_unsigned_i32_f16(half %f) nounwind {
-; CHECK-LABEL: test_unsigned_i32_f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    mov w8, #1333788671
-; CHECK-NEXT:    fcvtzu w9, s0
-; CHECK-NEXT:    fcmp s0, #0.0
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    csel w8, wzr, w9, lt
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    csinv w0, w8, wzr, le
-; CHECK-NEXT:    ret
+; CHECK-CVT-LABEL: test_unsigned_i32_f16:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzu w0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: test_unsigned_i32_f16:
+; CHECK-FP16:       // %bb.0:
+; CHECK-FP16-NEXT:    fcvtzu w0, h0
+; CHECK-FP16-NEXT:    ret
    %x = call i32 @llvm.fptoui.sat.i32.f16(half %f)
    ret i32 %x
 }
@ -486,17 +462,16 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind {
 }

 define i64 @test_unsigned_i64_f16(half %f) nounwind {
-; CHECK-LABEL: test_unsigned_i64_f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    mov w8, #1602224127
-; CHECK-NEXT:    fcvtzu x9, s0
-; CHECK-NEXT:    fcmp s0, #0.0
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    csel x8, xzr, x9, lt
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    csinv x0, x8, xzr, le
-; CHECK-NEXT:    ret
+; CHECK-CVT-LABEL: test_unsigned_i64_f16:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzu x0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: test_unsigned_i64_f16:
+; CHECK-FP16:       // %bb.0:
+; CHECK-FP16-NEXT:    fcvtzu x0, h0
+; CHECK-FP16-NEXT:    ret
    %x = call i64 @llvm.fptoui.sat.i64.f16(half %f)
    ret i64 %x
 }
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
--- a/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll
@ -0,0 +1,367 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP16
+
+; Round towards minus infinity (fcvtms).
+
+define i32 @testmswh(half %a) {
+; CHECK-CVT-LABEL: testmswh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frintm s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzs w0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testmswh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtms w0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.floor.f16(half %a) nounwind readnone
+  %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+  ret i32 %i
+}
+
+define i64 @testmsxh(half %a) {
+; CHECK-CVT-LABEL: testmsxh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frintm s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzs x0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testmsxh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtms x0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.floor.f16(half %a) nounwind readnone
+  %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+  ret i64 %i
+}
+
+define i32 @testmsws(float %a) {
+; CHECK-LABEL: testmsws:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtms w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @floorf(float %a) nounwind readnone
+  %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+  ret i32 %i
+}
+
+define i64 @testmsxs(float %a) {
+; CHECK-LABEL: testmsxs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtms x0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @floorf(float %a) nounwind readnone
+  %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+  ret i64 %i
+}
+
+define i32 @testmswd(double %a) {
+; CHECK-LABEL: testmswd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtms w0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @floor(double %a) nounwind readnone
+  %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+  ret i32 %i
+}
+
+define i64 @testmsxd(double %a) {
+; CHECK-LABEL: testmsxd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtms x0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @floor(double %a) nounwind readnone
+  %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+  ret i64 %i
+}
+
+; Round towards plus infinity (fcvtps).
+
+define i32 @testpswh(half %a) {
+; CHECK-CVT-LABEL: testpswh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frintp s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzs w0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testpswh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtps w0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+  %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+  ret i32 %i
+}
+
+define i64 @testpsxh(half %a) {
+; CHECK-CVT-LABEL: testpsxh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frintp s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzs x0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testpsxh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtps x0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+  %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+  ret i64 %i
+}
+
+define i32 @testpsws(float %a) {
+; CHECK-LABEL: testpsws:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtps w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @ceilf(float %a) nounwind readnone
+  %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+  ret i32 %i
+}
+
+define i64 @testpsxs(float %a) {
+; CHECK-LABEL: testpsxs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtps x0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @ceilf(float %a) nounwind readnone
+  %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+  ret i64 %i
+}
+
+define i32 @testpswd(double %a) {
+; CHECK-LABEL: testpswd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtps w0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @ceil(double %a) nounwind readnone
+  %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+  ret i32 %i
+}
+
+define i64 @testpsxd(double %a) {
+; CHECK-LABEL: testpsxd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtps x0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @ceil(double %a) nounwind readnone
+  %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+  ret i64 %i
+}
+
+; Round towards zero (fcvtzs).
+
+define i32 @testzswh(half %a) {
+; CHECK-CVT-LABEL: testzswh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frintz s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzs w0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testzswh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtzs w0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+  %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+  ret i32 %i
+}
+
+define i64 @testzsxh(half %a) {
+; CHECK-CVT-LABEL: testzsxh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frintz s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzs x0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testzsxh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtzs x0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+  %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+  ret i64 %i
+}
+
+define i32 @testzsws(float %a) {
+; CHECK-LABEL: testzsws:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @truncf(float %a) nounwind readnone
+  %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+  ret i32 %i
+}
+
+define i64 @testzsxs(float %a) {
+; CHECK-LABEL: testzsxs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs x0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @truncf(float %a) nounwind readnone
+  %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+  ret i64 %i
+}
+
+define i32 @testzswd(double %a) {
+; CHECK-LABEL: testzswd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs w0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @trunc(double %a) nounwind readnone
+  %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+  ret i32 %i
+}
+
+define i64 @testzsxd(double %a) {
+; CHECK-LABEL: testzsxd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs x0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @trunc(double %a) nounwind readnone
+  %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+  ret i64 %i
+}
+
+; Round to nearest, ties away from zero (fcvtas).
+
+define i32 @testaswh(half %a) {
+; CHECK-CVT-LABEL: testaswh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frinta s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzs w0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testaswh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtas w0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.round.f16(half %a) nounwind readnone
+  %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+  ret i32 %i
+}
+
+define i64 @testasxh(half %a) {
+; CHECK-CVT-LABEL: testasxh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frinta s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzs x0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testasxh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtas x0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.round.f16(half %a) nounwind readnone
+  %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+  ret i64 %i
+}
+
+define i32 @testasws(float %a) {
+; CHECK-LABEL: testasws:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtas w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @roundf(float %a) nounwind readnone
+  %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+  ret i32 %i
+}
+
+define i64 @testasxs(float %a) {
+; CHECK-LABEL: testasxs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtas x0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @roundf(float %a) nounwind readnone
+  %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+  ret i64 %i
+}
+
+define i32 @testaswd(double %a) {
+; CHECK-LABEL: testaswd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtas w0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @round(double %a) nounwind readnone
+  %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+  ret i32 %i
+}
+
+define i64 @testasxd(double %a) {
+; CHECK-LABEL: testasxd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtas x0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @round(double %a) nounwind readnone
+  %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+  ret i64 %i
+}
+
+declare i32 @llvm.fptosi.sat.i32.f16 (half)
+declare i64 @llvm.fptosi.sat.i64.f16 (half)
+declare i32 @llvm.fptosi.sat.i32.f32 (float)
+declare i64 @llvm.fptosi.sat.i64.f32 (float)
+declare i32 @llvm.fptosi.sat.i32.f64 (double)
+declare i64 @llvm.fptosi.sat.i64.f64 (double)
+
+declare half @llvm.floor.f16(half) nounwind readnone
+declare half @llvm.ceil.f16(half) nounwind readnone
+declare half @llvm.trunc.f16(half) nounwind readnone
+declare half @llvm.round.f16(half) nounwind readnone
+declare float @floorf(float) nounwind readnone
+declare float @ceilf(float) nounwind readnone
+declare float @truncf(float) nounwind readnone
+declare float @roundf(float) nounwind readnone
+declare double @floor(double) nounwind readnone
+declare double @ceil(double) nounwind readnone
+declare double @trunc(double) nounwind readnone
+declare double @round(double) nounwind readnone
--- a/llvm/test/CodeGen/AArch64/round-fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/AArch64/round-fptoui-sat-scalar.ll
@ -0,0 +1,367 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP16
+
+; Round towards minus infinity (fcvtmu).
+
+; floor on a half followed by a saturating unsigned conversion to i32. With
+; +fullfp16 a single fcvtmu is expected; without it the value is widened to
+; single precision, rounded with frintm, narrowed back to half, widened again
+; and converted with fcvtzu.
+define i32 @testmuwh(half %a) {
+; CHECK-CVT-LABEL: testmuwh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frintm s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzu w0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testmuwh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtmu w0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.floor.f16(half %a) nounwind readnone
+  %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+  ret i32 %i
+}
+
+; floor on a half followed by a saturating unsigned conversion to i64. With
+; +fullfp16 a single fcvtmu is expected; without it the value is widened to
+; single precision, rounded with frintm, narrowed back to half, widened again
+; and converted with fcvtzu.
+define i64 @testmuxh(half %a) {
+; CHECK-CVT-LABEL: testmuxh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frintm s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzu x0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testmuxh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtmu x0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.floor.f16(half %a) nounwind readnone
+  %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+  ret i64 %i
+}
+
+; floorf on a float followed by a saturating unsigned conversion to i32; the
+; expected code is a single "fcvtmu w0, s0".
+define i32 @testmuws(float %a) {
+; CHECK-LABEL: testmuws:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtmu w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @floorf(float %a) nounwind readnone
+  %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+  ret i32 %i
+}
+
+; floorf on a float followed by a saturating unsigned conversion to i64; the
+; expected code is a single "fcvtmu x0, s0".
+define i64 @testmuxs(float %a) {
+; CHECK-LABEL: testmuxs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtmu x0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @floorf(float %a) nounwind readnone
+  %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+  ret i64 %i
+}
+
+; floor on a double followed by a saturating unsigned conversion to i32; the
+; expected code is a single "fcvtmu w0, d0".
+define i32 @testmuwd(double %a) {
+; CHECK-LABEL: testmuwd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtmu w0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @floor(double %a) nounwind readnone
+  %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+  ret i32 %i
+}
+
+; floor on a double followed by a saturating unsigned conversion to i64; the
+; expected code is a single "fcvtmu x0, d0".
+define i64 @testmuxd(double %a) {
+; CHECK-LABEL: testmuxd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtmu x0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @floor(double %a) nounwind readnone
+  %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+  ret i64 %i
+}
+
+; Round towards plus infinity (fcvtpu).
+
+; ceil on a half followed by a saturating unsigned conversion to i32. With
+; +fullfp16 a single fcvtpu is expected; without it the value is widened to
+; single precision, rounded with frintp, narrowed back to half, widened again
+; and converted with fcvtzu.
+define i32 @testpuwh(half %a) {
+; CHECK-CVT-LABEL: testpuwh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frintp s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzu w0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testpuwh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtpu w0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+  %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+  ret i32 %i
+}
+
+; ceil on a half followed by a saturating unsigned conversion to i64. With
+; +fullfp16 a single fcvtpu is expected; without it the value is widened to
+; single precision, rounded with frintp, narrowed back to half, widened again
+; and converted with fcvtzu.
+define i64 @testpuxh(half %a) {
+; CHECK-CVT-LABEL: testpuxh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frintp s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzu x0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testpuxh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtpu x0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+  %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+  ret i64 %i
+}
+
+; ceilf on a float followed by a saturating unsigned conversion to i32; the
+; expected code is a single "fcvtpu w0, s0".
+define i32 @testpuws(float %a) {
+; CHECK-LABEL: testpuws:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtpu w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @ceilf(float %a) nounwind readnone
+  %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+  ret i32 %i
+}
+
+; ceilf on a float followed by a saturating unsigned conversion to i64; the
+; expected code is a single "fcvtpu x0, s0".
+define i64 @testpuxs(float %a) {
+; CHECK-LABEL: testpuxs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtpu x0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @ceilf(float %a) nounwind readnone
+  %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+  ret i64 %i
+}
+
+; ceil on a double followed by a saturating unsigned conversion to i32; the
+; expected code is a single "fcvtpu w0, d0".
+define i32 @testpuwd(double %a) {
+; CHECK-LABEL: testpuwd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtpu w0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @ceil(double %a) nounwind readnone
+  %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+  ret i32 %i
+}
+
+; ceil on a double followed by a saturating unsigned conversion to i64; the
+; expected code is a single "fcvtpu x0, d0".
+define i64 @testpuxd(double %a) {
+; CHECK-LABEL: testpuxd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtpu x0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @ceil(double %a) nounwind readnone
+  %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+  ret i64 %i
+}
+
+; Round towards zero (fcvtzu).
+
+; trunc on a half followed by a saturating unsigned conversion to i32. With
+; +fullfp16 a single fcvtzu on the half register is expected; without it the
+; value is widened to single precision, rounded with frintz, narrowed back to
+; half, widened again and converted with fcvtzu.
+define i32 @testzuwh(half %a) {
+; CHECK-CVT-LABEL: testzuwh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frintz s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzu w0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testzuwh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtzu w0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+  %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+  ret i32 %i
+}
+
+; trunc on a half followed by a saturating unsigned conversion to i64. With
+; +fullfp16 a single fcvtzu on the half register is expected; without it the
+; value is widened to single precision, rounded with frintz, narrowed back to
+; half, widened again and converted with fcvtzu.
+define i64 @testzuxh(half %a) {
+; CHECK-CVT-LABEL: testzuxh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frintz s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzu x0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testzuxh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtzu x0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+  %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+  ret i64 %i
+}
+
+; truncf on a float followed by a saturating unsigned conversion to i32; the
+; expected code is a single "fcvtzu w0, s0".
+define i32 @testzuws(float %a) {
+; CHECK-LABEL: testzuws:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @truncf(float %a) nounwind readnone
+  %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+  ret i32 %i
+}
+
+; truncf on a float followed by a saturating unsigned conversion to i64; the
+; expected code is a single "fcvtzu x0, s0".
+define i64 @testzuxs(float %a) {
+; CHECK-LABEL: testzuxs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu x0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @truncf(float %a) nounwind readnone
+  %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+  ret i64 %i
+}
+
+; trunc on a double followed by a saturating unsigned conversion to i32; the
+; expected code is a single "fcvtzu w0, d0".
+define i32 @testzuwd(double %a) {
+; CHECK-LABEL: testzuwd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu w0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @trunc(double %a) nounwind readnone
+  %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+  ret i32 %i
+}
+
+; trunc on a double followed by a saturating unsigned conversion to i64; the
+; expected code is a single "fcvtzu x0, d0".
+define i64 @testzuxd(double %a) {
+; CHECK-LABEL: testzuxd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu x0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @trunc(double %a) nounwind readnone
+  %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+  ret i64 %i
+}
+
+; Round to nearest, ties away from zero (fcvtau).
+
+; round on a half followed by a saturating unsigned conversion to i32. With
+; +fullfp16 a single fcvtau is expected; without it the value is widened to
+; single precision, rounded with frinta, narrowed back to half, widened again
+; and converted with fcvtzu.
+define i32 @testauwh(half %a) {
+; CHECK-CVT-LABEL: testauwh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frinta s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzu w0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testauwh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtau w0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.round.f16(half %a) nounwind readnone
+  %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+  ret i32 %i
+}
+
+; round on a half followed by a saturating unsigned conversion to i64. With
+; +fullfp16 a single fcvtau is expected; without it the value is widened to
+; single precision, rounded with frinta, narrowed back to half, widened again
+; and converted with fcvtzu.
+define i64 @testauxh(half %a) {
+; CHECK-CVT-LABEL: testauxh:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    frinta s0, s0
+; CHECK-CVT-NEXT:    fcvt h0, s0
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    fcvtzu x0, s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-FP16-LABEL: testauxh:
+; CHECK-FP16:       // %bb.0: // %entry
+; CHECK-FP16-NEXT:    fcvtau x0, h0
+; CHECK-FP16-NEXT:    ret
+entry:
+  %r = call half @llvm.round.f16(half %a) nounwind readnone
+  %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+  ret i64 %i
+}
+
+; roundf on a float followed by a saturating unsigned conversion to i32; the
+; expected code is a single "fcvtau w0, s0".
+define i32 @testauws(float %a) {
+; CHECK-LABEL: testauws:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtau w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @roundf(float %a) nounwind readnone
+  %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+  ret i32 %i
+}
+
+; roundf on a float followed by a saturating unsigned conversion to i64; the
+; expected code is a single "fcvtau x0, s0".
+define i64 @testauxs(float %a) {
+; CHECK-LABEL: testauxs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtau x0, s0
+; CHECK-NEXT:    ret
+entry:
+  %r = call float @roundf(float %a) nounwind readnone
+  %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+  ret i64 %i
+}
+
+; round on a double followed by a saturating unsigned conversion to i32; the
+; expected code is a single "fcvtau w0, d0".
+define i32 @testauwd(double %a) {
+; CHECK-LABEL: testauwd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtau w0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @round(double %a) nounwind readnone
+  %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+  ret i32 %i
+}
+
+; round on a double followed by a saturating unsigned conversion to i64; the
+; expected code is a single "fcvtau x0, d0".
+define i64 @testauxd(double %a) {
+; CHECK-LABEL: testauxd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtau x0, d0
+; CHECK-NEXT:    ret
+entry:
+  %r = call double @round(double %a) nounwind readnone
+  %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+  ret i64 %i
+}
+
+; Saturating fp-to-unsigned-int intrinsics, declared for every combination of
+; source type (half, float, double) and result type (i32, i64).
+declare i32 @llvm.fptoui.sat.i32.f16 (half)
+declare i64 @llvm.fptoui.sat.i64.f16 (half)
+declare i32 @llvm.fptoui.sat.i32.f32 (float)
+declare i64 @llvm.fptoui.sat.i64.f32 (float)
+declare i32 @llvm.fptoui.sat.i32.f64 (double)
+declare i64 @llvm.fptoui.sat.i64.f64 (double)
+
+; Rounding operations that feed the conversions: the half variants are declared
+; as llvm.* intrinsics, the float and double variants under their libm names.
+declare half @llvm.floor.f16(half) nounwind readnone
+declare half @llvm.ceil.f16(half) nounwind readnone
+declare half @llvm.trunc.f16(half) nounwind readnone
+declare half @llvm.round.f16(half) nounwind readnone
+declare float @floorf(float) nounwind readnone
+declare float @ceilf(float) nounwind readnone
+declare float @truncf(float) nounwind readnone
+declare float @roundf(float) nounwind readnone
+declare double @floor(double) nounwind readnone
+declare double @ceil(double) nounwind readnone
+declare double @trunc(double) nounwind readnone
+declare double @round(double) nounwind readnone