[AArch64][SVE] Implement several floating-point arithmetic intrinsics

Summary:
Adds intrinsics for the following:
- fabd, fadd, fsub & fsubr
- fmul, fmulx, fdiv & fdivr
- fmax, fmaxnm, fmin & fminnm
- fscale & ftsmul

Reviewers: huntergr, sdesmalen, dancgr

Reviewed By: sdesmalen

Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, cameron.mcinally, cfe-commits, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D69657
2019-11-01 10:40:36 +00:00 · 2019-11-01 10:40:36 +00:00 · 5ec34dfdf7
parent 6082a062a7
commit 5ec34dfdf7
4 changed files with 617 additions and 26 deletions
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -788,6 +788,19 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                [llvm_anyvector_ty],
                [IntrNoMem]>;

+  class AdvSIMD_SVE_SCALE_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                 LLVMMatchType<0>,
+                 LLVMVectorOfBitcastsToInt<0>],
+                [IntrNoMem]>;
+
+  class AdvSIMD_SVE_TSMUL_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>,
+                 LLVMVectorOfBitcastsToInt<0>],
+                [IntrNoMem]>;
+
  class AdvSIMD_SVE_DOT_Intrinsic
    : Intrinsic<[llvm_anyvector_ty],
                [LLVMMatchType<0>,
@@ -877,6 +890,25 @@ def int_aarch64_sve_uxtb : AdvSIMD_Merged1VectorArg_Intrinsic;
 def int_aarch64_sve_uxth : AdvSIMD_Merged1VectorArg_Intrinsic;
 def int_aarch64_sve_uxtw : AdvSIMD_Merged1VectorArg_Intrinsic;

+//
+// Floating-point arithmetic
+//
+
+def int_aarch64_sve_fabd       : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fadd       : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fdiv       : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fdivr      : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fmax       : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fmaxnm     : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fmin       : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fminnm     : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fmul       : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fmulx      : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fscale     : AdvSIMD_SVE_SCALE_Intrinsic;
+def int_aarch64_sve_fsub       : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_fsubr      : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_ftsmul_x   : AdvSIMD_SVE_TSMUL_Intrinsic;
+
 //
 // Floating-point comparisons
 //
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -124,25 +124,25 @@ let Predicates = [HasSVE] in {
  defm FMAX_ZPmI    : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>;
  defm FMIN_ZPmI    : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>;

-  defm FADD_ZPmZ   : sve_fp_2op_p_zds<0b0000, "fadd">;
-  defm FSUB_ZPmZ   : sve_fp_2op_p_zds<0b0001, "fsub">;
-  defm FMUL_ZPmZ   : sve_fp_2op_p_zds<0b0010, "fmul">;
-  defm FSUBR_ZPmZ  : sve_fp_2op_p_zds<0b0011, "fsubr">;
-  defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm">;
-  defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm">;
-  defm FMAX_ZPmZ   : sve_fp_2op_p_zds<0b0110, "fmax">;
-  defm FMIN_ZPmZ   : sve_fp_2op_p_zds<0b0111, "fmin">;
-  defm FABD_ZPmZ   : sve_fp_2op_p_zds<0b1000, "fabd">;
-  defm FSCALE_ZPmZ : sve_fp_2op_p_zds<0b1001, "fscale">;
-  defm FMULX_ZPmZ  : sve_fp_2op_p_zds<0b1010, "fmulx">;
-  defm FDIVR_ZPmZ  : sve_fp_2op_p_zds<0b1100, "fdivr">;
-  defm FDIV_ZPmZ   : sve_fp_2op_p_zds<0b1101, "fdiv">;
+  defm FADD_ZPmZ   : sve_fp_2op_p_zds<0b0000, "fadd",   int_aarch64_sve_fadd>;
+  defm FSUB_ZPmZ   : sve_fp_2op_p_zds<0b0001, "fsub",   int_aarch64_sve_fsub>;
+  defm FMUL_ZPmZ   : sve_fp_2op_p_zds<0b0010, "fmul",   int_aarch64_sve_fmul>;
+  defm FSUBR_ZPmZ  : sve_fp_2op_p_zds<0b0011, "fsubr",  int_aarch64_sve_fsubr>;
+  defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", int_aarch64_sve_fmaxnm>;
+  defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", int_aarch64_sve_fminnm>;
+  defm FMAX_ZPmZ   : sve_fp_2op_p_zds<0b0110, "fmax",   int_aarch64_sve_fmax>;
+  defm FMIN_ZPmZ   : sve_fp_2op_p_zds<0b0111, "fmin",   int_aarch64_sve_fmin>;
+  defm FABD_ZPmZ   : sve_fp_2op_p_zds<0b1000, "fabd",   int_aarch64_sve_fabd>;
+  defm FSCALE_ZPmZ : sve_fp_2op_p_zds_fscale<0b1001, "fscale", int_aarch64_sve_fscale>;
+  defm FMULX_ZPmZ  : sve_fp_2op_p_zds<0b1010, "fmulx",  int_aarch64_sve_fmulx>;
+  defm FDIVR_ZPmZ  : sve_fp_2op_p_zds<0b1100, "fdivr",  int_aarch64_sve_fdivr>;
+  defm FDIV_ZPmZ   : sve_fp_2op_p_zds<0b1101, "fdiv",   int_aarch64_sve_fdiv>;

-  defm FADD_ZZZ    : sve_fp_3op_u_zd<0b000, "fadd", fadd>;
-  defm FSUB_ZZZ    : sve_fp_3op_u_zd<0b001, "fsub", fsub>;
-  defm FMUL_ZZZ    : sve_fp_3op_u_zd<0b010, "fmul", fmul>;
-  defm FTSMUL_ZZZ  : sve_fp_3op_u_zd<0b011, "ftsmul", null_frag>;
-  defm FRECPS_ZZZ  : sve_fp_3op_u_zd<0b110, "frecps", int_aarch64_sve_frecps_x>;
+  defm FADD_ZZZ    : sve_fp_3op_u_zd<0b000, "fadd",    fadd>;
+  defm FSUB_ZZZ    : sve_fp_3op_u_zd<0b001, "fsub",    fsub>;
+  defm FMUL_ZZZ    : sve_fp_3op_u_zd<0b010, "fmul",    fmul>;
+  defm FTSMUL_ZZZ  : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul",  int_aarch64_sve_ftsmul_x>;
+  defm FRECPS_ZZZ  : sve_fp_3op_u_zd<0b110, "frecps",  int_aarch64_sve_frecps_x>;
  defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", int_aarch64_sve_frsqrts_x>;

  defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">;
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -1197,10 +1197,26 @@ class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm,
  let ElementSize = zprty.ElementSize;
 }

-multiclass sve_fp_2op_p_zds<bits<4> opc, string asm> {
+multiclass sve_fp_2op_p_zds<bits<4> opc, string asm,
+                            SDPatternOperator op> {
  def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>;
  def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>;
  def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>;
+
+  def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+}
+
+multiclass sve_fp_2op_p_zds_fscale<bits<4> opc, string asm,
+                                   SDPatternOperator op> {
+  def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>;
+  def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>;
+  def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>;
+
+  def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
 }

 class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>
@@ -1235,13 +1251,11 @@ multiclass sve_fp_ftmad<string asm> {
 // SVE Floating Point Arithmetic - Unpredicated Group
 //===----------------------------------------------------------------------===//

-class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm,
-                      ZPRRegOp zprty,
-                      ValueType vt, ValueType vt2, SDPatternOperator op>
+class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty>
 : I<(outs zprty:$Zd), (ins  zprty:$Zn, zprty:$Zm),
  asm, "\t$Zd, $Zn, $Zm",
  "",
-  [(set (vt zprty:$Zd), (op (vt zprty:$Zn), (vt2 zprty:$Zm)))]>, Sched<[]> {
+  []>, Sched<[]> {
  bits<5> Zd;
  bits<5> Zm;
  bits<5> Zn;
@@ -1256,9 +1270,24 @@ class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm,
 }

 multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op> {
-  def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16, nxv8f16, nxv8f16, op>;
-  def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32, nxv4f32, nxv4f32, op>;
-  def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64, nxv2f64, nxv2f64, op>;
+  def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
+  def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
+  def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
+
+  def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+
+}
+
+multiclass sve_fp_3op_u_zd_ftsmul<bits<3> opc, string asm, SDPatternOperator op> {
+  def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
+  def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
+  def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
+
+  def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
 }

 //===----------------------------------------------------------------------===//
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith.ll
@@ -0,0 +1,530 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; FABD
+;
+
+define <vscale x 8 x half> @fabd_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fabd_h:
+; CHECK: fabd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fabd.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x half> %a,
+                                                                 <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fabd_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fabd_s:
+; CHECK: fabd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fabd.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                  <vscale x 4 x float> %a,
+                                                                  <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fabd_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fabd_d:
+; CHECK: fabd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fabd.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x double> %a,
+                                                                   <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FADD
+;
+
+define <vscale x 8 x half> @fadd_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fadd_h:
+; CHECK: fadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x half> %a,
+                                                                 <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fadd_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fadd_s:
+; CHECK: fadd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                  <vscale x 4 x float> %a,
+                                                                  <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fadd_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fadd_d:
+; CHECK: fadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x double> %a,
+                                                                   <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FDIV
+;
+
+define <vscale x 8 x half> @fdiv_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fdiv_h:
+; CHECK: fdiv z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fdiv.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x half> %a,
+                                                                 <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fdiv_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fdiv_s:
+; CHECK: fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fdiv.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                  <vscale x 4 x float> %a,
+                                                                  <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fdiv_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fdiv_d:
+; CHECK: fdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fdiv.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x double> %a,
+                                                                   <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FDIVR
+;
+
+define <vscale x 8 x half> @fdivr_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fdivr_h:
+; CHECK: fdivr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fdivr.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                  <vscale x 8 x half> %a,
+                                                                  <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fdivr_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fdivr_s:
+; CHECK: fdivr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fdivr.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                   <vscale x 4 x float> %a,
+                                                                   <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fdivr_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fdivr_d:
+; CHECK: fdivr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fdivr.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                    <vscale x 2 x double> %a,
+                                                                    <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FMAX
+;
+
+define <vscale x 8 x half> @fmax_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fmax_h:
+; CHECK: fmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fmax.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x half> %a,
+                                                                 <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fmax_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fmax_s:
+; CHECK: fmax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmax.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                  <vscale x 4 x float> %a,
+                                                                  <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fmax_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fmax_d:
+; CHECK: fmax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fmax.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x double> %a,
+                                                                   <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FMAXNM
+;
+
+define <vscale x 8 x half> @fmaxnm_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fmaxnm_h:
+; CHECK: fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fmaxnm.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                   <vscale x 8 x half> %a,
+                                                                   <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fmaxnm_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fmaxnm_s:
+; CHECK: fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmaxnm.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                    <vscale x 4 x float> %a,
+                                                                    <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fmaxnm_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fmaxnm_d:
+; CHECK: fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fmaxnm.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                     <vscale x 2 x double> %a,
+                                                                     <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FMIN
+;
+
+define <vscale x 8 x half> @fmin_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fmin_h:
+; CHECK: fmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fmin.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x half> %a,
+                                                                 <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fmin_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fmin_s:
+; CHECK: fmin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmin.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                  <vscale x 4 x float> %a,
+                                                                  <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fmin_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fmin_d:
+; CHECK: fmin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fmin.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x double> %a,
+                                                                   <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FMINNM
+;
+
+define <vscale x 8 x half> @fminnm_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fminnm_h:
+; CHECK: fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fminnm.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                   <vscale x 8 x half> %a,
+                                                                   <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fminnm_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fminnm_s:
+; CHECK: fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fminnm.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                    <vscale x 4 x float> %a,
+                                                                    <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fminnm_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fminnm_d:
+; CHECK: fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fminnm.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                     <vscale x 2 x double> %a,
+                                                                     <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FMUL
+;
+
+define <vscale x 8 x half> @fmul_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fmul_h:
+; CHECK: fmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x half> %a,
+                                                                 <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fmul_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fmul_s:
+; CHECK: fmul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                  <vscale x 4 x float> %a,
+                                                                  <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fmul_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fmul_d:
+; CHECK: fmul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x double> %a,
+                                                                   <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FMULX
+;
+
+define <vscale x 8 x half> @fmulx_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fmulx_h:
+; CHECK: fmulx z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fmulx.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                  <vscale x 8 x half> %a,
+                                                                  <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fmulx_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fmulx_s:
+; CHECK: fmulx z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmulx.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                   <vscale x 4 x float> %a,
+                                                                   <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fmulx_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fmulx_d:
+; CHECK: fmulx z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fmulx.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                    <vscale x 2 x double> %a,
+                                                                    <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FSCALE
+;
+
+define <vscale x 8 x half> @fscale_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: fscale_h:
+; CHECK: fscale z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fscale.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                   <vscale x 8 x half> %a,
+                                                                   <vscale x 8 x i16> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fscale_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: fscale_s:
+; CHECK: fscale z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fscale.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                    <vscale x 4 x float> %a,
+                                                                    <vscale x 4 x i32> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fscale_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: fscale_d:
+; CHECK: fscale z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fscale.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                     <vscale x 2 x double> %a,
+                                                                     <vscale x 2 x i64> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FSUB
+;
+
+define <vscale x 8 x half> @fsub_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fsub_h:
+; CHECK: fsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x half> %a,
+                                                                 <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fsub_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fsub_s:
+; CHECK: fsub z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                  <vscale x 4 x float> %a,
+                                                                  <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fsub_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fsub_d:
+; CHECK: fsub z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x double> %a,
+                                                                   <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FSUBR
+;
+
+define <vscale x 8 x half> @fsubr_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: fsubr_h:
+; CHECK: fsubr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fsubr.nxv8f16(<vscale x 8 x i1> %pg,
+                                                                  <vscale x 8 x half> %a,
+                                                                  <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fsubr_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fsubr_s:
+; CHECK: fsubr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fsubr.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                   <vscale x 4 x float> %a,
+                                                                   <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fsubr_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fsubr_d:
+; CHECK: fsubr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fsubr.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                    <vscale x 2 x double> %a,
+                                                                    <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; FTSMUL
+;
+
+define <vscale x 8 x half> @ftsmul_h(<vscale x 8 x half> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: ftsmul_h:
+; CHECK: ftsmul z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.ftsmul.x.nxv8f16(<vscale x 8 x half> %a,
+                                                                     <vscale x 8 x i16> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @ftsmul_s(<vscale x 4 x float> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: ftsmul_s:
+; CHECK: ftsmul z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.ftsmul.x.nxv4f32(<vscale x 4 x float> %a,
+                                                                      <vscale x 4 x i32> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @ftsmul_d(<vscale x 2 x double> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: ftsmul_d:
+; CHECK: ftsmul z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.ftsmul.x.nxv2f64(<vscale x 2 x double> %a,
+                                                                       <vscale x 2 x i64> %b)
+  ret <vscale x 2 x double> %out
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fabd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fabd.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fabd.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fdiv.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fdiv.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fdiv.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fdivr.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fdivr.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fdivr.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmax.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmax.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmax.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmaxnm.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmaxnm.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmaxnm.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmin.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmin.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmin.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fminnm.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fminnm.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fminnm.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmulx.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmulx.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmulx.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fscale.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x i16>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fscale.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x i32>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fscale.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x i64>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fsubr.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fsubr.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fsubr.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.ftsmul.x.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i16>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ftsmul.x.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i32>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ftsmul.x.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i64>)