[AArch64][SME] Add SME outer product intrinsics

This patch adds the following intrinsics to support the SME ACLE; an example call is shown after the list:

  * @llvm.aarch64.sme.mopa: Non-widening outer product + accumulate
  * @llvm.aarch64.sme.mops: Non-widening outer product + subtract
  * @llvm.aarch64.sme.mopa.wide: Widening outer product + accumulate
  * @llvm.aarch64.sme.mops.wide: Widening outer product + subtract
  * @llvm.aarch64.sme.smopa.wide: Widening signed sum of outer product + accumulate
  * @llvm.aarch64.sme.smops.wide: Widening signed sum of outer product + subtract
  * @llvm.aarch64.sme.umopa.wide: Widening unsigned sum of outer product + accumulate
  * @llvm.aarch64.sme.umops.wide: Widening unsigned sum of outer product + subtract
  * @llvm.aarch64.sme.sumopa.wide: Widening signed by unsigned sum of outer product + accumulate
  * @llvm.aarch64.sme.sumops.wide: Widening signed by unsigned sum of outer product + subtract
  * @llvm.aarch64.sme.usmopa.wide: Widening unsigned by signed sum of outer product + accumulate
  * @llvm.aarch64.sme.usmops.wide: Widening unsigned by signed sum of outer product + subtract
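
As an example, the non-widening single-precision form takes the ZA tile index
as an immediate, a governing predicate for each source operand, and the two
source vectors (this call appears verbatim in the new tests below):

    call void @llvm.aarch64.sme.mopa.nxv4f32(i64 0,
        <vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm,
        <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)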

Differential Revision: https://reviews.llvm.org/D127956
Author: David Sherwood
Date: 2022-06-16 09:39:40 +01:00
Commit: f916ee0fb1 (parent: 5548e807b5)
8 changed files with 442 additions and 50 deletions

@@ -2652,6 +2652,29 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i64_ty]>;
 
+  class SME_OuterProduct_Intrinsic
+      : DefaultAttrsIntrinsic<[],
+          [llvm_i64_ty,
+           LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+           LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+           LLVMMatchType<0>,
+           llvm_anyvector_ty]>;
+
+  def int_aarch64_sme_mopa : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_mops : SME_OuterProduct_Intrinsic;
+
+  def int_aarch64_sme_mopa_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_mops_wide : SME_OuterProduct_Intrinsic;
+
+  def int_aarch64_sme_smopa_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_smops_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_umopa_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_umops_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_sumopa_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_sumops_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_usmopa_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_usmops_wide : SME_OuterProduct_Intrinsic;
+
   //
   // Counting elements
   //
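
The intrinsic class carries a single overloaded vector type: llvm_anyvector_ty
is the second source vector, LLVMMatchType<0> ties the first source to the same
type, and LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> derives both predicate
types from that element count; the leading llvm_i64_ty is the ZA tile index.
Instantiated at <vscale x 8 x i16>, for example, the signed widening variant
resolves to the declaration used in the tests below:

    declare void @llvm.aarch64.sme.smopa.wide.nxv8i16(i64,
        <vscale x 8 x i1>, <vscale x 8 x i1>,
        <vscale x 8 x i16>, <vscale x 8 x i16>)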

@@ -2372,6 +2372,23 @@ AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitMopa(unsigned Opc, unsigned BaseReg,
+                                MachineInstr &MI, MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+
+  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
+  MIB.addReg(BaseReg + MI.getOperand(0).getImm());
+  MIB.add(MI.getOperand(1)); // pn
+  MIB.add(MI.getOperand(2)); // pm
+  MIB.add(MI.getOperand(3)); // zn
+  MIB.add(MI.getOperand(4)); // zm
+
+  MI.eraseFromParent(); // The pseudo is gone now.
+  return BB;
+}
+
 MachineBasicBlock *
 AArch64TargetLowering::EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg,
                                               MachineInstr &MI,
@@ -2459,6 +2476,54 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
   case AArch64::LDR_ZA_PSEUDO:
     return EmitFill(MI, BB);
+  case AArch64::BFMOPA_MPPZZ_PSEUDO:
+    return EmitMopa(AArch64::BFMOPA_MPPZZ, AArch64::ZAS0, MI, BB);
+  case AArch64::BFMOPS_MPPZZ_PSEUDO:
+    return EmitMopa(AArch64::BFMOPS_MPPZZ, AArch64::ZAS0, MI, BB);
+  case AArch64::FMOPAL_MPPZZ_PSEUDO:
+    return EmitMopa(AArch64::FMOPAL_MPPZZ, AArch64::ZAS0, MI, BB);
+  case AArch64::FMOPSL_MPPZZ_PSEUDO:
+    return EmitMopa(AArch64::FMOPSL_MPPZZ, AArch64::ZAS0, MI, BB);
+  case AArch64::FMOPA_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::FMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::FMOPS_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::FMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::FMOPA_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::FMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::FMOPS_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::FMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::SMOPA_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::SMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::SMOPS_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::SMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::UMOPA_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::UMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::UMOPS_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::UMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::SUMOPA_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::SUMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::SUMOPS_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::SUMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::USMOPA_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::USMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::USMOPS_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::USMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::SMOPA_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::SMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::SMOPS_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::SMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::UMOPA_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::UMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::UMOPS_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::UMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::SUMOPA_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::SUMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::SUMOPS_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::SUMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::USMOPA_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::USMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::USMOPS_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::USMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
   case AArch64::INSERT_MXIPZ_H_PSEUDO_B:
     return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_B, AArch64::ZAB0, MI,
                                   BB);

@@ -563,7 +563,8 @@ public:
                                 MachineInstr &MI,
                                 MachineBasicBlock *BB) const;
   MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const;
+  MachineBasicBlock *EmitMopa(unsigned Opc, unsigned BaseReg, MachineInstr &MI,
+                              MachineBasicBlock *BB) const;
   MachineBasicBlock *EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg,
                                             MachineInstr &MI,
                                             MachineBasicBlock *BB) const;

@@ -38,41 +38,41 @@ let Predicates = [HasSME] in {
 // Outer products
 //===----------------------------------------------------------------------===//
 
-defm BFMOPA_MPPZZ : sme_bf16_outer_product<0b0, "bfmopa">;
-defm BFMOPS_MPPZZ : sme_bf16_outer_product<0b1, "bfmops">;
+defm BFMOPA_MPPZZ : sme_bf16_outer_product<0b0, "bfmopa", int_aarch64_sme_mopa_wide>;
+defm BFMOPS_MPPZZ : sme_bf16_outer_product<0b1, "bfmops", int_aarch64_sme_mops_wide>;
 
-def FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, "fmopa">;
-def FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, "fmops">;
+defm FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, "fmopa", int_aarch64_sme_mopa>;
+defm FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, "fmops", int_aarch64_sme_mops>;
 }
 
 let Predicates = [HasSMEF64] in {
-def FMOPA_MPPZZ_D : sme_outer_product_fp64<0b0, "fmopa">;
-def FMOPS_MPPZZ_D : sme_outer_product_fp64<0b1, "fmops">;
+defm FMOPA_MPPZZ_D : sme_outer_product_fp64<0b0, "fmopa", int_aarch64_sme_mopa>;
+defm FMOPS_MPPZZ_D : sme_outer_product_fp64<0b1, "fmops", int_aarch64_sme_mops>;
 }
 
 let Predicates = [HasSME] in {
-defm FMOPAL_MPPZZ : sme_f16_outer_product<0b0, "fmopa">;
-defm FMOPSL_MPPZZ : sme_f16_outer_product<0b1, "fmops">;
+defm FMOPAL_MPPZZ : sme_f16_outer_product<0b0, "fmopa", int_aarch64_sme_mopa_wide>;
+defm FMOPSL_MPPZZ : sme_f16_outer_product<0b1, "fmops", int_aarch64_sme_mops_wide>;
 
-def SMOPA_MPPZZ_S : sme_int_outer_product_i32<0b000, "smopa">;
-def SMOPS_MPPZZ_S : sme_int_outer_product_i32<0b001, "smops">;
-def UMOPA_MPPZZ_S : sme_int_outer_product_i32<0b110, "umopa">;
-def UMOPS_MPPZZ_S : sme_int_outer_product_i32<0b111, "umops">;
-def SUMOPA_MPPZZ_S : sme_int_outer_product_i32<0b010, "sumopa">;
-def SUMOPS_MPPZZ_S : sme_int_outer_product_i32<0b011, "sumops">;
-def USMOPA_MPPZZ_S : sme_int_outer_product_i32<0b100, "usmopa">;
-def USMOPS_MPPZZ_S : sme_int_outer_product_i32<0b101, "usmops">;
+defm SMOPA_MPPZZ_S : sme_int_outer_product_i32<0b000, "smopa", int_aarch64_sme_smopa_wide>;
+defm SMOPS_MPPZZ_S : sme_int_outer_product_i32<0b001, "smops", int_aarch64_sme_smops_wide>;
+defm UMOPA_MPPZZ_S : sme_int_outer_product_i32<0b110, "umopa", int_aarch64_sme_umopa_wide>;
+defm UMOPS_MPPZZ_S : sme_int_outer_product_i32<0b111, "umops", int_aarch64_sme_umops_wide>;
+defm SUMOPA_MPPZZ_S : sme_int_outer_product_i32<0b010, "sumopa", int_aarch64_sme_sumopa_wide>;
+defm SUMOPS_MPPZZ_S : sme_int_outer_product_i32<0b011, "sumops", int_aarch64_sme_sumops_wide>;
+defm USMOPA_MPPZZ_S : sme_int_outer_product_i32<0b100, "usmopa", int_aarch64_sme_usmopa_wide>;
+defm USMOPS_MPPZZ_S : sme_int_outer_product_i32<0b101, "usmops", int_aarch64_sme_usmops_wide>;
 }
 
 let Predicates = [HasSMEI64] in {
-def SMOPA_MPPZZ_D : sme_int_outer_product_i64<0b000, "smopa">;
-def SMOPS_MPPZZ_D : sme_int_outer_product_i64<0b001, "smops">;
-def UMOPA_MPPZZ_D : sme_int_outer_product_i64<0b110, "umopa">;
-def UMOPS_MPPZZ_D : sme_int_outer_product_i64<0b111, "umops">;
-def SUMOPA_MPPZZ_D : sme_int_outer_product_i64<0b010, "sumopa">;
-def SUMOPS_MPPZZ_D : sme_int_outer_product_i64<0b011, "sumops">;
-def USMOPA_MPPZZ_D : sme_int_outer_product_i64<0b100, "usmopa">;
-def USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops">;
+defm SMOPA_MPPZZ_D : sme_int_outer_product_i64<0b000, "smopa", int_aarch64_sme_smopa_wide>;
+defm SMOPS_MPPZZ_D : sme_int_outer_product_i64<0b001, "smops", int_aarch64_sme_smops_wide>;
+defm UMOPA_MPPZZ_D : sme_int_outer_product_i64<0b110, "umopa", int_aarch64_sme_umopa_wide>;
+defm UMOPS_MPPZZ_D : sme_int_outer_product_i64<0b111, "umops", int_aarch64_sme_umops_wide>;
+defm SUMOPA_MPPZZ_D : sme_int_outer_product_i64<0b010, "sumopa", int_aarch64_sme_sumopa_wide>;
+defm SUMOPS_MPPZZ_D : sme_int_outer_product_i64<0b011, "sumops", int_aarch64_sme_sumops_wide>;
+defm USMOPA_MPPZZ_D : sme_int_outer_product_i64<0b100, "usmopa", int_aarch64_sme_usmopa_wide>;
+defm USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops", int_aarch64_sme_usmops_wide>;
 }
 
 let Predicates = [HasSME] in {
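
Note that the double-precision floating-point and 64-bit integer accumulator
forms stay gated behind HasSMEF64 and HasSMEI64 respectively; the tests below
enable them with the matching target features rather than plain +sme:

    attributes #0 = { "target-features"="+sme-i64" }
    attributes #1 = { "target-features"="+sme-f64" }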

@@ -25,7 +25,8 @@ def TSV110Model : SchedMachineModel {
   let CompleteModel = 1;
 
   list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
-                                                    PAUnsupported.F);
+                                                    PAUnsupported.F,
+                                                    SMEUnsupported.F);
 }
 
 // Define each kind of processor resource and number available on the TSV110,

@@ -28,6 +28,14 @@ def am_sme_indexed_b4 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0,15>",
 // SME Outer Products
 //===----------------------------------------------------------------------===//
 
+class sme_outer_product_pseudo<ZPRRegOp zpr_ty>
+    : Pseudo<(outs), (ins i64imm:$tile, PPR3bAny:$pn, PPR3bAny:$pm,
+                          zpr_ty:$zn, zpr_ty:$zm), []>,
+      Sched<[]> {
+  // Translated to the actual instructions in AArch64ISelLowering.cpp
+  let usesCustomInserter = 1;
+}
+
 class sme_fp_outer_product_inst<bit S, bit sz, MatrixTileOperand za_ty,
                                 ZPRRegOp zpr_ty, string mnemonic>
   : I<(outs za_ty:$ZAda),
@@ -52,17 +60,31 @@ class sme_fp_outer_product_inst<bit S, bit sz, MatrixTileOperand za_ty,
   let Constraints = "$ZAda = $_ZAda";
 }
 
-class sme_outer_product_fp32<bit S, string mnemonic>
-  : sme_fp_outer_product_inst<S, 0b0, TileOp32, ZPR32, mnemonic> {
-  bits<2> ZAda;
-  let Inst{1-0} = ZAda;
-  let Inst{2} = 0b0;
+multiclass sme_outer_product_fp32<bit S, string mnemonic, SDPatternOperator op> {
+  def NAME : sme_fp_outer_product_inst<S, 0b0, TileOp32, ZPR32, mnemonic> {
+    bits<2> ZAda;
+    let Inst{1-0} = ZAda;
+    let Inst{2} = 0b0;
+  }
+
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR32>;
+
+  def : Pat<(op imm0_3:$tile, (nxv4i1 PPR3bAny:$pn), (nxv4i1 PPR3bAny:$pm),
+                (nxv4f32 ZPR32:$zn), (nxv4f32 ZPR32:$zm)),
+            (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>;
 }
 
-class sme_outer_product_fp64<bit S, string mnemonic>
-  : sme_fp_outer_product_inst<S, 0b1, TileOp64, ZPR64, mnemonic> {
-  bits<3> ZAda;
-  let Inst{2-0} = ZAda;
+multiclass sme_outer_product_fp64<bit S, string mnemonic, SDPatternOperator op> {
+  def NAME : sme_fp_outer_product_inst<S, 0b1, TileOp64, ZPR64, mnemonic> {
+    bits<3> ZAda;
+    let Inst{2-0} = ZAda;
+  }
+
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR64>;
+
+  def : Pat<(op imm0_7:$tile, (nxv2i1 PPR3bAny:$pn), (nxv2i1 PPR3bAny:$pm),
+                (nxv2f64 ZPR64:$zn), (nxv2f64 ZPR64:$zm)),
+            (!cast<Instruction>(NAME # _PSEUDO) imm0_7:$tile, $pn, $pm, $zn, $zm)>;
 }
 
 class sme_int_outer_product_inst<bit u0, bit u1, bit S, bit sz,
@@ -92,19 +114,35 @@ class sme_int_outer_product_inst<bit u0, bit u1, bit S, bit sz,
   let Constraints = "$ZAda = $_ZAda";
 }
 
-class sme_int_outer_product_i32<bits<3> opc, string mnemonic>
-  : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b0, TileOp32, ZPR8,
-                               mnemonic> {
-  bits<2> ZAda;
-  let Inst{1-0} = ZAda;
-  let Inst{2} = 0b0;
+multiclass sme_int_outer_product_i32<bits<3> opc, string mnemonic,
+                                     SDPatternOperator op> {
+  def NAME : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b0, TileOp32,
+                                        ZPR8, mnemonic> {
+    bits<2> ZAda;
+    let Inst{1-0} = ZAda;
+    let Inst{2} = 0b0;
+  }
+
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR8>;
+
+  def : Pat<(op imm0_3:$tile, (nxv16i1 PPR3bAny:$pn), (nxv16i1 PPR3bAny:$pm),
+                (nxv16i8 ZPR8:$zn), (nxv16i8 ZPR8:$zm)),
+            (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>;
 }
 
-class sme_int_outer_product_i64<bits<3> opc, string mnemonic>
-  : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b1, TileOp64, ZPR16,
-                               mnemonic> {
-  bits<3> ZAda;
-  let Inst{2-0} = ZAda;
+multiclass sme_int_outer_product_i64<bits<3> opc, string mnemonic,
+                                     SDPatternOperator op> {
+  def NAME : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b1, TileOp64,
+                                        ZPR16, mnemonic> {
+    bits<3> ZAda;
+    let Inst{2-0} = ZAda;
+  }
+
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16>;
+
+  def : Pat<(op imm0_7:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm),
+                (nxv8i16 ZPR16:$zn), (nxv8i16 ZPR16:$zm)),
+            (!cast<Instruction>(NAME # _PSEUDO) imm0_7:$tile, $pn, $pm, $zn, $zm)>;
 }
 
 class sme_outer_product_widening_inst<bit op, bit S, string mnemonic>
@@ -131,12 +169,24 @@ class sme_outer_product_widening_inst<bit op, bit S, string mnemonic>
   let Constraints = "$ZAda = $_ZAda";
 }
 
-multiclass sme_bf16_outer_product<bit S, string mnemonic> {
-  def : sme_outer_product_widening_inst<0b0, S, mnemonic>;
+multiclass sme_bf16_outer_product<bit S, string mnemonic, SDPatternOperator op> {
+  def NAME : sme_outer_product_widening_inst<0b0, S, mnemonic>;
+
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16>;
+
+  def : Pat<(op imm0_3:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm),
+                (nxv8bf16 ZPR16:$zn), (nxv8bf16 ZPR16:$zm)),
+            (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>;
 }
 
-multiclass sme_f16_outer_product<bit S, string mnemonic> {
-  def : sme_outer_product_widening_inst<0b1, S, mnemonic>;
+multiclass sme_f16_outer_product<bit S, string mnemonic, SDPatternOperator op> {
+  def NAME : sme_outer_product_widening_inst<0b1, S, mnemonic>;
+
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16>;
+
+  def : Pat<(op imm0_3:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm),
+                (nxv8f16 ZPR16:$zn), (nxv8f16 ZPR16:$zm)),
+            (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>;
 }
 
 //===----------------------------------------------------------------------===//
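
Taken together, each multiclass now produces three things: the real
instruction, a pseudo that carries the tile index as an immediate operand, and
a pattern that selects the pseudo from the intrinsic; EmitMopa then rewrites
the pseudo onto the concrete tile register. A sketch of the flow for the bf16
case, using the first test below:

    ; IR:
    call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i64 0,
        <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm,
        <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
    ; ISel: BFMOPA_MPPZZ_PSEUDO -> custom inserter: BFMOPA_MPPZZ
    ; final assembly:
    bfmopa za0.s, p0/m, p1/m, z0.h, z1.h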

@@ -0,0 +1,126 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
define void @bfmopa(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: bfmopa:
; CHECK: // %bb.0:
; CHECK-NEXT: bfmopa za0.s, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i64 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
}
define void @fmopa(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
; CHECK-LABEL: fmopa:
; CHECK: // %bb.0:
; CHECK-NEXT: fmopa za1.s, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i64 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
}
define void @smopa_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: smopa_s:
; CHECK: // %bb.0:
; CHECK-NEXT: smopa za2.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i64 2, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @smopa_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: smopa_d:
; CHECK: // %bb.0:
; CHECK-NEXT: smopa za0.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i64 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @umopa_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: umopa_s:
; CHECK: // %bb.0:
; CHECK-NEXT: umopa za3.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i64 3, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @umopa_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: umopa_d:
; CHECK: // %bb.0:
; CHECK-NEXT: umopa za1.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i64 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @fmopa_s(<vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm) {
; CHECK-LABEL: fmopa_s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmopa za0.s, p0/m, p1/m, z0.s, z1.s
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mopa.nxv4f32(i64 0, <vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
ret void
}
define void @fmopa_d(<vscale x 2 x i1> %pn, <vscale x 2 x i1> %pm, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm) #1 {
; CHECK-LABEL: fmopa_d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmopa za2.d, p0/m, p1/m, z0.d, z1.d
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mopa.nxv2f64(i64 2, <vscale x 2 x i1> %pn, <vscale x 2 x i1> %pm, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
ret void
}
define void @sumopa_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: sumopa_s:
; CHECK: // %bb.0:
; CHECK-NEXT: sumopa za1.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i64 1, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @sumopa_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: sumopa_d:
; CHECK: // %bb.0:
; CHECK-NEXT: sumopa za3.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i64 3, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @usmopa_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: usmopa_s:
; CHECK: // %bb.0:
; CHECK-NEXT: usmopa za2.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i64 2, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @usmopa_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: usmopa_d:
; CHECK: // %bb.0:
; CHECK-NEXT: usmopa za7.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.usmopa.wide.nxv8i16(i64 7, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
attributes #0 = { "target-features"="+sme-i64" }
attributes #1 = { "target-features"="+sme-f64" }
declare void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.mopa.wide.nxv8f16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.mopa.nxv4f32(i64, <vscale x 4 x i1>, <vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.mopa.nxv2f64(i64, <vscale x 2 x i1>, <vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.smopa.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.smopa.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.umopa.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.umopa.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.usmopa.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)

@@ -0,0 +1,126 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
define void @bfmops(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: bfmops:
; CHECK: // %bb.0:
; CHECK-NEXT: bfmops za0.s, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i64 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
}
define void @fmops(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
; CHECK-LABEL: fmops:
; CHECK: // %bb.0:
; CHECK-NEXT: fmops za1.s, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mops.wide.nxv8f16(i64 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
}
define void @smops_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: smops_s:
; CHECK: // %bb.0:
; CHECK-NEXT: smops za2.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.smops.wide.nxv16i8(i64 2, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @smops_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: smops_d:
; CHECK: // %bb.0:
; CHECK-NEXT: smops za0.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.smops.wide.nxv8i16(i64 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @umops_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: umops_s:
; CHECK: // %bb.0:
; CHECK-NEXT: umops za3.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.umops.wide.nxv16i8(i64 3, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @umops_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: umops_d:
; CHECK: // %bb.0:
; CHECK-NEXT: umops za1.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.umops.wide.nxv8i16(i64 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @fmops_s(<vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm) {
; CHECK-LABEL: fmops_s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmops za0.s, p0/m, p1/m, z0.s, z1.s
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mops.nxv4f32(i64 0, <vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
ret void
}
define void @fmops_d(<vscale x 2 x i1> %pn, <vscale x 2 x i1> %pm, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm) #1 {
; CHECK-LABEL: fmops_d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmops za2.d, p0/m, p1/m, z0.d, z1.d
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mops.nxv2f64(i64 2, <vscale x 2 x i1> %pn, <vscale x 2 x i1> %pm, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
ret void
}
define void @sumops_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: sumops_s:
; CHECK: // %bb.0:
; CHECK-NEXT: sumops za1.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i64 1, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @sumops_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: sumops_d:
; CHECK: // %bb.0:
; CHECK-NEXT: sumops za3.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i64 3, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @usmops_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: usmops_s:
; CHECK: // %bb.0:
; CHECK-NEXT: usmops za2.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i64 2, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @usmops_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: usmops_d:
; CHECK: // %bb.0:
; CHECK-NEXT: usmops za7.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i64 7, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
attributes #0 = { "target-features"="+sme-i64" }
attributes #1 = { "target-features"="+sme-f64" }
declare void @llvm.aarch64.sme.mops.wide.nxv8bf16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.mops.wide.nxv8f16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.mops.nxv4f32(i64, <vscale x 4 x i1>, <vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.mops.nxv2f64(i64, <vscale x 2 x i1>, <vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.smops.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.smops.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.umops.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.umops.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sumops.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sumops.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.usmops.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.usmops.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)