[AArch64][SME] Add SME outer product intrinsics

This patch adds the following intrinsics to support the SME ACLE; an example call is shown after the list:

  * @llvm.aarch64.sme.mopa: Non-widening outer product + accumulate
  * @llvm.aarch64.sme.mops: Non-widening outer product + subtract
  * @llvm.aarch64.sme.mopa.wide: Widening outer product + accumulate
  * @llvm.aarch64.sme.mops.wide: Widening outer product + subtract
  * @llvm.aarch64.sme.smopa.wide: Widening signed sum of outer product + accumulate
  * @llvm.aarch64.sme.smops.wide: Widening signed sum of outer product + subtract
  * @llvm.aarch64.sme.umopa.wide: Widening unsigned sum of outer product + accumulate
  * @llvm.aarch64.sme.umops.wide: Widening unsigned sum of outer product + subtract
  * @llvm.aarch64.sme.sumopa.wide: Widening signed by unsigned sum of outer product + accumulate
  * @llvm.aarch64.sme.sumops.wide: Widening signed by unsigned sum of outer product + subtract
  * @llvm.aarch64.sme.usmopa.wide: Widening unsigned by signed sum of outer product + accumulate
  * @llvm.aarch64.sme.usmops.wide: Widening unsigned by signed sum of outer product + subtract
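
As an example, the non-widening single-precision form takes the ZA tile index
as an immediate, a governing predicate for each source operand, and the two
source vectors (this call appears verbatim in the new tests below):

    call void @llvm.aarch64.sme.mopa.nxv4f32(i64 0,
        <vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm,
        <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)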

Differential Revision: https://reviews.llvm.org/D127956
Author: David Sherwood
Date: 2022-06-16 09:39:40 +01:00
Commit: f916ee0fb1 (parent: 5548e807b5)
8 changed files with 442 additions and 50 deletions

@@ -2652,6 +2652,29 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i64_ty]>;
 
+  class SME_OuterProduct_Intrinsic
+      : DefaultAttrsIntrinsic<[],
+          [llvm_i64_ty,
+           LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+           LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+           LLVMMatchType<0>,
+           llvm_anyvector_ty]>;
+
+  def int_aarch64_sme_mopa : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_mops : SME_OuterProduct_Intrinsic;
+
+  def int_aarch64_sme_mopa_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_mops_wide : SME_OuterProduct_Intrinsic;
+
+  def int_aarch64_sme_smopa_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_smops_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_umopa_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_umops_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_sumopa_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_sumops_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_usmopa_wide : SME_OuterProduct_Intrinsic;
+  def int_aarch64_sme_usmops_wide : SME_OuterProduct_Intrinsic;
+
   //
   // Counting elements
   //
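
The intrinsic class carries a single overloaded vector type: llvm_anyvector_ty
is the second source vector, LLVMMatchType<0> ties the first source to the same
type, and LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> derives both predicate
types from that element count; the leading llvm_i64_ty is the ZA tile index.
Instantiated at <vscale x 8 x i16>, for example, the signed widening variant
resolves to the declaration used in the tests below:

    declare void @llvm.aarch64.sme.smopa.wide.nxv8i16(i64,
        <vscale x 8 x i1>, <vscale x 8 x i1>,
        <vscale x 8 x i16>, <vscale x 8 x i16>)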

@@ -2372,6 +2372,23 @@ AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitMopa(unsigned Opc, unsigned BaseReg,
+                                MachineInstr &MI, MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+
+  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
+  MIB.addReg(BaseReg + MI.getOperand(0).getImm());
+  MIB.add(MI.getOperand(1)); // pn
+  MIB.add(MI.getOperand(2)); // pm
+  MIB.add(MI.getOperand(3)); // zn
+  MIB.add(MI.getOperand(4)); // zm
+
+  MI.eraseFromParent(); // The pseudo is gone now.
+  return BB;
+}
+
 MachineBasicBlock *
 AArch64TargetLowering::EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg,
                                               MachineInstr &MI,
@@ -2459,6 +2476,54 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
   case AArch64::LDR_ZA_PSEUDO:
     return EmitFill(MI, BB);
+  case AArch64::BFMOPA_MPPZZ_PSEUDO:
+    return EmitMopa(AArch64::BFMOPA_MPPZZ, AArch64::ZAS0, MI, BB);
+  case AArch64::BFMOPS_MPPZZ_PSEUDO:
+    return EmitMopa(AArch64::BFMOPS_MPPZZ, AArch64::ZAS0, MI, BB);
+  case AArch64::FMOPAL_MPPZZ_PSEUDO:
+    return EmitMopa(AArch64::FMOPAL_MPPZZ, AArch64::ZAS0, MI, BB);
+  case AArch64::FMOPSL_MPPZZ_PSEUDO:
+    return EmitMopa(AArch64::FMOPSL_MPPZZ, AArch64::ZAS0, MI, BB);
+  case AArch64::FMOPA_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::FMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::FMOPS_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::FMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::FMOPA_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::FMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::FMOPS_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::FMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::SMOPA_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::SMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::SMOPS_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::SMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::UMOPA_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::UMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::UMOPS_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::UMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::SUMOPA_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::SUMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::SUMOPS_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::SUMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::USMOPA_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::USMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::USMOPS_MPPZZ_S_PSEUDO:
+    return EmitMopa(AArch64::USMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+  case AArch64::SMOPA_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::SMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::SMOPS_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::SMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::UMOPA_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::UMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::UMOPS_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::UMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::SUMOPA_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::SUMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::SUMOPS_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::SUMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::USMOPA_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::USMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+  case AArch64::USMOPS_MPPZZ_D_PSEUDO:
+    return EmitMopa(AArch64::USMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
   case AArch64::INSERT_MXIPZ_H_PSEUDO_B:
     return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_B, AArch64::ZAB0, MI,
                                   BB);

@@ -563,7 +563,8 @@ public:
                                 MachineInstr &MI,
                                 MachineBasicBlock *BB) const;
   MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const;
+  MachineBasicBlock *EmitMopa(unsigned Opc, unsigned BaseReg, MachineInstr &MI,
+                              MachineBasicBlock *BB) const;
   MachineBasicBlock *EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg,
                                             MachineInstr &MI,
                                             MachineBasicBlock *BB) const;

@@ -38,41 +38,41 @@ let Predicates = [HasSME] in {
 // Outer products
 //===----------------------------------------------------------------------===//
 
-defm BFMOPA_MPPZZ : sme_bf16_outer_product<0b0, "bfmopa">;
-defm BFMOPS_MPPZZ : sme_bf16_outer_product<0b1, "bfmops">;
+defm BFMOPA_MPPZZ : sme_bf16_outer_product<0b0, "bfmopa", int_aarch64_sme_mopa_wide>;
+defm BFMOPS_MPPZZ : sme_bf16_outer_product<0b1, "bfmops", int_aarch64_sme_mops_wide>;
 
-def FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, "fmopa">;
-def FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, "fmops">;
+defm FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, "fmopa", int_aarch64_sme_mopa>;
+defm FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, "fmops", int_aarch64_sme_mops>;
 }
 
 let Predicates = [HasSMEF64] in {
-def FMOPA_MPPZZ_D : sme_outer_product_fp64<0b0, "fmopa">;
-def FMOPS_MPPZZ_D : sme_outer_product_fp64<0b1, "fmops">;
+defm FMOPA_MPPZZ_D : sme_outer_product_fp64<0b0, "fmopa", int_aarch64_sme_mopa>;
+defm FMOPS_MPPZZ_D : sme_outer_product_fp64<0b1, "fmops", int_aarch64_sme_mops>;
 }
 
 let Predicates = [HasSME] in {
-defm FMOPAL_MPPZZ : sme_f16_outer_product<0b0, "fmopa">;
-defm FMOPSL_MPPZZ : sme_f16_outer_product<0b1, "fmops">;
+defm FMOPAL_MPPZZ : sme_f16_outer_product<0b0, "fmopa", int_aarch64_sme_mopa_wide>;
+defm FMOPSL_MPPZZ : sme_f16_outer_product<0b1, "fmops", int_aarch64_sme_mops_wide>;
 
-def SMOPA_MPPZZ_S : sme_int_outer_product_i32<0b000, "smopa">;
-def SMOPS_MPPZZ_S : sme_int_outer_product_i32<0b001, "smops">;
-def UMOPA_MPPZZ_S : sme_int_outer_product_i32<0b110, "umopa">;
-def UMOPS_MPPZZ_S : sme_int_outer_product_i32<0b111, "umops">;
-def SUMOPA_MPPZZ_S : sme_int_outer_product_i32<0b010, "sumopa">;
-def SUMOPS_MPPZZ_S : sme_int_outer_product_i32<0b011, "sumops">;
-def USMOPA_MPPZZ_S : sme_int_outer_product_i32<0b100, "usmopa">;
-def USMOPS_MPPZZ_S : sme_int_outer_product_i32<0b101, "usmops">;
+defm SMOPA_MPPZZ_S : sme_int_outer_product_i32<0b000, "smopa", int_aarch64_sme_smopa_wide>;
+defm SMOPS_MPPZZ_S : sme_int_outer_product_i32<0b001, "smops", int_aarch64_sme_smops_wide>;
+defm UMOPA_MPPZZ_S : sme_int_outer_product_i32<0b110, "umopa", int_aarch64_sme_umopa_wide>;
+defm UMOPS_MPPZZ_S : sme_int_outer_product_i32<0b111, "umops", int_aarch64_sme_umops_wide>;
+defm SUMOPA_MPPZZ_S : sme_int_outer_product_i32<0b010, "sumopa", int_aarch64_sme_sumopa_wide>;
+defm SUMOPS_MPPZZ_S : sme_int_outer_product_i32<0b011, "sumops", int_aarch64_sme_sumops_wide>;
+defm USMOPA_MPPZZ_S : sme_int_outer_product_i32<0b100, "usmopa", int_aarch64_sme_usmopa_wide>;
+defm USMOPS_MPPZZ_S : sme_int_outer_product_i32<0b101, "usmops", int_aarch64_sme_usmops_wide>;
 }
 
 let Predicates = [HasSMEI64] in {
-def SMOPA_MPPZZ_D : sme_int_outer_product_i64<0b000, "smopa">;
-def SMOPS_MPPZZ_D : sme_int_outer_product_i64<0b001, "smops">;
-def UMOPA_MPPZZ_D : sme_int_outer_product_i64<0b110, "umopa">;
-def UMOPS_MPPZZ_D : sme_int_outer_product_i64<0b111, "umops">;
-def SUMOPA_MPPZZ_D : sme_int_outer_product_i64<0b010, "sumopa">;
-def SUMOPS_MPPZZ_D : sme_int_outer_product_i64<0b011, "sumops">;
-def USMOPA_MPPZZ_D : sme_int_outer_product_i64<0b100, "usmopa">;
-def USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops">;
+defm SMOPA_MPPZZ_D : sme_int_outer_product_i64<0b000, "smopa", int_aarch64_sme_smopa_wide>;
+defm SMOPS_MPPZZ_D : sme_int_outer_product_i64<0b001, "smops", int_aarch64_sme_smops_wide>;
+defm UMOPA_MPPZZ_D : sme_int_outer_product_i64<0b110, "umopa", int_aarch64_sme_umopa_wide>;
+defm UMOPS_MPPZZ_D : sme_int_outer_product_i64<0b111, "umops", int_aarch64_sme_umops_wide>;
+defm SUMOPA_MPPZZ_D : sme_int_outer_product_i64<0b010, "sumopa", int_aarch64_sme_sumopa_wide>;
+defm SUMOPS_MPPZZ_D : sme_int_outer_product_i64<0b011, "sumops", int_aarch64_sme_sumops_wide>;
+defm USMOPA_MPPZZ_D : sme_int_outer_product_i64<0b100, "usmopa", int_aarch64_sme_usmopa_wide>;
+defm USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops", int_aarch64_sme_usmops_wide>;
 }
 
 let Predicates = [HasSME] in {
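
Note that the double-precision floating-point and 64-bit integer accumulator
forms stay gated behind HasSMEF64 and HasSMEI64 respectively; the tests below
enable them with the matching target features rather than plain +sme:

    attributes #0 = { "target-features"="+sme-i64" }
    attributes #1 = { "target-features"="+sme-f64" }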

@@ -25,7 +25,8 @@ def TSV110Model : SchedMachineModel {
   let CompleteModel = 1;
 
   list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
-                                                    PAUnsupported.F);
+                                                    PAUnsupported.F,
+                                                    SMEUnsupported.F);
 }
 
 // Define each kind of processor resource and number available on the TSV110,

@@ -28,6 +28,14 @@ def am_sme_indexed_b4 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0,15>",
 // SME Outer Products
 //===----------------------------------------------------------------------===//
 
+class sme_outer_product_pseudo<ZPRRegOp zpr_ty>
+    : Pseudo<(outs), (ins i64imm:$tile, PPR3bAny:$pn, PPR3bAny:$pm,
+                          zpr_ty:$zn, zpr_ty:$zm), []>,
+      Sched<[]> {
+  // Translated to the actual instructions in AArch64ISelLowering.cpp
+  let usesCustomInserter = 1;
+}
+
 class sme_fp_outer_product_inst<bit S, bit sz, MatrixTileOperand za_ty,
                                 ZPRRegOp zpr_ty, string mnemonic>
   : I<(outs za_ty:$ZAda),
@@ -52,17 +60,31 @@ class sme_fp_outer_product_inst<bit S, bit sz, MatrixTileOperand za_ty,
   let Constraints = "$ZAda = $_ZAda";
 }
 
-class sme_outer_product_fp32<bit S, string mnemonic>
-  : sme_fp_outer_product_inst<S, 0b0, TileOp32, ZPR32, mnemonic> {
-  bits<2> ZAda;
-  let Inst{1-0} = ZAda;
-  let Inst{2} = 0b0;
+multiclass sme_outer_product_fp32<bit S, string mnemonic, SDPatternOperator op> {
+  def NAME : sme_fp_outer_product_inst<S, 0b0, TileOp32, ZPR32, mnemonic> {
+    bits<2> ZAda;
+    let Inst{1-0} = ZAda;
+    let Inst{2} = 0b0;
+  }
+
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR32>;
+
+  def : Pat<(op imm0_3:$tile, (nxv4i1 PPR3bAny:$pn), (nxv4i1 PPR3bAny:$pm),
+                (nxv4f32 ZPR32:$zn), (nxv4f32 ZPR32:$zm)),
+            (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>;
 }
 
-class sme_outer_product_fp64<bit S, string mnemonic>
-  : sme_fp_outer_product_inst<S, 0b1, TileOp64, ZPR64, mnemonic> {
-  bits<3> ZAda;
-  let Inst{2-0} = ZAda;
+multiclass sme_outer_product_fp64<bit S, string mnemonic, SDPatternOperator op> {
+  def NAME : sme_fp_outer_product_inst<S, 0b1, TileOp64, ZPR64, mnemonic> {
+    bits<3> ZAda;
+    let Inst{2-0} = ZAda;
+  }
+
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR64>;
+
+  def : Pat<(op imm0_7:$tile, (nxv2i1 PPR3bAny:$pn), (nxv2i1 PPR3bAny:$pm),
+                (nxv2f64 ZPR64:$zn), (nxv2f64 ZPR64:$zm)),
+            (!cast<Instruction>(NAME # _PSEUDO) imm0_7:$tile, $pn, $pm, $zn, $zm)>;
 }
 
 class sme_int_outer_product_inst<bit u0, bit u1, bit S, bit sz,
@@ -92,19 +114,35 @@ class sme_int_outer_product_inst<bit u0, bit u1, bit S, bit sz,
   let Constraints = "$ZAda = $_ZAda";
 }
 
-class sme_int_outer_product_i32<bits<3> opc, string mnemonic>
-  : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b0, TileOp32, ZPR8,
-                               mnemonic> {
-  bits<2> ZAda;
-  let Inst{1-0} = ZAda;
-  let Inst{2} = 0b0;
+multiclass sme_int_outer_product_i32<bits<3> opc, string mnemonic,
+                                     SDPatternOperator op> {
+  def NAME : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b0, TileOp32,
+                                        ZPR8, mnemonic> {
+    bits<2> ZAda;
+    let Inst{1-0} = ZAda;
+    let Inst{2} = 0b0;
+  }
+
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR8>;
+
+  def : Pat<(op imm0_3:$tile, (nxv16i1 PPR3bAny:$pn), (nxv16i1 PPR3bAny:$pm),
+                (nxv16i8 ZPR8:$zn), (nxv16i8 ZPR8:$zm)),
+            (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>;
 }
 
-class sme_int_outer_product_i64<bits<3> opc, string mnemonic>
-  : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b1, TileOp64, ZPR16,
-                               mnemonic> {
-  bits<3> ZAda;
-  let Inst{2-0} = ZAda;
+multiclass sme_int_outer_product_i64<bits<3> opc, string mnemonic,
+                                     SDPatternOperator op> {
+  def NAME : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b1, TileOp64,
+                                        ZPR16, mnemonic> {
+    bits<3> ZAda;
+    let Inst{2-0} = ZAda;
+  }
+
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16>;
+
+  def : Pat<(op imm0_7:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm),
+                (nxv8i16 ZPR16:$zn), (nxv8i16 ZPR16:$zm)),
+            (!cast<Instruction>(NAME # _PSEUDO) imm0_7:$tile, $pn, $pm, $zn, $zm)>;
 }
 
 class sme_outer_product_widening_inst<bit op, bit S, string mnemonic>
@@ -131,12 +169,24 @@ class sme_outer_product_widening_inst<bit op, bit S, string mnemonic>
   let Constraints = "$ZAda = $_ZAda";
 }
 
-multiclass sme_bf16_outer_product<bit S, string mnemonic> {
-  def : sme_outer_product_widening_inst<0b0, S, mnemonic>;
+multiclass sme_bf16_outer_product<bit S, string mnemonic, SDPatternOperator op> {
+  def NAME : sme_outer_product_widening_inst<0b0, S, mnemonic>;
+
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16>;
+
+  def : Pat<(op imm0_3:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm),
+                (nxv8bf16 ZPR16:$zn), (nxv8bf16 ZPR16:$zm)),
+            (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>;
 }
 
-multiclass sme_f16_outer_product<bit S, string mnemonic> {
-  def : sme_outer_product_widening_inst<0b1, S, mnemonic>;
+multiclass sme_f16_outer_product<bit S, string mnemonic, SDPatternOperator op> {
+  def NAME : sme_outer_product_widening_inst<0b1, S, mnemonic>;
+
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16>;
+
+  def : Pat<(op imm0_3:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm),
+                (nxv8f16 ZPR16:$zn), (nxv8f16 ZPR16:$zm)),
+            (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>;
 }
 
 //===----------------------------------------------------------------------===//
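
Taken together, each multiclass now produces three things: the real
instruction, a pseudo that carries the tile index as an immediate operand, and
a pattern that selects the pseudo from the intrinsic; EmitMopa then rewrites
the pseudo onto the concrete tile register. A sketch of the flow for the bf16
case, using the first test below:

    ; IR:
    call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i64 0,
        <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm,
        <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
    ; ISel: BFMOPA_MPPZZ_PSEUDO -> custom inserter: BFMOPA_MPPZZ
    ; final assembly:
    bfmopa za0.s, p0/m, p1/m, z0.h, z1.h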

@@ -0,0 +1,126 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
define void @bfmopa(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: bfmopa:
; CHECK: // %bb.0:
; CHECK-NEXT: bfmopa za0.s, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i64 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
}
define void @fmopa(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
; CHECK-LABEL: fmopa:
; CHECK: // %bb.0:
; CHECK-NEXT: fmopa za1.s, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i64 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
}
define void @smopa_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: smopa_s:
; CHECK: // %bb.0:
; CHECK-NEXT: smopa za2.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i64 2, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @smopa_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: smopa_d:
; CHECK: // %bb.0:
; CHECK-NEXT: smopa za0.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i64 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @umopa_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: umopa_s:
; CHECK: // %bb.0:
; CHECK-NEXT: umopa za3.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i64 3, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @umopa_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: umopa_d:
; CHECK: // %bb.0:
; CHECK-NEXT: umopa za1.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i64 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @fmopa_s(<vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm) {
; CHECK-LABEL: fmopa_s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmopa za0.s, p0/m, p1/m, z0.s, z1.s
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mopa.nxv4f32(i64 0, <vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
ret void
}
define void @fmopa_d(<vscale x 2 x i1> %pn, <vscale x 2 x i1> %pm, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm) #1 {
; CHECK-LABEL: fmopa_d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmopa za2.d, p0/m, p1/m, z0.d, z1.d
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mopa.nxv2f64(i64 2, <vscale x 2 x i1> %pn, <vscale x 2 x i1> %pm, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
ret void
}
define void @sumopa_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: sumopa_s:
; CHECK: // %bb.0:
; CHECK-NEXT: sumopa za1.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i64 1, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @sumopa_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: sumopa_d:
; CHECK: // %bb.0:
; CHECK-NEXT: sumopa za3.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i64 3, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @usmopa_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: usmopa_s:
; CHECK: // %bb.0:
; CHECK-NEXT: usmopa za2.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i64 2, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @usmopa_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: usmopa_d:
; CHECK: // %bb.0:
; CHECK-NEXT: usmopa za7.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.usmopa.wide.nxv8i16(i64 7, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
attributes #0 = { "target-features"="+sme-i64" }
attributes #1 = { "target-features"="+sme-f64" }
declare void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.mopa.wide.nxv8f16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.mopa.nxv4f32(i64, <vscale x 4 x i1>, <vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.mopa.nxv2f64(i64, <vscale x 2 x i1>, <vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.smopa.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.smopa.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.umopa.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.umopa.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.usmopa.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)

@@ -0,0 +1,126 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
define void @bfmops(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: bfmops:
; CHECK: // %bb.0:
; CHECK-NEXT: bfmops za0.s, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i64 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
}
define void @fmops(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
; CHECK-LABEL: fmops:
; CHECK: // %bb.0:
; CHECK-NEXT: fmops za1.s, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mops.wide.nxv8f16(i64 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
}
define void @smops_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: smops_s:
; CHECK: // %bb.0:
; CHECK-NEXT: smops za2.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.smops.wide.nxv16i8(i64 2, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @smops_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: smops_d:
; CHECK: // %bb.0:
; CHECK-NEXT: smops za0.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.smops.wide.nxv8i16(i64 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @umops_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: umops_s:
; CHECK: // %bb.0:
; CHECK-NEXT: umops za3.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.umops.wide.nxv16i8(i64 3, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @umops_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: umops_d:
; CHECK: // %bb.0:
; CHECK-NEXT: umops za1.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.umops.wide.nxv8i16(i64 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @fmops_s(<vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm) {
; CHECK-LABEL: fmops_s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmops za0.s, p0/m, p1/m, z0.s, z1.s
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mops.nxv4f32(i64 0, <vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
ret void
}
define void @fmops_d(<vscale x 2 x i1> %pn, <vscale x 2 x i1> %pm, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm) #1 {
; CHECK-LABEL: fmops_d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmops za2.d, p0/m, p1/m, z0.d, z1.d
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.mops.nxv2f64(i64 2, <vscale x 2 x i1> %pn, <vscale x 2 x i1> %pm, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
ret void
}
define void @sumops_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: sumops_s:
; CHECK: // %bb.0:
; CHECK-NEXT: sumops za1.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i64 1, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @sumops_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: sumops_d:
; CHECK: // %bb.0:
; CHECK-NEXT: sumops za3.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i64 3, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @usmops_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: usmops_s:
; CHECK: // %bb.0:
; CHECK-NEXT: usmops za2.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i64 2, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @usmops_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: usmops_d:
; CHECK: // %bb.0:
; CHECK-NEXT: usmops za7.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i64 7, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
attributes #0 = { "target-features"="+sme-i64" }
attributes #1 = { "target-features"="+sme-f64" }
declare void @llvm.aarch64.sme.mops.wide.nxv8bf16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.mops.wide.nxv8f16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.mops.nxv4f32(i64, <vscale x 4 x i1>, <vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.mops.nxv2f64(i64, <vscale x 2 x i1>, <vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.smops.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.smops.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.umops.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.umops.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sumops.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sumops.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.usmops.wide.nxv16i8(i64, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.usmops.wide.nxv8i16(i64, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)