[AArch64][SME] Add load/store intrinsics

This patch adds implementations for the load/store SME ACLE intrinsics:
  - @llvm.aarch64.sme.ld1*
  - @llvm.aarch64.sme.st1*
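
As an illustration (the call shape exercised by the new tests), loading a
horizontal slice of the first byte tile under a governing predicate:

  call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> %pg, i8* %ptr,
                                         i64 0, i32 %slice)

where the operands are the predicate, the base pointer, the tile number
(which must be an immediate) and the 32-bit tile-slice index.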

Differential Revision: https://reviews.llvm.org/D127210
Author: Rosie Sumpter
Date:   2022-06-06 16:06:43 +01:00
Parent: 365d827f65
Commit: 2c4e44752d

11 changed files with 812 additions and 19 deletions

llvm/include/llvm/IR/Intrinsics.td

@@ -245,6 +245,7 @@ def llvm_i8_ty : LLVMType<i8>;
def llvm_i16_ty : LLVMType<i16>;
def llvm_i32_ty : LLVMType<i32>;
def llvm_i64_ty : LLVMType<i64>;
def llvm_i128_ty : LLVMType<i128>;
def llvm_half_ty : LLVMType<f16>;
def llvm_bfloat_ty : LLVMType<bf16>;
def llvm_float_ty : LLVMType<f32>;

llvm/include/llvm/IR/IntrinsicsAArch64.td

@@ -2583,3 +2583,46 @@ def int_aarch64_sve_whilewr_b : SVE2_CONFLICT_DETECT_Intrinsic;
def int_aarch64_sve_whilewr_h : SVE2_CONFLICT_DETECT_Intrinsic;
def int_aarch64_sve_whilewr_s : SVE2_CONFLICT_DETECT_Intrinsic;
def int_aarch64_sve_whilewr_d : SVE2_CONFLICT_DETECT_Intrinsic;

// Scalable Matrix Extension (SME) Intrinsics

let TargetPrefix = "aarch64" in {

class SME_Load_Store_B_Intrinsic
    : DefaultAttrsIntrinsic<[],
        [llvm_nxv16i1_ty, llvm_ptr_ty, llvm_i64_ty, llvm_i32_ty], []>;
class SME_Load_Store_H_Intrinsic
    : DefaultAttrsIntrinsic<[],
        [llvm_nxv16i1_ty, LLVMPointerType<llvm_i16_ty>, llvm_i64_ty, llvm_i32_ty], []>;
class SME_Load_Store_S_Intrinsic
    : DefaultAttrsIntrinsic<[],
        [llvm_nxv16i1_ty, LLVMPointerType<llvm_i32_ty>, llvm_i64_ty, llvm_i32_ty], []>;
class SME_Load_Store_D_Intrinsic
    : DefaultAttrsIntrinsic<[],
        [llvm_nxv16i1_ty, LLVMPointerType<llvm_i64_ty>, llvm_i64_ty, llvm_i32_ty], []>;
class SME_Load_Store_Q_Intrinsic
    : DefaultAttrsIntrinsic<[],
        [llvm_nxv16i1_ty, LLVMPointerType<llvm_i128_ty>, llvm_i64_ty, llvm_i32_ty], []>;

// Loads
def int_aarch64_sme_ld1b_horiz : SME_Load_Store_B_Intrinsic;
def int_aarch64_sme_ld1h_horiz : SME_Load_Store_H_Intrinsic;
def int_aarch64_sme_ld1w_horiz : SME_Load_Store_S_Intrinsic;
def int_aarch64_sme_ld1d_horiz : SME_Load_Store_D_Intrinsic;
def int_aarch64_sme_ld1q_horiz : SME_Load_Store_Q_Intrinsic;
def int_aarch64_sme_ld1b_vert : SME_Load_Store_B_Intrinsic;
def int_aarch64_sme_ld1h_vert : SME_Load_Store_H_Intrinsic;
def int_aarch64_sme_ld1w_vert : SME_Load_Store_S_Intrinsic;
def int_aarch64_sme_ld1d_vert : SME_Load_Store_D_Intrinsic;
def int_aarch64_sme_ld1q_vert : SME_Load_Store_Q_Intrinsic;
// Stores
def int_aarch64_sme_st1b_horiz : SME_Load_Store_B_Intrinsic;
def int_aarch64_sme_st1h_horiz : SME_Load_Store_H_Intrinsic;
def int_aarch64_sme_st1w_horiz : SME_Load_Store_S_Intrinsic;
def int_aarch64_sme_st1d_horiz : SME_Load_Store_D_Intrinsic;
def int_aarch64_sme_st1q_horiz : SME_Load_Store_Q_Intrinsic;
def int_aarch64_sme_st1b_vert : SME_Load_Store_B_Intrinsic;
def int_aarch64_sme_st1h_vert : SME_Load_Store_H_Intrinsic;
def int_aarch64_sme_st1w_vert : SME_Load_Store_S_Intrinsic;
def int_aarch64_sme_st1d_vert : SME_Load_Store_D_Intrinsic;
def int_aarch64_sme_st1q_vert : SME_Load_Store_Q_Intrinsic;
}

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

@@ -278,6 +278,15 @@ public:
    return false;
  }

  template <unsigned BaseReg> bool ImmToTile(SDValue N, SDValue &Imm) {
    if (auto *CI = dyn_cast<ConstantSDNode>(N)) {
      uint64_t C = CI->getZExtValue();
      Imm = CurDAG->getRegister(BaseReg + C, MVT::Other);
      return true;
    }
    return false;
  }
/// Form sequences of consecutive 64/128-bit registers for use in NEON
/// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
/// between 1 and 4 elements. If it contains a single element that is returned
@@ -321,6 +330,11 @@ public:
    return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
  }

  template <unsigned Scale>
  bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
    return SelectSMETileSlice(N, Scale, Vector, Offset);
  }

  void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
  void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
  void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
@@ -389,6 +403,8 @@ private:
  bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
  bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
                               SDValue &Offset);
  bool SelectSMETileSlice(SDValue N, unsigned Scale, SDValue &Vector,
                          SDValue &Offset);

  bool SelectAllActivePredicate(SDValue N);
};
@@ -5224,3 +5240,27 @@ bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) {
  return TLI->isAllActivePredicate(*CurDAG, N);
}

bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned Scale,
                                             SDValue &Vector, SDValue &Offset) {
  // Match an ADD of a slice-index register and a constant that fits the
  // instruction's offset immediate, i.e. 0 .. (2^Scale - 1).
  if (N.getOpcode() != ISD::ADD)
    return false;

  const SDValue LHS = N.getOperand(0);
  const SDValue RHS = N.getOperand(1);
  if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
    int64_t ImmOff = C->getSExtValue();
    unsigned MaxSize = (1 << Scale) - 1;
    if (ImmOff < 0 || ImmOff > MaxSize)
      return false;
    Vector = LHS;
    Offset = CurDAG->getTargetConstant(ImmOff, SDLoc(N), MVT::i64);
    return true;
  }

  return false;
}
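
For example (lifted from the tests added below), an add of an in-range
constant on the slice index is folded into the offset immediate:

  %tileslice = add i32 %sliceidx, 15
  call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 %tileslice)
  ; selects to: ld1b {za0h.b[w12, 15]}, p0/z, [x0]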

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

@@ -2323,6 +2323,24 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
  return BB;
}
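
// Expand a tile load pseudo into the corresponding target instruction. The
// pseudo's first operand is the tile number, which selects the register the
// expanded instruction defines relative to BaseReg: e.g. (illustrative)
// LD1_MXIPXX_H_PSEUDO_H with a tile immediate of 1 becomes LD1_MXIPXX_H_H
// defining ZAH1 (AArch64::ZAH0 + 1).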
MachineBasicBlock *
AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                    MachineInstr &MI,
                                    MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));

  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
  MIB.add(MI.getOperand(1)); // slice index register
  MIB.add(MI.getOperand(2)); // slice index offset
  MIB.add(MI.getOperand(3)); // pg
  MIB.add(MI.getOperand(4)); // base
  MIB.add(MI.getOperand(5)); // offset

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
switch (MI.getOpcode()) {
@@ -2353,6 +2371,26 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
  case AArch64::CATCHRET:
    return EmitLoweredCatchRet(MI, BB);

  case AArch64::LD1_MXIPXX_H_PSEUDO_B:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
  case AArch64::LD1_MXIPXX_H_PSEUDO_H:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
  case AArch64::LD1_MXIPXX_H_PSEUDO_S:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
  case AArch64::LD1_MXIPXX_H_PSEUDO_D:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
  case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_B:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_H:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_S:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_D:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
  }
}

llvm/lib/Target/AArch64/AArch64ISelLowering.h

@@ -556,6 +556,10 @@ public:
  MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
                                         MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                  MachineInstr &MI,
                                  MachineBasicBlock *BB) const;

  MachineBasicBlock *
  EmitInstrWithCustomInserter(MachineInstr &MI,
                              MachineBasicBlock *MBB) const override;

llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp

@@ -338,6 +338,13 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
    markSuperRegs(Reserved, AArch64::W16);

  // SME tiles are not allocatable.
  if (MF.getSubtarget<AArch64Subtarget>().hasSME()) {
    for (MCSubRegIterator SubReg(AArch64::ZA, this, /*self=*/true);
         SubReg.isValid(); ++SubReg)
      Reserved.set(*SubReg);
  }

  assert(checkAllSuperRegsMarked(Reserved));
  return Reserved;
}

llvm/lib/Target/AArch64/AArch64RegisterInfo.td

@@ -1212,26 +1212,28 @@ let SubRegIndices = [zasubb] in {
 // SME Register Classes

-// Accumulator array
-def MPR : RegisterClass<"AArch64", [untyped], 2048, (add ZA)> {
-  let Size = 2048;
-}
+let isAllocatable = 0 in {
+// Accumulator array
+def MPR : RegisterClass<"AArch64", [untyped], 2048, (add ZA)> {
+  let Size = 2048;
+}

-// Accumulator array as single tiles
-def MPR8  : RegisterClass<"AArch64", [untyped], 2048, (add (sequence "ZAB%u", 0, 0))> {
-  let Size = 2048;
-}
-def MPR16 : RegisterClass<"AArch64", [untyped], 1024, (add (sequence "ZAH%u", 0, 1))> {
-  let Size = 1024;
-}
-def MPR32 : RegisterClass<"AArch64", [untyped], 512, (add (sequence "ZAS%u", 0, 3))> {
-  let Size = 512;
-}
-def MPR64 : RegisterClass<"AArch64", [untyped], 256, (add (sequence "ZAD%u", 0, 7))> {
-  let Size = 256;
-}
-def MPR128 : RegisterClass<"AArch64", [untyped], 128, (add (sequence "ZAQ%u", 0, 15))> {
-  let Size = 128;
+// Accumulator array as single tiles
+def MPR8  : RegisterClass<"AArch64", [untyped], 2048, (add (sequence "ZAB%u", 0, 0))> {
+  let Size = 2048;
+}
+def MPR16 : RegisterClass<"AArch64", [untyped], 1024, (add (sequence "ZAH%u", 0, 1))> {
+  let Size = 1024;
+}
+def MPR32 : RegisterClass<"AArch64", [untyped], 512, (add (sequence "ZAS%u", 0, 3))> {
+  let Size = 512;
+}
+def MPR64 : RegisterClass<"AArch64", [untyped], 256, (add (sequence "ZAD%u", 0, 7))> {
+  let Size = 256;
+}
+def MPR128 : RegisterClass<"AArch64", [untyped], 128, (add (sequence "ZAQ%u", 0, 15))> {
+  let Size = 128;
+}
 }
// SME Register Operands

llvm/lib/Target/AArch64/SMEInstrFormats.td

@@ -10,6 +10,18 @@
//
//===----------------------------------------------------------------------===//

def imm_to_tile8   : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAB0>", []>;
def imm_to_tile16  : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAH0>", []>;
def imm_to_tile32  : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAS0>", []>;
def imm_to_tile64  : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAD0>", []>;
def imm_to_tile128 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAQ0>", []>;

def tileslice8   : ComplexPattern<i32, 2, "SelectSMETileSlice<4>", []>;
def tileslice16  : ComplexPattern<i32, 2, "SelectSMETileSlice<3>", []>;
def tileslice32  : ComplexPattern<i32, 2, "SelectSMETileSlice<2>", []>;
def tileslice64  : ComplexPattern<i32, 2, "SelectSMETileSlice<1>", []>;
def tileslice128 : ComplexPattern<i32, 2, "SelectSMETileSlice<0>", []>; // nop
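// The SelectSMETileSlice<N> template argument bounds the slice offset
// immediate to 0..(2^N)-1, matching the slice count per element size:
// 0-15 for .b, 0-7 for .h, 0-3 for .s, 0-1 for .d and only 0 for .q
// (hence the "nop" above).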
//===----------------------------------------------------------------------===//
// SME Outer Products
//===----------------------------------------------------------------------===//
@@ -233,6 +245,45 @@ multiclass sme_mem_ld_ss_aliases<string inst, bit is_col> {
  defm NAME : sme_mem_ss_aliases<"ld1", inst, is_col, "/z">;
}

multiclass sme_mem_ld_ss_patterns<Instruction Inst, SDPatternOperator Load,
                                  Operand tile_ty, Operand offset_ty,
                                  ComplexPattern addr,
                                  ComplexPattern tileslice> {
  // base
  def : Pat<(Load PPR3bAny:$pg, GPR64sp:$base, tile_ty:$tile,
                  MatrixIndexGPR32Op12_15:$idx),
            (Inst tile_ty:$tile, $idx, 0, $pg, $base, XZR)>;

  // reg + reg
  let AddedComplexity = 1 in {
    def : Pat<(Load PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset),
                    tile_ty:$tile, MatrixIndexGPR32Op12_15:$idx),
              (Inst tile_ty:$tile, $idx, 0, $pg, $base, $offset)>;
  }

  // base, tileslice
  let AddedComplexity = 1 in {
    def : Pat<(Load PPR3bAny:$pg, GPR64sp:$base, tile_ty:$tile,
                    (i32 (tileslice MatrixIndexGPR32Op12_15:$idx,
                                    offset_ty:$imm))),
              (Inst tile_ty:$tile, $idx, $imm, $pg, $base, XZR)>;
  }

  // reg + reg, tileslice
  let AddedComplexity = 2 in {
    def : Pat<(Load PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset),
                    tile_ty:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$idx,
                                                   offset_ty:$imm))),
              (Inst tile_ty:$tile, $idx, $imm, $pg, $base, $offset)>;
  }
}

class sme_load_pseudo
    : Pseudo<(outs), (ins i64imm:$tile, MatrixIndexGPR32Op12_15:$idx,
                          i64imm:$imm, PPR3bAny:$pg, GPR64sp:$base,
                          GPR64:$offset), []>,
      Sched<[]> {
  // Translated to the actual instructions in AArch64ISelLowering.cpp.
  let usesCustomInserter = 1;
  let mayLoad = 1;
}
multiclass sme_mem_ld_v_ss<string mnemonic, bit is_col> {
def _B : sme_mem_ld_ss_inst<0b0, 0b00, mnemonic # "b",
!if(is_col, TileVectorOpV8, TileVectorOpH8),
@@ -272,6 +323,40 @@
}
  defm : sme_mem_ld_ss_aliases<NAME, is_col>;

  // Pseudo instructions for lowering intrinsics, using immediates instead of
  // tile registers.
  def _PSEUDO_B : sme_load_pseudo;
  def _PSEUDO_H : sme_load_pseudo;
  def _PSEUDO_S : sme_load_pseudo;
  def _PSEUDO_D : sme_load_pseudo;
  def _PSEUDO_Q : sme_load_pseudo;

  defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_B),
                                !if(is_col, int_aarch64_sme_ld1b_vert,
                                            int_aarch64_sme_ld1b_horiz),
                                sme_elm_idx0_0, imm0_15, am_sve_regreg_lsl0,
                                tileslice8>;
  defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_H),
                                !if(is_col, int_aarch64_sme_ld1h_vert,
                                            int_aarch64_sme_ld1h_horiz),
                                imm0_1, imm0_7, am_sve_regreg_lsl1,
                                tileslice16>;
  defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_S),
                                !if(is_col, int_aarch64_sme_ld1w_vert,
                                            int_aarch64_sme_ld1w_horiz),
                                imm0_3, imm0_3, am_sve_regreg_lsl2,
                                tileslice32>;
  defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_D),
                                !if(is_col, int_aarch64_sme_ld1d_vert,
                                            int_aarch64_sme_ld1d_horiz),
                                imm0_7, imm0_1, am_sve_regreg_lsl3,
                                tileslice64>;
  defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
                                !if(is_col, int_aarch64_sme_ld1q_vert,
                                            int_aarch64_sme_ld1q_horiz),
                                imm0_15, sme_elm_idx0_0, am_sve_regreg_lsl4,
                                tileslice128>;
}
multiclass sme_mem_ld_ss<string mnemonic> {
@@ -318,6 +403,36 @@ multiclass sme_mem_st_ss_aliases<string inst, bit is_col> {
  defm NAME : sme_mem_ss_aliases<"st1", inst, is_col>;
}
multiclass sme_mem_st_ss_patterns<Instruction Inst, SDPatternOperator Store,
                                  Operand offset_ty,
                                  ComplexPattern imm2tile,
                                  ComplexPattern addr,
                                  ComplexPattern tileslice> {
  // base
  def : Pat<(Store PPR3bAny:$pg, GPR64sp:$base, (imm2tile untyped:$tile),
                   MatrixIndexGPR32Op12_15:$idx),
            (Inst $tile, $idx, 0, $pg, $base, XZR)>;

  // reg + reg
  let AddedComplexity = 1 in {
    def : Pat<(Store PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset),
                     (imm2tile untyped:$tile), MatrixIndexGPR32Op12_15:$idx),
              (Inst $tile, $idx, 0, $pg, $base, $offset)>;
  }

  // base, tileslice
  let AddedComplexity = 1 in {
    def : Pat<(Store PPR3bAny:$pg, GPR64sp:$base, (imm2tile untyped:$tile),
                     (i32 (tileslice MatrixIndexGPR32Op12_15:$idx,
                                     offset_ty:$imm))),
              (Inst $tile, $idx, $imm, $pg, $base, XZR)>;
  }

  // reg + reg, tileslice
  let AddedComplexity = 2 in {
    def : Pat<(Store PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset),
                     (imm2tile untyped:$tile),
                     (i32 (tileslice MatrixIndexGPR32Op12_15:$idx,
                                     offset_ty:$imm))),
              (Inst $tile, $idx, $imm, $pg, $base, $offset)>;
  }
}
multiclass sme_mem_st_v_ss<string mnemonic, bit is_col> {
def _B : sme_mem_st_ss_inst<0b0, 0b00, mnemonic # "b",
!if(is_col, TileVectorOpV8, TileVectorOpH8),
@@ -357,6 +472,32 @@
}
  defm : sme_mem_st_ss_aliases<NAME, is_col>;

  defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _B),
                                !if(is_col, int_aarch64_sme_st1b_vert,
                                            int_aarch64_sme_st1b_horiz),
                                imm0_15, imm_to_tile8, am_sve_regreg_lsl0,
                                tileslice8>;
  defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _H),
                                !if(is_col, int_aarch64_sme_st1h_vert,
                                            int_aarch64_sme_st1h_horiz),
                                imm0_7, imm_to_tile16, am_sve_regreg_lsl1,
                                tileslice16>;
  defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _S),
                                !if(is_col, int_aarch64_sme_st1w_vert,
                                            int_aarch64_sme_st1w_horiz),
                                imm0_3, imm_to_tile32, am_sve_regreg_lsl2,
                                tileslice32>;
  defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _D),
                                !if(is_col, int_aarch64_sme_st1d_vert,
                                            int_aarch64_sme_st1d_horiz),
                                imm0_1, imm_to_tile64, am_sve_regreg_lsl3,
                                tileslice64>;
  defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _Q),
                                !if(is_col, int_aarch64_sme_st1q_vert,
                                            int_aarch64_sme_st1q_horiz),
                                sme_elm_idx0_0, imm_to_tile128,
                                am_sve_regreg_lsl4, tileslice128>;
}
multiclass sme_mem_st_ss<string mnemonic> {

llvm/lib/Target/AArch64/SVEInstrFormats.td

@@ -8489,6 +8489,7 @@ def am_sve_regreg_lsl0 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<0>", [
def am_sve_regreg_lsl1 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<1>", []>;
def am_sve_regreg_lsl2 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<2>", []>;
def am_sve_regreg_lsl3 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<3>", []>;
def am_sve_regreg_lsl4 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<4>", []>;
// Predicated pseudo floating point two operand instructions.
multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> {

llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll

@@ -0,0 +1,258 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
define void @ld1b(<vscale x 16 x i1> %pg, i8* %ptr, i32 %sliceidx) {
; CHECK-LABEL: ld1b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w1
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: ld1b {za0h.b[w12, 15]}, p0/z, [x0]
; CHECK-NEXT: ld1b {za0v.b[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ret
%tileslice = add i32 %sliceidx, 15
call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 %tileslice)
call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 0)
ret void;
}
define void @ld1b_with_addr_offset(<vscale x 16 x i1> %pg, i8* %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: ld1b_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: mov w13, w2
; CHECK-NEXT: ld1b {za0h.b[w12, 0]}, p0/z, [x0, x1]
; CHECK-NEXT: ld1b {za0v.b[w13, 15]}, p0/z, [x0, x1]
; CHECK-NEXT: ret
%base = getelementptr i8, i8* %ptr, i64 %index
%tileslice = add i32 %sliceidx, 15
call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> %pg, i8* %base, i64 0, i32 0)
call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> %pg, i8* %base, i64 0, i32 %tileslice)
ret void;
}
define void @ld1h(<vscale x 16 x i1> %pg, i16* %ptr, i32 %sliceidx) {
; CHECK-LABEL: ld1h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w1
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: ld1h {za0h.h[w12, 7]}, p0/z, [x0]
; CHECK-NEXT: ld1h {za1h.h[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1h {za0v.h[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1h {za1v.h[w12, 7]}, p0/z, [x0]
; CHECK-NEXT: ret
%tileslice = add i32 %sliceidx, 7
call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> %pg, i16* %ptr, i64 0, i32 %tileslice)
call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> %pg, i16* %ptr, i64 1, i32 0)
call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> %pg, i16* %ptr, i64 0, i32 0)
call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> %pg, i16* %ptr, i64 1, i32 %tileslice)
ret void;
}
define void @ld1h_with_addr_offset(<vscale x 16 x i1> %pg, i16* %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: ld1h_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w2
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: ld1h {za0h.h[w12, 7]}, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ld1h {za1v.h[w13, 0]}, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%base = getelementptr i16, i16* %ptr, i64 %index
%tileslice = add i32 %sliceidx, 7
call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> %pg, i16* %base, i64 0, i32 %tileslice)
call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> %pg, i16* %base, i64 1, i32 0)
ret void;
}
define void @ld1w(<vscale x 16 x i1> %pg, i32* %ptr, i32 %sliceidx) {
; CHECK-LABEL: ld1w:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: mov w13, w1
; CHECK-NEXT: ld1w {za0h.s[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za1h.s[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za2h.s[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za3h.s[w13, 3]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za0v.s[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za1v.s[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za2v.s[w13, 3]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za3v.s[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ret
%tileslice = add i32 %sliceidx, 3
call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 0, i32 0)
call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 1, i32 0)
call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 2, i32 0)
call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 3, i32 %tileslice)
call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 0, i32 0)
call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 1, i32 0)
call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 2, i32 %tileslice)
call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 3, i32 0)
ret void;
}
define void @ld1w_with_addr_offset(<vscale x 16 x i1> %pg, i32* %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: ld1w_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w2
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: ld1w {za0h.s[w13, 0]}, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ld1w {za3v.s[w12, 3]}, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%base = getelementptr i32, i32* %ptr, i64 %index
%tileslice = add i32 %sliceidx, 3
call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %base, i64 0, i32 0)
call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %base, i64 3, i32 %tileslice)
ret void;
}
define void @ld1d(<vscale x 16 x i1> %pg, i64* %ptr, i32 %sliceidx) {
; CHECK-LABEL: ld1d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: mov w12, w1
; CHECK-NEXT: ld1d {za0h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za1h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za2h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za3h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za4h.d[w12, 1]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za5h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za6h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za7h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za0v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za1v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za2v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za3v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za4v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za5v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za6v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za7v.d[w12, 1]}, p0/z, [x0]
; CHECK-NEXT: ret
%tileslice = add i32 %sliceidx, 1
call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 0, i32 0)
call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 1, i32 0)
call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 2, i32 0)
call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 3, i32 0)
call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 4, i32 %tileslice)
call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 5, i32 0)
call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 6, i32 0)
call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 7, i32 0)
call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 0, i32 0)
call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 1, i32 0)
call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 2, i32 0)
call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 3, i32 0)
call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 4, i32 0)
call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 5, i32 0)
call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 6, i32 0)
call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 7, i32 %tileslice)
ret void;
}
define void @ld1d_with_addr_offset(<vscale x 16 x i1> %pg, i64* %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: ld1d_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w2
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: ld1d {za0h.d[w12, 1]}, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ld1d {za7v.d[w13, 0]}, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
%base = getelementptr i64, i64* %ptr, i64 %index
%tileslice = add i32 %sliceidx, 1
call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %base, i64 0, i32 %tileslice)
call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %base, i64 7, i32 0)
ret void;
}
define void @ld1q(<vscale x 16 x i1> %pg, i128* %ptr) {
; CHECK-LABEL: ld1q:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: ld1q {za0h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za1h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za2h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za3h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za4h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za5h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za6h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za7h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za8h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za9h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za10h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za11h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za12h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za13h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za14h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za15h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za0v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za1v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za2v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za3v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za4v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za5v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za6v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za7v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za8v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za9v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za10v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za11v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za12v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za13v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za14v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za15v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 0, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 1, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 2, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 3, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 4, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 5, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 6, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 7, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 8, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 9, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 10, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 11, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 12, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 13, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 14, i32 0)
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 15, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 0, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 1, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 2, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 3, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 4, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 5, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 6, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 7, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 8, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 9, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 10, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 11, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 12, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 13, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 14, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 15, i32 0)
ret void;
}
define void @ld1q_with_addr_offset(<vscale x 16 x i1> %pg, i128* %ptr, i64 %index) {
; CHECK-LABEL: ld1q_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: ld1q {za0h.q[w12, 0]}, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT: ld1q {za15v.q[w12, 0]}, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT: ret
%base = getelementptr i128, i128* %ptr, i64 %index
call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %base, i64 0, i32 0)
call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %base, i64 15, i32 0)
ret void;
}
declare void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1>, i8*, i64, i32)
declare void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1>, i16*, i64, i32)
declare void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1>, i32*, i64, i32)
declare void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1>, i64*, i64, i32)
declare void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1>, i128*, i64, i32)
declare void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1>, i8*, i64, i32)
declare void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1>, i16*, i64, i32)
declare void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1>, i32*, i64, i32)
declare void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1>, i64*, i64, i32)
declare void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1>, i128*, i64, i32)

llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll

@@ -0,0 +1,258 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
define void @st1b(<vscale x 16 x i1> %pg, i8* %ptr, i32 %sliceidx) {
; CHECK-LABEL: st1b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w1
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: st1b {za0h.b[w12, 15]}, p0, [x0]
; CHECK-NEXT: st1b {za0v.b[w13, 0]}, p0, [x0]
; CHECK-NEXT: ret
%tileslice = add i32 %sliceidx, 15
call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 %tileslice)
call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 0)
ret void;
}
define void @st1b_with_addr_offset(<vscale x 16 x i1> %pg, i8* %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: st1b_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: mov w13, w2
; CHECK-NEXT: st1b {za0h.b[w12, 0]}, p0, [x0, x1]
; CHECK-NEXT: st1b {za0v.b[w13, 15]}, p0, [x0, x1]
; CHECK-NEXT: ret
%base = getelementptr i8, i8* %ptr, i64 %index
%tileslice = add i32 %sliceidx, 15
call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> %pg, i8* %base, i64 0, i32 0)
call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> %pg, i8* %base, i64 0, i32 %tileslice)
ret void;
}
define void @st1h(<vscale x 16 x i1> %pg, i16* %ptr, i32 %sliceidx) {
; CHECK-LABEL: st1h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w1
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: st1h {za0h.h[w12, 7]}, p0, [x0]
; CHECK-NEXT: st1h {za1h.h[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1h {za0v.h[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1h {za1v.h[w12, 7]}, p0, [x0]
; CHECK-NEXT: ret
%tileslice = add i32 %sliceidx, 7
call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> %pg, i16* %ptr, i64 0, i32 %tileslice)
call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> %pg, i16* %ptr, i64 1, i32 0)
call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> %pg, i16* %ptr, i64 0, i32 0)
call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> %pg, i16* %ptr, i64 1, i32 %tileslice)
ret void;
}
define void @st1h_with_addr_offset(<vscale x 16 x i1> %pg, i16* %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: st1h_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w2
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: st1h {za0h.h[w12, 7]}, p0, [x0, x1, lsl #1]
; CHECK-NEXT: st1h {za1v.h[w13, 0]}, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%base = getelementptr i16, i16* %ptr, i64 %index
%tileslice = add i32 %sliceidx, 7
call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> %pg, i16* %base, i64 0, i32 %tileslice)
call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> %pg, i16* %base, i64 1, i32 0)
ret void;
}
define void @st1w(<vscale x 16 x i1> %pg, i32* %ptr, i32 %sliceidx) {
; CHECK-LABEL: st1w:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: mov w12, w1
; CHECK-NEXT: st1w {za0h.s[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1w {za1h.s[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1w {za2h.s[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1w {za3h.s[w12, 3]}, p0, [x0]
; CHECK-NEXT: st1w {za0v.s[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1w {za1v.s[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1w {za2v.s[w12, 3]}, p0, [x0]
; CHECK-NEXT: st1w {za3v.s[w13, 0]}, p0, [x0]
; CHECK-NEXT: ret
%tileslice = add i32 %sliceidx, 3
call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 0, i32 0)
call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 1, i32 0)
call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 2, i32 0)
call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 3, i32 %tileslice)
call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 0, i32 0)
call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 1, i32 0)
call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 2, i32 %tileslice)
call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 3, i32 0)
ret void;
}
define void @st1w_with_addr_offset(<vscale x 16 x i1> %pg, i32* %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: st1w_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: mov w13, w2
; CHECK-NEXT: st1w {za0h.s[w12, 0]}, p0, [x0, x1, lsl #2]
; CHECK-NEXT: st1w {za3v.s[w13, 3]}, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%base = getelementptr i32, i32* %ptr, i64 %index
%tileslice = add i32 %sliceidx, 3
call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %base, i64 0, i32 0)
call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %base, i64 3, i32 %tileslice)
ret void;
}
define void @st1d(<vscale x 16 x i1> %pg, i64* %ptr, i32 %sliceidx) {
; CHECK-LABEL: st1d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: mov w12, w1
; CHECK-NEXT: st1d {za0h.d[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1d {za1h.d[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1d {za2h.d[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1d {za3h.d[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1d {za4h.d[w12, 1]}, p0, [x0]
; CHECK-NEXT: st1d {za5h.d[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1d {za6h.d[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1d {za7h.d[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1d {za0v.d[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1d {za1v.d[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1d {za2v.d[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1d {za3v.d[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1d {za4v.d[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1d {za5v.d[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1d {za6v.d[w13, 0]}, p0, [x0]
; CHECK-NEXT: st1d {za7v.d[w12, 1]}, p0, [x0]
; CHECK-NEXT: ret
%tileslice = add i32 %sliceidx, 1
call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 0, i32 0)
call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 1, i32 0)
call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 2, i32 0)
call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 3, i32 0)
call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 4, i32 %tileslice)
call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 5, i32 0)
call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 6, i32 0)
call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 7, i32 0)
call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 0, i32 0)
call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 1, i32 0)
call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 2, i32 0)
call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 3, i32 0)
call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 4, i32 0)
call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 5, i32 0)
call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 6, i32 0)
call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 7, i32 %tileslice)
ret void;
}
define void @st1d_with_addr_offset(<vscale x 16 x i1> %pg, i64* %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: st1d_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w2
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: st1d {za0h.d[w12, 1]}, p0, [x0, x1, lsl #3]
; CHECK-NEXT: st1d {za7v.d[w13, 0]}, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
%base = getelementptr i64, i64* %ptr, i64 %index
%tileslice = add i32 %sliceidx, 1
call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %base, i64 0, i32 %tileslice)
call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %base, i64 7, i32 0)
ret void;
}
define void @st1q(<vscale x 16 x i1> %pg, i128* %ptr) {
; CHECK-LABEL: st1q:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: st1q {za0h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za1h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za2h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za3h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za4h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za5h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za6h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za7h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za8h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za9h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za10h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za11h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za12h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za13h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za14h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za15h.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za0v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za1v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za2v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za3v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za4v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za5v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za6v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za7v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za8v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za9v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za10v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za11v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za12v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za13v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za14v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: st1q {za15v.q[w12, 0]}, p0, [x0]
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 0, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 1, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 2, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 3, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 4, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 5, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 6, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 7, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 8, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 9, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 10, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 11, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 12, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 13, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 14, i32 0)
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 15, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 0, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 1, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 2, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 3, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 4, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 5, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 6, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 7, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 8, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 9, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 10, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 11, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 12, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 13, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 14, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 15, i32 0)
ret void;
}
define void @st1q_with_addr_offset(<vscale x 16 x i1> %pg, i128* %ptr, i64 %index) {
; CHECK-LABEL: st1q_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: st1q {za0h.q[w12, 0]}, p0, [x0, x1, lsl #4]
; CHECK-NEXT: st1q {za15v.q[w12, 0]}, p0, [x0, x1, lsl #4]
; CHECK-NEXT: ret
%base = getelementptr i128, i128* %ptr, i64 %index
call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %base, i64 0, i32 0)
call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %base, i64 15, i32 0)
ret void;
}
declare void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1>, i8*, i64, i32)
declare void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1>, i16*, i64, i32)
declare void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1>, i32*, i64, i32)
declare void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1>, i64*, i64, i32)
declare void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1>, i128*, i64, i32)
declare void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1>, i8*, i64, i32)
declare void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1>, i16*, i64, i32)
declare void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1>, i32*, i64, i32)
declare void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1>, i64*, i64, i32)
declare void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1>, i128*, i64, i32)