[AArch64][SVE] Add unpredicated ld1/st1 patterns for reg+reg addressing modes

Differential Revision: https://reviews.llvm.org/D95677
This commit is contained in:
Bradley Smith 2021-01-28 12:39:39 +00:00
parent 75a184dacf
commit d09ae9328f
7 changed files with 777 additions and 195 deletions

View File

@ -1891,13 +1891,18 @@ let Predicates = [HasSVE] in {
defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRR, STNT1W_ZRI, am_sve_regreg_lsl2>;
defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRR, STNT1D_ZRI, am_sve_regreg_lsl3>;
multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegImmInst,
Instruction PTrue> {
multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegRegInst,
Instruction RegImmInst, Instruction PTrue,
ComplexPattern AddrCP> {
let AddedComplexity = 1 in {
def _reg : Pat<(Store (Ty ZPR:$val), (AddrCP GPR64sp:$base, GPR64:$offset)),
(RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
}
let AddedComplexity = 2 in {
def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
(RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
}
let AddedComplexity = 2 in {
let AddedComplexity = 3 in {
def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
(RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
}
@ -1906,32 +1911,36 @@ let Predicates = [HasSVE] in {
(RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
}
defm : unpred_store< store, nxv16i8, ST1B_IMM, PTRUE_B>;
defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H_IMM, PTRUE_H>;
defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S_IMM, PTRUE_S>;
defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv8i16, ST1H_IMM, PTRUE_H>;
defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S_IMM, PTRUE_S>;
defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv4i32, ST1W_IMM, PTRUE_S>;
defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv2i64, ST1D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv8f16, ST1H_IMM, PTRUE_H>;
defm : unpred_store< store, nxv8bf16, ST1H_IMM, PTRUE_H>;
defm : unpred_store< store, nxv4f16, ST1H_S_IMM, PTRUE_S>;
defm : unpred_store< store, nxv2f16, ST1H_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv4f32, ST1W_IMM, PTRUE_S>;
defm : unpred_store< store, nxv2f32, ST1W_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv2f64, ST1D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv16i8, ST1B, ST1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H, ST1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S, ST1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D, ST1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
defm : unpred_store< store, nxv8i16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
defm : unpred_store< store, nxv4i32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
defm : unpred_store< store, nxv2i64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
defm : unpred_store< store, nxv8f16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
defm : unpred_store< store, nxv8bf16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
defm : unpred_store< store, nxv4f16, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
defm : unpred_store< store, nxv2f16, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
defm : unpred_store< store, nxv4f32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
defm : unpred_store< store, nxv2f32, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
defm : unpred_store< store, nxv2f64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegImmInst,
Instruction PTrue> {
multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegRegInst,
Instruction RegImmInst, Instruction PTrue,
ComplexPattern AddrCP> {
let AddedComplexity = 1 in {
def _reg: Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))),
(RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
}
let AddedComplexity = 2 in {
def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
(RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
}
let AddedComplexity = 2 in {
let AddedComplexity = 3 in {
def _fi : Pat<(Ty (Load (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
(RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
}
@ -1940,35 +1949,35 @@ let Predicates = [HasSVE] in {
(RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
}
defm : unpred_load< load, nxv16i8, LD1B_IMM, PTRUE_B>;
defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>;
defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>;
defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>;
defm : unpred_load< extloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>;
defm : unpred_load< extloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>;
defm : unpred_load< extloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>;
defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H_IMM, PTRUE_H>;
defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S_IMM, PTRUE_S>;
defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv8i16, LD1H_IMM, PTRUE_H>;
defm : unpred_load<zextloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>;
defm : unpred_load<zextloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>;
defm : unpred_load< extloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>;
defm : unpred_load< extloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>;
defm : unpred_load<sextloadvi16, nxv4i32, LD1SH_S_IMM, PTRUE_S>;
defm : unpred_load<sextloadvi16, nxv2i64, LD1SH_D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv4i32, LD1W_IMM, PTRUE_S>;
defm : unpred_load<zextloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>;
defm : unpred_load< extloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>;
defm : unpred_load<sextloadvi32, nxv2i64, LD1SW_D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv2i64, LD1D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv8f16, LD1H_IMM, PTRUE_H>;
defm : unpred_load< load, nxv8bf16, LD1H_IMM, PTRUE_H>;
defm : unpred_load< load, nxv4f16, LD1H_S_IMM, PTRUE_S>;
defm : unpred_load< load, nxv2f16, LD1H_D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv4f32, LD1W_IMM, PTRUE_S>;
defm : unpred_load< load, nxv2f32, LD1W_D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv2f64, LD1D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv16i8, LD1B, LD1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H, LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S, LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D, LD1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
defm : unpred_load< extloadvi8, nxv8i16, LD1B_H, LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
defm : unpred_load< extloadvi8, nxv4i32, LD1B_S, LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
defm : unpred_load< extloadvi8, nxv2i64, LD1B_D, LD1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H, LD1SB_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S, LD1SB_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D, LD1SB_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
defm : unpred_load< load, nxv8i16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
defm : unpred_load<zextloadvi16, nxv4i32, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
defm : unpred_load<zextloadvi16, nxv2i64, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
defm : unpred_load< extloadvi16, nxv4i32, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
defm : unpred_load< extloadvi16, nxv2i64, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
defm : unpred_load<sextloadvi16, nxv4i32, LD1SH_S, LD1SH_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
defm : unpred_load<sextloadvi16, nxv2i64, LD1SH_D, LD1SH_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
defm : unpred_load< load, nxv4i32, LD1W, LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
defm : unpred_load<zextloadvi32, nxv2i64, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
defm : unpred_load< extloadvi32, nxv2i64, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
defm : unpred_load<sextloadvi32, nxv2i64, LD1SW_D, LD1SW_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
defm : unpred_load< load, nxv2i64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
defm : unpred_load< load, nxv8f16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
defm : unpred_load< load, nxv8bf16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
defm : unpred_load< load, nxv4f16, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
defm : unpred_load< load, nxv2f16, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
defm : unpred_load< load, nxv4f32, LD1W, LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
defm : unpred_load< load, nxv2f32, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
defm : unpred_load< load, nxv2f64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),

View File

@ -14,14 +14,13 @@ define <vscale x 16 x i8> @splice_nxv16i8_first_idx(<vscale x 16 x i8> %a, <vsca
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: rdvl x9, #1
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: cmp x9, #0 // =0
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x9, xzr, lo
; CHECK-NEXT: cmp x9, #0 // =0
; CHECK-NEXT: st1b { z0.b }, p0, [sp]
; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
; CHECK-NEXT: csel x9, x9, xzr, lo
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -36,15 +35,14 @@ define <vscale x 16 x i8> @splice_nxv16i8_last_idx(<vscale x 16 x i8> %a, <vscal
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: rdvl x9, #1
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: mov w10, #15
; CHECK-NEXT: cmp x9, #15 // =15
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x9, x10, lo
; CHECK-NEXT: mov w10, #15
; CHECK-NEXT: cmp x9, #15 // =15
; CHECK-NEXT: st1b { z0.b }, p0, [sp]
; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
; CHECK-NEXT: csel x9, x9, x10, lo
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -60,15 +58,14 @@ define <vscale x 16 x i8> @splice_nxv16i8_clamped_idx(<vscale x 16 x i8> %a, <vs
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: rdvl x9, #1
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: mov w10, #16
; CHECK-NEXT: cmp x9, #16 // =16
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x9, x10, lo
; CHECK-NEXT: mov w10, #16
; CHECK-NEXT: cmp x9, #16 // =16
; CHECK-NEXT: st1b { z0.b }, p0, [sp]
; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
; CHECK-NEXT: csel x9, x9, x10, lo
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -83,14 +80,13 @@ define <vscale x 8 x i16> @splice_nxv8i16_first_idx(<vscale x 8 x i16> %a, <vsca
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cnth x9
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: cmp x9, #0 // =0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x9, xzr, lo
; CHECK-NEXT: cmp x9, #0 // =0
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #1
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
; CHECK-NEXT: csel x9, x9, xzr, lo
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -105,15 +101,14 @@ define <vscale x 8 x i16> @splice_nxv8i16_last_idx(<vscale x 8 x i16> %a, <vscal
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cnth x10
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: mov w9, #7
; CHECK-NEXT: cmp x10, #7 // =7
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: mov w9, #7
; CHECK-NEXT: cmp x10, #7 // =7
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #1
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -129,15 +124,14 @@ define <vscale x 8 x i16> @splice_nxv8i16_clamped_idx(<vscale x 8 x i16> %a, <vs
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cnth x10
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: mov w9, #8
; CHECK-NEXT: cmp x10, #8 // =8
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: mov w9, #8
; CHECK-NEXT: cmp x10, #8 // =8
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #1
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -152,14 +146,13 @@ define <vscale x 4 x i32> @splice_nxv4i32_first_idx(<vscale x 4 x i32> %a, <vsca
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntw x9
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: cmp x9, #0 // =0
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x9, xzr, lo
; CHECK-NEXT: cmp x9, #0 // =0
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #2
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
; CHECK-NEXT: csel x9, x9, xzr, lo
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -174,15 +167,14 @@ define <vscale x 4 x i32> @splice_nxv4i32_last_idx(<vscale x 4 x i32> %a, <vscal
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntw x10
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: mov w9, #3
; CHECK-NEXT: cmp x10, #3 // =3
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: mov w9, #3
; CHECK-NEXT: cmp x10, #3 // =3
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #2
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -198,15 +190,14 @@ define <vscale x 4 x i32> @splice_nxv4i32_clamped_idx(<vscale x 4 x i32> %a, <vs
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntw x10
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: mov w9, #4
; CHECK-NEXT: cmp x10, #4 // =4
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: mov w9, #4
; CHECK-NEXT: cmp x10, #4 // =4
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #2
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -221,14 +212,13 @@ define <vscale x 2 x i64> @splice_nxv2i64_first_idx(<vscale x 2 x i64> %a, <vsca
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntd x9
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: cmp x9, #0 // =0
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x9, xzr, lo
; CHECK-NEXT: cmp x9, #0 // =0
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #3
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
; CHECK-NEXT: csel x9, x9, xzr, lo
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -243,14 +233,13 @@ define <vscale x 2 x i64> @splice_nxv2i64_last_idx(<vscale x 2 x i64> %a, <vscal
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntd x9
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: cmp x9, #1 // =1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csinc x9, x9, xzr, lo
; CHECK-NEXT: cmp x9, #1 // =1
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #3
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
; CHECK-NEXT: csinc x9, x9, xzr, lo
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -266,15 +255,14 @@ define <vscale x 2 x i64> @splice_nxv2i64_clamped_idx(<vscale x 2 x i64> %a, <vs
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntd x10
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: mov w9, #2
; CHECK-NEXT: cmp x10, #2 // =2
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: mov w9, #2
; CHECK-NEXT: cmp x10, #2 // =2
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #3
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -289,14 +277,13 @@ define <vscale x 8 x half> @splice_nxv8f16_first_idx(<vscale x 8 x half> %a, <vs
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cnth x9
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: cmp x9, #0 // =0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x9, xzr, lo
; CHECK-NEXT: cmp x9, #0 // =0
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #1
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
; CHECK-NEXT: csel x9, x9, xzr, lo
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -311,15 +298,14 @@ define <vscale x 8 x half> @splice_nxv8f16_last_idx(<vscale x 8 x half> %a, <vsc
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cnth x10
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: mov w9, #7
; CHECK-NEXT: cmp x10, #7 // =7
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: mov w9, #7
; CHECK-NEXT: cmp x10, #7 // =7
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #1
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -335,15 +321,14 @@ define <vscale x 8 x half> @splice_nxv8f16_clamped_idx(<vscale x 8 x half> %a, <
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cnth x10
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: mov w9, #8
; CHECK-NEXT: cmp x10, #8 // =8
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: mov w9, #8
; CHECK-NEXT: cmp x10, #8 // =8
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #1
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -358,14 +343,13 @@ define <vscale x 4 x float> @splice_nxv4f32_first_idx(<vscale x 4 x float> %a, <
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntw x9
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: cmp x9, #0 // =0
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x9, xzr, lo
; CHECK-NEXT: cmp x9, #0 // =0
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #2
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
; CHECK-NEXT: csel x9, x9, xzr, lo
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -380,15 +364,14 @@ define <vscale x 4 x float> @splice_nxv4f32_last_idx(<vscale x 4 x float> %a, <v
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntw x10
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: mov w9, #3
; CHECK-NEXT: cmp x10, #3 // =3
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: mov w9, #3
; CHECK-NEXT: cmp x10, #3 // =3
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #2
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -404,15 +387,14 @@ define <vscale x 4 x float> @splice_nxv4f32_clamped_idx(<vscale x 4 x float> %a,
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntw x10
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: mov w9, #4
; CHECK-NEXT: cmp x10, #4 // =4
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: mov w9, #4
; CHECK-NEXT: cmp x10, #4 // =4
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #2
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -427,14 +409,13 @@ define <vscale x 2 x double> @splice_nxv2f64_first_idx(<vscale x 2 x double> %a,
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntd x9
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: cmp x9, #0 // =0
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x9, xzr, lo
; CHECK-NEXT: cmp x9, #0 // =0
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #3
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
; CHECK-NEXT: csel x9, x9, xzr, lo
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -449,14 +430,13 @@ define <vscale x 2 x double> @splice_nxv2f64_last_idx(<vscale x 2 x double> %a,
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntd x9
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: cmp x9, #1 // =1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csinc x9, x9, xzr, lo
; CHECK-NEXT: cmp x9, #1 // =1
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #3
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
; CHECK-NEXT: csinc x9, x9, xzr, lo
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -472,15 +452,14 @@ define <vscale x 2 x double> @splice_nxv2f64_clamped_idx(<vscale x 2 x double> %
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntd x10
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: mov w9, #2
; CHECK-NEXT: cmp x10, #2 // =2
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: mov w9, #2
; CHECK-NEXT: cmp x10, #2 // =2
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #3
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -495,17 +474,16 @@ define <vscale x 2 x i1> @splice_nxv2i1_idx(<vscale x 2 x i1> %a, <vscale x 2 x
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntd x9
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: cmp x9, #1 // =1
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: mov z0.d, p1/z, #1 // =0x1
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csinc x9, x9, xzr, lo
; CHECK-NEXT: cmp x9, #1 // =1
; CHECK-NEXT: st1d { z0.d }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #3
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
; CHECK-NEXT: csinc x9, x9, xzr, lo
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
; CHECK-NEXT: and z0.d, z0.d, #0x1
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT: addvl sp, sp, #2
@ -522,18 +500,17 @@ define <vscale x 4 x i1> @splice_nxv4i1_idx(<vscale x 4 x i1> %a, <vscale x 4 x
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntw x10
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov w9, #2
; CHECK-NEXT: cmp x10, #2 // =2
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: mov z0.s, p1/z, #1 // =0x1
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: mov w9, #2
; CHECK-NEXT: cmp x10, #2 // =2
; CHECK-NEXT: st1w { z0.s }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #2
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
; CHECK-NEXT: and z0.s, z0.s, #0x1
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT: addvl sp, sp, #2
@ -550,18 +527,17 @@ define <vscale x 8 x i1> @splice_nxv8i1_idx(<vscale x 8 x i1> %a, <vscale x 8 x
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cnth x10
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov w9, #4
; CHECK-NEXT: cmp x10, #4 // =4
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: mov z0.h, p1/z, #1 // =0x1
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: mov w9, #4
; CHECK-NEXT: cmp x10, #4 // =4
; CHECK-NEXT: st1h { z0.h }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #1
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
; CHECK-NEXT: and z0.h, z0.h, #0x1
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT: addvl sp, sp, #2
@ -578,18 +554,17 @@ define <vscale x 16 x i1> @splice_nxv16i1_idx(<vscale x 16 x i1> %a, <vscale x 1
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: rdvl x9, #1
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov w10, #8
; CHECK-NEXT: cmp x9, #8 // =8
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: st1b { z0.b }, p0, [sp]
; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x9, x10, lo
; CHECK-NEXT: mov w10, #8
; CHECK-NEXT: cmp x9, #8 // =8
; CHECK-NEXT: st1b { z0.b }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
; CHECK-NEXT: csel x9, x9, x10, lo
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
; CHECK-NEXT: and z0.b, z0.b, #0x1
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT: addvl sp, sp, #2
@ -607,14 +582,13 @@ define <vscale x 2 x i8> @splice_nxv2i8_idx(<vscale x 2 x i8> %a, <vscale x 2 x
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: cntd x9
; CHECK-NEXT: sub x9, x9, #1 // =1
; CHECK-NEXT: cmp x9, #1 // =1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csinc x9, x9, xzr, lo
; CHECK-NEXT: cmp x9, #1 // =1
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
; CHECK-NEXT: add x8, x8, x9, lsl #3
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
; CHECK-NEXT: csinc x9, x9, xzr, lo
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -657,11 +631,10 @@ define <vscale x 16 x float> @splice_nxv16f32_clamped_idx(<vscale x 16 x float>
; CHECK-NEXT: addvl sp, sp, #-8
; CHECK-NEXT: rdvl x10, #1
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: mov w9, #16
; CHECK-NEXT: cmp x10, #16 // =16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: mov w9, #16
; CHECK-NEXT: cmp x10, #16 // =16
; CHECK-NEXT: st1w { z3.s }, p0, [x8, #3, mul vl]
; CHECK-NEXT: st1w { z2.s }, p0, [x8, #2, mul vl]
; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
@ -670,8 +643,9 @@ define <vscale x 16 x float> @splice_nxv16f32_clamped_idx(<vscale x 16 x float>
; CHECK-NEXT: st1w { z4.s }, p0, [x8, #4, mul vl]
; CHECK-NEXT: st1w { z5.s }, p0, [x8, #5, mul vl]
; CHECK-NEXT: st1w { z6.s }, p0, [x8, #6, mul vl]
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
; CHECK-NEXT: add x8, x8, x9, lsl #2
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl]
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8, #2, mul vl]
; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8, #3, mul vl]
@ -696,8 +670,8 @@ define <vscale x 16 x i8> @splice_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x
; CHECK-NEXT: st1b { z0.b }, p0, [sp]
; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: sub x8, x8, #16 // =16
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
; CHECK-NEXT: mov x9, #-16
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -715,8 +689,8 @@ define <vscale x 16 x i8> @splice_nxv16i8_1(<vscale x 16 x i8> %a, <vscale x 16
; CHECK-NEXT: st1b { z0.b }, p0, [sp]
; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: sub x8, x8, #1 // =1
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
; CHECK-NEXT: mov x9, #-1
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@ -1205,8 +1179,8 @@ define <vscale x 16 x i1> @splice_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x
; CHECK-NEXT: st1b { z0.b }, p0, [sp]
; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: sub x8, x8, #1 // =1
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
; CHECK-NEXT: mov x9, #-1
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
; CHECK-NEXT: and z0.b, z0.b, #0x1
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT: addvl sp, sp, #2

View File

@ -0,0 +1,74 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -disable-lsr < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning
; Check that vscale call is recognised by load/store reg/reg pattern and
; partially folded, with the rest pulled out of the loop. This requires LSR to
; be disabled, which is something that will be addressed at a later date.
; Loop that, on every iteration, loads a <vscale x 4 x i32> vector from
; %addr at i32 element index %index and then advances %index by vscale << 2.
; The assertions expect the step to be materialised once before the loop
; (cntw x9) and the index to be used directly by ld1w's reg+reg form with the
; element scaling folded into the address ([x0, x8, lsl #2]).
; The loaded value is never used; the load is volatile, presumably so that it
; is not deleted as dead.
define void @ld1w_reg_loop([32000 x i32]* %addr) {
; CHECK-LABEL: ld1w_reg_loop:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: cntw x9
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; CHECK-NEXT: adds x8, x8, x9
; CHECK-NEXT: b.ne .LBB0_1
; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
; CHECK-NEXT: ret
entry:
%0 = call i64 @llvm.vscale.i64()
%1 = shl i64 %0, 2
br label %vector.body
vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%2 = getelementptr inbounds [32000 x i32], [32000 x i32]* %addr, i64 0, i64 %index
%3 = bitcast i32* %2 to <vscale x 4 x i32>*
%load = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* %3, align 16
%index.next = add i64 %index, %1
; The loop exits once the incremented index compares equal to zero.
%4 = icmp eq i64 %index.next, 0
br i1 %4, label %for.cond.cleanup, label %vector.body
for.cond.cleanup:
ret void
}
; Store counterpart of the load loop: every iteration stores %val as a
; <vscale x 4 x i32> vector to %addr at i32 element index %index and then
; advances %index by vscale << 2.
; The assertions expect the step to be materialised once before the loop
; (cntw x9) and the index to be used directly by st1w's reg+reg form with the
; element scaling folded into the address ([x0, x8, lsl #2]).
define void @st1w_reg_loop([32000 x i32]* %addr, <vscale x 4 x i32> %val) {
; CHECK-LABEL: st1w_reg_loop:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: cntw x9
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; CHECK-NEXT: adds x8, x8, x9
; CHECK-NEXT: b.ne .LBB1_1
; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
; CHECK-NEXT: ret
entry:
%0 = call i64 @llvm.vscale.i64()
%1 = shl i64 %0, 2
br label %vector.body
vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%2 = getelementptr inbounds [32000 x i32], [32000 x i32]* %addr, i64 0, i64 %index
%3 = bitcast i32* %2 to <vscale x 4 x i32>*
store volatile <vscale x 4 x i32> %val, <vscale x 4 x i32>* %3, align 16
%index.next = add i64 %index, %1
; The loop exits once the incremented index compares equal to zero.
%4 = icmp eq i64 %index.next, 0
br i1 %4, label %for.cond.cleanup, label %vector.body
for.cond.cleanup:
ret void
}
declare i64 @llvm.vscale.i64()

View File

@ -43,9 +43,9 @@ define <vscale x 16 x i8> @ld1b_upper_bound(<vscale x 16 x i8>* %a) {
define <vscale x 16 x i8> @ld1b_out_of_upper_bound(<vscale x 16 x i8>* %a) {
; CHECK-LABEL: ld1b_out_of_upper_bound:
; CHECK: // %bb.0:
; CHECK-NEXT: addvl x8, x0, #8
; CHECK-NEXT: rdvl x8, #8
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT: ret
%base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 8
%load = load <vscale x 16 x i8>, <vscale x 16 x i8>* %base
@ -55,9 +55,9 @@ define <vscale x 16 x i8> @ld1b_out_of_upper_bound(<vscale x 16 x i8>* %a) {
define <vscale x 16 x i8> @ld1b_out_of_lower_bound(<vscale x 16 x i8>* %a) {
; CHECK-LABEL: ld1b_out_of_lower_bound:
; CHECK: // %bb.0:
; CHECK-NEXT: addvl x8, x0, #-9
; CHECK-NEXT: rdvl x8, #-9
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT: ret
%base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 -9
%load = load <vscale x 16 x i8>, <vscale x 16 x i8>* %base
@ -138,4 +138,3 @@ define void @load_nxv12f16(<vscale x 12 x half>* %a) {
%val = load volatile <vscale x 12 x half>, <vscale x 12 x half>* %a
ret void
}

View File

@ -0,0 +1,302 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning
; LD1B
; Load of <vscale x 16 x i8> from %addr + %off, where %off is a byte offset
; (i8 GEP). Expected selection: ld1b { z0.b } with the unscaled reg+reg
; address [x0, x1].
define <vscale x 16 x i8> @ld1_nxv16i8(i8* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i8, i8* %addr, i64 %off
%ptrcast = bitcast i8* %ptr to <vscale x 16 x i8>*
%val = load volatile <vscale x 16 x i8>, <vscale x 16 x i8>* %ptrcast
ret <vscale x 16 x i8> %val
}
; Load of <vscale x 8 x i8> from %addr + %off (i8 GEP), zero-extended to i16
; elements. Expected selection: the extension is folded into a single
; ld1b { z0.h } using the reg+reg address [x0, x1].
define <vscale x 8 x i16> @ld1_nxv8i16_zext8(i8* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv8i16_zext8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i8, i8* %addr, i64 %off
%ptrcast = bitcast i8* %ptr to <vscale x 8 x i8>*
%val = load volatile <vscale x 8 x i8>, <vscale x 8 x i8>* %ptrcast
%zext = zext <vscale x 8 x i8> %val to <vscale x 8 x i16>
ret <vscale x 8 x i16> %zext
}
; Load of <vscale x 4 x i8> from %addr + %off (i8 GEP), zero-extended to i32
; elements. Expected selection: the extension is folded into a single
; ld1b { z0.s } using the reg+reg address [x0, x1].
define <vscale x 4 x i32> @ld1_nxv4i32_zext8(i8* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv4i32_zext8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i8, i8* %addr, i64 %off
%ptrcast = bitcast i8* %ptr to <vscale x 4 x i8>*
%val = load volatile <vscale x 4 x i8>, <vscale x 4 x i8>* %ptrcast
%zext = zext <vscale x 4 x i8> %val to <vscale x 4 x i32>
ret <vscale x 4 x i32> %zext
}
; Load of <vscale x 2 x i8> from %addr + %off (i8 GEP), zero-extended to i64
; elements. Expected selection: the extension is folded into a single
; ld1b { z0.d } using the reg+reg address [x0, x1].
define <vscale x 2 x i64> @ld1_nxv2i64_zext8(i8* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv2i64_zext8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i8, i8* %addr, i64 %off
%ptrcast = bitcast i8* %ptr to <vscale x 2 x i8>*
%val = load volatile <vscale x 2 x i8>, <vscale x 2 x i8>* %ptrcast
%zext = zext <vscale x 2 x i8> %val to <vscale x 2 x i64>
ret <vscale x 2 x i64> %zext
}
; Load of <vscale x 8 x i8> from %addr + %off (i8 GEP), sign-extended to i16
; elements. Expected selection: the extension is folded into a single
; ld1sb { z0.h } using the reg+reg address [x0, x1].
define <vscale x 8 x i16> @ld1_nxv8i16_sext8(i8* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv8i16_sext8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i8, i8* %addr, i64 %off
%ptrcast = bitcast i8* %ptr to <vscale x 8 x i8>*
%val = load volatile <vscale x 8 x i8>, <vscale x 8 x i8>* %ptrcast
%sext = sext <vscale x 8 x i8> %val to <vscale x 8 x i16>
ret <vscale x 8 x i16> %sext
}
; Load of <vscale x 4 x i8> from %addr + %off (i8 GEP), sign-extended to i32
; elements. Expected selection: the extension is folded into a single
; ld1sb { z0.s } using the reg+reg address [x0, x1].
define <vscale x 4 x i32> @ld1_nxv4i32_sext8(i8* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv4i32_sext8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i8, i8* %addr, i64 %off
%ptrcast = bitcast i8* %ptr to <vscale x 4 x i8>*
%val = load volatile <vscale x 4 x i8>, <vscale x 4 x i8>* %ptrcast
%sext = sext <vscale x 4 x i8> %val to <vscale x 4 x i32>
ret <vscale x 4 x i32> %sext
}
; Load of <vscale x 2 x i8> from %addr + %off (i8 GEP), sign-extended to i64
; elements. Expected selection: the extension is folded into a single
; ld1sb { z0.d } using the reg+reg address [x0, x1].
define <vscale x 2 x i64> @ld1_nxv2i64_sext8(i8* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv2i64_sext8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i8, i8* %addr, i64 %off
%ptrcast = bitcast i8* %ptr to <vscale x 2 x i8>*
%val = load volatile <vscale x 2 x i8>, <vscale x 2 x i8>* %ptrcast
%sext = sext <vscale x 2 x i8> %val to <vscale x 2 x i64>
ret <vscale x 2 x i64> %sext
}
; LD1H
; Load of <vscale x 8 x i16> from %addr indexed by %off in i16 elements
; (i16 GEP). Expected selection: ld1h { z0.h } with the element scaling folded
; into the reg+reg address [x0, x1, lsl #1].
define <vscale x 8 x i16> @ld1_nxv8i16(i16* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i16, i16* %addr, i64 %off
%ptrcast = bitcast i16* %ptr to <vscale x 8 x i16>*
%val = load volatile <vscale x 8 x i16>, <vscale x 8 x i16>* %ptrcast
ret <vscale x 8 x i16> %val
}
; Load of <vscale x 4 x i16> from %addr indexed by %off in i16 elements,
; zero-extended to i32 elements. Expected selection: a single ld1h { z0.s }
; using the scaled reg+reg address [x0, x1, lsl #1].
define <vscale x 4 x i32> @ld1_nxv4i32_zext16(i16* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv4i32_zext16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i16, i16* %addr, i64 %off
%ptrcast = bitcast i16* %ptr to <vscale x 4 x i16>*
%val = load volatile <vscale x 4 x i16>, <vscale x 4 x i16>* %ptrcast
%zext = zext <vscale x 4 x i16> %val to <vscale x 4 x i32>
ret <vscale x 4 x i32> %zext
}
; Load of <vscale x 2 x i16> from %addr indexed by %off in i16 elements,
; zero-extended to i64 elements. Expected selection: a single ld1h { z0.d }
; using the scaled reg+reg address [x0, x1, lsl #1].
define <vscale x 2 x i64> @ld1_nxv2i64_zext16(i16* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv2i64_zext16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i16, i16* %addr, i64 %off
%ptrcast = bitcast i16* %ptr to <vscale x 2 x i16>*
%val = load volatile <vscale x 2 x i16>, <vscale x 2 x i16>* %ptrcast
%zext = zext <vscale x 2 x i16> %val to <vscale x 2 x i64>
ret <vscale x 2 x i64> %zext
}
; Load of <vscale x 4 x i16> from %addr indexed by %off in i16 elements,
; sign-extended to i32 elements. Expected selection: a single ld1sh { z0.s }
; using the scaled reg+reg address [x0, x1, lsl #1].
define <vscale x 4 x i32> @ld1_nxv4i32_sext16(i16* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv4i32_sext16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i16, i16* %addr, i64 %off
%ptrcast = bitcast i16* %ptr to <vscale x 4 x i16>*
%val = load volatile <vscale x 4 x i16>, <vscale x 4 x i16>* %ptrcast
%sext = sext <vscale x 4 x i16> %val to <vscale x 4 x i32>
ret <vscale x 4 x i32> %sext
}
; Load of <vscale x 2 x i16> from %addr indexed by %off in i16 elements,
; sign-extended to i64 elements. Expected selection: a single ld1sh { z0.d }
; using the scaled reg+reg address [x0, x1, lsl #1].
define <vscale x 2 x i64> @ld1_nxv2i64_sext16(i16* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv2i64_sext16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i16, i16* %addr, i64 %off
%ptrcast = bitcast i16* %ptr to <vscale x 2 x i16>*
%val = load volatile <vscale x 2 x i16>, <vscale x 2 x i16>* %ptrcast
%sext = sext <vscale x 2 x i16> %val to <vscale x 2 x i64>
ret <vscale x 2 x i64> %sext
}
; Load of <vscale x 8 x half> from %addr indexed by %off in half elements.
; Expected selection: ld1h { z0.h } with the scaled reg+reg address
; [x0, x1, lsl #1].
define <vscale x 8 x half> @ld1_nxv8f16(half* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds half, half* %addr, i64 %off
%ptrcast = bitcast half* %ptr to <vscale x 8 x half>*
%val = load volatile <vscale x 8 x half>, <vscale x 8 x half>* %ptrcast
ret <vscale x 8 x half> %val
}
; Load of <vscale x 8 x bfloat> from %addr indexed by %off in bfloat elements.
; Expected selection: ld1h { z0.h } with the scaled reg+reg address
; [x0, x1, lsl #1].
define <vscale x 8 x bfloat> @ld1_nxv8bf16(bfloat* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %off
%ptrcast = bitcast bfloat* %ptr to <vscale x 8 x bfloat>*
%val = load volatile <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %ptrcast
ret <vscale x 8 x bfloat> %val
}
; Load of <vscale x 4 x half> from %addr indexed by %off in half elements.
; Expected selection: ld1h with .s-sized lanes ({ z0.s }, ptrue p0.s) and the
; scaled reg+reg address [x0, x1, lsl #1].
define <vscale x 4 x half> @ld1_nxv4f16(half* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds half, half* %addr, i64 %off
%ptrcast = bitcast half* %ptr to <vscale x 4 x half>*
%val = load volatile <vscale x 4 x half>, <vscale x 4 x half>* %ptrcast
ret <vscale x 4 x half> %val
}
; Load of <vscale x 2 x half> from %addr indexed by %off in half elements.
; Expected selection: ld1h with .d-sized lanes ({ z0.d }, ptrue p0.d) and the
; scaled reg+reg address [x0, x1, lsl #1].
define <vscale x 2 x half> @ld1_nxv2f16(half* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds half, half* %addr, i64 %off
%ptrcast = bitcast half* %ptr to <vscale x 2 x half>*
%val = load volatile <vscale x 2 x half>, <vscale x 2 x half>* %ptrcast
ret <vscale x 2 x half> %val
}
; LD1W
; Load of <vscale x 4 x i32> from %addr indexed by %off in i32 elements
; (i32 GEP). Expected selection: ld1w { z0.s } with the element scaling folded
; into the reg+reg address [x0, x1, lsl #2].
define <vscale x 4 x i32> @ld1_nxv4i32(i32* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i32, i32* %addr, i64 %off
%ptrcast = bitcast i32* %ptr to <vscale x 4 x i32>*
%val = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* %ptrcast
ret <vscale x 4 x i32> %val
}
; Load of <vscale x 2 x i32> from %addr indexed by %off in i32 elements,
; zero-extended to i64 elements. Expected selection: a single ld1w { z0.d }
; using the scaled reg+reg address [x0, x1, lsl #2].
define <vscale x 2 x i64> @ld1_nxv2i64_zext32(i32* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv2i64_zext32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i32, i32* %addr, i64 %off
%ptrcast = bitcast i32* %ptr to <vscale x 2 x i32>*
%val = load volatile <vscale x 2 x i32>, <vscale x 2 x i32>* %ptrcast
%zext = zext <vscale x 2 x i32> %val to <vscale x 2 x i64>
ret <vscale x 2 x i64> %zext
}
; Load of <vscale x 2 x i32> from %addr indexed by %off in i32 elements,
; sign-extended to i64 elements. Expected selection: a single ld1sw { z0.d }
; using the scaled reg+reg address [x0, x1, lsl #2].
define <vscale x 2 x i64> @ld1_nxv2i64_sext32(i32* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv2i64_sext32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i32, i32* %addr, i64 %off
%ptrcast = bitcast i32* %ptr to <vscale x 2 x i32>*
%val = load volatile <vscale x 2 x i32>, <vscale x 2 x i32>* %ptrcast
%sext = sext <vscale x 2 x i32> %val to <vscale x 2 x i64>
ret <vscale x 2 x i64> %sext
}
; Load of <vscale x 4 x float> from %addr indexed by %off in float elements.
; Expected selection: ld1w { z0.s } with the scaled reg+reg address
; [x0, x1, lsl #2].
define <vscale x 4 x float> @ld1_nxv4f32(float* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds float, float* %addr, i64 %off
%ptrcast = bitcast float* %ptr to <vscale x 4 x float>*
%val = load volatile <vscale x 4 x float>, <vscale x 4 x float>* %ptrcast
ret <vscale x 4 x float> %val
}
; Load of <vscale x 2 x float> from %addr indexed by %off in float elements.
; Expected selection: ld1w with .d-sized lanes ({ z0.d }, ptrue p0.d) and the
; scaled reg+reg address [x0, x1, lsl #2].
define <vscale x 2 x float> @ld1_nxv2f32(float* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds float, float* %addr, i64 %off
%ptrcast = bitcast float* %ptr to <vscale x 2 x float>*
%val = load volatile <vscale x 2 x float>, <vscale x 2 x float>* %ptrcast
ret <vscale x 2 x float> %val
}
; LD1D
; Load of <vscale x 2 x i64> from %addr indexed by %off in i64 elements
; (i64 GEP). Expected selection: ld1d { z0.d } with the element scaling folded
; into the reg+reg address [x0, x1, lsl #3].
define <vscale x 2 x i64> @ld1_nxv2i64(i64* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i64, i64* %addr, i64 %off
%ptrcast = bitcast i64* %ptr to <vscale x 2 x i64>*
%val = load volatile <vscale x 2 x i64>, <vscale x 2 x i64>* %ptrcast
ret <vscale x 2 x i64> %val
}
; Load of <vscale x 2 x double> from %addr indexed by %off in double elements.
; Expected selection: ld1d { z0.d } with the scaled reg+reg address
; [x0, x1, lsl #3].
define <vscale x 2 x double> @ld1_nxv2f64(double* %addr, i64 %off) {
; CHECK-LABEL: ld1_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds double, double* %addr, i64 %off
%ptrcast = bitcast double* %ptr to <vscale x 2 x double>*
%val = load volatile <vscale x 2 x double>, <vscale x 2 x double>* %ptrcast
ret <vscale x 2 x double> %val
}

View File

@ -43,9 +43,9 @@ define void @st1b_upper_bound(<vscale x 16 x i8> %data, <vscale x 16 x i8>* %a)
define void @st1b_out_of_upper_bound(<vscale x 16 x i8> %data, <vscale x 16 x i8>* %a) {
; CHECK-LABEL: st1b_out_of_upper_bound:
; CHECK: // %bb.0:
; CHECK-NEXT: addvl x8, x0, #8
; CHECK-NEXT: rdvl x8, #8
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x8]
; CHECK-NEXT: st1b { z0.b }, p0, [x0, x8]
; CHECK-NEXT: ret
%base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 8
store <vscale x 16 x i8> %data, <vscale x 16 x i8>* %base
@ -55,9 +55,9 @@ define void @st1b_out_of_upper_bound(<vscale x 16 x i8> %data, <vscale x 16 x i8
define void @st1b_out_of_lower_bound(<vscale x 16 x i8> %data, <vscale x 16 x i8>* %a) {
; CHECK-LABEL: st1b_out_of_lower_bound:
; CHECK: // %bb.0:
; CHECK-NEXT: addvl x8, x0, #-9
; CHECK-NEXT: rdvl x8, #-9
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x8]
; CHECK-NEXT: st1b { z0.b }, p0, [x0, x8]
; CHECK-NEXT: ret
%base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 -9
store <vscale x 16 x i8> %data, <vscale x 16 x i8>* %base

View File

@ -0,0 +1,224 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning
; ST1B
; Store of <vscale x 16 x i8> %val to %addr + %off, where %off is a byte
; offset (i8 GEP). Expected selection: st1b { z0.b } with the unscaled reg+reg
; address [x0, x1].
define void @st1_nxv16i8(i8* %addr, i64 %off, <vscale x 16 x i8> %val) {
; CHECK-LABEL: st1_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0, x1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i8, i8* %addr, i64 %off
%ptrcast = bitcast i8* %ptr to <vscale x 16 x i8>*
store <vscale x 16 x i8> %val, <vscale x 16 x i8>* %ptrcast
ret void
}
; Truncating store: <vscale x 8 x i16> %val is narrowed to i8 elements and
; stored to %addr + %off (i8 GEP). Expected selection: the truncation is
; folded into a single st1b { z0.h } using the reg+reg address [x0, x1].
define void @st1_nxv8i16_trunc8(i8* %addr, i64 %off, <vscale x 8 x i16> %val) {
; CHECK-LABEL: st1_nxv8i16_trunc8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: st1b { z0.h }, p0, [x0, x1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i8, i8* %addr, i64 %off
%ptrcast = bitcast i8* %ptr to <vscale x 8 x i8>*
%trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
store <vscale x 8 x i8> %trunc, <vscale x 8 x i8>* %ptrcast
ret void
}
; Truncating store: <vscale x 4 x i32> %val is narrowed to i8 elements and
; stored to %addr + %off (i8 GEP). Expected selection: the truncation is
; folded into a single st1b { z0.s } using the reg+reg address [x0, x1].
define void @st1_nxv4i32_trunc8(i8* %addr, i64 %off, <vscale x 4 x i32> %val) {
; CHECK-LABEL: st1_nxv4i32_trunc8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i8, i8* %addr, i64 %off
%ptrcast = bitcast i8* %ptr to <vscale x 4 x i8>*
%trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
store <vscale x 4 x i8> %trunc, <vscale x 4 x i8>* %ptrcast
ret void
}
; Truncating store: <vscale x 2 x i64> %val is narrowed to i8 elements and
; stored to %addr + %off (i8 GEP). Expected selection: the truncation is
; folded into a single st1b { z0.d } using the reg+reg address [x0, x1].
define void @st1_nxv2i64_trunc8(i8* %addr, i64 %off, <vscale x 2 x i64> %val) {
; CHECK-LABEL: st1_nxv2i64_trunc8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i8, i8* %addr, i64 %off
%ptrcast = bitcast i8* %ptr to <vscale x 2 x i8>*
%trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
store <vscale x 2 x i8> %trunc, <vscale x 2 x i8>* %ptrcast
ret void
}
; ST1H
; Store of <vscale x 8 x i16> %val to %addr indexed by %off in i16 elements
; (i16 GEP). Expected selection: st1h { z0.h } with the element scaling folded
; into the reg+reg address [x0, x1, lsl #1].
define void @st1_nxv8i16(i16* %addr, i64 %off, <vscale x 8 x i16> %val) {
; CHECK-LABEL: st1_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i16, i16* %addr, i64 %off
%ptrcast = bitcast i16* %ptr to <vscale x 8 x i16>*
store <vscale x 8 x i16> %val, <vscale x 8 x i16>* %ptrcast
ret void
}
; Truncating store: <vscale x 4 x i32> %val is narrowed to i16 elements and
; stored to %addr indexed by %off in i16 elements. Expected selection: a
; single st1h { z0.s } using the scaled reg+reg address [x0, x1, lsl #1].
define void @st1_nxv4i32_trunc16(i16* %addr, i64 %off, <vscale x 4 x i32> %val) {
; CHECK-LABEL: st1_nxv4i32_trunc16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i16, i16* %addr, i64 %off
%ptrcast = bitcast i16* %ptr to <vscale x 4 x i16>*
%trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
store <vscale x 4 x i16> %trunc, <vscale x 4 x i16>* %ptrcast
ret void
}
; Truncating store: <vscale x 2 x i64> %val is narrowed to i16 elements and
; stored to %addr indexed by %off in i16 elements. Expected selection: a
; single st1h { z0.d } using the scaled reg+reg address [x0, x1, lsl #1].
define void @st1_nxv2i64_trunc16(i16* %addr, i64 %off, <vscale x 2 x i64> %val) {
; CHECK-LABEL: st1_nxv2i64_trunc16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i16, i16* %addr, i64 %off
%ptrcast = bitcast i16* %ptr to <vscale x 2 x i16>*
%trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
store <vscale x 2 x i16> %trunc, <vscale x 2 x i16>* %ptrcast
ret void
}
; Store of <vscale x 8 x half> %val to %addr indexed by %off in half elements.
; Expected selection: st1h { z0.h } with the scaled reg+reg address
; [x0, x1, lsl #1].
define void @st1_nxv8f16(half* %addr, i64 %off, <vscale x 8 x half> %val) {
; CHECK-LABEL: st1_nxv8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds half, half* %addr, i64 %off
%ptrcast = bitcast half* %ptr to <vscale x 8 x half>*
store <vscale x 8 x half> %val, <vscale x 8 x half>* %ptrcast
ret void
}
; Store of <vscale x 8 x bfloat> %val to %addr indexed by %off in bfloat
; elements. Expected selection: st1h { z0.h } with the scaled reg+reg address
; [x0, x1, lsl #1].
define void @st1_nxv8bf16(bfloat* %addr, i64 %off, <vscale x 8 x bfloat> %val) {
; CHECK-LABEL: st1_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %off
%ptrcast = bitcast bfloat* %ptr to <vscale x 8 x bfloat>*
store <vscale x 8 x bfloat> %val, <vscale x 8 x bfloat>* %ptrcast
ret void
}
; Store of <vscale x 4 x half> %val to %addr indexed by %off in half elements.
; Expected selection: st1h with .s-sized lanes ({ z0.s }, ptrue p0.s) and the
; scaled reg+reg address [x0, x1, lsl #1].
define void @st1_nxv4f16(half* %addr, i64 %off, <vscale x 4 x half> %val) {
; CHECK-LABEL: st1_nxv4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds half, half* %addr, i64 %off
%ptrcast = bitcast half* %ptr to <vscale x 4 x half>*
store <vscale x 4 x half> %val, <vscale x 4 x half>* %ptrcast
ret void
}
; Store of <vscale x 2 x half> %val to %addr indexed by %off in half elements.
; Expected selection: st1h with .d-sized lanes ({ z0.d }, ptrue p0.d) and the
; scaled reg+reg address [x0, x1, lsl #1].
define void @st1_nxv2f16(half* %addr, i64 %off, <vscale x 2 x half> %val) {
; CHECK-LABEL: st1_nxv2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds half, half* %addr, i64 %off
%ptrcast = bitcast half* %ptr to <vscale x 2 x half>*
store <vscale x 2 x half> %val, <vscale x 2 x half>* %ptrcast
ret void
}
; ST1W
; Store of <vscale x 4 x i32> %val to %addr indexed by %off in i32 elements
; (i32 GEP). Expected selection: st1w { z0.s } with the element scaling folded
; into the reg+reg address [x0, x1, lsl #2].
define void @st1_nxv4i32(i32* %addr, i64 %off, <vscale x 4 x i32> %val) {
; CHECK-LABEL: st1_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i32, i32* %addr, i64 %off
%ptrcast = bitcast i32* %ptr to <vscale x 4 x i32>*
store <vscale x 4 x i32> %val, <vscale x 4 x i32>* %ptrcast
ret void
}
; Truncating store: <vscale x 2 x i64> %val is narrowed to i32 elements and
; stored to %addr indexed by %off in i32 elements. Expected selection: a
; single st1w { z0.d } using the scaled reg+reg address [x0, x1, lsl #2].
define void @st1_nxv2i64_trunc32(i32* %addr, i64 %off, <vscale x 2 x i64> %val) {
; CHECK-LABEL: st1_nxv2i64_trunc32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i32, i32* %addr, i64 %off
%ptrcast = bitcast i32* %ptr to <vscale x 2 x i32>*
%trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
store <vscale x 2 x i32> %trunc, <vscale x 2 x i32>* %ptrcast
ret void
}
; Store of <vscale x 4 x float> %val to %addr indexed by %off in float
; elements. Expected selection: st1w { z0.s } with the scaled reg+reg address
; [x0, x1, lsl #2].
define void @st1_nxv4f32(float* %addr, i64 %off, <vscale x 4 x float> %val) {
; CHECK-LABEL: st1_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds float, float* %addr, i64 %off
%ptrcast = bitcast float* %ptr to <vscale x 4 x float>*
store <vscale x 4 x float> %val, <vscale x 4 x float>* %ptrcast
ret void
}
; Store of <vscale x 2 x float> %val to %addr indexed by %off in float
; elements. Expected selection: st1w with .d-sized lanes ({ z0.d },
; ptrue p0.d) and the scaled reg+reg address [x0, x1, lsl #2].
define void @st1_nxv2f32(float* %addr, i64 %off, <vscale x 2 x float> %val) {
; CHECK-LABEL: st1_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds float, float* %addr, i64 %off
%ptrcast = bitcast float* %ptr to <vscale x 2 x float>*
store <vscale x 2 x float> %val, <vscale x 2 x float>* %ptrcast
ret void
}
; ST1D
; Store of <vscale x 2 x i64> %val to %addr indexed by %off in i64 elements
; (i64 GEP). Expected selection: st1d { z0.d } with the element scaling folded
; into the reg+reg address [x0, x1, lsl #3].
define void @st1_nxv2i64(i64* %addr, i64 %off, <vscale x 2 x i64> %val) {
; CHECK-LABEL: st1_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds i64, i64* %addr, i64 %off
%ptrcast = bitcast i64* %ptr to <vscale x 2 x i64>*
store <vscale x 2 x i64> %val, <vscale x 2 x i64>* %ptrcast
ret void
}
; Store of <vscale x 2 x double> %val to %addr indexed by %off in double
; elements. Expected selection: st1d { z0.d } with the scaled reg+reg address
; [x0, x1, lsl #3].
define void @st1_nxv2f64(double* %addr, i64 %off, <vscale x 2 x double> %val) {
; CHECK-LABEL: st1_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
%ptr = getelementptr inbounds double, double* %addr, i64 %off
%ptrcast = bitcast double* %ptr to <vscale x 2 x double>*
store <vscale x 2 x double> %val, <vscale x 2 x double>* %ptrcast
ret void
}