From d09ae9328f67fd419ab8cea0e73dcdfe8d75f481 Mon Sep 17 00:00:00 2001 From: Bradley Smith Date: Thu, 28 Jan 2021 12:39:39 +0000 Subject: [PATCH] [AArch64][SVE] Add unpredicated ld1/st1 patterns for reg+reg addressing modes Differential Revision: https://reviews.llvm.org/D95677 --- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 115 ++++--- .../AArch64/named-vector-shuffles-sve.ll | 240 +++++++------- llvm/test/CodeGen/AArch64/sve-fold-vscale.ll | 74 +++++ .../sve-ld1-addressing-mode-reg-imm.ll | 9 +- .../sve-ld1-addressing-mode-reg-reg.ll | 302 ++++++++++++++++++ .../sve-st1-addressing-mode-reg-imm.ll | 8 +- .../sve-st1-addressing-mode-reg-reg.ll | 224 +++++++++++++ 7 files changed, 777 insertions(+), 195 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-fold-vscale.ll create mode 100644 llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll create mode 100644 llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-reg.ll diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 764241040133..63a53cc0c8f1 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1891,13 +1891,18 @@ let Predicates = [HasSVE] in { defm : pred_store; defm : pred_store; - multiclass unpred_store { + multiclass unpred_store { let AddedComplexity = 1 in { + def _reg : Pat<(Store (Ty ZPR:$val), (AddrCP GPR64sp:$base, GPR64:$offset)), + (RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>; + } + let AddedComplexity = 2 in { def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)), (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; } - let AddedComplexity = 2 in { + let AddedComplexity = 3 in { def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)), (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; } @@ -1906,32 +1911,36 @@ let Predicates = [HasSVE] in { (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>; } - defm : unpred_store< store, nxv16i8, ST1B_IMM, PTRUE_B>; - defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H_IMM, PTRUE_H>; - defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S_IMM, PTRUE_S>; - defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D_IMM, PTRUE_D>; - defm : unpred_store< store, nxv8i16, ST1H_IMM, PTRUE_H>; - defm : unpred_store; - defm : unpred_store; - defm : unpred_store< store, nxv4i32, ST1W_IMM, PTRUE_S>; - defm : unpred_store; - defm : unpred_store< store, nxv2i64, ST1D_IMM, PTRUE_D>; - defm : unpred_store< store, nxv8f16, ST1H_IMM, PTRUE_H>; - defm : unpred_store< store, nxv8bf16, ST1H_IMM, PTRUE_H>; - defm : unpred_store< store, nxv4f16, ST1H_S_IMM, PTRUE_S>; - defm : unpred_store< store, nxv2f16, ST1H_D_IMM, PTRUE_D>; - defm : unpred_store< store, nxv4f32, ST1W_IMM, PTRUE_S>; - defm : unpred_store< store, nxv2f32, ST1W_D_IMM, PTRUE_D>; - defm : unpred_store< store, nxv2f64, ST1D_IMM, PTRUE_D>; + defm : unpred_store< store, nxv16i8, ST1B, ST1B_IMM, PTRUE_B, am_sve_regreg_lsl0>; + defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H, ST1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>; + defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S, ST1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>; + defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D, ST1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>; + defm : unpred_store< store, nxv8i16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_store; + defm : unpred_store; + defm : unpred_store< store, nxv4i32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>; + defm : unpred_store; + defm : unpred_store< store, nxv2i64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>; + defm : unpred_store< store, nxv8f16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_store< store, nxv8bf16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_store< store, nxv4f16, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>; + defm : unpred_store< store, nxv2f16, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>; + defm : unpred_store< store, nxv4f32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>; + defm : unpred_store< store, nxv2f32, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>; + defm : unpred_store< store, nxv2f64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>; - multiclass unpred_load { + multiclass unpred_load { let AddedComplexity = 1 in { + def _reg: Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))), + (RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>; + } + let AddedComplexity = 2 in { def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))), (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; } - - let AddedComplexity = 2 in { + let AddedComplexity = 3 in { def _fi : Pat<(Ty (Load (am_sve_fi GPR64sp:$base, simm4s1:$offset))), (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; } @@ -1940,35 +1949,35 @@ let Predicates = [HasSVE] in { (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>; } - defm : unpred_load< load, nxv16i8, LD1B_IMM, PTRUE_B>; - defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>; - defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>; - defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>; - defm : unpred_load< extloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>; - defm : unpred_load< extloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>; - defm : unpred_load< extloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>; - defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H_IMM, PTRUE_H>; - defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S_IMM, PTRUE_S>; - defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D_IMM, PTRUE_D>; - defm : unpred_load< load, nxv8i16, LD1H_IMM, PTRUE_H>; - defm : unpred_load; - defm : unpred_load; - defm : unpred_load< extloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>; - defm : unpred_load< extloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>; - defm : unpred_load; - defm : unpred_load; - defm : unpred_load< load, nxv4i32, LD1W_IMM, PTRUE_S>; - defm : unpred_load; - defm : unpred_load< extloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>; - defm : unpred_load; - defm : unpred_load< load, nxv2i64, LD1D_IMM, PTRUE_D>; - defm : unpred_load< load, nxv8f16, LD1H_IMM, PTRUE_H>; - defm : unpred_load< load, nxv8bf16, LD1H_IMM, PTRUE_H>; - defm : unpred_load< load, nxv4f16, LD1H_S_IMM, PTRUE_S>; - defm : unpred_load< load, nxv2f16, LD1H_D_IMM, PTRUE_D>; - defm : unpred_load< load, nxv4f32, LD1W_IMM, PTRUE_S>; - defm : unpred_load< load, nxv2f32, LD1W_D_IMM, PTRUE_D>; - defm : unpred_load< load, nxv2f64, LD1D_IMM, PTRUE_D>; + defm : unpred_load< load, nxv16i8, LD1B, LD1B_IMM, PTRUE_B, am_sve_regreg_lsl0>; + defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H, LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>; + defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S, LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>; + defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D, LD1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>; + defm : unpred_load< extloadvi8, nxv8i16, LD1B_H, LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>; + defm : unpred_load< extloadvi8, nxv4i32, LD1B_S, LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>; + defm : unpred_load< extloadvi8, nxv2i64, LD1B_D, LD1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>; + defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H, LD1SB_H_IMM, PTRUE_H, am_sve_regreg_lsl0>; + defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S, LD1SB_S_IMM, PTRUE_S, am_sve_regreg_lsl0>; + defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D, LD1SB_D_IMM, PTRUE_D, am_sve_regreg_lsl0>; + defm : unpred_load< load, nxv8i16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_load; + defm : unpred_load; + defm : unpred_load< extloadvi16, nxv4i32, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>; + defm : unpred_load< extloadvi16, nxv2i64, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>; + defm : unpred_load; + defm : unpred_load; + defm : unpred_load< load, nxv4i32, LD1W, LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>; + defm : unpred_load; + defm : unpred_load< extloadvi32, nxv2i64, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>; + defm : unpred_load; + defm : unpred_load< load, nxv2i64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>; + defm : unpred_load< load, nxv8f16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_load< load, nxv8bf16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_load< load, nxv4f16, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>; + defm : unpred_load< load, nxv2f16, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>; + defm : unpred_load< load, nxv4f32, LD1W, LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>; + defm : unpred_load< load, nxv2f32, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>; + defm : unpred_load< load, nxv2f64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>; multiclass unpred_store_predicate { def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)), diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll index ab8818a1ad10..a9941221c4b8 100644 --- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll +++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll @@ -14,14 +14,13 @@ define @splice_nxv16i8_first_idx( %a, @splice_nxv16i8_last_idx( %a, @splice_nxv16i8_clamped_idx( %a, @splice_nxv8i16_first_idx( %a, @splice_nxv8i16_last_idx( %a, @splice_nxv8i16_clamped_idx( %a, @splice_nxv4i32_first_idx( %a, @splice_nxv4i32_last_idx( %a, @splice_nxv4i32_clamped_idx( %a, @splice_nxv2i64_first_idx( %a, @splice_nxv2i64_last_idx( %a, @splice_nxv2i64_clamped_idx( %a, @splice_nxv8f16_first_idx( %a, @splice_nxv8f16_last_idx( %a, @splice_nxv8f16_clamped_idx( %a, < ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: cnth x10 ; CHECK-NEXT: sub x10, x10, #1 // =1 -; CHECK-NEXT: mov w9, #8 -; CHECK-NEXT: cmp x10, #8 // =8 ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: mov w9, #8 +; CHECK-NEXT: cmp x10, #8 // =8 ; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl] -; CHECK-NEXT: add x8, x8, x9, lsl #1 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -358,14 +343,13 @@ define @splice_nxv4f32_first_idx( %a, < ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: cntw x9 ; CHECK-NEXT: sub x9, x9, #1 // =1 -; CHECK-NEXT: cmp x9, #0 // =0 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: csel x9, x9, xzr, lo +; CHECK-NEXT: cmp x9, #0 // =0 ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] -; CHECK-NEXT: add x8, x8, x9, lsl #2 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: csel x9, x9, xzr, lo +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -380,15 +364,14 @@ define @splice_nxv4f32_last_idx( %a, @splice_nxv4f32_clamped_idx( %a, ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: cntw x10 ; CHECK-NEXT: sub x10, x10, #1 // =1 -; CHECK-NEXT: mov w9, #4 -; CHECK-NEXT: cmp x10, #4 // =4 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: mov w9, #4 +; CHECK-NEXT: cmp x10, #4 // =4 ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] -; CHECK-NEXT: add x8, x8, x9, lsl #2 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -427,14 +409,13 @@ define @splice_nxv2f64_first_idx( %a, ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: cntd x9 ; CHECK-NEXT: sub x9, x9, #1 // =1 -; CHECK-NEXT: cmp x9, #0 // =0 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: csel x9, x9, xzr, lo +; CHECK-NEXT: cmp x9, #0 // =0 ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] -; CHECK-NEXT: add x8, x8, x9, lsl #3 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: csel x9, x9, xzr, lo +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -449,14 +430,13 @@ define @splice_nxv2f64_last_idx( %a, ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: cntd x9 ; CHECK-NEXT: sub x9, x9, #1 // =1 -; CHECK-NEXT: cmp x9, #1 // =1 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: csinc x9, x9, xzr, lo +; CHECK-NEXT: cmp x9, #1 // =1 ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] -; CHECK-NEXT: add x8, x8, x9, lsl #3 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: csinc x9, x9, xzr, lo +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -472,15 +452,14 @@ define @splice_nxv2f64_clamped_idx( % ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: cntd x10 ; CHECK-NEXT: sub x10, x10, #1 // =1 -; CHECK-NEXT: mov w9, #2 -; CHECK-NEXT: cmp x10, #2 // =2 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: mov w9, #2 +; CHECK-NEXT: cmp x10, #2 // =2 ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] -; CHECK-NEXT: add x8, x8, x9, lsl #3 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -495,17 +474,16 @@ define @splice_nxv2i1_idx( %a, @splice_nxv4i1_idx( %a, @splice_nxv8i1_idx( %a, @splice_nxv16i1_idx( %a, @splice_nxv2i8_idx( %a, @splice_nxv16f32_clamped_idx( ; CHECK-NEXT: addvl sp, sp, #-8 ; CHECK-NEXT: rdvl x10, #1 ; CHECK-NEXT: sub x10, x10, #1 // =1 -; CHECK-NEXT: mov w9, #16 -; CHECK-NEXT: cmp x10, #16 // =16 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: mov w9, #16 +; CHECK-NEXT: cmp x10, #16 // =16 ; CHECK-NEXT: st1w { z3.s }, p0, [x8, #3, mul vl] ; CHECK-NEXT: st1w { z2.s }, p0, [x8, #2, mul vl] ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] @@ -670,8 +643,9 @@ define @splice_nxv16f32_clamped_idx( ; CHECK-NEXT: st1w { z4.s }, p0, [x8, #4, mul vl] ; CHECK-NEXT: st1w { z5.s }, p0, [x8, #5, mul vl] ; CHECK-NEXT: st1w { z6.s }, p0, [x8, #6, mul vl] +; CHECK-NEXT: csel x9, x10, x9, lo +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2] ; CHECK-NEXT: add x8, x8, x9, lsl #2 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8, #2, mul vl] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8, #3, mul vl] @@ -696,8 +670,8 @@ define @splice_nxv16i8( %a, @splice_nxv16i8_1( %a, @splice_nxv16i1( %a, %t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; Check that vscale call is recognised by load/store reg/reg pattern and +; partially folded, with the rest pulled out of the loop. This requires LSR to +; be disabled, which is something that will be addressed at a later date. + +define void @ld1w_reg_loop([32000 x i32]* %addr) { +; CHECK-LABEL: ld1w_reg_loop: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: cntw x9 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: .LBB0_1: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: adds x8, x8, x9 +; CHECK-NEXT: b.ne .LBB0_1 +; CHECK-NEXT: // %bb.2: // %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %0 = call i64 @llvm.vscale.i64() + %1 = shl i64 %0, 2 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %2 = getelementptr inbounds [32000 x i32], [32000 x i32]* %addr, i64 0, i64 %index + %3 = bitcast i32* %2 to * + %load = load volatile , * %3, align 16 + %index.next = add i64 %index, %1 + %4 = icmp eq i64 %index.next, 0 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: + ret void +} + +define void @st1w_reg_loop([32000 x i32]* %addr, %val) { +; CHECK-LABEL: st1w_reg_loop: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: cntw x9 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: .LBB1_1: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; CHECK-NEXT: adds x8, x8, x9 +; CHECK-NEXT: b.ne .LBB1_1 +; CHECK-NEXT: // %bb.2: // %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %0 = call i64 @llvm.vscale.i64() + %1 = shl i64 %0, 2 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %2 = getelementptr inbounds [32000 x i32], [32000 x i32]* %addr, i64 0, i64 %index + %3 = bitcast i32* %2 to * + store volatile %val, * %3, align 16 + %index.next = add i64 %index, %1 + %4 = icmp eq i64 %index.next, 0 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: + ret void +} + +declare i64 @llvm.vscale.i64() diff --git a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll index 5cbcee7b85be..04f36bbf6f88 100644 --- a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll @@ -43,9 +43,9 @@ define @ld1b_upper_bound(* %a) { define @ld1b_out_of_upper_bound(* %a) { ; CHECK-LABEL: ld1b_out_of_upper_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: addvl x8, x0, #8 +; CHECK-NEXT: rdvl x8, #8 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; CHECK-NEXT: ret %base = getelementptr , * %a, i64 8 %load = load , * %base @@ -55,9 +55,9 @@ define @ld1b_out_of_upper_bound(* %a) { define @ld1b_out_of_lower_bound(* %a) { ; CHECK-LABEL: ld1b_out_of_lower_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: addvl x8, x0, #-9 +; CHECK-NEXT: rdvl x8, #-9 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; CHECK-NEXT: ret %base = getelementptr , * %a, i64 -9 %load = load , * %base @@ -138,4 +138,3 @@ define void @load_nxv12f16(* %a) { %val = load volatile , * %a ret void } - diff --git a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll new file mode 100644 index 000000000000..8c2b9eede634 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll @@ -0,0 +1,302 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; LD1B + +define @ld1_nxv16i8(i8* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i8, i8* %addr, i64 %off + %ptrcast = bitcast i8* %ptr to * + %val = load volatile , * %ptrcast + ret %val +} + +define @ld1_nxv8i16_zext8(i8* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv8i16_zext8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i8, i8* %addr, i64 %off + %ptrcast = bitcast i8* %ptr to * + %val = load volatile , * %ptrcast + %zext = zext %val to + ret %zext +} + +define @ld1_nxv4i32_zext8(i8* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv4i32_zext8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i8, i8* %addr, i64 %off + %ptrcast = bitcast i8* %ptr to * + %val = load volatile , * %ptrcast + %zext = zext %val to + ret %zext +} + +define @ld1_nxv2i64_zext8(i8* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv2i64_zext8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i8, i8* %addr, i64 %off + %ptrcast = bitcast i8* %ptr to * + %val = load volatile , * %ptrcast + %zext = zext %val to + ret %zext +} + +define @ld1_nxv8i16_sext8(i8* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv8i16_sext8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i8, i8* %addr, i64 %off + %ptrcast = bitcast i8* %ptr to * + %val = load volatile , * %ptrcast + %sext = sext %val to + ret %sext +} + +define @ld1_nxv4i32_sext8(i8* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv4i32_sext8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i8, i8* %addr, i64 %off + %ptrcast = bitcast i8* %ptr to * + %val = load volatile , * %ptrcast + %sext = sext %val to + ret %sext +} + +define @ld1_nxv2i64_sext8(i8* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv2i64_sext8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i8, i8* %addr, i64 %off + %ptrcast = bitcast i8* %ptr to * + %val = load volatile , * %ptrcast + %sext = sext %val to + ret %sext +} + +; LD1H + +define @ld1_nxv8i16(i16* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i16, i16* %addr, i64 %off + %ptrcast = bitcast i16* %ptr to * + %val = load volatile , * %ptrcast + ret %val +} + +define @ld1_nxv4i32_zext16(i16* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv4i32_zext16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i16, i16* %addr, i64 %off + %ptrcast = bitcast i16* %ptr to * + %val = load volatile , * %ptrcast + %zext = zext %val to + ret %zext +} + +define @ld1_nxv2i64_zext16(i16* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv2i64_zext16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i16, i16* %addr, i64 %off + %ptrcast = bitcast i16* %ptr to * + %val = load volatile , * %ptrcast + %zext = zext %val to + ret %zext +} + +define @ld1_nxv4i32_sext16(i16* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv4i32_sext16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i16, i16* %addr, i64 %off + %ptrcast = bitcast i16* %ptr to * + %val = load volatile , * %ptrcast + %sext = sext %val to + ret %sext +} + +define @ld1_nxv2i64_sext16(i16* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv2i64_sext16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i16, i16* %addr, i64 %off + %ptrcast = bitcast i16* %ptr to * + %val = load volatile , * %ptrcast + %sext = sext %val to + ret %sext +} + +define @ld1_nxv8f16(half* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds half, half* %addr, i64 %off + %ptrcast = bitcast half* %ptr to * + %val = load volatile , * %ptrcast + ret %val +} + +define @ld1_nxv8bf16(bfloat* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %off + %ptrcast = bitcast bfloat* %ptr to * + %val = load volatile , * %ptrcast + ret %val +} + +define @ld1_nxv4f16(half* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds half, half* %addr, i64 %off + %ptrcast = bitcast half* %ptr to * + %val = load volatile , * %ptrcast + ret %val +} + +define @ld1_nxv2f16(half* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds half, half* %addr, i64 %off + %ptrcast = bitcast half* %ptr to * + %val = load volatile , * %ptrcast + ret %val +} + +; LD1W + +define @ld1_nxv4i32(i32* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i32, i32* %addr, i64 %off + %ptrcast = bitcast i32* %ptr to * + %val = load volatile , * %ptrcast + ret %val +} + +define @ld1_nxv2i64_zext32(i32* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv2i64_zext32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i32, i32* %addr, i64 %off + %ptrcast = bitcast i32* %ptr to * + %val = load volatile , * %ptrcast + %zext = zext %val to + ret %zext +} + +define @ld1_nxv2i64_sext32(i32* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv2i64_sext32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i32, i32* %addr, i64 %off + %ptrcast = bitcast i32* %ptr to * + %val = load volatile , * %ptrcast + %sext = sext %val to + ret %sext +} + +define @ld1_nxv4f32(float* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds float, float* %addr, i64 %off + %ptrcast = bitcast float* %ptr to * + %val = load volatile , * %ptrcast + ret %val +} + +define @ld1_nxv2f32(float* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds float, float* %addr, i64 %off + %ptrcast = bitcast float* %ptr to * + %val = load volatile , * %ptrcast + ret %val +} + +; LD1D + +define @ld1_nxv2i64(i64* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i64, i64* %addr, i64 %off + %ptrcast = bitcast i64* %ptr to * + %val = load volatile , * %ptrcast + ret %val +} + +define @ld1_nxv2f64(double* %addr, i64 %off) { +; CHECK-LABEL: ld1_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds double, double* %addr, i64 %off + %ptrcast = bitcast double* %ptr to * + %val = load volatile , * %ptrcast + ret %val +} diff --git a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll index a477b0e20050..afb6ac325560 100644 --- a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll @@ -43,9 +43,9 @@ define void @st1b_upper_bound( %data, * %a) define void @st1b_out_of_upper_bound( %data, * %a) { ; CHECK-LABEL: st1b_out_of_upper_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: addvl x8, x0, #8 +; CHECK-NEXT: rdvl x8, #8 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x8] +; CHECK-NEXT: st1b { z0.b }, p0, [x0, x8] ; CHECK-NEXT: ret %base = getelementptr , * %a, i64 8 store %data, * %base @@ -55,9 +55,9 @@ define void @st1b_out_of_upper_bound( %data, %data, * %a) { ; CHECK-LABEL: st1b_out_of_lower_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: addvl x8, x0, #-9 +; CHECK-NEXT: rdvl x8, #-9 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x8] +; CHECK-NEXT: st1b { z0.b }, p0, [x0, x8] ; CHECK-NEXT: ret %base = getelementptr , * %a, i64 -9 store %data, * %base diff --git a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-reg.ll b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-reg.ll new file mode 100644 index 000000000000..6a974255e881 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-reg.ll @@ -0,0 +1,224 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; ST1B + +define void @st1_nxv16i8(i8* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0, x1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i8, i8* %addr, i64 %off + %ptrcast = bitcast i8* %ptr to * + store %val, * %ptrcast + ret void +} + +define void @st1_nxv8i16_trunc8(i8* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv8i16_trunc8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: st1b { z0.h }, p0, [x0, x1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i8, i8* %addr, i64 %off + %ptrcast = bitcast i8* %ptr to * + %trunc = trunc %val to + store %trunc, * %ptrcast + ret void +} + +define void @st1_nxv4i32_trunc8(i8* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv4i32_trunc8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i8, i8* %addr, i64 %off + %ptrcast = bitcast i8* %ptr to * + %trunc = trunc %val to + store %trunc, * %ptrcast + ret void +} + +define void @st1_nxv2i64_trunc8(i8* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv2i64_trunc8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i8, i8* %addr, i64 %off + %ptrcast = bitcast i8* %ptr to * + %trunc = trunc %val to + store %trunc, * %ptrcast + ret void +} + +; ST1H + +define void @st1_nxv8i16(i16* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i16, i16* %addr, i64 %off + %ptrcast = bitcast i16* %ptr to * + store %val, * %ptrcast + ret void +} + +define void @st1_nxv4i32_trunc16(i16* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv4i32_trunc16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i16, i16* %addr, i64 %off + %ptrcast = bitcast i16* %ptr to * + %trunc = trunc %val to + store %trunc, * %ptrcast + ret void +} + +define void @st1_nxv2i64_trunc16(i16* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv2i64_trunc16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i16, i16* %addr, i64 %off + %ptrcast = bitcast i16* %ptr to * + %trunc = trunc %val to + store %trunc, * %ptrcast + ret void +} + +define void @st1_nxv8f16(half* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds half, half* %addr, i64 %off + %ptrcast = bitcast half* %ptr to * + store %val, * %ptrcast + ret void +} + +define void @st1_nxv8bf16(bfloat* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %off + %ptrcast = bitcast bfloat* %ptr to * + store %val, * %ptrcast + ret void +} + +define void @st1_nxv4f16(half* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds half, half* %addr, i64 %off + %ptrcast = bitcast half* %ptr to * + store %val, * %ptrcast + ret void +} + +define void @st1_nxv2f16(half* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds half, half* %addr, i64 %off + %ptrcast = bitcast half* %ptr to * + store %val, * %ptrcast + ret void +} + +; ST1W + +define void @st1_nxv4i32(i32* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i32, i32* %addr, i64 %off + %ptrcast = bitcast i32* %ptr to * + store %val, * %ptrcast + ret void +} + +define void @st1_nxv2i64_trunc32(i32* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv2i64_trunc32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i32, i32* %addr, i64 %off + %ptrcast = bitcast i32* %ptr to * + %trunc = trunc %val to + store %trunc, * %ptrcast + ret void +} + +define void @st1_nxv4f32(float* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds float, float* %addr, i64 %off + %ptrcast = bitcast float* %ptr to * + store %val, * %ptrcast + ret void +} + +define void @st1_nxv2f32(float* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds float, float* %addr, i64 %off + %ptrcast = bitcast float* %ptr to * + store %val, * %ptrcast + ret void +} + +; ST1D + +define void @st1_nxv2i64(i64* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds i64, i64* %addr, i64 %off + %ptrcast = bitcast i64* %ptr to * + store %val, * %ptrcast + ret void +} + +define void @st1_nxv2f64(double* %addr, i64 %off, %val) { +; CHECK-LABEL: st1_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %ptr = getelementptr inbounds double, double* %addr, i64 %off + %ptrcast = bitcast double* %ptr to * + store %val, * %ptrcast + ret void +}