From 354604a2a7149b5efd52134efa4765cf8c32e386 Mon Sep 17 00:00:00 2001
From: Bradley Smith
Date: Fri, 23 Apr 2021 16:34:26 +0100
Subject: [PATCH] [AArch64][SVE] Use SIMD variant of INSR when scalar is the
 result of a vector extract

At the intrinsic layer the sve.insr operation takes a scalar. When this
scalar is an integer we are forcing a data transition between GPRs and
ZPRs that is potentially costly.

Often the integer scalar is the result of a vector extract, for example
when performing a reduction. In such cases we should keep all data
within the ZPRs.

Co-authored-by: Paul Walker

Differential Revision: https://reviews.llvm.org/D101169
---
 llvm/lib/Target/AArch64/SVEInstrFormats.td | 35 +++++++++++-----
 llvm/test/CodeGen/AArch64/sve-insr.ll      | 49 ++++++++++++++++++++++
 2 files changed, 74 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-insr.ll

diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 37a22d3a16aa..4b6a9819338c 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -1292,8 +1292,8 @@ multiclass sve_int_perm_insrs<string asm, SDPatternOperator op> {
 }
 
 class sve_int_perm_insrv<bits<2> sz8_64, string asm, ZPRRegOp zprty,
-                         RegisterClass srcRegType>
-: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Vm),
+                         FPRasZPROperand srcOpType>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcOpType:$Vm),
   asm, "\t$Zdn, $Vm",
   "",
   []>, Sched<[]> {
@@ -1310,16 +1310,31 @@ class sve_int_perm_insrv<bits<2> sz8_64, string asm, ZPRRegOp zprty,
 }
 
 multiclass sve_int_perm_insrv<string asm, SDPatternOperator op> {
-  def _B : sve_int_perm_insrv<0b00, asm, ZPR8, FPR8>;
-  def _H : sve_int_perm_insrv<0b01, asm, ZPR16, FPR16>;
-  def _S : sve_int_perm_insrv<0b10, asm, ZPR32, FPR32>;
-  def _D : sve_int_perm_insrv<0b11, asm, ZPR64, FPR64>;
+  def _B : sve_int_perm_insrv<0b00, asm, ZPR8, FPR8asZPR>;
+  def _H : sve_int_perm_insrv<0b01, asm, ZPR16, FPR16asZPR>;
+  def _S : sve_int_perm_insrv<0b10, asm, ZPR32, FPR32asZPR>;
+  def _D : sve_int_perm_insrv<0b11, asm, ZPR64, FPR64asZPR>;
 
-  def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, f16, !cast<Instruction>(NAME # _H)>;
-  def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, f32, !cast<Instruction>(NAME # _S)>;
-  def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, f64, !cast<Instruction>(NAME # _D)>;
+  def : Pat<(nxv8f16 (op nxv8f16:$Zn, f16:$Vm)),
+            (!cast<Instruction>(NAME # _H) $Zn, (INSERT_SUBREG (IMPLICIT_DEF), $Vm, hsub))>;
+  def : Pat<(nxv4f32 (op nxv4f32:$Zn, f32:$Vm)),
+            (!cast<Instruction>(NAME # _S) $Zn, (INSERT_SUBREG (IMPLICIT_DEF), $Vm, ssub))>;
+  def : Pat<(nxv2f64 (op nxv2f64:$Zn, f64:$Vm)),
+            (!cast<Instruction>(NAME # _D) $Zn, (INSERT_SUBREG (IMPLICIT_DEF), $Vm, dsub))>;
+
+  def : Pat<(nxv8bf16 (op nxv8bf16:$Zn, bf16:$Vm)),
+            (!cast<Instruction>(NAME # _H) $Zn, (INSERT_SUBREG (IMPLICIT_DEF), $Vm, hsub))>;
+
+  // Keep integer insertions within the vector unit.
+  def : Pat<(nxv16i8 (op (nxv16i8 ZPR:$Zn), (i32 (vector_extract (nxv16i8 ZPR:$Vm), 0)))),
+            (!cast<Instruction>(NAME # _B) $Zn, ZPR:$Vm)>;
+  def : Pat<(nxv8i16 (op (nxv8i16 ZPR:$Zn), (i32 (vector_extract (nxv8i16 ZPR:$Vm), 0)))),
+            (!cast<Instruction>(NAME # _H) $Zn, ZPR:$Vm)>;
+  def : Pat<(nxv4i32 (op (nxv4i32 ZPR:$Zn), (i32 (vector_extract (nxv4i32 ZPR:$Vm), 0)))),
+            (!cast<Instruction>(NAME # _S) $Zn, ZPR:$Vm)>;
+  def : Pat<(nxv2i64 (op (nxv2i64 ZPR:$Zn), (i64 (vector_extract (nxv2i64 ZPR:$Vm), 0)))),
+            (!cast<Instruction>(NAME # _D) $Zn, ZPR:$Vm)>;
 
-  def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, bf16, !cast<Instruction>(NAME # _H)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve-insr.ll b/llvm/test/CodeGen/AArch64/sve-insr.ll
new file mode 100644
index 000000000000..c1895d2f2cea
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-insr.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+define <vscale x 16 x i8> @insr_zpr_only_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: insr_zpr_only_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    insr z0.b, b1
+; CHECK-NEXT:    ret
+  %t0 = extractelement <vscale x 16 x i8> %b, i64 0
+  %t1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.insr.nxv16i8(<vscale x 16 x i8> %a, i8 %t0)
+  ret <vscale x 16 x i8> %t1
+}
+
+define <vscale x 8 x i16> @insr_zpr_only_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: insr_zpr_only_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    insr z0.h, h1
+; CHECK-NEXT:    ret
+  %t0 = extractelement <vscale x 8 x i16> %b, i64 0
+  %t1 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.insr.nxv8i16(<vscale x 8 x i16> %a, i16 %t0)
+  ret <vscale x 8 x i16> %t1
+}
+
+define <vscale x 4 x i32> @insr_zpr_only_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: insr_zpr_only_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    insr z0.s, s1
+; CHECK-NEXT:    ret
+  %t0 = extractelement <vscale x 4 x i32> %b, i64 0
+  %t1 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.insr.nxv4i32(<vscale x 4 x i32> %a, i32 %t0)
+  ret <vscale x 4 x i32> %t1
+}
+
+define <vscale x 2 x i64> @insr_zpr_only_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: insr_zpr_only_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    insr z0.d, d1
+; CHECK-NEXT:    ret
+  %t0 = extractelement <vscale x 2 x i64> %b, i64 0
+  %t1 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.insr.nxv2i64(<vscale x 2 x i64> %a, i64 %t0)
+  ret <vscale x 2 x i64> %t1
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.insr.nxv16i8(<vscale x 16 x i8>, i8)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.insr.nxv8i16(<vscale x 8 x i16>, i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.insr.nxv4i32(<vscale x 4 x i32>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.insr.nxv2i64(<vscale x 2 x i64>, i64)
+
+attributes #0 = { "target-features"="+sve" }
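
As an illustration of the kind of IR the new integer patterns target (this sketch is
not part of the patch; the function name @insr_from_lane0 is hypothetical, and the
codegen noted in the comments is an assumption based on the tests above):

; The i32 feeding sve.insr is a lane-0 vector extract, so with these patterns the
; backend can keep the value in the SIMD&FP/SVE register file and select the
; SIMD&FP form "insr z0.s, s1", rather than first moving the scalar into a GPR
; (previously something like "fmov w8, s1" followed by "insr z0.s, w8").
define <vscale x 4 x i32> @insr_from_lane0(<vscale x 4 x i32> %acc, <vscale x 4 x i32> %v) "target-features"="+sve" {
  %lane0 = extractelement <vscale x 4 x i32> %v, i64 0
  %res = tail call <vscale x 4 x i32> @llvm.aarch64.sve.insr.nxv4i32(<vscale x 4 x i32> %acc, i32 %lane0)
  ret <vscale x 4 x i32> %res
}

declare <vscale x 4 x i32> @llvm.aarch64.sve.insr.nxv4i32(<vscale x 4 x i32>, i32)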