[llvm][AArch64][SVE] Remove extra fmov instruction with certain literals

When a literal that cannot fit in the immediate form of the fmov instruction
is used to initialise an SVE vector, an unnecessary extra fmov is currently
generated. This patch adds codegen patterns that move the literal's bit
pattern into a general-purpose register and duplicate it from there, so the
extra instruction is no longer generated.

Differential Revision: https://reviews.llvm.org/D96700

Co-Authored-By: Paul Walker <paul.walker@arm.com>
This commit is contained in:
David Truby 2021-02-16 14:15:28 +00:00
parent ed86328515
commit e86f9ba15c
3 changed files with 60 additions and 0 deletions

View File

@ -553,6 +553,14 @@ let Predicates = [HasSVE] in {
// Splat of an i64 value accepted by SVE8BitLslImm: the two i32 operands it
// produces ($a, $b) are forwarded unchanged to DUP_ZI_D.
// NOTE(review): $a/$b are presumably the 8-bit immediate and its left-shift
// amount -- confirm against the SVE8BitLslImm definition.
def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm i32:$a, i32:$b)))),
(DUP_ZI_D $a, $b)>;
// Duplicate an FP immediate into all vector elements via a general-purpose
// register: the immediate is bitcast to an integer of the same width,
// materialised with MOVi32imm / MOVi64imm, and then splatted with the
// GPR-source DUP (DUP_ZR_S for f32 elements, DUP_ZR_D for f64 elements).
// NOTE(review): these match any fpimm; the AddedComplexity = 2 patterns that
// follow presumably take priority for immediates they can encode directly --
// confirm.

// f32 splat into an unpacked nxv2f32 vector.
def : Pat<(nxv2f32 (AArch64dup (f32 fpimm:$val))),
(DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>;
// f32 splat into a packed nxv4f32 vector (same instruction sequence).
def : Pat<(nxv4f32 (AArch64dup (f32 fpimm:$val))),
(DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>;
// f64 splat into an nxv2f64 vector, using the 64-bit move and DUP.
def : Pat<(nxv2f64 (AArch64dup (f64 fpimm:$val))),
(DUP_ZR_D (MOVi64imm (bitcast_fpimm_to_i64 f64:$val)))>;
// Duplicate FP immediate into all vector elements
let AddedComplexity = 2 in {
def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)),

View File

@ -130,12 +130,37 @@ define <vscale x 2 x double> @dup_imm_f64(double %b) {
ret <vscale x 2 x double> %out
}
; Splat the constant 42.0 (bit pattern 0x42280000 == 1109917696) into an
; nxv2f32 vector through the dup.x intrinsic. The expected code moves the raw
; bits into w8 and duplicates from that register; no fmov should appear.
define <vscale x 2 x float> @dup_fmov_imm_f32_2() {
; CHECK-LABEL: dup_fmov_imm_f32_2:
; CHECK: mov w8, #1109917696
; CHECK-NEXT: mov z0.s, w8
  %splat = tail call <vscale x 2 x float> @llvm.aarch64.sve.dup.x.nxv2f32(float 4.200000e+01)
  ret <vscale x 2 x float> %splat
}
; Same constant (42.0, bit pattern 0x42280000 == 1109917696) splatted into a
; packed nxv4f32 vector through the dup.x intrinsic. The expected code is a
; GPR move of the raw bits followed by a duplicate from w8.
define <vscale x 4 x float> @dup_fmov_imm_f32_4() {
; CHECK-LABEL: dup_fmov_imm_f32_4:
; CHECK: mov w8, #1109917696
; CHECK-NEXT: mov z0.s, w8
  %splat = tail call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 4.200000e+01)
  ret <vscale x 4 x float> %splat
}
; Double-precision variant: 42.0 has the bit pattern 0x4045000000000000
; (== 4631107791820423168). The expected code moves those bits into x8 and
; duplicates them into every 64-bit lane of the nxv2f64 result.
define <vscale x 2 x double> @dup_fmov_imm_f64_2() {
; CHECK-LABEL: dup_fmov_imm_f64_2:
; CHECK: mov x8, #4631107791820423168
; CHECK-NEXT: mov z0.d, x8
  %splat = tail call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 4.200000e+01)
  ret <vscale x 2 x double> %splat
}
; Overloads of the llvm.aarch64.sve.dup.x intrinsic. Each takes one scalar
; operand and returns a scalable vector whose element type matches that scalar.
declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8( i8)
declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
declare <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat)
declare <vscale x 2 x float> @llvm.aarch64.sve.dup.x.nxv2f32(float)
declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)

View File

@ -372,5 +372,32 @@ define <vscale x 4 x float> @splat_nxv4f32_fold(<vscale x 4 x float> %x) {
ret <vscale x 4 x float> %r
}
; Splat of the constant 42.0 (bit pattern 0x42280000 == 1109917696) written
; as insertelement + shufflevector with an all-zero mask, producing an
; nxv2f32 vector. The expected code moves the raw bits into w8 and duplicates
; from that register; no fmov should appear.
; The label check ends in ':' so it anchors on the function's label line
; rather than on any earlier mention of the symbol name.
define <vscale x 2 x float> @splat_nxv2f32_fmov_fold() {
; CHECK-LABEL: splat_nxv2f32_fmov_fold:
; CHECK: mov w8, #1109917696
; CHECK-NEXT: mov z0.s, w8
  %ins = insertelement <vscale x 2 x float> undef, float 4.200000e+01, i32 0
  %splat = shufflevector <vscale x 2 x float> %ins, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x float> %splat
}
; Splat of the constant 42.0 (bit pattern 0x42280000 == 1109917696) written
; as insertelement + shufflevector with an all-zero mask, producing a packed
; nxv4f32 vector. The expected code is a GPR move of the raw bits followed by
; a duplicate from w8.
; The label check ends in ':' so it anchors on the function's label line
; rather than on any earlier mention of the symbol name.
define <vscale x 4 x float> @splat_nxv4f32_fmov_fold() {
; CHECK-LABEL: splat_nxv4f32_fmov_fold:
; CHECK: mov w8, #1109917696
; CHECK-NEXT: mov z0.s, w8
  %ins = insertelement <vscale x 4 x float> undef, float 4.200000e+01, i32 0
  %splat = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x float> %splat
}
; Double-precision splat of 42.0 (bit pattern 0x4045000000000000 ==
; 4631107791820423168) written as insertelement + shufflevector with an
; all-zero mask. The expected code moves the raw bits into x8 and duplicates
; them into every 64-bit lane of the nxv2f64 result.
; The label check ends in ':' so it anchors on the function's label line
; rather than on any earlier mention of the symbol name.
define <vscale x 2 x double> @splat_nxv2f64_fmov_fold() {
; CHECK-LABEL: splat_nxv2f64_fmov_fold:
; CHECK: mov x8, #4631107791820423168
; CHECK-NEXT: mov z0.d, x8
  %ins = insertelement <vscale x 2 x double> undef, double 4.200000e+01, i32 0
  %splat = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x double> %splat
}
; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }