[llvm][Aarch64][SVE] Remove extra fmov instruction with certain literals

When a literal that cannot fit in the immediate form of the fmov instruction is used to initialise an SVE vector, an extra unnecessary fmov is currently generated. This patch adds an extra codegen pattern preventing the extra instruction from being generated.

Differential Revision: https://reviews.llvm.org/D96700

Co-Authored-By: Paul Walker <paul.walker@arm.com>
2021-02-16 14:15:28 +00:00 · 2021-02-16 14:15:28 +00:00 · e86f9ba15c
parent ed86328515
commit e86f9ba15c
3 changed files with 60 additions and 0 deletions
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -553,6 +553,14 @@ let Predicates = [HasSVE] in {
  def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm i32:$a, i32:$b)))),
            (DUP_ZI_D $a, $b)>;

+  // Duplicate an arbitrary FP immediate into all vector elements via a GPR.
+  def : Pat<(nxv2f32 (AArch64dup (f32 fpimm:$val))),
+            (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>;
+  def : Pat<(nxv4f32 (AArch64dup (f32 fpimm:$val))),
+            (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>;
+  def : Pat<(nxv2f64 (AArch64dup (f64 fpimm:$val))),
+            (DUP_ZR_D (MOVi64imm (bitcast_fpimm_to_i64 f64:$val)))>;
+
  // Duplicate FP immediate into all vector elements
  let AddedComplexity = 2 in {
    def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)),
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
@@ -130,12 +130,37 @@ define <vscale x 2 x double> @dup_imm_f64(double %b) {
  ret <vscale x 2 x double> %out
 }

+define <vscale x 2 x float> @dup_fmov_imm_f32_2() {
+; CHECK-LABEL: dup_fmov_imm_f32_2:
+; CHECK: mov w8, #1109917696
+; CHECK-NEXT: mov z0.s, w8
+  %out = tail call <vscale x 2 x float> @llvm.aarch64.sve.dup.x.nxv2f32(float 4.200000e+01) ; 42.0f has bit pattern 0x42280000 (1109917696), the integer moved into w8 above
+  ret <vscale x 2 x float> %out
+}
+
+define <vscale x 4 x float> @dup_fmov_imm_f32_4() {
+; CHECK-LABEL: dup_fmov_imm_f32_4:
+; CHECK: mov w8, #1109917696
+; CHECK-NEXT: mov z0.s, w8
+  %out = tail call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 4.200000e+01) ; 42.0f has bit pattern 0x42280000 (1109917696), the integer moved into w8 above
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @dup_fmov_imm_f64_2() {
+; CHECK-LABEL: dup_fmov_imm_f64_2:
+; CHECK: mov x8, #4631107791820423168
+; CHECK-NEXT: mov z0.d, x8
+  %out = tail call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 4.200000e+01) ; 42.0 has bit pattern 0x4045000000000000 (4631107791820423168), the integer moved into x8 above
+  ret <vscale x 2 x double> %out
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8( i8)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat)
+declare <vscale x 2 x float> @llvm.aarch64.sve.dup.x.nxv2f32(float)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)

--- a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
@@ -372,5 +372,32 @@ define <vscale x 4 x float> @splat_nxv4f32_fold(<vscale x 4 x float> %x) {
  ret <vscale x 4 x float> %r
 }

+define <vscale x 2 x float> @splat_nxv2f32_fmov_fold() {
+; CHECK-LABEL: splat_nxv2f32_fmov_fold:
+; CHECK: mov w8, #1109917696
+; CHECK-NEXT: mov z0.s, w8
+  %1 = insertelement <vscale x 2 x float> undef, float 4.200000e+01, i32 0 ; 42.0f has bit pattern 0x42280000 (1109917696), the integer moved into w8 above
+  %2 = shufflevector <vscale x 2 x float> %1, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x float> %2
+}
+
+define <vscale x 4 x float> @splat_nxv4f32_fmov_fold() {
+; CHECK-LABEL: splat_nxv4f32_fmov_fold:
+; CHECK: mov w8, #1109917696
+; CHECK-NEXT: mov z0.s, w8
+  %1 = insertelement <vscale x 4 x float> undef, float 4.200000e+01, i32 0 ; 42.0f has bit pattern 0x42280000 (1109917696), the integer moved into w8 above
+  %2 = shufflevector <vscale x 4 x float> %1, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x float> %2
+}
+
+define <vscale x 2 x double> @splat_nxv2f64_fmov_fold() {
+; CHECK-LABEL: splat_nxv2f64_fmov_fold:
+; CHECK: mov x8, #4631107791820423168
+; CHECK-NEXT: mov z0.d, x8
+  %1 = insertelement <vscale x 2 x double> undef, double 4.200000e+01, i32 0 ; 42.0 has bit pattern 0x4045000000000000 (4631107791820423168), the integer moved into x8 above
+  %2 = shufflevector <vscale x 2 x double> %1, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x double> %2
+}
+
 ; +bf16 is required for the bfloat version.
 attributes #0 = { "target-features"="+sve,+bf16" }