[SVE][CodeGen] Improve codegen for some FP insert_subvector cases

When inserting an unpacked FP subvector into a packed vector, we can simply cast the unpacked value to a packed value, since both types are legal for SVE. We can then use the result as the input to the UZP instruction. This avoids expanding the operation by going through the stack. Differential Revision: https://reviews.llvm.org/D113270
2021-11-05 11:42:51 +00:00 · 2021-11-05 11:42:51 +00:00 · 8d38c24fb6
parent 438437cbb6
commit 8d38c24fb6
2 changed files with 15 additions and 12 deletions
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -10912,7 +10912,7 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
    SDLoc DL(Op);
    EVT VT = Op.getValueType();

-    if (!isTypeLegal(VT) || !VT.isInteger())
+    if (!isTypeLegal(VT))
      return SDValue();

    SDValue Vec0 = Op.getOperand(0);
@ -10922,9 +10922,19 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
    if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
      return SDValue();

-    // Extend elements of smaller vector...
-    EVT WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext()));
-    SDValue ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
+    EVT WideVT;
+    SDValue ExtVec;
+
+    if (VT.isFloatingPoint()) {
+      // The InVT type should be legal. We can safely cast the unpacked
+      // subvector from InVT -> VT.
+      WideVT = VT;
+      ExtVec = getSVESafeBitCast(VT, Vec1, DAG);
+    } else {
+      // Extend elements of smaller vector...
+      WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext()));
+      ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
+    }

    if (Idx == 0) {
      SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@ -424,14 +424,7 @@ define <vscale x 3 x i32> @insert_nxv3i32_nxv2i32_2(<vscale x 3 x i32> %sv0, <vs
 define <vscale x 3 x float> @insert_nxv3f32_nxv2f32(<vscale x 2 x float> %sv0) nounwind {
 ; CHECK-LABEL: insert_nxv3f32_nxv2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    st1w { z0.d }, p0, [sp]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    ret
  %v0 = call <vscale x 3 x float> @llvm.experimental.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float> undef, <vscale x 2 x float> %sv0, i64 0)
 ret <vscale x 3 x float> %v0