[CodeGen] Add support for inserting elements into scalable vectors
Summary:

This patch tries to ensure that we do something sensible when generating code for the ISD::INSERT_VECTOR_ELT DAG node when operating on scalable vectors. Previously we always returned 'undef' when inserting an element at an out-of-bounds lane index, whereas now we only do this for fixed-length vectors. For scalable vectors it is assumed that the backend will do the right thing, in the same way that we already have to deal with variable lane indices.

In this patch I have permitted a few basic combinations for scalable vector types where it makes sense, but in general avoided most cases for now as they currently require the use of BUILD_VECTOR nodes.

This patch includes tests for all scalable vector types when inserting into lane 0, but I've only included one or two vector types for other cases such as variable lane inserts.

Differential Revision: https://reviews.llvm.org/D78992
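To make the change concrete, here is a small illustrative IR example (not part of the patch; the function name is made up). For a fixed-length vector, an insert at a constant lane known to be out of range still folds to undef, but for a scalable vector the lane count is a run-time multiple of vscale, so the insert is kept and the backend (SVE here) selects the lane with a predicate instead:

; Illustrative only. Lane 5 of <vscale x 4 x i32> is out of range when
; vscale == 1 but in range when vscale >= 2, so the node must survive
; to instruction selection rather than being folded to undef.
define <vscale x 4 x i32> @insert_lane5(<vscale x 4 x i32> %v, i32 %val) {
  %r = insertelement <vscale x 4 x i32> %v, i32 %val, i32 5
  ret <vscale x 4 x i32> %r
}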
This commit is contained in:
parent 7a3be975b9
commit 058cd8c5be
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

@@ -17134,11 +17134,11 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDLoc DL(N);
 
   EVT VT = InVec.getValueType();
-  unsigned NumElts = VT.getVectorNumElements();
   auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
 
   // Insert into out-of-bounds element is undefined.
-  if (IndexC && IndexC->getZExtValue() >= VT.getVectorNumElements())
+  if (IndexC && VT.isFixedLengthVector() &&
+      IndexC->getZExtValue() >= VT.getVectorNumElements())
     return DAG.getUNDEF(VT);
 
   // Remove redundant insertions:
@@ -17151,12 +17151,21 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
     // If this is variable insert to undef vector, it might be better to splat:
     // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
     if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) {
-      SmallVector<SDValue, 8> Ops(NumElts, InVal);
-      return DAG.getBuildVector(VT, DL, Ops);
+      if (VT.isScalableVector())
+        return DAG.getSplatVector(VT, DL, InVal);
+      else {
+        SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), InVal);
+        return DAG.getBuildVector(VT, DL, Ops);
+      }
     }
     return SDValue();
   }
 
+  if (VT.isScalableVector())
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+
   // We must know which element is being inserted for folds below here.
   unsigned Elt = IndexC->getZExtValue();
   if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
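The getSplatVector path above covers one specific case. As a hedged IR sketch (not from the patch; names are illustrative), when the base vector is undef and the lane index is variable, every lane other than the inserted one is undef anyway, so splatting the value into all lanes is a valid refinement; for scalable types the combine now emits a SPLAT_VECTOR instead of a BUILD_VECTOR, since the element count is not a compile-time constant. This only fires for targets that opt in via TLI.shouldSplatInsEltVarIndex.

; Illustrative only: variable-index insert into an undef scalable vector,
; which the combine may turn into a splat of %val.
define <vscale x 4 x i32> @splat_of_insert_into_undef(i32 %val, i32 %idx) {
  %ins = insertelement <vscale x 4 x i32> undef, i32 %val, i32 %idx
  ret <vscale x 4 x i32> %ins
}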
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

@@ -5615,8 +5615,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     llvm_unreachable("should use getVectorShuffle constructor!");
   case ISD::INSERT_VECTOR_ELT: {
     ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N3);
-    // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF
-    if (N3C && N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
+    // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF, except
+    // for scalable vectors where we will generate appropriate code to
+    // deal with out-of-bounds cases correctly.
+    if (N3C && N1.getValueType().isFixedLengthVector() &&
+        N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
       return getUNDEF(VT);
 
     // Undefined index can be assumed out-of-bounds, so that's UNDEF too.
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

@@ -1767,6 +1767,77 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
  // 16-element contiguous store
  defm : st1<ST1B, ST1B_IMM, nxv16i8, AArch64st1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;

  def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)),
            (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
  def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)),
            (INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
  def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)),
            (INSERT_SUBREG (nxv4i32 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
  def : Pat<(nxv2i64 (vector_insert (nxv2i64 (undef)), (i64 FPR64:$src), 0)),
            (INSERT_SUBREG (nxv2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;

  // Insert scalar into vector[0]
  def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), (i32 GPR32:$src), 0)),
            (CPY_ZPmR_B ZPR:$vec, (PTRUE_B 1), GPR32:$src)>;
  def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), (i32 GPR32:$src), 0)),
            (CPY_ZPmR_H ZPR:$vec, (PTRUE_H 1), GPR32:$src)>;
  def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), (i32 GPR32:$src), 0)),
            (CPY_ZPmR_S ZPR:$vec, (PTRUE_S 1), GPR32:$src)>;
  def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), (i64 GPR64:$src), 0)),
            (CPY_ZPmR_D ZPR:$vec, (PTRUE_D 1), GPR64:$src)>;

  def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), 0)),
            (SEL_ZPZZ_H (PTRUE_H 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), ZPR:$vec)>;
  def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), 0)),
            (SEL_ZPZZ_S (PTRUE_S 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), ZPR:$vec)>;
  def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), 0)),
            (SEL_ZPZZ_D (PTRUE_D 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), ZPR:$vec)>;

  // Insert scalar into vector with scalar index
  def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), GPR32:$src, GPR64:$index)),
            (CPY_ZPmR_B ZPR:$vec,
                        (CMPEQ_PPzZZ_B (PTRUE_B 31),
                                       (INDEX_II_B 0, 1),
                                       (DUP_ZR_B (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                        GPR32:$src)>;
  def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), GPR32:$src, GPR64:$index)),
            (CPY_ZPmR_H ZPR:$vec,
                        (CMPEQ_PPzZZ_H (PTRUE_H 31),
                                       (INDEX_II_H 0, 1),
                                       (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                        GPR32:$src)>;
  def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), GPR32:$src, GPR64:$index)),
            (CPY_ZPmR_S ZPR:$vec,
                        (CMPEQ_PPzZZ_S (PTRUE_S 31),
                                       (INDEX_II_S 0, 1),
                                       (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                        GPR32:$src)>;
  def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), GPR64:$src, GPR64:$index)),
            (CPY_ZPmR_D ZPR:$vec,
                        (CMPEQ_PPzZZ_D (PTRUE_D 31),
                                       (INDEX_II_D 0, 1),
                                       (DUP_ZR_D GPR64:$index)),
                        GPR64:$src)>;

  // Insert FP scalar into vector with scalar index
  def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)),
            (CPY_ZPmV_H ZPR:$vec,
                        (CMPEQ_PPzZZ_H (PTRUE_H 31),
                                       (INDEX_II_H 0, 1),
                                       (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                        $src)>;
  def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)),
            (CPY_ZPmV_S ZPR:$vec,
                        (CMPEQ_PPzZZ_S (PTRUE_S 31),
                                       (INDEX_II_S 0, 1),
                                       (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                        $src)>;
  def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), GPR64:$index)),
            (CPY_ZPmV_D ZPR:$vec,
                        (CMPEQ_PPzZZ_D (PTRUE_D 31),
                                       (INDEX_II_D 0, 1),
                                       (DUP_ZR_D GPR64:$index)),
                        $src)>;
}

let Predicates = [HasSVE, HasMatMulInt8] in {
@@ -0,0 +1,135 @@
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s

define <vscale x 16 x i8> @test_lane0_16xi8(<vscale x 16 x i8> %a) {
; CHECK-LABEL: test_lane0_16xi8
; CHECK: mov [[REG:.*]], #30
; CHECK: mov z0.b, p{{[0-7]}}/m, [[REG]]
  %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 0
  ret <vscale x 16 x i8> %b
}

define <vscale x 8 x i16> @test_lane0_8xi16(<vscale x 8 x i16> %a) {
; CHECK-LABEL: test_lane0_8xi16
; CHECK: mov [[REG:.*]], #30
; CHECK: mov z0.h, p{{[0-7]}}/m, [[REG]]
  %b = insertelement <vscale x 8 x i16> %a, i16 30, i32 0
  ret <vscale x 8 x i16> %b
}

define <vscale x 4 x i32> @test_lane0_4xi32(<vscale x 4 x i32> %a) {
; CHECK-LABEL: test_lane0_4xi32
; CHECK: mov [[REG:.*]], #30
; CHECK: mov z0.s, p{{[0-7]}}/m, [[REG]]
  %b = insertelement <vscale x 4 x i32> %a, i32 30, i32 0
  ret <vscale x 4 x i32> %b
}

define <vscale x 2 x i64> @test_lane0_2xi64(<vscale x 2 x i64> %a) {
; CHECK-LABEL: test_lane0_2xi64
; CHECK: mov w[[REG:.*]], #30
; CHECK: mov z0.d, p{{[0-7]}}/m, x[[REG]]
  %b = insertelement <vscale x 2 x i64> %a, i64 30, i32 0
  ret <vscale x 2 x i64> %b
}

define <vscale x 2 x double> @test_lane0_2xf64(<vscale x 2 x double> %a) {
; CHECK-LABEL: test_lane0_2xf64
; CHECK: fmov d[[REG:[0-9]+]], #1.00000000
; CHECK: mov z0.d, p{{[0-7]}}/m, z[[REG]].d
  %b = insertelement <vscale x 2 x double> %a, double 1.0, i32 0
  ret <vscale x 2 x double> %b
}

define <vscale x 4 x float> @test_lane0_4xf32(<vscale x 4 x float> %a) {
; CHECK-LABEL: test_lane0_4xf32
; CHECK: fmov s[[REG:[0-9]+]], #1.00000000
; CHECK: mov z0.s, p{{[0-7]}}/m, z[[REG]].s
  %b = insertelement <vscale x 4 x float> %a, float 1.0, i32 0
  ret <vscale x 4 x float> %b
}

define <vscale x 8 x half> @test_lane0_8xf16(<vscale x 8 x half> %a) {
; CHECK-LABEL: test_lane0_8xf16
; CHECK: fmov h[[REG:[0-9]+]], #1.00000000
; CHECK: mov z0.h, p{{[0-7]}}/m, z[[REG]].h
  %b = insertelement <vscale x 8 x half> %a, half 1.0, i32 0
  ret <vscale x 8 x half> %b
}

; Undefined lane insert
define <vscale x 2 x i64> @test_lane4_2xi64(<vscale x 2 x i64> %a) {
; CHECK-LABEL: test_lane4_2xi64
; CHECK: mov w[[IDXREG:.*]], #4
; CHECK: index z[[CMPVEC:[0-9]+]].d, #0, #1
; CHECK: mov z[[IDXVEC:[0-9]+]].d, x[[IDXREG]]
; CHECK: cmpeq p[[PRED:[0-9]+]].d, p{{[0-7]}}/z, z[[CMPVEC]].d, z[[IDXVEC]].d
; CHECK: mov w[[VALREG:.*]], #30
; CHECK: mov z0.d, p[[PRED]]/m, x[[VALREG]]
  %b = insertelement <vscale x 2 x i64> %a, i64 30, i32 4
  ret <vscale x 2 x i64> %b
}

; Undefined lane insert
define <vscale x 8 x half> @test_lane9_8xf16(<vscale x 8 x half> %a) {
; CHECK-LABEL: test_lane9_8xf16
; CHECK: mov w[[IDXREG:.*]], #9
; CHECK: index z[[CMPVEC:[0-9]+]].h, #0, #1
; CHECK: mov z[[IDXVEC:[0-9]+]].h, w[[IDXREG]]
; CHECK: cmpeq p[[PRED:[0-9]+]].h, p{{[0-7]}}/z, z[[CMPVEC]].h, z[[IDXVEC]].h
; CHECK: fmov h[[VALREG:[0-9]+]], #1.00000000
; CHECK: mov z0.h, p[[PRED]]/m, h[[VALREG]]
  %b = insertelement <vscale x 8 x half> %a, half 1.0, i32 9
  ret <vscale x 8 x half> %b
}

define <vscale x 16 x i8> @test_lane1_16xi8(<vscale x 16 x i8> %a) {
; CHECK-LABEL: test_lane1_16xi8
; CHECK: mov w[[IDXREG:.*]], #1
; CHECK: index z[[CMPVEC:[0-9]+]].b, #0, #1
; CHECK: mov z[[IDXVEC:[0-9]+]].b, w[[IDXREG]]
; CHECK: cmpeq p[[PRED:[0-9]+]].b, p{{[0-7]}}/z, z[[CMPVEC]].b, z[[IDXVEC]].b
; CHECK: mov w[[VALREG:.*]], #30
; CHECK: mov z0.b, p[[PRED]]/m, w[[VALREG]]
  %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 1
  ret <vscale x 16 x i8> %b
}

define <vscale x 16 x i8> @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {
; CHECK-LABEL: test_lanex_16xi8
; CHECK: index z[[CMPVEC:[0-9]+]].b, #0, #1
; CHECK: mov z[[IDXVEC:[0-9]+]].b, w{{[0-9]+}}
; CHECK: cmpeq p[[PRED:[0-9]+]].b, p{{[0-7]}}/z, z[[CMPVEC]].b, z[[IDXVEC]].b
; CHECK: mov w[[VALREG:.*]], #30
; CHECK: mov z0.b, p[[PRED]]/m, w[[VALREG]]
  %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 %x
  ret <vscale x 16 x i8> %b
}

; Redundant lane insert
define <vscale x 4 x i32> @extract_insert_4xi32(<vscale x 4 x i32> %a) {
; CHECK-LABEL: extract_insert_4xi32
; CHECK-NOT: mov w{{.*}}, #30
; CHECK-NOT: mov z0.d
  %b = extractelement <vscale x 4 x i32> %a, i32 2
  %c = insertelement <vscale x 4 x i32> %a, i32 %b, i32 2
  ret <vscale x 4 x i32> %c
}

define <vscale x 8 x i16> @test_lane6_undef_8xi16(i16 %a) {
; CHECK-LABEL: test_lane6_undef_8xi16
; CHECK: mov w[[IDXREG:.*]], #6
; CHECK: index z[[CMPVEC:.*]].h, #0, #1
; CHECK: mov z[[IDXVEC:[0-9]+]].h, w[[IDXREG]]
; CHECK: cmpeq p[[PRED:.*]].h, p{{.*}}/z, z[[CMPVEC]].h, z[[IDXVEC]].h
; CHECK: mov z0.h, p[[PRED]]/m, w0
  %b = insertelement <vscale x 8 x i16> undef, i16 %a, i32 6
  ret <vscale x 8 x i16> %b
}

define <vscale x 16 x i8> @test_lane0_undef_16xi8(i8 %a) {
; CHECK-LABEL: test_lane0_undef_16xi8
; CHECK: fmov s0, w0
  %b = insertelement <vscale x 16 x i8> undef, i8 %a, i32 0
  ret <vscale x 16 x i8> %b
}