[RISCV] Begin to support more subvector inserts/extracts

This patch adds support for INSERT_SUBVECTOR and EXTRACT_SUBVECTOR
(nominally where both operands are scalable vector types) where the
vector, subvector, and index align sufficiently to allow decomposition
to subregister manipulation:

* For extracts, the extracted subvector must correctly align with the
lower elements of a vector register.
* For inserts, the inserted subvector must be at least one full vector
register, and correctly align as above.

This approach should work for fixed-length vector insertion/extraction
too, but that will come later.
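
As a rough standalone illustration of these alignment rules (a sketch, not
part of the patch): the element counts below are LLVM's known-minimum counts
for i32 scalable types, where nxv2i32 fills exactly one vector register.

// Illustration only; mirrors the alignment rules described above.
#include <algorithm>
#include <cstdio>

// Returns the whole-register offset within the LMUL group if the extract
// aligns to a vector register boundary, or -1 if it would instead need
// vector instructions (the cases this patch still bails on).
static int extractAsSubRegOffset(unsigned SubVecMinElts, unsigned Idx) {
  const unsigned MinEltsPerVReg = 2; // one VR holds a minimum of 2 x i32
  // A subvector that is itself a register group must start on a boundary of
  // its own size; smaller subvectors only need a whole-register boundary.
  unsigned AlignElts = std::max(SubVecMinElts, MinEltsPerVReg);
  if (Idx % AlignElts != 0)
    return -1;
  return static_cast<int>(Idx / MinEltsPerVReg);
}

int main() {
  // nxv2i32 from nxv8i32 at element 6: register 3 of the group (v8 -> v11).
  std::printf("%d\n", extractAsSubRegOffset(2, 6));
  // nxv1i32 from nxv16i32 at element 1: misaligned, not yet supported.
  std::printf("%d\n", extractAsSubRegOffset(1, 1));
  return 0;
}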

Reviewed By: craig.topper, khchen, arcbbb

Differential Revision: https://reviews.llvm.org/D96873
Fraser Cormack 2021-02-17 15:57:59 +00:00
parent 0176fecfbc
commit d876214990
3 changed files with 588 additions and 43 deletions


@@ -382,6 +382,48 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, unsigned IntNo,
   ReplaceNode(Node, Store);
 }
 
+static unsigned getRegClassIDForVecVT(MVT VT) {
+  if (VT.getVectorElementType() == MVT::i1)
+    return RISCV::VRRegClassID;
+  return getRegClassIDForLMUL(getLMUL(VT));
+}
+
+// Attempt to decompose a subvector insert/extract between VecVT and
+// SubVecVT via subregister indices. Returns the subregister index that
+// can perform the subvector insert/extract with the given element index, as
+// well as the index corresponding to any leftover subvectors that must be
+// further inserted/extracted within the register class for SubVecVT.
+static std::pair<unsigned, unsigned>
+decomposeSubvectorInsertExtractToSubRegs(MVT VecVT, MVT SubVecVT,
+                                         unsigned InsertExtractIdx,
+                                         const RISCVRegisterInfo *TRI) {
+  static_assert((RISCV::VRM8RegClassID > RISCV::VRM4RegClassID &&
+                 RISCV::VRM4RegClassID > RISCV::VRM2RegClassID &&
+                 RISCV::VRM2RegClassID > RISCV::VRRegClassID),
+                "Register classes not ordered");
+  unsigned VecRegClassID = getRegClassIDForVecVT(VecVT);
+  unsigned SubRegClassID = getRegClassIDForVecVT(SubVecVT);
+  // Try to compose a subregister index that takes us from the incoming
+  // LMUL>1 register class down to the outgoing one. At each step we half
+  // the LMUL:
+  //   nxv16i32@12 -> nxv2i32: sub_vrm4_1_then_sub_vrm2_1_then_sub_vrm1_0
+  // Note that this is not guaranteed to find a subregister index, such as
+  // when we are extracting from one VR type to another.
+  unsigned SubRegIdx = RISCV::NoSubRegister;
+  for (const unsigned RCID :
+       {RISCV::VRM4RegClassID, RISCV::VRM2RegClassID, RISCV::VRRegClassID})
+    if (VecRegClassID > RCID && SubRegClassID <= RCID) {
+      VecVT = VecVT.getHalfNumVectorElementsVT();
+      bool IsHi =
+          InsertExtractIdx >= VecVT.getVectorElementCount().getKnownMinValue();
+      SubRegIdx = TRI->composeSubRegIndices(SubRegIdx,
+                                            getSubregIndexByMVT(VecVT, IsHi));
+      if (IsHi)
+        InsertExtractIdx -= VecVT.getVectorElementCount().getKnownMinValue();
+    }
+  return {SubRegIdx, InsertExtractIdx};
+}
+
 void RISCVDAGToDAGISel::Select(SDNode *Node) {
   // If we have a custom node, we have already selected.
   if (Node->isMachineOpcode()) {
@@ -704,56 +746,127 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     break;
   }
   case ISD::INSERT_SUBVECTOR: {
-    // Bail when not a "cast" like insert_subvector.
-    if (Node->getConstantOperandVal(2) != 0)
-      break;
-    if (!Node->getOperand(0).isUndef())
-      break;
+    SDValue V = Node->getOperand(0);
+    SDValue SubV = Node->getOperand(1);
+    SDLoc DL(SubV);
+    auto Idx = Node->getConstantOperandVal(2);
+    MVT SubVecVT = Node->getOperand(1).getSimpleValueType();
 
-    // Bail when normal isel should do the job.
-    MVT InVT = Node->getOperand(1).getSimpleValueType();
-    if (VT.isFixedLengthVector() || InVT.isScalableVector())
-      break;
+    // TODO: This method of selecting INSERT_SUBVECTOR should work
+    // with any type of insertion (fixed <-> scalable) but we don't yet
+    // correctly identify the canonical register class for fixed-length types.
+    // For now, keep the two paths separate.
+    if (VT.isScalableVector() && SubVecVT.isScalableVector()) {
+      bool IsFullVecReg = false;
+      switch (getLMUL(SubVecVT)) {
+      default:
+        break;
+      case RISCVVLMUL::LMUL_1:
+      case RISCVVLMUL::LMUL_2:
+      case RISCVVLMUL::LMUL_4:
+      case RISCVVLMUL::LMUL_8:
+        IsFullVecReg = true;
+        break;
+      }
 
-    unsigned RegClassID;
-    if (VT.getVectorElementType() == MVT::i1)
-      RegClassID = RISCV::VRRegClassID;
-    else
-      RegClassID = getRegClassIDForLMUL(getLMUL(VT));
+      // If the subvector doesn't occupy a full vector register then we can't
+      // insert it purely using subregister manipulation. We must not clobber
+      // the untouched elements (say, in the upper half of the VR register).
+      if (!IsFullVecReg)
+        break;
 
-    SDValue V = Node->getOperand(1);
-    SDLoc DL(V);
-    SDValue RC =
-        CurDAG->getTargetConstant(RegClassID, DL, Subtarget->getXLenVT());
-    SDNode *NewNode =
-        CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
-    ReplaceNode(Node, NewNode);
-    return;
+      const auto *TRI = Subtarget->getRegisterInfo();
+      unsigned SubRegIdx;
+      std::tie(SubRegIdx, Idx) =
+          decomposeSubvectorInsertExtractToSubRegs(VT, SubVecVT, Idx, TRI);
+
+      // If the Idx hasn't been completely eliminated then this is a subvector
+      // insert which doesn't naturally align to a vector register. These must
+      // be handled using instructions to manipulate the vector registers.
+      if (Idx != 0)
+        break;
+
+      SDNode *NewNode = CurDAG->getMachineNode(
+          TargetOpcode::INSERT_SUBREG, DL, VT, V, SubV,
+          CurDAG->getTargetConstant(SubRegIdx, DL, Subtarget->getXLenVT()));
+      return ReplaceNode(Node, NewNode);
+    }
+
+    if (VT.isScalableVector() && SubVecVT.isFixedLengthVector()) {
+      // Bail when not a "cast" like insert_subvector.
+      if (Idx != 0)
+        break;
+      if (!Node->getOperand(0).isUndef())
+        break;
+
+      unsigned RegClassID = getRegClassIDForVecVT(VT);
+
+      SDValue RC =
+          CurDAG->getTargetConstant(RegClassID, DL, Subtarget->getXLenVT());
+      SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                               DL, VT, SubV, RC);
+      ReplaceNode(Node, NewNode);
+      return;
+    }
+    break;
   }
   case ISD::EXTRACT_SUBVECTOR: {
-    // Bail when not a "cast" like extract_subvector.
-    if (Node->getConstantOperandVal(1) != 0)
-      break;
-
-    // Bail when normal isel can do the job.
-    MVT InVT = Node->getOperand(0).getSimpleValueType();
-    if (VT.isScalableVector() || InVT.isFixedLengthVector())
-      break;
-
-    unsigned RegClassID;
-    if (InVT.getVectorElementType() == MVT::i1)
-      RegClassID = RISCV::VRRegClassID;
-    else
-      RegClassID = getRegClassIDForLMUL(getLMUL(InVT));
-
     SDValue V = Node->getOperand(0);
+    auto Idx = Node->getConstantOperandVal(1);
+    MVT InVT = Node->getOperand(0).getSimpleValueType();
     SDLoc DL(V);
-    SDValue RC =
-        CurDAG->getTargetConstant(RegClassID, DL, Subtarget->getXLenVT());
-    SDNode *NewNode =
-        CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
-    ReplaceNode(Node, NewNode);
-    return;
+
+    // TODO: This method of selecting EXTRACT_SUBVECTOR should work
+    // with any type of extraction (fixed <-> scalable) but we don't yet
+    // correctly identify the canonical register class for fixed-length types.
+    // For now, keep the two paths separate.
+    if (VT.isScalableVector() && InVT.isScalableVector()) {
+      const auto *TRI = Subtarget->getRegisterInfo();
+      unsigned SubRegIdx;
+      std::tie(SubRegIdx, Idx) =
+          decomposeSubvectorInsertExtractToSubRegs(InVT, VT, Idx, TRI);
+
+      // If the Idx hasn't been completely eliminated then this is a subvector
+      // extract which doesn't naturally align to a vector register. These must
+      // be handled using instructions to manipulate the vector registers.
+      if (Idx != 0)
+        break;
+
+      // If we haven't set a SubRegIdx, then we must be going between LMUL<=1
+      // types (VR -> VR). This can be done as a copy.
+      if (SubRegIdx == RISCV::NoSubRegister) {
+        unsigned RegClassID = getRegClassIDForVecVT(VT);
+        unsigned InRegClassID = getRegClassIDForVecVT(InVT);
+        assert(RegClassID == InRegClassID &&
+               RegClassID == RISCV::VRRegClassID &&
+               "Unexpected subvector extraction");
+        SDValue RC =
+            CurDAG->getTargetConstant(InRegClassID, DL, Subtarget->getXLenVT());
+        SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                                 DL, VT, V, RC);
+        return ReplaceNode(Node, NewNode);
+      }
+
+      SDNode *NewNode = CurDAG->getMachineNode(
+          TargetOpcode::EXTRACT_SUBREG, DL, VT, V,
+          CurDAG->getTargetConstant(SubRegIdx, DL, Subtarget->getXLenVT()));
+      return ReplaceNode(Node, NewNode);
+    }
+
+    if (VT.isFixedLengthVector() && InVT.isScalableVector()) {
+      // Bail when not a "cast" like extract_subvector.
+      if (Idx != 0)
+        break;
+
+      unsigned InRegClassID = getRegClassIDForVecVT(InVT);
+
+      SDValue RC =
+          CurDAG->getTargetConstant(InRegClassID, DL, Subtarget->getXLenVT());
+      SDNode *NewNode =
+          CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
+      ReplaceNode(Node, NewNode);
+      return;
+    }
+    break;
   }
   }
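
For reference, a minimal standalone model of the decomposition performed by
decomposeSubvectorInsertExtractToSubRegs above (a sketch, not the LLVM code:
plain integers and a string stand in for the MVTs and the composed sub_vrm*
subregister index, and the real helper never descends below whole-register
granularity):

#include <cstdio>
#include <string>

int main() {
  // Known-minimum element counts for i32 types: nxv16i32 (LMUL=8) down to
  // nxv2i32 (one whole vector register).
  unsigned VecMinElts = 16; // containing type: nxv16i32
  unsigned SubMinElts = 2;  // subvector type:  nxv2i32
  unsigned Idx = 12;        // element index of the insert/extract

  std::string Path; // stands in for the composed subregister index
  // Halve the containing type until it reaches the subvector's size; at each
  // step descend into the low or high half and adjust the leftover index.
  while (VecMinElts > SubMinElts) {
    VecMinElts /= 2;
    bool IsHi = Idx >= VecMinElts;
    if (!Path.empty())
      Path += "_then_";
    Path += IsHi ? "hi" : "lo";
    if (IsHi)
      Idx -= VecMinElts;
  }

  // Prints "hi_then_hi_then_lo leftover=0", mirroring the comment's
  // nxv16i32@12 -> nxv2i32: sub_vrm4_1_then_sub_vrm2_1_then_sub_vrm1_0.
  // A non-zero leftover is the misaligned case the patch still bails on.
  std::printf("%s leftover=%u\n", Path.c_str(), Idx);
  return 0;
}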


@@ -0,0 +1,226 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s
define <vscale x 4 x i32> @extract_nxv8i32_nxv4i32_0(<vscale x 8 x i32> %vec) {
; CHECK-LABEL: extract_nxv8i32_nxv4i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m4
; CHECK-NEXT: ret
%c = call <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %vec, i64 0)
ret <vscale x 4 x i32> %c
}
define <vscale x 4 x i32> @extract_nxv8i32_nxv4i32_4(<vscale x 8 x i32> %vec) {
; CHECK-LABEL: extract_nxv8i32_nxv4i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
%c = call <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %vec, i64 4)
ret <vscale x 4 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv8i32_nxv2i32_0(<vscale x 8 x i32> %vec) {
; CHECK-LABEL: extract_nxv8i32_nxv2i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8m4
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, i64 0)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv8i32_nxv2i32_2(<vscale x 8 x i32> %vec) {
; CHECK-LABEL: extract_nxv8i32_nxv2i32_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, i64 2)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv8i32_nxv2i32_4(<vscale x 8 x i32> %vec) {
; CHECK-LABEL: extract_nxv8i32_nxv2i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, i64 4)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv8i32_nxv2i32_6(<vscale x 8 x i32> %vec) {
; CHECK-LABEL: extract_nxv8i32_nxv2i32_6:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v11
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, i64 6)
ret <vscale x 2 x i32> %c
}
define <vscale x 8 x i32> @extract_nxv16i32_nxv8i32_0(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv8i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $v8m4 killed $v8m4 killed $v8m8
; CHECK-NEXT: ret
%c = call <vscale x 8 x i32> @llvm.experimental.vector.extract.nxv8i32.nxv16i32(<vscale x 16 x i32> %vec, i64 0)
ret <vscale x 8 x i32> %c
}
define <vscale x 8 x i32> @extract_nxv16i32_nxv8i32_8(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv8i32_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv4r.v v8, v12
; CHECK-NEXT: ret
%c = call <vscale x 8 x i32> @llvm.experimental.vector.extract.nxv8i32.nxv16i32(<vscale x 16 x i32> %vec, i64 8)
ret <vscale x 8 x i32> %c
}
define <vscale x 4 x i32> @extract_nxv16i32_nxv4i32_0(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv4i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m8
; CHECK-NEXT: ret
%c = call <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, i64 0)
ret <vscale x 4 x i32> %c
}
define <vscale x 4 x i32> @extract_nxv16i32_nxv4i32_4(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv4i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
%c = call <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, i64 4)
ret <vscale x 4 x i32> %c
}
define <vscale x 4 x i32> @extract_nxv16i32_nxv4i32_8(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv4i32_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: ret
%c = call <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, i64 8)
ret <vscale x 4 x i32> %c
}
define <vscale x 4 x i32> @extract_nxv16i32_nxv4i32_12(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv4i32_12:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v8, v14
; CHECK-NEXT: ret
%c = call <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, i64 12)
ret <vscale x 4 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_0(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8m8
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 0)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_2(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 2)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_4(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 4)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_6(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_6:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v11
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 6)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_8(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v12
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 8)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_10(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_10:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v13
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 10)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_12(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_12:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v14
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 12)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_14(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_14:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v15
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 14)
ret <vscale x 2 x i32> %c
}
define <vscale x 1 x i32> @extract_nxv16i32_nxv1i32_0(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv1i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8m8
; CHECK-NEXT: ret
%c = call <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, i64 0)
ret <vscale x 1 x i32> %c
}
; TODO: Extracts that don't align to a vector register are not yet supported.
; In this case we want to extract the upper half of the lowest VR subregister
; in the LMUL group.
; define <vscale x 1 x i32> @extract_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec) {
; %c = call <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, i64 1)
; ret <vscale x 1 x i32> %c
; }
define <vscale x 1 x i32> @extract_nxv16i32_nxv1i32_2(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv1i32_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%c = call <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, i64 2)
ret <vscale x 1 x i32> %c
}
define <vscale x 1 x i32> @extract_nxv2i32_nxv1i32_0(<vscale x 2 x i32> %vec) {
; CHECK-LABEL: extract_nxv2i32_nxv1i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
%c = call <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv2i32(<vscale x 2 x i32> %vec, i64 0)
ret <vscale x 1 x i32> %c
}
declare <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv2i32(<vscale x 2 x i32> %vec, i64 %idx)
declare <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, i64 %idx)
declare <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %vec, i64 %idx)
declare <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, i64 %idx)
declare <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 %idx)
declare <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, i64 %idx)
declare <vscale x 8 x i32> @llvm.experimental.vector.extract.nxv8i32.nxv16i32(<vscale x 16 x i32> %vec, i64 %idx)


@@ -0,0 +1,206 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s
define <vscale x 8 x i32> @insert_nxv8i32_nxv4i32_0(<vscale x 8 x i32> %vec, <vscale x 4 x i32> %subvec) {
; CHECK-LABEL: insert_nxv8i32_nxv4i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: ret
%v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv8i32(<vscale x 8 x i32> %vec, <vscale x 4 x i32> %subvec, i64 0)
ret <vscale x 8 x i32> %v
}
define <vscale x 8 x i32> @insert_nxv8i32_nxv4i32_4(<vscale x 8 x i32> %vec, <vscale x 4 x i32> %subvec) {
; CHECK-LABEL: insert_nxv8i32_nxv4i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v10, v12
; CHECK-NEXT: ret
%v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv8i32(<vscale x 8 x i32> %vec, <vscale x 4 x i32> %subvec, i64 4)
ret <vscale x 8 x i32> %v
}
define <vscale x 8 x i32> @insert_nxv8i32_nxv2i32_0(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv8i32_nxv2i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v12
; CHECK-NEXT: ret
%v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec, i64 0)
ret <vscale x 8 x i32> %v
}
define <vscale x 8 x i32> @insert_nxv8i32_nxv2i32_2(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv8i32_nxv2i32_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v9, v12
; CHECK-NEXT: ret
%v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec, i64 2)
ret <vscale x 8 x i32> %v
}
define <vscale x 8 x i32> @insert_nxv8i32_nxv2i32_4(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv8i32_nxv2i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v10, v12
; CHECK-NEXT: ret
%v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec, i64 4)
ret <vscale x 8 x i32> %v
}
define <vscale x 8 x i32> @insert_nxv8i32_nxv2i32_6(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv8i32_nxv2i32_6:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v11, v12
; CHECK-NEXT: ret
%v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec, i64 6)
ret <vscale x 8 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv8i32_0(<vscale x 16 x i32> %vec, <vscale x 8 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv8i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv4r.v v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 8 x i32> %subvec, i64 0)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv8i32_8(<vscale x 16 x i32> %vec, <vscale x 8 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv8i32_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv4r.v v12, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 8 x i32> %subvec, i64 8)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv4i32_0(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv4i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec, i64 0)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv4i32_4(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv4i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v10, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec, i64 4)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv4i32_8(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv4i32_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v12, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec, i64 8)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv4i32_12(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv4i32_12:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v14, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec, i64 12)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_0(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 0)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_2(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v9, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 2)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_4(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v10, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 4)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_6(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_6:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v11, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 6)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_8(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v12, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 8)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_10(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_10:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v13, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 10)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_12(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_12:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v14, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 12)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_14(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_14:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v15, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 14)
ret <vscale x 16 x i32> %v
}
; TODO: Inserts that are less than LMUL=1 are not yet supported. In this case
; we need to mask out the unaffected elements (top half of the VR %subvec
; register)
;define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_0(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
; %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 0)
; ret <vscale x 16 x i32> %v
;}
; TODO: Inserts that don't align to a vector register are not yet supported.
; In this case we want to insert the subvector into the upper half of the
; lowest VR subregister in the LMUL group.
;define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
; %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
; ret <vscale x 16 x i32> %v
;}
declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32>, <vscale x 2 x i32>, i64 %idx)
declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv8i32(<vscale x 8 x i32>, <vscale x 4 x i32>, i64 %idx)
declare <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32>, <vscale x 1 x i32>, i64 %idx)
declare <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32>, <vscale x 2 x i32>, i64 %idx)
declare <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv16i32(<vscale x 16 x i32>, <vscale x 4 x i32>, i64 %idx)
declare <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv16i32(<vscale x 16 x i32>, <vscale x 8 x i32>, i64 %idx)