[SelectionDAG] Add llvm.vector.{extract,insert} intrinsics
This commit adds two new intrinsics.

- llvm.experimental.vector.insert: used to insert a vector into another
  vector starting at a given index.

- llvm.experimental.vector.extract: used to extract a subvector from a
  larger vector starting from a given index.

The codegen work for these intrinsics has already been completed; this
commit is simply exposing the existing ISD nodes to LLVM IR.

Reviewed By: cameron.mcinally

Differential Revision: https://reviews.llvm.org/D91362
commit 80c33de2d3 (parent 4167a0259e)
@@ -16095,6 +16095,81 @@ Arguments:
""""""""""

The argument to this intrinsic must be a vector of floating-point values.


'``llvm.experimental.vector.insert``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.experimental.vector.insert``
to insert a fixed-width vector into a scalable vector, but not the other way
around.

::

      declare <vscale x 4 x float> @llvm.experimental.vector.insert.v4f32(<vscale x 4 x float> %vec, <4 x float> %subvec, i64 %idx)
      declare <vscale x 2 x double> @llvm.experimental.vector.insert.v2f64(<vscale x 2 x double> %vec, <2 x double> %subvec, i64 %idx)

Overview:
"""""""""

The '``llvm.experimental.vector.insert.*``' intrinsics insert a vector into
another vector starting from a given index. The return type matches the type
of the vector we insert into. Conceptually, this can be used to build a
scalable vector out of non-scalable vectors.

Arguments:
""""""""""

``vec`` is the vector into which ``subvec`` will be inserted.
``subvec`` is the vector that will be inserted.

``idx`` represents the starting element number at which ``subvec`` will be
inserted. ``idx`` must be a constant multiple of ``subvec``'s known minimum
vector length. If ``subvec`` is a scalable vector, ``idx`` is first scaled by
the runtime scaling factor of ``subvec``. The elements of ``vec`` starting at
``idx`` are overwritten with ``subvec``. Elements ``idx`` through (``idx`` +
num_elements(``subvec``) - 1) must be valid ``vec`` indices. If this condition
cannot be determined statically but is false at runtime, then the result vector
is undefined.
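
For example (a hypothetical illustration, not part of the committed text), the
following call overwrites elements 4 through 7 of a ``<vscale x 4 x i32>``
vector. It is well defined whenever ``vscale >= 2``; with ``vscale == 1`` the
insertion overruns ``%vec`` and the result is undefined:

::

      declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)

      define <vscale x 4 x i32> @insert_at_4(<vscale x 4 x i32> %vec, <4 x i32> %subvec) {
        ; idx = 4 is a constant multiple of subvec's known minimum length (4).
        %r = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 4)
        ret <vscale x 4 x i32> %r
      }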

'``llvm.experimental.vector.extract``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""
This is an overloaded intrinsic. You can use
``llvm.experimental.vector.extract`` to extract a fixed-width vector from a
scalable vector, but not the other way around.

::

      declare <4 x float> @llvm.experimental.vector.extract.v4f32(<vscale x 4 x float> %vec, i64 %idx)
      declare <2 x double> @llvm.experimental.vector.extract.v2f64(<vscale x 2 x double> %vec, i64 %idx)

Overview:
"""""""""

The '``llvm.experimental.vector.extract.*``' intrinsics extract a vector from
within another vector starting from a given index. The return type must be
explicitly specified. Conceptually, this can be used to decompose a scalable
vector into non-scalable parts.

Arguments:
""""""""""

``vec`` is the vector from which a subvector will be extracted.

``idx`` specifies the starting element number within ``vec`` from which a
subvector is extracted. ``idx`` must be a constant multiple of the known
minimum vector length of the result type. If the result type is a scalable
vector, ``idx`` is first scaled by the result type's runtime scaling factor.
Elements ``idx`` through (``idx`` + num_elements(result_type) - 1) must be
valid vector indices. If this condition cannot be determined statically but is
false at runtime, then the result vector is undefined. The ``idx`` parameter
must be a vector index constant type (for most targets this will be an integer
pointer type).
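
For instance (again a hypothetical illustration), the following call extracts
elements 2 and 3 of a ``<vscale x 2 x i64>`` vector and is well defined
whenever ``vscale >= 2``:

::

      declare <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64>, i64)

      define <2 x i64> @extract_at_2(<vscale x 2 x i64> %vec) {
        ; idx = 2 is a constant multiple of the result's known minimum length (2).
        %r = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %vec, i64 2)
        ret <2 x i64> %r
      }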

Matrix Intrinsics
-----------------
@@ -1614,6 +1614,15 @@ def int_preserve_struct_access_index : DefaultAttrsIntrinsic<[llvm_anyptr_ty],

//===---------- Intrinsics to query properties of scalable vectors --------===//
def int_vscale : DefaultAttrsIntrinsic<[llvm_anyint_ty], [], [IntrNoMem]>;

//===---------- Intrinsics to perform subvector insertion/extraction ------===//
def int_experimental_vector_insert : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                                                           [LLVMMatchType<0>, llvm_anyvector_ty, llvm_i64_ty],
                                                           [IntrNoMem, ImmArg<ArgIndex<2>>]>;

def int_experimental_vector_extract : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                                                            [llvm_anyvector_ty, llvm_i64_ty],
                                                            [IntrNoMem, ImmArg<ArgIndex<1>>]>;

//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
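Both definitions are overloaded (``llvm_anyvector_ty``), so concrete uses carry
type-mangled names; the tests added below spell them with both the result and
argument vector types, for example:

    declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
    declare <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32>, i64)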
@@ -6932,6 +6932,27 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
                              SetCC));
    return;
  }
  case Intrinsic::experimental_vector_insert: {
    auto DL = getCurSDLoc();

    SDValue Vec = getValue(I.getOperand(0));
    SDValue SubVec = getValue(I.getOperand(1));
    SDValue Index = getValue(I.getOperand(2));
    EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
    setValue(&I, DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ResultVT, Vec, SubVec,
                             Index));
    return;
  }
  case Intrinsic::experimental_vector_extract: {
    auto DL = getCurSDLoc();

    SDValue Vec = getValue(I.getOperand(0));
    SDValue Index = getValue(I.getOperand(1));
    EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType());

    setValue(&I, DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResultVT, Vec, Index));
    return;
  }
  }
}
@@ -5138,6 +5138,26 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {

    break;
  }
  case Intrinsic::experimental_vector_insert: {
    VectorType *VecTy = cast<VectorType>(Call.getArgOperand(0)->getType());
    VectorType *SubVecTy = cast<VectorType>(Call.getArgOperand(1)->getType());

    Assert(VecTy->getElementType() == SubVecTy->getElementType(),
           "experimental_vector_insert parameters must have the same element "
           "type.",
           &Call);
    break;
  }
  case Intrinsic::experimental_vector_extract: {
    VectorType *ResultTy = cast<VectorType>(Call.getType());
    VectorType *VecTy = cast<VectorType>(Call.getArgOperand(0)->getType());

    Assert(ResultTy->getElementType() == VecTy->getElementType(),
           "experimental_vector_extract result must have the same element "
           "type as the input vector.",
           &Call);
    break;
  }
  };
}
@@ -1652,6 +1652,102 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
    }
    break;
  }
  case Intrinsic::experimental_vector_insert: {
    Value *Vec = II->getArgOperand(0);
    Value *SubVec = II->getArgOperand(1);
    Value *Idx = II->getArgOperand(2);
    auto *DstTy = dyn_cast<FixedVectorType>(II->getType());
    auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
    auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());

    // Only canonicalize if the destination vector, Vec, and SubVec are all
    // fixed vectors.
    if (DstTy && VecTy && SubVecTy) {
      unsigned DstNumElts = DstTy->getNumElements();
      unsigned VecNumElts = VecTy->getNumElements();
      unsigned SubVecNumElts = SubVecTy->getNumElements();
      unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();

      // The result of this call is undefined if IdxN is not a constant
      // multiple of the SubVec's minimum vector length OR the insertion
      // overruns Vec.
      if (IdxN % SubVecNumElts != 0 || IdxN + SubVecNumElts > VecNumElts) {
        replaceInstUsesWith(CI, UndefValue::get(CI.getType()));
        return eraseInstFromFunction(CI);
      }

      // An insert that entirely overwrites Vec with SubVec is a nop.
      if (VecNumElts == SubVecNumElts) {
        replaceInstUsesWith(CI, SubVec);
        return eraseInstFromFunction(CI);
      }

      // Widen SubVec into a vector of the same width as Vec, since
      // shufflevector requires the two input vectors to be the same width.
      // Elements beyond the bounds of SubVec within the widened vector are
      // undefined.
      SmallVector<int, 8> WidenMask;
      unsigned i;
      for (i = 0; i != SubVecNumElts; ++i)
        WidenMask.push_back(i);
      for (; i != VecNumElts; ++i)
        WidenMask.push_back(UndefMaskElem);

      Value *WidenShuffle = Builder.CreateShuffleVector(
          SubVec, llvm::UndefValue::get(SubVecTy), WidenMask);

      SmallVector<int, 8> Mask;
      for (unsigned i = 0; i != IdxN; ++i)
        Mask.push_back(i);
      for (unsigned i = DstNumElts; i != DstNumElts + SubVecNumElts; ++i)
        Mask.push_back(i);
      for (unsigned i = IdxN + SubVecNumElts; i != DstNumElts; ++i)
        Mask.push_back(i);

      Value *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
      replaceInstUsesWith(CI, Shuffle);
      return eraseInstFromFunction(CI);
    }
    break;
  }
  case Intrinsic::experimental_vector_extract: {
    Value *Vec = II->getArgOperand(0);
    Value *Idx = II->getArgOperand(1);

    auto *DstTy = dyn_cast<FixedVectorType>(II->getType());
    auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());

    // Only canonicalize if the destination vector and Vec are fixed
    // vectors.
    if (DstTy && VecTy) {
      unsigned DstNumElts = DstTy->getNumElements();
      unsigned VecNumElts = VecTy->getNumElements();
      unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();

      // The result of this call is undefined if IdxN is not a constant
      // multiple of the result type's minimum vector length OR the extraction
      // overruns Vec.
      if (IdxN % DstNumElts != 0 || IdxN + DstNumElts > VecNumElts) {
        replaceInstUsesWith(CI, UndefValue::get(CI.getType()));
        return eraseInstFromFunction(CI);
      }

      // Extracting the entirety of Vec is a nop.
      if (VecNumElts == DstNumElts) {
        replaceInstUsesWith(CI, Vec);
        return eraseInstFromFunction(CI);
      }

      SmallVector<int, 8> Mask;
      for (unsigned i = 0; i != DstNumElts; ++i)
        Mask.push_back(IdxN + i);

      Value *Shuffle =
          Builder.CreateShuffleVector(Vec, UndefValue::get(VecTy), Mask);
      replaceInstUsesWith(CI, Shuffle);
      return eraseInstFromFunction(CI);
    }
    break;
  }
  default: {
    // Handle target specific intrinsics
    Optional<Instruction *> V = targetInstCombineIntrinsic(*II);
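In the fixed-width case, the canonicalization above rewrites an insert as a
widening shuffle followed by a blending shuffle. Sketched in LLVM IR for an
insert of <2 x i32> into <8 x i32> at index 2 (the same pattern the
InstCombine tests below check):

    ; before
    %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2)

    ; after: widen %subvec to 8 lanes, then blend lanes 2..3 from it
    %widen = shufflevector <2 x i32> %subvec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %1 = shufflevector <8 x i32> %vec, <8 x i32> %widen, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>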
@@ -0,0 +1,138 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s --check-prefixes=CHECK
; RUN: FileCheck --check-prefix=WARN --allow-empty %s < %t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

; Should codegen to a nop, since idx is zero.
define <2 x i64> @extract_v2i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind {
; CHECK-LABEL: extract_v2i64_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %vec, i64 0)
  ret <2 x i64> %retval
}

; Goes through memory currently; idx != 0.
define <2 x i64> @extract_v2i64_nxv2i64_idx1(<vscale x 2 x i64> %vec) nounwind {
; CHECK-LABEL: extract_v2i64_nxv2i64_idx1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cntd x8
; CHECK-NEXT:    sub x8, x8, #1 // =1
; CHECK-NEXT:    cmp x8, #1 // =1
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    csinc x8, x8, xzr, lo
; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
; CHECK-NEXT:    lsl x8, x8, #3
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    ldr q0, [x9, x8]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %vec, i64 1)
  ret <2 x i64> %retval
}

; Should codegen to a nop, since idx is zero.
define <4 x i32> @extract_v4i32_nxv4i32(<vscale x 4 x i32> %vec) nounwind {
; CHECK-LABEL: extract_v4i32_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %retval = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
  ret <4 x i32> %retval
}

; Goes through memory currently; idx != 0.
define <4 x i32> @extract_v4i32_nxv4i32_idx1(<vscale x 4 x i32> %vec) nounwind {
; CHECK-LABEL: extract_v4i32_nxv4i32_idx1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cntw x8
; CHECK-NEXT:    sub x8, x8, #1 // =1
; CHECK-NEXT:    cmp x8, #1 // =1
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    csinc x8, x8, xzr, lo
; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
; CHECK-NEXT:    lsl x8, x8, #2
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    ldr q0, [x9, x8]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> %vec, i64 1)
  ret <4 x i32> %retval
}

; Should codegen to a nop, since idx is zero.
define <8 x i16> @extract_v8i16_nxv8i16(<vscale x 8 x i16> %vec) nounwind {
; CHECK-LABEL: extract_v8i16_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %retval = call <8 x i16> @llvm.experimental.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> %vec, i64 0)
  ret <8 x i16> %retval
}

; Goes through memory currently; idx != 0.
define <8 x i16> @extract_v8i16_nxv8i16_idx1(<vscale x 8 x i16> %vec) nounwind {
; CHECK-LABEL: extract_v8i16_nxv8i16_idx1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cnth x8
; CHECK-NEXT:    sub x8, x8, #1 // =1
; CHECK-NEXT:    cmp x8, #1 // =1
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    csinc x8, x8, xzr, lo
; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
; CHECK-NEXT:    lsl x8, x8, #1
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    ldr q0, [x9, x8]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <8 x i16> @llvm.experimental.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> %vec, i64 1)
  ret <8 x i16> %retval
}

; Should codegen to a nop, since idx is zero.
define <16 x i8> @extract_v16i8_nxv16i8(<vscale x 16 x i8> %vec) nounwind {
; CHECK-LABEL: extract_v16i8_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %retval = call <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> %vec, i64 0)
  ret <16 x i8> %retval
}

; Goes through memory currently; idx != 0.
define <16 x i8> @extract_v16i8_nxv16i8_idx1(<vscale x 16 x i8> %vec) nounwind {
; CHECK-LABEL: extract_v16i8_nxv16i8_idx1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    rdvl x8, #1
; CHECK-NEXT:    sub x8, x8, #1 // =1
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    cmp x8, #1 // =1
; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
; CHECK-NEXT:    csinc x8, x8, xzr, lo
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    ldr q0, [x9, x8]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> %vec, i64 1)
  ret <16 x i8> %retval
}

declare <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64>, i64)
declare <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32>, i64)
declare <8 x i16> @llvm.experimental.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16>, i64)
declare <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8>, i64)
@@ -0,0 +1,184 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s --check-prefixes=CHECK
; RUN: FileCheck --check-prefix=WARN --allow-empty %s < %t
; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

define <vscale x 2 x i64> @insert_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind {
; CHECK-LABEL: insert_v2i64_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cntd x8
; CHECK-NEXT:    sub x8, x8, #1 // =1
; CHECK-NEXT:    cmp x8, #0 // =0
; CHECK-NEXT:    csel x8, x8, xzr, lo
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    lsl x8, x8, #3
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 0)
  ret <vscale x 2 x i64> %retval
}

define <vscale x 2 x i64> @insert_v2i64_nxv2i64_idx1(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind {
; CHECK-LABEL: insert_v2i64_nxv2i64_idx1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cntd x8
; CHECK-NEXT:    sub x8, x8, #1 // =1
; CHECK-NEXT:    cmp x8, #1 // =1
; CHECK-NEXT:    csinc x8, x8, xzr, lo
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    lsl x8, x8, #3
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 1)
  ret <vscale x 2 x i64> %retval
}

define <vscale x 4 x i32> @insert_v4i32_nxv4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec) nounwind {
; CHECK-LABEL: insert_v4i32_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cntw x8
; CHECK-NEXT:    sub x8, x8, #1 // =1
; CHECK-NEXT:    cmp x8, #0 // =0
; CHECK-NEXT:    csel x8, x8, xzr, lo
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    lsl x8, x8, #2
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 0)
  ret <vscale x 4 x i32> %retval
}

define <vscale x 4 x i32> @insert_v4i32_nxv4i32_idx1(<vscale x 4 x i32> %vec, <4 x i32> %subvec) nounwind {
; CHECK-LABEL: insert_v4i32_nxv4i32_idx1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cntw x8
; CHECK-NEXT:    sub x8, x8, #1 // =1
; CHECK-NEXT:    cmp x8, #1 // =1
; CHECK-NEXT:    csinc x8, x8, xzr, lo
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    lsl x8, x8, #2
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 1)
  ret <vscale x 4 x i32> %retval
}

define <vscale x 8 x i16> @insert_v8i16_nxv8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec) nounwind {
; CHECK-LABEL: insert_v8i16_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cnth x8
; CHECK-NEXT:    sub x8, x8, #1 // =1
; CHECK-NEXT:    cmp x8, #0 // =0
; CHECK-NEXT:    csel x8, x8, xzr, lo
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    lsl x8, x8, #1
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec, i64 0)
  ret <vscale x 8 x i16> %retval
}

define <vscale x 8 x i16> @insert_v8i16_nxv8i16_idx1(<vscale x 8 x i16> %vec, <8 x i16> %subvec) nounwind {
; CHECK-LABEL: insert_v8i16_nxv8i16_idx1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cnth x8
; CHECK-NEXT:    sub x8, x8, #1 // =1
; CHECK-NEXT:    cmp x8, #1 // =1
; CHECK-NEXT:    csinc x8, x8, xzr, lo
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    lsl x8, x8, #1
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec, i64 1)
  ret <vscale x 8 x i16> %retval
}

define <vscale x 16 x i8> @insert_v16i8_nxv16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec) nounwind {
; CHECK-LABEL: insert_v16i8_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    rdvl x8, #1
; CHECK-NEXT:    sub x8, x8, #1 // =1
; CHECK-NEXT:    cmp x8, #0 // =0
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    csel x8, x8, xzr, lo
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec, i64 0)
  ret <vscale x 16 x i8> %retval
}

define <vscale x 16 x i8> @insert_v16i8_nxv16i8_idx1(<vscale x 16 x i8> %vec, <16 x i8> %subvec) nounwind {
; CHECK-LABEL: insert_v16i8_nxv16i8_idx1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    rdvl x8, #1
; CHECK-NEXT:    sub x8, x8, #1 // =1
; CHECK-NEXT:    cmp x8, #1 // =1
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    csinc x8, x8, xzr, lo
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec, i64 1)
  ret <vscale x 16 x i8> %retval
}

declare <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
declare <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
declare <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
@@ -0,0 +1,139 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s

; llvm.experimental.vector.extract canonicalizes to shufflevector in the fixed case. In the
; scalable case, we lower to the EXTRACT_SUBVECTOR ISD node.

declare <10 x i32> @llvm.experimental.vector.extract.v10i32.v8i32(<8 x i32> %vec, i64 %idx)
declare <2 x i32> @llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 %idx)
declare <3 x i32> @llvm.experimental.vector.extract.v3i32.v8i32(<8 x i32> %vec, i64 %idx)
declare <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> %vec, i64 %idx)
declare <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32> %vec, i64 %idx)
declare <8 x i32> @llvm.experimental.vector.extract.v8i32.v8i32(<8 x i32> %vec, i64 %idx)

; ============================================================================ ;
; Trivial cases
; ============================================================================ ;

; Extracting the entirety of a vector is a nop.
define <8 x i32> @trivial_nop(<8 x i32> %vec) {
; CHECK-LABEL: @trivial_nop(
; CHECK-NEXT:    ret <8 x i32> [[VEC:%.*]]
;
  %1 = call <8 x i32> @llvm.experimental.vector.extract.v8i32.v8i32(<8 x i32> %vec, i64 0)
  ret <8 x i32> %1
}

; ============================================================================ ;
; Valid canonicalizations
; ============================================================================ ;

define <2 x i32> @valid_extraction_a(<8 x i32> %vec) {
; CHECK-LABEL: @valid_extraction_a(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
;
  %1 = call <2 x i32> @llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 0)
  ret <2 x i32> %1
}

define <2 x i32> @valid_extraction_b(<8 x i32> %vec) {
; CHECK-LABEL: @valid_extraction_b(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
;
  %1 = call <2 x i32> @llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 2)
  ret <2 x i32> %1
}

define <2 x i32> @valid_extraction_c(<8 x i32> %vec) {
; CHECK-LABEL: @valid_extraction_c(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <2 x i32> <i32 4, i32 5>
; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
;
  %1 = call <2 x i32> @llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 4)
  ret <2 x i32> %1
}

define <2 x i32> @valid_extraction_d(<8 x i32> %vec) {
; CHECK-LABEL: @valid_extraction_d(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <2 x i32> <i32 6, i32 7>
; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
;
  %1 = call <2 x i32> @llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 6)
  ret <2 x i32> %1
}

define <4 x i32> @valid_extraction_e(<8 x i32> %vec) {
; CHECK-LABEL: @valid_extraction_e(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
;
  %1 = call <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32> %vec, i64 0)
  ret <4 x i32> %1
}

define <4 x i32> @valid_extraction_f(<8 x i32> %vec) {
; CHECK-LABEL: @valid_extraction_f(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
;
  %1 = call <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32> %vec, i64 4)
  ret <4 x i32> %1
}

define <3 x i32> @valid_extraction_g(<8 x i32> %vec) {
; CHECK-LABEL: @valid_extraction_g(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
; CHECK-NEXT:    ret <3 x i32> [[TMP1]]
;
  %1 = call <3 x i32> @llvm.experimental.vector.extract.v3i32.v8i32(<8 x i32> %vec, i64 0)
  ret <3 x i32> %1
}

define <3 x i32> @valid_extraction_h(<8 x i32> %vec) {
; CHECK-LABEL: @valid_extraction_h(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <3 x i32> <i32 3, i32 4, i32 5>
; CHECK-NEXT:    ret <3 x i32> [[TMP1]]
;
  %1 = call <3 x i32> @llvm.experimental.vector.extract.v3i32.v8i32(<8 x i32> %vec, i64 3)
  ret <3 x i32> %1
}

; ============================================================================ ;
; Invalid canonicalizations
; ============================================================================ ;

; Idx must be a constant multiple of the destination vector's length,
; otherwise the result is undefined.
define <4 x i32> @idx_not_constant_multiple(<8 x i32> %vec) {
; CHECK-LABEL: @idx_not_constant_multiple(
; CHECK-NEXT:    ret <4 x i32> undef
;
  %1 = call <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32> %vec, i64 1)
  ret <4 x i32> %1
}

; If the extraction overruns the vector, the result is undefined.
define <10 x i32> @extract_overrun(<8 x i32> %vec) {
; CHECK-LABEL: @extract_overrun(
; CHECK-NEXT:    ret <10 x i32> undef
;
  %1 = call <10 x i32> @llvm.experimental.vector.extract.v10i32.v8i32(<8 x i32> %vec, i64 0)
  ret <10 x i32> %1
}

; ============================================================================ ;
; Scalable cases
; ============================================================================ ;

; Scalable extractions should not be canonicalized. This will be lowered to the
; EXTRACT_SUBVECTOR ISD node later.
define <4 x i32> @scalable_extract(<vscale x 4 x i32> %vec) {
; CHECK-LABEL: @scalable_extract(
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> [[VEC:%.*]], i64 0)
; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
;
  %1 = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
  ret <4 x i32> %1
}
@@ -0,0 +1,147 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s

; llvm.experimental.vector.insert canonicalizes to shufflevector in the fixed case. In the
; scalable case, we lower to the INSERT_SUBVECTOR ISD node.

declare <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 %idx)
declare <8 x i32> @llvm.experimental.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 %idx)
declare <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 %idx)
declare <8 x i32> @llvm.experimental.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 %idx)
declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 %idx)

; ============================================================================ ;
; Trivial cases
; ============================================================================ ;

; An insert that entirely overwrites an <n x ty> with another <n x ty> is a
; nop.
define <8 x i32> @trivial_nop(<8 x i32> %vec, <8 x i32> %subvec) {
; CHECK-LABEL: @trivial_nop(
; CHECK-NEXT:    ret <8 x i32> [[SUBVEC:%.*]]
;
  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 0)
  ret <8 x i32> %1
}

; ============================================================================ ;
; Valid canonicalizations
; ============================================================================ ;

define <8 x i32> @valid_insertion_a(<8 x i32> %vec, <2 x i32> %subvec) {
; CHECK-LABEL: @valid_insertion_a(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
;
  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 0)
  ret <8 x i32> %1
}

define <8 x i32> @valid_insertion_b(<8 x i32> %vec, <2 x i32> %subvec) {
; CHECK-LABEL: @valid_insertion_b(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
;
  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2)
  ret <8 x i32> %1
}

define <8 x i32> @valid_insertion_c(<8 x i32> %vec, <2 x i32> %subvec) {
; CHECK-LABEL: @valid_insertion_c(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
;
  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 4)
  ret <8 x i32> %1
}

define <8 x i32> @valid_insertion_d(<8 x i32> %vec, <2 x i32> %subvec) {
; CHECK-LABEL: @valid_insertion_d(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
;
  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 6)
  ret <8 x i32> %1
}

define <8 x i32> @valid_insertion_e(<8 x i32> %vec, <4 x i32> %subvec) {
; CHECK-LABEL: @valid_insertion_e(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
;
  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0)
  ret <8 x i32> %1
}

define <8 x i32> @valid_insertion_f(<8 x i32> %vec, <4 x i32> %subvec) {
; CHECK-LABEL: @valid_insertion_f(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
;
  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 4)
  ret <8 x i32> %1
}

define <8 x i32> @valid_insertion_g(<8 x i32> %vec, <3 x i32> %subvec) {
; CHECK-LABEL: @valid_insertion_g(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i32> [[SUBVEC:%.*]], <3 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
;
  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 0)
  ret <8 x i32> %1
}

define <8 x i32> @valid_insertion_h(<8 x i32> %vec, <3 x i32> %subvec) {
; CHECK-LABEL: @valid_insertion_h(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i32> [[SUBVEC:%.*]], <3 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 10, i32 6, i32 7>
; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
;
  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 3)
  ret <8 x i32> %1
}

; ============================================================================ ;
; Invalid canonicalizations
; ============================================================================ ;

; Idx must be a constant multiple of the subvector's minimum vector
; length, otherwise the result is undefined.
define <8 x i32> @idx_not_constant_multiple(<8 x i32> %vec, <4 x i32> %subvec) {
; CHECK-LABEL: @idx_not_constant_multiple(
; CHECK-NEXT:    ret <8 x i32> undef
;
  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 2)
  ret <8 x i32> %1
}

; If the insertion overruns the vector, the result is undefined.
define <8 x i32> @insert_overrun(<8 x i32> %vec, <8 x i32> %subvec) {
; CHECK-LABEL: @insert_overrun(
; CHECK-NEXT:    ret <8 x i32> undef
;
  %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 4)
  ret <8 x i32> %1
}

; ============================================================================ ;
; Scalable cases
; ============================================================================ ;

; Scalable insertions should not be canonicalized. This will be lowered to the
; INSERT_SUBVECTOR ISD node later.
define <vscale x 4 x i32> @scalable_insert(<vscale x 4 x i32> %vec, <4 x i32> %subvec) {
; CHECK-LABEL: @scalable_insert(
; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]], i64 0)
; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
;
  %1 = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 0)
  ret <vscale x 4 x i32> %1
}
@@ -0,0 +1,9 @@
; RUN: not opt -verify -S < %s 2>&1 >/dev/null | FileCheck %s

; CHECK: experimental_vector_extract result must have the same element type as the input vector.
define <16 x i16> @invalid_mismatched_element_types(<vscale x 16 x i8> %vec) nounwind {
  %retval = call <16 x i16> @llvm.experimental.vector.extract.v16i16.nxv16i8(<vscale x 16 x i8> %vec, i64 0)
  ret <16 x i16> %retval
}

declare <16 x i16> @llvm.experimental.vector.extract.v16i16.nxv16i8(<vscale x 16 x i8>, i64)
@@ -0,0 +1,9 @@
; RUN: not opt -verify -S < %s 2>&1 >/dev/null | FileCheck %s

; CHECK: experimental_vector_insert parameters must have the same element type.
define <vscale x 16 x i8> @invalid_mismatched_element_types(<vscale x 16 x i8> %vec, <4 x i16> %subvec) nounwind {
  %retval = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v4i16(<vscale x 16 x i8> %vec, <4 x i16> %subvec, i64 0)
  ret <vscale x 16 x i8> %retval
}

declare <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v4i16(<vscale x 16 x i8>, <4 x i16>, i64)