[RISCV] Begin to support more subvector inserts/extracts

This patch adds support for INSERT_SUBVECTOR and EXTRACT_SUBVECTOR
(nominally where both operands are scalable vector types) where the
vector, subvector, and index align sufficiently to allow decomposition
to subregister manipulation:

* For extracts, the extracted subvector must correctly align with the
lower elements of a vector register.
* For inserts, the inserted subvector must be at least one full vector
register, and correctly align as above.

This approach should work for fixed-length vector insertion/extraction
too, but that will come later.
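
As a rough standalone illustration of these alignment rules (a sketch, not
part of the patch): the element counts below are LLVM's known-minimum counts
for i32 scalable types, where nxv2i32 fills exactly one vector register.

// Illustration only; mirrors the alignment rules described above.
#include <algorithm>
#include <cstdio>

// Returns the whole-register offset within the LMUL group if the extract
// aligns to a vector register boundary, or -1 if it would instead need
// vector instructions (the cases this patch still bails on).
static int extractAsSubRegOffset(unsigned SubVecMinElts, unsigned Idx) {
  const unsigned MinEltsPerVReg = 2; // one VR holds a minimum of 2 x i32
  // A subvector that is itself a register group must start on a boundary of
  // its own size; smaller subvectors only need a whole-register boundary.
  unsigned AlignElts = std::max(SubVecMinElts, MinEltsPerVReg);
  if (Idx % AlignElts != 0)
    return -1;
  return static_cast<int>(Idx / MinEltsPerVReg);
}

int main() {
  // nxv2i32 from nxv8i32 at element 6: register 3 of the group (v8 -> v11).
  std::printf("%d\n", extractAsSubRegOffset(2, 6));
  // nxv1i32 from nxv16i32 at element 1: misaligned, not yet supported.
  std::printf("%d\n", extractAsSubRegOffset(1, 1));
  return 0;
}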

Reviewed By: craig.topper, khchen, arcbbb

Differential Revision: https://reviews.llvm.org/D96873
Fraser Cormack 2021-02-17 15:57:59 +00:00
parent 0176fecfbc
commit d876214990
3 changed files with 588 additions and 43 deletions


@@ -382,6 +382,48 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, unsigned IntNo,
   ReplaceNode(Node, Store);
 }
 
+static unsigned getRegClassIDForVecVT(MVT VT) {
+  if (VT.getVectorElementType() == MVT::i1)
+    return RISCV::VRRegClassID;
+  return getRegClassIDForLMUL(getLMUL(VT));
+}
+
+// Attempt to decompose a subvector insert/extract between VecVT and
+// SubVecVT via subregister indices. Returns the subregister index that
+// can perform the subvector insert/extract with the given element index, as
+// well as the index corresponding to any leftover subvectors that must be
+// further inserted/extracted within the register class for SubVecVT.
+static std::pair<unsigned, unsigned>
+decomposeSubvectorInsertExtractToSubRegs(MVT VecVT, MVT SubVecVT,
+                                         unsigned InsertExtractIdx,
+                                         const RISCVRegisterInfo *TRI) {
+  static_assert((RISCV::VRM8RegClassID > RISCV::VRM4RegClassID &&
+                 RISCV::VRM4RegClassID > RISCV::VRM2RegClassID &&
+                 RISCV::VRM2RegClassID > RISCV::VRRegClassID),
+                "Register classes not ordered");
+  unsigned VecRegClassID = getRegClassIDForVecVT(VecVT);
+  unsigned SubRegClassID = getRegClassIDForVecVT(SubVecVT);
+  // Try to compose a subregister index that takes us from the incoming
+  // LMUL>1 register class down to the outgoing one. At each step we half
+  // the LMUL:
+  //   nxv16i32@12 -> nxv2i32: sub_vrm4_1_then_sub_vrm2_1_then_sub_vrm1_0
+  // Note that this is not guaranteed to find a subregister index, such as
+  // when we are extracting from one VR type to another.
+  unsigned SubRegIdx = RISCV::NoSubRegister;
+  for (const unsigned RCID :
+       {RISCV::VRM4RegClassID, RISCV::VRM2RegClassID, RISCV::VRRegClassID})
+    if (VecRegClassID > RCID && SubRegClassID <= RCID) {
+      VecVT = VecVT.getHalfNumVectorElementsVT();
+      bool IsHi =
+          InsertExtractIdx >= VecVT.getVectorElementCount().getKnownMinValue();
+      SubRegIdx = TRI->composeSubRegIndices(SubRegIdx,
+                                            getSubregIndexByMVT(VecVT, IsHi));
+      if (IsHi)
+        InsertExtractIdx -= VecVT.getVectorElementCount().getKnownMinValue();
+    }
+  return {SubRegIdx, InsertExtractIdx};
+}
+
 void RISCVDAGToDAGISel::Select(SDNode *Node) {
   // If we have a custom node, we have already selected.
   if (Node->isMachineOpcode()) {
@@ -704,56 +746,127 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     break;
   }
   case ISD::INSERT_SUBVECTOR: {
-    // Bail when not a "cast" like insert_subvector.
-    if (Node->getConstantOperandVal(2) != 0)
-      break;
-    if (!Node->getOperand(0).isUndef())
-      break;
+    SDValue V = Node->getOperand(0);
+    SDValue SubV = Node->getOperand(1);
+    SDLoc DL(SubV);
+    auto Idx = Node->getConstantOperandVal(2);
+    MVT SubVecVT = Node->getOperand(1).getSimpleValueType();
 
-    // Bail when normal isel should do the job.
-    MVT InVT = Node->getOperand(1).getSimpleValueType();
-    if (VT.isFixedLengthVector() || InVT.isScalableVector())
-      break;
+    // TODO: This method of selecting INSERT_SUBVECTOR should work
+    // with any type of insertion (fixed <-> scalable) but we don't yet
+    // correctly identify the canonical register class for fixed-length types.
+    // For now, keep the two paths separate.
+    if (VT.isScalableVector() && SubVecVT.isScalableVector()) {
+      bool IsFullVecReg = false;
+      switch (getLMUL(SubVecVT)) {
+      default:
+        break;
+      case RISCVVLMUL::LMUL_1:
+      case RISCVVLMUL::LMUL_2:
+      case RISCVVLMUL::LMUL_4:
+      case RISCVVLMUL::LMUL_8:
+        IsFullVecReg = true;
+        break;
+      }
 
-    unsigned RegClassID;
-    if (VT.getVectorElementType() == MVT::i1)
-      RegClassID = RISCV::VRRegClassID;
-    else
-      RegClassID = getRegClassIDForLMUL(getLMUL(VT));
+      // If the subvector doesn't occupy a full vector register then we can't
+      // insert it purely using subregister manipulation. We must not clobber
+      // the untouched elements (say, in the upper half of the VR register).
+      if (!IsFullVecReg)
+        break;
 
-    SDValue V = Node->getOperand(1);
-    SDLoc DL(V);
-    SDValue RC =
-        CurDAG->getTargetConstant(RegClassID, DL, Subtarget->getXLenVT());
-    SDNode *NewNode =
-        CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
-    ReplaceNode(Node, NewNode);
-    return;
+      const auto *TRI = Subtarget->getRegisterInfo();
+      unsigned SubRegIdx;
+      std::tie(SubRegIdx, Idx) =
+          decomposeSubvectorInsertExtractToSubRegs(VT, SubVecVT, Idx, TRI);
+
+      // If the Idx hasn't been completely eliminated then this is a subvector
+      // insert which doesn't naturally align to a vector register. These must
+      // be handled using instructions to manipulate the vector registers.
+      if (Idx != 0)
+        break;
+
+      SDNode *NewNode = CurDAG->getMachineNode(
+          TargetOpcode::INSERT_SUBREG, DL, VT, V, SubV,
+          CurDAG->getTargetConstant(SubRegIdx, DL, Subtarget->getXLenVT()));
+      return ReplaceNode(Node, NewNode);
+    }
+
+    if (VT.isScalableVector() && SubVecVT.isFixedLengthVector()) {
+      // Bail when not a "cast" like insert_subvector.
+      if (Idx != 0)
+        break;
+      if (!Node->getOperand(0).isUndef())
+        break;
+
+      unsigned RegClassID = getRegClassIDForVecVT(VT);
+
+      SDValue RC =
+          CurDAG->getTargetConstant(RegClassID, DL, Subtarget->getXLenVT());
+      SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                               DL, VT, SubV, RC);
+      ReplaceNode(Node, NewNode);
+      return;
+    }
+    break;
   }
   case ISD::EXTRACT_SUBVECTOR: {
-    // Bail when not a "cast" like extract_subvector.
-    if (Node->getConstantOperandVal(1) != 0)
-      break;
-
-    // Bail when normal isel can do the job.
-    MVT InVT = Node->getOperand(0).getSimpleValueType();
-    if (VT.isScalableVector() || InVT.isFixedLengthVector())
-      break;
-
-    unsigned RegClassID;
-    if (InVT.getVectorElementType() == MVT::i1)
-      RegClassID = RISCV::VRRegClassID;
-    else
-      RegClassID = getRegClassIDForLMUL(getLMUL(InVT));
-
     SDValue V = Node->getOperand(0);
+    auto Idx = Node->getConstantOperandVal(1);
+    MVT InVT = Node->getOperand(0).getSimpleValueType();
     SDLoc DL(V);
-    SDValue RC =
-        CurDAG->getTargetConstant(RegClassID, DL, Subtarget->getXLenVT());
-    SDNode *NewNode =
-        CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
-    ReplaceNode(Node, NewNode);
-    return;
+
+    // TODO: This method of selecting EXTRACT_SUBVECTOR should work
+    // with any type of extraction (fixed <-> scalable) but we don't yet
+    // correctly identify the canonical register class for fixed-length types.
+    // For now, keep the two paths separate.
+    if (VT.isScalableVector() && InVT.isScalableVector()) {
+      const auto *TRI = Subtarget->getRegisterInfo();
+      unsigned SubRegIdx;
+      std::tie(SubRegIdx, Idx) =
+          decomposeSubvectorInsertExtractToSubRegs(InVT, VT, Idx, TRI);
+
+      // If the Idx hasn't been completely eliminated then this is a subvector
+      // extract which doesn't naturally align to a vector register. These must
+      // be handled using instructions to manipulate the vector registers.
+      if (Idx != 0)
+        break;
+
+      // If we haven't set a SubRegIdx, then we must be going between LMUL<=1
+      // types (VR -> VR). This can be done as a copy.
+      if (SubRegIdx == RISCV::NoSubRegister) {
+        unsigned RegClassID = getRegClassIDForVecVT(VT);
+        unsigned InRegClassID = getRegClassIDForVecVT(InVT);
+        assert(RegClassID == InRegClassID &&
+               RegClassID == RISCV::VRRegClassID &&
+               "Unexpected subvector extraction");
+        SDValue RC =
+            CurDAG->getTargetConstant(InRegClassID, DL, Subtarget->getXLenVT());
+        SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                                 DL, VT, V, RC);
+        return ReplaceNode(Node, NewNode);
+      }
+
+      SDNode *NewNode = CurDAG->getMachineNode(
+          TargetOpcode::EXTRACT_SUBREG, DL, VT, V,
+          CurDAG->getTargetConstant(SubRegIdx, DL, Subtarget->getXLenVT()));
+      return ReplaceNode(Node, NewNode);
+    }
+
+    if (VT.isFixedLengthVector() && InVT.isScalableVector()) {
+      // Bail when not a "cast" like extract_subvector.
+      if (Idx != 0)
+        break;
+
+      unsigned InRegClassID = getRegClassIDForVecVT(InVT);
+
+      SDValue RC =
+          CurDAG->getTargetConstant(InRegClassID, DL, Subtarget->getXLenVT());
+      SDNode *NewNode =
+          CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
+      ReplaceNode(Node, NewNode);
+      return;
+    }
+    break;
   }
   }
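
For reference, a minimal standalone model of the decomposition performed by
decomposeSubvectorInsertExtractToSubRegs above (a sketch, not the LLVM code:
plain integers and a string stand in for the MVTs and the composed sub_vrm*
subregister index, and the real helper never descends below whole-register
granularity):

#include <cstdio>
#include <string>

int main() {
  // Known-minimum element counts for i32 types: nxv16i32 (LMUL=8) down to
  // nxv2i32 (one whole vector register).
  unsigned VecMinElts = 16; // containing type: nxv16i32
  unsigned SubMinElts = 2;  // subvector type:  nxv2i32
  unsigned Idx = 12;        // element index of the insert/extract

  std::string Path; // stands in for the composed subregister index
  // Halve the containing type until it reaches the subvector's size; at each
  // step descend into the low or high half and adjust the leftover index.
  while (VecMinElts > SubMinElts) {
    VecMinElts /= 2;
    bool IsHi = Idx >= VecMinElts;
    if (!Path.empty())
      Path += "_then_";
    Path += IsHi ? "hi" : "lo";
    if (IsHi)
      Idx -= VecMinElts;
  }

  // Prints "hi_then_hi_then_lo leftover=0", mirroring the comment's
  // nxv16i32@12 -> nxv2i32: sub_vrm4_1_then_sub_vrm2_1_then_sub_vrm1_0.
  // A non-zero leftover is the misaligned case the patch still bails on.
  std::printf("%s leftover=%u\n", Path.c_str(), Idx);
  return 0;
}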


@@ -0,0 +1,226 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s
define <vscale x 4 x i32> @extract_nxv8i32_nxv4i32_0(<vscale x 8 x i32> %vec) {
; CHECK-LABEL: extract_nxv8i32_nxv4i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m4
; CHECK-NEXT: ret
%c = call <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %vec, i64 0)
ret <vscale x 4 x i32> %c
}
define <vscale x 4 x i32> @extract_nxv8i32_nxv4i32_4(<vscale x 8 x i32> %vec) {
; CHECK-LABEL: extract_nxv8i32_nxv4i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
%c = call <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %vec, i64 4)
ret <vscale x 4 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv8i32_nxv2i32_0(<vscale x 8 x i32> %vec) {
; CHECK-LABEL: extract_nxv8i32_nxv2i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8m4
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, i64 0)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv8i32_nxv2i32_2(<vscale x 8 x i32> %vec) {
; CHECK-LABEL: extract_nxv8i32_nxv2i32_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, i64 2)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv8i32_nxv2i32_4(<vscale x 8 x i32> %vec) {
; CHECK-LABEL: extract_nxv8i32_nxv2i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, i64 4)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv8i32_nxv2i32_6(<vscale x 8 x i32> %vec) {
; CHECK-LABEL: extract_nxv8i32_nxv2i32_6:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v11
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, i64 6)
ret <vscale x 2 x i32> %c
}
define <vscale x 8 x i32> @extract_nxv16i32_nxv8i32_0(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv8i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $v8m4 killed $v8m4 killed $v8m8
; CHECK-NEXT: ret
%c = call <vscale x 8 x i32> @llvm.experimental.vector.extract.nxv8i32.nxv16i32(<vscale x 16 x i32> %vec, i64 0)
ret <vscale x 8 x i32> %c
}
define <vscale x 8 x i32> @extract_nxv16i32_nxv8i32_8(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv8i32_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv4r.v v8, v12
; CHECK-NEXT: ret
%c = call <vscale x 8 x i32> @llvm.experimental.vector.extract.nxv8i32.nxv16i32(<vscale x 16 x i32> %vec, i64 8)
ret <vscale x 8 x i32> %c
}
define <vscale x 4 x i32> @extract_nxv16i32_nxv4i32_0(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv4i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m8
; CHECK-NEXT: ret
%c = call <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, i64 0)
ret <vscale x 4 x i32> %c
}
define <vscale x 4 x i32> @extract_nxv16i32_nxv4i32_4(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv4i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
%c = call <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, i64 4)
ret <vscale x 4 x i32> %c
}
define <vscale x 4 x i32> @extract_nxv16i32_nxv4i32_8(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv4i32_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: ret
%c = call <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, i64 8)
ret <vscale x 4 x i32> %c
}
define <vscale x 4 x i32> @extract_nxv16i32_nxv4i32_12(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv4i32_12:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v8, v14
; CHECK-NEXT: ret
%c = call <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, i64 12)
ret <vscale x 4 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_0(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8m8
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 0)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_2(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 2)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_4(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 4)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_6(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_6:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v11
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 6)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_8(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v12
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 8)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_10(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_10:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v13
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 10)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_12(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_12:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v14
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 12)
ret <vscale x 2 x i32> %c
}
define <vscale x 2 x i32> @extract_nxv16i32_nxv2i32_14(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv2i32_14:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v15
; CHECK-NEXT: ret
%c = call <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 14)
ret <vscale x 2 x i32> %c
}
define <vscale x 1 x i32> @extract_nxv16i32_nxv1i32_0(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv1i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8m8
; CHECK-NEXT: ret
%c = call <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, i64 0)
ret <vscale x 1 x i32> %c
}
; TODO: Extracts that don't align to a vector register are not yet supported.
; In this case we want to extract the upper half of the lowest VR subregister
; in the LMUL group.
; define <vscale x 1 x i32> @extract_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec) {
; %c = call <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, i64 1)
; ret <vscale x 1 x i32> %c
; }
define <vscale x 1 x i32> @extract_nxv16i32_nxv1i32_2(<vscale x 16 x i32> %vec) {
; CHECK-LABEL: extract_nxv16i32_nxv1i32_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%c = call <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, i64 2)
ret <vscale x 1 x i32> %c
}
define <vscale x 1 x i32> @extract_nxv2i32_nxv1i32_0(<vscale x 2 x i32> %vec) {
; CHECK-LABEL: extract_nxv2i32_nxv1i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
%c = call <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv2i32(<vscale x 2 x i32> %vec, i64 0)
ret <vscale x 1 x i32> %c
}
declare <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv2i32(<vscale x 2 x i32> %vec, i64 %idx)
declare <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, i64 %idx)
declare <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %vec, i64 %idx)
declare <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, i64 %idx)
declare <vscale x 2 x i32> @llvm.experimental.vector.extract.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 %idx)
declare <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, i64 %idx)
declare <vscale x 8 x i32> @llvm.experimental.vector.extract.nxv8i32.nxv16i32(<vscale x 16 x i32> %vec, i64 %idx)


@@ -0,0 +1,206 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s
define <vscale x 8 x i32> @insert_nxv8i32_nxv4i32_0(<vscale x 8 x i32> %vec, <vscale x 4 x i32> %subvec) {
; CHECK-LABEL: insert_nxv8i32_nxv4i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: ret
%v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv8i32(<vscale x 8 x i32> %vec, <vscale x 4 x i32> %subvec, i64 0)
ret <vscale x 8 x i32> %v
}
define <vscale x 8 x i32> @insert_nxv8i32_nxv4i32_4(<vscale x 8 x i32> %vec, <vscale x 4 x i32> %subvec) {
; CHECK-LABEL: insert_nxv8i32_nxv4i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v10, v12
; CHECK-NEXT: ret
%v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv8i32(<vscale x 8 x i32> %vec, <vscale x 4 x i32> %subvec, i64 4)
ret <vscale x 8 x i32> %v
}
define <vscale x 8 x i32> @insert_nxv8i32_nxv2i32_0(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv8i32_nxv2i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v12
; CHECK-NEXT: ret
%v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec, i64 0)
ret <vscale x 8 x i32> %v
}
define <vscale x 8 x i32> @insert_nxv8i32_nxv2i32_2(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv8i32_nxv2i32_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v9, v12
; CHECK-NEXT: ret
%v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec, i64 2)
ret <vscale x 8 x i32> %v
}
define <vscale x 8 x i32> @insert_nxv8i32_nxv2i32_4(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv8i32_nxv2i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v10, v12
; CHECK-NEXT: ret
%v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec, i64 4)
ret <vscale x 8 x i32> %v
}
define <vscale x 8 x i32> @insert_nxv8i32_nxv2i32_6(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv8i32_nxv2i32_6:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v11, v12
; CHECK-NEXT: ret
%v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32> %vec, <vscale x 2 x i32> %subvec, i64 6)
ret <vscale x 8 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv8i32_0(<vscale x 16 x i32> %vec, <vscale x 8 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv8i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv4r.v v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 8 x i32> %subvec, i64 0)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv8i32_8(<vscale x 16 x i32> %vec, <vscale x 8 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv8i32_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv4r.v v12, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 8 x i32> %subvec, i64 8)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv4i32_0(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv4i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec, i64 0)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv4i32_4(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv4i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v10, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec, i64 4)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv4i32_8(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv4i32_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v12, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec, i64 8)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv4i32_12(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv4i32_12:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv2r.v v14, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 4 x i32> %subvec, i64 12)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_0(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 0)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_2(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v9, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 2)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_4(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v10, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 4)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_6(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_6:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v11, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 6)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_8(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v12, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 8)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_10(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_10:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v13, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 10)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_12(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_12:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v14, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 12)
ret <vscale x 16 x i32> %v
}
define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_14(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec) {
; CHECK-LABEL: insert_nxv16i32_nxv2i32_14:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v15, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 2 x i32> %subvec, i64 14)
ret <vscale x 16 x i32> %v
}
; TODO: Inserts that are less than LMUL=1 are not yet supported. In this case
; we need to mask out the unaffected elements (top half of the VR %subvec
; register)
;define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_0(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
; %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 0)
; ret <vscale x 16 x i32> %v
;}
; TODO: Inserts that don't align to a vector register are not yet supported.
; In this case we want to insert the subvector into the upper half of the
; lowest VR subregister in the LMUL group.
;define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
; %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
; ret <vscale x 16 x i32> %v
;}
declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32>, <vscale x 2 x i32>, i64 %idx)
declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv8i32(<vscale x 8 x i32>, <vscale x 4 x i32>, i64 %idx)
declare <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32>, <vscale x 1 x i32>, i64 %idx)
declare <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32>, <vscale x 2 x i32>, i64 %idx)
declare <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv16i32(<vscale x 16 x i32>, <vscale x 4 x i32>, i64 %idx)
declare <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv16i32(<vscale x 16 x i32>, <vscale x 8 x i32>, i64 %idx)