[RISCV] Optimize vector_shuffles that are interleaving the lowest elements of two vectors.

RISCV only has a unary shuffle that requires the indices to be placed in a
register. For interleaving two vectors this means we need at least two
vrgathers and a vmerge.

This patch teaches shuffle lowering to use a widening addu followed
by a widening vmaccu to implement the interleave. First we extract
the low half of both V1 and V2. Then we compute
(zext(V1) + zext(V2)) + (zext(V2) * (2^eltbits - 1)), which
simplifies to zext(V1) + zext(V2) * 2^eltbits, i.e.
zext(V1) + (zext(V2) << eltbits). Finally we bitcast the result back
to the original type, splitting each wide element in half.
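
As a scalar sanity check of that identity (an editorial sketch, not part of
this commit; it assumes 8-bit elements and a little-endian target, which
RISC-V is):

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  int main() {
    uint8_t V1 = 0x12, V2 = 0x34;
    // (zext(V1) + zext(V2)) + zext(V2) * (2^8 - 1) == zext(V1) + (zext(V2) << 8)
    uint16_t Wide = uint16_t(uint16_t(V1) + uint16_t(V2) + uint16_t(V2) * 0xFFu);
    assert(Wide == (uint16_t(V1) | (uint16_t(V2) << 8)));
    // Splitting the wide element back into two narrow elements yields the
    // interleaved pair {V1, V2}.
    uint8_t Interleaved[2];
    std::memcpy(Interleaved, &Wide, sizeof(Wide));
    assert(Interleaved[0] == 0x12 && Interleaved[1] == 0x34);
  }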

We can only do this if a type with wider elements is available, which rules
out vXi64 and vXf64. Because we're using extends we also have to be careful
with fractional LMULs. Floating-point types are supported by bitcasting
to/from integer.

The tests cover a varied combination of LMULs, split across VLEN>=128 and
VLEN>=512 runs. There are a few tests with commuted shuffle indices as well
as tests with undef indices. There's one test for a vXi64/vXf64 vector which
we can't optimize, but it verifies we don't crash.

Reviewed By: rogfer01

Differential Revision: https://reviews.llvm.org/D117743
Craig Topper 2022-01-20 14:16:37 -08:00
parent cd2d736963
commit fa8bb22466
6 changed files with 995 additions and 15 deletions


@@ -2328,6 +2328,48 @@ static int matchShuffleAsSlideDown(ArrayRef<int> Mask) {
return -1;
}
static bool isInterleaveShuffle(ArrayRef<int> Mask, MVT VT, bool &SwapSources,
const RISCVSubtarget &Subtarget) {
// We need to be able to widen elements to the next larger integer type.
if (VT.getScalarSizeInBits() >= Subtarget.getMaxELENForFixedLengthVectors())
return false;
int Size = Mask.size();
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
int Srcs[] = {-1, -1};
for (int i = 0; i != Size; ++i) {
// Ignore undef elements.
if (Mask[i] < 0)
continue;
// Is this an even or odd element.
int Pol = i % 2;
// Ensure we consistently use the same source for this element polarity.
int Src = Mask[i] / Size;
if (Srcs[Pol] < 0)
Srcs[Pol] = Src;
if (Srcs[Pol] != Src)
return false;
// Make sure the element within the source is appropriate for this element
// in the destination.
int Elt = Mask[i] % Size;
if (Elt != i / 2)
return false;
}
// We need to find a source for each polarity and they can't be the same.
if (Srcs[0] < 0 || Srcs[1] < 0 || Srcs[0] == Srcs[1])
return false;
// Swap the sources if the second source was in the even polarity.
SwapSources = Srcs[0] > Srcs[1];
return true;
}
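// Editorial example (not part of this patch): for VT = v4i16, the low-half
// interleave mask {0, 4, 1, 5} matches with SwapSources = false, the
// commuted mask {4, 0, 5, 1} matches with SwapSources = true, and a mask
// such as {0, 4, 2, 6} is rejected because destination element 2 takes
// source element 2 rather than the required element 2 / 2 == 1.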
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
SDValue V1 = Op.getOperand(0);
@@ -2413,8 +2455,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
}
}
ArrayRef<int> Mask = SVN->getMask();
// Try to match as a slidedown.
int SlideAmt = matchShuffleAsSlideDown(SVN->getMask());
int SlideAmt = matchShuffleAsSlideDown(Mask);
if (SlideAmt >= 0) {
// TODO: Should we reduce the VL to account for the upper undef elements?
// Requires additional vsetvlis, but might be faster to execute.
@@ -2427,10 +2471,81 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
return convertFromScalableVector(VT, SlideDown, DAG, Subtarget);
}
// Detect an interleave shuffle and lower to
// (vmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1))
bool SwapSources;
if (isInterleaveShuffle(Mask, VT, SwapSources, Subtarget)) {
// Swap sources if needed.
if (SwapSources)
std::swap(V1, V2);
// Extract the lower half of the vectors.
MVT HalfVT = VT.getHalfNumVectorElementsVT();
V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
DAG.getConstant(0, DL, XLenVT));
V2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V2,
DAG.getConstant(0, DL, XLenVT));
// Double the element width and halve the number of elements in an int type.
unsigned EltBits = VT.getScalarSizeInBits();
MVT WideIntEltVT = MVT::getIntegerVT(EltBits * 2);
MVT WideIntVT =
MVT::getVectorVT(WideIntEltVT, VT.getVectorNumElements() / 2);
// Convert this to a scalable vector. We need to base this on the
// destination size to ensure there's always a type with a smaller LMUL.
MVT WideIntContainerVT =
getContainerForFixedLengthVector(DAG, WideIntVT, Subtarget);
// Convert sources to scalable vectors with the same element count as the
// larger type.
MVT HalfContainerVT = MVT::getVectorVT(
VT.getVectorElementType(), WideIntContainerVT.getVectorElementCount());
V1 = convertToScalableVector(HalfContainerVT, V1, DAG, Subtarget);
V2 = convertToScalableVector(HalfContainerVT, V2, DAG, Subtarget);
// Cast sources to integer.
MVT IntEltVT = MVT::getIntegerVT(EltBits);
MVT IntHalfVT =
MVT::getVectorVT(IntEltVT, HalfContainerVT.getVectorElementCount());
V1 = DAG.getBitcast(IntHalfVT, V1);
V2 = DAG.getBitcast(IntHalfVT, V2);
// Freeze V2 since we use it twice and we need to be sure that the add and
// multiply see the same value.
V2 = DAG.getNode(ISD::FREEZE, DL, IntHalfVT, V2);
// Recreate TrueMask using the widened type's element count.
MVT MaskVT =
MVT::getVectorVT(MVT::i1, HalfContainerVT.getVectorElementCount());
TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
// Widen V1 and V2 with 0s and add one copy of V2 to V1.
SDValue Add = DAG.getNode(RISCVISD::VWADDU_VL, DL, WideIntContainerVT, V1,
V2, TrueMask, VL);
// Create 2^eltbits - 1 copies of V2 by multiplying by the largest integer.
SDValue Multiplier = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntHalfVT,
DAG.getAllOnesConstant(DL, XLenVT));
SDValue WidenMul = DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT,
V2, Multiplier, TrueMask, VL);
// Add the new copies to our previous addition giving us 2^eltbits copies of
// V2. This is equivalent to shifting V2 left by eltbits. This should
// combine with the vwmulu.vv above to form vwmaccu.vv.
Add = DAG.getNode(RISCVISD::ADD_VL, DL, WideIntContainerVT, Add, WidenMul,
TrueMask, VL);
// Cast back to ContainerVT. We need to re-create a new ContainerVT in case
// WideIntContainerVT is a larger fractional LMUL than implied by the fixed
// vector VT.
ContainerVT =
MVT::getVectorVT(VT.getVectorElementType(),
WideIntContainerVT.getVectorElementCount() * 2);
Add = DAG.getBitcast(ContainerVT, Add);
return convertFromScalableVector(VT, Add, DAG, Subtarget);
}
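// Editorial note (not part of this patch): as the new interleave tests below
// show, interleaving two <2 x i16> vectors into <4 x i16> now lowers to
//   vsetivli zero, 4, e16, mf4, ta, mu
//   vwaddu.vv v10, v8, v9
//   li a0, -1
//   vwmaccu.vx v10, a0, v9
// instead of the previous two-vrgather-plus-vmerge sequence.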
// Detect shuffles which can be re-expressed as vector selects; these are
// shuffles in which each element in the destination is taken from an element
// at the corresponding index in either source vectors.
bool IsSelect = all_of(enumerate(SVN->getMask()), [&](const auto &MaskIdx) {
bool IsSelect = all_of(enumerate(Mask), [&](const auto &MaskIdx) {
int MaskIndex = MaskIdx.value();
return MaskIndex < 0 || MaskIdx.index() == (unsigned)MaskIndex % NumElts;
});
@@ -2456,7 +2571,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
// Now construct the mask that will be used by the vselect or blended
// vrgather operation. For vrgathers, construct the appropriate indices into
// each vector.
for (int MaskIndex : SVN->getMask()) {
for (int MaskIndex : Mask) {
bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ InvertMask;
MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
if (!IsSelect) {
@@ -9941,6 +10056,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FP_ROUND_VL)
NODE_NAME_CASE(VWMUL_VL)
NODE_NAME_CASE(VWMULU_VL)
NODE_NAME_CASE(VWADDU_VL)
NODE_NAME_CASE(SETCC_VL)
NODE_NAME_CASE(VSELECT_VL)
NODE_NAME_CASE(VMAND_VL)


@@ -245,6 +245,7 @@ enum NodeType : unsigned {
// Widening instructions
VWMUL_VL,
VWMULU_VL,
VWADDU_VL,
// Vector compare producing a mask. Fourth operand is input mask. Fifth
// operand is VL.


@@ -221,14 +221,15 @@ def riscv_trunc_vector_vl : SDNode<"RISCVISD::TRUNCATE_VECTOR_VL",
SDTCVecEltisVT<2, i1>,
SDTCisVT<3, XLenVT>]>>;
def SDT_RISCVVWMUL_VL : SDTypeProfile<1, 4, [SDTCisVec<0>,
def SDT_RISCVVWBinOp_VL : SDTypeProfile<1, 4, [SDTCisVec<0>,
SDTCisSameNumEltsAs<0, 1>,
SDTCisSameAs<1, 2>,
SDTCisSameNumEltsAs<1, 3>,
SDTCVecEltisVT<3, i1>,
SDTCisVT<4, XLenVT>]>;
def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>;
def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>;
def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
def riscv_vwaddu_vl : SDNode<"RISCVISD::VWADDU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
def SDTRVVVecReduce : SDTypeProfile<1, 5, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>,
@@ -712,6 +713,9 @@ foreach vti = AllIntegerVectors in {
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
// 12.2. Vector Widening Integer Add/Subtract
defm : VPatBinaryWVL_VV_VX<riscv_vwaddu_vl, "PseudoVWADDU">;
// 12.3. Vector Integer Extension
defm : VPatExtendSDNode_V_VL<riscv_zext_vl, "PseudoVZEXT", "VF2",
AllFractionableVF2IntVectors>;


@@ -0,0 +1,378 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+zfh -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV32-V128
; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+zfh -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128
; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+zfh -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512
; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+zfh -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512
; Test optimizing interleaves to widening arithmetic.
define <4 x half> @interleave_v2f16(<2 x half> %x, <2 x half> %y) {
; CHECK-LABEL: interleave_v2f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf4, ta, mu
; CHECK-NEXT: vwaddu.vv v10, v8, v9
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%a = shufflevector <2 x half> %x, <2 x half> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
ret <4 x half> %a
}
; Vector order switched for coverage.
define <4 x float> @interleave_v2f32(<2 x float> %x, <2 x float> %y) {
; CHECK-LABEL: interleave_v2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, mf2, ta, mu
; CHECK-NEXT: vwaddu.vv v10, v9, v8
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v10, a0, v8
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%a = shufflevector <2 x float> %x, <2 x float> %y, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
ret <4 x float> %a
}
; One vXf64 test case to verify that we don't optimize it.
; FIXME: Is there better codegen we can do here?
define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
; RV32-V128-LABEL: interleave_v2f64:
; RV32-V128: # %bb.0:
; RV32-V128-NEXT: vmv1r.v v12, v9
; RV32-V128-NEXT: # kill: def $v8 killed $v8 def $v8m2
; RV32-V128-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; RV32-V128-NEXT: vid.v v10
; RV32-V128-NEXT: vsrl.vi v14, v10, 1
; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, mu
; RV32-V128-NEXT: vrgatherei16.vv v10, v8, v14
; RV32-V128-NEXT: li a0, 10
; RV32-V128-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; RV32-V128-NEXT: vmv.s.x v0, a0
; RV32-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu
; RV32-V128-NEXT: vrgatherei16.vv v10, v12, v14, v0.t
; RV32-V128-NEXT: vmv.v.v v8, v10
; RV32-V128-NEXT: ret
;
; RV64-V128-LABEL: interleave_v2f64:
; RV64-V128: # %bb.0:
; RV64-V128-NEXT: vmv1r.v v12, v9
; RV64-V128-NEXT: # kill: def $v8 killed $v8 def $v8m2
; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu
; RV64-V128-NEXT: vid.v v10
; RV64-V128-NEXT: vsrl.vi v14, v10, 1
; RV64-V128-NEXT: vrgather.vv v10, v8, v14
; RV64-V128-NEXT: li a0, 10
; RV64-V128-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; RV64-V128-NEXT: vmv.s.x v0, a0
; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu
; RV64-V128-NEXT: vrgather.vv v10, v12, v14, v0.t
; RV64-V128-NEXT: vmv.v.v v8, v10
; RV64-V128-NEXT: ret
;
; RV32-V512-LABEL: interleave_v2f64:
; RV32-V512: # %bb.0:
; RV32-V512-NEXT: vsetivli zero, 4, e16, mf4, ta, mu
; RV32-V512-NEXT: vid.v v10
; RV32-V512-NEXT: vsrl.vi v11, v10, 1
; RV32-V512-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11
; RV32-V512-NEXT: li a0, 10
; RV32-V512-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; RV32-V512-NEXT: vmv.s.x v0, a0
; RV32-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu
; RV32-V512-NEXT: vrgatherei16.vv v10, v9, v11, v0.t
; RV32-V512-NEXT: vmv.v.v v8, v10
; RV32-V512-NEXT: ret
;
; RV64-V512-LABEL: interleave_v2f64:
; RV64-V512: # %bb.0:
; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu
; RV64-V512-NEXT: vid.v v10
; RV64-V512-NEXT: vsrl.vi v11, v10, 1
; RV64-V512-NEXT: vrgather.vv v10, v8, v11
; RV64-V512-NEXT: li a0, 10
; RV64-V512-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; RV64-V512-NEXT: vmv.s.x v0, a0
; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu
; RV64-V512-NEXT: vrgather.vv v10, v9, v11, v0.t
; RV64-V512-NEXT: vmv.v.v v8, v10
; RV64-V512-NEXT: ret
%a = shufflevector <2 x double> %x, <2 x double> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
ret <4 x double> %a
}
; Undef elements for coverage
define <8 x half> @interleave_v4f16(<4 x half> %x, <4 x half> %y) {
; V128-LABEL: interleave_v4f16:
; V128: # %bb.0:
; V128-NEXT: vsetivli zero, 8, e16, mf2, ta, mu
; V128-NEXT: vwaddu.vv v10, v8, v9
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v10, a0, v9
; V128-NEXT: vmv1r.v v8, v10
; V128-NEXT: ret
;
; V512-LABEL: interleave_v4f16:
; V512: # %bb.0:
; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <4 x half> %x, <4 x half> %y, <8 x i32> <i32 0, i32 4, i32 undef, i32 5, i32 2, i32 undef, i32 3, i32 7>
ret <8 x half> %a
}
define <8 x float> @interleave_v4f32(<4 x float> %x, <4 x float> %y) {
; V128-LABEL: interleave_v4f32:
; V128: # %bb.0:
; V128-NEXT: vsetivli zero, 8, e32, m1, ta, mu
; V128-NEXT: vwaddu.vv v10, v8, v9
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v10, a0, v9
; V128-NEXT: vmv2r.v v8, v10
; V128-NEXT: ret
;
; V512-LABEL: interleave_v4f32:
; V512: # %bb.0:
; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
ret <8 x float> %a
}
; Vector order switched for coverage.
define <16 x half> @interleave_v8f16(<8 x half> %x, <8 x half> %y) {
; V128-LABEL: interleave_v8f16:
; V128: # %bb.0:
; V128-NEXT: vsetivli zero, 16, e16, m1, ta, mu
; V128-NEXT: vwaddu.vv v10, v9, v8
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v10, a0, v8
; V128-NEXT: vmv2r.v v8, v10
; V128-NEXT: ret
;
; V512-LABEL: interleave_v8f16:
; V512: # %bb.0:
; V512-NEXT: vsetivli zero, 16, e16, mf4, ta, mu
; V512-NEXT: vwaddu.vv v10, v9, v8
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v8
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <8 x half> %x, <8 x half> %y, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
ret <16 x half> %a
}
define <16 x float> @interleave_v8f32(<8 x float> %x, <8 x float> %y) {
; V128-LABEL: interleave_v8f32:
; V128: # %bb.0:
; V128-NEXT: vsetivli zero, 16, e32, m2, ta, mu
; V128-NEXT: vwaddu.vv v12, v8, v10
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v12, a0, v10
; V128-NEXT: vmv4r.v v8, v12
; V128-NEXT: ret
;
; V512-LABEL: interleave_v8f32:
; V512: # %bb.0:
; V512-NEXT: vsetivli zero, 16, e32, mf2, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <8 x float> %x, <8 x float> %y, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
ret <16 x float> %a
}
define <32 x half> @interleave_v16f16(<16 x half> %x, <16 x half> %y) {
; V128-LABEL: interleave_v16f16:
; V128: # %bb.0:
; V128-NEXT: li a0, 32
; V128-NEXT: vsetvli zero, a0, e16, m2, ta, mu
; V128-NEXT: vwaddu.vv v12, v8, v10
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v12, a0, v10
; V128-NEXT: vmv4r.v v8, v12
; V128-NEXT: ret
;
; V512-LABEL: interleave_v16f16:
; V512: # %bb.0:
; V512-NEXT: li a0, 32
; V512-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <16 x half> %x, <16 x half> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <32 x half> %a
}
define <32 x float> @interleave_v16f32(<16 x float> %x, <16 x float> %y) {
; V128-LABEL: interleave_v16f32:
; V128: # %bb.0:
; V128-NEXT: li a0, 32
; V128-NEXT: vsetvli zero, a0, e32, m4, ta, mu
; V128-NEXT: vwaddu.vv v16, v8, v12
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v16, a0, v12
; V128-NEXT: vmv8r.v v8, v16
; V128-NEXT: ret
;
; V512-LABEL: interleave_v16f32:
; V512: # %bb.0:
; V512-NEXT: li a0, 32
; V512-NEXT: vsetvli zero, a0, e32, m1, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv2r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <16 x float> %x, <16 x float> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <32 x float> %a
}
define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) {
; V128-LABEL: interleave_v32f16:
; V128: # %bb.0:
; V128-NEXT: li a0, 64
; V128-NEXT: vsetvli zero, a0, e16, m4, ta, mu
; V128-NEXT: vwaddu.vv v16, v8, v12
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v16, a0, v12
; V128-NEXT: vmv8r.v v8, v16
; V128-NEXT: ret
;
; V512-LABEL: interleave_v32f16:
; V512: # %bb.0:
; V512-NEXT: li a0, 64
; V512-NEXT: vsetvli zero, a0, e16, m1, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv2r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <32 x half> %x, <32 x half> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <64 x half> %a
}
define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
; RV32-V128-LABEL: interleave_v32f32:
; RV32-V128: # %bb.0:
; RV32-V128-NEXT: addi sp, sp, -16
; RV32-V128-NEXT: .cfi_def_cfa_offset 16
; RV32-V128-NEXT: csrr a0, vlenb
; RV32-V128-NEXT: slli a0, a0, 4
; RV32-V128-NEXT: sub sp, sp, a0
; RV32-V128-NEXT: lui a0, %hi(.LCPI10_0)
; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_0)
; RV32-V128-NEXT: li a1, 32
; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; RV32-V128-NEXT: vle32.v v0, (a0)
; RV32-V128-NEXT: vmv8r.v v24, v8
; RV32-V128-NEXT: addi a0, sp, 16
; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-V128-NEXT: vrgather.vv v8, v24, v0
; RV32-V128-NEXT: lui a0, %hi(.LCPI10_1)
; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_1)
; RV32-V128-NEXT: vle32.v v24, (a0)
; RV32-V128-NEXT: csrr a0, vlenb
; RV32-V128-NEXT: slli a0, a0, 3
; RV32-V128-NEXT: add a0, sp, a0
; RV32-V128-NEXT: addi a0, a0, 16
; RV32-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; RV32-V128-NEXT: lui a0, 699051
; RV32-V128-NEXT: addi a0, a0, -1366
; RV32-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
; RV32-V128-NEXT: vmv.s.x v0, a0
; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; RV32-V128-NEXT: csrr a0, vlenb
; RV32-V128-NEXT: slli a0, a0, 3
; RV32-V128-NEXT: add a0, sp, a0
; RV32-V128-NEXT: addi a0, a0, 16
; RV32-V128-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
; RV32-V128-NEXT: vrgather.vv v8, v16, v24, v0.t
; RV32-V128-NEXT: vmv.v.v v24, v8
; RV32-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu
; RV32-V128-NEXT: addi a0, sp, 16
; RV32-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload
; RV32-V128-NEXT: vwaddu.vv v0, v8, v16
; RV32-V128-NEXT: li a0, -1
; RV32-V128-NEXT: vwmaccu.vx v0, a0, v16
; RV32-V128-NEXT: vmv8r.v v8, v0
; RV32-V128-NEXT: vmv8r.v v16, v24
; RV32-V128-NEXT: csrr a0, vlenb
; RV32-V128-NEXT: slli a0, a0, 4
; RV32-V128-NEXT: add sp, sp, a0
; RV32-V128-NEXT: addi sp, sp, 16
; RV32-V128-NEXT: ret
;
; RV64-V128-LABEL: interleave_v32f32:
; RV64-V128: # %bb.0:
; RV64-V128-NEXT: addi sp, sp, -16
; RV64-V128-NEXT: .cfi_def_cfa_offset 16
; RV64-V128-NEXT: csrr a0, vlenb
; RV64-V128-NEXT: slli a0, a0, 4
; RV64-V128-NEXT: sub sp, sp, a0
; RV64-V128-NEXT: lui a0, %hi(.LCPI10_0)
; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_0)
; RV64-V128-NEXT: li a1, 32
; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; RV64-V128-NEXT: vle32.v v0, (a0)
; RV64-V128-NEXT: vmv8r.v v24, v8
; RV64-V128-NEXT: addi a0, sp, 16
; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV64-V128-NEXT: vrgather.vv v8, v24, v0
; RV64-V128-NEXT: lui a0, %hi(.LCPI10_1)
; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_1)
; RV64-V128-NEXT: vle32.v v24, (a0)
; RV64-V128-NEXT: csrr a0, vlenb
; RV64-V128-NEXT: slli a0, a0, 3
; RV64-V128-NEXT: add a0, sp, a0
; RV64-V128-NEXT: addi a0, a0, 16
; RV64-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; RV64-V128-NEXT: lui a0, 699051
; RV64-V128-NEXT: addiw a0, a0, -1366
; RV64-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
; RV64-V128-NEXT: vmv.s.x v0, a0
; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; RV64-V128-NEXT: csrr a0, vlenb
; RV64-V128-NEXT: slli a0, a0, 3
; RV64-V128-NEXT: add a0, sp, a0
; RV64-V128-NEXT: addi a0, a0, 16
; RV64-V128-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
; RV64-V128-NEXT: vrgather.vv v8, v16, v24, v0.t
; RV64-V128-NEXT: vmv.v.v v24, v8
; RV64-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu
; RV64-V128-NEXT: addi a0, sp, 16
; RV64-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload
; RV64-V128-NEXT: vwaddu.vv v0, v8, v16
; RV64-V128-NEXT: li a0, -1
; RV64-V128-NEXT: vwmaccu.vx v0, a0, v16
; RV64-V128-NEXT: vmv8r.v v8, v0
; RV64-V128-NEXT: vmv8r.v v16, v24
; RV64-V128-NEXT: csrr a0, vlenb
; RV64-V128-NEXT: slli a0, a0, 4
; RV64-V128-NEXT: add sp, sp, a0
; RV64-V128-NEXT: addi sp, sp, 16
; RV64-V128-NEXT: ret
;
; V512-LABEL: interleave_v32f32:
; V512: # %bb.0:
; V512-NEXT: li a0, 64
; V512-NEXT: vsetvli zero, a0, e32, m2, ta, mu
; V512-NEXT: vwaddu.vv v12, v8, v10
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v12, a0, v10
; V512-NEXT: vmv4r.v v8, v12
; V512-NEXT: ret
%a = shufflevector <32 x float> %x, <32 x float> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <64 x float> %a
}


@@ -0,0 +1,484 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV32-V128
; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128
; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512
; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512
; Test optimizing interleaves to widening arithmetic.
define <4 x i8> @interleave_v2i8(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: interleave_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e8, mf8, ta, mu
; CHECK-NEXT: vwaddu.vv v10, v8, v9
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%a = shufflevector <2 x i8> %x, <2 x i8> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
ret <4 x i8> %a
}
define <4 x i16> @interleave_v2i16(<2 x i16> %x, <2 x i16> %y) {
; CHECK-LABEL: interleave_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf4, ta, mu
; CHECK-NEXT: vwaddu.vv v10, v8, v9
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%a = shufflevector <2 x i16> %x, <2 x i16> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
ret <4 x i16> %a
}
; Vector order switched for coverage.
define <4 x i32> @interleave_v2i32(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: interleave_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, mf2, ta, mu
; CHECK-NEXT: vwaddu.vv v10, v9, v8
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v10, a0, v8
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%a = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
ret <4 x i32> %a
}
; One vXi64 test case to verify that we don't optimize it.
; FIXME: Is there better codegen we can do here?
define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
; RV32-V128-LABEL: interleave_v2i64:
; RV32-V128: # %bb.0:
; RV32-V128-NEXT: vmv1r.v v12, v9
; RV32-V128-NEXT: # kill: def $v8 killed $v8 def $v8m2
; RV32-V128-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; RV32-V128-NEXT: vid.v v10
; RV32-V128-NEXT: vsrl.vi v14, v10, 1
; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, mu
; RV32-V128-NEXT: vrgatherei16.vv v10, v8, v14
; RV32-V128-NEXT: li a0, 10
; RV32-V128-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; RV32-V128-NEXT: vmv.s.x v0, a0
; RV32-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu
; RV32-V128-NEXT: vrgatherei16.vv v10, v12, v14, v0.t
; RV32-V128-NEXT: vmv.v.v v8, v10
; RV32-V128-NEXT: ret
;
; RV64-V128-LABEL: interleave_v2i64:
; RV64-V128: # %bb.0:
; RV64-V128-NEXT: vmv1r.v v12, v9
; RV64-V128-NEXT: # kill: def $v8 killed $v8 def $v8m2
; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu
; RV64-V128-NEXT: vid.v v10
; RV64-V128-NEXT: vsrl.vi v14, v10, 1
; RV64-V128-NEXT: vrgather.vv v10, v8, v14
; RV64-V128-NEXT: li a0, 10
; RV64-V128-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; RV64-V128-NEXT: vmv.s.x v0, a0
; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu
; RV64-V128-NEXT: vrgather.vv v10, v12, v14, v0.t
; RV64-V128-NEXT: vmv.v.v v8, v10
; RV64-V128-NEXT: ret
;
; RV32-V512-LABEL: interleave_v2i64:
; RV32-V512: # %bb.0:
; RV32-V512-NEXT: vsetivli zero, 4, e16, mf4, ta, mu
; RV32-V512-NEXT: vid.v v10
; RV32-V512-NEXT: vsrl.vi v11, v10, 1
; RV32-V512-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11
; RV32-V512-NEXT: li a0, 10
; RV32-V512-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; RV32-V512-NEXT: vmv.s.x v0, a0
; RV32-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu
; RV32-V512-NEXT: vrgatherei16.vv v10, v9, v11, v0.t
; RV32-V512-NEXT: vmv.v.v v8, v10
; RV32-V512-NEXT: ret
;
; RV64-V512-LABEL: interleave_v2i64:
; RV64-V512: # %bb.0:
; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu
; RV64-V512-NEXT: vid.v v10
; RV64-V512-NEXT: vsrl.vi v11, v10, 1
; RV64-V512-NEXT: vrgather.vv v10, v8, v11
; RV64-V512-NEXT: li a0, 10
; RV64-V512-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; RV64-V512-NEXT: vmv.s.x v0, a0
; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu
; RV64-V512-NEXT: vrgather.vv v10, v9, v11, v0.t
; RV64-V512-NEXT: vmv.v.v v8, v10
; RV64-V512-NEXT: ret
%a = shufflevector <2 x i64> %x, <2 x i64> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
ret <4 x i64> %a
}
; Vector order switched for coverage.
define <8 x i8> @interleave_v4i8(<4 x i8> %x, <4 x i8> %y) {
; V128-LABEL: interleave_v4i8:
; V128: # %bb.0:
; V128-NEXT: vsetivli zero, 8, e8, mf4, ta, mu
; V128-NEXT: vwaddu.vv v10, v9, v8
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v10, a0, v8
; V128-NEXT: vmv1r.v v8, v10
; V128-NEXT: ret
;
; V512-LABEL: interleave_v4i8:
; V512: # %bb.0:
; V512-NEXT: vsetivli zero, 8, e8, mf8, ta, mu
; V512-NEXT: vwaddu.vv v10, v9, v8
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v8
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
ret <8 x i8> %a
}
; Undef elements for coverage
define <8 x i16> @interleave_v4i16(<4 x i16> %x, <4 x i16> %y) {
; V128-LABEL: interleave_v4i16:
; V128: # %bb.0:
; V128-NEXT: vsetivli zero, 8, e16, mf2, ta, mu
; V128-NEXT: vwaddu.vv v10, v8, v9
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v10, a0, v9
; V128-NEXT: vmv1r.v v8, v10
; V128-NEXT: ret
;
; V512-LABEL: interleave_v4i16:
; V512: # %bb.0:
; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> <i32 0, i32 4, i32 undef, i32 5, i32 2, i32 undef, i32 3, i32 7>
ret <8 x i16> %a
}
define <8 x i32> @interleave_v4i32(<4 x i32> %x, <4 x i32> %y) {
; V128-LABEL: interleave_v4i32:
; V128: # %bb.0:
; V128-NEXT: vsetivli zero, 8, e32, m1, ta, mu
; V128-NEXT: vwaddu.vv v10, v8, v9
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v10, a0, v9
; V128-NEXT: vmv2r.v v8, v10
; V128-NEXT: ret
;
; V512-LABEL: interleave_v4i32:
; V512: # %bb.0:
; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <4 x i32> %x, <4 x i32> %y, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
ret <8 x i32> %a
}
define <16 x i8> @interleave_v8i8(<8 x i8> %x, <8 x i8> %y) {
; V128-LABEL: interleave_v8i8:
; V128: # %bb.0:
; V128-NEXT: vsetivli zero, 16, e8, mf2, ta, mu
; V128-NEXT: vwaddu.vv v10, v8, v9
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v10, a0, v9
; V128-NEXT: vmv1r.v v8, v10
; V128-NEXT: ret
;
; V512-LABEL: interleave_v8i8:
; V512: # %bb.0:
; V512-NEXT: vsetivli zero, 16, e8, mf8, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
ret <16 x i8> %a
}
; Vector order switched for coverage.
define <16 x i16> @interleave_v8i16(<8 x i16> %x, <8 x i16> %y) {
; V128-LABEL: interleave_v8i16:
; V128: # %bb.0:
; V128-NEXT: vsetivli zero, 16, e16, m1, ta, mu
; V128-NEXT: vwaddu.vv v10, v9, v8
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v10, a0, v8
; V128-NEXT: vmv2r.v v8, v10
; V128-NEXT: ret
;
; V512-LABEL: interleave_v8i16:
; V512: # %bb.0:
; V512-NEXT: vsetivli zero, 16, e16, mf4, ta, mu
; V512-NEXT: vwaddu.vv v10, v9, v8
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v8
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <8 x i16> %x, <8 x i16> %y, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
ret <16 x i16> %a
}
define <16 x i32> @interleave_v8i32(<8 x i32> %x, <8 x i32> %y) {
; V128-LABEL: interleave_v8i32:
; V128: # %bb.0:
; V128-NEXT: vsetivli zero, 16, e32, m2, ta, mu
; V128-NEXT: vwaddu.vv v12, v8, v10
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v12, a0, v10
; V128-NEXT: vmv4r.v v8, v12
; V128-NEXT: ret
;
; V512-LABEL: interleave_v8i32:
; V512: # %bb.0:
; V512-NEXT: vsetivli zero, 16, e32, mf2, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <8 x i32> %x, <8 x i32> %y, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
ret <16 x i32> %a
}
define <32 x i8> @interleave_v16i8(<16 x i8> %x, <16 x i8> %y) {
; V128-LABEL: interleave_v16i8:
; V128: # %bb.0:
; V128-NEXT: li a0, 32
; V128-NEXT: vsetvli zero, a0, e8, m1, ta, mu
; V128-NEXT: vwaddu.vv v10, v8, v9
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v10, a0, v9
; V128-NEXT: vmv2r.v v8, v10
; V128-NEXT: ret
;
; V512-LABEL: interleave_v16i8:
; V512: # %bb.0:
; V512-NEXT: li a0, 32
; V512-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <16 x i8> %x, <16 x i8> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <32 x i8> %a
}
define <32 x i16> @interleave_v16i16(<16 x i16> %x, <16 x i16> %y) {
; V128-LABEL: interleave_v16i16:
; V128: # %bb.0:
; V128-NEXT: li a0, 32
; V128-NEXT: vsetvli zero, a0, e16, m2, ta, mu
; V128-NEXT: vwaddu.vv v12, v8, v10
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v12, a0, v10
; V128-NEXT: vmv4r.v v8, v12
; V128-NEXT: ret
;
; V512-LABEL: interleave_v16i16:
; V512: # %bb.0:
; V512-NEXT: li a0, 32
; V512-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <16 x i16> %x, <16 x i16> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <32 x i16> %a
}
define <32 x i32> @interleave_v16i32(<16 x i32> %x, <16 x i32> %y) {
; V128-LABEL: interleave_v16i32:
; V128: # %bb.0:
; V128-NEXT: li a0, 32
; V128-NEXT: vsetvli zero, a0, e32, m4, ta, mu
; V128-NEXT: vwaddu.vv v16, v8, v12
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v16, a0, v12
; V128-NEXT: vmv8r.v v8, v16
; V128-NEXT: ret
;
; V512-LABEL: interleave_v16i32:
; V512: # %bb.0:
; V512-NEXT: li a0, 32
; V512-NEXT: vsetvli zero, a0, e32, m1, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv2r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <16 x i32> %x, <16 x i32> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <32 x i32> %a
}
define <64 x i8> @interleave_v32i8(<32 x i8> %x, <32 x i8> %y) {
; V128-LABEL: interleave_v32i8:
; V128: # %bb.0:
; V128-NEXT: li a0, 64
; V128-NEXT: vsetvli zero, a0, e8, m2, ta, mu
; V128-NEXT: vwaddu.vv v12, v8, v10
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v12, a0, v10
; V128-NEXT: vmv4r.v v8, v12
; V128-NEXT: ret
;
; V512-LABEL: interleave_v32i8:
; V512: # %bb.0:
; V512-NEXT: li a0, 64
; V512-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <32 x i8> %x, <32 x i8> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <64 x i8> %a
}
define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) {
; V128-LABEL: interleave_v32i16:
; V128: # %bb.0:
; V128-NEXT: li a0, 64
; V128-NEXT: vsetvli zero, a0, e16, m4, ta, mu
; V128-NEXT: vwaddu.vv v16, v8, v12
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v16, a0, v12
; V128-NEXT: vmv8r.v v8, v16
; V128-NEXT: ret
;
; V512-LABEL: interleave_v32i16:
; V512: # %bb.0:
; V512-NEXT: li a0, 64
; V512-NEXT: vsetvli zero, a0, e16, m1, ta, mu
; V512-NEXT: vwaddu.vv v10, v8, v9
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v10, a0, v9
; V512-NEXT: vmv2r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <32 x i16> %x, <32 x i16> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <64 x i16> %a
}
define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
; RV32-V128-LABEL: interleave_v32i32:
; RV32-V128: # %bb.0:
; RV32-V128-NEXT: addi sp, sp, -16
; RV32-V128-NEXT: .cfi_def_cfa_offset 16
; RV32-V128-NEXT: csrr a0, vlenb
; RV32-V128-NEXT: slli a0, a0, 4
; RV32-V128-NEXT: sub sp, sp, a0
; RV32-V128-NEXT: lui a0, %hi(.LCPI15_0)
; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI15_0)
; RV32-V128-NEXT: li a1, 32
; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; RV32-V128-NEXT: vle32.v v0, (a0)
; RV32-V128-NEXT: vmv8r.v v24, v8
; RV32-V128-NEXT: addi a0, sp, 16
; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-V128-NEXT: vrgather.vv v8, v24, v0
; RV32-V128-NEXT: lui a0, %hi(.LCPI15_1)
; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI15_1)
; RV32-V128-NEXT: vle32.v v24, (a0)
; RV32-V128-NEXT: csrr a0, vlenb
; RV32-V128-NEXT: slli a0, a0, 3
; RV32-V128-NEXT: add a0, sp, a0
; RV32-V128-NEXT: addi a0, a0, 16
; RV32-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; RV32-V128-NEXT: lui a0, 699051
; RV32-V128-NEXT: addi a0, a0, -1366
; RV32-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
; RV32-V128-NEXT: vmv.s.x v0, a0
; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; RV32-V128-NEXT: csrr a0, vlenb
; RV32-V128-NEXT: slli a0, a0, 3
; RV32-V128-NEXT: add a0, sp, a0
; RV32-V128-NEXT: addi a0, a0, 16
; RV32-V128-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
; RV32-V128-NEXT: vrgather.vv v8, v16, v24, v0.t
; RV32-V128-NEXT: vmv.v.v v24, v8
; RV32-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu
; RV32-V128-NEXT: addi a0, sp, 16
; RV32-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload
; RV32-V128-NEXT: vwaddu.vv v0, v8, v16
; RV32-V128-NEXT: li a0, -1
; RV32-V128-NEXT: vwmaccu.vx v0, a0, v16
; RV32-V128-NEXT: vmv8r.v v8, v0
; RV32-V128-NEXT: vmv8r.v v16, v24
; RV32-V128-NEXT: csrr a0, vlenb
; RV32-V128-NEXT: slli a0, a0, 4
; RV32-V128-NEXT: add sp, sp, a0
; RV32-V128-NEXT: addi sp, sp, 16
; RV32-V128-NEXT: ret
;
; RV64-V128-LABEL: interleave_v32i32:
; RV64-V128: # %bb.0:
; RV64-V128-NEXT: addi sp, sp, -16
; RV64-V128-NEXT: .cfi_def_cfa_offset 16
; RV64-V128-NEXT: csrr a0, vlenb
; RV64-V128-NEXT: slli a0, a0, 4
; RV64-V128-NEXT: sub sp, sp, a0
; RV64-V128-NEXT: lui a0, %hi(.LCPI15_0)
; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI15_0)
; RV64-V128-NEXT: li a1, 32
; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; RV64-V128-NEXT: vle32.v v0, (a0)
; RV64-V128-NEXT: vmv8r.v v24, v8
; RV64-V128-NEXT: addi a0, sp, 16
; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV64-V128-NEXT: vrgather.vv v8, v24, v0
; RV64-V128-NEXT: lui a0, %hi(.LCPI15_1)
; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI15_1)
; RV64-V128-NEXT: vle32.v v24, (a0)
; RV64-V128-NEXT: csrr a0, vlenb
; RV64-V128-NEXT: slli a0, a0, 3
; RV64-V128-NEXT: add a0, sp, a0
; RV64-V128-NEXT: addi a0, a0, 16
; RV64-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; RV64-V128-NEXT: lui a0, 699051
; RV64-V128-NEXT: addiw a0, a0, -1366
; RV64-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
; RV64-V128-NEXT: vmv.s.x v0, a0
; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; RV64-V128-NEXT: csrr a0, vlenb
; RV64-V128-NEXT: slli a0, a0, 3
; RV64-V128-NEXT: add a0, sp, a0
; RV64-V128-NEXT: addi a0, a0, 16
; RV64-V128-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
; RV64-V128-NEXT: vrgather.vv v8, v16, v24, v0.t
; RV64-V128-NEXT: vmv.v.v v24, v8
; RV64-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu
; RV64-V128-NEXT: addi a0, sp, 16
; RV64-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload
; RV64-V128-NEXT: vwaddu.vv v0, v8, v16
; RV64-V128-NEXT: li a0, -1
; RV64-V128-NEXT: vwmaccu.vx v0, a0, v16
; RV64-V128-NEXT: vmv8r.v v8, v0
; RV64-V128-NEXT: vmv8r.v v16, v24
; RV64-V128-NEXT: csrr a0, vlenb
; RV64-V128-NEXT: slli a0, a0, 4
; RV64-V128-NEXT: add sp, sp, a0
; RV64-V128-NEXT: addi sp, sp, 16
; RV64-V128-NEXT: ret
;
; V512-LABEL: interleave_v32i32:
; V512: # %bb.0:
; V512-NEXT: li a0, 64
; V512-NEXT: vsetvli zero, a0, e32, m2, ta, mu
; V512-NEXT: vwaddu.vv v12, v8, v10
; V512-NEXT: li a0, -1
; V512-NEXT: vwmaccu.vx v12, a0, v10
; V512-NEXT: vmv4r.v v8, v12
; V512-NEXT: ret
%a = shufflevector <32 x i32> %x, <32 x i32> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <64 x i32> %a
}


@@ -314,16 +314,13 @@ define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) {
define <4 x i8> @interleave_shuffles(<4 x i8> %x) {
; CHECK-LABEL: interleave_shuffles:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 0, e8, mf4, ta, mu
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
; CHECK-NEXT: vrgather.vi v9, v8, 1
; CHECK-NEXT: li a1, 10
; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: vid.v v8
; CHECK-NEXT: vsrl.vi v10, v8, 1
; CHECK-NEXT: vmv.v.x v8, a0
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: vrgather.vi v9, v8, 0
; CHECK-NEXT: vrgather.vi v10, v8, 1
; CHECK-NEXT: vsetivli zero, 4, e8, mf8, ta, mu
; CHECK-NEXT: vwaddu.vv v8, v9, v10
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v8, a0, v10
; CHECK-NEXT: ret
%y = shufflevector <4 x i8> %x, <4 x i8> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
%z = shufflevector <4 x i8> %x, <4 x i8> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>