[SVE][CodeGen] Legalisation of INSERT_VECTOR_ELT for scalable vectors
When the result type of insertelement needs to be split,
SplitVecRes_INSERT_VECTOR_ELT will try to store the vector to a stack
temporary, store the element at the location of the stack temporary
plus the index, and reload the Lo/Hi parts.

This patch does the following to ensure this works for scalable vectors:

- Sets the StackID with getStackIDForScalableVectors() in
  CreateStackTemporary
- Adds an IsScalable flag to getMemBasePlusOffset() and scales the
  offset by VScale when this is true
- Ensures the immediate is clamped correctly by clampDynamicVectorIndex
  so that we don't try to use an out of range index

Reviewed By: david-arm

Differential Revision: https://reviews.llvm.org/D84874
This commit is contained in:
parent 49193e1fe7
commit 455ed56d48
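The expansion the message describes can be pictured as a runnable plain-C++ model. The sketch below is illustrative only, not the LLVM implementation: std::vector stands in for the scalable stack temporary (whose real element count is vscale times the minimum count), and the clamp mirrors what clampDynamicVectorIndex builds as DAG nodes.

#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative model of the stack-based insertelement expansion.
// Assumes a non-empty vector.
std::vector<uint8_t> insertElementViaStack(const std::vector<uint8_t> &Vec,
                                           uint8_t Elt, uint64_t Idx) {
  std::vector<uint8_t> Slot = Vec;                // spill vector to the slot
  Idx = std::min<uint64_t>(Idx, Slot.size() - 1); // clamp the dynamic index
  Slot[Idx] = Elt;                                // store element at base+idx
  return Slot; // the legalizer reloads the Lo/Hi halves from the slot
}
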
@@ -1456,14 +1456,16 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
   if (ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
     unsigned IdxVal = CIdx->getZExtValue();
-    unsigned LoNumElts = Lo.getValueType().getVectorNumElements();
-    if (IdxVal < LoNumElts)
+    unsigned LoNumElts = Lo.getValueType().getVectorMinNumElements();
+    if (IdxVal < LoNumElts) {
       Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
                        Lo.getValueType(), Lo, Elt, Idx);
-    else
+      return;
+    } else if (!Vec.getValueType().isScalableVector()) {
       Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
                        DAG.getVectorIdxConstant(IdxVal - LoNumElts, dl));
-    return;
+      return;
+    }
   }
 
   // See if the target wants to custom expand this node.
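Why the new isScalableVector() guard on the Hi path: LoNumElts is now only the minimum element count of the Lo half, so a constant index below it lands in Lo for every vscale, while an index at or above it is not necessarily in Hi. A minimal sketch of that reasoning (illustrative names, not LLVM API):

#include <cstdint>

// Whether a constant index provably lands in the Lo half of a split
// vector for every possible vscale.
bool provablyInLo(uint64_t IdxVal, uint64_t LoMinNumElts) {
  return IdxVal < LoMinNumElts;
}

// Example: <vscale x 32 x i8> splits into two <vscale x 16 x i8> halves.
// With vscale == 2 the Lo half really holds 32 elements, so index 20 is
// in Lo at runtime even though provablyInLo(20, 16) is false; the
// legalizer must then fall back to the stack-based expansion rather
// than insert into Hi.
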
@@ -1476,7 +1478,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
   if (VecVT.getScalarSizeInBits() < 8) {
     EltVT = MVT::i8;
     VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
-                             VecVT.getVectorNumElements());
+                             VecVT.getVectorElementCount());
     Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
     // Extend the element type to match if needed.
     if (EltVT.bitsGT(Elt.getValueType()))
@@ -1501,7 +1503,8 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
   SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
   Store = DAG.getTruncStore(
       Store, dl, Elt, EltPtr, MachinePointerInfo::getUnknownStack(MF), EltVT,
-      commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
+      commonAlignment(SmallestAlign,
+                      EltVT.getSizeInBits().getFixedSize() / 8));
 
   EVT LoVT, HiVT;
   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
@@ -1510,13 +1513,11 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
   Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, SmallestAlign);
 
   // Increment the pointer to the other part.
-  unsigned IncrementSize = LoVT.getSizeInBits() / 8;
-  StackPtr =
-      DAG.getMemBasePlusOffset(StackPtr, TypeSize::Fixed(IncrementSize), dl);
+  auto Load = cast<LoadSDNode>(Lo);
+  MachinePointerInfo MPI = Load->getPointerInfo();
+  IncrementPointer(Load, LoVT, MPI, StackPtr);
 
-  // Load the Hi part from the stack slot.
-  Hi = DAG.getLoad(HiVT, dl, Store, StackPtr,
-                   PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
+  Hi = DAG.getLoad(HiVT, dl, Store, StackPtr, MPI, SmallestAlign);
 
   // If we adjusted the original type, we need to truncate the results.
   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
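IncrementPointer advances StackPtr past the Lo half and derives a matching MachinePointerInfo, replacing the old fixed IncrementSize arithmetic: for a scalable LoVT the distance to the Hi half is a multiple of vscale, so no compile-time byte offset exists. A sketch of the runtime address math (illustrative names, assuming vscale is a runtime constant as on SVE, where the Hi address is what the tests' "[x9, #1, mul vl]" operand encodes):

#include <cstdint>

struct SplitSlotAddrs {
  uint64_t Lo, Hi; // addresses of the two reloaded halves
};

// The Lo half of the spilled vector occupies LoMinBytes * vscale bytes
// when the type is scalable, so the Hi load starts that far past the base.
SplitSlotAddrs splitSlotAddrs(uint64_t SlotBase, uint64_t LoMinBytes,
                              bool Scalable, uint64_t VScale) {
  uint64_t LoBytes = Scalable ? LoMinBytes * VScale : LoMinBytes;
  return {SlotBase, SlotBase + LoBytes};
}
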
@@ -2025,7 +2025,12 @@ Align SelectionDAG::getReducedAlign(EVT VT, bool UseABI) {
 
 SDValue SelectionDAG::CreateStackTemporary(TypeSize Bytes, Align Alignment) {
   MachineFrameInfo &MFI = MF->getFrameInfo();
-  int FrameIdx = MFI.CreateStackObject(Bytes, Alignment, false);
+  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+  int StackID = 0;
+  if (Bytes.isScalable())
+    StackID = TFI->getStackIDForScalableVectors();
+  int FrameIdx = MFI.CreateStackObject(Bytes, Alignment,
+                                       false, nullptr, StackID);
   return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
 }
 
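Tagging the frame object with the target's scalable-vector stack ID lets frame lowering size and address the slot in vscale units rather than fixed bytes (on AArch64 this is what makes the tests below allocate the slot with addvl). A sketch of the selection, with a hypothetical enum standing in for the target-defined IDs returned by getStackIDForScalableVectors():

#include <cstdint>

// Hypothetical stand-in for the target's stack ID values; the real
// number comes from TargetFrameLowering::getStackIDForScalableVectors().
enum class StackID : int { Default = 0, ScalableVector = 1 };

StackID stackIDForTemporary(bool SizeIsScalable) {
  // Scalable-sized temporaries must be segregated from fixed-size ones.
  return SizeIsScalable ? StackID::ScalableVector : StackID::Default;
}
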
@@ -5937,8 +5942,16 @@ SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, TypeSize Offset,
                                            const SDLoc &DL,
                                            const SDNodeFlags Flags) {
   EVT VT = Base.getValueType();
-  return getMemBasePlusOffset(Base, getConstant(Offset.getFixedSize(), DL, VT),
-                              DL, Flags);
+  SDValue Index;
+
+  if (Offset.isScalable())
+    Index = getVScale(DL, Base.getValueType(),
+                      APInt(Base.getValueSizeInBits().getFixedSize(),
+                            Offset.getKnownMinSize()));
+  else
+    Index = getConstant(Offset.getFixedSize(), DL, VT);
+
+  return getMemBasePlusOffset(Base, Index, DL, Flags);
 }
 
 SDValue SelectionDAG::getMemBasePlusOffset(SDValue Ptr, SDValue Offset,
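On the scalable path the offset operand becomes an ISD::VSCALE node with multiplier Offset.getKnownMinSize(), so the runtime byte offset is vscale * KnownMinSize. A plain-C++ model of the value the two branches compute (illustrative only, not the SelectionDAG API):

#include <cstdint>

// A TypeSize offset is either a fixed byte count or a count of
// "min-size" bytes that must be multiplied by the runtime vscale.
uint64_t resolveByteOffset(uint64_t KnownMinSize, bool Scalable,
                           uint64_t VScale) {
  return Scalable ? VScale * KnownMinSize : KnownMinSize;
}
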
@@ -7197,16 +7197,26 @@ static SDValue clampDynamicVectorIndex(SelectionDAG &DAG,
                                        SDValue Idx,
                                        EVT VecVT,
                                        const SDLoc &dl) {
-  if (isa<ConstantSDNode>(Idx))
+  if (!VecVT.isScalableVector() && isa<ConstantSDNode>(Idx))
     return Idx;
 
   EVT IdxVT = Idx.getValueType();
-  unsigned NElts = VecVT.getVectorNumElements();
-  if (isPowerOf2_32(NElts)) {
-    APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(),
-                                     Log2_32(NElts));
-    return DAG.getNode(ISD::AND, dl, IdxVT, Idx,
-                       DAG.getConstant(Imm, dl, IdxVT));
+  unsigned NElts = VecVT.getVectorMinNumElements();
+  if (VecVT.isScalableVector()) {
+    SDValue VS = DAG.getVScale(dl, IdxVT,
+                               APInt(IdxVT.getSizeInBits().getFixedSize(),
+                                     NElts));
+    SDValue Sub = DAG.getNode(ISD::SUB, dl, IdxVT, VS,
+                              DAG.getConstant(1, dl, IdxVT));
+
+    return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, Sub);
+  } else {
+    if (isPowerOf2_32(NElts)) {
+      APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(),
+                                       Log2_32(NElts));
+      return DAG.getNode(ISD::AND, dl, IdxVT, Idx,
+                         DAG.getConstant(Imm, dl, IdxVT));
+    }
   }
 
   return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx,
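The two strategies differ because a fixed power-of-two element count can wrap the index with an AND mask, while a scalable bound of vscale * MinNElts is only known at runtime, so the index is clamped with UMIN against bound - 1 (the rdvl/sub/cmp/csel sequences in the tests below). A plain-C++ model (illustrative, not LLVM API):

#include <algorithm>
#include <cstdint>

uint64_t clampIndex(uint64_t Idx, uint64_t MinNElts, bool Scalable,
                    uint64_t VScale) {
  if (Scalable) // bound known only at runtime: umin(Idx, NElts - 1)
    return std::min(Idx, MinNElts * VScale - 1);
  return Idx & (MinNElts - 1); // fixed power-of-two: wrap with a mask
}
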
@@ -7223,8 +7233,8 @@ SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
   EVT EltVT = VecVT.getVectorElementType();
 
   // Calculate the element offset and add it to the pointer.
-  unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size.
-  assert(EltSize * 8 == EltVT.getSizeInBits() &&
+  unsigned EltSize = EltVT.getSizeInBits().getFixedSize() / 8; // FIXME: should be ABI size.
+  assert(EltSize * 8 == EltVT.getSizeInBits().getFixedSize() &&
          "Converting bits to bytes lost precision");
 
   Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl);
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+; INSERT VECTOR ELT
+
+define <vscale x 8 x i8> @promote_insert_8i8(<vscale x 8 x i8> %a, i8 %elt, i64 %idx) {
+; CHECK-LABEL: promote_insert_8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.h, w1
+; CHECK-NEXT:    index z2.h, #0, #1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z2.h, z1.h
+; CHECK-NEXT:    mov z0.h, p0/m, w0
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 8 x i8> %a, i8 %elt, i64 %idx
+  ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 32 x i8> @split_insert_32i8_idx(<vscale x 32 x i8> %a, i8 %elt, i64 %idx) {
+; CHECK-LABEL: split_insert_32i8_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    rdvl x8, #2
+; CHECK-NEXT:    sub x8, x8, #1 // =1
+; CHECK-NEXT:    cmp x1, x8
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    csel x8, x1, x8, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1b { z1.b }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    strb w0, [x9, x8]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x9, #1, mul vl]
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 32 x i8> %a, i8 %elt, i64 %idx
+  ret <vscale x 32 x i8> %ins
+}
+
+define <vscale x 8 x float> @split_insert_8f32_idx(<vscale x 8 x float> %a, float %elt, i64 %idx) {
+; CHECK-LABEL: split_insert_8f32_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    cnth x8
+; CHECK-NEXT:    sub x8, x8, #1 // =1
+; CHECK-NEXT:    cmp x0, x8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    csel x8, x0, x8, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1w { z1.s }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    str s2, [x9, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x9, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 8 x float> %a, float %elt, i64 %idx
+  ret <vscale x 8 x float> %ins
+}
+
+define <vscale x 8 x i64> @split_insert_8i64_idx(<vscale x 8 x i64> %a, i64 %elt, i64 %idx) {
+; CHECK-LABEL: split_insert_8i64_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    cnth x8
+; CHECK-NEXT:    sub x8, x8, #1 // =1
+; CHECK-NEXT:    cmp x1, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    csel x8, x1, x8, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1d { z3.d }, p0, [x9, #3, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [x9, #2, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    str x0, [x9, x8, lsl #3]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x9, #1, mul vl]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x9, #2, mul vl]
+; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x9, #3, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 8 x i64> %a, i64 %elt, i64 %idx
+  ret <vscale x 8 x i64> %ins
+}
+
+; INSERT VECTOR ELT, CONSTANT IDX
+
+define <vscale x 4 x i16> @promote_insert_4i16(<vscale x 4 x i16> %a, i16 %elt) {
+; CHECK-LABEL: promote_insert_4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #5
+; CHECK-NEXT:    index z1.s, #0, #1
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    mov z0.s, p0/m, w0
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 4 x i16> %a, i16 %elt, i64 5
+  ret <vscale x 4 x i16> %ins
+}
+
+; In this test, the index is small enough that we know it will be in the
+; low half of the vector and there is no need to go through the stack as
+; done in the remaining tests
+define <vscale x 32 x i8> @split_insert_32i8(<vscale x 32 x i8> %a, i8 %elt) {
+; CHECK-LABEL: split_insert_32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #3
+; CHECK-NEXT:    index z2.b, #0, #1
+; CHECK-NEXT:    mov z3.b, w8
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z3.b
+; CHECK-NEXT:    mov z0.b, p0/m, w0
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 32 x i8> %a, i8 %elt, i64 3
+  ret <vscale x 32 x i8> %ins
+}
+
+define <vscale x 32 x i16> @split_insert_32i16(<vscale x 32 x i16> %a, i16 %elt) {
+; CHECK-LABEL: split_insert_32i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    rdvl x10, #2
+; CHECK-NEXT:    sub x10, x10, #1 // =1
+; CHECK-NEXT:    mov w9, #128
+; CHECK-NEXT:    cmp x10, #128 // =128
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    csel x9, x10, x9, lo
+; CHECK-NEXT:    st1h { z3.h }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1h { z2.h }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    strh w0, [x8, x9, lsl #1]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x8, #3, mul vl]
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 32 x i16> %a, i16 %elt, i64 128
+  ret <vscale x 32 x i16> %ins
+}
+
+define <vscale x 8 x i32> @split_insert_8i32(<vscale x 8 x i32> %a, i32 %elt) {
+; CHECK-LABEL: split_insert_8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    mov w9, #16960
+; CHECK-NEXT:    cnth x10
+; CHECK-NEXT:    movk w9, #15, lsl #16
+; CHECK-NEXT:    sub x10, x10, #1 // =1
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    csel x9, x10, x9, lo
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    str w0, [x8, x9, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 8 x i32> %a, i32 %elt, i64 1000000
+  ret <vscale x 8 x i32> %ins
+}