[SVE][CodeGen] Legalisation of INSERT_VECTOR_ELT for scalable vectors

When the result type of insertelement needs to be split,
SplitVecRes_INSERT_VECTOR_ELT will try to store the vector to a
stack temporary, store the element at the location of the stack
temporary plus the index, and reload the Lo/Hi parts.

This patch does the following to ensure this works for scalable vectors:
 - Sets the StackID with getStackIDForScalableVectors() in CreateStackTemporary
 - Makes the TypeSize overload of getMemBasePlusOffset() scale the
    offset by VScale when the offset is scalable
 - Ensures a constant (immediate) index is also clamped by
    clampDynamicVectorIndex for scalable vectors, so that we don't try
    to use an out-of-range index

Reviewed By: david-arm

Differential Revision: https://reviews.llvm.org/D84874
This commit is contained in:
Kerry McLaughlin 2020-08-11 12:19:42 +01:00
parent 49193e1fe7
commit 455ed56d48
4 changed files with 234 additions and 24 deletions

View File

@ -1456,14 +1456,16 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
if (ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) { if (ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
unsigned IdxVal = CIdx->getZExtValue(); unsigned IdxVal = CIdx->getZExtValue();
unsigned LoNumElts = Lo.getValueType().getVectorNumElements(); unsigned LoNumElts = Lo.getValueType().getVectorMinNumElements();
if (IdxVal < LoNumElts) if (IdxVal < LoNumElts) {
Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
Lo.getValueType(), Lo, Elt, Idx); Lo.getValueType(), Lo, Elt, Idx);
else return;
} else if (!Vec.getValueType().isScalableVector()) {
Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt, Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
DAG.getVectorIdxConstant(IdxVal - LoNumElts, dl)); DAG.getVectorIdxConstant(IdxVal - LoNumElts, dl));
return; return;
}
} }
// See if the target wants to custom expand this node. // See if the target wants to custom expand this node.
@ -1476,7 +1478,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
if (VecVT.getScalarSizeInBits() < 8) { if (VecVT.getScalarSizeInBits() < 8) {
EltVT = MVT::i8; EltVT = MVT::i8;
VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
VecVT.getVectorNumElements()); VecVT.getVectorElementCount());
Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec); Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
// Extend the element type to match if needed. // Extend the element type to match if needed.
if (EltVT.bitsGT(Elt.getValueType())) if (EltVT.bitsGT(Elt.getValueType()))
@ -1501,7 +1503,8 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
Store = DAG.getTruncStore( Store = DAG.getTruncStore(
Store, dl, Elt, EltPtr, MachinePointerInfo::getUnknownStack(MF), EltVT, Store, dl, Elt, EltPtr, MachinePointerInfo::getUnknownStack(MF), EltVT,
commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8)); commonAlignment(SmallestAlign,
EltVT.getSizeInBits().getFixedSize() / 8));
EVT LoVT, HiVT; EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
@ -1510,13 +1513,11 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, SmallestAlign); Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, SmallestAlign);
// Increment the pointer to the other part. // Increment the pointer to the other part.
unsigned IncrementSize = LoVT.getSizeInBits() / 8; auto Load = cast<LoadSDNode>(Lo);
StackPtr = MachinePointerInfo MPI = Load->getPointerInfo();
DAG.getMemBasePlusOffset(StackPtr, TypeSize::Fixed(IncrementSize), dl); IncrementPointer(Load, LoVT, MPI, StackPtr);
// Load the Hi part from the stack slot. Hi = DAG.getLoad(HiVT, dl, Store, StackPtr, MPI, SmallestAlign);
Hi = DAG.getLoad(HiVT, dl, Store, StackPtr,
PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
// If we adjusted the original type, we need to truncate the results. // If we adjusted the original type, we need to truncate the results.
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));

View File

@ -2025,7 +2025,12 @@ Align SelectionDAG::getReducedAlign(EVT VT, bool UseABI) {
SDValue SelectionDAG::CreateStackTemporary(TypeSize Bytes, Align Alignment) { SDValue SelectionDAG::CreateStackTemporary(TypeSize Bytes, Align Alignment) {
MachineFrameInfo &MFI = MF->getFrameInfo(); MachineFrameInfo &MFI = MF->getFrameInfo();
int FrameIdx = MFI.CreateStackObject(Bytes, Alignment, false); const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
int StackID = 0;
if (Bytes.isScalable())
StackID = TFI->getStackIDForScalableVectors();
int FrameIdx = MFI.CreateStackObject(Bytes, Alignment,
false, nullptr, StackID);
return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout())); return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
} }
@ -5937,8 +5942,16 @@ SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, TypeSize Offset,
const SDLoc &DL, const SDLoc &DL,
const SDNodeFlags Flags) { const SDNodeFlags Flags) {
EVT VT = Base.getValueType(); EVT VT = Base.getValueType();
return getMemBasePlusOffset(Base, getConstant(Offset.getFixedSize(), DL, VT), SDValue Index;
DL, Flags);
if (Offset.isScalable())
Index = getVScale(DL, Base.getValueType(),
APInt(Base.getValueSizeInBits().getFixedSize(),
Offset.getKnownMinSize()));
else
Index = getConstant(Offset.getFixedSize(), DL, VT);
return getMemBasePlusOffset(Base, Index, DL, Flags);
} }
SDValue SelectionDAG::getMemBasePlusOffset(SDValue Ptr, SDValue Offset, SDValue SelectionDAG::getMemBasePlusOffset(SDValue Ptr, SDValue Offset,

View File

@ -7197,16 +7197,26 @@ static SDValue clampDynamicVectorIndex(SelectionDAG &DAG,
SDValue Idx, SDValue Idx,
EVT VecVT, EVT VecVT,
const SDLoc &dl) { const SDLoc &dl) {
if (isa<ConstantSDNode>(Idx)) if (!VecVT.isScalableVector() && isa<ConstantSDNode>(Idx))
return Idx; return Idx;
EVT IdxVT = Idx.getValueType(); EVT IdxVT = Idx.getValueType();
unsigned NElts = VecVT.getVectorNumElements(); unsigned NElts = VecVT.getVectorMinNumElements();
if (isPowerOf2_32(NElts)) { if (VecVT.isScalableVector()) {
APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(), SDValue VS = DAG.getVScale(dl, IdxVT,
Log2_32(NElts)); APInt(IdxVT.getSizeInBits().getFixedSize(),
return DAG.getNode(ISD::AND, dl, IdxVT, Idx, NElts));
DAG.getConstant(Imm, dl, IdxVT)); SDValue Sub = DAG.getNode(ISD::SUB, dl, IdxVT, VS,
DAG.getConstant(1, dl, IdxVT));
return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, Sub);
} else {
if (isPowerOf2_32(NElts)) {
APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(),
Log2_32(NElts));
return DAG.getNode(ISD::AND, dl, IdxVT, Idx,
DAG.getConstant(Imm, dl, IdxVT));
}
} }
return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx,
@ -7223,8 +7233,8 @@ SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
EVT EltVT = VecVT.getVectorElementType(); EVT EltVT = VecVT.getVectorElementType();
// Calculate the element offset and add it to the pointer. // Calculate the element offset and add it to the pointer.
unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size. unsigned EltSize = EltVT.getSizeInBits().getFixedSize() / 8; // FIXME: should be ABI size.
assert(EltSize * 8 == EltVT.getSizeInBits() && assert(EltSize * 8 == EltVT.getSizeInBits().getFixedSize() &&
"Converting bits to bytes lost precision"); "Converting bits to bytes lost precision");
Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl); Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl);

View File

@ -0,0 +1,186 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; INSERT VECTOR ELT
define <vscale x 8 x i8> @promote_insert_8i8(<vscale x 8 x i8> %a, i8 %elt, i64 %idx) {
; CHECK-LABEL: promote_insert_8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.h, w1
; CHECK-NEXT: index z2.h, #0, #1
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z1.h
; CHECK-NEXT: mov z0.h, p0/m, w0
; CHECK-NEXT: ret
%ins = insertelement <vscale x 8 x i8> %a, i8 %elt, i64 %idx
ret <vscale x 8 x i8> %ins
}
define <vscale x 32 x i8> @split_insert_32i8_idx(<vscale x 32 x i8> %a, i8 %elt, i64 %idx) {
; CHECK-LABEL: split_insert_32i8_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: rdvl x8, #2
; CHECK-NEXT: sub x8, x8, #1 // =1
; CHECK-NEXT: cmp x1, x8
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: csel x8, x1, x8, lo
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: st1b { z1.b }, p0, [x9, #1, mul vl]
; CHECK-NEXT: st1b { z0.b }, p0, [sp]
; CHECK-NEXT: strb w0, [x9, x8]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x9, #1, mul vl]
; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%ins = insertelement <vscale x 32 x i8> %a, i8 %elt, i64 %idx
ret <vscale x 32 x i8> %ins
}
define <vscale x 8 x float> @split_insert_8f32_idx(<vscale x 8 x float> %a, float %elt, i64 %idx) {
; CHECK-LABEL: split_insert_8f32_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: cnth x8
; CHECK-NEXT: sub x8, x8, #1 // =1
; CHECK-NEXT: cmp x0, x8
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: csel x8, x0, x8, lo
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: st1w { z1.s }, p0, [x9, #1, mul vl]
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: str s2, [x9, x8, lsl #2]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x9, #1, mul vl]
; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%ins = insertelement <vscale x 8 x float> %a, float %elt, i64 %idx
ret <vscale x 8 x float> %ins
}
define <vscale x 8 x i64> @split_insert_8i64_idx(<vscale x 8 x i64> %a, i64 %elt, i64 %idx) {
; CHECK-LABEL: split_insert_8i64_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: cnth x8
; CHECK-NEXT: sub x8, x8, #1 // =1
; CHECK-NEXT: cmp x1, x8
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: csel x8, x1, x8, lo
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: st1d { z3.d }, p0, [x9, #3, mul vl]
; CHECK-NEXT: st1d { z2.d }, p0, [x9, #2, mul vl]
; CHECK-NEXT: st1d { z1.d }, p0, [x9, #1, mul vl]
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: str x0, [x9, x8, lsl #3]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9, #1, mul vl]
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x9, #2, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x9, #3, mul vl]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%ins = insertelement <vscale x 8 x i64> %a, i64 %elt, i64 %idx
ret <vscale x 8 x i64> %ins
}
; INSERT VECTOR ELT, CONSTANT IDX
define <vscale x 4 x i16> @promote_insert_4i16(<vscale x 4 x i16> %a, i16 %elt) {
; CHECK-LABEL: promote_insert_4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #5
; CHECK-NEXT: index z1.s, #0, #1
; CHECK-NEXT: mov z2.s, w8
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s
; CHECK-NEXT: mov z0.s, p0/m, w0
; CHECK-NEXT: ret
%ins = insertelement <vscale x 4 x i16> %a, i16 %elt, i64 5
ret <vscale x 4 x i16> %ins
}
; In this test, the index is small enough that we know it will be in the
; low half of the vector and there is no need to go through the stack as
; done in the remaining tests
define <vscale x 32 x i8> @split_insert_32i8(<vscale x 32 x i8> %a, i8 %elt) {
; CHECK-LABEL: split_insert_32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #3
; CHECK-NEXT: index z2.b, #0, #1
; CHECK-NEXT: mov z3.b, w8
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b
; CHECK-NEXT: mov z0.b, p0/m, w0
; CHECK-NEXT: ret
%ins = insertelement <vscale x 32 x i8> %a, i8 %elt, i64 3
ret <vscale x 32 x i8> %ins
}
define <vscale x 32 x i16> @split_insert_32i16(<vscale x 32 x i16> %a, i16 %elt) {
; CHECK-LABEL: split_insert_32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: rdvl x10, #2
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: mov w9, #128
; CHECK-NEXT: cmp x10, #128 // =128
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: st1h { z3.h }, p0, [x8, #3, mul vl]
; CHECK-NEXT: st1h { z2.h }, p0, [x8, #2, mul vl]
; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: strh w0, [x8, x9, lsl #1]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8, #1, mul vl]
; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8, #2, mul vl]
; CHECK-NEXT: ld1h { z3.h }, p0/z, [x8, #3, mul vl]
; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%ins = insertelement <vscale x 32 x i16> %a, i16 %elt, i64 128
ret <vscale x 32 x i16> %ins
}
define <vscale x 8 x i32> @split_insert_8i32(<vscale x 8 x i32> %a, i32 %elt) {
; CHECK-LABEL: split_insert_8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: mov w9, #16960
; CHECK-NEXT: cnth x10
; CHECK-NEXT: movk w9, #15, lsl #16
; CHECK-NEXT: sub x10, x10, #1 // =1
; CHECK-NEXT: cmp x10, x9
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: csel x9, x10, x9, lo
; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: str w0, [x8, x9, lsl #2]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl]
; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%ins = insertelement <vscale x 8 x i32> %a, i32 %elt, i64 1000000
ret <vscale x 8 x i32> %ins
}