[ARM] Add ARMISD::VLD1DUP to match vld1_dup more consistently.

Currently, there are substantial problems forming vld1_dup even if the VDUP survives legalization. The lack of an actual node leads to terrible results: not only can we not form post-increment vld1_dup instructions, but we form scalar pre-increment and post-increment loads which force the loaded value into a GPR. This patch fixes that by combining the vdup+load into an ARMISD node before DAGCombine messes it up.

Also includes a crash fix for vld2_dup (see testcase @vld2dupi8_postinc_variable).

Recommitting with a fix to avoid forming vld1dup if the type of the load doesn't match the type of the vdup (see https://llvm.org/bugs/show_bug.cgi?id=31404).

Differential Revision: https://reviews.llvm.org/D27694

llvm-svn: 289972
commit f624ec27b7
parent 79b4f0ad9c
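For context, the pattern the new combine targets is the usual scalar-load-plus-splat IR shape, the same shape used by the new test cases below. A minimal sketch (the function name here is illustrative, not part of the patch):

define <8 x i8> @splat_of_loaded_byte(i8* %p) nounwind {
  ; A scalar load whose only use is a splat of the loaded value into all lanes.
  %val = load i8, i8* %p, align 1
  %ins = insertelement <8 x i8> undef, i8 %val, i32 0
  %dup = shufflevector <8 x i8> %ins, <8 x i8> undef, <8 x i32> zeroinitializer
  ; With this patch, the VDUP(LOAD) pair is rewritten into an ARMISD::VLD1DUP
  ; memory node during DAG combine, so isel can emit "vld1.8 {d16[]}, [r0]"
  ; (and its post-increment forms) instead of a GPR load followed by a vdup.
  ret <8 x i8> %dup
}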
@@ -222,10 +222,11 @@ private:
                     const uint16_t *QOpcodes);
 
   /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
-  /// should be 2, 3 or 4. The opcode array specifies the instructions used
+  /// should be 1, 2, 3 or 4. The opcode array specifies the instructions used
   /// for loading D registers. (Q registers are not supported.)
   void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
-                    const uint16_t *Opcodes);
+                    const uint16_t *DOpcodes,
+                    const uint16_t *QOpcodes = nullptr);
 
   /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2,
   /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be
@@ -1762,6 +1763,12 @@ static bool isVLDfixed(unsigned Opc)
   case ARM::VLD1q16wb_fixed : return true;
   case ARM::VLD1q32wb_fixed : return true;
   case ARM::VLD1q64wb_fixed : return true;
+  case ARM::VLD1DUPd8wb_fixed : return true;
+  case ARM::VLD1DUPd16wb_fixed : return true;
+  case ARM::VLD1DUPd32wb_fixed : return true;
+  case ARM::VLD1DUPq8wb_fixed : return true;
+  case ARM::VLD1DUPq16wb_fixed : return true;
+  case ARM::VLD1DUPq32wb_fixed : return true;
   case ARM::VLD2d8wb_fixed : return true;
   case ARM::VLD2d16wb_fixed : return true;
   case ARM::VLD2d32wb_fixed : return true;
@@ -1816,6 +1823,12 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
   case ARM::VLD1d64Qwb_fixed: return ARM::VLD1d64Qwb_register;
   case ARM::VLD1d64TPseudoWB_fixed: return ARM::VLD1d64TPseudoWB_register;
   case ARM::VLD1d64QPseudoWB_fixed: return ARM::VLD1d64QPseudoWB_register;
+  case ARM::VLD1DUPd8wb_fixed : return ARM::VLD1DUPd8wb_register;
+  case ARM::VLD1DUPd16wb_fixed : return ARM::VLD1DUPd16wb_register;
+  case ARM::VLD1DUPd32wb_fixed : return ARM::VLD1DUPd32wb_register;
+  case ARM::VLD1DUPq8wb_fixed : return ARM::VLD1DUPq8wb_register;
+  case ARM::VLD1DUPq16wb_fixed : return ARM::VLD1DUPq16wb_register;
+  case ARM::VLD1DUPq32wb_fixed : return ARM::VLD1DUPq32wb_register;
 
   case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register;
   case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register;
@@ -2256,8 +2269,9 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
 }
 
 void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
-                                   const uint16_t *Opcodes) {
-  assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
+                                   const uint16_t *DOpcodes,
+                                   const uint16_t *QOpcodes) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
   SDLoc dl(N);
 
   SDValue MemAddr, Align;
@@ -2285,19 +2299,21 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
   }
   Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32);
 
-  unsigned OpcodeIndex;
+  unsigned Opc;
   switch (VT.getSimpleVT().SimpleTy) {
   default: llvm_unreachable("unhandled vld-dup type");
-  case MVT::v8i8: OpcodeIndex = 0; break;
-  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v8i8: Opc = DOpcodes[0]; break;
+  case MVT::v16i8: Opc = QOpcodes[0]; break;
+  case MVT::v4i16: Opc = DOpcodes[1]; break;
+  case MVT::v8i16: Opc = QOpcodes[1]; break;
   case MVT::v2f32:
-  case MVT::v2i32: OpcodeIndex = 2; break;
+  case MVT::v2i32: Opc = DOpcodes[2]; break;
+  case MVT::v4f32:
+  case MVT::v4i32: Opc = QOpcodes[2]; break;
   }
 
   SDValue Pred = getAL(CurDAG, dl);
   SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
-  SDValue SuperReg;
-  unsigned Opc = Opcodes[OpcodeIndex];
   SmallVector<SDValue, 6> Ops;
   Ops.push_back(MemAddr);
   Ops.push_back(Align);
@@ -2305,6 +2321,8 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
     // fixed-stride update instructions don't have an explicit writeback
     // operand. It's implicit in the opcode itself.
     SDValue Inc = N->getOperand(2);
+    if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode()))
+      Opc = getVLDSTRegisterUpdateOpcode(Opc);
     if (!isa<ConstantSDNode>(Inc.getNode()))
       Ops.push_back(Inc);
     // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
@@ -2323,14 +2341,18 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
   ResTys.push_back(MVT::Other);
   SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
   cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
-  SuperReg = SDValue(VLdDup, 0);
 
   // Extract the subregisters.
-  static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
-  unsigned SubIdx = ARM::dsub_0;
-  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
-    ReplaceUses(SDValue(N, Vec),
-                CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+  if (NumVecs == 1) {
+    ReplaceUses(SDValue(N, 0), SDValue(VLdDup, 0));
+  } else {
+    SDValue SuperReg = SDValue(VLdDup, 0);
+    static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
+    unsigned SubIdx = ARM::dsub_0;
+    for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+      ReplaceUses(SDValue(N, Vec),
+                  CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+  }
   ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
   if (isUpdating)
     ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
@@ -3402,6 +3424,15 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     return;
   }
 
+  case ARMISD::VLD1DUP: {
+    static const uint16_t DOpcodes[] = { ARM::VLD1DUPd8, ARM::VLD1DUPd16,
+                                         ARM::VLD1DUPd32 };
+    static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8, ARM::VLD1DUPq16,
+                                         ARM::VLD1DUPq32 };
+    SelectVLDDup(N, false, 1, DOpcodes, QOpcodes);
+    return;
+  }
+
   case ARMISD::VLD2DUP: {
     static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
                                         ARM::VLD2DUPd32 };
@@ -3425,6 +3456,17 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     return;
   }
 
+  case ARMISD::VLD1DUP_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD1DUPd8wb_fixed,
+                                         ARM::VLD1DUPd16wb_fixed,
+                                         ARM::VLD1DUPd32wb_fixed };
+    static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8wb_fixed,
+                                         ARM::VLD1DUPq16wb_fixed,
+                                         ARM::VLD1DUPq32wb_fixed };
+    SelectVLDDup(N, true, 1, DOpcodes, QOpcodes);
+    return;
+  }
+
   case ARMISD::VLD2DUP_UPD: {
     static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
                                         ARM::VLD2DUPd16wb_fixed,
@@ -1428,6 +1428,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
   case ARMISD::VBSL: return "ARMISD::VBSL";
   case ARMISD::MEMCPY: return "ARMISD::MEMCPY";
+  case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP";
   case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
   case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
   case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
@@ -1438,6 +1439,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD";
   case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD";
   case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD";
+  case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD";
   case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD";
   case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD";
   case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD";
@@ -10473,6 +10475,7 @@ static SDValue CombineBaseUpdate(SDNode *N,
     isLaneOp = true;
     switch (N->getOpcode()) {
     default: llvm_unreachable("unexpected opcode for Neon base update");
+    case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
     case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
     case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
     case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
@@ -10587,8 +10590,8 @@ static SDValue CombineBaseUpdate(SDNode *N,
     StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
   }
 
-  SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys,
-                                         Ops, AlignedVecTy,
+  EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
+  SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                          MemN->getMemOperand());
 
   // Update the uses.
@@ -10733,6 +10736,31 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
   return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
 }
 
+/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
+static SDValue PerformVDUPCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op = N->getOperand(0);
+
+  // Match VDUP(LOAD) -> VLD1DUP.
+  // We match this pattern here rather than waiting for isel because the
+  // transform is only legal for unindexed loads.
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
+  if (LD && Op.hasOneUse() && LD->isUnindexed() &&
+      LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
+    SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
+                      DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
+    SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
+    SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
+                                             Ops, LD->getMemoryVT(),
+                                             LD->getMemOperand());
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
+    return VLDDup;
+  }
+
+  return SDValue();
+}
+
 static SDValue PerformLOADCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
   EVT VT = N->getValueType(0);
@@ -11560,6 +11588,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
+  case ARMISD::VDUP: return PerformVDUPCombine(N, DCI);
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
     return PerformVCVTCombine(N, DCI.DAG, Subtarget);
@@ -11575,6 +11604,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
   case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
   case ISD::LOAD: return PerformLOADCombine(N, DCI);
+  case ARMISD::VLD1DUP:
   case ARMISD::VLD2DUP:
   case ARMISD::VLD3DUP:
   case ARMISD::VLD4DUP:
@@ -190,7 +190,8 @@ namespace llvm {
     MEMCPY,
 
     // Vector load N-element structure to all lanes:
-    VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
+    VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
+    VLD2DUP,
     VLD3DUP,
     VLD4DUP,
 
@@ -202,6 +203,7 @@ namespace llvm {
     VLD2LN_UPD,
     VLD3LN_UPD,
     VLD4LN_UPD,
+    VLD1DUP_UPD,
     VLD2DUP_UPD,
     VLD3DUP_UPD,
     VLD4DUP_UPD,
@@ -10,6 +10,84 @@ define <8 x i8> @vld1dupi8(i8* %A) nounwind {
   ret <8 x i8> %tmp3
 }
 
+define <8 x i8> @vld1dupi8_preinc(i8** noalias nocapture %a, i32 %b) nounwind {
+entry:
+;CHECK-LABEL: vld1dupi8_preinc:
+;CHECK: vld1.8 {d16[]}, [r1]
+  %0 = load i8*, i8** %a, align 4
+  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b
+  %1 = load i8, i8* %add.ptr, align 1
+  %2 = insertelement <8 x i8> undef, i8 %1, i32 0
+  %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+  store i8* %add.ptr, i8** %a, align 4
+  ret <8 x i8> %lane
+}
+
+define <8 x i8> @vld1dupi8_postinc_fixed(i8** noalias nocapture %a) nounwind {
+entry:
+;CHECK-LABEL: vld1dupi8_postinc_fixed:
+;CHECK: vld1.8 {d16[]}, [r1]!
+  %0 = load i8*, i8** %a, align 4
+  %1 = load i8, i8* %0, align 1
+  %2 = insertelement <8 x i8> undef, i8 %1, i32 0
+  %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+  %add.ptr = getelementptr inbounds i8, i8* %0, i32 1
+  store i8* %add.ptr, i8** %a, align 4
+  ret <8 x i8> %lane
+}
+
+define <8 x i8> @vld1dupi8_postinc_register(i8** noalias nocapture %a, i32 %n) nounwind {
+entry:
+;CHECK-LABEL: vld1dupi8_postinc_register:
+;CHECK: vld1.8 {d16[]}, [r2], r1
+  %0 = load i8*, i8** %a, align 4
+  %1 = load i8, i8* %0, align 1
+  %2 = insertelement <8 x i8> undef, i8 %1, i32 0
+  %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n
+  store i8* %add.ptr, i8** %a, align 4
+  ret <8 x i8> %lane
+}
+
+define <16 x i8> @vld1dupqi8_preinc(i8** noalias nocapture %a, i32 %b) nounwind {
+entry:
+;CHECK-LABEL: vld1dupqi8_preinc:
+;CHECK: vld1.8 {d16[], d17[]}, [r1]
+  %0 = load i8*, i8** %a, align 4
+  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b
+  %1 = load i8, i8* %add.ptr, align 1
+  %2 = insertelement <16 x i8> undef, i8 %1, i32 0
+  %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+  store i8* %add.ptr, i8** %a, align 4
+  ret <16 x i8> %lane
+}
+
+define <16 x i8> @vld1dupqi8_postinc_fixed(i8** noalias nocapture %a) nounwind {
+entry:
+;CHECK-LABEL: vld1dupqi8_postinc_fixed:
+;CHECK: vld1.8 {d16[], d17[]}, [r1]!
+  %0 = load i8*, i8** %a, align 4
+  %1 = load i8, i8* %0, align 1
+  %2 = insertelement <16 x i8> undef, i8 %1, i32 0
+  %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+  %add.ptr = getelementptr inbounds i8, i8* %0, i32 1
+  store i8* %add.ptr, i8** %a, align 4
+  ret <16 x i8> %lane
+}
+
+define <16 x i8> @vld1dupqi8_postinc_register(i8** noalias nocapture %a, i32 %n) nounwind {
+entry:
+;CHECK-LABEL: vld1dupqi8_postinc_register:
+;CHECK: vld1.8 {d16[], d17[]}, [r2], r1
+  %0 = load i8*, i8** %a, align 4
+  %1 = load i8, i8* %0, align 1
+  %2 = insertelement <16 x i8> undef, i8 %1, i32 0
+  %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n
+  store i8* %add.ptr, i8** %a, align 4
+  ret <16 x i8> %lane
+}
+
 define <4 x i16> @vld1dupi16(i16* %A) nounwind {
 ;CHECK-LABEL: vld1dupi16:
 ;Check the alignment value. Max for this instruction is 16 bits:
@@ -20,6 +98,51 @@ define <4 x i16> @vld1dupi16(i16* %A) nounwind {
   ret <4 x i16> %tmp3
 }
 
+define <4 x i16> @vld1dupi16_misaligned(i16* %A) nounwind {
+;CHECK-LABEL: vld1dupi16_misaligned:
+;CHECK: vld1.16 {d16[]}, [r0]
+  %tmp1 = load i16, i16* %A, align 1
+  %tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0
+  %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> undef, <4 x i32> zeroinitializer
+  ret <4 x i16> %tmp3
+}
+
+; This sort of looks like a vld1dup, but there's an extension in the way.
+define <4 x i16> @load_i16_dup_zext(i8* %A) nounwind {
+;CHECK-LABEL: load_i16_dup_zext:
+;CHECK: ldrb r0, [r0]
+;CHECK-NEXT: vdup.16 d16, r0
+  %tmp1 = load i8, i8* %A, align 1
+  %tmp2 = zext i8 %tmp1 to i16
+  %tmp3 = insertelement <4 x i16> undef, i16 %tmp2, i32 0
+  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
+  ret <4 x i16> %tmp4
+}
+
+; This sort of looks like a vld1dup, but there's an extension in the way.
+define <4 x i16> @load_i16_dup_sext(i8* %A) nounwind {
+;CHECK-LABEL: load_i16_dup_sext:
+;CHECK: ldrsb r0, [r0]
+;CHECK-NEXT: vdup.16 d16, r0
+  %tmp1 = load i8, i8* %A, align 1
+  %tmp2 = sext i8 %tmp1 to i16
+  %tmp3 = insertelement <4 x i16> undef, i16 %tmp2, i32 0
+  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
+  ret <4 x i16> %tmp4
+}
+
+; This sort of looks like a vld1dup, but there's an extension in the way.
+define <8 x i16> @load_i16_dupq_zext(i8* %A) nounwind {
+;CHECK-LABEL: load_i16_dupq_zext:
+;CHECK: ldrb r0, [r0]
+;CHECK-NEXT: vdup.16 q8, r0
+  %tmp1 = load i8, i8* %A, align 1
+  %tmp2 = zext i8 %tmp1 to i16
+  %tmp3 = insertelement <8 x i16> undef, i16 %tmp2, i32 0
+  %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> zeroinitializer
+  ret <8 x i16> %tmp4
+}
+
 define <2 x i32> @vld1dupi32(i32* %A) nounwind {
 ;CHECK-LABEL: vld1dupi32:
 ;Check the alignment value. Max for this instruction is 32 bits:
@@ -30,6 +153,30 @@ define <2 x i32> @vld1dupi32(i32* %A) nounwind {
   ret <2 x i32> %tmp3
 }
 
+; This sort of looks like a vld1dup, but there's an extension in the way.
+define <4 x i32> @load_i32_dup_zext(i8* %A) nounwind {
+;CHECK-LABEL: load_i32_dup_zext:
+;CHECK: ldrb r0, [r0]
+;CHECK-NEXT: vdup.32 q8, r0
+  %tmp1 = load i8, i8* %A, align 1
+  %tmp2 = zext i8 %tmp1 to i32
+  %tmp3 = insertelement <4 x i32> undef, i32 %tmp2, i32 0
+  %tmp4 = shufflevector <4 x i32> %tmp3, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %tmp4
+}
+
+; This sort of looks like a vld1dup, but there's an extension in the way.
+define <4 x i32> @load_i32_dup_sext(i8* %A) nounwind {
+;CHECK-LABEL: load_i32_dup_sext:
+;CHECK: ldrsb r0, [r0]
+;CHECK-NEXT: vdup.32 q8, r0
+  %tmp1 = load i8, i8* %A, align 1
+  %tmp2 = sext i8 %tmp1 to i32
+  %tmp3 = insertelement <4 x i32> undef, i32 %tmp2, i32 0
+  %tmp4 = shufflevector <4 x i32> %tmp3, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %tmp4
+}
+
 define <2 x float> @vld1dupf(float* %A) nounwind {
 ;CHECK-LABEL: vld1dupf:
 ;CHECK: vld1.32 {d16[]}, [r0:32]
@@ -75,6 +222,63 @@ define <8 x i8> @vld2dupi8(i8* %A) nounwind {
   ret <8 x i8> %tmp5
 }
 
+define void @vld2dupi8_preinc(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a, i32 %b) nounwind {
+;CHECK-LABEL: vld2dupi8_preinc:
+;CHECK: vld2.8 {d16[], d17[]}, [r2]
+entry:
+  %0 = load i8*, i8** %a, align 4
+  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b
+  %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %add.ptr, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+  %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
+  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
+  %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+  store i8* %add.ptr, i8** %a, align 4
+  %r8 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0
+  store <8 x i8> %lane, <8 x i8>* %r8, align 8
+  %r11 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1
+  store <8 x i8> %lane1, <8 x i8>* %r11, align 8
+  ret void
+}
+
+define void @vld2dupi8_postinc_fixed(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a) nounwind {
+entry:
+;CHECK-LABEL: vld2dupi8_postinc_fixed:
+;CHECK: vld2.8 {d16[], d17[]}, [r2]!
+  %0 = load i8*, i8** %a, align 4
+  %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+  %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
+  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
+  %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+  %add.ptr = getelementptr inbounds i8, i8* %0, i32 2
+  store i8* %add.ptr, i8** %a, align 4
+  %r7 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0
+  store <8 x i8> %lane, <8 x i8>* %r7, align 8
+  %r10 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1
+  store <8 x i8> %lane1, <8 x i8>* %r10, align 8
+  ret void
+}
+
+define void @vld2dupi8_postinc_variable(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a, i32 %n) nounwind {
+entry:
+;CHECK-LABEL: vld2dupi8_postinc_variable:
+;CHECK: vld2.8 {d16[], d17[]}, [r3], r2
+  %0 = load i8*, i8** %a, align 4
+  %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+  %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
+  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
+  %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n
+  store i8* %add.ptr, i8** %a, align 4
+  %r7 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0
+  store <8 x i8> %lane, <8 x i8>* %r7, align 8
+  %r10 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1
+  store <8 x i8> %lane1, <8 x i8>* %r10, align 8
+  ret void
+}
+
 define <4 x i16> @vld2dupi16(i8* %A) nounwind {
 ;CHECK-LABEL: vld2dupi16:
 ;Check that a power-of-two alignment smaller than the total size of the memory
@@ -635,13 +635,26 @@ entry:
   ret void
 }
 
-define void @foo(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
-; Look for doing a normal scalar FP load rather than an to-all-lanes load.
-; e.g., "ldr s0, [r2]" rathern than "vld1.32 {d18[], d19[]}, [r2:32]"
-; Then check that the vector multiply has folded the splat to all lanes
-; and used a vector * scalar instruction.
-; CHECK: vldr {{s[0-9]+}}, [r2]
+define void @fmul_splat(<4 x float> * %a, <4 x float>* nocapture %dst, float %tmp) nounwind {
+; Look for a scalar float rather than a splat, then a vector*scalar multiply.
+; CHECK: vmov s0, r2
 ; CHECK: vmul.f32 q8, q8, d0[0]
+  %tmp5 = load <4 x float>, <4 x float>* %a, align 4
+  %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0
+  %tmp7 = insertelement <4 x float> %tmp6, float %tmp, i32 1
+  %tmp8 = insertelement <4 x float> %tmp7, float %tmp, i32 2
+  %tmp9 = insertelement <4 x float> %tmp8, float %tmp, i32 3
+  %tmp10 = fmul <4 x float> %tmp9, %tmp5
+  store <4 x float> %tmp10, <4 x float>* %dst, align 4
+  ret void
+}
+
+define void @fmul_splat_load(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
+; Look for doing a normal scalar FP load rather than an to-all-lanes load,
+; then a vector*scalar multiply.
+; FIXME: Temporarily broken due to splat representation changes.
+; CHECK: vld1.32 {d18[], d19[]}, [r2:32]
+; CHECK: vmul.f32 q8, q9, q8
   %tmp = load float, float* %src, align 4
   %tmp5 = load <4 x float>, <4 x float>* %a, align 4
   %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0
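For anyone exercising the new VLD1DUP paths locally, FileCheck-based NEON codegen tests like the ones above are driven by a RUN line at the top of the test file. The RUN lines are outside the hunks shown in this diff, so the flags below are an assumption rather than a quote from the patch, but an invocation in this spirit runs the checks:

; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s   ; assumed flags, not taken from this diff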