forked from OSchip/llvm-project
Add support for NEON VLD2-dup instructions.
llvm-svn: 120236
This commit is contained in:
parent
f9b96c474f
commit
2d790df105
|
@ -140,6 +140,13 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
|
|||
{ ARM::VLD1q8Pseudo, ARM::VLD1q8, true, false, SingleSpc, 2, 8 },
|
||||
{ ARM::VLD1q8Pseudo_UPD, ARM::VLD1q8_UPD, true, true, SingleSpc, 2, 8 },
|
||||
|
||||
{ ARM::VLD2DUPd16Pseudo, ARM::VLD2DUPd16, true, false, SingleSpc, 2, 4},
|
||||
{ ARM::VLD2DUPd16Pseudo_UPD, ARM::VLD2DUPd16_UPD, true, true, SingleSpc, 2, 4},
|
||||
{ ARM::VLD2DUPd32Pseudo, ARM::VLD2DUPd32, true, false, SingleSpc, 2, 2},
|
||||
{ ARM::VLD2DUPd32Pseudo_UPD, ARM::VLD2DUPd32_UPD, true, true, SingleSpc, 2, 2},
|
||||
{ ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd8, true, false, SingleSpc, 2, 8},
|
||||
{ ARM::VLD2DUPd8Pseudo_UPD, ARM::VLD2DUPd8_UPD, true, true, SingleSpc, 2, 8},
|
||||
|
||||
{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, SingleSpc, 2, 4 },
|
||||
{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, SingleSpc, 2, 4 },
|
||||
{ ARM::VLD2LNd32Pseudo, ARM::VLD2LNd32, true, false, SingleSpc, 2, 2 },
|
||||
|
@ -933,6 +940,12 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
|
|||
case ARM::VLD1DUPq8Pseudo_UPD:
|
||||
case ARM::VLD1DUPq16Pseudo_UPD:
|
||||
case ARM::VLD1DUPq32Pseudo_UPD:
|
||||
case ARM::VLD2DUPd8Pseudo:
|
||||
case ARM::VLD2DUPd16Pseudo:
|
||||
case ARM::VLD2DUPd32Pseudo:
|
||||
case ARM::VLD2DUPd8Pseudo_UPD:
|
||||
case ARM::VLD2DUPd16Pseudo_UPD:
|
||||
case ARM::VLD2DUPd32Pseudo_UPD:
|
||||
ExpandVLD(MBBI);
|
||||
break;
|
||||
|
||||
|
|
|
@ -197,6 +197,11 @@ private:
|
|||
SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned NumVecs,
|
||||
unsigned *DOpcodes, unsigned *QOpcodes);
|
||||
|
||||
/// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
|
||||
/// should be 2, 3 or 4. The opcode array specifies the instructions used
|
||||
/// for loading D registers. (Q registers are not supported.)
|
||||
SDNode *SelectVLDDup(SDNode *N, unsigned NumVecs, unsigned *Opcodes);
|
||||
|
||||
/// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2,
|
||||
/// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be
|
||||
/// generated to force the table registers to be consecutive.
|
||||
|
@ -1643,6 +1648,62 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
/// SelectVLDDup - Select a NEON load-to-all-lanes node.
///
/// N carries the chain in operand 0 and the address in operand 1.  Its
/// results 0..NumVecs-1 are the loaded vectors and result NumVecs is the
/// output chain.  NumVecs must be 2, 3 or 4.  Opcodes holds the D-register
/// opcodes indexed by element size: [0] 8-bit, [1] 16-bit, [2] 32-bit.
///
/// Always returns NULL: either the address did not match addrmode6 and
/// nothing was changed, or every use of N was replaced in place.
SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, unsigned NumVecs,
                                      unsigned *Opcodes) {
  assert(NumVecs >= 2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
  DebugLoc dl = N->getDebugLoc();

  SDValue MemAddr, Align;
  if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
    return NULL;

  SDValue Chain = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // Three-vector loads are always emitted without an alignment specifier.
  unsigned Alignment = 0;
  if (NumVecs != 3) {
    Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
    // Size in bytes of the structure that is actually read from memory.
    unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8;
    if (Alignment > NumBytes)
      Alignment = NumBytes;
    // Alignment must be a power of two; make sure of that.
    Alignment = (Alignment & -Alignment);
    // The all-lanes loads can only encode an alignment equal to the size of
    // the loaded structure (or 64 bits for a 16-byte structure).  Anything
    // smaller, including byte alignment, must be emitted as "no alignment"
    // rather than as a specifier the instruction does not accept.
    if (Alignment == 1 || (Alignment < 8 && Alignment < NumBytes))
      Alignment = 0;
  }
  Align = CurDAG->getTargetConstant(Alignment, MVT::i32);

  // Pick the opcode by element size; only 64-bit vector types are handled.
  unsigned OpcodeIndex;
  switch (VT.getSimpleVT().SimpleTy) {
  default: llvm_unreachable("unhandled vld-dup type");
  case MVT::v8i8:  OpcodeIndex = 0; break;
  case MVT::v4i16: OpcodeIndex = 1; break;
  case MVT::v2f32:
  case MVT::v2i32: OpcodeIndex = 2; break;
  }

  SDValue Pred = getAL(CurDAG);
  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
  unsigned Opc = Opcodes[OpcodeIndex];
  const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain };

  // The machine node produces one wide value made of i64 elements; a
  // three-vector load is given a four-element result type.
  unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
  EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
  SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other, Ops, 5);
  SDValue SuperReg = SDValue(VLdDup, 0);
  Chain = SDValue(VLdDup, 1);

  // Extract the subregisters.
  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
  unsigned SubIdx = ARM::dsub_0;
  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
    ReplaceUses(SDValue(N, Vec),
                CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
  ReplaceUses(SDValue(N, NumVecs), Chain);
  return NULL;
}
|
||||
|
||||
SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
|
||||
unsigned Opc) {
|
||||
assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range");
|
||||
|
@ -2294,6 +2355,12 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
|
|||
N->getOperand(2), N->getOperand(3));
|
||||
}
|
||||
|
||||
case ARMISD::VLD2DUP: {
|
||||
unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd16Pseudo,
|
||||
ARM::VLD2DUPd32Pseudo };
|
||||
return SelectVLDDup(N, 2, Opcodes);
|
||||
}
|
||||
|
||||
case ISD::INTRINSIC_VOID:
|
||||
case ISD::INTRINSIC_W_CHAIN: {
|
||||
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
|
||||
|
|
|
@ -824,6 +824,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|||
case ARMISD::BFI: return "ARMISD::BFI";
|
||||
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
|
||||
case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
|
||||
case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
|
||||
case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
|
||||
case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -4836,15 +4839,100 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
|
|||
DAG.getUNDEF(VT), NewMask.data());
|
||||
}
|
||||
|
||||
/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
|
||||
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
|
||||
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
|
||||
/// return true.
|
||||
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
EVT VT = N->getValueType(0);
|
||||
// vldN-dup instructions only support 64-bit vectors for N > 1.
|
||||
if (!VT.is64BitVector())
|
||||
return false;
|
||||
|
||||
// Check if the VDUPLANE operand is a vldN-dup intrinsic.
|
||||
SDNode *VLD = N->getOperand(0).getNode();
|
||||
if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
|
||||
return false;
|
||||
unsigned NumVecs = 0;
|
||||
unsigned NewOpc = 0;
|
||||
unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
|
||||
if (IntNo == Intrinsic::arm_neon_vld2lane) {
|
||||
NumVecs = 2;
|
||||
NewOpc = ARMISD::VLD2DUP;
|
||||
} else if (IntNo == Intrinsic::arm_neon_vld3lane) {
|
||||
NumVecs = 3;
|
||||
NewOpc = ARMISD::VLD3DUP;
|
||||
} else if (IntNo == Intrinsic::arm_neon_vld4lane) {
|
||||
NumVecs = 4;
|
||||
NewOpc = ARMISD::VLD4DUP;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
// First check that all the vldN-lane uses are VDUPLANEs and that the lane
|
||||
// numbers match the load.
|
||||
unsigned VLDLaneNo =
|
||||
cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
|
||||
for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
|
||||
UI != UE; ++UI) {
|
||||
// Ignore uses of the chain result.
|
||||
if (UI.getUse().getResNo() == NumVecs)
|
||||
continue;
|
||||
SDNode *User = *UI;
|
||||
if (User->getOpcode() != ARMISD::VDUPLANE ||
|
||||
VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create the vldN-dup node.
|
||||
EVT Tys[5];
|
||||
unsigned n;
|
||||
for (n = 0; n < NumVecs; ++n)
|
||||
Tys[n] = VT;
|
||||
Tys[n] = MVT::Other;
|
||||
SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1);
|
||||
SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
|
||||
MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
|
||||
SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys,
|
||||
Ops, 2, VLDMemInt->getMemoryVT(),
|
||||
VLDMemInt->getMemOperand());
|
||||
|
||||
// Update the uses.
|
||||
for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
|
||||
UI != UE; ++UI) {
|
||||
unsigned ResNo = UI.getUse().getResNo();
|
||||
// Ignore uses of the chain result.
|
||||
if (ResNo == NumVecs)
|
||||
continue;
|
||||
SDNode *User = *UI;
|
||||
DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
|
||||
}
|
||||
|
||||
// Now the vldN-lane intrinsic is dead except for its chain result.
|
||||
// Update uses of the chain.
|
||||
std::vector<SDValue> VLDDupResults;
|
||||
for (unsigned n = 0; n < NumVecs; ++n)
|
||||
VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
|
||||
VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
|
||||
DCI.CombineTo(VLD, VLDDupResults);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// PerformVDUPLANECombine - Target-specific dag combine xforms for
|
||||
/// ARMISD::VDUPLANE.
|
||||
static SDValue PerformVDUPLANECombine(SDNode *N, SelectionDAG &DAG) {
|
||||
// If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
|
||||
// redundant.
|
||||
static SDValue PerformVDUPLANECombine(SDNode *N,
|
||||
TargetLowering::DAGCombinerInfo &DCI) {
|
||||
SDValue Op = N->getOperand(0);
|
||||
EVT VT = N->getValueType(0);
|
||||
|
||||
// Ignore bit_converts.
|
||||
// If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
|
||||
// of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
|
||||
if (CombineVLDDUP(N, DCI))
|
||||
return SDValue(N, 0);
|
||||
|
||||
// If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
|
||||
// redundant. Ignore bit_converts for now; element sizes are checked below.
|
||||
while (Op.getOpcode() == ISD::BITCAST)
|
||||
Op = Op.getOperand(0);
|
||||
if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
|
||||
|
@ -4857,10 +4945,11 @@ static SDValue PerformVDUPLANECombine(SDNode *N, SelectionDAG &DAG) {
|
|||
unsigned EltBits;
|
||||
if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
|
||||
EltSize = 8;
|
||||
EVT VT = N->getValueType(0);
|
||||
if (EltSize > VT.getVectorElementType().getSizeInBits())
|
||||
return SDValue();
|
||||
|
||||
return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
|
||||
return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
|
||||
}
|
||||
|
||||
/// getVShiftImm - Check if this is a valid build_vector for the immediate
|
||||
|
@ -5248,7 +5337,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
|
|||
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
|
||||
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI.DAG);
|
||||
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
|
||||
case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI.DAG);
|
||||
case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
|
||||
case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
|
||||
case ISD::SHL:
|
||||
case ISD::SRA:
|
||||
|
|
|
@ -172,7 +172,12 @@ namespace llvm {
|
|||
// Vector OR with immediate
|
||||
VORRIMM,
|
||||
// Vector AND with NOT of immediate
|
||||
VBICIMM
|
||||
VBICIMM,
|
||||
|
||||
// Vector load N-element structure to all lanes:
|
||||
VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
|
||||
VLD3DUP,
|
||||
VLD4DUP
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -854,6 +854,47 @@ def VLD1DUPq16Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>;
|
|||
def VLD1DUPq32Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>;
|
||||
|
||||
// VLD2DUP : Vector Load (single 2-element structure to all lanes)
// Non-writeback form: two D-register outputs ($Vd, $dst2) and a single
// addrmode6 address operand ($Rn).  op7_4 and the data-type suffix Dt are
// supplied by each def.  Rm is fixed to 0b1111 and instruction bit 4 is
// copied from bit 4 of the $Rn operand.  No selection pattern is attached
// (the pattern list is empty).
|
||||
class VLD2DUP<bits<4> op7_4, string Dt>
|
||||
: NLdSt<1, 0b10, 0b1101, op7_4, (outs DPR:$Vd, DPR:$dst2),
|
||||
(ins addrmode6:$Rn), IIC_VLD2dup,
|
||||
"vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> {
|
||||
let Rm = 0b1111;
|
||||
|
||||
let Inst{4} = Rn{4};
|
||||
|
||||
}
|
||||
|
||||
def VLD2DUPd8 : VLD2DUP<{0,0,0,?}, "8">;
|
||||
def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16">;
|
||||
def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32">;
|
||||
|
||||
def VLD2DUPd8Pseudo : VLDQPseudo<IIC_VLD2dup>;
|
||||
def VLD2DUPd16Pseudo : VLDQPseudo<IIC_VLD2dup>;
|
||||
def VLD2DUPd32Pseudo : VLDQPseudo<IIC_VLD2dup>;
|
||||
|
||||
// ...with double-spaced registers (not used for codegen):
|
||||
def VLD2DUPd8Q : VLD2DUP<{0,0,1,?}, "8">;
|
||||
def VLD2DUPd16Q : VLD2DUP<{0,1,1,?}, "16">;
|
||||
def VLD2DUPd32Q : VLD2DUP<{1,0,1,?}, "32">;
|
||||
|
||||
// ...with address register writeback:
// Same outputs as the non-writeback class plus a GPR result ($wb) that is
// tied to the address through the "$Rn.addr = $wb" constraint.  The extra
// am6offset operand ($Rm) is appended to the address in the assembly string.
// Instruction bit 4 is copied from bit 4 of the $Rn operand.
|
||||
class VLD2DUPWB<bits<4> op7_4, string Dt>
|
||||
: NLdSt<1, 0b10, 0b1101, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
|
||||
(ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD2dupu,
|
||||
"vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
|
||||
let Inst{4} = Rn{4};
|
||||
|
||||
}
|
||||
|
||||
// NOTE(review): the 8-bit writeback defs pass 0 as the last op7_4 bit while
// the non-writeback VLD2DUPd8 / VLD2DUPd8Q defs pass '?'.  The class assigns
// Inst{4} from Rn{4} either way -- confirm the difference is intentional.
def VLD2DUPd8_UPD : VLD2DUPWB<{0,0,0,0}, "8">;
|
||||
def VLD2DUPd16_UPD : VLD2DUPWB<{0,1,0,?}, "16">;
|
||||
def VLD2DUPd32_UPD : VLD2DUPWB<{1,0,0,?}, "32">;
|
||||
|
||||
def VLD2DUPd8Q_UPD : VLD2DUPWB<{0,0,1,0}, "8">;
|
||||
def VLD2DUPd16Q_UPD : VLD2DUPWB<{0,1,1,?}, "16">;
|
||||
def VLD2DUPd32Q_UPD : VLD2DUPWB<{1,0,1,?}, "32">;
|
||||
|
||||
def VLD2DUPd8Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>;
|
||||
def VLD2DUPd16Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>;
|
||||
def VLD2DUPd32Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>;
|
||||
|
||||
// VLD3DUP : Vector Load (single 3-element structure to all lanes)
|
||||
// VLD4DUP : Vector Load (single 4-element structure to all lanes)
|
||||
// FIXME: Not yet implemented.
|
||||
|
|
|
@ -146,6 +146,8 @@ def IIC_VLD2u : InstrItinClass;
|
|||
def IIC_VLD2x2u : InstrItinClass;
|
||||
def IIC_VLD2ln : InstrItinClass;
|
||||
def IIC_VLD2lnu : InstrItinClass;
|
||||
def IIC_VLD2dup : InstrItinClass;
|
||||
def IIC_VLD2dupu : InstrItinClass;
|
||||
def IIC_VLD3 : InstrItinClass;
|
||||
def IIC_VLD3ln : InstrItinClass;
|
||||
def IIC_VLD3u : InstrItinClass;
|
||||
|
|
|
@ -523,6 +523,18 @@ def CortexA8Itineraries : ProcessorItineraries<
|
|||
InstrStage<3, [A8_LSPipe]>],
|
||||
[3, 3, 2, 1, 1, 1, 1, 1]>,
|
||||
//
|
||||
// VLD2dup
|
||||
InstrItinData<IIC_VLD2dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
|
||||
InstrStage<2, [A8_NLSPipe], 0>,
|
||||
InstrStage<2, [A8_LSPipe]>],
|
||||
[2, 1]>,
|
||||
//
|
||||
// VLD2dupu
|
||||
InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
|
||||
InstrStage<2, [A8_NLSPipe], 0>,
|
||||
InstrStage<2, [A8_LSPipe]>],
|
||||
[2, 2, 1, 1]>,
|
||||
//
|
||||
// VLD3
|
||||
InstrItinData<IIC_VLD3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
|
||||
InstrStage<4, [A8_NLSPipe], 0>,
|
||||
|
|
|
@ -887,6 +887,24 @@ def CortexA9Itineraries : ProcessorItineraries<
|
|||
InstrStage<3, [A9_LSUnit]>],
|
||||
[4, 4, 2, 1, 1, 1, 1, 1]>,
|
||||
//
|
||||
// VLD2dup
|
||||
InstrItinData<IIC_VLD2dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
|
||||
InstrStage<1, [A9_MUX0], 0>,
|
||||
InstrStage<1, [A9_DRegsN], 0, Required>,
|
||||
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
|
||||
InstrStage<2, [A9_NPipe], 0>,
|
||||
InstrStage<2, [A9_LSUnit]>],
|
||||
[3, 1]>,
|
||||
//
|
||||
// VLD2dupu
|
||||
InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
|
||||
InstrStage<1, [A9_MUX0], 0>,
|
||||
InstrStage<1, [A9_DRegsN], 0, Required>,
|
||||
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
|
||||
InstrStage<2, [A9_NPipe], 0>,
|
||||
InstrStage<2, [A9_LSUnit]>],
|
||||
[3, 2, 1, 1]>,
|
||||
//
|
||||
// VLD3
|
||||
InstrItinData<IIC_VLD3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
|
||||
InstrStage<1, [A9_MUX0], 0>,
|
||||
|
|
|
@ -39,3 +39,35 @@ define <16 x i8> @vld1dupQi8(i8* %A) nounwind {
|
|||
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
|
||||
ret <16 x i8> %tmp3
|
||||
}
|
||||
|
||||
%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
|
||||
%struct.__neon_int2x32x2_t = type { <2 x i32>, <2 x i32> }
|
||||
|
||||
; Loads lane 0 of two <8 x i8> vectors with vld2lane (last operand 1), splats
; each result with an all-zero shuffle mask and adds the two splats.  The
; expected output is a single vld2 to all lanes with no alignment specifier.
define <8 x i8> @vld2dupi8(i8* %A) nounwind {
|
||||
;CHECK: vld2dupi8:
|
||||
;Check the (default) alignment value.
|
||||
;CHECK: vld2.8 {d16[], d17[]}, [r0]
|
||||
%tmp0 = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
|
||||
%tmp1 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 0
|
||||
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
|
||||
%tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 1
|
||||
%tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <8 x i32> zeroinitializer
|
||||
%tmp5 = add <8 x i8> %tmp2, %tmp4
|
||||
ret <8 x i8> %tmp5
|
||||
}
|
||||
|
||||
; Same shape as the 8-bit test, on <2 x i32> vectors: lane 0 of both vld2lane
; results is splatted and the splats are added.  The last vld2lane operand is
; 16, and the expected vld2 to all lanes carries a :64 alignment specifier.
define <2 x i32> @vld2dupi32(i32* %A) nounwind {
|
||||
;CHECK: vld2dupi32:
|
||||
;Check the alignment value. Max for this instruction is 64 bits:
|
||||
;CHECK: vld2.32 {d16[], d17[]}, [r0, :64]
|
||||
%tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16)
|
||||
%tmp1 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 0
|
||||
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
|
||||
%tmp3 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 1
|
||||
%tmp4 = shufflevector <2 x i32> %tmp3, <2 x i32> undef, <2 x i32> zeroinitializer
|
||||
%tmp5 = add <2 x i32> %tmp2, %tmp4
|
||||
ret <2 x i32> %tmp5
|
||||
}
|
||||
|
||||
declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
|
||||
declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
|
||||
|
|
Loading…
Reference in New Issue