Add support for NEON VLD2-dup instructions.

llvm-svn: 120236
This commit is contained in:
Bob Wilson 2010-11-28 06:51:26 +00:00
parent f9b96c474f
commit 2d790df105
9 changed files with 287 additions and 8 deletions

View File

@ -140,6 +140,13 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
{ ARM::VLD1q8Pseudo, ARM::VLD1q8, true, false, SingleSpc, 2, 8 },
{ ARM::VLD1q8Pseudo_UPD, ARM::VLD1q8_UPD, true, true, SingleSpc, 2, 8 },
{ ARM::VLD2DUPd16Pseudo, ARM::VLD2DUPd16, true, false, SingleSpc, 2, 4},
{ ARM::VLD2DUPd16Pseudo_UPD, ARM::VLD2DUPd16_UPD, true, true, SingleSpc, 2, 4},
{ ARM::VLD2DUPd32Pseudo, ARM::VLD2DUPd32, true, false, SingleSpc, 2, 2},
{ ARM::VLD2DUPd32Pseudo_UPD, ARM::VLD2DUPd32_UPD, true, true, SingleSpc, 2, 2},
{ ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd8, true, false, SingleSpc, 2, 8},
{ ARM::VLD2DUPd8Pseudo_UPD, ARM::VLD2DUPd8_UPD, true, true, SingleSpc, 2, 8},
{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, SingleSpc, 2, 4 },
{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, SingleSpc, 2, 4 },
{ ARM::VLD2LNd32Pseudo, ARM::VLD2LNd32, true, false, SingleSpc, 2, 2 },
@ -933,6 +940,12 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
case ARM::VLD1DUPq8Pseudo_UPD:
case ARM::VLD1DUPq16Pseudo_UPD:
case ARM::VLD1DUPq32Pseudo_UPD:
case ARM::VLD2DUPd8Pseudo:
case ARM::VLD2DUPd16Pseudo:
case ARM::VLD2DUPd32Pseudo:
case ARM::VLD2DUPd8Pseudo_UPD:
case ARM::VLD2DUPd16Pseudo_UPD:
case ARM::VLD2DUPd32Pseudo_UPD:
ExpandVLD(MBBI);
break;

View File

@ -197,6 +197,11 @@ private:
SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned NumVecs,
unsigned *DOpcodes, unsigned *QOpcodes);
/// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
/// should be 2, 3 or 4. The opcode array specifies the instructions used
/// for loading D registers. (Q registers are not supported.)
SDNode *SelectVLDDup(SDNode *N, unsigned NumVecs, unsigned *Opcodes);
/// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2,
/// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be
/// generated to force the table registers to be consecutive.
@ -1643,6 +1648,62 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
return NULL;
}
SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, unsigned NumVecs,
unsigned *Opcodes) {
assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
DebugLoc dl = N->getDebugLoc();
SDValue MemAddr, Align;
if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
return NULL;
SDValue Chain = N->getOperand(0);
EVT VT = N->getValueType(0);
unsigned Alignment = 0;
if (NumVecs != 3) {
Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8;
if (Alignment > NumBytes)
Alignment = NumBytes;
// Alignment must be a power of two; make sure of that.
Alignment = (Alignment & -Alignment);
if (Alignment == 1)
Alignment = 0;
}
Align = CurDAG->getTargetConstant(Alignment, MVT::i32);
unsigned OpcodeIndex;
switch (VT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("unhandled vld-dup type");
case MVT::v8i8: OpcodeIndex = 0; break;
case MVT::v4i16: OpcodeIndex = 1; break;
case MVT::v2f32:
case MVT::v2i32: OpcodeIndex = 2; break;
}
SDValue Pred = getAL(CurDAG);
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
SDValue SuperReg;
unsigned Opc = Opcodes[OpcodeIndex];
const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain };
unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other, Ops, 5);
SuperReg = SDValue(VLdDup, 0);
Chain = SDValue(VLdDup, 1);
// Extract the subregisters.
assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
unsigned SubIdx = ARM::dsub_0;
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
ReplaceUses(SDValue(N, Vec),
CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
ReplaceUses(SDValue(N, NumVecs), Chain);
return NULL;
}
SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
unsigned Opc) {
assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range");
@ -2294,6 +2355,12 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
N->getOperand(2), N->getOperand(3));
}
case ARMISD::VLD2DUP: {
unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd16Pseudo,
ARM::VLD2DUPd32Pseudo };
return SelectVLDDup(N, 2, Opcodes);
}
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();

View File

@ -824,6 +824,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::BFI: return "ARMISD::BFI";
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
}
}
@ -4836,15 +4839,100 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
DAG.getUNDEF(VT), NewMask.data());
}
/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
/// return true.
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
// vldN-dup instructions only support 64-bit vectors for N > 1.
if (!VT.is64BitVector())
return false;
// Check if the VDUPLANE operand is a vldN-dup intrinsic.
SDNode *VLD = N->getOperand(0).getNode();
if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
return false;
unsigned NumVecs = 0;
unsigned NewOpc = 0;
unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
if (IntNo == Intrinsic::arm_neon_vld2lane) {
NumVecs = 2;
NewOpc = ARMISD::VLD2DUP;
} else if (IntNo == Intrinsic::arm_neon_vld3lane) {
NumVecs = 3;
NewOpc = ARMISD::VLD3DUP;
} else if (IntNo == Intrinsic::arm_neon_vld4lane) {
NumVecs = 4;
NewOpc = ARMISD::VLD4DUP;
} else {
return false;
}
// First check that all the vldN-lane uses are VDUPLANEs and that the lane
// numbers match the load.
unsigned VLDLaneNo =
cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
UI != UE; ++UI) {
// Ignore uses of the chain result.
if (UI.getUse().getResNo() == NumVecs)
continue;
SDNode *User = *UI;
if (User->getOpcode() != ARMISD::VDUPLANE ||
VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
return false;
}
// Create the vldN-dup node.
EVT Tys[5];
unsigned n;
for (n = 0; n < NumVecs; ++n)
Tys[n] = VT;
Tys[n] = MVT::Other;
SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1);
SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys,
Ops, 2, VLDMemInt->getMemoryVT(),
VLDMemInt->getMemOperand());
// Update the uses.
for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
UI != UE; ++UI) {
unsigned ResNo = UI.getUse().getResNo();
// Ignore uses of the chain result.
if (ResNo == NumVecs)
continue;
SDNode *User = *UI;
DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
}
// Now the vldN-lane intrinsic is dead except for its chain result.
// Update uses of the chain.
std::vector<SDValue> VLDDupResults;
for (unsigned n = 0; n < NumVecs; ++n)
VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
DCI.CombineTo(VLD, VLDDupResults);
return true;
}
/// PerformVDUPLANECombine - Target-specific dag combine xforms for
/// ARMISD::VDUPLANE.
static SDValue PerformVDUPLANECombine(SDNode *N, SelectionDAG &DAG) {
// If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
// redundant.
static SDValue PerformVDUPLANECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue Op = N->getOperand(0);
EVT VT = N->getValueType(0);
// Ignore bit_converts.
// If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
// of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
if (CombineVLDDUP(N, DCI))
return SDValue(N, 0);
// If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
// redundant. Ignore bit_converts for now; element sizes are checked below.
while (Op.getOpcode() == ISD::BITCAST)
Op = Op.getOperand(0);
if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
@ -4857,10 +4945,11 @@ static SDValue PerformVDUPLANECombine(SDNode *N, SelectionDAG &DAG) {
unsigned EltBits;
if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
EltSize = 8;
EVT VT = N->getValueType(0);
if (EltSize > VT.getVectorElementType().getSizeInBits())
return SDValue();
return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
}
/// getVShiftImm - Check if this is a valid build_vector for the immediate
@ -5248,7 +5337,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI.DAG);
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI.DAG);
case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
case ISD::SHL:
case ISD::SRA:

View File

@ -172,7 +172,12 @@ namespace llvm {
// Vector OR with immediate
VORRIMM,
// Vector AND with NOT of immediate
VBICIMM
VBICIMM,
// Vector load N-element structure to all lanes:
VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
VLD3DUP,
VLD4DUP
};
}

View File

@ -854,6 +854,47 @@ def VLD1DUPq16Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>;
def VLD1DUPq32Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>;
// VLD2DUP : Vector Load (single 2-element structure to all lanes)
class VLD2DUP<bits<4> op7_4, string Dt>
: NLdSt<1, 0b10, 0b1101, op7_4, (outs DPR:$Vd, DPR:$dst2),
(ins addrmode6:$Rn), IIC_VLD2dup,
"vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
}
def VLD2DUPd8 : VLD2DUP<{0,0,0,?}, "8">;
def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16">;
def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32">;
def VLD2DUPd8Pseudo : VLDQPseudo<IIC_VLD2dup>;
def VLD2DUPd16Pseudo : VLDQPseudo<IIC_VLD2dup>;
def VLD2DUPd32Pseudo : VLDQPseudo<IIC_VLD2dup>;
// ...with double-spaced registers (not used for codegen):
def VLD2DUPd8Q : VLD2DUP<{0,0,1,?}, "8">;
def VLD2DUPd16Q : VLD2DUP<{0,1,1,?}, "16">;
def VLD2DUPd32Q : VLD2DUP<{1,0,1,?}, "32">;
// ...with address register writeback:
class VLD2DUPWB<bits<4> op7_4, string Dt>
: NLdSt<1, 0b10, 0b1101, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
(ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD2dupu,
"vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
}
def VLD2DUPd8_UPD : VLD2DUPWB<{0,0,0,0}, "8">;
def VLD2DUPd16_UPD : VLD2DUPWB<{0,1,0,?}, "16">;
def VLD2DUPd32_UPD : VLD2DUPWB<{1,0,0,?}, "32">;
def VLD2DUPd8Q_UPD : VLD2DUPWB<{0,0,1,0}, "8">;
def VLD2DUPd16Q_UPD : VLD2DUPWB<{0,1,1,?}, "16">;
def VLD2DUPd32Q_UPD : VLD2DUPWB<{1,0,1,?}, "32">;
def VLD2DUPd8Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>;
def VLD2DUPd16Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>;
def VLD2DUPd32Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>;
// VLD3DUP : Vector Load (single 3-element structure to all lanes)
// VLD4DUP : Vector Load (single 4-element structure to all lanes)
// FIXME: Not yet implemented.

View File

@ -146,6 +146,8 @@ def IIC_VLD2u : InstrItinClass;
def IIC_VLD2x2u : InstrItinClass;
def IIC_VLD2ln : InstrItinClass;
def IIC_VLD2lnu : InstrItinClass;
def IIC_VLD2dup : InstrItinClass;
def IIC_VLD2dupu : InstrItinClass;
def IIC_VLD3 : InstrItinClass;
def IIC_VLD3ln : InstrItinClass;
def IIC_VLD3u : InstrItinClass;

View File

@ -523,6 +523,18 @@ def CortexA8Itineraries : ProcessorItineraries<
InstrStage<3, [A8_LSPipe]>],
[3, 3, 2, 1, 1, 1, 1, 1]>,
//
// VLD2dup
InstrItinData<IIC_VLD2dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
InstrStage<2, [A8_NLSPipe], 0>,
InstrStage<2, [A8_LSPipe]>],
[2, 1]>,
//
// VLD2dupu
InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
InstrStage<2, [A8_NLSPipe], 0>,
InstrStage<2, [A8_LSPipe]>],
[2, 2, 1, 1]>,
//
// VLD3
InstrItinData<IIC_VLD3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
InstrStage<4, [A8_NLSPipe], 0>,

View File

@ -887,6 +887,24 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<3, [A9_LSUnit]>],
[4, 4, 2, 1, 1, 1, 1, 1]>,
//
// VLD2dup
InstrItinData<IIC_VLD2dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[3, 1]>,
//
// VLD2dupu
InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[3, 2, 1, 1]>,
//
// VLD3
InstrItinData<IIC_VLD3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,

View File

@ -39,3 +39,35 @@ define <16 x i8> @vld1dupQi8(i8* %A) nounwind {
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %tmp3
}
%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int2x32x2_t = type { <2 x i32>, <2 x i32> }
define <8 x i8> @vld2dupi8(i8* %A) nounwind {
;CHECK: vld2dupi8:
;Check the (default) alignment value.
;CHECK: vld2.8 {d16[], d17[]}, [r0]
%tmp0 = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
%tmp1 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 0
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 1
%tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <8 x i32> zeroinitializer
%tmp5 = add <8 x i8> %tmp2, %tmp4
ret <8 x i8> %tmp5
}
define <2 x i32> @vld2dupi32(i32* %A) nounwind {
;CHECK: vld2dupi32:
;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld2.32 {d16[], d17[]}, [r0, :64]
%tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16)
%tmp1 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 0
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 1
%tmp4 = shufflevector <2 x i32> %tmp3, <2 x i32> undef, <2 x i32> zeroinitializer
%tmp5 = add <2 x i32> %tmp2, %tmp4
ret <2 x i32> %tmp5
}
declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly