forked from OSchip/llvm-project
Add NEON VLD1-dup instructions (load 1 element to all lanes).
llvm-svn: 120194
This commit is contained in:
parent
3a63f9d852
commit
c92eea0175
|
@ -112,6 +112,13 @@ namespace {
|
|||
}
|
||||
|
||||
static const NEONLdStTableEntry NEONLdStTable[] = {
|
||||
{ ARM::VLD1DUPq16Pseudo, ARM::VLD1DUPq16, true, false, SingleSpc, 2, 4},
|
||||
{ ARM::VLD1DUPq16Pseudo_UPD, ARM::VLD1DUPq16_UPD, true, true, SingleSpc, 2, 4},
|
||||
{ ARM::VLD1DUPq32Pseudo, ARM::VLD1DUPq32, true, false, SingleSpc, 2, 2},
|
||||
{ ARM::VLD1DUPq32Pseudo_UPD, ARM::VLD1DUPq32_UPD, true, true, SingleSpc, 2, 2},
|
||||
{ ARM::VLD1DUPq8Pseudo, ARM::VLD1DUPq8, true, false, SingleSpc, 2, 8},
|
||||
{ ARM::VLD1DUPq8Pseudo_UPD, ARM::VLD1DUPq8_UPD, true, true, SingleSpc, 2, 8},
|
||||
|
||||
{ ARM::VLD1LNq16Pseudo, ARM::VLD1LNd16, true, false, EvenDblSpc, 1, 4 },
|
||||
{ ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, true, EvenDblSpc, 1, 4 },
|
||||
{ ARM::VLD1LNq32Pseudo, ARM::VLD1LNd32, true, false, EvenDblSpc, 1, 2 },
|
||||
|
@ -920,6 +927,12 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
|
|||
case ARM::VLD4q8oddPseudo_UPD:
|
||||
case ARM::VLD4q16oddPseudo_UPD:
|
||||
case ARM::VLD4q32oddPseudo_UPD:
|
||||
case ARM::VLD1DUPq8Pseudo:
|
||||
case ARM::VLD1DUPq16Pseudo:
|
||||
case ARM::VLD1DUPq32Pseudo:
|
||||
case ARM::VLD1DUPq8Pseudo_UPD:
|
||||
case ARM::VLD1DUPq16Pseudo_UPD:
|
||||
case ARM::VLD1DUPq32Pseudo_UPD:
|
||||
ExpandVLD(MBBI);
|
||||
break;
|
||||
|
||||
|
|
|
@ -162,8 +162,6 @@ def VSTMQDB
|
|||
IIC_fpStore_m, "",
|
||||
[(store (v2f64 QPR:$src), GPR:$Rn)]>;
|
||||
|
||||
let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
|
||||
|
||||
// Classes for VLD* pseudo-instructions with multi-register operands.
|
||||
// These are expanded to real instructions after register allocation.
|
||||
class VLDQPseudo<InstrItinClass itin>
|
||||
|
@ -183,6 +181,8 @@ class VLDQQQQWBPseudo<InstrItinClass itin>
|
|||
(ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
|
||||
"$addr.addr = $wb, $src = $dst">;
|
||||
|
||||
let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
|
||||
|
||||
// VLD1 : Vector Load (multiple single elements)
|
||||
class VLD1D<bits<4> op7_4, string Dt>
|
||||
: NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$Vd),
|
||||
|
@ -790,7 +790,79 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> {
|
|||
def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
|
||||
def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
|
||||
|
||||
} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
|
||||
|
||||
// VLD1DUP : Vector Load (single element to all lanes)
|
||||
// Instruction class for the single D-register form: one DPR result ($Vd)
// loaded through an addrmode6 address ($Rn), with no writeback output.
// Template parameters:
//   op11_8, op7_4 - encoding fields forwarded to NLdSt.
//   Dt            - data-type suffix string used in the "vld1" assembly string.
//   Ty            - vector result type used in the selection pattern.
//   LoadOp        - scalar load fragment whose i32 result is fed to NEONvdup.
class VLD1DUP<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
|
||||
PatFrag LoadOp>
|
||||
: NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$Vd), (ins addrmode6:$Rn),
|
||||
IIC_VLD1dup, "vld1", Dt, "\\{$Vd[]\\}, $Rn", "",
|
||||
[(set DPR:$Vd, (Ty (NEONvdup (i32 (LoadOp addrmode6:$Rn)))))]> {
|
||||
// Rm is hard-wired because this form takes no am6offset operand.
// NOTE(review): 0b1111 is presumably the "no writeback" encoding - confirm
// against the ARM ARM.
let Rm = 0b1111;
|
||||
}
|
||||
// Q-register pseudo-instruction class: derives from VLDQPseudo with the
// IIC_VLD1dup itinerary and attaches the selection pattern, which matches a
// NEONvdup of an i32 produced by LoadOp and yields a QPR value of type Ty.
class VLD1QDUPPseudo<ValueType Ty, PatFrag LoadOp> : VLDQPseudo<IIC_VLD1dup> {
|
||||
let Pattern = [(set QPR:$dst,
|
||||
(Ty (NEONvdup (i32 (LoadOp addrmode6:$addr)))))];
|
||||
}
|
||||
|
||||
// D-register instantiations for 8-, 16- and 32-bit elements. Each passes
// op11_8 = 0b1100, leaves the last op7_4 bit as '?', and then assigns
// Inst{4} from Rn{4}.
// NOTE(review): Rn{4} presumably carries the addrmode6 alignment flag - confirm.
def VLD1DUPd8 : VLD1DUP<0b1100, {0,0,0,?}, "8", v8i8, extloadi8> {
|
||||
let Inst{4} = Rn{4};
|
||||
}
|
||||
def VLD1DUPd16 : VLD1DUP<0b1100, {0,1,0,?}, "16", v4i16, extloadi16> {
|
||||
let Inst{4} = Rn{4};
|
||||
}
|
||||
// The 32-bit variant uses the plain 'load' fragment rather than an extending load.
def VLD1DUPd32 : VLD1DUP<0b1100, {1,0,0,?}, "32", v2i32, load> {
|
||||
let Inst{4} = Rn{4};
|
||||
}
|
||||
|
||||
// Q-register pseudos, one per element size; each pairs the 128-bit vector
// type with the matching scalar load fragment. NEONLdStTable maps each of
// these to the corresponding VLD1DUPq* instruction.
def VLD1DUPq8Pseudo : VLD1QDUPPseudo<v16i8, extloadi8>;
|
||||
def VLD1DUPq16Pseudo : VLD1QDUPPseudo<v8i16, extloadi16>;
|
||||
def VLD1DUPq32Pseudo : VLD1QDUPPseudo<v4i32, load>;
|
||||
|
||||
let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
|
||||
|
||||
// Instruction class for the two D-register form: results $Vd and $dst2,
// loaded through an addrmode6 address ($Rn), with no writeback output.
// The pattern list is empty, so this class does no instruction selection.
// NOTE(review): the Ty and LoadOp parameters are not referenced anywhere in
// this class body.
class VLD1QDUP<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
|
||||
PatFrag LoadOp>
|
||||
: NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2),
|
||||
(ins addrmode6:$Rn), IIC_VLD1dup,
|
||||
"vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> {
|
||||
// Rm is hard-wired because this form takes no am6offset operand.
let Rm = 0b1111;
|
||||
}
|
||||
|
||||
// Two-register instantiations for 8-, 16- and 32-bit elements.
// The 8-bit variant passes a fully specified op7_4 ({0,0,1,0}) and does not
// override Inst{4}; the 16- and 32-bit variants leave the last op7_4 bit as
// '?' and assign Inst{4} from Rn{4}.
def VLD1DUPq8 : VLD1QDUP<0b1100, {0,0,1,0}, "8", v16i8, extloadi8>;
|
||||
def VLD1DUPq16 : VLD1QDUP<0b1100, {0,1,1,?}, "16", v8i16, extloadi16> {
|
||||
let Inst{4} = Rn{4};
|
||||
}
|
||||
def VLD1DUPq32 : VLD1QDUP<0b1100, {1,0,1,?}, "32", v4i32, load> {
|
||||
let Inst{4} = Rn{4};
|
||||
}
|
||||
|
||||
// ...with address register writeback:
|
||||
// Single D-register form with address writeback: adds a GPR:$wb result and
// an am6offset:$Rm input, uses the IIC_VLD1dupu itinerary, and ties the
// address register to $wb through the "$Rn.addr = $wb" constraint.
// The pattern list is empty.
class VLD1DUPWB<bits<4> op11_8, bits<4> op7_4, string Dt>
|
||||
: NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, GPR:$wb),
|
||||
(ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1dupu,
|
||||
"vld1", Dt, "\\{$Vd[]\\}, $Rn$Rm", "$Rn.addr = $wb", []>;
|
||||
// Two D-register form with address writeback: results $Vd, $dst2 and
// GPR:$wb; inputs addrmode6:$Rn and am6offset:$Rm. Uses the IIC_VLD1dupu
// itinerary and the "$Rn.addr = $wb" constraint. The pattern list is empty.
class VLD1QDUPWB<bits<4> op11_8, bits<4> op7_4, string Dt>
|
||||
: NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
|
||||
(ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1dupu,
|
||||
"vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []>;
|
||||
|
||||
// Writeback instantiations of the single D-register form. The 8-bit variant
// passes a fully specified op7_4; the 16- and 32-bit variants leave the last
// op7_4 bit as '?' and assign Inst{4} from Rn{4}.
def VLD1DUPd8_UPD : VLD1DUPWB<0b1100, {0,0,0,0}, "8">;
|
||||
def VLD1DUPd16_UPD : VLD1DUPWB<0b1100, {0,1,0,?}, "16"> { let Inst{4} = Rn{4}; }
|
||||
def VLD1DUPd32_UPD : VLD1DUPWB<0b1100, {1,0,0,?}, "32"> { let Inst{4} = Rn{4}; }
|
||||
|
||||
// Writeback instantiations of the two D-register form. The 8-bit variant
// passes a fully specified op7_4; the 16- and 32-bit variants leave the last
// op7_4 bit as '?' and assign Inst{4} from Rn{4}.
def VLD1DUPq8_UPD : VLD1QDUPWB<0b1100, {0,0,1,0}, "8">;
|
||||
def VLD1DUPq16_UPD : VLD1QDUPWB<0b1100, {0,1,1,?}, "16"> {
|
||||
let Inst{4} = Rn{4};
|
||||
}
|
||||
def VLD1DUPq32_UPD : VLD1QDUPWB<0b1100, {1,0,1,?}, "32"> {
|
||||
let Inst{4} = Rn{4};
|
||||
}
|
||||
|
||||
// Q-register writeback pseudos, one per element size, all using the
// IIC_VLD1dupu itinerary. No selection pattern is attached here.
// NEONLdStTable maps each of these to the corresponding VLD1DUPq*_UPD
// instruction.
def VLD1DUPq8Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>;
|
||||
def VLD1DUPq16Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>;
|
||||
def VLD1DUPq32Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>;
|
||||
|
||||
// VLD2DUP : Vector Load (single 2-element structure to all lanes)
|
||||
// VLD3DUP : Vector Load (single 3-element structure to all lanes)
|
||||
// VLD4DUP : Vector Load (single 4-element structure to all lanes)
|
||||
|
|
|
@ -138,6 +138,8 @@ def IIC_VLD1x3u : InstrItinClass;
|
|||
def IIC_VLD1x4u : InstrItinClass;
|
||||
def IIC_VLD1ln : InstrItinClass;
|
||||
def IIC_VLD1lnu : InstrItinClass;
|
||||
def IIC_VLD1dup : InstrItinClass;
|
||||
def IIC_VLD1dupu : InstrItinClass;
|
||||
def IIC_VLD2 : InstrItinClass;
|
||||
def IIC_VLD2x2 : InstrItinClass;
|
||||
def IIC_VLD2u : InstrItinClass;
|
||||
|
|
|
@ -475,6 +475,18 @@ def CortexA8Itineraries : ProcessorItineraries<
|
|||
InstrStage<3, [A8_LSPipe]>],
|
||||
[3, 2, 1, 1, 1, 1]>,
|
||||
//
|
||||
// VLD1dup
|
||||
InstrItinData<IIC_VLD1dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
|
||||
InstrStage<2, [A8_NLSPipe], 0>,
|
||||
InstrStage<2, [A8_LSPipe]>],
|
||||
[2, 1]>,
|
||||
//
|
||||
// VLD1dupu
|
||||
InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
|
||||
InstrStage<2, [A8_NLSPipe], 0>,
|
||||
InstrStage<2, [A8_LSPipe]>],
|
||||
[2, 2, 1, 1]>,
|
||||
//
|
||||
// VLD2
|
||||
InstrItinData<IIC_VLD2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
|
||||
InstrStage<2, [A8_NLSPipe], 0>,
|
||||
|
|
|
@ -813,6 +813,24 @@ def CortexA9Itineraries : ProcessorItineraries<
|
|||
InstrStage<3, [A9_LSUnit]>],
|
||||
[4, 2, 1, 1, 1, 1]>,
|
||||
//
|
||||
// VLD1dup
|
||||
InstrItinData<IIC_VLD1dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
|
||||
InstrStage<1, [A9_MUX0], 0>,
|
||||
InstrStage<1, [A9_DRegsN], 0, Required>,
|
||||
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
|
||||
InstrStage<2, [A9_NPipe], 0>,
|
||||
InstrStage<2, [A9_LSUnit]>],
|
||||
[3, 1]>,
|
||||
//
|
||||
// VLD1dupu
|
||||
InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
|
||||
InstrStage<1, [A9_MUX0], 0>,
|
||||
InstrStage<1, [A9_DRegsN], 0, Required>,
|
||||
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
|
||||
InstrStage<2, [A9_NPipe], 0>,
|
||||
InstrStage<2, [A9_LSUnit]>],
|
||||
[3, 2, 1, 1]>,
|
||||
//
|
||||
// VLD2
|
||||
InstrItinData<IIC_VLD2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
|
||||
InstrStage<1, [A9_MUX0], 0>,
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
; RUN: llc -mcpu=cortex-a8 < %s | grep vdup.16
|
||||
; RUN: llc -mcpu=cortex-a8 < %s | FileCheck %s
|
||||
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
|
||||
target triple = "armv7-eabi"
|
||||
|
||||
|
@ -7,6 +7,7 @@ entry:
|
|||
br i1 undef, label %return, label %bb
|
||||
|
||||
bb: ; preds = %bb, %entry
|
||||
; CHECK: vld1.16 {d16[], d17[]}
|
||||
%0 = load i16* undef, align 2
|
||||
%1 = insertelement <8 x i16> undef, i16 %0, i32 2
|
||||
%2 = insertelement <8 x i16> %1, i16 undef, i32 3
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
|
||||
|
||||
; Loads one i8 from %A, inserts it into lane 0, then splats lane 0 across all
; 8 lanes with an all-zero shuffle mask. The expected assembly is a single
; all-lanes vld1.8 whose address has no alignment qualifier, even though the
; IR load is marked 'align 8'.
define <8 x i8> @vld1dupi8(i8* %A) nounwind {
|
||||
;CHECK: vld1dupi8:
|
||||
;Check the (default) alignment value.
|
||||
;CHECK: vld1.8 {d16[]}, [r0]
|
||||
%tmp1 = load i8* %A, align 8
|
||||
%tmp2 = insertelement <8 x i8> undef, i8 %tmp1, i32 0
|
||||
%tmp3 = shufflevector <8 x i8> %tmp2, <8 x i8> undef, <8 x i32> zeroinitializer
|
||||
ret <8 x i8> %tmp3
|
||||
}
|
||||
|
||||
; Loads one i16 from %A, inserts it into lane 0, then splats lane 0 across all
; 4 lanes with an all-zero shuffle mask. The IR load is marked 'align 8', but
; the expected assembly carries a ':16' alignment qualifier.
define <4 x i16> @vld1dupi16(i16* %A) nounwind {
|
||||
;CHECK: vld1dupi16:
|
||||
;Check the alignment value. Max for this instruction is 16 bits:
|
||||
;CHECK: vld1.16 {d16[]}, [r0, :16]
|
||||
%tmp1 = load i16* %A, align 8
|
||||
%tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0
|
||||
%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> undef, <4 x i32> zeroinitializer
|
||||
ret <4 x i16> %tmp3
|
||||
}
|
||||
|
||||
; Loads one i32 from %A, inserts it into lane 0, then splats lane 0 across
; both lanes with an all-zero shuffle mask. The IR load is marked 'align 8',
; but the expected assembly carries a ':32' alignment qualifier.
define <2 x i32> @vld1dupi32(i32* %A) nounwind {
|
||||
;CHECK: vld1dupi32:
|
||||
;Check the alignment value. Max for this instruction is 32 bits:
|
||||
;CHECK: vld1.32 {d16[]}, [r0, :32]
|
||||
%tmp1 = load i32* %A, align 8
|
||||
%tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i32 0
|
||||
%tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer
|
||||
ret <2 x i32> %tmp3
|
||||
}
|
||||
|
||||
; 128-bit variant: loads one i8 from %A, inserts it into lane 0, then splats
; lane 0 across all 16 lanes with an all-zero shuffle mask. The expected
; assembly is one vld1.8 that names two D registers, with no alignment
; qualifier on the address.
define <16 x i8> @vld1dupQi8(i8* %A) nounwind {
|
||||
;CHECK: vld1dupQi8:
|
||||
;Check the (default) alignment value.
|
||||
;CHECK: vld1.8 {d16[], d17[]}, [r0]
|
||||
%tmp1 = load i8* %A, align 8
|
||||
%tmp2 = insertelement <16 x i8> undef, i8 %tmp1, i32 0
|
||||
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
|
||||
ret <16 x i8> %tmp3
|
||||
}
|
Loading…
Reference in New Issue