Add NEON 'laned' operations. This fixes another bunch of gcc testsuite failures and

makes the code faster.

llvm-svn: 81220
This commit is contained in:
Anton Korobeynikov 2009-09-08 15:22:32 +00:00
parent 6760e54c92
commit 59e2b8e894
12 changed files with 773 additions and 22 deletions

View File

@ -612,14 +612,24 @@ ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
if (I != MBB.end()) DL = I->getDebugLoc(); if (I != MBB.end()) DL = I->getDebugLoc();
if (DestRC != SrcRC) { if (DestRC != SrcRC) {
if (((DestRC == ARM::DPRRegisterClass) && // Allow DPR / DPR_VFP2 / DPR_8 cross-class copies
(SrcRC == ARM::DPR_VFP2RegisterClass)) || if (DestRC == ARM::DPRRegisterClass) {
((SrcRC == ARM::DPRRegisterClass) && if (SrcRC == ARM::DPR_VFP2RegisterClass ||
(DestRC == ARM::DPR_VFP2RegisterClass))) { SrcRC == ARM::DPR_8RegisterClass) {
// Allow copy between DPR and DPR_VFP2. } else
} else { return false;
} else if (DestRC == ARM::DPR_VFP2RegisterClass) {
if (SrcRC == ARM::DPRRegisterClass ||
SrcRC == ARM::DPR_8RegisterClass) {
} else
return false;
} else if (DestRC == ARM::DPR_8RegisterClass) {
if (SrcRC == ARM::DPRRegisterClass ||
SrcRC == ARM::DPR_VFP2RegisterClass) {
} else
return false;
} else
return false; return false;
}
} }
if (DestRC == ARM::GPRRegisterClass) { if (DestRC == ARM::GPRRegisterClass) {
@ -629,7 +639,8 @@ ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYS), DestReg) AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYS), DestReg)
.addReg(SrcReg)); .addReg(SrcReg));
} else if ((DestRC == ARM::DPRRegisterClass) || } else if ((DestRC == ARM::DPRRegisterClass) ||
(DestRC == ARM::DPR_VFP2RegisterClass)) { (DestRC == ARM::DPR_VFP2RegisterClass) ||
(DestRC == ARM::DPR_8RegisterClass)) {
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg) AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg)
.addReg(SrcReg)); .addReg(SrcReg));
} else if (DestRC == ARM::QPRRegisterClass) { } else if (DestRC == ARM::QPRRegisterClass) {
@ -652,7 +663,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR)) AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR))
.addReg(SrcReg, getKillRegState(isKill)) .addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI).addReg(0).addImm(0)); .addFrameIndex(FI).addReg(0).addImm(0));
} else if (RC == ARM::DPRRegisterClass || RC == ARM::DPR_VFP2RegisterClass) { } else if (RC == ARM::DPRRegisterClass ||
RC == ARM::DPR_VFP2RegisterClass ||
RC == ARM::DPR_8RegisterClass) {
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTD)) AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTD))
.addReg(SrcReg, getKillRegState(isKill)) .addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI).addImm(0)); .addFrameIndex(FI).addImm(0));
@ -678,7 +691,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (RC == ARM::GPRRegisterClass) { if (RC == ARM::GPRRegisterClass) {
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg) AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg)
.addFrameIndex(FI).addReg(0).addImm(0)); .addFrameIndex(FI).addReg(0).addImm(0));
} else if (RC == ARM::DPRRegisterClass || RC == ARM::DPR_VFP2RegisterClass) { } else if (RC == ARM::DPRRegisterClass ||
RC == ARM::DPR_VFP2RegisterClass ||
RC == ARM::DPR_8RegisterClass) {
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDD), DestReg) AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDD), DestReg)
.addFrameIndex(FI).addImm(0)); .addFrameIndex(FI).addImm(0));
} else if (RC == ARM::SPRRegisterClass) { } else if (RC == ARM::SPRRegisterClass) {

View File

@ -475,6 +475,31 @@ class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
[(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
let isCommutable = Commutable; let isCommutable = Commutable;
} }
// N3VDSL: D-register 3-operand instruction whose second source is one lane
// of a D register (assembly operand "$src2[$lane]"). The selection pattern
// matches ShOp($src1, NEONvduplane($src2, $lane)), all typed as Ty.
// The lane source is constrained to the DPR_VFP2 register class.
class N3VDSL<bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType Ty, SDNode ShOp>
: N3V<0, 1, op21_20, op11_8, 1, 0,
(outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
[(set (Ty DPR:$dst),
(Ty (ShOp (Ty DPR:$src1),
(Ty (NEONvduplane (Ty DPR_VFP2:$src2),
imm:$lane)))))]> {
// $src1 is a full vector while $src2 supplies a single lane, so the two
// sources are never treated as interchangeable.
let isCommutable = 0;
}
// N3VDSL16: D-register 3-operand instruction whose second source is one lane
// of a D register (assembly operand "$src2[$lane]"). The selection pattern
// matches ShOp($src1, NEONvduplane($src2, $lane)), all typed as Ty.
// The lane source is constrained to the DPR_8 register class; the multiclasses
// in this file use this form for 16-bit element types.
class N3VDSL16<bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType Ty, SDNode ShOp>
: N3V<0, 1, op21_20, op11_8, 1, 0,
(outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
[(set (Ty DPR:$dst),
(Ty (ShOp (Ty DPR:$src1),
(Ty (NEONvduplane (Ty DPR_8:$src2),
imm:$lane)))))]> {
// $src1 is a full vector while $src2 supplies a single lane, so the two
// sources are never treated as interchangeable.
let isCommutable = 0;
}
class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
string OpcodeStr, ValueType ResTy, ValueType OpTy, string OpcodeStr, ValueType ResTy, ValueType OpTy,
SDNode OpNode, bit Commutable> SDNode OpNode, bit Commutable>
@ -484,6 +509,30 @@ class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
[(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
let isCommutable = Commutable; let isCommutable = Commutable;
} }
// N3VQSL: Q-register 3-operand instruction whose second source is one lane
// of a D register (assembly operand "$src2[$lane]"). The result and $src1
// have type ResTy (QPR); the lane source has type OpTy and is constrained to
// DPR_VFP2. The pattern matches ShOp($src1, NEONvduplane($src2, $lane)) with
// the duplicated value typed as ResTy.
class N3VQSL<bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode ShOp>
: N3V<1, 1, op21_20, op11_8, 1, 0,
(outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
[(set (ResTy QPR:$dst),
(ResTy (ShOp (ResTy QPR:$src1),
(ResTy (NEONvduplane (OpTy DPR_VFP2:$src2),
imm:$lane)))))]> {
// The sources have different register classes and roles; never commute.
let isCommutable = 0;
}
// N3VQSL16: Q-register 3-operand instruction whose second source is one lane
// of a D register (assembly operand "$src2[$lane]"). The result and $src1
// have type ResTy (QPR); the lane source has type OpTy and is constrained to
// DPR_8. The multiclasses in this file use this form for 16-bit elements.
class N3VQSL16<bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode ShOp>
: N3V<1, 1, op21_20, op11_8, 1, 0,
(outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
[(set (ResTy QPR:$dst),
(ResTy (ShOp (ResTy QPR:$src1),
(ResTy (NEONvduplane (OpTy DPR_8:$src2),
imm:$lane)))))]> {
// The sources have different register classes and roles; never commute.
let isCommutable = 0;
}
// Basic 3-register operations, scalar single-precision // Basic 3-register operations, scalar single-precision
class N3VDs<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VDs<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
@ -511,6 +560,31 @@ class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
[(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
let isCommutable = Commutable; let isCommutable = Commutable;
} }
// N3VDIntSL: D-register 3-operand intrinsic instruction whose second source
// is one lane of a D register (assembly operand "$src2[$lane]"). The pattern
// matches IntOp($src1, NEONvduplane($src2, $lane)), all typed as Ty.
// The lane source is constrained to the DPR_VFP2 register class.
class N3VDIntSL<bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType Ty, Intrinsic IntOp>
: N3V<0, 1, op21_20, op11_8, 1, 0,
(outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
[(set (Ty DPR:$dst),
(Ty (IntOp (Ty DPR:$src1),
(Ty (NEONvduplane (Ty DPR_VFP2:$src2),
imm:$lane)))))]> {
// $src1 is a full vector while $src2 supplies a single lane; never commute.
let isCommutable = 0;
}
// N3VDIntSL16: D-register 3-operand intrinsic instruction whose second source
// is one lane of a D register (assembly operand "$src2[$lane]"). The pattern
// matches IntOp($src1, NEONvduplane($src2, $lane)), all typed as Ty.
// The lane source is constrained to the DPR_8 register class; the
// multiclasses in this file use this form for 16-bit element types.
class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType Ty, Intrinsic IntOp>
: N3V<0, 1, op21_20, op11_8, 1, 0,
(outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
[(set (Ty DPR:$dst),
(Ty (IntOp (Ty DPR:$src1),
(Ty (NEONvduplane (Ty DPR_8:$src2),
imm:$lane)))))]> {
// $src1 is a full vector while $src2 supplies a single lane; never commute.
let isCommutable = 0;
}
class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
string OpcodeStr, ValueType ResTy, ValueType OpTy, string OpcodeStr, ValueType ResTy, ValueType OpTy,
Intrinsic IntOp, bit Commutable> Intrinsic IntOp, bit Commutable>
@ -520,6 +594,30 @@ class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
[(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
let isCommutable = Commutable; let isCommutable = Commutable;
} }
// N3VQIntSL: Q-register 3-operand intrinsic instruction whose second source
// is one lane of a D register (assembly operand "$src2[$lane]"). The result
// and $src1 have type ResTy (QPR); the lane source has type OpTy and is
// constrained to DPR_VFP2. The pattern matches
// IntOp($src1, NEONvduplane($src2, $lane)).
class N3VQIntSL<bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N3V<1, 1, op21_20, op11_8, 1, 0,
(outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
[(set (ResTy QPR:$dst),
(ResTy (IntOp (ResTy QPR:$src1),
(ResTy (NEONvduplane (OpTy DPR_VFP2:$src2),
imm:$lane)))))]> {
// The sources have different register classes and roles; never commute.
let isCommutable = 0;
}
// N3VQIntSL16: Q-register 3-operand intrinsic instruction whose second source
// is one lane of a D register (assembly operand "$src2[$lane]"). The result
// and $src1 have type ResTy (QPR); the lane source has type OpTy and is
// constrained to DPR_8. The multiclasses in this file use this form for
// 16-bit elements.
class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N3V<1, 1, op21_20, op11_8, 1, 0,
(outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
[(set (ResTy QPR:$dst),
(ResTy (IntOp (ResTy QPR:$src1),
(ResTy (NEONvduplane (OpTy DPR_8:$src2),
imm:$lane)))))]> {
// The sources have different register classes and roles; never commute.
let isCommutable = 0;
}
// Multiply-Add/Sub operations, both double- and quad-register. // Multiply-Add/Sub operations, both double- and quad-register.
class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
@ -529,6 +627,31 @@ class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
[(set DPR:$dst, (Ty (OpNode DPR:$src1, [(set DPR:$dst, (Ty (OpNode DPR:$src1,
(Ty (MulOp DPR:$src2, DPR:$src3)))))]>; (Ty (MulOp DPR:$src2, DPR:$src3)))))]>;
// N3VDMulOpSL: D-register multiply-accumulate style instruction with a lane
// operand. $src1 is the accumulator and is tied to $dst ("$src1 = $dst").
// The pattern matches ShOp($src1, MulOp($src2, NEONvduplane($src3, $lane))),
// where ShOp is the outer (combining) node and MulOp the inner multiply.
// The lane source $src3 is constrained to DPR_VFP2.
class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode ShOp>
: N3V<0, 1, op21_20, op11_8, 1, 0,
(outs DPR:$dst),
(ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
[(set (Ty DPR:$dst),
(Ty (ShOp (Ty DPR:$src1),
(Ty (MulOp DPR:$src2,
(Ty (NEONvduplane (Ty DPR_VFP2:$src3),
imm:$lane)))))))]>;
// N3VDMulOpSL16: D-register multiply-accumulate style instruction with a lane
// operand. $src1 is the accumulator and is tied to $dst ("$src1 = $dst").
// The pattern matches ShOp($src1, MulOp($src2, NEONvduplane($src3, $lane))),
// where ShOp is the outer (combining) node and MulOp the inner multiply.
// The lane source $src3 is constrained to DPR_8; the multiclasses in this
// file use this form for 16-bit element types.
class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode ShOp>
: N3V<0, 1, op21_20, op11_8, 1, 0,
(outs DPR:$dst),
(ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
[(set (Ty DPR:$dst),
(Ty (ShOp (Ty DPR:$src1),
(Ty (MulOp DPR:$src2,
(Ty (NEONvduplane (Ty DPR_8:$src3),
imm:$lane)))))))]>;
class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode> string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode>
: N3V<op24, op23, op21_20, op11_8, 1, op4, : N3V<op24, op23, op21_20, op11_8, 1, op4,
@ -536,6 +659,32 @@ class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
[(set QPR:$dst, (Ty (OpNode QPR:$src1, [(set QPR:$dst, (Ty (OpNode QPR:$src1,
(Ty (MulOp QPR:$src2, QPR:$src3)))))]>; (Ty (MulOp QPR:$src2, QPR:$src3)))))]>;
// N3VQMulOpSL: Q-register multiply-accumulate style instruction with a lane
// operand. $src1 (QPR, type ResTy) is the accumulator and is tied to $dst.
// The pattern matches ShOp($src1, MulOp($src2, NEONvduplane($src3, $lane)));
// the lane source $src3 has type OpTy and is constrained to DPR_VFP2.
class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType ResTy, ValueType OpTy,
SDNode MulOp, SDNode ShOp>
: N3V<1, 1, op21_20, op11_8, 1, 0,
(outs QPR:$dst),
(ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
[(set (ResTy QPR:$dst),
(ResTy (ShOp (ResTy QPR:$src1),
(ResTy (MulOp QPR:$src2,
(ResTy (NEONvduplane (OpTy DPR_VFP2:$src3),
imm:$lane)))))))]>;
// N3VQMulOpSL16: Q-register multiply-accumulate style instruction with a lane
// operand. $src1 (QPR, type ResTy) is the accumulator and is tied to $dst.
// The pattern matches ShOp($src1, MulOp($src2, NEONvduplane($src3, $lane)));
// the lane source $src3 has type OpTy and is constrained to DPR_8. The
// multiclasses in this file use this form for 16-bit element types.
class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType ResTy, ValueType OpTy,
SDNode MulOp, SDNode ShOp>
: N3V<1, 1, op21_20, op11_8, 1, 0,
(outs QPR:$dst),
(ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
[(set (ResTy QPR:$dst),
(ResTy (ShOp (ResTy QPR:$src1),
(ResTy (MulOp QPR:$src2,
(ResTy (NEONvduplane (OpTy DPR_8:$src3),
imm:$lane)))))))]>;
// Multiply-Add/Sub operations, scalar single-precision // Multiply-Add/Sub operations, scalar single-precision
class N3VDMulOps<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VDMulOps<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
@ -581,6 +730,32 @@ class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
[(set QPR:$dst, [(set QPR:$dst,
(TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>; (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>;
// N3VLInt3SL: long 3-argument intrinsic with a lane operand. The accumulator
// $src1 is a Q register of type ResTy and is tied to $dst; $src2 is a D
// register of type OpTy; $src3 supplies one lane and is constrained to
// DPR_VFP2. The pattern matches
// IntOp($src1, $src2, NEONvduplane($src3, $lane)).
// op24 is left as a parameter; the definitions in this file pass 0 for the
// ".s" mnemonics and 1 for the ".u" mnemonics.
class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N3V<op24, 1, op21_20, op11_8, 1, 0,
(outs QPR:$dst),
(ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
[(set (ResTy QPR:$dst),
(ResTy (IntOp (ResTy QPR:$src1),
(OpTy DPR:$src2),
(OpTy (NEONvduplane (OpTy DPR_VFP2:$src3),
imm:$lane)))))]>;
// N3VLInt3SL16: long 3-argument intrinsic with a lane operand. The
// accumulator $src1 is a Q register of type ResTy and is tied to $dst; $src2
// is a D register of type OpTy; $src3 supplies one lane and is constrained to
// DPR_8. The pattern matches IntOp($src1, $src2, NEONvduplane($src3, $lane)).
// The multiclasses in this file use this form for 16-bit element types.
class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType ResTy, ValueType OpTy,
Intrinsic IntOp>
: N3V<op24, 1, op21_20, op11_8, 1, 0,
(outs QPR:$dst),
(ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
[(set (ResTy QPR:$dst),
(ResTy (IntOp (ResTy QPR:$src1),
(OpTy DPR:$src2),
(OpTy (NEONvduplane (OpTy DPR_8:$src3),
imm:$lane)))))]>;
// Narrowing 3-register intrinsics. // Narrowing 3-register intrinsics.
class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
@ -603,6 +778,27 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
[(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> { [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> {
let isCommutable = Commutable; let isCommutable = Commutable;
} }
// N3VLIntSL: long 3-register intrinsic with a lane operand. Both sources are
// D registers of type OpTy and the result is a Q register of type ResTy.
// $src2 supplies one lane and is constrained to DPR_VFP2. The pattern matches
// IntOp($src1, NEONvduplane($src2, $lane)).
class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N3V<op24, 1, op21_20, op11_8, 1, 0,
(outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
[(set (ResTy QPR:$dst),
(ResTy (IntOp (OpTy DPR:$src1),
(OpTy (NEONvduplane (OpTy DPR_VFP2:$src2),
imm:$lane)))))]>;
// N3VLIntSL16: long 3-register intrinsic with a lane operand. Both sources
// are D registers of type OpTy and the result is a Q register of type ResTy.
// $src2 supplies one lane and is constrained to DPR_8. The pattern matches
// IntOp($src1, NEONvduplane($src2, $lane)). The multiclasses in this file use
// this form for 16-bit element types.
class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
string OpcodeStr, ValueType ResTy, ValueType OpTy,
Intrinsic IntOp>
: N3V<op24, 1, op21_20, op11_8, 1, 0,
(outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
NoItinerary,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
[(set (ResTy QPR:$dst),
(ResTy (IntOp (OpTy DPR:$src1),
(OpTy (NEONvduplane (OpTy DPR_8:$src2),
imm:$lane)))))]>;
// Wide 3-register intrinsics. // Wide 3-register intrinsics.
class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
@ -761,6 +957,13 @@ multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
v4i32, v4i32, OpNode, Commutable>; v4i32, v4i32, OpNode, Commutable>;
} }
// N3VSL_HS: lane-operand variants of a 3-register SDNode operation for 16-
// and 32-bit elements, in D (v4i16, v2i32) and Q (v8i16, v4i32) forms.
// 16-bit variants use the *SL16 classes with op21_20 = 0b01; 32-bit variants
// use the *SL classes with op21_20 = 0b10. The element size ("16"/"32") is
// appended to OpcodeStr.
multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, SDNode ShOp> {
def v4i16 : N3VDSL16<0b01, op11_8, !strconcat(OpcodeStr, "16"), v4i16, ShOp>;
def v2i32 : N3VDSL<0b10, op11_8, !strconcat(OpcodeStr, "32"), v2i32, ShOp>;
def v8i16 : N3VQSL16<0b01, op11_8, !strconcat(OpcodeStr, "16"), v8i16, v4i16, ShOp>;
def v4i32 : N3VQSL<0b10, op11_8, !strconcat(OpcodeStr, "32"), v4i32, v2i32, ShOp>;
}
// ....then also with element size 64 bits: // ....then also with element size 64 bits:
multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr, SDNode OpNode, bit Commutable = 0> string OpcodeStr, SDNode OpNode, bit Commutable = 0>
@ -817,6 +1020,13 @@ multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
v4i32, v4i32, IntOp, Commutable>; v4i32, v4i32, IntOp, Commutable>;
} }
// N3VIntSL_HS: lane-operand variants of a 3-register intrinsic for 16- and
// 32-bit elements, in D (v4i16, v2i32) and Q (v8i16, v4i32) forms.
// 16-bit variants use the *IntSL16 classes with op21_20 = 0b01; 32-bit
// variants use the *IntSL classes with op21_20 = 0b10. The element size
// ("16"/"32") is appended to OpcodeStr.
multiclass N3VIntSL_HS<bits<4> op11_8, string OpcodeStr, Intrinsic IntOp> {
def v4i16 : N3VDIntSL16<0b01, op11_8, !strconcat(OpcodeStr, "16"), v4i16, IntOp>;
def v2i32 : N3VDIntSL<0b10, op11_8, !strconcat(OpcodeStr, "32"), v2i32, IntOp>;
def v8i16 : N3VQIntSL16<0b01, op11_8, !strconcat(OpcodeStr, "16"), v8i16, v4i16, IntOp>;
def v4i32 : N3VQIntSL<0b10, op11_8, !strconcat(OpcodeStr, "32"), v4i32, v2i32, IntOp>;
}
// ....then also with element size of 8 bits: // ....then also with element size of 8 bits:
multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
@ -862,6 +1072,14 @@ multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
v2i64, v2i32, IntOp, Commutable>; v2i64, v2i32, IntOp, Commutable>;
} }
// N3VLIntSL_HS: lane-operand variants of a long (Q = D op D[lane]) intrinsic
// for 16- and 32-bit source elements. The v4i16 variant produces v4i32 and
// takes its lane from a DPR_8 register (N3VLIntSL16); the v2i32 variant
// produces v2i64 and takes its lane from a DPR_VFP2 register (N3VLIntSL).
// The source element size ("16"/"32") is appended to OpcodeStr.
multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8,
string OpcodeStr, Intrinsic IntOp> {
def v4i16 : N3VLIntSL16<op24, 0b01, op11_8,
!strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
def v2i32 : N3VLIntSL<op24, 0b10, op11_8,
!strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
}
// ....then also with element size of 8 bits: // ....then also with element size of 8 bits:
multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
@ -905,6 +1123,16 @@ multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
!strconcat(OpcodeStr, "32"), v4i32, mul, OpNode>; !strconcat(OpcodeStr, "32"), v4i32, mul, OpNode>;
} }
// N3VMulOpSL_HS: lane-operand multiply-accumulate style variants for 16- and
// 32-bit integer elements, in D (v4i16, v2i32) and Q (v8i16, v4i32) forms.
// The inner multiply is always the generic `mul` node; ShOp is the outer node
// that combines the accumulator with the product. 16-bit variants use the
// *MulOpSL16 classes (op21_20 = 0b01), 32-bit variants the *MulOpSL classes
// (op21_20 = 0b10). The element size ("16"/"32") is appended to OpcodeStr.
multiclass N3VMulOpSL_HS<bits<4> op11_8, string OpcodeStr, SDNode ShOp> {
def v4i16 : N3VDMulOpSL16<0b01, op11_8,
!strconcat(OpcodeStr, "16"), v4i16, mul, ShOp>;
def v2i32 : N3VDMulOpSL<0b10, op11_8,
!strconcat(OpcodeStr, "32"), v2i32, mul, ShOp>;
def v8i16 : N3VQMulOpSL16<0b01, op11_8,
!strconcat(OpcodeStr, "16"), v8i16, v4i16, mul, ShOp>;
def v4i32 : N3VQMulOpSL<0b10, op11_8,
!strconcat(OpcodeStr, "32"), v4i32, v2i32, mul, ShOp>;
}
// Neon 3-argument intrinsics, // Neon 3-argument intrinsics,
// element sizes of 8, 16 and 32 bits: // element sizes of 8, 16 and 32 bits:
@ -939,6 +1167,14 @@ multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
!strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>; !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
} }
// N3VLInt3SL_HS: lane-operand variants of a long 3-argument intrinsic
// (Q accumulator, D source, D[lane] source) for 16- and 32-bit source
// elements. The v4i16 variant accumulates into v4i32 and takes its lane from
// a DPR_8 register (N3VLInt3SL16); the v2i32 variant accumulates into v2i64
// and takes its lane from a DPR_VFP2 register (N3VLInt3SL).
// The source element size ("16"/"32") is appended to OpcodeStr.
multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
string OpcodeStr, Intrinsic IntOp> {
def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8,
!strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
def v2i32 : N3VLInt3SL<op24, 0b10, op11_8,
!strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
}
// ....then also with element size of 8 bits: // ....then also with element size of 8 bits:
multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr, Intrinsic IntOp> string OpcodeStr, Intrinsic IntOp>
@ -1134,17 +1370,71 @@ def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, "vmul.p8", v16i8, v16i8,
int_arm_neon_vmulp, 1>; int_arm_neon_vmulp, 1>;
def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul, 1>; def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul, 1>;
def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, "vmul.f32", v4f32, v4f32, fmul, 1>; def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, "vmul.f32", v4f32, v4f32, fmul, 1>;
// Lane-operand forms of VMUL: the integer variants are generated by
// N3VSL_HS, the f32 D and Q variants are defined directly.
defm VMULsl : N3VSL_HS<0b1000, "vmul.i", mul>;
def VMULslfd : N3VDSL<0b10, 0b1001, "vmul.f32", v2f32, fmul>;
def VMULslfq : N3VQSL<0b10, 0b1001, "vmul.f32", v4f32, v2f32, fmul>;
// The instruction patterns above only match a lane duplicated from a D
// register. These patterns cover a lane duplicated from a Q register by
// extracting a D subregister of $src2 and rewriting the lane immediate.
// NOTE(review): DSubReg_*_reg / SubReg_*_lane are defined outside this view;
// presumably they map a Q-register lane index to the containing D subregister
// and to the lane index within it -- confirm against their definitions.
def : Pat<(v8i16 (mul (v8i16 QPR:$src1),
(v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
(v8i16 (VMULslv8i16 (v8i16 QPR:$src1),
(v4i16 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
def : Pat<(v4i32 (mul (v4i32 QPR:$src1),
(v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
(v4i32 (VMULslv4i32 (v4i32 QPR:$src1),
(v2i32 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
// The f32 case reuses the 32-bit (i32) subregister/lane transforms.
def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
(v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))),
(v4f32 (VMULslfq (v4f32 QPR:$src1),
(v2f32 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
// VQDMULH : Vector Saturating Doubling Multiply Returning High Half // VQDMULH : Vector Saturating Doubling Multiply Returning High Half
defm VQDMULH : N3VInt_HS<0,0,0b1011,0, "vqdmulh.s", int_arm_neon_vqdmulh, 1>; defm VQDMULH : N3VInt_HS<0,0,0b1011,0, "vqdmulh.s", int_arm_neon_vqdmulh, 1>;
// Lane-operand forms of VQDMULH, generated for 16- and 32-bit elements.
defm VQDMULHsl: N3VIntSL_HS<0b1100, "vqdmulh.s", int_arm_neon_vqdmulh>;
// Cover a lane duplicated from a Q register: extract a D subregister of
// $src2 and rewrite the lane immediate for the Q-form lane instruction.
// NOTE(review): DSubReg_*_reg / SubReg_*_lane are defined outside this view;
// presumably they pick the containing D subregister and the lane within it.
def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1),
(v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
(v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1),
(v4i16 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1),
(v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
(v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1),
(v2i32 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
// VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half // VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half
defm VQRDMULH : N3VInt_HS<1,0,0b1011,0, "vqrdmulh.s", int_arm_neon_vqrdmulh, 1>; defm VQRDMULH : N3VInt_HS<1,0,0b1011,0, "vqrdmulh.s", int_arm_neon_vqrdmulh, 1>;
// Lane-operand forms of VQRDMULH, generated for 16- and 32-bit elements.
defm VQRDMULHsl : N3VIntSL_HS<0b1101, "vqrdmulh.s", int_arm_neon_vqrdmulh>;
// Cover a lane duplicated from a Q register: extract a D subregister of
// $src2 and rewrite the lane immediate for the Q-form lane instruction.
// NOTE(review): DSubReg_*_reg / SubReg_*_lane are defined outside this view;
// presumably they pick the containing D subregister and the lane within it.
def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1),
(v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
(v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1),
(v4i16 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
(v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
(v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1),
(v2i32 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
// VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D) // VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D)
defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, "vmull.s", int_arm_neon_vmulls, 1>; defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, "vmull.s", int_arm_neon_vmulls, 1>;
defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, "vmull.u", int_arm_neon_vmullu, 1>; defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, "vmull.u", int_arm_neon_vmullu, 1>;
def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, "vmull.p8", v8i16, v8i8, def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, "vmull.p8", v8i16, v8i8,
int_arm_neon_vmullp, 1>; int_arm_neon_vmullp, 1>;
// Lane-operand forms of VMULL; the signed and unsigned variants differ only
// in the op24 argument (0 for ".s", 1 for ".u") and the intrinsic.
defm VMULLsls : N3VLIntSL_HS<0, 0b1010, "vmull.s", int_arm_neon_vmulls>;
defm VMULLslu : N3VLIntSL_HS<1, 0b1010, "vmull.u", int_arm_neon_vmullu>;
// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D) // VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D)
defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, "vqdmull.s", int_arm_neon_vqdmull, 1>; defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, "vqdmull.s", int_arm_neon_vqdmull, 1>;
// Lane-operand form of VQDMULL (signed only), for 16- and 32-bit sources.
defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, "vqdmull.s", int_arm_neon_vqdmull>;
// Vector Multiply-Accumulate and Multiply-Subtract Operations. // Vector Multiply-Accumulate and Multiply-Subtract Operations.
@ -1152,20 +1442,93 @@ defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, "vqdmull.s", int_arm_neon_vqdmull, 1>;
defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmla.i", add>; defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmla.i", add>;
def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v2f32, fmul, fadd>; def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v2f32, fmul, fadd>;
def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v4f32, fmul, fadd>; def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v4f32, fmul, fadd>;
// Lane-operand forms of VMLA: the integer variants are generated by
// N3VMulOpSL_HS (mul combined with add), the f32 D and Q variants are defined
// directly (fmul combined with fadd).
defm VMLAsl : N3VMulOpSL_HS<0b0000, "vmla.i", add>;
def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, "vmla.f32", v2f32, fmul, fadd>;
def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, "vmla.f32", v4f32, v2f32, fmul, fadd>;
// Cover a lane duplicated from a Q register: extract a D subregister of
// $src3 and rewrite the lane immediate for the Q-form lane instruction.
// NOTE(review): DSubReg_*_reg / SubReg_*_lane are defined outside this view;
// presumably they pick the containing D subregister and the lane within it.
def : Pat<(v8i16 (add (v8i16 QPR:$src1),
(mul (v8i16 QPR:$src2),
(v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
(v8i16 (VMLAslv8i16 (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
(v4i16 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
def : Pat<(v4i32 (add (v4i32 QPR:$src1),
(mul (v4i32 QPR:$src2),
(v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
(v4i32 (VMLAslv4i32 (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
(v2i32 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
// The f32 case reuses the 32-bit (i32) subregister/lane transforms.
def : Pat<(v4f32 (fadd (v4f32 QPR:$src1),
(fmul (v4f32 QPR:$src2),
(v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
(v4f32 (VMLAslfq (v4f32 QPR:$src1),
(v4f32 QPR:$src2),
(v2f32 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
// VMLAL : Vector Multiply Accumulate Long (Q += D * D) // VMLAL : Vector Multiply Accumulate Long (Q += D * D)
defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, "vmlal.s", int_arm_neon_vmlals>; defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, "vmlal.s", int_arm_neon_vmlals>;
defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, "vmlal.u", int_arm_neon_vmlalu>; defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, "vmlal.u", int_arm_neon_vmlalu>;
// Lane-operand forms of VMLAL; the signed and unsigned variants differ only
// in the op24 argument (0 for ".s", 1 for ".u") and the intrinsic.
defm VMLALsls : N3VLInt3SL_HS<0, 0b0010, "vmlal.s", int_arm_neon_vmlals>;
defm VMLALslu : N3VLInt3SL_HS<1, 0b0010, "vmlal.u", int_arm_neon_vmlalu>;
// VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D) // VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, "vqdmlal.s", int_arm_neon_vqdmlal>; defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, "vqdmlal.s", int_arm_neon_vqdmlal>;
// Lane-operand form of VQDMLAL (signed only), for 16- and 32-bit sources.
defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal.s", int_arm_neon_vqdmlal>;
// VMLS : Vector Multiply Subtract (integer and floating-point) // VMLS : Vector Multiply Subtract (integer and floating-point)
defm VMLS : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmls.i", sub>; defm VMLS : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmls.i", sub>;
def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v2f32, fmul, fsub>; def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v2f32, fmul, fsub>;
def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v4f32, fmul, fsub>; def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v4f32, fmul, fsub>;
// Lane-operand forms of VMLS: the integer variants are generated by
// N3VMulOpSL_HS (mul combined with sub), the f32 D and Q variants are defined
// directly (fmul combined with fsub).
defm VMLSsl : N3VMulOpSL_HS<0b0100, "vmls.i", sub>;
def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, "vmls.f32", v2f32, fmul, fsub>;
def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, "vmls.f32", v4f32, v2f32, fmul, fsub>;
// Cover a lane duplicated from a Q register: extract a D subregister of
// $src3 and rewrite the lane immediate for the Q-form lane instruction.
// NOTE(review): DSubReg_*_reg / SubReg_*_lane are defined outside this view;
// presumably they pick the containing D subregister and the lane within it.
def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
(mul (v8i16 QPR:$src2),
(v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
(v8i16 (VMLSslv8i16 (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
(v4i16 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
def : Pat<(v4i32 (sub (v4i32 QPR:$src1),
(mul (v4i32 QPR:$src2),
(v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
(v4i32 (VMLSslv4i32 (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
(v2i32 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
// The f32 case reuses the 32-bit (i32) subregister/lane transforms.
def : Pat<(v4f32 (fsub (v4f32 QPR:$src1),
(fmul (v4f32 QPR:$src2),
(v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
(v4f32 (VMLSslfq (v4f32 QPR:$src1),
(v4f32 QPR:$src2),
(v2f32 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
// VMLSL : Vector Multiply Subtract Long (Q -= D * D) // VMLSL : Vector Multiply Subtract Long (Q -= D * D)
defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, "vmlsl.s", int_arm_neon_vmlsls>; defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, "vmlsl.s", int_arm_neon_vmlsls>;
defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, "vmlsl.u", int_arm_neon_vmlslu>; defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, "vmlsl.u", int_arm_neon_vmlslu>;
// Lane-operand forms of VMLSL; the signed and unsigned variants differ only
// in the op24 argument (0 for ".s", 1 for ".u") and the intrinsic.
defm VMLSLsls : N3VLInt3SL_HS<0, 0b0110, "vmlsl.s", int_arm_neon_vmlsls>;
defm VMLSLslu : N3VLInt3SL_HS<1, 0b0110, "vmlsl.u", int_arm_neon_vmlslu>;
// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D) // VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, "vqdmlsl.s", int_arm_neon_vqdmlsl>; defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, "vqdmlsl.s", int_arm_neon_vqdmlsl>;
// Lane-operand form of VQDMLSL (signed only), for 16- and 32-bit sources.
// op11_8 is declared as bits<4>, so the literal is spelled with all four bits
// (0b0111, same value as 0b111) to match the sibling lane definitions.
defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl.s", int_arm_neon_vqdmlsl>;
// Vector Subtract Operations. // Vector Subtract Operations.

View File

@ -307,12 +307,19 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
// Subset of DPR that are accessible with VFP2 (and so that also have // Subset of DPR that are accessible with VFP2 (and so that also have
// 32-bit SPR subregs). // 32-bit SPR subregs).
def DPR_VFP2 : RegisterClass<"ARM", [f64, v2f32], 64, def DPR_VFP2 : RegisterClass<"ARM", [f64, v2i32, v2f32], 64,
[D0, D1, D2, D3, D4, D5, D6, D7, [D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15]> { D8, D9, D10, D11, D12, D13, D14, D15]> {
let SubRegClassList = [SPR, SPR]; let SubRegClassList = [SPR, SPR];
} }
// Subset of DPR (D0-D7 only) which can be used as a source of NEON scalars
// for 16-bit operations. The lane-operand instruction classes that take a
// 16-bit scalar constrain their lane source to this class.
def DPR_8 : RegisterClass<"ARM", [f64, v4i16, v2f32], 64,
[D0, D1, D2, D3, D4, D5, D6, D7]> {
let SubRegClassList = [SPR, SPR];
}
// Generic 128-bit vector register class. // Generic 128-bit vector register class.
def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
@ -364,4 +371,3 @@ def : SubRegSet<6, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15], Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
[D1, D3, D5, D7, D9, D11, D13, D15, [D1, D3, D5, D7, D9, D11, D13, D15,
D17, D19, D21, D23, D25, D27, D29, D31]>; D17, D19, D21, D23, D25, D27, D29, D31]>;

View File

@ -0,0 +1,47 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
; Signed 16-bit multiply-accumulate long by lane: the last intrinsic operand is
; a splat of element 1 of %arg2_int16x4_t, so the expected output is the
; lane-operand form of vmlal.s16 rather than a separate duplicate + vmlal.
define arm_aapcs_vfpcc <4 x i32> @test_vmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmlal_lanes16
; CHECK: vmlal.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
; Signed 32-bit multiply-accumulate long by lane: the last intrinsic operand is
; a splat of element 1 of %arg2_int32x2_t, so the expected output is the
; lane-operand form of vmlal.s32.
define arm_aapcs_vfpcc <2 x i64> @test_vmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmlal_lanes32
; CHECK: vmlal.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
; Unsigned 16-bit multiply-accumulate long by lane: the last intrinsic operand
; is a splat of element 1 of %arg2_uint16x4_t, so the expected output is the
; lane-operand form of vmlal.u16.
define arm_aapcs_vfpcc <4 x i32> @test_vmlal_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
entry:
; CHECK: test_vmlal_laneu16
; CHECK: vmlal.u16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
; Unsigned 32-bit multiply-accumulate long by lane: the last intrinsic operand
; is a splat of element 1 of %arg2_uint32x2_t, so the expected output is the
; lane-operand form of vmlal.u32.
define arm_aapcs_vfpcc <2 x i64> @test_vmlal_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
entry:
; CHECK: test_vmlal_laneu32
; CHECK: vmlal.u32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

View File

@ -0,0 +1,47 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
; test_vmlsl_lanes16: lane 1 of %arg2 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vmlsls.v4i32. The FileCheck lines below expect llc to
; emit the by-lane form (vmlsl.s16 with a d3[1] operand).
define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmlsl_lanes16
; CHECK: vmlsl.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
; Intrinsic used by test_vmlsl_lanes16.
declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
; test_vmlsl_lanes32: lane 1 of %arg2 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vmlsls.v2i64. The FileCheck lines below expect llc to
; emit the by-lane form (vmlsl.s32 with a d3[1] operand).
define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmlsl_lanes32
; CHECK: vmlsl.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
; Intrinsic used by test_vmlsl_lanes32.
declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
; test_vmlsl_laneu16: lane 1 of %arg2 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vmlslu.v4i32. The FileCheck lines below expect llc to
; emit the by-lane form (vmlsl.u16 with a d3[1] operand).
define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
entry:
; CHECK: test_vmlsl_laneu16
; CHECK: vmlsl.u16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
; Intrinsic used by test_vmlsl_laneu16.
declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
; test_vmlsl_laneu32: lane 1 of %arg2 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vmlslu.v2i64. The FileCheck lines below expect llc to
; emit the by-lane form (vmlsl.u32 with a d3[1] operand).
define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
entry:
; CHECK: test_vmlsl_laneu32
; CHECK: vmlsl.u32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
; Intrinsic used by test_vmlsl_laneu32.
declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

View File

@ -0,0 +1,57 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
; test_vmul_lanef32: lane 0 of %arg1 is splatted (zeroinitializer shuffle mask) and
; multiplied with %arg0 using a plain fmul. The FileCheck lines below expect llc to emit
; the by-lane form (vmul.f32 with a d1[0] operand).
define arm_aapcs_vfpcc <2 x float> @test_vmul_lanef32(<2 x float> %arg0_float32x2_t, <2 x float> %arg1_float32x2_t) nounwind readnone {
entry:
; CHECK: test_vmul_lanef32:
; CHECK: vmul.f32 d0, d0, d1[0]
%0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <2 x i32> zeroinitializer ; <<2 x float>> [#uses=1]
%1 = fmul <2 x float> %0, %arg0_float32x2_t ; <<2 x float>> [#uses=1]
ret <2 x float> %1
}
; test_vmul_lanes16: lane 1 of %arg1 is splatted with shufflevector and multiplied with
; %arg0 using a plain mul. The FileCheck lines below expect llc to emit the by-lane form
; (vmul.i16 with a d1[1] operand).
define arm_aapcs_vfpcc <4 x i16> @test_vmul_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmul_lanes16:
; CHECK: vmul.i16 d0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = mul <4 x i16> %0, %arg0_int16x4_t ; <<4 x i16>> [#uses=1]
ret <4 x i16> %1
}
; test_vmul_lanes32: lane 1 of %arg1 is splatted with shufflevector and multiplied with
; %arg0 using a plain mul. The FileCheck lines below expect llc to emit the by-lane form
; (vmul.i32 with a d1[1] operand).
define arm_aapcs_vfpcc <2 x i32> @test_vmul_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmul_lanes32:
; CHECK: vmul.i32 d0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = mul <2 x i32> %0, %arg0_int32x2_t ; <<2 x i32>> [#uses=1]
ret <2 x i32> %1
}
; test_vmulQ_lanef32: lane 1 of the 2-element %arg1 is splatted into a 4-element vector
; and multiplied with %arg0 using a plain fmul. The FileCheck lines below expect llc to
; emit the by-lane form (vmul.f32 on q registers with a d2[1] operand).
define arm_aapcs_vfpcc <4 x float> @test_vmulQ_lanef32(<4 x float> %arg0_float32x4_t, <2 x float> %arg1_float32x2_t) nounwind readnone {
entry:
; CHECK: test_vmulQ_lanef32:
; CHECK: vmul.f32 q0, q0, d2[1]
%0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x float>> [#uses=1]
%1 = fmul <4 x float> %0, %arg0_float32x4_t ; <<4 x float>> [#uses=1]
ret <4 x float> %1
}
; test_vmulQ_lanes16: lane 1 of the 4-element %arg1 is splatted into an 8-element vector
; and multiplied with %arg0 using a plain mul. The FileCheck lines below expect llc to
; emit the by-lane form (vmul.i16 on q registers with a d2[1] operand).
define arm_aapcs_vfpcc <8 x i16> @test_vmulQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmulQ_lanes16:
; CHECK: vmul.i16 q0, q0, d2[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
%1 = mul <8 x i16> %0, %arg0_int16x8_t ; <<8 x i16>> [#uses=1]
ret <8 x i16> %1
}
; test_vmulQ_lanes32: lane 1 of the 2-element %arg1 is splatted into a 4-element vector
; and multiplied with %arg0 using a plain mul. The FileCheck lines below expect llc to
; emit the by-lane form (vmul.i32 on q registers with a d2[1] operand).
define arm_aapcs_vfpcc <4 x i32> @test_vmulQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmulQ_lanes32:
; CHECK: vmul.i32 q0, q0, d2[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
%1 = mul <4 x i32> %0, %arg0_int32x4_t ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}

View File

@ -0,0 +1,47 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
; test_vmull_lanes16: lane 1 of %arg1 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vmulls.v4i32. The FileCheck lines below expect llc to
; emit the by-lane form (vmull.s16 with a d1[1] operand).
define arm_aapcs_vfpcc <4 x i32> @test_vmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmull_lanes16
; CHECK: vmull.s16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
; Intrinsic used by test_vmull_lanes16.
declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
; test_vmull_lanes32: lane 1 of %arg1 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vmulls.v2i64. The FileCheck lines below expect llc to
; emit the by-lane form (vmull.s32 with a d1[1] operand).
define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmull_lanes32
; CHECK: vmull.s32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
; Intrinsic used by test_vmull_lanes32.
declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
; test_vmull_laneu16: lane 1 of %arg1 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vmullu.v4i32. The FileCheck lines below expect llc to
; emit the by-lane form (vmull.u16 with a d1[1] operand).
define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
entry:
; CHECK: test_vmull_laneu16
; CHECK: vmull.u16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %arg0_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
; Intrinsic used by test_vmull_laneu16.
declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
; test_vmull_laneu32: lane 1 of %arg1 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vmullu.v2i64. The FileCheck lines below expect llc to
; emit the by-lane form (vmull.u32 with a d1[1] operand).
define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
entry:
; CHECK: test_vmull_laneu32
; CHECK: vmull.u32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %arg0_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
; Intrinsic used by test_vmull_laneu32.
declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

View File

@ -0,0 +1,47 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
; test_vqRdmulhQ_lanes16: lane 1 of the 4-element %arg1 is splatted into an 8-element
; vector and passed as the last operand of @llvm.arm.neon.vqrdmulh.v8i16. The FileCheck
; lines below expect llc to emit the by-lane form (vqrdmulh.s16 with a d2[1] operand).
define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulhQ_lanes16
; CHECK: vqrdmulh.s16 q0, q0, d2[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
%1 = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %1
}
; Intrinsic used by test_vqRdmulhQ_lanes16.
declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
; test_vqRdmulhQ_lanes32: lane 1 of the 2-element %arg1 is splatted into a 4-element
; vector and passed as the last operand of @llvm.arm.neon.vqrdmulh.v4i32. The FileCheck
; lines below expect llc to emit the by-lane form (vqrdmulh.s32 with a d2[1] operand).
define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulhQ_lanes32
; CHECK: vqrdmulh.s32 q0, q0, d2[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
; Intrinsic used by test_vqRdmulhQ_lanes32.
declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
; test_vqRdmulh_lanes16: lane 1 of %arg1 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vqrdmulh.v4i16. The FileCheck lines below expect llc to
; emit the by-lane form (vqrdmulh.s16 with a d1[1] operand).
define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulh_lanes16
; CHECK: vqrdmulh.s16 d0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
ret <4 x i16> %1
}
; Intrinsic used by test_vqRdmulh_lanes16.
declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
; test_vqRdmulh_lanes32: lane 1 of %arg1 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vqrdmulh.v2i32. The FileCheck lines below expect llc to
; emit the by-lane form (vqrdmulh.s32 with a d1[1] operand).
define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulh_lanes32
; CHECK: vqrdmulh.s32 d0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
ret <2 x i32> %1
}
; Intrinsic used by test_vqRdmulh_lanes32.
declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

View File

@ -0,0 +1,25 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
; test_vqdmlal_lanes16: lane 1 of %arg2 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vqdmlal.v4i32. The FileCheck lines below expect llc to
; emit the by-lane form (vqdmlal.s16 with a d3[1] operand).
define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmlal_lanes16
; CHECK: vqdmlal.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
; Intrinsic used by test_vqdmlal_lanes16.
declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
; test_vqdmlal_lanes32: lane 1 of %arg2 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vqdmlal.v2i64. The FileCheck lines below expect llc to
; emit the by-lane form (vqdmlal.s32 with a d3[1] operand).
define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmlal_lanes32
; CHECK: vqdmlal.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
; Intrinsic used by test_vqdmlal_lanes32.
declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

View File

@ -0,0 +1,25 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
; test_vqdmlsl_lanes16: lane 1 of %arg2 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vqdmlsl.v4i32. The FileCheck lines below expect llc to
; emit the by-lane form (vqdmlsl.s16 with a d3[1] operand).
define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmlsl_lanes16
; CHECK: vqdmlsl.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
; Intrinsic used by test_vqdmlsl_lanes16.
declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
; test_vqdmlsl_lanes32: lane 1 of %arg2 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vqdmlsl.v2i64. The FileCheck lines below expect llc to
; emit the by-lane form (vqdmlsl.s32 with a d3[1] operand).
define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmlsl_lanes32
; CHECK: vqdmlsl.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
; Intrinsic used by test_vqdmlsl_lanes32.
declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

View File

@ -0,0 +1,47 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
; test_vqdmulhQ_lanes16: lane 1 of the 4-element %arg1 is splatted into an 8-element
; vector and passed as the last operand of @llvm.arm.neon.vqdmulh.v8i16. The FileCheck
; lines below expect llc to emit the by-lane form (vqdmulh.s16 with a d2[1] operand).
define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmulhQ_lanes16
; CHECK: vqdmulh.s16 q0, q0, d2[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
%1 = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %1
}
; Intrinsic used by test_vqdmulhQ_lanes16.
declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
; test_vqdmulhQ_lanes32: lane 1 of the 2-element %arg1 is splatted into a 4-element
; vector and passed as the last operand of @llvm.arm.neon.vqdmulh.v4i32. The FileCheck
; lines below expect llc to emit the by-lane form (vqdmulh.s32 with a d2[1] operand).
define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmulhQ_lanes32
; CHECK: vqdmulh.s32 q0, q0, d2[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
; Intrinsic used by test_vqdmulhQ_lanes32.
declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
; test_vqdmulh_lanes16: lane 1 of %arg1 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vqdmulh.v4i16. The FileCheck lines below expect llc to
; emit the by-lane form (vqdmulh.s16 with a d1[1] operand).
define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmulh_lanes16
; CHECK: vqdmulh.s16 d0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
ret <4 x i16> %1
}
; Intrinsic used by test_vqdmulh_lanes16.
declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
; test_vqdmulh_lanes32: lane 1 of %arg1 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vqdmulh.v2i32. The FileCheck lines below expect llc to
; emit the by-lane form (vqdmulh.s32 with a d1[1] operand).
define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmulh_lanes32
; CHECK: vqdmulh.s32 d0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
ret <2 x i32> %1
}
; Intrinsic used by test_vqdmulh_lanes32.
declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

View File

@ -0,0 +1,25 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
; test_vqdmull_lanes16: lane 1 of %arg1 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vqdmull.v4i32. The FileCheck lines below expect llc to
; emit the by-lane form (vqdmull.s16 with a d1[1] operand).
define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmull_lanes16
; CHECK: vqdmull.s16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
; Intrinsic used by test_vqdmull_lanes16.
declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
; test_vqdmull_lanes32: lane 1 of %arg1 is splatted with shufflevector and passed as the
; last operand of @llvm.arm.neon.vqdmull.v2i64. The FileCheck lines below expect llc to
; emit the by-lane form (vqdmull.s32 with a d1[1] operand).
define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmull_lanes32
; CHECK: vqdmull.s32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
; Intrinsic used by test_vqdmull_lanes32.
declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone