Remove NEON vmull, vmlal, and vmlsl intrinsics, replacing them with multiply,
add, and subtract operations with zero-extended or sign-extended vectors. Update tests. Add auto-upgrade support for the old intrinsics. llvm-svn: 112773
parent f3a734d8ee
commit 38ab35a911
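For illustration (not part of the patch itself): a call to one of the removed intrinsics, such as llvm.arm.neon.vmulls, is now expressed in plain IR, and auto-upgraded to that form, as a pair of extends feeding a wide multiply. A minimal sketch with hypothetical value names:

  %sa = sext <8 x i8> %a to <8 x i16>
  %sb = sext <8 x i8> %b to <8 x i16>
  %r = mul <8 x i16> %sa, %sb        ; selected as vmull.s8

The vmlal/vmlsl forms add an add or sub against the accumulator on top of the same multiply, as the updated tests below show.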
@@ -129,16 +129,8 @@ let Properties = [IntrNoMem, Commutative] in {
def int_arm_neon_vmulp : Neon_2Arg_Intrinsic;
def int_arm_neon_vqdmulh : Neon_2Arg_Intrinsic;
def int_arm_neon_vqrdmulh : Neon_2Arg_Intrinsic;
def int_arm_neon_vmulls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vmullu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vmullp : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vqdmull : Neon_2Arg_Long_Intrinsic;

// Vector Multiply and Accumulate/Subtract.
def int_arm_neon_vmlals : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vmlalu : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vmlsls : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vmlslu : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vqdmlal : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic;

@@ -302,7 +294,7 @@ def int_arm_neon_vcvtfp2fxu : Neon_CvtFPToFx_Intrinsic;
def int_arm_neon_vcvtfxs2fp : Neon_CvtFxToFP_Intrinsic;
def int_arm_neon_vcvtfxu2fp : Neon_CvtFxToFP_Intrinsic;

// Narrowing and Lengthening Vector Moves.
// Narrowing Saturating Vector Moves.
def int_arm_neon_vqmovns : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovnu : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovnsu : Neon_1Arg_Narrow_Intrinsic;
@@ -326,7 +326,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)

// Neon does not support some operations on v1i64 and v2i64 types.
setOperationAction(ISD::MUL, MVT::v1i64, Expand);
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
// Custom handling for some quad-vector types to detect VMULL.
setOperationAction(ISD::MUL, MVT::v8i16, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);

@@ -684,6 +687,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VZIP: return "ARMISD::VZIP";
case ARMISD::VUZP: return "ARMISD::VUZP";
case ARMISD::VTRN: return "ARMISD::VTRN";
case ARMISD::VMULLs: return "ARMISD::VMULLs";
case ARMISD::VMULLu: return "ARMISD::VMULLu";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::FMAX: return "ARMISD::FMAX";
case ARMISD::FMIN: return "ARMISD::FMIN";

@@ -3751,6 +3756,51 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Val);
}

/// SkipExtension - For a node that is either a SIGN_EXTEND, ZERO_EXTEND, or
/// an extending load, return the unextended value.
static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
return N->getOperand(0);
LoadSDNode *LD = cast<LoadSDNode>(N);
return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(),
LD->getBasePtr(), LD->getSrcValue(),
LD->getSrcValueOffset(), LD->isVolatile(),
LD->isNonTemporal(), LD->getAlignment());
}

static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
// Multiplications are only custom-lowered for 128-bit vectors so that
// VMULL can be detected. Otherwise v2i64 multiplications are not legal.
EVT VT = Op.getValueType();
assert(VT.is128BitVector() && "unexpected type for custom-lowering ISD::MUL");
SDNode *N0 = Op.getOperand(0).getNode();
SDNode *N1 = Op.getOperand(1).getNode();
unsigned NewOpc = 0;
if ((N0->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N0)) &&
(N1->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N1))) {
NewOpc = ARMISD::VMULLs;
} else if ((N0->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N0)) &&
(N1->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N1))) {
NewOpc = ARMISD::VMULLu;
} else if (VT.getSimpleVT().SimpleTy == MVT::v2i64) {
// Fall through to expand this. It is not legal.
return SDValue();
} else {
// Other vector multiplications are legal.
return Op;
}

// Legalize to a VMULL instruction.
DebugLoc DL = Op.getDebugLoc();
SDValue Op0 = SkipExtension(N0, DAG);
SDValue Op1 = SkipExtension(N1, DAG);

assert(Op0.getValueType().is64BitVector() &&
Op1.getValueType().is64BitVector() &&
"unexpected types for extended operands to VMULL");
return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
}

SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Don't know how to custom lower this!");

@@ -3792,6 +3842,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
}
return SDValue();
}
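For context, a rough IR-level sketch (mirroring the updated vmull.ll test further down) of the pattern this custom MUL lowering is meant to recognize: a 128-bit multiply whose operands are sign- or zero-extensions of 64-bit vectors, which should now be selected as a single VMULL. The function name here is made up for illustration:

  define <8 x i16> @vmull_example(<8 x i8> %a, <8 x i8> %b) nounwind {
    %sa = sext <8 x i8> %a to <8 x i16>
    %sb = sext <8 x i8> %b to <8 x i16>
    %m = mul <8 x i16> %sa, %sb
    ret <8 x i16> %m
  }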
@@ -145,6 +145,10 @@ namespace llvm {
VUZP, // unzip (deinterleave)
VTRN, // transpose

// Vector multiply long:
VMULLs, // ...signed
VMULLu, // ...unsigned

// Operands of the standard BUILD_VECTOR node are not legalized, which
// is fine if BUILD_VECTORs are always lowered to shuffles or other
// operations, but for ARM some BUILD_VECTORs are legal as-is and their
@@ -93,6 +93,11 @@ def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>;
def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;

def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
SDTCisSameAs<1, 2>]>;
def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;

def SDTARMFMAX : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>]>;
def NEONfmax : SDNode<"ARMISD::FMAX", SDTARMFMAX>;

@@ -1255,6 +1260,42 @@ class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
[(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1),
(OpTy QPR:$src2), (OpTy QPR:$src3))))]>;

// Long Multiply-Add/Sub operations.
class N3VLMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
[(set QPR:$dst, (OpNode (TyQ QPR:$src1),
(TyQ (MulOp (TyD DPR:$src2),
(TyD DPR:$src3)))))]>;
class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
: N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst),
(ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane),
NVMulSLFrm, itin,
OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
[(set QPR:$dst,
(OpNode (TyQ QPR:$src1),
(TyQ (MulOp (TyD DPR:$src2),
(TyD (NEONvduplane (TyD DPR_VFP2:$src3),
imm:$lane))))))]>;
class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
: N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst),
(ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane),
NVMulSLFrm, itin,
OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
[(set QPR:$dst,
(OpNode (TyQ QPR:$src1),
(TyQ (MulOp (TyD DPR:$src2),
(TyD (NEONvduplane (TyD DPR_8:$src3),
imm:$lane))))))]>;

// Neon Long 3-argument intrinsic. The destination register is
// a quad-register and is also used as the first source operand register.
class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,

@@ -1306,8 +1347,37 @@ class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
// Long 3-register operations.
class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp,
bit Commutable>
ValueType TyQ, ValueType TyD, SDNode OpNode, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
OpcodeStr, Dt, "$dst, $src1, $src2", "",
[(set QPR:$dst, (TyQ (OpNode (TyD DPR:$src1), (TyD DPR:$src2))))]> {
let isCommutable = Commutable;
}
class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType TyQ, ValueType TyD, SDNode OpNode>
: N3V<op24, 1, op21_20, op11_8, 1, 0,
(outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
[(set QPR:$dst,
(TyQ (OpNode (TyD DPR:$src1),
(TyD (NEONvduplane (TyD DPR_VFP2:$src2),imm:$lane)))))]>;
class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType TyQ, ValueType TyD, SDNode OpNode>
: N3V<op24, 1, op21_20, op11_8, 1, 0,
(outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
[(set QPR:$dst,
(TyQ (OpNode (TyD DPR:$src1),
(TyD (NEONvduplane (TyD DPR_8:$src2), imm:$lane)))))]>;

// Long 3-register operations with explicitly extended operands.
class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp,
bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
OpcodeStr, Dt, "$dst, $src1, $src2", "",

@@ -1729,16 +1799,40 @@ multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4,
multiclass N3VL_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itin16, InstrItinClass itin32,
string OpcodeStr, string Dt,
SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16,
OpcodeStr, !strconcat(Dt, "16"),
v4i32, v4i16, OpNode, ExtOp, Commutable>;
def v2i64 : N3VL<op24, op23, 0b10, op11_8, op4, itin32,
OpcodeStr, !strconcat(Dt, "32"),
v2i64, v2i32, OpNode, ExtOp, Commutable>;
SDNode OpNode, bit Commutable = 0> {
def v8i16 : N3VL<op24, op23, 0b00, op11_8, op4, itin16,
OpcodeStr, !strconcat(Dt, "8"),
v8i16, v8i8, OpNode, ExtOp, Commutable>;
v8i16, v8i8, OpNode, Commutable>;
def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16,
OpcodeStr, !strconcat(Dt, "16"),
v4i32, v4i16, OpNode, Commutable>;
def v2i64 : N3VL<op24, op23, 0b10, op11_8, op4, itin32,
OpcodeStr, !strconcat(Dt, "32"),
v2i64, v2i32, OpNode, Commutable>;
}

multiclass N3VLSL_HS<bit op24, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
SDNode OpNode> {
def v4i16 : N3VLSL16<op24, 0b01, op11_8, itin, OpcodeStr,
!strconcat(Dt, "16"), v4i32, v4i16, OpNode>;
def v2i32 : N3VLSL<op24, 0b10, op11_8, itin, OpcodeStr,
!strconcat(Dt, "32"), v2i64, v2i32, OpNode>;
}

multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itin16, InstrItinClass itin32,
string OpcodeStr, string Dt,
SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
def v8i16 : N3VLExt<op24, op23, 0b00, op11_8, op4, itin16,
OpcodeStr, !strconcat(Dt, "8"),
v8i16, v8i8, OpNode, ExtOp, Commutable>;
def v4i32 : N3VLExt<op24, op23, 0b01, op11_8, op4, itin16,
OpcodeStr, !strconcat(Dt, "16"),
v4i32, v4i16, OpNode, ExtOp, Commutable>;
def v2i64 : N3VLExt<op24, op23, 0b10, op11_8, op4, itin32,
OpcodeStr, !strconcat(Dt, "32"),
v2i64, v2i32, OpNode, ExtOp, Commutable>;
}

// Neon Long 3-register vector intrinsics.

@@ -1857,6 +1951,29 @@ multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
}

// Neon Long Multiply-Op vector operations,
// element sizes of 8, 16 and 32 bits:
multiclass N3VLMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itin16, InstrItinClass itin32,
string OpcodeStr, string Dt, SDNode MulOp,
SDNode OpNode> {
def v8i16 : N3VLMulOp<op24, op23, 0b00, op11_8, op4, itin16, OpcodeStr,
!strconcat(Dt, "8"), v8i16, v8i8, MulOp, OpNode>;
def v4i32 : N3VLMulOp<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr,
!strconcat(Dt, "16"), v4i32, v4i16, MulOp, OpNode>;
def v2i64 : N3VLMulOp<op24, op23, 0b10, op11_8, op4, itin32, OpcodeStr,
!strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>;
}

multiclass N3VLMulOpSL_HS<bit op24, bits<4> op11_8, string OpcodeStr,
string Dt, SDNode MulOp, SDNode OpNode> {
def v4i16 : N3VLMulOpSL16<op24, 0b01, op11_8, IIC_VMACi16D, OpcodeStr,
!strconcat(Dt,"16"), v4i32, v4i16, MulOp, OpNode>;
def v2i32 : N3VLMulOpSL<op24, 0b10, op11_8, IIC_VMACi32D, OpcodeStr,
!strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>;
}

// Neon Long 3-argument intrinsics.

// First with only element sizes of 16 and 32 bits:

@@ -2130,10 +2247,10 @@ def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32",
def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32",
v4f32, v4f32, fadd, 1>;
// VADDL : Vector Add Long (Q = D + D)
defm VADDLs : N3VL_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
"vaddl", "s", add, sext, 1>;
defm VADDLu : N3VL_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
"vaddl", "u", add, zext, 1>;
defm VADDLs : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
"vaddl", "s", add, sext, 1>;
defm VADDLu : N3VLExt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
"vaddl", "u", add, zext, 1>;
// VADDW : Vector Add Wide (Q = Q + D)
defm VADDWs : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>;
defm VADDWu : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zext, 0>;

@@ -2247,16 +2364,14 @@ def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
(SubReg_i32_lane imm:$lane)))>;

// VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D)
defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
"vmull", "s", int_arm_neon_vmulls, 1>;
defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
"vmull", "u", int_arm_neon_vmullu, 1>;
defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
"vmull", "s", NEONvmulls, 1>;
defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
"vmull", "u", NEONvmullu, 1>;
def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
v8i16, v8i8, int_arm_neon_vmullp, 1>;
defm VMULLsls : N3VLIntSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s",
int_arm_neon_vmulls>;
defm VMULLslu : N3VLIntSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u",
int_arm_neon_vmullu>;
defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;

// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D)
defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D,

@@ -2306,13 +2421,13 @@ def : Pat<(v4f32 (fadd (v4f32 QPR:$src1),
(SubReg_i32_lane imm:$lane)))>;

// VMLAL : Vector Multiply Accumulate Long (Q += D * D)
defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
"vmlal", "s", int_arm_neon_vmlals>;
defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
"vmlal", "u", int_arm_neon_vmlalu>;
defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
"vmlal", "s", NEONvmulls, add>;
defm VMLALu : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
"vmlal", "u", NEONvmullu, add>;

defm VMLALsls : N3VLInt3SL_HS<0, 0b0010, "vmlal", "s", int_arm_neon_vmlals>;
defm VMLALslu : N3VLInt3SL_HS<1, 0b0010, "vmlal", "u", int_arm_neon_vmlalu>;
defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>;
defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;

// VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,

@@ -2358,13 +2473,13 @@ def : Pat<(v4f32 (fsub (v4f32 QPR:$src1),
(SubReg_i32_lane imm:$lane)))>;

// VMLSL : Vector Multiply Subtract Long (Q -= D * D)
defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
"vmlsl", "s", int_arm_neon_vmlsls>;
defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
"vmlsl", "u", int_arm_neon_vmlslu>;
defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
"vmlsl", "s", NEONvmulls, sub>;
defm VMLSLu : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
"vmlsl", "u", NEONvmullu, sub>;

defm VMLSLsls : N3VLInt3SL_HS<0, 0b0110, "vmlsl", "s", int_arm_neon_vmlsls>;
defm VMLSLslu : N3VLInt3SL_HS<1, 0b0110, "vmlsl", "u", int_arm_neon_vmlslu>;
defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>;
defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;

// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,

@@ -2381,10 +2496,10 @@ def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32",
def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32",
v4f32, v4f32, fsub, 0>;
// VSUBL : Vector Subtract Long (Q = D - D)
defm VSUBLs : N3VL_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
"vsubl", "s", sub, sext, 0>;
defm VSUBLu : N3VL_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
"vsubl", "u", sub, zext, 0>;
defm VSUBLs : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
"vsubl", "s", sub, sext, 0>;
defm VSUBLu : N3VLExt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
"vsubl", "u", sub, zext, 0>;
// VSUBW : Vector Subtract Wide (Q = Q - D)
defm VSUBWs : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>;
defm VSUBWu : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zext, 0>;
@@ -90,6 +90,12 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
(Name.compare(19, 2, "s.", 2) == 0 ||
Name.compare(19, 2, "u.", 2) == 0)) ||

((Name.compare(14, 5, "vmull", 5) == 0 ||
Name.compare(14, 5, "vmlal", 5) == 0 ||
Name.compare(14, 5, "vmlsl", 5) == 0) &&
(Name.compare(19, 2, "s.", 2) == 0 ||
Name.compare(19, 2, "u.", 2) == 0)) ||

(Name.compare(14, 6, "vmovn.", 6) == 0)) {

// Calls to these are transformed into IR without intrinsics.

@@ -359,6 +365,32 @@ bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn) {
return Upgraded;
}

/// ExtendNEONArgs - For NEON "long" and "wide" operations, where the results
/// have vector elements twice as big as one or both source operands, do the
/// sign- or zero-extension that used to be handled by intrinsics.  The
/// extended values are returned via V0 and V1.
static void ExtendNEONArgs(CallInst *CI, Value *Arg0, Value *Arg1,
Value *&V0, Value *&V1) {
Function *F = CI->getCalledFunction();
const std::string& Name = F->getName();
bool isLong = (Name.at(18) == 'l');
bool isSigned = (Name.at(19) == 's');

if (isSigned) {
if (isLong)
V0 = new SExtInst(Arg0, CI->getType(), "", CI);
else
V0 = Arg0;
V1 = new SExtInst(Arg1, CI->getType(), "", CI);
} else {
if (isLong)
V0 = new ZExtInst(Arg0, CI->getType(), "", CI);
else
V0 = Arg0;
V1 = new ZExtInst(Arg1, CI->getType(), "", CI);
}
}

// UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the
// upgraded intrinsic. All argument and return casting must be provided in
// order to seamlessly integrate with existing context.

@@ -376,33 +408,32 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// Upgrade ARM NEON intrinsics.
if (Name.compare(5, 9, "arm.neon.", 9) == 0) {
Instruction *NewI;
Value *V0, *V1;
if (Name.compare(14, 7, "vmovls.", 7) == 0) {
NewI = new SExtInst(CI->getArgOperand(0), CI->getType(),
"upgraded." + CI->getName(), CI);
} else if (Name.compare(14, 7, "vmovlu.", 7) == 0) {
NewI = new ZExtInst(CI->getArgOperand(0), CI->getType(),
"upgraded." + CI->getName(), CI);

} else if (Name.compare(14, 4, "vadd", 4) == 0 ||
Name.compare(14, 4, "vsub", 4) == 0) {
// Extend one (vaddw/vsubw) or both (vaddl/vsubl) operands.
Value *V0 = CI->getArgOperand(0);
Value *V1 = CI->getArgOperand(1);
if (Name.at(19) == 's') {
if (Name.at(18) == 'l')
V0 = new SExtInst(CI->getArgOperand(0), CI->getType(), "", CI);
V1 = new SExtInst(CI->getArgOperand(1), CI->getType(), "", CI);
} else {
assert(Name.at(19) == 'u' && "unexpected vadd/vsub intrinsic");
if (Name.at(18) == 'l')
V0 = new ZExtInst(CI->getArgOperand(0), CI->getType(), "", CI);
V1 = new ZExtInst(CI->getArgOperand(1), CI->getType(), "", CI);
}
if (Name.compare(14, 4, "vadd", 4) == 0)
NewI = BinaryOperator::CreateAdd(V0, V1,"upgraded."+CI->getName(),CI);
else
NewI = BinaryOperator::CreateSub(V0, V1,"upgraded."+CI->getName(),CI);

} else if (Name.compare(14, 4, "vadd", 4) == 0) {
ExtendNEONArgs(CI, CI->getArgOperand(0), CI->getArgOperand(1), V0, V1);
NewI = BinaryOperator::CreateAdd(V0, V1, "upgraded."+CI->getName(), CI);
} else if (Name.compare(14, 4, "vsub", 4) == 0) {
ExtendNEONArgs(CI, CI->getArgOperand(0), CI->getArgOperand(1), V0, V1);
NewI = BinaryOperator::CreateSub(V0, V1,"upgraded."+CI->getName(),CI);
} else if (Name.compare(14, 4, "vmul", 4) == 0) {
ExtendNEONArgs(CI, CI->getArgOperand(0), CI->getArgOperand(1), V0, V1);
NewI = BinaryOperator::CreateMul(V0, V1,"upgraded."+CI->getName(),CI);
} else if (Name.compare(14, 4, "vmla", 4) == 0) {
ExtendNEONArgs(CI, CI->getArgOperand(1), CI->getArgOperand(2), V0, V1);
Instruction *MulI = BinaryOperator::CreateMul(V0, V1, "", CI);
NewI = BinaryOperator::CreateAdd(CI->getArgOperand(0), MulI,
"upgraded."+CI->getName(), CI);
} else if (Name.compare(14, 4, "vmls", 4) == 0) {
ExtendNEONArgs(CI, CI->getArgOperand(1), CI->getArgOperand(2), V0, V1);
Instruction *MulI = BinaryOperator::CreateMul(V0, V1, "", CI);
NewI = BinaryOperator::CreateSub(CI->getArgOperand(0), MulI,
"upgraded."+CI->getName(), CI);
} else if (Name.compare(14, 6, "vmovn.", 6) == 0) {
NewI = new TruncInst(CI->getArgOperand(0), CI->getType(),
"upgraded." + CI->getName(), CI);
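For reference, the hard-coded offsets in the name checks above index into the full intrinsic name. Taking llvm.arm.neon.vmulls.v8i16 as a worked example: offset 5 is where "arm.neon." begins, offset 14 is where the operation name ("vmull") begins, offset 18 is the trailing 'l' that marks a long operation, and offset 19 is the 's' or 'u' signedness marker that ExtendNEONArgs inspects.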
@@ -52,7 +52,7 @@
; CHECK: zext <4 x i16>
; CHECK-NEXT: add <4 x i32>

; vsubl/vsubw should be auto-upgraded to sub with sext/zext
; vsubl/vsubw should be auto-upgraded to subtract with sext/zext

; CHECK: vsubls16
; CHECK-NOT: arm.neon.vsubls.v4i32

@@ -76,6 +76,56 @@
; CHECK: zext <4 x i16>
; CHECK-NEXT: sub <4 x i32>

; vmull should be auto-upgraded to multiply with sext/zext
; (but vmullp should remain an intrinsic)

; CHECK: vmulls8
; CHECK-NOT: arm.neon.vmulls.v8i16
; CHECK: sext <8 x i8>
; CHECK-NEXT: sext <8 x i8>
; CHECK-NEXT: mul <8 x i16>

; CHECK: vmullu16
; CHECK-NOT: arm.neon.vmullu.v4i32
; CHECK: zext <4 x i16>
; CHECK-NEXT: zext <4 x i16>
; CHECK-NEXT: mul <4 x i32>

; CHECK: vmullp8
; CHECK: arm.neon.vmullp.v8i16

; vmlal should be auto-upgraded to multiply/add with sext/zext

; CHECK: vmlals32
; CHECK-NOT: arm.neon.vmlals.v2i64
; CHECK: sext <2 x i32>
; CHECK-NEXT: sext <2 x i32>
; CHECK-NEXT: mul <2 x i64>
; CHECK-NEXT: add <2 x i64>

; CHECK: vmlalu8
; CHECK-NOT: arm.neon.vmlalu.v8i16
; CHECK: zext <8 x i8>
; CHECK-NEXT: zext <8 x i8>
; CHECK-NEXT: mul <8 x i16>
; CHECK-NEXT: add <8 x i16>

; vmlsl should be auto-upgraded to multiply/sub with sext/zext

; CHECK: vmlsls16
; CHECK-NOT: arm.neon.vmlsls.v4i32
; CHECK: sext <4 x i16>
; CHECK-NEXT: sext <4 x i16>
; CHECK-NEXT: mul <4 x i32>
; CHECK-NEXT: sub <4 x i32>

; CHECK: vmlslu32
; CHECK-NOT: arm.neon.vmlslu.v2i64
; CHECK: zext <2 x i32>
; CHECK-NEXT: zext <2 x i32>
; CHECK-NEXT: mul <2 x i64>
; CHECK-NEXT: sub <2 x i64>

; vmovn should be auto-upgraded to trunc

; CHECK: vmovni16

Binary file not shown.
@@ -94,8 +94,11 @@ define <8 x i16> @vmlals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
%tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
%tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
%tmp6 = mul <8 x i16> %tmp4, %tmp5
%tmp7 = add <8 x i16> %tmp1, %tmp6
ret <8 x i16> %tmp7
}

define <4 x i32> @vmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {

@@ -104,8 +107,11 @@ define <4 x i32> @vmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
%tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
%tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
%tmp6 = mul <4 x i32> %tmp4, %tmp5
%tmp7 = add <4 x i32> %tmp1, %tmp6
ret <4 x i32> %tmp7
}

define <2 x i64> @vmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {

@@ -114,8 +120,11 @@ define <2 x i64> @vmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
%tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
%tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
%tmp6 = mul <2 x i64> %tmp4, %tmp5
%tmp7 = add <2 x i64> %tmp1, %tmp6
ret <2 x i64> %tmp7
}

define <8 x i16> @vmlalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {

@@ -124,8 +133,11 @@ define <8 x i16> @vmlalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
%tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
%tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
%tmp6 = mul <8 x i16> %tmp4, %tmp5
%tmp7 = add <8 x i16> %tmp1, %tmp6
ret <8 x i16> %tmp7
}

define <4 x i32> @vmlalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {

@@ -134,8 +146,11 @@ define <4 x i32> @vmlalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
%tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
%tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
%tmp6 = mul <4 x i32> %tmp4, %tmp5
%tmp7 = add <4 x i32> %tmp1, %tmp6
ret <4 x i32> %tmp7
}

define <2 x i64> @vmlalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {

@@ -144,8 +159,11 @@ define <2 x i64> @vmlalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
%tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
%tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
%tmp6 = mul <2 x i64> %tmp4, %tmp5
%tmp7 = add <2 x i64> %tmp1, %tmp6
ret <2 x i64> %tmp7
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {

@@ -153,8 +171,11 @@ entry:
; CHECK: test_vmlal_lanes16
; CHECK: vmlal.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
%1 = sext <4 x i16> %arg1_int16x4_t to <4 x i32>
%2 = sext <4 x i16> %0 to <4 x i32>
%3 = mul <4 x i32> %1, %2
%4 = add <4 x i32> %arg0_int32x4_t, %3
ret <4 x i32> %4
}

define arm_aapcs_vfpcc <2 x i64> @test_vmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {

@@ -162,8 +183,11 @@ entry:
; CHECK: test_vmlal_lanes32
; CHECK: vmlal.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
%1 = sext <2 x i32> %arg1_int32x2_t to <2 x i64>
%2 = sext <2 x i32> %0 to <2 x i64>
%3 = mul <2 x i64> %1, %2
%4 = add <2 x i64> %arg0_int64x2_t, %3
ret <2 x i64> %4
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlal_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {

@@ -171,8 +195,11 @@ entry:
; CHECK: test_vmlal_laneu16
; CHECK: vmlal.u16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
%1 = zext <4 x i16> %arg1_uint16x4_t to <4 x i32>
%2 = zext <4 x i16> %0 to <4 x i32>
%3 = mul <4 x i32> %1, %2
%4 = add <4 x i32> %arg0_uint32x4_t, %3
ret <4 x i32> %4
}

define arm_aapcs_vfpcc <2 x i64> @test_vmlal_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {

@@ -180,14 +207,9 @@ entry:
; CHECK: test_vmlal_laneu32
; CHECK: vmlal.u32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
%1 = zext <2 x i32> %arg1_uint32x2_t to <2 x i64>
%2 = zext <2 x i32> %0 to <2 x i64>
%3 = mul <2 x i64> %1, %2
%4 = add <2 x i64> %arg0_uint64x2_t, %3
ret <2 x i64> %4
}

declare <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
@@ -94,8 +94,11 @@ define <8 x i16> @vmlsls8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
%tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
%tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
%tmp6 = mul <8 x i16> %tmp4, %tmp5
%tmp7 = sub <8 x i16> %tmp1, %tmp6
ret <8 x i16> %tmp7
}

define <4 x i32> @vmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {

@@ -104,8 +107,11 @@ define <4 x i32> @vmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
%tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
%tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
%tmp6 = mul <4 x i32> %tmp4, %tmp5
%tmp7 = sub <4 x i32> %tmp1, %tmp6
ret <4 x i32> %tmp7
}

define <2 x i64> @vmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {

@@ -114,8 +120,11 @@ define <2 x i64> @vmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
%tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
%tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
%tmp6 = mul <2 x i64> %tmp4, %tmp5
%tmp7 = sub <2 x i64> %tmp1, %tmp6
ret <2 x i64> %tmp7
}

define <8 x i16> @vmlslu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {

@@ -124,8 +133,11 @@ define <8 x i16> @vmlslu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
%tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
%tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
%tmp6 = mul <8 x i16> %tmp4, %tmp5
%tmp7 = sub <8 x i16> %tmp1, %tmp6
ret <8 x i16> %tmp7
}

define <4 x i32> @vmlslu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {

@@ -134,8 +146,11 @@ define <4 x i32> @vmlslu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
%tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
%tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
%tmp6 = mul <4 x i32> %tmp4, %tmp5
%tmp7 = sub <4 x i32> %tmp1, %tmp6
ret <4 x i32> %tmp7
}

define <2 x i64> @vmlslu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {

@@ -144,8 +159,11 @@ define <2 x i64> @vmlslu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
%tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
%tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
%tmp6 = mul <2 x i64> %tmp4, %tmp5
%tmp7 = sub <2 x i64> %tmp1, %tmp6
ret <2 x i64> %tmp7
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {

@@ -153,8 +171,11 @@ entry:
; CHECK: test_vmlsl_lanes16
; CHECK: vmlsl.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
%1 = sext <4 x i16> %arg1_int16x4_t to <4 x i32>
%2 = sext <4 x i16> %0 to <4 x i32>
%3 = mul <4 x i32> %1, %2
%4 = sub <4 x i32> %arg0_int32x4_t, %3
ret <4 x i32> %4
}

define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {

@@ -162,8 +183,11 @@ entry:
; CHECK: test_vmlsl_lanes32
; CHECK: vmlsl.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
%1 = sext <2 x i32> %arg1_int32x2_t to <2 x i64>
%2 = sext <2 x i32> %0 to <2 x i64>
%3 = mul <2 x i64> %1, %2
%4 = sub <2 x i64> %arg0_int64x2_t, %3
ret <2 x i64> %4
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {

@@ -171,8 +195,11 @@ entry:
; CHECK: test_vmlsl_laneu16
; CHECK: vmlsl.u16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
%1 = zext <4 x i16> %arg1_uint16x4_t to <4 x i32>
%2 = zext <4 x i16> %0 to <4 x i32>
%3 = mul <4 x i32> %1, %2
%4 = sub <4 x i32> %arg0_uint32x4_t, %3
ret <4 x i32> %4
}

define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {

@@ -180,14 +207,9 @@ entry:
; CHECK: test_vmlsl_laneu32
; CHECK: vmlsl.u32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
%1 = zext <2 x i32> %arg1_uint32x2_t to <2 x i64>
%2 = zext <2 x i32> %0 to <2 x i64>
%3 = mul <2 x i64> %1, %2
%4 = sub <2 x i64> %arg0_uint64x2_t, %3
ret <2 x i64> %4
}

declare <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
@@ -152,8 +152,10 @@ define <8 x i16> @vmulls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmull.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
%tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
%tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
%tmp5 = mul <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
}

define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {

@@ -161,8 +163,10 @@ define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vmull.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
%tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
%tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
%tmp5 = mul <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
}

define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {

@@ -170,8 +174,10 @@ define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vmull.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
%tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
%tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
%tmp5 = mul <2 x i64> %tmp3, %tmp4
ret <2 x i64> %tmp5
}

define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {

@@ -179,8 +185,10 @@ define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmull.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
%tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
%tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
%tmp5 = mul <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
}

define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {

@@ -188,8 +196,10 @@ define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vmull.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
%tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
%tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
%tmp5 = mul <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
}

define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {

@@ -197,8 +207,10 @@ define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vmull.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
%tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
%tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
%tmp5 = mul <2 x i64> %tmp3, %tmp4
ret <2 x i64> %tmp5
}

define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {

@@ -215,8 +227,10 @@ entry:
; CHECK: test_vmull_lanes16
; CHECK: vmull.s16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
%1 = sext <4 x i16> %arg0_int16x4_t to <4 x i32>
%2 = sext <4 x i16> %0 to <4 x i32>
%3 = mul <4 x i32> %1, %2
ret <4 x i32> %3
}

define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {

@@ -224,8 +238,10 @@ entry:
; CHECK: test_vmull_lanes32
; CHECK: vmull.s32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
%1 = sext <2 x i32> %arg0_int32x2_t to <2 x i64>
%2 = sext <2 x i32> %0 to <2 x i64>
%3 = mul <2 x i64> %1, %2
ret <2 x i64> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {

@@ -233,8 +249,10 @@ entry:
; CHECK: test_vmull_laneu16
; CHECK: vmull.u16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %arg0_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
%1 = zext <4 x i16> %arg0_uint16x4_t to <4 x i32>
%2 = zext <4 x i16> %0 to <4 x i32>
%3 = mul <4 x i32> %1, %2
ret <4 x i32> %3
}

define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {

@@ -242,16 +260,10 @@ entry:
; CHECK: test_vmull_laneu32
; CHECK: vmull.u32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %arg0_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
%1 = zext <2 x i32> %arg0_uint32x2_t to <2 x i64>
%2 = zext <2 x i32> %0 to <2 x i64>
%3 = mul <2 x i64> %1, %2
ret <2 x i64> %3
}

declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone