Remove NEON vaddl, vaddw, vsubl, and vsubw intrinsics. Instead, use LLVM IR
add/sub operations with one or both operands sign- or zero-extended.
Auto-upgrade the old intrinsics.

llvm-svn: 112416
This commit is contained in:
Bob Wilson 2010-08-29 05:57:34 +00:00
parent b0a6de9e8f
commit d0c054886c
7 changed files with 225 additions and 124 deletions

View File

@ -73,10 +73,6 @@ let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
[LLVMTruncatedElementVectorType<0>,
LLVMTruncatedElementVectorType<0>],
[IntrNoMem]>;
class Neon_2Arg_Wide_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMTruncatedElementVectorType<0>],
[IntrNoMem]>;
class Neon_3Arg_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
@ -128,10 +124,6 @@ let Properties = [IntrNoMem, Commutative] in {
def int_arm_neon_vqaddu : Neon_2Arg_Intrinsic;
def int_arm_neon_vaddhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vraddhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vaddls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vaddlu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vaddws : Neon_2Arg_Wide_Intrinsic;
def int_arm_neon_vaddwu : Neon_2Arg_Wide_Intrinsic;
// Vector Multiply.
def int_arm_neon_vmulp : Neon_2Arg_Intrinsic;
@ -172,10 +164,6 @@ def int_arm_neon_vqsubs : Neon_2Arg_Intrinsic;
def int_arm_neon_vqsubu : Neon_2Arg_Intrinsic;
def int_arm_neon_vsubhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vsubls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vsublu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vsubws : Neon_2Arg_Wide_Intrinsic;
def int_arm_neon_vsubwu : Neon_2Arg_Wide_Intrinsic;
// Vector Absolute Compare.
let TargetPrefix = "arm" in {

View File

@ -1294,6 +1294,19 @@ class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
let isCommutable = Commutable;
}
// Long 3-register operations.
// Pattern-based instruction class with a Q-register result and two D-register
// sources.
//   itin        - itinerary class forwarded to N3V.
//   OpcodeStr   - mnemonic string forwarded to N3V.
//   Dt          - data-type suffix string forwarded to N3V.
//   TyQ / TyD   - value types used in the pattern for the Q-sized values and
//                 the D-register operands, respectively.
//   OpNode      - node applied to the two extended operands.
//   ExtOp       - node applied to each D operand; its result is typed as TyQ
//                 before being passed to OpNode.
//   Commutable  - copied into isCommutable.
class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp,
bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
OpcodeStr, Dt, "$dst, $src1, $src2", "",
// Selection pattern: $dst = OpNode(ExtOp($src1), ExtOp($src2)).
[(set QPR:$dst, (OpNode (TyQ (ExtOp (TyD DPR:$src1))),
(TyQ (ExtOp (TyD DPR:$src2)))))]> {
let isCommutable = Commutable;
}
// Long 3-register intrinsics.
class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
@ -1325,14 +1338,15 @@ class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
(OpTy (NEONvduplane (OpTy DPR_8:$src2),
imm:$lane)))))]>;
// Wide 3-register intrinsics.
class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD,
Intrinsic IntOp, bit Commutable>
// Wide 3-register operations.
// Pattern-based instruction class with a Q-register result, a Q-register first
// source and a D-register second source. Only the second source is passed
// through ExtOp; the first is used as a TyQ value directly.
//   TyQ / TyD   - value types for the Q-sized values and the D operand.
//   OpNode      - node applied to $src1 and the extended $src2.
//   ExtOp       - node applied to the D operand to produce a TyQ value.
//   Commutable  - copied into isCommutable.
// The itinerary is fixed to IIC_VSUBiD for every instantiation.
// NOTE(review): two pattern lines follow the asm-string line in this view. The
// first one references IntOp, which is not a parameter of this class; it looks
// like the pre-change line kept by the diff rendering -- confirm against the
// real .td file.
class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD,
SDNode OpNode, SDNode ExtOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs QPR:$dst), (ins QPR:$src1, DPR:$src2), N3RegFrm, IIC_VSUBiD,
OpcodeStr, Dt, "$dst, $src1, $src2", "",
[(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2))))]> {
[(set QPR:$dst, (OpNode (TyQ QPR:$src1),
(TyQ (ExtOp (TyD DPR:$src2)))))]> {
let isCommutable = Commutable;
}
@ -1684,6 +1698,23 @@ multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4,
}
// Neon Long 3-register vector operations.
// Instantiates N3VL three times, one per result/source type pair:
//   v4i32 : op21_20 = 0b01, Dt suffix "16", v4i32 <- v4i16, itinerary itin16
//   v2i64 : op21_20 = 0b10, Dt suffix "32", v2i64 <- v2i32, itinerary itin32
//   v8i16 : op21_20 = 0b00, Dt suffix "8",  v8i16 <- v8i8,  itinerary itin16
// The Dt suffix is appended to the caller-supplied Dt prefix with !strconcat.
// OpNode, ExtOp and Commutable are forwarded unchanged; Commutable defaults
// to 0 when the caller omits it.
multiclass N3VL_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itin16, InstrItinClass itin32,
string OpcodeStr, string Dt,
SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16,
OpcodeStr, !strconcat(Dt, "16"),
v4i32, v4i16, OpNode, ExtOp, Commutable>;
def v2i64 : N3VL<op24, op23, 0b10, op11_8, op4, itin32,
OpcodeStr, !strconcat(Dt, "32"),
v2i64, v2i32, OpNode, ExtOp, Commutable>;
// The 8-bit source variant reuses itin16; there is no separate 8-bit
// itinerary parameter.
def v8i16 : N3VL<op24, op23, 0b00, op11_8, op4, itin16,
OpcodeStr, !strconcat(Dt, "8"),
v8i16, v8i8, OpNode, ExtOp, Commutable>;
}
// Neon Long 3-register vector intrinsics.
// First with only element sizes of 16 and 32 bits:
@ -1723,18 +1754,18 @@ multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
// Neon Wide 3-register vector intrinsics,
// source operand element sizes of 8, 16 and 32 bits:
multiclass N3VWInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr, string Dt,
Intrinsic IntOp, bit Commutable = 0> {
def v8i16 : N3VWInt<op24, op23, 0b00, op11_8, op4,
OpcodeStr, !strconcat(Dt, "8"),
v8i16, v8i8, IntOp, Commutable>;
def v4i32 : N3VWInt<op24, op23, 0b01, op11_8, op4,
OpcodeStr, !strconcat(Dt, "16"),
v4i32, v4i16, IntOp, Commutable>;
def v2i64 : N3VWInt<op24, op23, 0b10, op11_8, op4,
OpcodeStr, !strconcat(Dt, "32"),
v2i64, v2i32, IntOp, Commutable>;
// Neon Wide 3-register vector operations.
// Instantiates N3VW three times, one per result/narrow-source type pair:
//   v8i16 : op21_20 = 0b00, Dt suffix "8",  TyQ = v8i16, TyD = v8i8
//   v4i32 : op21_20 = 0b01, Dt suffix "16", TyQ = v4i32, TyD = v4i16
//   v2i64 : op21_20 = 0b10, Dt suffix "32", TyQ = v2i64, TyD = v2i32
// The Dt suffix is appended to the caller-supplied Dt prefix with !strconcat.
// OpNode, ExtOp and Commutable are forwarded unchanged; Commutable defaults
// to 0 when the caller omits it.
multiclass N3VW_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr, string Dt,
SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
def v8i16 : N3VW<op24, op23, 0b00, op11_8, op4,
OpcodeStr, !strconcat(Dt, "8"),
v8i16, v8i8, OpNode, ExtOp, Commutable>;
def v4i32 : N3VW<op24, op23, 0b01, op11_8, op4,
OpcodeStr, !strconcat(Dt, "16"),
v4i32, v4i16, OpNode, ExtOp, Commutable>;
def v2i64 : N3VW<op24, op23, 0b10, op11_8, op4,
OpcodeStr, !strconcat(Dt, "32"),
v2i64, v2i32, OpNode, ExtOp, Commutable>;
}
@ -2073,13 +2104,13 @@ def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32",
def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32",
v4f32, v4f32, fadd, 1>;
// VADDL : Vector Add Long (Q = D + D)
defm VADDLs : N3VLInt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
"vaddl", "s", int_arm_neon_vaddls, 1>;
defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
"vaddl", "u", int_arm_neon_vaddlu, 1>;
defm VADDLs : N3VL_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
"vaddl", "s", add, sext, 1>;
defm VADDLu : N3VL_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
"vaddl", "u", add, zext, 1>;
// VADDW : Vector Add Wide (Q = Q + D)
defm VADDWs : N3VWInt_QHS<0,1,0b0001,0, "vaddw", "s", int_arm_neon_vaddws, 0>;
defm VADDWu : N3VWInt_QHS<1,1,0b0001,0, "vaddw", "u", int_arm_neon_vaddwu, 0>;
defm VADDWs : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>;
defm VADDWu : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zext, 0>;
// VHADD : Vector Halving Add
defm VHADDs : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm,
IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
@ -2324,13 +2355,13 @@ def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32",
def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32",
v4f32, v4f32, fsub, 0>;
// VSUBL : Vector Subtract Long (Q = D - D)
defm VSUBLs : N3VLInt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
"vsubl", "s", int_arm_neon_vsubls, 1>;
defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
"vsubl", "u", int_arm_neon_vsublu, 1>;
defm VSUBLs : N3VL_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
"vsubl", "s", sub, sext, 0>;
defm VSUBLu : N3VL_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
"vsubl", "u", sub, zext, 0>;
// VSUBW : Vector Subtract Wide (Q = Q - D)
defm VSUBWs : N3VWInt_QHS<0,1,0b0011,0, "vsubw", "s", int_arm_neon_vsubws, 0>;
defm VSUBWu : N3VWInt_QHS<1,1,0b0011,0, "vsubw", "u", int_arm_neon_vsubwu, 0>;
defm VSUBWs : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>;
defm VSUBWu : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zext, 0>;
// VHSUB : Vector Halving Subtract
defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm,
IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
@ -2559,7 +2590,7 @@ def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ,
defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q,
"vabdl", "s", int_arm_neon_vabdls, 0>;
defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q,
"vabdl", "u", int_arm_neon_vabdlu, 0>;
"vabdl", "u", int_arm_neon_vabdlu, 0>;
// VABA : Vector Absolute Difference and Accumulate
defm VABAs : N3VInt3_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,

View File

@ -79,8 +79,17 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
return true;
}
} else if (Name.compare(5, 9, "arm.neon.", 9) == 0) {
if (Name.compare(14, 7, "vmovls.", 7) == 0 ||
Name.compare(14, 7, "vmovlu.", 7) == 0) {
if (((Name.compare(14, 5, "vmovl", 5) == 0 ||
Name.compare(14, 5, "vaddl", 5) == 0 ||
Name.compare(14, 5, "vsubl", 5) == 0) &&
(Name.compare(19, 2, "s.", 2) == 0 ||
Name.compare(19, 2, "u.", 2) == 0)) ||
((Name.compare(14, 5, "vaddw", 5) == 0 ||
Name.compare(14, 5, "vsubw", 5) == 0) &&
(Name.compare(19, 2, "s.", 2) == 0 ||
Name.compare(19, 2, "u.", 2) == 0))) {
// Calls to these are transformed into IR without intrinsics.
NewFn = 0;
return true;
@ -371,6 +380,27 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
} else if (Name.compare(14, 7, "vmovlu.", 7) == 0) {
NewI = new ZExtInst(CI->getArgOperand(0), CI->getType(),
"upgraded." + CI->getName(), CI);
} else if (Name.compare(14, 4, "vadd", 4) == 0 ||
Name.compare(14, 4, "vsub", 4) == 0) {
// Extend one (vaddw/vsubw) or both (vaddl/vsubl) operands.
Value *V0 = CI->getArgOperand(0);
Value *V1 = CI->getArgOperand(1);
if (Name.at(19) == 's') {
if (Name.at(18) == 'l')
V0 = new SExtInst(CI->getArgOperand(0), CI->getType(), "", CI);
V1 = new SExtInst(CI->getArgOperand(1), CI->getType(), "", CI);
} else {
assert(Name.at(19) == 'u' && "unexpected vadd/vsub intrinsic");
if (Name.at(18) == 'l')
V0 = new ZExtInst(CI->getArgOperand(0), CI->getType(), "", CI);
V1 = new ZExtInst(CI->getArgOperand(1), CI->getType(), "", CI);
}
if (Name.compare(14, 4, "vadd", 4) == 0)
NewI = BinaryOperator::CreateAdd(V0, V1,"upgraded."+CI->getName(),CI);
else
NewI = BinaryOperator::CreateSub(V0, V1,"upgraded."+CI->getName(),CI);
} else {
llvm_unreachable("Unknown arm.neon function for CallInst upgrade.");
}

View File

@ -28,6 +28,54 @@
; CHECK-NOT: arm.neon.vmovlu.v2i64
; CHECK: zext <2 x i32>
; vaddl/vaddw should be auto-upgraded to add with sext/zext
; CHECK: vaddls16
; CHECK-NOT: arm.neon.vaddls.v4i32
; CHECK: sext <4 x i16>
; CHECK-NEXT: sext <4 x i16>
; CHECK-NEXT: add <4 x i32>
; CHECK: vaddlu32
; CHECK-NOT: arm.neon.vaddlu.v2i64
; CHECK: zext <2 x i32>
; CHECK-NEXT: zext <2 x i32>
; CHECK-NEXT: add <2 x i64>
; CHECK: vaddws8
; CHECK-NOT: arm.neon.vaddws.v8i16
; CHECK: sext <8 x i8>
; CHECK-NEXT: add <8 x i16>
; CHECK: vaddwu16
; CHECK-NOT: arm.neon.vaddwu.v4i32
; CHECK: zext <4 x i16>
; CHECK-NEXT: add <4 x i32>
; vsubl/vsubw should be auto-upgraded to sub with sext/zext
; CHECK: vsubls16
; CHECK-NOT: arm.neon.vsubls.v4i32
; CHECK: sext <4 x i16>
; CHECK-NEXT: sext <4 x i16>
; CHECK-NEXT: sub <4 x i32>
; CHECK: vsublu32
; CHECK-NOT: arm.neon.vsublu.v2i64
; CHECK: zext <2 x i32>
; CHECK-NEXT: zext <2 x i32>
; CHECK-NEXT: sub <2 x i64>
; CHECK: vsubws8
; CHECK-NOT: arm.neon.vsubws.v8i16
; CHECK: sext <8 x i8>
; CHECK-NEXT: sub <8 x i16>
; CHECK: vsubwu16
; CHECK-NOT: arm.neon.vsubwu.v4i32
; CHECK: zext <4 x i16>
; CHECK-NEXT: sub <4 x i32>
; vld* and vst* intrinsic calls need an alignment argument (defaulted to 1)
; CHECK: vld1i8

View File

@ -157,8 +157,10 @@ define <8 x i16> @vaddls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vaddl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
%tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
%tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
%tmp5 = add <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
}
define <4 x i32> @vaddls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
@ -166,8 +168,10 @@ define <4 x i32> @vaddls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vaddl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
%tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
%tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
%tmp5 = add <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
}
define <2 x i64> @vaddls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
@ -175,8 +179,10 @@ define <2 x i64> @vaddls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vaddl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
%tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
%tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
%tmp5 = add <2 x i64> %tmp3, %tmp4
ret <2 x i64> %tmp5
}
define <8 x i16> @vaddlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@ -184,8 +190,10 @@ define <8 x i16> @vaddlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vaddl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
%tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
%tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
%tmp5 = add <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
}
define <4 x i32> @vaddlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
@ -193,8 +201,10 @@ define <4 x i32> @vaddlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vaddl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
%tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
%tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
%tmp5 = add <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
}
define <2 x i64> @vaddlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
@ -202,25 +212,20 @@ define <2 x i64> @vaddlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vaddl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
%tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
%tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
%tmp5 = add <2 x i64> %tmp3, %tmp4
ret <2 x i64> %tmp5
}
declare <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
define <8 x i16> @vaddws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK: vaddws8:
;CHECK: vaddw.s8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
%tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
%tmp4 = add <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @vaddws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
@ -228,8 +233,9 @@ define <4 x i32> @vaddws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK: vaddw.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
%tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
%tmp4 = add <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @vaddws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
@ -237,8 +243,9 @@ define <2 x i64> @vaddws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK: vaddw.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
%tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
%tmp4 = add <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
define <8 x i16> @vaddwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
@ -246,8 +253,9 @@ define <8 x i16> @vaddwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK: vaddw.u8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
%tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
%tmp4 = add <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @vaddwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
@ -255,8 +263,9 @@ define <4 x i32> @vaddwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK: vaddw.u16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
%tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
%tmp4 = add <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @vaddwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
@ -264,14 +273,7 @@ define <2 x i64> @vaddwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK: vaddw.u32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
%tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
%tmp4 = add <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone

View File

@ -157,8 +157,10 @@ define <8 x i16> @vsubls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsubl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
%tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
%tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
%tmp5 = sub <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
}
define <4 x i32> @vsubls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
@ -166,8 +168,10 @@ define <4 x i32> @vsubls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsubl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
%tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
%tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
%tmp5 = sub <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
}
define <2 x i64> @vsubls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
@ -175,8 +179,10 @@ define <2 x i64> @vsubls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsubl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
%tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
%tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
%tmp5 = sub <2 x i64> %tmp3, %tmp4
ret <2 x i64> %tmp5
}
define <8 x i16> @vsublu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@ -184,8 +190,10 @@ define <8 x i16> @vsublu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsubl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
%tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
%tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
%tmp5 = sub <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
}
define <4 x i32> @vsublu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
@ -193,8 +201,10 @@ define <4 x i32> @vsublu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsubl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
%tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
%tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
%tmp5 = sub <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
}
define <2 x i64> @vsublu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
@ -202,25 +212,20 @@ define <2 x i64> @vsublu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsubl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
%tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
%tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
%tmp5 = sub <2 x i64> %tmp3, %tmp4
ret <2 x i64> %tmp5
}
declare <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
define <8 x i16> @vsubws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsubws8:
;CHECK: vsubw.s8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
%tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
%tmp4 = sub <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @vsubws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
@ -228,8 +233,9 @@ define <4 x i32> @vsubws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsubw.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
%tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
%tmp4 = sub <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @vsubws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
@ -237,8 +243,9 @@ define <2 x i64> @vsubws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsubw.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
%tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
%tmp4 = sub <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
define <8 x i16> @vsubwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
@ -246,8 +253,9 @@ define <8 x i16> @vsubwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsubw.u8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
%tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
%tmp4 = sub <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @vsubwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
@ -255,8 +263,9 @@ define <4 x i32> @vsubwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsubw.u16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
%tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
%tmp4 = sub <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @vsubwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
@ -264,14 +273,7 @@ define <2 x i64> @vsubwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsubw.u32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
%tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
%tmp4 = sub <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone