[AArch64] Generate [SU]RHADD from (b - (~a)) >> 1

Summary:
Teach LLVM to recognize the above pattern, which DAGCombine usually
produces from (a + b + 1) >> 1, for operands of either signed or
unsigned type.
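
For reference, the two forms are equivalent because in two's complement
~a == -a - 1, and therefore b - (~a) == a + b + 1. Below is a minimal C
sketch (illustrative only, not part of the patch; all names are made up)
that checks the widened sub/xor form against the reference rounding
halving add for every pair of i8 inputs:

  #include <assert.h>
  #include <stdint.h>

  /* Reference semantics of URHADD: (a + b + 1) >> 1, computed without
     overflow by widening the 8-bit operands to 16 bits first. */
  static uint8_t urhadd_ref(uint8_t a, uint8_t b) {
    return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) >> 1);
  }

  /* The combined form this patch matches: xor with all ones at the
     widened width computes ~zext(a), and the sub wraps mod 2^16 back
     to a + b + 1 before the logical shift by 1. */
  static uint8_t urhadd_subxor(uint8_t a, uint8_t b) {
    uint16_t not_a = (uint16_t)a ^ 0xFFFFu;          /* xor(zext(a), Ones(16)) */
    uint16_t diff = (uint16_t)((uint16_t)b - not_a); /* == a + b + 1 (mod 2^16) */
    return (uint8_t)(diff >> 1);
  }

  int main(void) {
    for (unsigned a = 0; a < 256; ++a)
      for (unsigned b = 0; b < 256; ++b)
        assert(urhadd_ref((uint8_t)a, (uint8_t)b) ==
               urhadd_subxor((uint8_t)a, (uint8_t)b));
    return 0;
  }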

Subscribers: kristof.beyls, hiraditya, danielkiss, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D82669
commit af80a4353e (parent b3b952873f)
Author: Petre-Ionut Tudor
Date:   2020-06-03 15:28:43 +01:00

4 changed files with 483 additions and 54 deletions

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

@@ -838,6 +838,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
      setOperationAction(ISD::TRUNCATE, VT, Custom);
    }

    for (MVT VT : { MVT::v4f16, MVT::v2f32,
                    MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
@@ -1432,6 +1434,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
    MAKE_CASE(AArch64ISD::FCMLTz)
    MAKE_CASE(AArch64ISD::SADDV)
    MAKE_CASE(AArch64ISD::UADDV)
    MAKE_CASE(AArch64ISD::SRHADD)
    MAKE_CASE(AArch64ISD::URHADD)
    MAKE_CASE(AArch64ISD::SMINV)
    MAKE_CASE(AArch64ISD::UMINV)
    MAKE_CASE(AArch64ISD::SMAXV)
@@ -3260,6 +3264,14 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
    return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
                       Op.getOperand(3));
  }
  case Intrinsic::aarch64_neon_srhadd:
  case Intrinsic::aarch64_neon_urhadd: {
    bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd;
    unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  }
  }
}
@@ -3524,6 +3536,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
    return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::VSCALE:
    return LowerVSCALE(Op, DAG);
  case ISD::TRUNCATE:
    return LowerTRUNCATE(Op, DAG);
  case ISD::LOAD:
    if (useSVEForFixedLengthVectorVT(Op.getValueType()))
      return LowerFixedLengthVectorLoadToSVE(Op, DAG);
@@ -8773,6 +8787,78 @@ static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}
// Attempt to form urhadd(OpA, OpB) from
// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)).
// The original form of this expression is
// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function
// is called the srl will have been lowered to AArch64ISD::VLSHR and the
// ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)).
// This function can also recognize a variant of this pattern that uses sign
// extension instead of zero extension and form a srhadd(OpA, OpB) from it.
SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (!VT.isVector() || VT.isScalableVector())
    return Op;

  // Since we are looking for a right shift by a constant value of 1 and we are
  // operating on types at least 16 bits in length (sign/zero extended OpA and
  // OpB, which are at least 8 bits), it follows that the truncate will always
  // discard the shifted-in bit and therefore the right shift will be logical
  // regardless of the signedness of OpA and OpB.
  SDValue Shift = Op.getOperand(0);
  if (Shift.getOpcode() != AArch64ISD::VLSHR)
    return Op;

  // Is the right shift using an immediate value of 1?
  uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
  if (ShiftAmount != 1)
    return Op;

  SDValue Sub = Shift->getOperand(0);
  if (Sub.getOpcode() != ISD::SUB)
    return Op;

  SDValue Xor = Sub.getOperand(1);
  if (Xor.getOpcode() != ISD::XOR)
    return Op;

  SDValue ExtendOpA = Xor.getOperand(0);
  SDValue ExtendOpB = Sub.getOperand(0);
  unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
  unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
  if (!(ExtendOpAOpc == ExtendOpBOpc &&
        (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
    return Op;

  // Is the result of the right shift being truncated to the same value type as
  // the original operands, OpA and OpB?
  SDValue OpA = ExtendOpA.getOperand(0);
  SDValue OpB = ExtendOpB.getOperand(0);
  EVT OpAVT = OpA.getValueType();
  assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
  if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
    return Op;

  // Is the XOR using a constant amount of all ones in the right hand side?
  uint64_t C;
  if (!isAllConstantBuildVector(Xor.getOperand(1), C))
    return Op;
  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
  APInt CAsAPInt(ElemSizeInBits, C);
  if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
    return Op;

  SDLoc DL(Op);
  bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
  unsigned RHADDOpc = IsSignExtend ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
  return DAG.getNode(RHADDOpc, DL, VT, OpA, OpB);
}
SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
                                                      SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
@@ -10982,6 +11068,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
  unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();

  // Optimize concat_vectors of truncated vectors, where the intermediate
  // type is illegal, to avoid said illegality, e.g.,

@@ -10994,9 +11081,8 @@ static SDValue performConcatVectorsCombine(SDNode *N,
  // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
  // on both input and result type, so we might generate worse code.
  // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
  if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
      N1Opc == ISD::TRUNCATE) {
    SDValue N00 = N0->getOperand(0);
    SDValue N10 = N1->getOperand(0);
    EVT N00VT = N00.getValueType();
@@ -11021,6 +11107,52 @@ static SDValue performConcatVectorsCombine(SDNode *N,
  if (DCI.isBeforeLegalizeOps())
    return SDValue();
  // Optimise concat_vectors of two [us]rhadds that use extracted subvectors
  // from the same original vectors. Combine these into a single [us]rhadd that
  // operates on the two original vectors. Example:
  //  (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
  //                                        extract_subvector (v16i8 OpB, <0>))),
  //                         (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
  //                                        extract_subvector (v16i8 OpB, <8>)))))
  // ->
  //  (v16i8 (urhadd (v16i8 OpA, v16i8 OpB)))
  if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
      (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD)) {
    SDValue N00 = N0->getOperand(0);
    SDValue N01 = N0->getOperand(1);
    SDValue N10 = N1->getOperand(0);
    SDValue N11 = N1->getOperand(1);

    EVT N00VT = N00.getValueType();
    EVT N10VT = N10.getValueType();

    if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
      SDValue N00Source = N00->getOperand(0);
      SDValue N01Source = N01->getOperand(0);
      SDValue N10Source = N10->getOperand(0);
      SDValue N11Source = N11->getOperand(0);

      if (N00Source == N10Source && N01Source == N11Source &&
          N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
        assert(N0.getValueType() == N1.getValueType());

        uint64_t N00Index = N00.getConstantOperandVal(1);
        uint64_t N01Index = N01.getConstantOperandVal(1);
        uint64_t N10Index = N10.getConstantOperandVal(1);
        uint64_t N11Index = N11.getConstantOperandVal(1);

        if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
            N10Index == N00VT.getVectorNumElements())
          return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
      }
    }
  }
  // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
  // splat. The indexed instructions are going to be expecting a DUPLANE64, so
  // canonicalise to that.

@@ -11039,7 +11171,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
  // becomes
  //  (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
  if (N1Opc != ISD::BITCAST)
    return SDValue();
  SDValue RHS = N1->getOperand(0);
  MVT RHSTy = RHS.getValueType().getSimpleVT();

llvm/lib/Target/AArch64/AArch64ISelLowering.h

@@ -187,6 +187,10 @@ enum NodeType : unsigned {
  SADDV,
  UADDV,

  // Vector rounding halving addition
  SRHADD,
  URHADD,

  // Vector across-lanes min/max
  // Only the lower result lane is defined.
  SMINV,
@@ -863,6 +867,7 @@ private:
  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;

llvm/lib/Target/AArch64/AArch64InstrInfo.td

@@ -554,6 +554,9 @@ def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;
def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;

def AArch64srhadd : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>;
def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>;

def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
@@ -4073,7 +4076,7 @@ defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrd
defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", AArch64srhadd>;
defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
@@ -4090,7 +4093,7 @@ defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>;
defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",

llvm/test/CodeGen/AArch64/arm64-vhadd.ll

@@ -1,8 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s

define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: shadd8b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: shadd.8b v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)

@@ -10,8 +15,12 @@ define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}

define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: shadd16b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: shadd.16b v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <16 x i8>, <16 x i8>* %A
%tmp2 = load <16 x i8>, <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)

@@ -19,8 +28,12 @@ define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}

define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: shadd4h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: shadd.4h v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)

@@ -28,8 +41,12 @@ define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}

define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: shadd8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: shadd.8h v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, <8 x i16>* %A
%tmp2 = load <8 x i16>, <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)

@@ -37,8 +54,12 @@ define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}

define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: shadd2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: shadd.2s v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, <2 x i32>* %A
%tmp2 = load <2 x i32>, <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)

@@ -46,8 +67,12 @@ define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}

define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: shadd4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: shadd.4s v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, <4 x i32>* %A
%tmp2 = load <4 x i32>, <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)

@@ -55,8 +80,12 @@ define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}

define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: uhadd8b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: uhadd.8b v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)

@@ -64,8 +93,12 @@ define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}

define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: uhadd16b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uhadd.16b v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <16 x i8>, <16 x i8>* %A
%tmp2 = load <16 x i8>, <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)

@@ -73,8 +106,12 @@ define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}

define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: uhadd4h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: uhadd.4h v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)

@@ -82,8 +119,12 @@ define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}

define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: uhadd8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uhadd.8h v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, <8 x i16>* %A
%tmp2 = load <8 x i16>, <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)

@@ -91,8 +132,12 @@ define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}

define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: uhadd2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: uhadd.2s v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, <2 x i32>* %A
%tmp2 = load <2 x i32>, <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)

@@ -100,8 +145,12 @@ define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}

define <4 x i32> @uhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: uhadd4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uhadd.4s v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, <4 x i32>* %A
%tmp2 = load <4 x i32>, <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)

@@ -125,8 +174,12 @@ declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) nounwind
declare <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: srhadd8b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: srhadd.8b v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)

@@ -134,8 +187,12 @@ define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}

define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: srhadd16b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: srhadd.16b v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <16 x i8>, <16 x i8>* %A
%tmp2 = load <16 x i8>, <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)

@@ -143,8 +200,12 @@ define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}

define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: srhadd4h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: srhadd.4h v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)

@@ -152,8 +213,12 @@ define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}

define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: srhadd8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: srhadd.8h v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, <8 x i16>* %A
%tmp2 = load <8 x i16>, <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)

@@ -161,8 +226,12 @@ define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}

define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: srhadd2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: srhadd.2s v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, <2 x i32>* %A
%tmp2 = load <2 x i32>, <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)

@@ -170,8 +239,12 @@ define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}

define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: srhadd4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: srhadd.4s v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, <4 x i32>* %A
%tmp2 = load <4 x i32>, <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)

@@ -179,8 +252,12 @@ define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}

define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: urhadd8b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: urhadd.8b v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)

@@ -188,8 +265,12 @@ define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}

define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: urhadd16b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: urhadd.16b v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <16 x i8>, <16 x i8>* %A
%tmp2 = load <16 x i8>, <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)

@@ -197,8 +278,12 @@ define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}

define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: urhadd4h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: urhadd.4h v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)

@@ -206,8 +291,12 @@ define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}

define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: urhadd8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: urhadd.8h v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, <8 x i16>* %A
%tmp2 = load <8 x i16>, <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)

@@ -215,8 +304,12 @@ define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}

define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: urhadd2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: urhadd.2s v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, <2 x i32>* %A
%tmp2 = load <2 x i32>, <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)

@@ -224,14 +317,210 @@ define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}

define <4 x i32> @urhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: urhadd4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: urhadd.4s v0, v0, v1
; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, <4 x i32>* %A
%tmp2 = load <4 x i32>, <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}

define void @testLowerToSRHADD8b(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD8b:
; CHECK: // %bb.0:
; CHECK-NEXT: srhadd.8b v0, v0, v1
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
%sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
%sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
%add1 = add <8 x i16> %sextsrc1, %sextsrc2
%add2 = add <8 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%resulti16 = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%result = trunc <8 x i16> %resulti16 to <8 x i8>
store <8 x i8> %result, <8 x i8>* %dest, align 8
ret void
}
define void @testLowerToSRHADD4h(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD4h:
; CHECK: // %bb.0:
; CHECK-NEXT: srhadd.4h v0, v0, v1
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
%sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
%sextsrc2 = sext <4 x i16> %src2 to <4 x i32>
%add1 = add <4 x i32> %sextsrc1, %sextsrc2
%add2 = add <4 x i32> %add1, <i32 1, i32 1, i32 1, i32 1>
%resulti16 = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
%result = trunc <4 x i32> %resulti16 to <4 x i16>
store <4 x i16> %result, <4 x i16>* %dest, align 8
ret void
}
define void @testLowerToSRHADD2s(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD2s:
; CHECK: // %bb.0:
; CHECK-NEXT: srhadd.2s v0, v0, v1
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
%sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
%sextsrc2 = sext <2 x i32> %src2 to <2 x i64>
%add1 = add <2 x i64> %sextsrc1, %sextsrc2
%add2 = add <2 x i64> %add1, <i64 1, i64 1>
%resulti16 = lshr <2 x i64> %add2, <i64 1, i64 1>
%result = trunc <2 x i64> %resulti16 to <2 x i32>
store <2 x i32> %result, <2 x i32>* %dest, align 8
ret void
}
define void @testLowerToSRHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD16b:
; CHECK: // %bb.0:
; CHECK-NEXT: srhadd.16b v0, v0, v1
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
%sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
%sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
%add1 = add <16 x i16> %sextsrc1, %sextsrc2
%add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%result = trunc <16 x i16> %resulti16 to <16 x i8>
store <16 x i8> %result, <16 x i8>* %dest, align 16
ret void
}
define void @testLowerToSRHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD8h:
; CHECK: // %bb.0:
; CHECK-NEXT: srhadd.8h v0, v0, v1
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
%sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
%sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
%add1 = add <8 x i32> %sextsrc1, %sextsrc2
%add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%result = trunc <8 x i32> %resulti16 to <8 x i16>
store <8 x i16> %result, <8 x i16>* %dest, align 16
ret void
}
define void @testLowerToSRHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD4s:
; CHECK: // %bb.0:
; CHECK-NEXT: srhadd.4s v0, v0, v1
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
%sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
%sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
%add1 = add <4 x i64> %sextsrc1, %sextsrc2
%add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
%resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
%result = trunc <4 x i64> %resulti16 to <4 x i32>
store <4 x i32> %result, <4 x i32>* %dest, align 16
ret void
}
define void @testLowerToURHADD8b(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD8b:
; CHECK: // %bb.0:
; CHECK-NEXT: urhadd.8b v0, v0, v1
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
%zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
%zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
%add1 = add <8 x i16> %zextsrc1, %zextsrc2
%add2 = add <8 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%resulti16 = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%result = trunc <8 x i16> %resulti16 to <8 x i8>
store <8 x i8> %result, <8 x i8>* %dest, align 8
ret void
}
define void @testLowerToURHADD4h(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD4h:
; CHECK: // %bb.0:
; CHECK-NEXT: urhadd.4h v0, v0, v1
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
%zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
%zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
%add1 = add <4 x i32> %zextsrc1, %zextsrc2
%add2 = add <4 x i32> %add1, <i32 1, i32 1, i32 1, i32 1>
%resulti16 = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
%result = trunc <4 x i32> %resulti16 to <4 x i16>
store <4 x i16> %result, <4 x i16>* %dest, align 8
ret void
}
define void @testLowerToURHADD2s(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD2s:
; CHECK: // %bb.0:
; CHECK-NEXT: urhadd.2s v0, v0, v1
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
%zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
%zextsrc2 = zext <2 x i32> %src2 to <2 x i64>
%add1 = add <2 x i64> %zextsrc1, %zextsrc2
%add2 = add <2 x i64> %add1, <i64 1, i64 1>
%resulti16 = lshr <2 x i64> %add2, <i64 1, i64 1>
%result = trunc <2 x i64> %resulti16 to <2 x i32>
store <2 x i32> %result, <2 x i32>* %dest, align 8
ret void
}
define void @testLowerToURHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD16b:
; CHECK: // %bb.0:
; CHECK-NEXT: urhadd.16b v0, v0, v1
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
%zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
%zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
%add1 = add <16 x i16> %zextsrc1, %zextsrc2
%add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%result = trunc <16 x i16> %resulti16 to <16 x i8>
store <16 x i8> %result, <16 x i8>* %dest, align 16
ret void
}
define void @testLowerToURHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD8h:
; CHECK: // %bb.0:
; CHECK-NEXT: urhadd.8h v0, v0, v1
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
%zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
%zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
%add1 = add <8 x i32> %zextsrc1, %zextsrc2
%add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%result = trunc <8 x i32> %resulti16 to <8 x i16>
store <8 x i16> %result, <8 x i16>* %dest, align 16
ret void
}
define void @testLowerToURHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD4s:
; CHECK: // %bb.0:
; CHECK-NEXT: urhadd.4s v0, v0, v1
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
%zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
%zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
%add1 = add <4 x i64> %zextsrc1, %zextsrc2
%add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
%resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
%result = trunc <4 x i64> %resulti16 to <4 x i32>
store <4 x i32> %result, <4 x i32>* %dest, align 16
ret void
}

declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone