From ea6ebbcfb39b6f5bc88cfa07fa2e5c304137b98b Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Mon, 14 Feb 2022 11:55:40 +0000
Subject: [PATCH] [ARM] MVE hadd and rhadd

This uses the AVGFLOORS/AVGFLOORU/AVGCEILS/AVGCEILU nodes from D106237 to
add MVE lowering for the HADD and RHADD (vhadd/vrhadd) instructions.

Differential Revision: https://reviews.llvm.org/D106238
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp |   8 +
 llvm/lib/Target/ARM/ARMInstrMVE.td      |  87 ++--
 llvm/test/CodeGen/Thumb2/mve-vhadd.ll   | 592 +++---------------------
 3 files changed, 118 insertions(+), 569 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index ce62979de13f..34add2c7a811 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -273,6 +273,10 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
     setOperationAction(ISD::USUBSAT, VT, Legal);
     setOperationAction(ISD::ABDS, VT, Legal);
     setOperationAction(ISD::ABDU, VT, Legal);
+    setOperationAction(ISD::AVGFLOORS, VT, Legal);
+    setOperationAction(ISD::AVGFLOORU, VT, Legal);
+    setOperationAction(ISD::AVGCEILS, VT, Legal);
+    setOperationAction(ISD::AVGCEILU, VT, Legal);
 
     // No native support for these.
     setOperationAction(ISD::UDIV, VT, Expand);
@@ -15353,6 +15357,10 @@ static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) {
   case ISD::MULHU:
   case ISD::ABDS:
   case ISD::ABDU:
+  case ISD::AVGFLOORS:
+  case ISD::AVGFLOORU:
+  case ISD::AVGCEILS:
+  case ISD::AVGCEILU:
     break;
   default:
     return SDValue();
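The hooks above only mark ISD::AVGFLOORS/AVGFLOORU/AVGCEILS/AVGCEILU as Legal
for the MVE vector types; it is the target-independent DAGCombine from D106237
that actually forms the nodes. As a rough sketch (this function and its name
are illustrative, not part of the patch), the idiom being recognised looks
like:

; Illustrative sketch, not a test from this patch: the zext/add/lshr/trunc
; idiom below is folded by DAGCombine into a single ISD::AVGFLOORU node,
; which the tablegen patterns in the next file select to vhadd.u8.
define arm_aapcs_vfpcc <16 x i8> @hadd_u8_idiom(<16 x i8> %a, <16 x i8> %b) {
entry:
  %ae = zext <16 x i8> %a to <16 x i16>
  %be = zext <16 x i8> %b to <16 x i16>
  %sum = add <16 x i16> %ae, %be
  %half = lshr <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = trunc <16 x i16> %half to <16 x i8>
  ret <16 x i8> %res
}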
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 1ae0354ffc37..15c33014e988 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2192,36 +2192,29 @@ def subnsw : PatFrag<(ops node:$lhs, node:$rhs),
   return N->getFlags().hasNoSignedWrap();
 }]>;
 
-multiclass MVE_VRHADD_m<MVEVectorVTInfo VTI,
-                        SDNode unpred_op, Intrinsic pred_int> {
+multiclass MVE_VRHADD_m<MVEVectorVTInfo VTI, SDNode Op,
+                        SDNode unpred_op, Intrinsic PredInt> {
   def "" : MVE_VRHADD_Base<VTI.Suffix, VTI.Unsigned, VTI.Size>;
   defvar Inst = !cast<Instruction>(NAME);
+  defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 VTI.Unsigned)), !cast<Instruction>(NAME)>;
 
   let Predicates = [HasMVEInt] in {
-    // Unpredicated rounding add-with-divide-by-two
+    // Unpredicated rounding add-with-divide-by-two intrinsic
     def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
                                   (i32 VTI.Unsigned))),
              (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
-
-    // Predicated add-with-divide-by-two
-    def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                                 (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
-                                 (VTI.Vec MQPR:$inactive))),
-              (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                             ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
-                             (VTI.Vec MQPR:$inactive)))>;
   }
 }
 
-multiclass MVE_VRHADD<MVEVectorVTInfo VTI>
-  : MVE_VRHADD_m<VTI, int_arm_mve_vrhadd, int_arm_mve_rhadd_predicated>;
+multiclass MVE_VRHADD<MVEVectorVTInfo VTI, SDNode rhadd>
+  : MVE_VRHADD_m<VTI, rhadd, int_arm_mve_vrhadd, int_arm_mve_rhadd_predicated>;
 
-defm MVE_VRHADDs8  : MVE_VRHADD<MVE_v16s8>;
-defm MVE_VRHADDs16 : MVE_VRHADD<MVE_v8s16>;
-defm MVE_VRHADDs32 : MVE_VRHADD<MVE_v4s32>;
-defm MVE_VRHADDu8  : MVE_VRHADD<MVE_v16u8>;
-defm MVE_VRHADDu16 : MVE_VRHADD<MVE_v8u16>;
-defm MVE_VRHADDu32 : MVE_VRHADD<MVE_v4u32>;
+defm MVE_VRHADDs8  : MVE_VRHADD<MVE_v16s8, avgceils>;
+defm MVE_VRHADDs16 : MVE_VRHADD<MVE_v8s16, avgceils>;
+defm MVE_VRHADDs32 : MVE_VRHADD<MVE_v4s32, avgceils>;
+defm MVE_VRHADDu8  : MVE_VRHADD<MVE_v16u8, avgceilu>;
+defm MVE_VRHADDu16 : MVE_VRHADD<MVE_v8u16, avgceilu>;
+defm MVE_VRHADDu32 : MVE_VRHADD<MVE_v4u32, avgceilu>;
 
 // Rounding Halving Add performs the arithmetic operation with an extra bit of
 // precision, before performing the shift, to avoid clipping errors. We're not
@@ -2303,11 +2296,12 @@ class MVE_VHSUB_<string suffix, bit U, bits<2> size,
                  list<dag> pattern=[]>
   : MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>;
 
-multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
-                       SDNode unpred_op, Intrinsic pred_int, PatFrag add_op,
+multiclass MVE_VHADD_m<MVEVectorVTInfo VTI, SDNode Op,
+                       SDNode unpred_op, Intrinsic PredInt, PatFrag add_op,
                        SDNode shift_op> {
   def "" : MVE_VHADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
   defvar Inst = !cast<Instruction>(NAME);
+  defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 VTI.Unsigned)), !cast<Instruction>(NAME)>;
 
   let Predicates = [HasMVEInt] in {
     // Unpredicated add-and-divide-by-two
@@ -2316,30 +2310,23 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
                                   (i32 VTI.Unsigned))),
              (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
 
     def : Pat<(VTI.Vec (shift_op (add_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)),
                                  (i32 1))),
              (Inst MQPR:$Qm, MQPR:$Qn)>;
-
-    // Predicated add-and-divide-by-two
-    def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned),
-                                 (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
-              (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                             ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
-                             (VTI.Vec MQPR:$inactive)))>;
   }
 }
 
-multiclass MVE_VHADD<MVEVectorVTInfo VTI, PatFrag add_op, SDNode shift_op>
-  : MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated, add_op,
-                shift_op>;
+multiclass MVE_VHADD<MVEVectorVTInfo VTI, SDNode Op, PatFrag add_op,
+                     SDNode shift_op>
+  : MVE_VHADD_m<VTI, Op, int_arm_mve_vhadd, int_arm_mve_hadd_predicated,
+                add_op, shift_op>;
 
 // Halving add/sub perform the arithmetic operation with an extra bit of
 // precision, before performing the shift, to avoid clipping errors. We're not
 // modelling that here with these patterns, but we're using no wrap forms of
 // add/sub to ensure that the extra bit of information is not needed.
-defm MVE_VHADDs8  : MVE_VHADD<MVE_v16s8, addnsw, ARMvshrsImm>;
-defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16, addnsw, ARMvshrsImm>;
-defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32, addnsw, ARMvshrsImm>;
-defm MVE_VHADDu8  : MVE_VHADD<MVE_v16u8, addnuw, ARMvshruImm>;
-defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16, addnuw, ARMvshruImm>;
-defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32, addnuw, ARMvshruImm>;
+defm MVE_VHADDs8  : MVE_VHADD<MVE_v16s8, avgfloors, addnsw, ARMvshrsImm>;
+defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16, avgfloors, addnsw, ARMvshrsImm>;
+defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32, avgfloors, addnsw, ARMvshrsImm>;
+defm MVE_VHADDu8  : MVE_VHADD<MVE_v16u8, avgflooru, addnuw, ARMvshruImm>;
+defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16, avgflooru, addnuw, ARMvshruImm>;
+defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32, avgflooru, addnuw, ARMvshruImm>;
 
 multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
                        SDNode unpred_op, Intrinsic pred_int, PatFrag sub_op,
@@ -5361,8 +5348,9 @@
 }
 
-multiclass MVE_VHADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
-                             Intrinsic unpred_int, Intrinsic pred_int,
-                             PatFrag add_op, SDNode shift_op> {
+multiclass MVE_VHADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
+                             SDNode Op, Intrinsic unpred_int, Intrinsic pred_int,
+                             PatFrag add_op, SDNode shift_op> {
   def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, subtract>;
+  defm : MVE_TwoOpPatternDup<VTI, Op, pred_int, (? (i32 VTI.Unsigned)), !cast<Instruction>(NAME)>;
   defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, unpred_int, pred_int, 1, 1>;
   defvar Inst = !cast<Instruction>(NAME);
@@ -5386,20 +5373,20 @@ multiclass MVE_VHADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
 
-multiclass MVE_VHADD_qr_m<MVEVectorVTInfo VTI, PatFrag add_op, SDNode shift_op> :
-  MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd, int_arm_mve_hadd_predicated,
-                    add_op, shift_op>;
+multiclass MVE_VHADD_qr_m<MVEVectorVTInfo VTI, SDNode Op, PatFrag add_op, SDNode shift_op> :
+  MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, Op, int_arm_mve_vhadd,
+                    int_arm_mve_hadd_predicated, add_op, shift_op>;
 
 multiclass MVE_VHSUB_qr_m<MVEVectorVTInfo VTI, PatFrag add_op, SDNode shift_op> :
-  MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub, int_arm_mve_hsub_predicated,
-                    add_op, shift_op>;
+  MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, null_frag, int_arm_mve_vhsub,
+                    int_arm_mve_hsub_predicated, add_op, shift_op>;
 
-defm MVE_VHADD_qr_s8  : MVE_VHADD_qr_m<MVE_v16s8, addnsw, ARMvshrsImm>;
-defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m<MVE_v8s16, addnsw, ARMvshrsImm>;
-defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m<MVE_v4s32, addnsw, ARMvshrsImm>;
-defm MVE_VHADD_qr_u8  : MVE_VHADD_qr_m<MVE_v16u8, addnuw, ARMvshruImm>;
-defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m<MVE_v8u16, addnuw, ARMvshruImm>;
-defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m<MVE_v4u32, addnuw, ARMvshruImm>;
+defm MVE_VHADD_qr_s8  : MVE_VHADD_qr_m<MVE_v16s8, avgfloors, addnsw, ARMvshrsImm>;
+defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m<MVE_v8s16, avgfloors, addnsw, ARMvshrsImm>;
+defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m<MVE_v4s32, avgfloors, addnsw, ARMvshrsImm>;
+defm MVE_VHADD_qr_u8  : MVE_VHADD_qr_m<MVE_v16u8, avgflooru, addnuw, ARMvshruImm>;
+defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m<MVE_v8u16, avgflooru, addnuw, ARMvshruImm>;
+defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m<MVE_v4u32, avgflooru, addnuw, ARMvshruImm>;
 
 defm MVE_VHSUB_qr_s8  : MVE_VHSUB_qr_m<MVE_v16s8, subnsw, ARMvshrsImm>;
 defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m<MVE_v8s16, subnsw, ARMvshrsImm>;
diff --git a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
index e67afc494369..3b3c5c704a59 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
@@ -4,41 +4,7 @@
 define arm_aapcs_vfpcc <4 x i32> @vhadds_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
 ; CHECK-LABEL: vhadds_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d9}
-; CHECK-NEXT:    vpush {d9}
-; CHECK-NEXT:    vmov.f32 s8, s2
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.f32 s10, s3
-; CHECK-NEXT:    vmov.f32 s14, s1
-; CHECK-NEXT:    vmov.f32 s18, s5
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.f32 s8, s6
-; CHECK-NEXT:    vmov.f32 s6, s7
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    asrs r1, r0, #31
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r1, r1, r2, asr
#31 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: asr.w r12, r1, #31 -; CHECK-NEXT: adc.w r1, r12, r3, asr #31 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r2, asr #31 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: asr.w r12, r1, #31 -; CHECK-NEXT: adc.w r1, r12, r3, asr #31 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 -; CHECK-NEXT: vpop {d9} +; CHECK-NEXT: vhadd.s32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = sext <4 x i32> %s0 to <4 x i64> @@ -52,45 +18,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @vhaddu_v4i32(<4 x i32> %s0, <4 x i32> %s1) { ; CHECK-LABEL: vhaddu_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.i64 q4, #0xffffffff -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vand q2, q2, q4 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vand q3, q3, q4 -; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vand q4, q0, q4 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, r2, d8 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, r12, d2 -; CHECK-NEXT: adds r4, r3, r1 -; CHECK-NEXT: adc.w r1, r2, r12 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: lsrl r4, r1, #1 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, r4, d9 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, r12, d3 -; CHECK-NEXT: adds r2, r3, r1 -; CHECK-NEXT: adc.w r1, r4, r12 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vhadd.u32 q0, q0, q1 +; CHECK-NEXT: bx lr entry: %s0s = zext <4 x i32> %s0 to <4 x i64> %s1s = zext <4 x i32> %s1 to <4 x i64> @@ -137,15 +66,7 @@ entry: define arm_aapcs_vfpcc <8 x i16> @vhadds_v8i16(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: vhadds_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlt.s16 q2, q1 -; CHECK-NEXT: vmovlt.s16 q3, q0 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vshr.u32 q2, q2, #1 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: vhadd.s16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = sext <8 x i16> %s0 to <8 x i32> @@ -159,15 +80,7 @@ entry: define arm_aapcs_vfpcc <8 x i16> @vhaddu_v8i16(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: vhaddu_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlt.u16 q2, q1 -; CHECK-NEXT: vmovlt.u16 q3, q0 -; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vshr.u32 q2, q2, #1 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: vhadd.u16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = zext <8 x i16> %s0 to <8 x i32> @@ -253,15 +166,7 @@ entry: define arm_aapcs_vfpcc <16 x i8> @vhadds_v16i8(<16 x i8> %s0, 
<16 x i8> %s1) { ; CHECK-LABEL: vhadds_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlt.s8 q2, q1 -; CHECK-NEXT: vmovlt.s8 q3, q0 -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vadd.i16 q2, q3, q2 -; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vshr.u16 q2, q2, #1 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vhadd.s8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = sext <16 x i8> %s0 to <16 x i16> @@ -275,15 +180,7 @@ entry: define arm_aapcs_vfpcc <16 x i8> @vhaddu_v16i8(<16 x i8> %s0, <16 x i8> %s1) { ; CHECK-LABEL: vhaddu_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlt.u8 q2, q1 -; CHECK-NEXT: vmovlt.u8 q3, q0 -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vadd.i16 q2, q3, q2 -; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vshr.u16 q2, q2, #1 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vhadd.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = zext <16 x i8> %s0 to <16 x i16> @@ -297,49 +194,7 @@ entry: define arm_aapcs_vfpcc <4 x i32> @vrhadds_v4i32(<4 x i32> %s0, <4 x i32> %s1) { ; CHECK-LABEL: vrhadds_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d9} -; CHECK-NEXT: vpush {d9} -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.f32 s14, s1 -; CHECK-NEXT: vmov.f32 s18, s5 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s6, s7 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r2, asr #31 -; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adc.w r3, r2, r3, asr #31 -; CHECK-NEXT: adds r2, r1, #1 -; CHECK-NEXT: adc r1, r3, #0 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r2, asr #31 -; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adc.w r3, r2, r3, asr #31 -; CHECK-NEXT: adds r2, r1, #1 -; CHECK-NEXT: adc r1, r3, #0 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 -; CHECK-NEXT: vpop {d9} +; CHECK-NEXT: vrhadd.s32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = sext <4 x i32> %s0 to <4 x i64> @@ -354,50 +209,7 @@ entry: define arm_aapcs_vfpcc <4 x i32> @vrhaddu_v4i32(<4 x i32> %s0, <4 x i32> %s1) { ; CHECK-LABEL: vrhaddu_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.i64 q4, #0xffffffff -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vand q2, q2, q4 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vand q3, q3, q4 -; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vand q4, q0, q4 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: vmov r3, r2, d8 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, r12, d2 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: 
adc.w r3, r2, r12 -; CHECK-NEXT: adds r2, r1, #1 -; CHECK-NEXT: adc r1, r3, #0 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: vmov r3, r2, d9 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, r12, d3 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adc.w r3, r2, r12 -; CHECK-NEXT: adds r2, r1, #1 -; CHECK-NEXT: adc r1, r3, #0 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vrhadd.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = zext <4 x i32> %s0 to <4 x i64> @@ -452,18 +264,7 @@ entry: define arm_aapcs_vfpcc <8 x i16> @vrhadds_v8i16(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: vrhadds_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlt.s16 q2, q1 -; CHECK-NEXT: vmovlt.s16 q3, q0 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vshr.u32 q2, q2, #1 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: vrhadd.s16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = sext <8 x i16> %s0 to <8 x i32> @@ -478,18 +279,7 @@ entry: define arm_aapcs_vfpcc <8 x i16> @vrhaddu_v8i16(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: vrhaddu_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlt.u16 q2, q1 -; CHECK-NEXT: vmovlt.u16 q3, q0 -; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vshr.u32 q2, q2, #1 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: vrhadd.u16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = zext <8 x i16> %s0 to <8 x i32> @@ -588,18 +378,7 @@ entry: define arm_aapcs_vfpcc <16 x i8> @vrhadds_v16i8(<16 x i8> %s0, <16 x i8> %s1) { ; CHECK-LABEL: vrhadds_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlt.s8 q2, q1 -; CHECK-NEXT: vmovlt.s8 q3, q0 -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vadd.i16 q2, q3, q2 -; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vadd.i16 q2, q2, r0 -; CHECK-NEXT: vadd.i16 q0, q0, r0 -; CHECK-NEXT: vshr.u16 q2, q2, #1 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vrhadd.s8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = sext <16 x i8> %s0 to <16 x i16> @@ -614,18 +393,7 @@ entry: define arm_aapcs_vfpcc <16 x i8> @vrhaddu_v16i8(<16 x i8> %s0, <16 x i8> %s1) { ; CHECK-LABEL: vrhaddu_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlt.u8 q2, q1 -; CHECK-NEXT: vmovlt.u8 q3, q0 -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vadd.i16 q2, q3, q2 -; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vadd.i16 q2, q2, r0 -; CHECK-NEXT: vadd.i16 q0, q0, r0 -; CHECK-NEXT: vshr.u16 q2, q2, #1 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vrhadd.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = zext <16 x i8> %s0 to <16 x i16> @@ -648,16 +416,10 @@ define void @vhadd_loop_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, ; CHECK-NEXT: mov.w lr, #64 
; CHECK-NEXT: .LBB24_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.s16 q0, [r0, #8] -; CHECK-NEXT: vldrb.s16 q1, [r1, #8] -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vldrb.s16 q1, [r1], #16 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vstrb.16 q0, [r2, #8] -; CHECK-NEXT: vldrb.s16 q0, [r0], #16 -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vstrb.16 q0, [r2], #16 +; CHECK-NEXT: vldrb.u8 q0, [r0], #16 +; CHECK-NEXT: vldrb.u8 q1, [r1], #16 +; CHECK-NEXT: vhadd.s8 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB24_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -696,16 +458,10 @@ define void @vhadd_loop_s16(i16* nocapture readonly %x, i16* nocapture readonly ; CHECK-NEXT: mov.w lr, #128 ; CHECK-NEXT: .LBB25_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.s32 q0, [r0, #8] -; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vldrh.s32 q1, [r1], #16 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vstrh.32 q0, [r2, #8] -; CHECK-NEXT: vldrh.s32 q0, [r0], #16 -; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vstrh.32 q0, [r2], #16 +; CHECK-NEXT: vldrh.u16 q0, [r0], #16 +; CHECK-NEXT: vldrh.u16 q1, [r1], #16 +; CHECK-NEXT: vhadd.s16 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB25_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -739,52 +495,18 @@ for.cond.cleanup: ; preds = %vector.body define void @vhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vhadd_loop_s32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d9} -; CHECK-NEXT: vpush {d9} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #256 ; CHECK-NEXT: .LBB26_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s6, s7 -; CHECK-NEXT: vmov.f32 s18, s5 -; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: asrs r4, r3, #31 -; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: adc.w r3, r4, r5, asr #31 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: adds r6, r3, r5 -; CHECK-NEXT: asr.w r4, r3, #31 -; CHECK-NEXT: adc.w r3, r4, r5, asr #31 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov q3[2], q3[0], r6, r12 -; CHECK-NEXT: adds r4, r3, r5 -; CHECK-NEXT: asr.w r6, r3, #31 -; CHECK-NEXT: adc.w r3, r6, r5, asr #31 -; CHECK-NEXT: lsrl r4, r3, #1 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adds r6, r3, r5 -; CHECK-NEXT: asr.w r12, r3, #31 -; CHECK-NEXT: adc.w r3, r12, r5, asr #31 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov q3[3], q3[1], r6, r4 -; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vhadd.s32 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB26_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d9} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r7, 
pc} entry: br label %vector.body @@ -820,14 +542,10 @@ define void @vhadd_loop_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, ; CHECK-NEXT: mov.w lr, #64 ; CHECK-NEXT: .LBB27_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u16 q0, [r0, #8] -; CHECK-NEXT: vldrb.u16 q1, [r1, #8] -; CHECK-NEXT: vhadd.u16 q0, q1, q0 -; CHECK-NEXT: vldrb.u16 q1, [r1], #16 -; CHECK-NEXT: vstrb.16 q0, [r2, #8] -; CHECK-NEXT: vldrb.u16 q0, [r0], #16 -; CHECK-NEXT: vhadd.u16 q0, q1, q0 -; CHECK-NEXT: vstrb.16 q0, [r2], #16 +; CHECK-NEXT: vldrb.u8 q0, [r0], #16 +; CHECK-NEXT: vldrb.u8 q1, [r1], #16 +; CHECK-NEXT: vhadd.u8 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB27_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -866,14 +584,10 @@ define void @vhadd_loop_u16(i16* nocapture readonly %x, i16* nocapture readonly ; CHECK-NEXT: mov.w lr, #128 ; CHECK-NEXT: .LBB28_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q0, [r0, #8] -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vhadd.u32 q0, q1, q0 -; CHECK-NEXT: vldrh.u32 q1, [r1], #16 -; CHECK-NEXT: vstrh.32 q0, [r2, #8] -; CHECK-NEXT: vldrh.u32 q0, [r0], #16 -; CHECK-NEXT: vhadd.u32 q0, q1, q0 -; CHECK-NEXT: vstrh.32 q0, [r2], #16 +; CHECK-NEXT: vldrh.u16 q0, [r0], #16 +; CHECK-NEXT: vldrh.u16 q1, [r1], #16 +; CHECK-NEXT: vhadd.u16 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB28_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -907,56 +621,18 @@ for.cond.cleanup: ; preds = %vector.body define void @vhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vhadd_loop_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #256 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: .LBB29_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q3, [r0], #16 -; CHECK-NEXT: vldrw.u32 q4, [r1], #16 -; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.f32 s8, s18 -; CHECK-NEXT: vand q1, q1, q0 -; CHECK-NEXT: vmov.f32 s10, s19 -; CHECK-NEXT: vand q2, q2, q0 -; CHECK-NEXT: vmov r3, r5, d2 -; CHECK-NEXT: vmov r4, r6, d4 -; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vmov.f32 s18, s17 -; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: vand q5, q4, q0 -; CHECK-NEXT: adds.w r12, r4, r3 -; CHECK-NEXT: adc.w r3, r6, r5 -; CHECK-NEXT: vmov r5, r6, d10 -; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r3, r7, d6 -; CHECK-NEXT: adds r4, r5, r3 -; CHECK-NEXT: adc.w r3, r6, r7 -; CHECK-NEXT: vmov r6, r5, d5 -; CHECK-NEXT: lsrl r4, r3, #1 -; CHECK-NEXT: vmov r3, r7, d3 -; CHECK-NEXT: vmov q4[2], q4[0], r4, r12 -; CHECK-NEXT: adds r6, r6, r3 -; CHECK-NEXT: adc.w r3, r5, r7 -; CHECK-NEXT: vmov r5, r7, d11 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov r3, r12, d7 -; CHECK-NEXT: adds r4, r5, r3 -; CHECK-NEXT: adc.w r3, r7, r12 -; CHECK-NEXT: lsrl r4, r3, #1 -; CHECK-NEXT: vmov q4[3], q4[1], r4, r6 -; CHECK-NEXT: vstrb.8 q4, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vhadd.u32 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; 
CHECK-NEXT: le lr, .LBB29_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop {r7, pc} entry: br label %vector.body @@ -990,21 +666,12 @@ define void @vrhadd_loop_s8(i8* nocapture readonly %x, i8* nocapture readonly %y ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #64 -; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: .LBB30_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u16 q0, [r1, #8] -; CHECK-NEXT: vldrb.u16 q1, [r0, #8] -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vldrb.u16 q1, [r0], #16 -; CHECK-NEXT: vadd.i16 q0, q0, r3 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vstrb.16 q0, [r2, #8] -; CHECK-NEXT: vldrb.u16 q0, [r1], #16 -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vadd.i16 q0, q0, r3 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vstrb.16 q0, [r2], #16 +; CHECK-NEXT: vldrb.u8 q0, [r1], #16 +; CHECK-NEXT: vldrb.u8 q1, [r0], #16 +; CHECK-NEXT: vrhadd.u8 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB30_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -1042,21 +709,12 @@ define void @vrhadd_loop_s16(i16* nocapture readonly %x, i16* nocapture readonly ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #128 -; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: .LBB31_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q0, [r1, #8] -; CHECK-NEXT: vldrh.u32 q1, [r0, #8] -; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vldrh.u32 q1, [r0], #16 -; CHECK-NEXT: vadd.i32 q0, q0, r3 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vstrh.32 q0, [r2, #8] -; CHECK-NEXT: vldrh.u32 q0, [r1], #16 -; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vadd.i32 q0, q0, r3 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vstrh.32 q0, [r2], #16 +; CHECK-NEXT: vldrh.u16 q0, [r1], #16 +; CHECK-NEXT: vldrh.u16 q1, [r0], #16 +; CHECK-NEXT: vrhadd.u16 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB31_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -1091,61 +749,18 @@ for.cond.cleanup: ; preds = %vector.body define void @vrhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vrhadd_loop_s32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #256 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: .LBB32_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q3, [r1], #16 -; CHECK-NEXT: vldrw.u32 q4, [r0], #16 -; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.f32 s8, s18 -; CHECK-NEXT: vand q1, q1, q0 -; CHECK-NEXT: vmov.f32 s10, s19 -; CHECK-NEXT: vand q2, q2, q0 -; CHECK-NEXT: vmov r3, r12, d2 -; CHECK-NEXT: vmov r4, r5, d4 -; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vmov.f32 s18, s17 -; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: vand q5, q4, q0 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adc.w r4, r5, r12 -; CHECK-NEXT: adds.w r12, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: vmov r5, r6, d10 -; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r3, r4, d6 -; CHECK-NEXT: adds r3, r3, r5 
-; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: adds r6, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: vmov r5, r4, d5 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov q4[2], q4[0], r6, r12 -; CHECK-NEXT: vmov r3, r6, d3 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: adds.w r12, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: vmov r5, r6, d11 -; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r3, r4, d7 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: adds r6, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov q4[3], q4[1], r6, r12 -; CHECK-NEXT: vstrb.8 q4, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vrhadd.u32 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB32_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r7, pc} entry: br label %vector.body @@ -1180,21 +795,12 @@ define void @vrhadd_loop_u8(i8* nocapture readonly %x, i8* nocapture readonly %y ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #64 -; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: .LBB33_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u16 q0, [r1, #8] -; CHECK-NEXT: vldrb.u16 q1, [r0, #8] -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vldrb.u16 q1, [r0], #16 -; CHECK-NEXT: vadd.i16 q0, q0, r3 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vstrb.16 q0, [r2, #8] -; CHECK-NEXT: vldrb.u16 q0, [r1], #16 -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vadd.i16 q0, q0, r3 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vstrb.16 q0, [r2], #16 +; CHECK-NEXT: vldrb.u8 q0, [r1], #16 +; CHECK-NEXT: vldrb.u8 q1, [r0], #16 +; CHECK-NEXT: vrhadd.u8 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB33_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -1232,21 +838,12 @@ define void @vrhadd_loop_u16(i16* nocapture readonly %x, i16* nocapture readonly ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #128 -; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: .LBB34_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q0, [r1, #8] -; CHECK-NEXT: vldrh.u32 q1, [r0, #8] -; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vldrh.u32 q1, [r0], #16 -; CHECK-NEXT: vadd.i32 q0, q0, r3 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vstrh.32 q0, [r2, #8] -; CHECK-NEXT: vldrh.u32 q0, [r1], #16 -; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vadd.i32 q0, q0, r3 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vstrh.32 q0, [r2], #16 +; CHECK-NEXT: vldrh.u16 q0, [r1], #16 +; CHECK-NEXT: vldrh.u16 q1, [r0], #16 +; CHECK-NEXT: vrhadd.u16 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB34_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -1281,61 +878,18 @@ for.cond.cleanup: ; preds = %vector.body define void @vrhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vrhadd_loop_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #256 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; 
CHECK-NEXT: .LBB35_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q3, [r1], #16 -; CHECK-NEXT: vldrw.u32 q4, [r0], #16 -; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.f32 s8, s18 -; CHECK-NEXT: vand q1, q1, q0 -; CHECK-NEXT: vmov.f32 s10, s19 -; CHECK-NEXT: vand q2, q2, q0 -; CHECK-NEXT: vmov r3, r12, d2 -; CHECK-NEXT: vmov r4, r5, d4 -; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vmov.f32 s18, s17 -; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: vand q5, q4, q0 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adc.w r4, r5, r12 -; CHECK-NEXT: adds.w r12, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: vmov r5, r6, d10 -; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r3, r4, d6 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: adds r6, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: vmov r5, r4, d5 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov q4[2], q4[0], r6, r12 -; CHECK-NEXT: vmov r3, r6, d3 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: adds.w r12, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: vmov r5, r6, d11 -; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r3, r4, d7 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: adds r6, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov q4[3], q4[1], r6, r12 -; CHECK-NEXT: vstrb.8 q4, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vrhadd.u32 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB35_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r7, pc} entry: br label %vector.body
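The vrhadd tests above differ from the plain vhadd tests only by the extra add
of 1 before the shift. A matching sketch (again illustrative, not one of the
tests in this file) of the idiom that now becomes a single ISD::AVGCEILU and
selects to vrhadd.u8:

; Illustrative sketch, not a test from this patch: the extra add of 1
; before the shift makes this a ceiling (rounding) average, folded to
; ISD::AVGCEILU and selected as one vrhadd.u8 instead of the expanded
; zext/add/shift/trunc sequence.
define arm_aapcs_vfpcc <16 x i8> @rhadd_u8_idiom(<16 x i8> %a, <16 x i8> %b) {
entry:
  %ae = zext <16 x i8> %a to <16 x i16>
  %be = zext <16 x i8> %b to <16 x i16>
  %sum = add <16 x i16> %ae, %be
  %round = add <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %half = lshr <16 x i16> %round, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = trunc <16 x i16> %half to <16 x i8>
  ret <16 x i8> %res
}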