From bdd82c3f51c2c3a75840a3579a29b641b325a364 Mon Sep 17 00:00:00 2001
From: David Green
Date: Thu, 20 May 2021 14:13:39 +0100
Subject: [PATCH] [ARM] Extra tests for MVE vhadd and vmulh. NFC

---
 llvm/test/CodeGen/Thumb2/mve-vhadd.ll | 1122 +++++++++++++++++++++++++
 llvm/test/CodeGen/Thumb2/mve-vmulh.ll |  303 +++++++
 2 files changed, 1425 insertions(+)
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-vhadd.ll

diff --git a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
new file mode 100644
index 000000000000..d06a5418c70d
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
@@ -0,0 +1,1122 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve %s -o - | FileCheck %s
+
+define arm_aapcs_vfpcc <16 x i8> @vrhadd_s8(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: vrhadd_s8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmovlt.s8 q2, q1
+; CHECK-NEXT: vmovlt.s8 q3, q0
+; CHECK-NEXT: vmovlb.s8 q1, q1
+; CHECK-NEXT: vmovlb.s8 q0, q0
+; CHECK-NEXT: vadd.i16 q2, q3, q2
+; CHECK-NEXT: vmov.i16 q3, #0x1
+; CHECK-NEXT: vadd.i16 q0, q0, q1
+; CHECK-NEXT: vadd.i16 q2, q2, q3
+; CHECK-NEXT: vadd.i16 q0, q0, q3
+; CHECK-NEXT: vshr.u16 q2, q2, #1
+; CHECK-NEXT: vshr.u16 q0, q0, #1
+; CHECK-NEXT: vmovnt.i16 q0, q2
+; CHECK-NEXT: bx lr
+  %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
+  %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
+  %add1 = add <16 x i16> %sextsrc1, %sextsrc2
+  %add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %result = trunc <16 x i16> %resulti16 to <16 x i8>
+  ret <16 x i8> %result
+}
+
+define arm_aapcs_vfpcc <8 x i16> @vrhadd_s16(<8 x i16> %src1, <8 x i16> %src2) {
+; CHECK-LABEL: vrhadd_s16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmovlt.s16 q2, q1
+; CHECK-NEXT: vmovlt.s16 q3, q0
+; CHECK-NEXT: vmovlb.s16 q1, q1
+; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vadd.i32 q2, q3, q2
+; CHECK-NEXT: vmov.i32 q3, #0x1
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vadd.i32 q2, q2, q3
+; CHECK-NEXT: vadd.i32 q0, q0, q3
+; CHECK-NEXT: vshr.u32 q2, q2, #1
+; CHECK-NEXT: vshr.u32 q0, q0, #1
+; CHECK-NEXT: vmovnt.i32 q0, q2
+; CHECK-NEXT: bx lr
+  %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
+  %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
+  %add1 = add <8 x i32> %sextsrc1, %sextsrc2
+  %add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %result = trunc <8 x i32> %resulti16 to <8 x i16>
+  ret <8 x i16> %result
+}
+
+define arm_aapcs_vfpcc <4 x i32> @vrhadd_s32(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LABEL: vrhadd_s32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vmov.f32 s12, s2
+; CHECK-NEXT: vmov.f32 s14, s3
+; CHECK-NEXT: vmov.f32 s16, s6
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: vmov.f32 s18, s7
+; CHECK-NEXT: vmov r2, s16
+; CHECK-NEXT: vmov.f32 s2, s1
+; CHECK-NEXT: vmov.f32 s6, s5
+; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: asrs r1, r0, #31
+; CHECK-NEXT: adds r0,
r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: adds r0, #1 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: lsrl r0, r1, #1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: asrs r2, r1, #31 +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: adc.w r3, r2, r3, asr #31 +; CHECK-NEXT: adds r2, r1, #1 +; CHECK-NEXT: adc r1, r3, #0 +; CHECK-NEXT: lsrl r2, r1, #1 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr + %sextsrc1 = sext <4 x i32> %src1 to <4 x i64> + %sextsrc2 = sext <4 x i32> %src2 to <4 x i64> + %add1 = add <4 x i64> %sextsrc1, %sextsrc2 + %add2 = add <4 x i64> %add1, + %resulti16 = lshr <4 x i64> %add2, + %result = trunc <4 x i64> %resulti16 to <4 x i32> + ret <4 x i32> %result +} + +define arm_aapcs_vfpcc <16 x i8> @vhadd_s8(<16 x i8> %src1, <16 x i8> %src2) { +; CHECK-LABEL: vhadd_s8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmovlt.s8 q2, q1 +; CHECK-NEXT: vmovlt.s8 q3, q0 +; CHECK-NEXT: vmovlb.s8 q1, q1 +; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: vadd.i16 q2, q3, q2 +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vshr.u16 q2, q2, #1 +; CHECK-NEXT: vshr.u16 q0, q0, #1 +; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: bx lr + %sextsrc1 = sext <16 x i8> %src1 to <16 x i16> + %sextsrc2 = sext <16 x i8> %src2 to <16 x i16> + %add = add <16 x i16> %sextsrc1, %sextsrc2 + %resulti16 = lshr <16 x i16> %add, + %result = trunc <16 x i16> %resulti16 to <16 x i8> + ret <16 x i8> %result +} + +define arm_aapcs_vfpcc <8 x i16> @vhadd_s16(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: vhadd_s16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmovlt.s16 q2, q1 +; CHECK-NEXT: vmovlt.s16 q3, q0 +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vadd.i32 q2, q3, q2 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vshr.u32 q2, q2, #1 +; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: bx lr + %sextsrc1 = sext <8 x i16> %src1 to <8 x i32> + %sextsrc2 = sext <8 x i16> %src2 to <8 x i32> + %add = add <8 x i32> %sextsrc1, %sextsrc2 + %resulti16 = lshr <8 x i32> %add, + %result = trunc <8 x i32> %resulti16 to <8 x i16> + ret <8 x i16> %result +} + +define arm_aapcs_vfpcc <4 x i32> @vhadd_s32(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: vhadd_s32: +; CHECK: @ %bb.0: +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s16, s6 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: lsrl r0, r1, #1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: adds r2, r1, r3 +; CHECK-NEXT: asr.w r12, r1, #31 +; CHECK-NEXT: adc.w r1, r12, r3, asr #31 +; CHECK-NEXT: lsrl r2, r1, #1 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: lsrl r0, r1, #1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: adds r2, r1, r3 +; CHECK-NEXT: asr.w r12, r1, #31 +; CHECK-NEXT: adc.w r1, r12, r3, asr #31 +; CHECK-NEXT: lsrl r2, r1, #1 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr + %sextsrc1 = sext <4 x i32> %src1 to <4 x i64> + %sextsrc2 = 
sext <4 x i32> %src2 to <4 x i64> + %add = add <4 x i64> %sextsrc1, %sextsrc2 + %resulti16 = lshr <4 x i64> %add, + %result = trunc <4 x i64> %resulti16 to <4 x i32> + ret <4 x i32> %result +} + +define arm_aapcs_vfpcc <16 x i8> @vrhadd_u8(<16 x i8> %src1, <16 x i8> %src2) { +; CHECK-LABEL: vrhadd_u8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmovlt.u8 q2, q1 +; CHECK-NEXT: vmovlt.u8 q3, q0 +; CHECK-NEXT: vmovlb.u8 q1, q1 +; CHECK-NEXT: vmovlb.u8 q0, q0 +; CHECK-NEXT: vadd.i16 q2, q3, q2 +; CHECK-NEXT: vmov.i16 q3, #0x1 +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vadd.i16 q2, q2, q3 +; CHECK-NEXT: vadd.i16 q0, q0, q3 +; CHECK-NEXT: vshr.u16 q2, q2, #1 +; CHECK-NEXT: vshr.u16 q0, q0, #1 +; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: bx lr + %zextsrc1 = zext <16 x i8> %src1 to <16 x i16> + %zextsrc2 = zext <16 x i8> %src2 to <16 x i16> + %add1 = add <16 x i16> %zextsrc1, %zextsrc2 + %add2 = add <16 x i16> %add1, + %resulti16 = lshr <16 x i16> %add2, + %result = trunc <16 x i16> %resulti16 to <16 x i8> + ret <16 x i8> %result +} + +define arm_aapcs_vfpcc <8 x i16> @vrhadd_u16(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: vrhadd_u16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmovlt.u16 q2, q1 +; CHECK-NEXT: vmovlt.u16 q3, q0 +; CHECK-NEXT: vmovlb.u16 q1, q1 +; CHECK-NEXT: vmovlb.u16 q0, q0 +; CHECK-NEXT: vadd.i32 q2, q3, q2 +; CHECK-NEXT: vmov.i32 q3, #0x1 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vadd.i32 q0, q0, q3 +; CHECK-NEXT: vshr.u32 q2, q2, #1 +; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: bx lr + %zextsrc1 = zext <8 x i16> %src1 to <8 x i32> + %zextsrc2 = zext <8 x i16> %src2 to <8 x i32> + %add1 = add <8 x i32> %zextsrc1, %zextsrc2 + %add2 = add <8 x i32> %add1, + %resulti16 = lshr <8 x i32> %add2, + %result = trunc <8 x i32> %resulti16 to <8 x i16> + ret <8 x i16> %result +} + +define arm_aapcs_vfpcc <4 x i32> @vrhadd_u32(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: vrhadd_u32: +; CHECK: @ %bb.0: +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.i64 q4, #0xffffffff +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vand q2, q2, q4 +; CHECK-NEXT: vand q3, q3, q4 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vand q1, q1, q4 +; CHECK-NEXT: vand q4, q0, q4 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adds r0, #1 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: vmov r3, r2, d8 +; CHECK-NEXT: lsrl r0, r1, #1 +; CHECK-NEXT: vmov r1, r12, d2 +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: adc.w r3, r2, r12 +; CHECK-NEXT: adds r2, r1, #1 +; CHECK-NEXT: adc r1, r3, #0 +; CHECK-NEXT: lsrl r2, r1, #1 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adds r0, #1 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: vmov r3, r2, d9 +; CHECK-NEXT: lsrl r0, r1, #1 +; CHECK-NEXT: vmov r1, r12, d3 +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: adc.w r3, r2, r12 +; CHECK-NEXT: adds r2, r1, #1 +; CHECK-NEXT: adc r1, r3, #0 +; CHECK-NEXT: lsrl r2, r1, #1 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr + %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> + %zextsrc2 = zext <4 x i32> %src2 to <4 x i64> + %add1 = add <4 x i64> %zextsrc1, 
%zextsrc2 + %add2 = add <4 x i64> %add1, + %resulti16 = lshr <4 x i64> %add2, + %result = trunc <4 x i64> %resulti16 to <4 x i32> + ret <4 x i32> %result +} + +define arm_aapcs_vfpcc <16 x i8> @vhadd_u8(<16 x i8> %src1, <16 x i8> %src2) { +; CHECK-LABEL: vhadd_u8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmovlt.u8 q2, q1 +; CHECK-NEXT: vmovlt.u8 q3, q0 +; CHECK-NEXT: vmovlb.u8 q1, q1 +; CHECK-NEXT: vmovlb.u8 q0, q0 +; CHECK-NEXT: vadd.i16 q2, q3, q2 +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vshr.u16 q2, q2, #1 +; CHECK-NEXT: vshr.u16 q0, q0, #1 +; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: bx lr + %zextsrc1 = zext <16 x i8> %src1 to <16 x i16> + %zextsrc2 = zext <16 x i8> %src2 to <16 x i16> + %add = add <16 x i16> %zextsrc1, %zextsrc2 + %resulti16 = lshr <16 x i16> %add, + %result = trunc <16 x i16> %resulti16 to <16 x i8> + ret <16 x i8> %result +} + +define arm_aapcs_vfpcc <8 x i16> @vhadd_u16(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: vhadd_u16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmovlt.u16 q2, q1 +; CHECK-NEXT: vmovlt.u16 q3, q0 +; CHECK-NEXT: vmovlb.u16 q1, q1 +; CHECK-NEXT: vmovlb.u16 q0, q0 +; CHECK-NEXT: vadd.i32 q2, q3, q2 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vshr.u32 q2, q2, #1 +; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: bx lr + %zextsrc1 = zext <8 x i16> %src1 to <8 x i32> + %zextsrc2 = zext <8 x i16> %src2 to <8 x i32> + %add = add <8 x i32> %zextsrc1, %zextsrc2 + %resulti16 = lshr <8 x i32> %add, + %result = trunc <8 x i32> %resulti16 to <8 x i16> + ret <8 x i16> %result +} + +define arm_aapcs_vfpcc <4 x i32> @vhadd_u32(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: vhadd_u32: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.i64 q4, #0xffffffff +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vand q2, q2, q4 +; CHECK-NEXT: vand q3, q3, q4 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vand q1, q1, q4 +; CHECK-NEXT: vand q4, q0, q4 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r2, d8 +; CHECK-NEXT: lsrl r0, r1, #1 +; CHECK-NEXT: vmov r1, r12, d2 +; CHECK-NEXT: adds r4, r3, r1 +; CHECK-NEXT: adc.w r1, r2, r12 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: lsrl r4, r1, #1 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r4, d9 +; CHECK-NEXT: lsrl r0, r1, #1 +; CHECK-NEXT: vmov r1, r12, d3 +; CHECK-NEXT: adds r2, r3, r1 +; CHECK-NEXT: adc.w r1, r4, r12 +; CHECK-NEXT: lsrl r2, r1, #1 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r4, pc} + %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> + %zextsrc2 = zext <4 x i32> %src2 to <4 x i64> + %add = add <4 x i64> %zextsrc1, %zextsrc2 + %resulti16 = lshr <4 x i64> %add, + %result = trunc <4 x i64> %resulti16 to <4 x i32> + ret <4 x i32> %result +} + +define void @vhadd_loop_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vhadd_loop_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w lr, #64 +; CHECK-NEXT: .LBB12_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; 
CHECK-NEXT: vldrb.s16 q0, [r0, #8] +; CHECK-NEXT: vldrb.s16 q1, [r1, #8] +; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vldrb.s16 q1, [r1], #16 +; CHECK-NEXT: vshr.u16 q0, q0, #1 +; CHECK-NEXT: vstrb.16 q0, [r2, #8] +; CHECK-NEXT: vldrb.s16 q0, [r0], #16 +; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vshr.u16 q0, q0, #1 +; CHECK-NEXT: vstrb.16 q0, [r2], #16 +; CHECK-NEXT: le lr, .LBB12_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8, i8* %x, i32 %index + %1 = bitcast i8* %0 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %1, align 1 + %2 = sext <16 x i8> %wide.load to <16 x i16> + %3 = getelementptr inbounds i8, i8* %y, i32 %index + %4 = bitcast i8* %3 to <16 x i8>* + %wide.load16 = load <16 x i8>, <16 x i8>* %4, align 1 + %5 = sext <16 x i8> %wide.load16 to <16 x i16> + %6 = add nsw <16 x i16> %5, %2 + %7 = lshr <16 x i16> %6, + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = getelementptr inbounds i8, i8* %z, i32 %index + %10 = bitcast i8* %9 to <16 x i8>* + store <16 x i8> %8, <16 x i8>* %10, align 1 + %index.next = add i32 %index, 16 + %11 = icmp eq i32 %index.next, 1024 + br i1 %11, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vhadd_loop_s16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vhadd_loop_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w lr, #128 +; CHECK-NEXT: .LBB13_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.s32 q0, [r0, #8] +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vldrh.s32 q1, [r1], #16 +; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: vstrh.32 q0, [r2, #8] +; CHECK-NEXT: vldrh.s32 q0, [r0], #16 +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: vstrh.32 q0, [r2], #16 +; CHECK-NEXT: le lr, .LBB13_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16, i16* %x, i32 %index + %1 = bitcast i16* %0 to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %1, align 2 + %2 = sext <8 x i16> %wide.load to <8 x i32> + %3 = getelementptr inbounds i16, i16* %y, i32 %index + %4 = bitcast i16* %3 to <8 x i16>* + %wide.load16 = load <8 x i16>, <8 x i16>* %4, align 2 + %5 = sext <8 x i16> %wide.load16 to <8 x i32> + %6 = add nsw <8 x i32> %5, %2 + %7 = lshr <8 x i32> %6, + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = getelementptr inbounds i16, i16* %z, i32 %index + %10 = bitcast i16* %9 to <8 x i16>* + store <8 x i16> %8, <8 x i16>* %10, align 2 + %index.next = add i32 %index, 8 + %11 = icmp eq i32 %index.next, 1024 + br i1 %11, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vhadd_loop_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: mov.w lr, 
#256 +; CHECK-NEXT: .LBB14_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s12, s6 +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: adds.w r12, r3, r5 +; CHECK-NEXT: asr.w r4, r3, #31 +; CHECK-NEXT: adc.w r3, r4, r5, asr #31 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: lsrl r12, r3, #1 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: adds r6, r3, r5 +; CHECK-NEXT: asr.w r4, r3, #31 +; CHECK-NEXT: adc.w r3, r4, r5, asr #31 +; CHECK-NEXT: lsrl r6, r3, #1 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov q4[2], q4[0], r6, r12 +; CHECK-NEXT: adds r4, r3, r5 +; CHECK-NEXT: asr.w r6, r3, #31 +; CHECK-NEXT: adc.w r3, r6, r5, asr #31 +; CHECK-NEXT: lsrl r4, r3, #1 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: adds r6, r3, r5 +; CHECK-NEXT: asr.w r12, r3, #31 +; CHECK-NEXT: adc.w r3, r12, r5, asr #31 +; CHECK-NEXT: lsrl r6, r3, #1 +; CHECK-NEXT: vmov q4[3], q4[1], r6, r4 +; CHECK-NEXT: vstrb.8 q4, [r2], #16 +; CHECK-NEXT: le lr, .LBB14_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %x, i32 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = sext <4 x i32> %wide.load to <4 x i64> + %3 = getelementptr inbounds i32, i32* %y, i32 %index + %4 = bitcast i32* %3 to <4 x i32>* + %wide.load16 = load <4 x i32>, <4 x i32>* %4, align 4 + %5 = sext <4 x i32> %wide.load16 to <4 x i64> + %6 = add nsw <4 x i64> %5, %2 + %7 = lshr <4 x i64> %6, + %8 = trunc <4 x i64> %7 to <4 x i32> + %9 = getelementptr inbounds i32, i32* %z, i32 %index + %10 = bitcast i32* %9 to <4 x i32>* + store <4 x i32> %8, <4 x i32>* %10, align 4 + %index.next = add i32 %index, 4 + %11 = icmp eq i32 %index.next, 1024 + br i1 %11, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vhadd_loop_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vhadd_loop_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w lr, #64 +; CHECK-NEXT: .LBB15_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u16 q0, [r0, #8] +; CHECK-NEXT: vldrb.u16 q1, [r1, #8] +; CHECK-NEXT: vhadd.u16 q0, q1, q0 +; CHECK-NEXT: vldrb.u16 q1, [r1], #16 +; CHECK-NEXT: vstrb.16 q0, [r2, #8] +; CHECK-NEXT: vldrb.u16 q0, [r0], #16 +; CHECK-NEXT: vhadd.u16 q0, q1, q0 +; CHECK-NEXT: vstrb.16 q0, [r2], #16 +; CHECK-NEXT: le lr, .LBB15_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8, i8* %x, i32 %index + %1 = bitcast i8* %0 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %1, align 1 + %2 = zext <16 x i8> %wide.load to <16 x i16> + %3 = getelementptr inbounds i8, i8* %y, i32 %index + %4 = bitcast i8* %3 to <16 x i8>* + %wide.load16 = load 
<16 x i8>, <16 x i8>* %4, align 1 + %5 = zext <16 x i8> %wide.load16 to <16 x i16> + %6 = add nuw nsw <16 x i16> %5, %2 + %7 = lshr <16 x i16> %6, + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = getelementptr inbounds i8, i8* %z, i32 %index + %10 = bitcast i8* %9 to <16 x i8>* + store <16 x i8> %8, <16 x i8>* %10, align 1 + %index.next = add i32 %index, 16 + %11 = icmp eq i32 %index.next, 1024 + br i1 %11, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vhadd_loop_u16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vhadd_loop_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w lr, #128 +; CHECK-NEXT: .LBB16_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u32 q0, [r0, #8] +; CHECK-NEXT: vldrh.u32 q1, [r1, #8] +; CHECK-NEXT: vhadd.u32 q0, q1, q0 +; CHECK-NEXT: vldrh.u32 q1, [r1], #16 +; CHECK-NEXT: vstrh.32 q0, [r2, #8] +; CHECK-NEXT: vldrh.u32 q0, [r0], #16 +; CHECK-NEXT: vhadd.u32 q0, q1, q0 +; CHECK-NEXT: vstrh.32 q0, [r2], #16 +; CHECK-NEXT: le lr, .LBB16_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16, i16* %x, i32 %index + %1 = bitcast i16* %0 to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %1, align 2 + %2 = zext <8 x i16> %wide.load to <8 x i32> + %3 = getelementptr inbounds i16, i16* %y, i32 %index + %4 = bitcast i16* %3 to <8 x i16>* + %wide.load16 = load <8 x i16>, <8 x i16>* %4, align 2 + %5 = zext <8 x i16> %wide.load16 to <8 x i32> + %6 = add nuw nsw <8 x i32> %5, %2 + %7 = lshr <8 x i32> %6, + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = getelementptr inbounds i16, i16* %z, i32 %index + %10 = bitcast i16* %9 to <8 x i16>* + store <8 x i16> %8, <8 x i16>* %10, align 2 + %index.next = add i32 %index, 8 + %11 = icmp eq i32 %index.next, 1024 + br i1 %11, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vhadd_loop_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: mov.w lr, #256 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-NEXT: .LBB17_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q3, [r0], #16 +; CHECK-NEXT: vldrw.u32 q4, [r1], #16 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vmov.f32 s8, s18 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmov.f32 s10, s19 +; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: vand q2, q2, q0 +; CHECK-NEXT: vmov r3, r5, d2 +; CHECK-NEXT: vmov r4, r6, d4 +; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmov.f32 s18, s17 +; CHECK-NEXT: vand q3, q3, q0 +; CHECK-NEXT: vand q5, q4, q0 +; CHECK-NEXT: adds.w r12, r4, r3 +; CHECK-NEXT: adc.w r3, r6, r5 +; CHECK-NEXT: vmov r5, r6, d10 +; CHECK-NEXT: lsrl r12, r3, #1 +; CHECK-NEXT: vmov r3, r7, d6 +; CHECK-NEXT: adds r4, r5, r3 +; CHECK-NEXT: adc.w r3, r6, r7 +; CHECK-NEXT: vmov r6, r5, d5 +; CHECK-NEXT: lsrl r4, r3, #1 +; CHECK-NEXT: 
vmov r3, r7, d3 +; CHECK-NEXT: vmov q4[2], q4[0], r4, r12 +; CHECK-NEXT: adds r6, r6, r3 +; CHECK-NEXT: adc.w r3, r5, r7 +; CHECK-NEXT: vmov r5, r7, d11 +; CHECK-NEXT: lsrl r6, r3, #1 +; CHECK-NEXT: vmov r3, r12, d7 +; CHECK-NEXT: adds r4, r5, r3 +; CHECK-NEXT: adc.w r3, r7, r12 +; CHECK-NEXT: lsrl r4, r3, #1 +; CHECK-NEXT: vmov q4[3], q4[1], r4, r6 +; CHECK-NEXT: vstrb.8 q4, [r2], #16 +; CHECK-NEXT: le lr, .LBB17_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %x, i32 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = zext <4 x i32> %wide.load to <4 x i64> + %3 = getelementptr inbounds i32, i32* %y, i32 %index + %4 = bitcast i32* %3 to <4 x i32>* + %wide.load16 = load <4 x i32>, <4 x i32>* %4, align 4 + %5 = zext <4 x i32> %wide.load16 to <4 x i64> + %6 = add nuw nsw <4 x i64> %5, %2 + %7 = lshr <4 x i64> %6, + %8 = trunc <4 x i64> %7 to <4 x i32> + %9 = getelementptr inbounds i32, i32* %z, i32 %index + %10 = bitcast i32* %9 to <4 x i32>* + store <4 x i32> %8, <4 x i32>* %10, align 4 + %index.next = add i32 %index, 4 + %11 = icmp eq i32 %index.next, 1024 + br i1 %11, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vrhadd_loop_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vrhadd_loop_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w lr, #64 +; CHECK-NEXT: vmov.i16 q0, #0x1 +; CHECK-NEXT: .LBB18_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u16 q1, [r1, #8] +; CHECK-NEXT: vldrb.u16 q2, [r0, #8] +; CHECK-NEXT: vadd.i16 q1, q2, q1 +; CHECK-NEXT: vldrb.u16 q2, [r0], #16 +; CHECK-NEXT: vadd.i16 q1, q1, q0 +; CHECK-NEXT: vshr.u16 q1, q1, #1 +; CHECK-NEXT: vstrb.16 q1, [r2, #8] +; CHECK-NEXT: vldrb.u16 q1, [r1], #16 +; CHECK-NEXT: vadd.i16 q1, q2, q1 +; CHECK-NEXT: vadd.i16 q1, q1, q0 +; CHECK-NEXT: vshr.u16 q1, q1, #1 +; CHECK-NEXT: vstrb.16 q1, [r2], #16 +; CHECK-NEXT: le lr, .LBB18_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8, i8* %x, i32 %index + %1 = bitcast i8* %0 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %1, align 1 + %2 = zext <16 x i8> %wide.load to <16 x i16> + %3 = getelementptr inbounds i8, i8* %y, i32 %index + %4 = bitcast i8* %3 to <16 x i8>* + %wide.load16 = load <16 x i8>, <16 x i8>* %4, align 1 + %5 = zext <16 x i8> %wide.load16 to <16 x i16> + %6 = add nuw nsw <16 x i16> %2, + %7 = add nuw nsw <16 x i16> %6, %5 + %8 = lshr <16 x i16> %7, + %9 = trunc <16 x i16> %8 to <16 x i8> + %10 = getelementptr inbounds i8, i8* %z, i32 %index + %11 = bitcast i8* %10 to <16 x i8>* + store <16 x i8> %9, <16 x i8>* %11, align 1 + %index.next = add i32 %index, 16 + %12 = icmp eq i32 %index.next, 1024 + br i1 %12, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vrhadd_loop_s16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias 
nocapture %z, i32 %n) { +; CHECK-LABEL: vrhadd_loop_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w lr, #128 +; CHECK-NEXT: vmov.i32 q0, #0x1 +; CHECK-NEXT: .LBB19_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u32 q1, [r1, #8] +; CHECK-NEXT: vldrh.u32 q2, [r0, #8] +; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: vldrh.u32 q2, [r0], #16 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vshr.u32 q1, q1, #1 +; CHECK-NEXT: vstrh.32 q1, [r2, #8] +; CHECK-NEXT: vldrh.u32 q1, [r1], #16 +; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vshr.u32 q1, q1, #1 +; CHECK-NEXT: vstrh.32 q1, [r2], #16 +; CHECK-NEXT: le lr, .LBB19_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16, i16* %x, i32 %index + %1 = bitcast i16* %0 to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %1, align 2 + %2 = zext <8 x i16> %wide.load to <8 x i32> + %3 = getelementptr inbounds i16, i16* %y, i32 %index + %4 = bitcast i16* %3 to <8 x i16>* + %wide.load16 = load <8 x i16>, <8 x i16>* %4, align 2 + %5 = zext <8 x i16> %wide.load16 to <8 x i32> + %6 = add nuw nsw <8 x i32> %2, + %7 = add nuw nsw <8 x i32> %6, %5 + %8 = lshr <8 x i32> %7, + %9 = trunc <8 x i32> %8 to <8 x i16> + %10 = getelementptr inbounds i16, i16* %z, i32 %index + %11 = bitcast i16* %10 to <8 x i16>* + store <8 x i16> %9, <8 x i16>* %11, align 2 + %index.next = add i32 %index, 8 + %12 = icmp eq i32 %index.next, 1024 + br i1 %12, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vrhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vrhadd_loop_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: mov.w lr, #256 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-NEXT: .LBB20_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q3, [r1], #16 +; CHECK-NEXT: vldrw.u32 q4, [r0], #16 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vmov.f32 s8, s18 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmov.f32 s10, s19 +; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: vand q2, q2, q0 +; CHECK-NEXT: vmov r3, r12, d2 +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmov.f32 s18, s17 +; CHECK-NEXT: vand q3, q3, q0 +; CHECK-NEXT: vand q5, q4, q0 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adc.w r4, r5, r12 +; CHECK-NEXT: adds.w r12, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: vmov r5, r6, d10 +; CHECK-NEXT: lsrl r12, r3, #1 +; CHECK-NEXT: vmov r3, r4, d6 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: adds r6, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: lsrl r6, r3, #1 +; CHECK-NEXT: vmov q4[2], q4[0], r6, r12 +; CHECK-NEXT: vmov r3, r6, d3 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: adds.w r12, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: vmov r5, r6, d11 +; CHECK-NEXT: lsrl r12, r3, #1 +; CHECK-NEXT: vmov r3, r4, d7 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r4, r6 +; 
CHECK-NEXT: adds r6, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: lsrl r6, r3, #1 +; CHECK-NEXT: vmov q4[3], q4[1], r6, r12 +; CHECK-NEXT: vstrb.8 q4, [r2], #16 +; CHECK-NEXT: le lr, .LBB20_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %x, i32 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = zext <4 x i32> %wide.load to <4 x i64> + %3 = getelementptr inbounds i32, i32* %y, i32 %index + %4 = bitcast i32* %3 to <4 x i32>* + %wide.load16 = load <4 x i32>, <4 x i32>* %4, align 4 + %5 = zext <4 x i32> %wide.load16 to <4 x i64> + %6 = add nuw nsw <4 x i64> %2, + %7 = add nuw nsw <4 x i64> %6, %5 + %8 = lshr <4 x i64> %7, + %9 = trunc <4 x i64> %8 to <4 x i32> + %10 = getelementptr inbounds i32, i32* %z, i32 %index + %11 = bitcast i32* %10 to <4 x i32>* + store <4 x i32> %9, <4 x i32>* %11, align 4 + %index.next = add i32 %index, 4 + %12 = icmp eq i32 %index.next, 1024 + br i1 %12, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vrhadd_loop_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vrhadd_loop_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w lr, #64 +; CHECK-NEXT: vmov.i16 q0, #0x1 +; CHECK-NEXT: .LBB21_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u16 q1, [r1, #8] +; CHECK-NEXT: vldrb.u16 q2, [r0, #8] +; CHECK-NEXT: vadd.i16 q1, q2, q1 +; CHECK-NEXT: vldrb.u16 q2, [r0], #16 +; CHECK-NEXT: vadd.i16 q1, q1, q0 +; CHECK-NEXT: vshr.u16 q1, q1, #1 +; CHECK-NEXT: vstrb.16 q1, [r2, #8] +; CHECK-NEXT: vldrb.u16 q1, [r1], #16 +; CHECK-NEXT: vadd.i16 q1, q2, q1 +; CHECK-NEXT: vadd.i16 q1, q1, q0 +; CHECK-NEXT: vshr.u16 q1, q1, #1 +; CHECK-NEXT: vstrb.16 q1, [r2], #16 +; CHECK-NEXT: le lr, .LBB21_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8, i8* %x, i32 %index + %1 = bitcast i8* %0 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %1, align 1 + %2 = zext <16 x i8> %wide.load to <16 x i16> + %3 = getelementptr inbounds i8, i8* %y, i32 %index + %4 = bitcast i8* %3 to <16 x i8>* + %wide.load16 = load <16 x i8>, <16 x i8>* %4, align 1 + %5 = zext <16 x i8> %wide.load16 to <16 x i16> + %6 = add nuw nsw <16 x i16> %2, + %7 = add nuw nsw <16 x i16> %6, %5 + %8 = lshr <16 x i16> %7, + %9 = trunc <16 x i16> %8 to <16 x i8> + %10 = getelementptr inbounds i8, i8* %z, i32 %index + %11 = bitcast i8* %10 to <16 x i8>* + store <16 x i8> %9, <16 x i8>* %11, align 1 + %index.next = add i32 %index, 16 + %12 = icmp eq i32 %index.next, 1024 + br i1 %12, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vrhadd_loop_u16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vrhadd_loop_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w lr, #128 +; CHECK-NEXT: vmov.i32 q0, #0x1 +; 
CHECK-NEXT: .LBB22_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u32 q1, [r1, #8] +; CHECK-NEXT: vldrh.u32 q2, [r0, #8] +; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: vldrh.u32 q2, [r0], #16 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vshr.u32 q1, q1, #1 +; CHECK-NEXT: vstrh.32 q1, [r2, #8] +; CHECK-NEXT: vldrh.u32 q1, [r1], #16 +; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vshr.u32 q1, q1, #1 +; CHECK-NEXT: vstrh.32 q1, [r2], #16 +; CHECK-NEXT: le lr, .LBB22_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16, i16* %x, i32 %index + %1 = bitcast i16* %0 to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %1, align 2 + %2 = zext <8 x i16> %wide.load to <8 x i32> + %3 = getelementptr inbounds i16, i16* %y, i32 %index + %4 = bitcast i16* %3 to <8 x i16>* + %wide.load16 = load <8 x i16>, <8 x i16>* %4, align 2 + %5 = zext <8 x i16> %wide.load16 to <8 x i32> + %6 = add nuw nsw <8 x i32> %2, + %7 = add nuw nsw <8 x i32> %6, %5 + %8 = lshr <8 x i32> %7, + %9 = trunc <8 x i32> %8 to <8 x i16> + %10 = getelementptr inbounds i16, i16* %z, i32 %index + %11 = bitcast i16* %10 to <8 x i16>* + store <8 x i16> %9, <8 x i16>* %11, align 2 + %index.next = add i32 %index, 8 + %12 = icmp eq i32 %index.next, 1024 + br i1 %12, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vrhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vrhadd_loop_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: mov.w lr, #256 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-NEXT: .LBB23_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q3, [r1], #16 +; CHECK-NEXT: vldrw.u32 q4, [r0], #16 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vmov.f32 s8, s18 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmov.f32 s10, s19 +; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: vand q2, q2, q0 +; CHECK-NEXT: vmov r3, r12, d2 +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmov.f32 s18, s17 +; CHECK-NEXT: vand q3, q3, q0 +; CHECK-NEXT: vand q5, q4, q0 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adc.w r4, r5, r12 +; CHECK-NEXT: adds.w r12, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: vmov r5, r6, d10 +; CHECK-NEXT: lsrl r12, r3, #1 +; CHECK-NEXT: vmov r3, r4, d6 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: adds r6, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: lsrl r6, r3, #1 +; CHECK-NEXT: vmov q4[2], q4[0], r6, r12 +; CHECK-NEXT: vmov r3, r6, d3 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: adds.w r12, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: vmov r5, r6, d11 +; CHECK-NEXT: lsrl r12, r3, #1 +; CHECK-NEXT: vmov r3, r4, d7 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: adds r6, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: lsrl r6, r3, #1 +; CHECK-NEXT: vmov q4[3], q4[1], r6, r12 +; CHECK-NEXT: vstrb.8 q4, [r2], #16 +; CHECK-NEXT: le lr, .LBB23_1 +; CHECK-NEXT: 
@ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %x, i32 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = zext <4 x i32> %wide.load to <4 x i64> + %3 = getelementptr inbounds i32, i32* %y, i32 %index + %4 = bitcast i32* %3 to <4 x i32>* + %wide.load16 = load <4 x i32>, <4 x i32>* %4, align 4 + %5 = zext <4 x i32> %wide.load16 to <4 x i64> + %6 = add nuw nsw <4 x i64> %2, + %7 = add nuw nsw <4 x i64> %6, %5 + %8 = lshr <4 x i64> %7, + %9 = trunc <4 x i64> %8 to <4 x i32> + %10 = getelementptr inbounds i32, i32* %z, i32 %index + %11 = bitcast i32* %10 to <4 x i32>* + store <4 x i32> %9, <4 x i32>* %11, align 4 + %index.next = add i32 %index, 4 + %12 = icmp eq i32 %index.next, 1024 + br i1 %12, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} diff --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll index 057f3f24d0ea..a214eeae590a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll @@ -237,3 +237,306 @@ entry: %s2 = trunc <16 x i16> %s to <16 x i8> ret <16 x i8> %s2 } + +define void @vmulh_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vmulh_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w lr, #64 +; CHECK-NEXT: .LBB12_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q0, [r0], #16 +; CHECK-NEXT: vldrb.u8 q1, [r1], #16 +; CHECK-NEXT: vmullt.s8 q2, q1, q0 +; CHECK-NEXT: vmullb.s8 q0, q1, q0 +; CHECK-NEXT: vshr.u16 q2, q2, #8 +; CHECK-NEXT: vshr.u16 q0, q0, #8 +; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: le lr, .LBB12_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8, i8* %x, i32 %index + %1 = bitcast i8* %0 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %1, align 1 + %2 = sext <16 x i8> %wide.load to <16 x i16> + %3 = getelementptr inbounds i8, i8* %y, i32 %index + %4 = bitcast i8* %3 to <16 x i8>* + %wide.load17 = load <16 x i8>, <16 x i8>* %4, align 1 + %5 = sext <16 x i8> %wide.load17 to <16 x i16> + %6 = mul nsw <16 x i16> %5, %2 + %7 = lshr <16 x i16> %6, + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = getelementptr inbounds i8, i8* %z, i32 %index + %10 = bitcast i8* %9 to <16 x i8>* + store <16 x i8> %8, <16 x i8>* %10, align 1 + %index.next = add i32 %index, 16 + %11 = icmp eq i32 %index.next, 1024 + br i1 %11, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vmulh_s16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vmulh_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w lr, #128 +; CHECK-NEXT: .LBB13_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q0, [r0], #16 +; CHECK-NEXT: vldrh.u16 q1, [r1], #16 +; CHECK-NEXT: vmullt.s16 
q2, q1, q0 +; CHECK-NEXT: vmullb.s16 q0, q1, q0 +; CHECK-NEXT: vshr.u32 q2, q2, #16 +; CHECK-NEXT: vshr.u32 q0, q0, #16 +; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: le lr, .LBB13_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16, i16* %x, i32 %index + %1 = bitcast i16* %0 to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %1, align 2 + %2 = sext <8 x i16> %wide.load to <8 x i32> + %3 = getelementptr inbounds i16, i16* %y, i32 %index + %4 = bitcast i16* %3 to <8 x i16>* + %wide.load17 = load <8 x i16>, <8 x i16>* %4, align 2 + %5 = sext <8 x i16> %wide.load17 to <8 x i32> + %6 = mul nsw <8 x i32> %5, %2 + %7 = lshr <8 x i32> %6, + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = getelementptr inbounds i16, i16* %z, i32 %index + %10 = bitcast i16* %9 to <8 x i16>* + store <8 x i16> %8, <8 x i16>* %10, align 2 + %index.next = add i32 %index, 8 + %11 = icmp eq i32 %index.next, 1024 + br i1 %11, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vmulh_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vmulh_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: mov.w lr, #256 +; CHECK-NEXT: .LBB14_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov.f32 s0, s4 +; CHECK-NEXT: vmov.f32 s12, s8 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vmov.f32 s14, s9 +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov.f32 s16, s6 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vmullb.s32 q2, q1, q4 +; CHECK-NEXT: smmul r12, r3, r12 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmov q1[2], q1[0], r12, r3 +; CHECK-NEXT: vmov r12, s2 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: smmul r12, r3, r12 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: vmov q1[3], q1[1], r12, r3 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: le lr, .LBB14_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %x, i32 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = sext <4 x i32> %wide.load to <4 x i64> + %3 = getelementptr inbounds i32, i32* %y, i32 %index + %4 = bitcast i32* %3 to <4 x i32>* + %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4 + %5 = sext <4 x i32> %wide.load17 to <4 x i64> + %6 = mul nsw <4 x i64> %5, %2 + %7 = lshr <4 x i64> %6, + %8 = trunc <4 x i64> %7 to <4 x i32> + %9 = getelementptr inbounds i32, i32* %z, i32 %index + %10 = bitcast i32* %9 to <4 x i32>* + store <4 x i32> %8, <4 x i32>* %10, align 4 + %index.next = add i32 %index, 4 + %11 = icmp eq i32 %index.next, 1024 + br i1 %11, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vmulh_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, 
i8* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vmulh_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w lr, #64 +; CHECK-NEXT: .LBB15_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q0, [r0], #16 +; CHECK-NEXT: vldrb.u8 q1, [r1], #16 +; CHECK-NEXT: vmullt.u8 q2, q1, q0 +; CHECK-NEXT: vmullb.u8 q0, q1, q0 +; CHECK-NEXT: vshr.u16 q2, q2, #8 +; CHECK-NEXT: vshr.u16 q0, q0, #8 +; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: le lr, .LBB15_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8, i8* %x, i32 %index + %1 = bitcast i8* %0 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %1, align 1 + %2 = zext <16 x i8> %wide.load to <16 x i16> + %3 = getelementptr inbounds i8, i8* %y, i32 %index + %4 = bitcast i8* %3 to <16 x i8>* + %wide.load17 = load <16 x i8>, <16 x i8>* %4, align 1 + %5 = zext <16 x i8> %wide.load17 to <16 x i16> + %6 = mul nuw <16 x i16> %5, %2 + %7 = lshr <16 x i16> %6, + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = getelementptr inbounds i8, i8* %z, i32 %index + %10 = bitcast i8* %9 to <16 x i8>* + store <16 x i8> %8, <16 x i8>* %10, align 1 + %index.next = add i32 %index, 16 + %11 = icmp eq i32 %index.next, 1024 + br i1 %11, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vmulh_u16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vmulh_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w lr, #128 +; CHECK-NEXT: .LBB16_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q0, [r0], #16 +; CHECK-NEXT: vldrh.u16 q1, [r1], #16 +; CHECK-NEXT: vmullt.u16 q2, q1, q0 +; CHECK-NEXT: vmullb.u16 q0, q1, q0 +; CHECK-NEXT: vshr.u32 q2, q2, #16 +; CHECK-NEXT: vshr.u32 q0, q0, #16 +; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: le lr, .LBB16_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16, i16* %x, i32 %index + %1 = bitcast i16* %0 to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %1, align 2 + %2 = zext <8 x i16> %wide.load to <8 x i32> + %3 = getelementptr inbounds i16, i16* %y, i32 %index + %4 = bitcast i16* %3 to <8 x i16>* + %wide.load17 = load <8 x i16>, <8 x i16>* %4, align 2 + %5 = zext <8 x i16> %wide.load17 to <8 x i32> + %6 = mul nuw <8 x i32> %5, %2 + %7 = lshr <8 x i32> %6, + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = getelementptr inbounds i16, i16* %z, i32 %index + %10 = bitcast i16* %9 to <8 x i16>* + store <8 x i16> %8, <8 x i16>* %10, align 2 + %index.next = add i32 %index, 8 + %11 = icmp eq i32 %index.next, 1024 + br i1 %11, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @vmulh_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { +; CHECK-LABEL: vmulh_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; 
CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: mov.w lr, #256 +; CHECK-NEXT: .LBB17_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q3, [r1], #16 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s16, s14 +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmullb.u32 q1, q4, q2 +; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmullb.u32 q2, q3, q0 +; CHECK-NEXT: vmov.f32 s0, s9 +; CHECK-NEXT: vmov.f32 s1, s11 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: le lr, .LBB17_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r7, pc} +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %x, i32 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = zext <4 x i32> %wide.load to <4 x i64> + %3 = getelementptr inbounds i32, i32* %y, i32 %index + %4 = bitcast i32* %3 to <4 x i32>* + %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4 + %5 = zext <4 x i32> %wide.load17 to <4 x i64> + %6 = mul nuw <4 x i64> %5, %2 + %7 = lshr <4 x i64> %6, + %8 = trunc <4 x i64> %7 to <4 x i32> + %9 = getelementptr inbounds i32, i32* %z, i32 %index + %10 = bitcast i32* %9 to <4 x i32>* + store <4 x i32> %8, <4 x i32>* %10, align 4 + %index.next = add i32 %index, 4 + %11 = icmp eq i32 %index.next, 1024 + br i1 %11, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +}