From 9afb1e66e55df33b374d585dd5054c350bcf5cb8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 14 Oct 2018 16:49:04 +0000 Subject: [PATCH] [ARM] Regenerate cttz tests Improve codegen view as part of PR32655 llvm-svn: 344479 --- llvm/test/CodeGen/ARM/cttz_vector.ll | 419 ++++++++++++++++++--------- 1 file changed, 283 insertions(+), 136 deletions(-) diff --git a/llvm/test/CodeGen/ARM/cttz_vector.ll b/llvm/test/CodeGen/ARM/cttz_vector.ll index bed644980415..f27c1e4b4173 100644 --- a/llvm/test/CodeGen/ARM/cttz_vector.ll +++ b/llvm/test/CodeGen/ARM/cttz_vector.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple armv7-linux-gnueabihf -mattr=+neon | FileCheck %s ; This test checks the @llvm.cttz.* intrinsics for vectors. @@ -23,7 +24,14 @@ declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) ;------------------------------------------------------------------------------ define void @test_v1i8(<1 x i8>* %p) { -; CHECK-LABEL: test_v1i8 +; CHECK-LABEL: test_v1i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: orr r1, r1, #256 +; CHECK-NEXT: rbit r1, r1 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i8>, <1 x i8>* %p %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 false) store <1 x i8> %tmp, <1 x i8>* %p @@ -32,6 +40,21 @@ define void @test_v1i8(<1 x i8>* %p) { define void @test_v2i8(<2 x i8>* %p) { ; CHECK-LABEL: test_v2i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.16 {d16[0]}, [r0:16] +; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vorr.i32 d16, #0x100 +; CHECK-NEXT: vneg.s32 d18, d16 +; CHECK-NEXT: vand d16, d16, d18 +; CHECK-NEXT: vmov.i32 d17, #0x1f +; CHECK-NEXT: vclz.i32 d16, d16 +; CHECK-NEXT: vsub.i32 d16, d17, d16 +; CHECK-NEXT: vmov.32 r1, d16[1] +; CHECK-NEXT: vmov.32 r2, d16[0] +; CHECK-NEXT: strb r1, [r0, #1] +; CHECK-NEXT: strb r2, [r0] +; CHECK-NEXT: bx lr %a = load <2 x i8>, <2 x i8>* %p %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 false) store <2 x i8> %tmp, <2 x i8>* %p @@ -40,6 +63,19 @@ define void @test_v2i8(<2 x i8>* %p) { define void @test_v4i8(<4 x i8>* %p) { ; CHECK-LABEL: test_v4i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: vmov.i16 d19, #0x1 +; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vorr.i16 d16, #0x100 +; CHECK-NEXT: vneg.s16 d18, d16 +; CHECK-NEXT: vand d16, d16, d18 +; CHECK-NEXT: vsub.i16 d16, d16, d19 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 +; CHECK-NEXT: vuzp.8 d16, d17 +; CHECK-NEXT: vst1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: bx lr %a = load <4 x i8>, <4 x i8>* %p %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 false) store <4 x i8> %tmp, <4 x i8>* %p @@ -48,13 +84,15 @@ define void @test_v4i8(<4 x i8>* %p) { define void @test_v8i8(<8 x i8>* %p) { ; CHECK-LABEL: test_v8i8: -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vmov.i8 [[D2:d[0-9]+]], #0x1 -; CHECK: vneg.s8 [[D3:d[0-9]+]], [[D1]] -; CHECK: vand [[D1]], [[D1]], [[D3]] -; CHECK: vsub.i8 [[D1]], [[D1]], [[D2]] -; CHECK: vcnt.8 [[D1]], [[D1]] -; CHECK: vstr [[D1]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vmov.i8 d18, #0x1 +; CHECK-NEXT: vneg.s8 d17, d16 +; CHECK-NEXT: vand d16, d16, d17 +; CHECK-NEXT: vsub.i8 d16, d16, d18 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <8 x i8>, <8 x i8>* %p %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 false) store <8 x i8> %tmp, <8 x i8>* %p @@ -63,13 +101,15 @@ define void @test_v8i8(<8 x i8>* %p) { define void @test_v16i8(<16 x i8>* %p) { ; CHECK-LABEL: test_v16i8: -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vmov.i8 [[Q2:q[0-9]+]], #0x1 -; CHECK: vneg.s8 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q1]], [[Q1]], [[Q3]] -; CHECK: vsub.i8 [[Q1]], [[Q1]], [[Q2]] -; CHECK: vcnt.8 [[Q1]], [[Q1]] -; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmov.i8 q10, #0x1 +; CHECK-NEXT: vneg.s8 q9, q8 +; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vsub.i8 q8, q8, q10 +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <16 x i8>, <16 x i8>* %p %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false) store <16 x i8> %tmp, <16 x i8>* %p @@ -78,6 +118,13 @@ define void @test_v16i8(<16 x i8>* %p) { define void @test_v1i16(<1 x i16>* %p) { ; CHECK-LABEL: test_v1i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrh r1, [r0] +; CHECK-NEXT: orr r1, r1, #65536 +; CHECK-NEXT: rbit r1, r1 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i16>, <1 x i16>* %p %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 false) store <1 x i16> %tmp, <1 x i16>* %p @@ -86,6 +133,18 @@ define void @test_v1i16(<1 x i16>* %p) { define void @test_v2i16(<2 x i16>* %p) { ; CHECK-LABEL: test_v2i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vorr.i32 d16, #0x10000 +; CHECK-NEXT: vneg.s32 d18, d16 +; CHECK-NEXT: vand d16, d16, d18 +; CHECK-NEXT: vmov.i32 d17, #0x1f +; CHECK-NEXT: vclz.i32 d16, d16 +; CHECK-NEXT: vsub.i32 d16, d17, d16 +; CHECK-NEXT: vuzp.16 d16, d17 +; CHECK-NEXT: vst1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: bx lr %a = load <2 x i16>, <2 x i16>* %p %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 false) store <2 x i16> %tmp, <2 x i16>* %p @@ -94,14 +153,16 @@ define void @test_v2i16(<2 x i16>* %p) { define void @test_v4i16(<4 x i16>* %p) { ; CHECK-LABEL: test_v4i16: -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vmov.i16 [[D2:d[0-9]+]], #0x1 -; CHECK: vneg.s16 [[D3:d[0-9]+]], [[D1]] -; CHECK: vand [[D1]], [[D1]], [[D3]] -; CHECK: vsub.i16 [[D1]], [[D1]], [[D2]] -; CHECK: vcnt.8 [[D1]], [[D1]] -; CHECK: vpaddl.u8 [[D1]], [[D1]] -; CHECK: vstr [[D1]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vmov.i16 d18, #0x1 +; CHECK-NEXT: vneg.s16 d17, d16 +; CHECK-NEXT: vand d16, d16, d17 +; CHECK-NEXT: vsub.i16 d16, d16, d18 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <4 x i16>, <4 x i16>* %p %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 false) store <4 x i16> %tmp, <4 x i16>* %p @@ -110,14 +171,16 @@ define void @test_v4i16(<4 x i16>* %p) { define void @test_v8i16(<8 x i16>* %p) { ; CHECK-LABEL: test_v8i16: -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vmov.i16 [[Q2:q[0-9]+]], #0x1 -; CHECK: vneg.s16 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q1]], [[Q1]], [[Q3]] -; CHECK: vsub.i16 [[Q1]], [[Q1]], [[Q2]] -; CHECK: vcnt.8 [[Q1]], [[Q1]] -; CHECK: vpaddl.u8 [[Q1]], [[Q1]] -; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmov.i16 q10, #0x1 +; CHECK-NEXT: vneg.s16 q9, q8 +; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vsub.i16 q8, q8, q10 +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vpaddl.u8 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <8 x i16>, <8 x i16>* %p %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false) store <8 x i16> %tmp, <8 x i16>* %p @@ -126,6 +189,12 @@ define void @test_v8i16(<8 x i16>* %p) { define void @test_v1i32(<1 x i32>* %p) { ; CHECK-LABEL: test_v1i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: rbit r1, r1 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i32>, <1 x i32>* %p %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 false) store <1 x i32> %tmp, <1 x i32>* %p @@ -134,15 +203,17 @@ define void @test_v1i32(<1 x i32>* %p) { define void @test_v2i32(<2 x i32>* %p) { ; CHECK-LABEL: test_v2i32: -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x1 -; CHECK: vneg.s32 [[D3:d[0-9]+]], [[D1]] -; CHECK: vand [[D1]], [[D1]], [[D3]] -; CHECK: vsub.i32 [[D1]], [[D1]], [[D2]] -; CHECK: vcnt.8 [[D1]], [[D1]] -; CHECK: vpaddl.u8 [[D1]], [[D1]] -; CHECK: vpaddl.u16 [[D1]], [[D1]] -; CHECK: vstr [[D1]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vmov.i32 d18, #0x1 +; CHECK-NEXT: vneg.s32 d17, d16 +; CHECK-NEXT: vand d16, d16, d17 +; CHECK-NEXT: vsub.i32 d16, d16, d18 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 +; CHECK-NEXT: vpaddl.u16 d16, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <2 x i32>, <2 x i32>* %p %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false) store <2 x i32> %tmp, <2 x i32>* %p @@ -151,15 +222,17 @@ define void @test_v2i32(<2 x i32>* %p) { define void @test_v4i32(<4 x i32>* %p) { ; CHECK-LABEL: test_v4i32: -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x1 -; CHECK: vneg.s32 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q1]], [[Q1]], [[Q3]] -; CHECK: vsub.i32 [[Q1]], [[Q1]], [[Q2]] -; CHECK: vcnt.8 [[Q1]], [[Q1]] -; CHECK: vpaddl.u8 [[Q1]], [[Q1]] -; CHECK: vpaddl.u16 [[Q1]], [[Q1]] -; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmov.i32 q10, #0x1 +; CHECK-NEXT: vneg.s32 q9, q8 +; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vsub.i32 q8, q8, q10 +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vpaddl.u8 q8, q8 +; CHECK-NEXT: vpaddl.u16 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <4 x i32>, <4 x i32>* %p %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false) store <4 x i32> %tmp, <4 x i32>* %p @@ -168,17 +241,19 @@ define void @test_v4i32(<4 x i32>* %p) { define void @test_v1i64(<1 x i64>* %p) { ; CHECK-LABEL: test_v1i64: -; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x0 -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vmov.i64 [[D3:d[0-9]+]], #0xffffffffffffffff -; CHECK: vsub.i64 [[D2]], [[D2]], [[D1]] -; CHECK: vand [[D2]], [[D1]], [[D2]] -; CHECK: vadd.i64 [[D2]], [[D2]], [[D3]] -; CHECK: vcnt.8 [[D2]], [[D2]] -; CHECK: vpaddl.u8 [[D2]], [[D2]] -; CHECK: vpaddl.u16 [[D2]], [[D2]] -; CHECK: vpaddl.u32 [[D2]], [[D2]] -; CHECK: vstr [[D2]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i32 d16, #0x0 +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vmov.i64 d18, #0xffffffffffffffff +; CHECK-NEXT: vsub.i64 d16, d16, d17 +; CHECK-NEXT: vand d16, d17, d16 +; CHECK-NEXT: vadd.i64 d16, d16, d18 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 +; CHECK-NEXT: vpaddl.u16 d16, d16 +; CHECK-NEXT: vpaddl.u32 d16, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i64>, <1 x i64>* %p %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 false) store <1 x i64> %tmp, <1 x i64>* %p @@ -187,17 +262,19 @@ define void @test_v1i64(<1 x i64>* %p) { define void @test_v2i64(<2 x i64>* %p) { ; CHECK-LABEL: test_v2i64: -; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x0 -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vmov.i64 [[Q3:q[0-9]+]], #0xffffffffffffffff -; CHECK: vsub.i64 [[Q2]], [[Q2]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q2]], [[Q1]], [[Q2]] -; CHECK: vadd.i64 [[Q2]], [[Q2]], [[Q3]] -; CHECK: vcnt.8 [[Q2]], [[Q2]] -; CHECK: vpaddl.u8 [[Q2]], [[Q2]] -; CHECK: vpaddl.u16 [[Q2]], [[Q2]] -; CHECK: vpaddl.u32 [[Q2]], [[Q2]] -; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i32 q8, #0x0 +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vmov.i64 q10, #0xffffffffffffffff +; CHECK-NEXT: vsub.i64 q8, q8, q9 +; CHECK-NEXT: vand q8, q9, q8 +; CHECK-NEXT: vadd.i64 q8, q8, q10 +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vpaddl.u8 q8, q8 +; CHECK-NEXT: vpaddl.u16 q8, q8 +; CHECK-NEXT: vpaddl.u32 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <2 x i64>, <2 x i64>* %p %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false) store <2 x i64> %tmp, <2 x i64>* %p @@ -207,7 +284,13 @@ define void @test_v2i64(<2 x i64>* %p) { ;------------------------------------------------------------------------------ define void @test_v1i8_zero_undef(<1 x i8>* %p) { -; CHECK-LABEL: test_v1i8_zero_undef +; CHECK-LABEL: test_v1i8_zero_undef: +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: rbit r1, r1 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i8>, <1 x i8>* %p %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 true) store <1 x i8> %tmp, <1 x i8>* %p @@ -216,6 +299,20 @@ define void @test_v1i8_zero_undef(<1 x i8>* %p) { define void @test_v2i8_zero_undef(<2 x i8>* %p) { ; CHECK-LABEL: test_v2i8_zero_undef: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.16 {d16[0]}, [r0:16] +; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vneg.s32 d18, d16 +; CHECK-NEXT: vand d16, d16, d18 +; CHECK-NEXT: vmov.i32 d17, #0x1f +; CHECK-NEXT: vclz.i32 d16, d16 +; CHECK-NEXT: vsub.i32 d16, d17, d16 +; CHECK-NEXT: vmov.32 r1, d16[1] +; CHECK-NEXT: vmov.32 r2, d16[0] +; CHECK-NEXT: strb r1, [r0, #1] +; CHECK-NEXT: strb r2, [r0] +; CHECK-NEXT: bx lr %a = load <2 x i8>, <2 x i8>* %p %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true) store <2 x i8> %tmp, <2 x i8>* %p @@ -224,6 +321,17 @@ define void @test_v2i8_zero_undef(<2 x i8>* %p) { define void @test_v4i8_zero_undef(<4 x i8>* %p) { ; CHECK-LABEL: test_v4i8_zero_undef: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vneg.s16 d18, d16 +; CHECK-NEXT: vand d16, d16, d18 +; CHECK-NEXT: vmov.i16 d17, #0xf +; CHECK-NEXT: vclz.i16 d16, d16 +; CHECK-NEXT: vsub.i16 d16, d17, d16 +; CHECK-NEXT: vuzp.8 d16, d17 +; CHECK-NEXT: vst1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: bx lr %a = load <4 x i8>, <4 x i8>* %p %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true) store <4 x i8> %tmp, <4 x i8>* %p @@ -232,13 +340,15 @@ define void @test_v4i8_zero_undef(<4 x i8>* %p) { define void @test_v8i8_zero_undef(<8 x i8>* %p) { ; CHECK-LABEL: test_v8i8_zero_undef: -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vmov.i8 [[D2:d[0-9]+]], #0x1 -; CHECK: vneg.s8 [[D3:d[0-9]+]], [[D1]] -; CHECK: vand [[D1]], [[D1]], [[D3]] -; CHECK: vsub.i8 [[D1]], [[D1]], [[D2]] -; CHECK: vcnt.8 [[D1]], [[D1]] -; CHECK: vstr [[D1]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vmov.i8 d18, #0x1 +; CHECK-NEXT: vneg.s8 d17, d16 +; CHECK-NEXT: vand d16, d16, d17 +; CHECK-NEXT: vsub.i8 d16, d16, d18 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <8 x i8>, <8 x i8>* %p %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true) store <8 x i8> %tmp, <8 x i8>* %p @@ -247,13 +357,15 @@ define void @test_v8i8_zero_undef(<8 x i8>* %p) { define void @test_v16i8_zero_undef(<16 x i8>* %p) { ; CHECK-LABEL: test_v16i8_zero_undef: -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vmov.i8 [[Q2:q[0-9]+]], #0x1 -; CHECK: vneg.s8 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q1]], [[Q1]], [[Q3]] -; CHECK: vsub.i8 [[Q1]], [[Q1]], [[Q2]] -; CHECK: vcnt.8 [[Q1]], [[Q1]] -; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmov.i8 q10, #0x1 +; CHECK-NEXT: vneg.s8 q9, q8 +; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vsub.i8 q8, q8, q10 +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <16 x i8>, <16 x i8>* %p %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true) store <16 x i8> %tmp, <16 x i8>* %p @@ -262,6 +374,12 @@ define void @test_v16i8_zero_undef(<16 x i8>* %p) { define void @test_v1i16_zero_undef(<1 x i16>* %p) { ; CHECK-LABEL: test_v1i16_zero_undef: +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrh r1, [r0] +; CHECK-NEXT: rbit r1, r1 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i16>, <1 x i16>* %p %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 true) store <1 x i16> %tmp, <1 x i16>* %p @@ -270,6 +388,17 @@ define void @test_v1i16_zero_undef(<1 x i16>* %p) { define void @test_v2i16_zero_undef(<2 x i16>* %p) { ; CHECK-LABEL: test_v2i16_zero_undef: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vneg.s32 d18, d16 +; CHECK-NEXT: vand d16, d16, d18 +; CHECK-NEXT: vmov.i32 d17, #0x1f +; CHECK-NEXT: vclz.i32 d16, d16 +; CHECK-NEXT: vsub.i32 d16, d17, d16 +; CHECK-NEXT: vuzp.16 d16, d17 +; CHECK-NEXT: vst1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: bx lr %a = load <2 x i16>, <2 x i16>* %p %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true) store <2 x i16> %tmp, <2 x i16>* %p @@ -278,13 +407,15 @@ define void @test_v2i16_zero_undef(<2 x i16>* %p) { define void @test_v4i16_zero_undef(<4 x i16>* %p) { ; CHECK-LABEL: test_v4i16_zero_undef: -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vneg.s16 [[D2:d[0-9]+]], [[D1]] -; CHECK: vand [[D1]], [[D1]], [[D2]] -; CHECK: vmov.i16 [[D3:d[0-9]+]], #0xf -; CHECK: vclz.i16 [[D1]], [[D1]] -; CHECK: vsub.i16 [[D1]], [[D3]], [[D1]] -; CHECK: vstr [[D1]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vneg.s16 d17, d16 +; CHECK-NEXT: vand d16, d16, d17 +; CHECK-NEXT: vmov.i16 d17, #0xf +; CHECK-NEXT: vclz.i16 d16, d16 +; CHECK-NEXT: vsub.i16 d16, d17, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <4 x i16>, <4 x i16>* %p %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true) store <4 x i16> %tmp, <4 x i16>* %p @@ -293,13 +424,15 @@ define void @test_v4i16_zero_undef(<4 x i16>* %p) { define void @test_v8i16_zero_undef(<8 x i16>* %p) { ; CHECK-LABEL: test_v8i16_zero_undef: -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vneg.s16 [[Q2:q[0-9]+]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q1]], [[Q1]], [[Q2]] -; CHECK: vmov.i16 [[Q3:q[0-9]+]], #0xf -; CHECK: vclz.i16 [[Q1]], [[Q1]] -; CHECK: vsub.i16 [[Q1]], [[Q3]], [[Q1]] -; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vneg.s16 q9, q8 +; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vmov.i16 q9, #0xf +; CHECK-NEXT: vclz.i16 q8, q8 +; CHECK-NEXT: vsub.i16 q8, q9, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <8 x i16>, <8 x i16>* %p %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true) store <8 x i16> %tmp, <8 x i16>* %p @@ -308,6 +441,12 @@ define void @test_v8i16_zero_undef(<8 x i16>* %p) { define void @test_v1i32_zero_undef(<1 x i32>* %p) { ; CHECK-LABEL: test_v1i32_zero_undef: +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: rbit r1, r1 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i32>, <1 x i32>* %p %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 true) store <1 x i32> %tmp, <1 x i32>* %p @@ -316,13 +455,15 @@ define void @test_v1i32_zero_undef(<1 x i32>* %p) { define void @test_v2i32_zero_undef(<2 x i32>* %p) { ; CHECK-LABEL: test_v2i32_zero_undef: -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vneg.s32 [[D2:d[0-9]+]], [[D1]] -; CHECK: vand [[D1]], [[D1]], [[D2]] -; CHECK: vmov.i32 [[D3:d[0-9]+]], #0x1f -; CHECK: vclz.i32 [[D1]], [[D1]] -; CHECK: vsub.i32 [[D1]], [[D3]], [[D1]] -; CHECK: vstr [[D1]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vneg.s32 d17, d16 +; CHECK-NEXT: vand d16, d16, d17 +; CHECK-NEXT: vmov.i32 d17, #0x1f +; CHECK-NEXT: vclz.i32 d16, d16 +; CHECK-NEXT: vsub.i32 d16, d17, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <2 x i32>, <2 x i32>* %p %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true) store <2 x i32> %tmp, <2 x i32>* %p @@ -331,13 +472,15 @@ define void @test_v2i32_zero_undef(<2 x i32>* %p) { define void @test_v4i32_zero_undef(<4 x i32>* %p) { ; CHECK-LABEL: test_v4i32_zero_undef: -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vneg.s32 [[Q2:q[0-9]+]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q1]], [[Q1]], [[Q2]] -; CHECK: vmov.i32 [[Q3:q[0-9]+]], #0x1f -; CHECK: vclz.i32 [[Q1]], [[Q1]] -; CHECK: vsub.i32 [[Q1]], [[Q3]], [[Q1]] -; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vneg.s32 q9, q8 +; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vmov.i32 q9, #0x1f +; CHECK-NEXT: vclz.i32 q8, q8 +; CHECK-NEXT: vsub.i32 q8, q9, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <4 x i32>, <4 x i32>* %p %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true) store <4 x i32> %tmp, <4 x i32>* %p @@ -346,17 +489,19 @@ define void @test_v4i32_zero_undef(<4 x i32>* %p) { define void @test_v1i64_zero_undef(<1 x i64>* %p) { ; CHECK-LABEL: test_v1i64_zero_undef: -; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x0 -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vmov.i64 [[D3:d[0-9]+]], #0xffffffffffffffff -; CHECK: vsub.i64 [[D2]], [[D2]], [[D1]] -; CHECK: vand [[D2]], [[D1]], [[D2]] -; CHECK: vadd.i64 [[D2]], [[D2]], [[D3]] -; CHECK: vcnt.8 [[D2]], [[D2]] -; CHECK: vpaddl.u8 [[D2]], [[D2]] -; CHECK: vpaddl.u16 [[D2]], [[D2]] -; CHECK: vpaddl.u32 [[D2]], [[D2]] -; CHECK: vstr [[D2]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i32 d16, #0x0 +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vmov.i64 d18, #0xffffffffffffffff +; CHECK-NEXT: vsub.i64 d16, d16, d17 +; CHECK-NEXT: vand d16, d17, d16 +; CHECK-NEXT: vadd.i64 d16, d16, d18 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 +; CHECK-NEXT: vpaddl.u16 d16, d16 +; CHECK-NEXT: vpaddl.u32 d16, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i64>, <1 x i64>* %p %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 true) store <1 x i64> %tmp, <1 x i64>* %p @@ -365,17 +510,19 @@ define void @test_v1i64_zero_undef(<1 x i64>* %p) { define void @test_v2i64_zero_undef(<2 x i64>* %p) { ; CHECK-LABEL: test_v2i64_zero_undef: -; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x0 -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vmov.i64 [[Q3:q[0-9]+]], #0xffffffffffffffff -; CHECK: vsub.i64 [[Q2]], [[Q2]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q2]], [[Q1]], [[Q2]] -; CHECK: vadd.i64 [[Q2]], [[Q2]], [[Q3]] -; CHECK: vcnt.8 [[Q2]], [[Q2]] -; CHECK: vpaddl.u8 [[Q2]], [[Q2]] -; CHECK: vpaddl.u16 [[Q2]], [[Q2]] -; CHECK: vpaddl.u32 [[Q2]], [[Q2]] -; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i32 q8, #0x0 +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vmov.i64 q10, #0xffffffffffffffff +; CHECK-NEXT: vsub.i64 q8, q8, q9 +; CHECK-NEXT: vand q8, q9, q8 +; CHECK-NEXT: vadd.i64 q8, q8, q10 +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vpaddl.u8 q8, q8 +; CHECK-NEXT: vpaddl.u16 q8, q8 +; CHECK-NEXT: vpaddl.u32 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <2 x i64>, <2 x i64>* %p %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true) store <2 x i64> %tmp, <2 x i64>* %p