From d62c5ec2fe374dc00bcd315abbee9839b80457b2 Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Fri, 3 Aug 2018 09:24:29 +0000 Subject: [PATCH] [ARM] FP16: support vector zip and unzip This is addressing PR38404. Differential Revision: https://reviews.llvm.org/D50186 llvm-svn: 338835 --- llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 4 + .../ARM/armv8.2a-fp16-vector-intrinsics.ll | 84 +++++++++++-------- 2 files changed, 52 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 9592dd53c347..2f7a0e3c4f34 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -3030,11 +3030,13 @@ void ARMDAGToDAGISel::Select(SDNode *N) { switch (VT.getSimpleVT().SimpleTy) { default: return; case MVT::v8i8: Opc = ARM::VZIPd8; break; + case MVT::v4f16: case MVT::v4i16: Opc = ARM::VZIPd16; break; case MVT::v2f32: // vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. case MVT::v2i32: Opc = ARM::VTRNd32; break; case MVT::v16i8: Opc = ARM::VZIPq8; break; + case MVT::v8f16: case MVT::v8i16: Opc = ARM::VZIPq16; break; case MVT::v4f32: case MVT::v4i32: Opc = ARM::VZIPq32; break; @@ -3051,11 +3053,13 @@ void ARMDAGToDAGISel::Select(SDNode *N) { switch (VT.getSimpleVT().SimpleTy) { default: return; case MVT::v8i8: Opc = ARM::VUZPd8; break; + case MVT::v4f16: case MVT::v4i16: Opc = ARM::VUZPd16; break; case MVT::v2f32: // vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. 
case MVT::v2i32: Opc = ARM::VTRNd32; break; case MVT::v16i8: Opc = ARM::VUZPq8; break; + case MVT::v8f16: case MVT::v8i16: Opc = ARM::VUZPq16; break; case MVT::v4f32: case MVT::v4i32: Opc = ARM::VUZPq32; break; diff --git a/llvm/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll b/llvm/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll index 8bc3f496df90..2b532a82b0ee 100644 --- a/llvm/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll +++ b/llvm/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll @@ -1015,44 +1015,56 @@ entry: ret <8 x half> %3 } +define dso_local %struct.float16x4x2_t @test_vzip_f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: test_vzip_f16: +; CHECK: vzip.16 d0, d1 +; CHECK-NEXT: bx lr +entry: + %vzip.i = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + %vzip1.i = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x half> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x half> %vzip1.i, 0, 1 + ret %struct.float16x4x2_t %.fca.0.1.insert +} + +define dso_local %struct.float16x8x2_t @test_vzipq_f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: test_vzipq_f16: +; CHECK: vzip.16 q0, q1 +; CHECK-NEXT: bx lr +entry: + %vzip.i = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + %vzip1.i = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x half> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x half> %vzip1.i, 0, 1 + ret %struct.float16x8x2_t %.fca.0.1.insert +} + +define dso_local %struct.float16x4x2_t @test_vuzp_f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: test_vuzp_f16: +; CHECK: vuzp.16 d0, d1 +; CHECK-NEXT: bx lr +entry: + %vuzp.i = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vuzp1.i = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %.fca.0.0.insert = insertvalue 
%struct.float16x4x2_t undef, <4 x half> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x half> %vuzp1.i, 0, 1 + ret %struct.float16x4x2_t %.fca.0.1.insert +} + +define dso_local %struct.float16x8x2_t @test_vuzpq_f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: test_vuzpq_f16: +; CHECK: vuzp.16 q0, q1 +; CHECK-NEXT: bx lr +entry: + %vuzp.i = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vuzp1.i = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x half> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x half> %vuzp1.i, 0, 1 + ret %struct.float16x8x2_t %.fca.0.1.insert +} + ; FIXME (PR38404) ; -;define dso_local %struct.float16x4x2_t @test_vzip_f16(<4 x half> %a, <4 x half> %b) { -;entry: -; %vzip.i = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> -; %vzip1.i = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> -; %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x half> %vzip.i, 0, 0 -; %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x half> %vzip1.i, 0, 1 -; ret %struct.float16x4x2_t %.fca.0.1.insert -;} -; -;define dso_local %struct.float16x8x2_t @test_vzipq_f16(<8 x half> %a, <8 x half> %b) { -;entry: -; %vzip.i = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> -; %vzip1.i = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> -; %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x half> %vzip.i, 0, 0 -; %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x half> %vzip1.i, 0, 1 -; ret %struct.float16x8x2_t %.fca.0.1.insert -;} -; -;define dso_local %struct.float16x4x2_t @test_vuzp_f16(<4 x half> %a, <4 x half> %b) { -;entry: -; %vuzp.i = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> -; %vuzp1.i = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> -; %.fca.0.0.insert = insertvalue 
%struct.float16x4x2_t undef, <4 x half> %vuzp.i, 0, 0 -; %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x half> %vuzp1.i, 0, 1 -; ret %struct.float16x4x2_t %.fca.0.1.insert -;} -; -;define dso_local %struct.float16x8x2_t @test_vuzpq_f16(<8 x half> %a, <8 x half> %b) { -;entry: -; %vuzp.i = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> -; %vuzp1.i = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> -; %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x half> %vuzp.i, 0, 0 -; %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x half> %vuzp1.i, 0, 1 -; ret %struct.float16x8x2_t %.fca.0.1.insert -;} -; ;define dso_local %struct.float16x4x2_t @test_vtrn_f16(<4 x half> %a, <4 x half> %b) { ;entry: ; %vtrn.i = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32>