From 9b302138280e636ed8db27055739d20b1feee7a3 Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Fri, 3 Aug 2018 09:12:56 +0000 Subject: [PATCH] [ARM] FP16: support VFMA This is addressing PR38404. llvm-svn: 338830 --- llvm/lib/Target/ARM/ARMInstrNEON.td | 6 ++ .../ARM/armv8.2a-fp16-vector-intrinsics.ll | 66 +++++++++++-------- 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index 4525eec8da03..cff58f37ae1d 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -4734,6 +4734,12 @@ def VFMShq : N3VQMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACQ, "vfms", "f16", Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; // Match @llvm.fma.* intrinsics +def : Pat<(v4f16 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), + (VFMAhd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasNEON,HasFullFP16]>; +def : Pat<(v8f16 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)), + (VFMAhq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasNEON,HasFullFP16]>; def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, Requires<[HasVFP4]>; diff --git a/llvm/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll b/llvm/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll index a4703cf1dff3..8bc3f496df90 100644 --- a/llvm/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll +++ b/llvm/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll @@ -911,34 +911,48 @@ entry: ret <8 x half> %sub.i } +define dso_local <4 x half> @test_vfma_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) { +; CHECK-LABEL: test_vfma_f16: +; CHECK: vfma.f16 d0, d1, d2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a) + ret <4 x half> %0 +} + +define dso_local <8 x half> @test_vfmaq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { +; CHECK-LABEL: test_vfmaq_f16: +; CHECK: vfma.f16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <8 x half> 
@llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a)
 ret <8 x half> %0
}

define dso_local <4 x half> @test_vfms_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
; CHECK-LABEL: test_vfms_f16:
; CHECK: vneg.f16 [[D16:d[0-9]+]], d1
; CHECK-NEXT: vfma.f16 d0, [[D16]], d2
; CHECK-NEXT: bx lr
entry:
 %sub.i = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
 %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %sub.i, <4 x half> %c, <4 x half> %a)
 ret <4 x half> %0
}

define dso_local <8 x half> @test_vfmsq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_vfmsq_f16:
; CHECK: vneg.f16 [[Q8:q[0-9]+]], q1
; CHECK-NEXT: vfma.f16 q0, [[Q8]], q2
; CHECK-NEXT: bx lr
entry:
 %sub.i = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
 %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %sub.i, <8 x half> %c, <8 x half> %a)
 ret <8 x half> %0
}

; FIXME (PR38404)
;
;define dso_local <4 x half> @test_vfma_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
;entry:
; %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a)
; ret <4 x half> %0
;}

;define dso_local <8 x half> @test_vfmaq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
;entry:
; %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a)
; ret <8 x half> %0
;}

;define dso_local <4 x half> @test_vfms_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
;entry:
; %sub.i = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
; %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %sub.i, <4 x half> %c, <4 x half> %a)
; ret <4 x half> %0
;}

;define dso_local <8 x half> @test_vfmsq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
;entry:
; %sub.i = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
; %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %sub.i, <8 x half> %c, <8 x half> %a)
; ret <8 x half> %0
;}

;define dso_local <4 x half> @test_vmul_lane_f16(<4 x half> %a, <4 x half> %b) {
;entry:
; %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32>