Add NEON f16 multiply-by-scalar selection patterns (VMULslhd / VMULslhq) for
fmul with a NEONvduplane or NEONvdup operand, and enable the vmul lane/n f16
tests that were previously commented out under "FIXME (PR38404)".

NOTE(review): this patch text was recovered from a whitespace-collapsed copy.
Indentation, the position of blank context lines, and the lane-3 shufflevector
masks (inferred from the d1[3] / d2[3] CHECK lines) were reconstructed and must
be confirmed against the tree; `git apply --ignore-whitespace` may be needed.

diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index 5f7d22d3b1fe..9ef108d24527 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -4305,17 +4305,29 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
                            (v2f32 (EXTRACT_SUBREG QPR:$src2,
                                    (DSubReg_i32_reg imm:$lane))),
                            (SubReg_i32_lane imm:$lane)))>;
-
+def : Pat<(v8f16 (fmul (v8f16 QPR:$src1),
+                       (v8f16 (NEONvduplane (v8f16 QPR:$src2), imm:$lane)))),
+          (v8f16 (VMULslhq(v8f16 QPR:$src1),
+                           (v4f16 (EXTRACT_SUBREG QPR:$src2,
+                                   (DSubReg_i16_reg imm:$lane))),
+                           (SubReg_i16_lane imm:$lane)))>;
 
 def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
           (VMULslfd DPR:$Rn,
             (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
             (i32 0))>;
+def : Pat<(v4f16 (fmul DPR:$Rn, (NEONvdup (f16 HPR:$Rm)))),
+          (VMULslhd DPR:$Rn,
+            (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
+            (i32 0))>;
 def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
           (VMULslfq QPR:$Rn,
             (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
             (i32 0))>;
-
+def : Pat<(v8f16 (fmul QPR:$Rn, (NEONvdup (f16 HPR:$Rm)))),
+          (VMULslhq QPR:$Rn,
+            (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
+            (i32 0))>;
 
 //   VQDMULH  : Vector Saturating Doubling Multiply Returning High Half
 defm VQDMULH  : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
diff --git a/llvm/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll b/llvm/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll
index 496cabc77960..c8cf71aeaf3f 100644
--- a/llvm/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll
+++ b/llvm/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll
@@ -979,43 +979,53 @@ entry:
   ret <8 x half> %0
 }
 
-; FIXME (PR38404)
-;
-;define dso_local <4 x half> @test_vmul_lane_f16(<4 x half> %a, <4 x half> %b) {
-;entry:
-;  %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-;  %mul = fmul <4 x half> %shuffle, %a
-;  ret <4 x half> %mul
-;}
+define dso_local <4 x half> @test_vmul_lane_f16(<4 x half> %a, <4 x half> %b) {
+; CHECK-LABEL: test_vmul_lane_f16:
+; CHECK:       vmul.f16 d0, d0, d1[3]
+; CHECK-NEXT:  bx lr
+entry:
+  %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul = fmul <4 x half> %shuffle, %a
+  ret <4 x half> %mul
+}
 
-;define dso_local <8 x half> @test_vmulq_lane_f16(<8 x half> %a, <4 x half> %b) {
-;entry:
-;  %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-;  %mul = fmul <8 x half> %shuffle, %a
-;  ret <8 x half> %mul
-;}
+define dso_local <8 x half> @test_vmulq_lane_f16(<8 x half> %a, <4 x half> %b) {
+; CHECK-LABEL: test_vmulq_lane_f16:
+; CHECK:       vmul.f16 q0, q0, d2[3]
+; CHECK-NEXT:  bx lr
+entry:
+  %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %mul = fmul <8 x half> %shuffle, %a
+  ret <8 x half> %mul
+}
 
-;define dso_local <4 x half> @test_vmul_n_f16(<4 x half> %a, float %b.coerce) {
-;entry:
-;  %0 = bitcast float %b.coerce to i32
-;  %tmp.0.extract.trunc = trunc i32 %0 to i16
-;  %1 = bitcast i16 %tmp.0.extract.trunc to half
-;  %vecinit = insertelement <4 x half> undef, half %1, i32 0
-;  %vecinit4 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
-;  %mul = fmul <4 x half> %vecinit4, %a
-;  ret <4 x half> %mul
-;}
+define dso_local <4 x half> @test_vmul_n_f16(<4 x half> %a, float %b.coerce) {
+; CHECK-LABEL: test_vmul_n_f16:
+; CHECK:       vmul.f16 d0, d0, d1[0]
+; CHECK-NEXT:  bx lr
+entry:
+  %0 = bitcast float %b.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %vecinit = insertelement <4 x half> undef, half %1, i32 0
+  %vecinit4 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
+  %mul = fmul <4 x half> %vecinit4, %a
+  ret <4 x half> %mul
+}
 
-;define dso_local <8 x half> @test_vmulq_n_f16(<8 x half> %a, float %b.coerce) {
-;entry:
-;  %0 = bitcast float %b.coerce to i32
-;  %tmp.0.extract.trunc = trunc i32 %0 to i16
-;  %1 = bitcast i16 %tmp.0.extract.trunc to half
-;  %vecinit = insertelement <8 x half> undef, half %1, i32 0
-;  %vecinit8 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
-;  %mul = fmul <8 x half> %vecinit8, %a
-;  ret <8 x half> %mul
-;}
+define dso_local <8 x half> @test_vmulq_n_f16(<8 x half> %a, float %b.coerce) {
+; CHECK-LABEL: test_vmulq_n_f16:
+; CHECK:       vmul.f16 q0, q0, d2[0]
+; CHECK-NEXT:  bx lr
+entry:
+  %0 = bitcast float %b.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %vecinit = insertelement <8 x half> undef, half %1, i32 0
+  %vecinit8 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
+  %mul = fmul <8 x half> %vecinit8, %a
+  ret <8 x half> %mul
+}
 
 define dso_local <4 x half> @test_vbsl_f16(<4 x i16> %a, <4 x half> %b, <4 x half> %c) {
 ; CHECKLABEL: test_vbsl_f16: