forked from OSchip/llvm-project
ARM: add patterns for vqdmlal with separate vqdmull and vqadds
The vqdmlal and vqdmlsl instructions are really just a fused pair consisting of a vqdmull.sN followed by a vqadd.sN (for vqdmlal) or a vqsub.sN (for vqdmlsl). This adds patterns to LLVM so that we can switch Clang's CodeGen over to generating these separate operations instead of the special vqdmlal/vqdmlsl intrinsics. llvm-svn: 189480
This commit is contained in:
parent
dd1d1b2c79
commit
8854ba7837
|
@ -4143,6 +4143,25 @@ defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
|
|||
"vqdmlal", "s", int_arm_neon_vqdmlal>;
|
||||
// Instantiates the N3VLInt3SL_HS multiclass with mnemonic "vqdmlal", the
// string "s", and the int_arm_neon_vqdmlal intrinsic.
// NOTE(review): presumably this yields the by-lane records (VQDMLALslv4i16,
// VQDMLALslv2i32) used by the lane patterns in this file; the multiclass body
// is not visible here, so confirm.
defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>;
|
||||
|
||||
// Selection patterns that fold an int_arm_neon_vqadds whose second operand is
// the result of int_arm_neon_vqdmull into a single VQDMLAL instruction.
// $src1 is the Q-register accumulator; $Vn/$Vm are the D-register multiplicands.
// NOTE(review): only the form with the accumulator as the FIRST vqadds operand
// is written out; whether the commuted form also matches depends on how the
// intrinsic is declared (not visible here) -- confirm.
//
// v4i16 x v4i16 -> v4i32, register form.
def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
|
||||
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
|
||||
(v4i16 DPR:$Vm))))),
|
||||
(VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
|
||||
// v2i32 x v2i32 -> v2i64, register form.
def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
|
||||
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
|
||||
(v2i32 DPR:$Vm))))),
|
||||
(VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
|
||||
// v4i16 lane form: the second multiplicand is a NEONvduplane of $Vm at
// imm:$lane. The un-duplicated register and the lane index are passed to the
// "sl" instruction instead. $Vm is restricted to the DPR_8 register class.
def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
|
||||
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
|
||||
(v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
|
||||
imm:$lane)))))),
|
||||
(VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
|
||||
// v2i32 lane form: same shape as above, with $Vm restricted to DPR_VFP2.
def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
|
||||
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
|
||||
(v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
|
||||
imm:$lane)))))),
|
||||
(VQDMLALslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
|
||||
|
||||
// VMLS : Vector Multiply Subtract (integer and floating-point)
// This defm passes the mnemonic "vmls", the string "i", and the `sub` operator
// to the N3VMulOp_QHS multiclass (multiclass body not visible here), together
// with the IIC_VMAC* itinerary classes for the 16/32-bit D and Q variants.
|
||||
defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
|
||||
IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
|
||||
|
@ -4200,6 +4219,25 @@ defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
|
|||
"vqdmlsl", "s", int_arm_neon_vqdmlsl>;
|
||||
// Instantiates the N3VLInt3SL_HS multiclass with mnemonic "vqdmlsl", the
// string "s", and the int_arm_neon_vqdmlsl intrinsic.
// NOTE(review): presumably this yields the by-lane records (VQDMLSLslv4i16,
// VQDMLSLslv2i32) used by the lane patterns in this file; the multiclass body
// is not visible here, so confirm.
defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
|
||||
|
||||
// Selection patterns that fold an int_arm_neon_vqsubs whose second operand is
// the result of int_arm_neon_vqdmull into a single VQDMLSL instruction.
// Operand order matters: $src1 (the Q-register accumulator) is the first
// vqsubs operand and the vqdmull product is the second, i.e. the product is
// what gets subtracted from the accumulator.
//
// v4i16 x v4i16 -> v4i32, register form.
def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
|
||||
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
|
||||
(v4i16 DPR:$Vm))))),
|
||||
(VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
|
||||
// v2i32 x v2i32 -> v2i64, register form.
def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
|
||||
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
|
||||
(v2i32 DPR:$Vm))))),
|
||||
(VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
|
||||
// v4i16 lane form: the second multiplicand is a NEONvduplane of $Vm at
// imm:$lane. The un-duplicated register and the lane index are passed to the
// "sl" instruction instead. $Vm is restricted to the DPR_8 register class.
def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
|
||||
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
|
||||
(v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
|
||||
imm:$lane)))))),
|
||||
(VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
|
||||
// v2i32 lane form: same shape as above, with $Vm restricted to DPR_VFP2.
def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
|
||||
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
|
||||
(v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
|
||||
imm:$lane)))))),
|
||||
(VQDMLSLslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
|
||||
|
||||
// Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
|
||||
def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
|
||||
v2f32, fmul_su, fadd_mlx>,
|
||||
|
|
|
@ -238,6 +238,51 @@ entry:
|
|||
declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
|
||||
declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
|
||||
|
||||
; Loads the <4 x i32> accumulator from %A and the two <4 x i16> multiplicands
; from %B and %C, then computes vqadds(acc, vqdmull(b, c)) as two separate
; intrinsic calls. The FileCheck directive below expects the pair to be
; selected as one vqdmlal.s16.
define <4 x i32> @vqdmlals16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
|
||||
;CHECK-LABEL: vqdmlals16_natural:
|
||||
;CHECK: vqdmlal.s16
|
||||
%tmp1 = load <4 x i32>* %A
|
||||
%tmp2 = load <4 x i16>* %B
|
||||
%tmp3 = load <4 x i16>* %C
|
||||
%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3)
|
||||
%tmp5 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
|
||||
ret <4 x i32> %tmp5
|
||||
}
|
||||
|
||||
; Loads the <2 x i64> accumulator from %A and the two <2 x i32> multiplicands
; from %B and %C, then computes vqadds(acc, vqdmull(b, c)) as two separate
; intrinsic calls. The FileCheck directive below expects the pair to be
; selected as one vqdmlal.s32.
define <2 x i64> @vqdmlals32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
|
||||
;CHECK-LABEL: vqdmlals32_natural:
|
||||
;CHECK: vqdmlal.s32
|
||||
%tmp1 = load <2 x i64>* %A
|
||||
%tmp2 = load <2 x i32>* %B
|
||||
%tmp3 = load <2 x i32>* %C
|
||||
%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3)
|
||||
%tmp5 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
|
||||
ret <2 x i64> %tmp5
|
||||
}
|
||||
|
||||
; Lane variant, 16-bit elements: element 1 of %arg2_int16x4_t is splatted with
; a shufflevector, multiplied against %arg1_int16x4_t via vqdmull, and the
; product is added to %arg0_int32x4_t via vqadds. The arguments arrive under
; the arm_aapcs_vfpcc convention, and the FileCheck directive below pins the
; exact operands of the expected by-lane instruction (q0, d2, d3[1]).
define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
|
||||
entry:
|
||||
; CHECK-LABEL: test_vqdmlal_lanes16_natural:
|
||||
; CHECK: vqdmlal.s16 q0, d2, d3[1]
|
||||
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
|
||||
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0)
|
||||
%2 = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
|
||||
ret <4 x i32> %2
|
||||
}
|
||||
|
||||
; Lane variant, 32-bit elements: element 1 of %arg2_int32x2_t is splatted with
; a shufflevector, multiplied against %arg1_int32x2_t via vqdmull, and the
; product is added to %arg0_int64x2_t via vqadds. The arguments arrive under
; the arm_aapcs_vfpcc convention, and the FileCheck directive below pins the
; exact operands of the expected by-lane instruction (q0, d2, d3[1]).
define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
|
||||
entry:
|
||||
; CHECK-LABEL: test_vqdmlal_lanes32_natural:
|
||||
; CHECK: vqdmlal.s32 q0, d2, d3[1]
|
||||
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
|
||||
%1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0)
|
||||
%2 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
|
||||
ret <2 x i64> %2
|
||||
}
|
||||
|
||||
declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
|
||||
|
||||
define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
|
||||
;CHECK-LABEL: vqdmlsls16:
|
||||
;CHECK: vqdmlsl.s16
|
||||
|
@ -278,3 +323,48 @@ entry:
|
|||
|
||||
declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
|
||||
declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
|
||||
|
||||
; Loads the <4 x i32> accumulator from %A and the two <4 x i16> multiplicands
; from %B and %C, then computes vqsubs(acc, vqdmull(b, c)) as two separate
; intrinsic calls. The FileCheck directive below expects the pair to be
; selected as one vqdmlsl.s16.
define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
|
||||
;CHECK-LABEL: vqdmlsls16_natural:
|
||||
;CHECK: vqdmlsl.s16
|
||||
%tmp1 = load <4 x i32>* %A
|
||||
%tmp2 = load <4 x i16>* %B
|
||||
%tmp3 = load <4 x i16>* %C
|
||||
%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3)
|
||||
%tmp5 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
|
||||
ret <4 x i32> %tmp5
|
||||
}
|
||||
|
||||
; Loads the <2 x i64> accumulator from %A and the two <2 x i32> multiplicands
; from %B and %C, then computes vqsubs(acc, vqdmull(b, c)) as two separate
; intrinsic calls. The FileCheck directive below expects the pair to be
; selected as one vqdmlsl.s32.
define <2 x i64> @vqdmlsls32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
|
||||
;CHECK-LABEL: vqdmlsls32_natural:
|
||||
;CHECK: vqdmlsl.s32
|
||||
%tmp1 = load <2 x i64>* %A
|
||||
%tmp2 = load <2 x i32>* %B
|
||||
%tmp3 = load <2 x i32>* %C
|
||||
%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3)
|
||||
%tmp5 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
|
||||
ret <2 x i64> %tmp5
|
||||
}
|
||||
|
||||
; Lane variant, 16-bit elements: element 1 of %arg2_int16x4_t is splatted with
; a shufflevector, multiplied against %arg1_int16x4_t via vqdmull, and the
; product is subtracted from %arg0_int32x4_t via vqsubs. The arguments arrive
; under the arm_aapcs_vfpcc convention, and the FileCheck directive below pins
; the exact operands of the expected by-lane instruction (q0, d2, d3[1]).
define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
|
||||
entry:
|
||||
; CHECK-LABEL: test_vqdmlsl_lanes16_natural:
|
||||
; CHECK: vqdmlsl.s16 q0, d2, d3[1]
|
||||
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
|
||||
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0)
|
||||
%2 = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
|
||||
ret <4 x i32> %2
|
||||
}
|
||||
|
||||
; Lane variant, 32-bit elements: element 1 of %arg2_int32x2_t is splatted with
; a shufflevector, multiplied against %arg1_int32x2_t via vqdmull, and the
; product is subtracted from %arg0_int64x2_t via vqsubs. The arguments arrive
; under the arm_aapcs_vfpcc convention, and the FileCheck directive below pins
; the exact operands of the expected by-lane instruction (q0, d2, d3[1]).
define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
|
||||
entry:
|
||||
; CHECK-LABEL: test_vqdmlsl_lanes32_natural:
|
||||
; CHECK: vqdmlsl.s32 q0, d2, d3[1]
|
||||
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
|
||||
%1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0)
|
||||
%2 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
|
||||
ret <2 x i64> %2
|
||||
}
|
||||
|
||||
declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
|
||||
|
|
Loading…
Reference in New Issue