forked from OSchip/llvm-project
[ARM] MVE VMLAS
This adds extra patterns for the VMLAS MVE instruction, which performs Qda = Qda * Qn + Rm, a similar pattern to the existing VMLA. The sinking of splat(Rm) into the loop is already performed, meaning we just need extra Pats in tablegen. Differential Revision: https://reviews.llvm.org/D75115
This commit is contained in:
parent
78e5d1346f
commit
e2a2f3f7fc
|
@ -5101,6 +5101,19 @@ let Predicates = [HasMVEInt] in {
|
|||
(v16i8 (mul (v16i8 MQPR:$src2),
|
||||
(v16i8 (ARMvdup (i32 rGPR:$x))))))),
|
||||
(v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>;
|
||||
|
||||
def : Pat<(v4i32 (add (v4i32 (ARMvdup (i32 rGPR:$x))),
|
||||
(v4i32 (mul (v4i32 MQPR:$src1),
|
||||
(v4i32 MQPR:$src2))))),
|
||||
(v4i32 (MVE_VMLAS_qr_u32 $src1, $src2, $x))>;
|
||||
def : Pat<(v8i16 (add (v8i16 (ARMvdup (i32 rGPR:$x))),
|
||||
(v8i16 (mul (v8i16 MQPR:$src1),
|
||||
(v8i16 MQPR:$src2))))),
|
||||
(v8i16 (MVE_VMLAS_qr_u16 $src1, $src2, $x))>;
|
||||
def : Pat<(v16i8 (add (v16i8 (ARMvdup (i32 rGPR:$x))),
|
||||
(v16i8 (mul (v16i8 MQPR:$src1),
|
||||
(v16i8 MQPR:$src2))))),
|
||||
(v16i8 (MVE_VMLAS_qr_u8 $src1, $src2, $x))>;
|
||||
}
|
||||
|
||||
let Predicates = [HasMVEFloat] in {
|
||||
|
|
|
@ -404,9 +404,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
|
|||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrb.u32 q0, [r0], #4
|
||||
; CHECK-NEXT: vldrb.u32 q1, [r1], #4
|
||||
; CHECK-NEXT: vmul.i32 q0, q1, q0
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r2
|
||||
; CHECK-NEXT: vstrw.32 q0, [r3], #16
|
||||
; CHECK-NEXT: vmlas.u32 q1, q0, r2
|
||||
; CHECK-NEXT: vstrw.32 q1, [r3], #16
|
||||
; CHECK-NEXT: letp lr, .LBB5_5
|
||||
; CHECK-NEXT: b .LBB5_11
|
||||
; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new
|
||||
|
@ -609,9 +608,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon
|
|||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrh.s32 q0, [r0], #8
|
||||
; CHECK-NEXT: vldrh.s32 q1, [r1], #8
|
||||
; CHECK-NEXT: vmul.i32 q0, q1, q0
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r2
|
||||
; CHECK-NEXT: vstrw.32 q0, [r3], #16
|
||||
; CHECK-NEXT: vmlas.u32 q1, q0, r2
|
||||
; CHECK-NEXT: vstrw.32 q1, [r3], #16
|
||||
; CHECK-NEXT: letp lr, .LBB6_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: pop {r4, pc}
|
||||
|
@ -697,9 +695,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
|
|||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrb.u32 q0, [r0], #4
|
||||
; CHECK-NEXT: vldrb.u32 q1, [r1], #4
|
||||
; CHECK-NEXT: vmul.i32 q0, q1, q0
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r2
|
||||
; CHECK-NEXT: vstrw.32 q0, [r3], #16
|
||||
; CHECK-NEXT: vmlas.u32 q1, q0, r2
|
||||
; CHECK-NEXT: vstrw.32 q1, [r3], #16
|
||||
; CHECK-NEXT: letp lr, .LBB7_5
|
||||
; CHECK-NEXT: b .LBB7_11
|
||||
; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new
|
||||
|
@ -902,9 +899,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado
|
|||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrh.u32 q0, [r0], #8
|
||||
; CHECK-NEXT: vldrh.u32 q1, [r1], #8
|
||||
; CHECK-NEXT: vmul.i32 q0, q1, q0
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r2
|
||||
; CHECK-NEXT: vstrw.32 q0, [r3], #16
|
||||
; CHECK-NEXT: vmlas.u32 q1, q0, r2
|
||||
; CHECK-NEXT: vstrw.32 q1, [r3], #16
|
||||
; CHECK-NEXT: letp lr, .LBB8_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: pop {r4, pc}
|
||||
|
@ -990,9 +986,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly
|
|||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
|
||||
; CHECK-NEXT: vmul.i32 q0, q1, q0
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r2
|
||||
; CHECK-NEXT: vstrw.32 q0, [r3], #16
|
||||
; CHECK-NEXT: vmlas.u32 q1, q0, r2
|
||||
; CHECK-NEXT: vstrw.32 q1, [r3], #16
|
||||
; CHECK-NEXT: letp lr, .LBB9_5
|
||||
; CHECK-NEXT: b .LBB9_11
|
||||
; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new
|
||||
|
|
|
@ -197,8 +197,7 @@ for.cond.cleanup:
|
|||
define arm_aapcs_vfpcc <4 x i32> @vmlasu32(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
|
||||
; CHECK-LABEL: vmlasu32:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmul.i32 q0, q0, q1
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r0
|
||||
; CHECK-NEXT: vmlas.u32 q0, q1, r0
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = insertelement <4 x i32> undef, i32 %X, i32 0
|
||||
|
@ -211,8 +210,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @vmlasu32b(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
|
||||
; CHECK-LABEL: vmlasu32b:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmul.i32 q0, q0, q1
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r0
|
||||
; CHECK-NEXT: vmlas.u32 q0, q1, r0
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = insertelement <4 x i32> undef, i32 %X, i32 0
|
||||
|
@ -225,8 +223,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <8 x i16> @vmlasu16(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
|
||||
; CHECK-LABEL: vmlasu16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmul.i16 q0, q0, q1
|
||||
; CHECK-NEXT: vadd.i16 q0, q0, r0
|
||||
; CHECK-NEXT: vmlas.u16 q0, q1, r0
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = insertelement <8 x i16> undef, i16 %X, i32 0
|
||||
|
@ -239,8 +236,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <8 x i16> @vmlasu16b(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
|
||||
; CHECK-LABEL: vmlasu16b:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmul.i16 q0, q0, q1
|
||||
; CHECK-NEXT: vadd.i16 q0, q0, r0
|
||||
; CHECK-NEXT: vmlas.u16 q0, q1, r0
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = insertelement <8 x i16> undef, i16 %X, i32 0
|
||||
|
@ -253,8 +249,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <16 x i8> @vmlasu8(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
|
||||
; CHECK-LABEL: vmlasu8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmul.i8 q0, q0, q1
|
||||
; CHECK-NEXT: vadd.i8 q0, q0, r0
|
||||
; CHECK-NEXT: vmlas.u8 q0, q1, r0
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = insertelement <16 x i8> undef, i8 %X, i32 0
|
||||
|
@ -267,8 +262,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <16 x i8> @vmlasu8b(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
|
||||
; CHECK-LABEL: vmlasu8b:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmul.i8 q0, q0, q1
|
||||
; CHECK-NEXT: vadd.i8 q0, q0, r0
|
||||
; CHECK-NEXT: vmlas.u8 q0, q1, r0
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = insertelement <16 x i8> undef, i8 %X, i32 0
|
||||
|
@ -286,9 +280,8 @@ define void @vmlas32_in_loop(i32* %s1, i32 %x, i32* %d, i32 %n) {
|
|||
; CHECK-NEXT: vldrw.u32 q0, [r2]
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
|
||||
; CHECK-NEXT: subs r3, #4
|
||||
; CHECK-NEXT: vmul.i32 q0, q1, q0
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r1
|
||||
; CHECK-NEXT: vstrb.8 q0, [r2], #16
|
||||
; CHECK-NEXT: vmlas.u32 q1, q0, r1
|
||||
; CHECK-NEXT: vstrb.8 q1, [r2], #16
|
||||
; CHECK-NEXT: bne .LBB15_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: bx lr
|
||||
|
@ -325,9 +318,8 @@ define void @vmlas16_in_loop(i16* %s1, i16 %x, i16* %d, i32 %n) {
|
|||
; CHECK-NEXT: vldrh.u16 q0, [r2]
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r0], #16
|
||||
; CHECK-NEXT: subs r3, #8
|
||||
; CHECK-NEXT: vmul.i16 q0, q1, q0
|
||||
; CHECK-NEXT: vadd.i16 q0, q0, r1
|
||||
; CHECK-NEXT: vstrb.8 q0, [r2], #16
|
||||
; CHECK-NEXT: vmlas.u16 q1, q0, r1
|
||||
; CHECK-NEXT: vstrb.8 q1, [r2], #16
|
||||
; CHECK-NEXT: bne .LBB16_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: bx lr
|
||||
|
@ -364,9 +356,8 @@ define void @vmlas8_in_loop(i8* %s1, i8 %x, i8* %d, i32 %n) {
|
|||
; CHECK-NEXT: vldrh.u16 q0, [r2]
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r0], #16
|
||||
; CHECK-NEXT: subs r3, #16
|
||||
; CHECK-NEXT: vmul.i8 q0, q1, q0
|
||||
; CHECK-NEXT: vadd.i8 q0, q0, r1
|
||||
; CHECK-NEXT: vstrb.8 q0, [r2], #16
|
||||
; CHECK-NEXT: vmlas.u8 q1, q0, r1
|
||||
; CHECK-NEXT: vstrb.8 q1, [r2], #16
|
||||
; CHECK-NEXT: bne .LBB17_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: bx lr
|
||||
|
|
Loading…
Reference in New Issue