[MSLITE] Optimize depthwise conv 3x3 arm64

This commit is contained in:
zhanyuan 2020-10-24 14:39:11 +08:00
parent c9d6a7880c
commit 488147dcbd
1 changed file with 141 additions and 18 deletions

View File

@ -67,7 +67,6 @@ ConvDw3x3Int8Neon64:
ld1 {v7.8h}, [x2], x20
ld1 {v8.8h}, [x2], x20
Loop:
mov x16, x1
add x17, x16, x5
add x18, x17, x5
@ -83,6 +82,8 @@ Loop:
ld1 {v21.4s}, [x3]
ld1 {v22.4s}, [x19]
ld1 {v23.4s}, [x3]
ld1 {v24.4s}, [x19]
// subtract input zp
ssubl v9.8h, v9.8b, v25.8b
@ -95,31 +96,160 @@ Loop:
ssubl v18.8h, v18.8b, v25.8b
ssubl v19.8h, v19.8b, v25.8b
cmp w8, #2
beq WIDTH2_LEFT
cmp w8, #1
beq Width1
beq WIDTH1_LEFT
Width2:
HEIGHT1_LOOP:
smlal v21.4s, v0.4h, v9.4h
ld1 {v12.8b}, [x16]
smlal2 v22.4s, v0.8h, v9.8h
ld1 {v16.8b}, [x17]
smlal v23.4s, v0.4h, v10.4h
smlal2 v24.4s, v0.8h, v10.8h
ld1 {v20.8b}, [x18]
ld1 {v23.4s}, [x3]
ld1 {v24.4s}, [x19]
add x1, x1, x21
ssubl v12.8h, v12.8b, v25.8b
smlal v21.4s, v1.4h, v10.4h
mov x16, x1
add x17, x16, x5
add x18, x17, x5
smlal2 v22.4s, v1.8h, v10.8h
ld1 {v9.8b}, [x16], x4
ssubl v16.8h, v16.8b, v25.8b
smlal v23.4s, v1.4h, v11.4h
ld1 {v10.8b}, [x16], x4
ssubl v20.8h, v20.8b, v25.8b
smlal2 v24.4s, v1.8h, v11.8h
smlal v21.4s, v2.4h, v11.4h
smlal2 v22.4s, v2.8h, v11.8h
ld1 {v11.8b}, [x16], x4
smlal v23.4s, v2.4h, v12.4h
smlal2 v24.4s, v2.8h, v12.8h
smlal v21.4s, v3.4h, v13.4h
smlal2 v22.4s, v3.8h, v13.8h
ld1 {v13.8b}, [x17], x4
smlal v23.4s, v3.4h, v14.4h
smlal2 v24.4s, v3.8h, v14.8h
smlal v21.4s, v4.4h, v14.4h
smlal2 v22.4s, v4.8h, v14.8h
ld1 {v14.8b}, [x17], x4
smlal v23.4s, v4.4h, v15.4h
smlal2 v24.4s, v4.8h, v15.8h
smlal v21.4s, v5.4h, v15.4h
smlal2 v22.4s, v5.8h, v15.8h
ld1 {v15.8b}, [x17], x4
smlal v23.4s, v5.4h, v16.4h
smlal2 v24.4s, v5.8h, v16.8h
smlal v21.4s, v6.4h, v17.4h
smlal2 v22.4s, v6.8h, v17.8h
ld1 {v17.8b}, [x18], x4
smlal v23.4s, v6.4h, v18.4h
smlal2 v24.4s, v6.8h, v18.8h
smlal v21.4s, v7.4h, v18.4h
smlal2 v22.4s, v7.8h, v18.8h
ld1 {v18.8b}, [x18], x4
smlal v23.4s, v7.4h, v19.4h
smlal2 v24.4s, v7.8h, v19.8h
smlal v21.4s, v8.4h, v19.4h
smlal2 v22.4s, v8.8h, v19.8h
ld1 {v19.8b}, [x18], x4
smlal v23.4s, v8.4h, v20.4h
smlal2 v24.4s, v8.8h, v20.8h
// Apply left shift
sqshl v21.4s, v21.4s, v26.4s
sqshl v22.4s, v22.4s, v26.4s
sqshl v23.4s, v23.4s, v26.4s
sqshl v24.4s, v24.4s, v26.4s
// Apply the fixed-point part of the multiplier.
sqrdmulh v21.4s, v21.4s, v27.4s
sqrdmulh v22.4s, v22.4s, v27.4s
sqrdmulh v23.4s, v23.4s, v27.4s
sqrdmulh v24.4s, v24.4s, v27.4s
// Apply right shift
and v12.16b, v28.16b, v21.16b
sshr v12.4s, v12.4s, #31
sqadd v21.4s, v21.4s, v12.4s
srshl v21.4s, v21.4s, v28.4s
and v16.16b, v28.16b, v22.16b
sshr v16.4s, v16.4s, #31
sqadd v22.4s, v22.4s, v16.4s
srshl v22.4s, v22.4s, v28.4s
and v20.16b, v28.16b, v23.16b
sshr v20.4s, v20.4s, #31
sqadd v23.4s, v23.4s, v20.4s
srshl v23.4s, v23.4s, v28.4s
and v12.16b, v28.16b, v24.16b
sshr v12.4s, v12.4s, #31
sqadd v24.4s, v24.4s, v12.4s
srshl v24.4s, v24.4s, v28.4s
// Add output zero point
sqadd v21.4s, v21.4s, v29.4s
sqadd v22.4s, v22.4s, v29.4s
sqadd v23.4s, v23.4s, v29.4s
sqadd v24.4s, v24.4s, v29.4s
// Apply min bound
smax v21.4s, v21.4s, v30.4s
smax v22.4s, v22.4s, v30.4s
smax v23.4s, v23.4s, v30.4s
smax v24.4s, v24.4s, v30.4s
// Apply max bound
smin v21.4s, v21.4s, v31.4s
smin v22.4s, v22.4s, v31.4s
smin v23.4s, v23.4s, v31.4s
smin v24.4s, v24.4s, v31.4s
sqxtn v21.4h, v21.4s
sqxtn2 v21.8h, v22.4s
ld1 {v22.4s}, [x19]
ssubl v9.8h, v9.8b, v25.8b
ssubl v10.8h, v10.8b, v25.8b
sqxtn v23.4h, v23.4s
sqxtn2 v23.8h, v24.4s
ld1 {v24.4s}, [x19]
sqxtn v21.8b, v21.8h
sqxtn2 v21.16b, v23.8h
st1 {v21.8b}, [x0], x6
mov v23.d[0], v21.d[1]
ld1 {v21.4s}, [x3]
st1 {v23.8b}, [x0], x6
ssubl v11.8h, v11.8b, v25.8b
ssubl v13.8h, v13.8b, v25.8b
ld1 {v23.4s}, [x3]
ssubl v14.8h, v14.8b, v25.8b
ssubl v15.8h, v15.8b, v25.8b
ssubl v17.8h, v17.8b, v25.8b
ssubl v18.8h, v18.8b, v25.8b
ssubl v19.8h, v19.8b, v25.8b
sub w8, w8, #2
cmp w8, #2
bgt HEIGHT1_LOOP
cmp w8, #2
blt WIDTH1_LEFT
WIDTH2_LEFT:
smlal v21.4s, v0.4h, v9.4h
smlal2 v22.4s, v0.8h, v9.8h
ld1 {v12.8b}, [x16]
ssubl v12.8h, v12.8b, v25.8b
smlal v23.4s, v0.4h, v10.4h
smlal2 v24.4s, v0.8h, v10.8h
smlal v21.4s, v1.4h, v10.4h
smlal2 v22.4s, v1.8h, v10.8h
ld1 {v16.8b}, [x17]
smlal v23.4s, v1.4h, v11.4h
smlal2 v24.4s, v1.8h, v11.8h
smlal v21.4s, v2.4h, v11.4h
smlal2 v22.4s, v2.8h, v11.8h
ld1 {v20.8b}, [x18]
smlal v23.4s, v2.4h, v12.4h
smlal2 v24.4s, v2.8h, v12.8h
smlal v21.4s, v3.4h, v13.4h
@ -128,10 +258,12 @@ Width2:
smlal2 v24.4s, v3.8h, v14.8h
smlal v21.4s, v4.4h, v14.4h
smlal2 v22.4s, v4.8h, v14.8h
ssubl v16.8h, v16.8b, v25.8b
smlal v23.4s, v4.4h, v15.4h
smlal2 v24.4s, v4.8h, v15.8h
smlal v21.4s, v5.4h, v15.4h
smlal2 v22.4s, v5.8h, v15.8h
ssubl v20.8h, v20.8b, v25.8b
smlal v23.4s, v5.4h, v16.4h
smlal2 v24.4s, v5.8h, v16.8h
smlal v21.4s, v6.4h, v17.4h
@ -201,16 +333,12 @@ Width2:
sqxtn2 v23.8h, v24.4s
sqxtn v21.8b, v21.8h
sqxtn2 v21.16b, v23.8h
st1 {v21.8b}, [x0], x6
mov v23.d[0], v21.d[1]
st1 {v23.8b}, [x0], x6
sub w8, w8, #2
cbz w8, End
add x1, x1, x21
b Loop
b End
Width1:
WIDTH1_LEFT:
smlal v21.4s, v0.4h, v9.4h
smlal2 v22.4s, v0.8h, v9.8h
smlal v21.4s, v1.4h, v10.4h
@ -263,12 +391,7 @@ Width1:
sqxtn v21.4h, v21.4s
sqxtn2 v21.8h, v22.4s
sqxtn v21.8b, v21.8h
st1 {v21.8b}, [x0], x6
sub w8, w8, #1
cbz w8, End
add x1, x1, x4
b Loop
End:
sub sp, sp, #160