!4309 fix round divide for int8

Merge pull request !4309 from lixian/master
This commit is contained in:
mindspore-ci-bot 2020-08-12 19:02:55 +08:00 committed by Gitee
commit 967a87160e
5 changed files with 241 additions and 55 deletions

View File

@ -90,9 +90,21 @@ ConvDwInt8Center:
// Requantization tail for accumulators q0-q3 (ARM32 NEON).
// vqrdmulh scales each accumulator by the fixed-point factor in q10
// (the q0 multiply sits just above this fragment and is not visible here).
vqrdmulh.s32 q1, q1, q10
vqrdmulh.s32 q2, q2, q10
vqrdmulh.s32 q3, q3, q10
// Rounding shift by the per-lane amount in q11, with a sign fix-up:
//   mask = (acc & shift) >> 31   -> -1 only where both acc and shift are negative
//   acc  = sat(acc + mask)       -> nudges negative values down by one
//   acc  = vrshl(acc, shift)     -> rounding shift (negative shift = shift right)
// Net effect: exact ties on negative values round away from zero instead of
// toward +infinity. q4-q7 are used as scratch and are overwritten.
vand q4, q0, q11
vshr.s32 q4, q4, #31
vqadd.s32 q0, q0, q4
vrshl.s32 q0, q0, q11
vand q5, q1, q11
vshr.s32 q5, q5, #31
vqadd.s32 q1, q1, q5
vrshl.s32 q1, q1, q11
vand q6, q2, q11
vshr.s32 q6, q6, #31
vqadd.s32 q2, q2, q6
vrshl.s32 q2, q2, q11
vand q7, q3, q11
vshr.s32 q7, q7, #31
vqadd.s32 q3, q3, q7
vrshl.s32 q3, q3, q11
// q12 is added to every lane after the shift
// (presumably the output zero point -- TODO confirm against the routine header).
vadd.i32 q0, q0, q12
vadd.i32 q1, q1, q12
@ -173,6 +185,9 @@ ConvDwInt8Center:
// Single-vector requantization of q0 (ARM32 NEON):
// shift by q9, fixed-point multiply by q10, rounding shift by q11.
vshl.s32 q0, q0, q9
vqrdmulh.s32 q0, q0, q10
// Sign fix-up before the rounding shift: q4 becomes -1 in lanes where both
// q0 and the shift amount q11 are negative, 0 elsewhere, and is added with
// saturation so that negative ties round away from zero. q4 is scratch.
vand q4, q0, q11
vshr.s32 q4, q4, #31
vqadd.s32 q0, q0, q4
vrshl.s32 q0, q0, q11
// Add q12, then clamp from below with q13
// (presumably zero point and activation minimum -- TODO confirm).
vadd.i32 q0, q0, q12
vmax.s32 q0, q0, q13

View File

@ -166,7 +166,13 @@ IndirectGemmInt8_2x4:
// Load a 32-bit value from the stack slot at [sp, #40] and broadcast it to
// all lanes of q5; it is used below as the per-lane shift amount for vrshl
// (presumably the requantization right shift, passed as a negative value --
// TODO confirm against the caller).
ldr r10, [sp, #40]
vdup.32 q5, r10
// Rounding shift of q8 with sign fix-up: q3 = -1 where both the shift (q5)
// and the accumulator (q8) are negative, else 0; adding it with saturation
// makes negative ties round away from zero. q3 is scratch.
vand q3, q5, q8
vshr.s32 q3, q3, #31
vqadd.s32 q8, q8, q3
vrshl.s32 q8, q8, q5
// Same sequence for q12, using q4 as scratch.
vand q4, q5, q12
vshr.s32 q4, q4, #31
vqadd.s32 q12, q12, q4
vrshl.s32 q12, q12, q5
// r10 is reused for the next stack argument at [sp, #28].
ldr r10, [sp, #28]

View File

@ -159,22 +159,72 @@ ConvDwInt8Center:
// Requantization tail for accumulators v0-v15 (AArch64 NEON).
// sqrdmulh scales each accumulator by the fixed-point factor in v27
// (the multiplies for v0-v12 sit above this fragment).
sqrdmulh v13.4s, v13.4s, v27.4s
sqrdmulh v14.4s, v14.4s, v27.4s
sqrdmulh v15.4s, v15.4s, v27.4s
// NOTE(review): this listing is a diff rendering with the +/- markers lost.
// The sixteen sqrshl lines below look like the removed (old) side of the
// hunk, superseded by the and/sshr/sqadd/srshl groups that follow. If both
// were assembled the shift would be applied twice -- confirm against the
// real file.
sqrshl v0.4s, v0.4s, v28.4s
sqrshl v1.4s, v1.4s, v28.4s
sqrshl v2.4s, v2.4s, v28.4s
sqrshl v3.4s, v3.4s, v28.4s
sqrshl v4.4s, v4.4s, v28.4s
sqrshl v5.4s, v5.4s, v28.4s
sqrshl v6.4s, v6.4s, v28.4s
sqrshl v7.4s, v7.4s, v28.4s
sqrshl v8.4s, v8.4s, v28.4s
sqrshl v9.4s, v9.4s, v28.4s
sqrshl v10.4s, v10.4s, v28.4s
sqrshl v11.4s, v11.4s, v28.4s
sqrshl v12.4s, v12.4s, v28.4s
sqrshl v13.4s, v13.4s, v28.4s
sqrshl v14.4s, v14.4s, v28.4s
sqrshl v15.4s, v15.4s, v28.4s
// Rounding shift by v28 with sign fix-up, one four-instruction group per
// accumulator:
//   mask = (shift & acc) >> 31  -> -1 only where both are negative
//   acc  = sat(acc + mask)
//   acc  = srshl(acc, shift)    -> rounding shift (negative shift = right)
// Negative ties therefore round away from zero. v16-v23 are scratch and are
// reused for the second half (v8-v15).
and v16.16b, v28.16b, v0.16b
sshr v16.4s, v16.4s, #31
sqadd v0.4s, v0.4s, v16.4s
srshl v0.4s, v0.4s, v28.4s
and v17.16b, v28.16b, v1.16b
sshr v17.4s, v17.4s, #31
sqadd v1.4s, v1.4s, v17.4s
srshl v1.4s, v1.4s, v28.4s
and v18.16b, v28.16b, v2.16b
sshr v18.4s, v18.4s, #31
sqadd v2.4s, v2.4s, v18.4s
srshl v2.4s, v2.4s, v28.4s
and v19.16b, v28.16b, v3.16b
sshr v19.4s, v19.4s, #31
sqadd v3.4s, v3.4s, v19.4s
srshl v3.4s, v3.4s, v28.4s
and v20.16b, v28.16b, v4.16b
sshr v20.4s, v20.4s, #31
sqadd v4.4s, v4.4s, v20.4s
srshl v4.4s, v4.4s, v28.4s
and v21.16b, v28.16b, v5.16b
sshr v21.4s, v21.4s, #31
sqadd v5.4s, v5.4s, v21.4s
srshl v5.4s, v5.4s, v28.4s
and v22.16b, v28.16b, v6.16b
sshr v22.4s, v22.4s, #31
sqadd v6.4s, v6.4s, v22.4s
srshl v6.4s, v6.4s, v28.4s
and v23.16b, v28.16b, v7.16b
sshr v23.4s, v23.4s, #31
sqadd v7.4s, v7.4s, v23.4s
srshl v7.4s, v7.4s, v28.4s
and v16.16b, v28.16b, v8.16b
sshr v16.4s, v16.4s, #31
sqadd v8.4s, v8.4s, v16.4s
srshl v8.4s, v8.4s, v28.4s
and v17.16b, v28.16b, v9.16b
sshr v17.4s, v17.4s, #31
sqadd v9.4s, v9.4s, v17.4s
srshl v9.4s, v9.4s, v28.4s
and v18.16b, v28.16b, v10.16b
sshr v18.4s, v18.4s, #31
sqadd v10.4s, v10.4s, v18.4s
srshl v10.4s, v10.4s, v28.4s
and v19.16b, v28.16b, v11.16b
sshr v19.4s, v19.4s, #31
sqadd v11.4s, v11.4s, v19.4s
srshl v11.4s, v11.4s, v28.4s
and v20.16b, v28.16b, v12.16b
sshr v20.4s, v20.4s, #31
sqadd v12.4s, v12.4s, v20.4s
srshl v12.4s, v12.4s, v28.4s
and v21.16b, v28.16b, v13.16b
sshr v21.4s, v21.4s, #31
sqadd v13.4s, v13.4s, v21.4s
srshl v13.4s, v13.4s, v28.4s
and v22.16b, v28.16b, v14.16b
sshr v22.4s, v22.4s, #31
sqadd v14.4s, v14.4s, v22.4s
srshl v14.4s, v14.4s, v28.4s
and v23.16b, v28.16b, v15.16b
sshr v23.4s, v23.4s, #31
sqadd v15.4s, v15.4s, v23.4s
srshl v15.4s, v15.4s, v28.4s
// v29 is added to every lane after the shift
// (presumably the output zero point -- TODO confirm).
add v0.4s, v0.4s, v29.4s
add v1.4s, v1.4s, v29.4s
add v2.4s, v2.4s, v29.4s
@ -407,14 +457,40 @@ ConvDwInt8Center:
// Requantization tail for accumulators v0-v7 (AArch64 NEON).
// sqrdmulh scales each accumulator by the fixed-point factor in v27
// (the multiplies for v0-v4 sit above this fragment).
sqrdmulh v5.4s, v5.4s, v27.4s
sqrdmulh v6.4s, v6.4s, v27.4s
sqrdmulh v7.4s, v7.4s, v27.4s
// NOTE(review): diff rendering without +/- markers. These eight sqrshl
// lines look like the removed (old) side of the hunk, replaced by the
// and/sshr/sqadd/srshl groups below -- confirm against the real file.
sqrshl v0.4s, v0.4s, v28.4s
sqrshl v1.4s, v1.4s, v28.4s
sqrshl v2.4s, v2.4s, v28.4s
sqrshl v3.4s, v3.4s, v28.4s
sqrshl v4.4s, v4.4s, v28.4s
sqrshl v5.4s, v5.4s, v28.4s
sqrshl v6.4s, v6.4s, v28.4s
sqrshl v7.4s, v7.4s, v28.4s
// Rounding shift by v28 with sign fix-up. For each accumulator the scratch
// register (v16-v23) becomes -1 where both shift and accumulator are
// negative, 0 elsewhere; it is added with saturation before srshl so that
// negative ties round away from zero.
and v16.16b, v28.16b, v0.16b
sshr v16.4s, v16.4s, #31
sqadd v0.4s, v0.4s, v16.4s
srshl v0.4s, v0.4s, v28.4s
and v17.16b, v28.16b, v1.16b
sshr v17.4s, v17.4s, #31
sqadd v1.4s, v1.4s, v17.4s
srshl v1.4s, v1.4s, v28.4s
and v18.16b, v28.16b, v2.16b
sshr v18.4s, v18.4s, #31
sqadd v2.4s, v2.4s, v18.4s
srshl v2.4s, v2.4s, v28.4s
and v19.16b, v28.16b, v3.16b
sshr v19.4s, v19.4s, #31
sqadd v3.4s, v3.4s, v19.4s
srshl v3.4s, v3.4s, v28.4s
and v20.16b, v28.16b, v4.16b
sshr v20.4s, v20.4s, #31
sqadd v4.4s, v4.4s, v20.4s
srshl v4.4s, v4.4s, v28.4s
and v21.16b, v28.16b, v5.16b
sshr v21.4s, v21.4s, #31
sqadd v5.4s, v5.4s, v21.4s
srshl v5.4s, v5.4s, v28.4s
and v22.16b, v28.16b, v6.16b
sshr v22.4s, v22.4s, #31
sqadd v6.4s, v6.4s, v22.4s
srshl v6.4s, v6.4s, v28.4s
and v23.16b, v28.16b, v7.16b
sshr v23.4s, v23.4s, #31
sqadd v7.4s, v7.4s, v23.4s
srshl v7.4s, v7.4s, v28.4s
// v29 is added to every lane after the shift
// (presumably the output zero point -- TODO confirm).
add v0.4s, v0.4s, v29.4s
add v1.4s, v1.4s, v29.4s
add v2.4s, v2.4s, v29.4s
@ -526,7 +602,12 @@ ConvDwInt8Center:
// Single-vector requantization of v0 (AArch64 NEON):
// saturating shift by v26, fixed-point multiply by v27, rounding shift by v28.
sqshl v0.4s, v0.4s, v26.4s
sqrdmulh v0.4s, v0.4s, v27.4s
// NOTE(review): diff rendering without +/- markers. This sqrshl looks like
// the removed (old) line, replaced by the four-instruction group below;
// keeping both would shift twice -- confirm against the real file.
sqrshl v0.4s, v0.4s, v28.4s
// Sign fix-up + rounding shift: v16 = -1 where both v28 (shift) and v0 are
// negative, else 0; saturating add, then srshl. v16 is scratch.
and v16.16b, v28.16b, v0.16b
sshr v16.4s, v16.4s, #31
sqadd v0.4s, v0.4s, v16.4s
srshl v0.4s, v0.4s, v28.4s
// Add v29, then clamp to the range [v30, v31]
// (presumably zero point and activation min/max -- TODO confirm).
add v0.4s, v0.4s, v29.4s
smax v0.4s, v0.4s, v30.4s
smin v0.4s, v0.4s, v31.4s

View File

@ -151,7 +151,7 @@ IndirectGemmInt8_4x4:
// Inner accumulate step: 16-bit products are pairwise-widened and folded
// into the 32-bit accumulators v17/v18 while the next products are formed.
sadalp v17.4s, v9.8h
smull v14.8h, v1.8b, v6.8b
smull v15.8h, v1.8b, v7.8b
// NOTE(review): diff rendering without +/- markers. The saddlp/sadalp pair
// below looks like the old/new version of one line. saddlp overwrites v18
// with the pairwise sums of v10, discarding what was accumulated; sadalp
// adds them into v18, matching the sadalp on v17 above. Only the sadalp
// should remain in the real file -- confirm.
saddlp v18.4s, v10.8h
sadalp v18.4s, v10.8h
// Extend the 16-bit products in v14/v15 with the upper halves of v1/v6/v7.
smlal2 v14.8h, v1.16b, v6.16b
smlal2 v15.8h, v1.16b, v7.16b
@ -234,10 +234,22 @@ IndirectGemmInt8_4x4:
// Requantization tail for accumulators v16/v20/v24/v28 (AArch64 NEON).
// v3 is the fixed-point multiplier for sqrdmulh; w19 is broadcast into v4
// and used as the per-lane shift amount.
sqrdmulh v28.4s, v28.4s, v3.4s
dup v4.4s, w19
// NOTE(review): diff rendering without +/- markers. These four sqrshl lines
// look like the removed (old) side of the hunk, replaced by the
// and/sshr/sqadd/srshl groups below -- confirm against the real file.
sqrshl v16.4s, v16.4s, v4.4s
sqrshl v20.4s, v20.4s, v4.4s
sqrshl v24.4s, v24.4s, v4.4s
sqrshl v28.4s, v28.4s, v4.4s
// Rounding shift by v4 with sign fix-up. The scratch register becomes -1
// where both shift and accumulator are negative, 0 elsewhere, and is added
// with saturation before srshl so negative ties round away from zero.
// v0-v3 are overwritten as scratch here (v3 is no longer needed as the
// multiplier within this fragment once the last sqrdmulh above has run).
and v0.16b, v4.16b, v16.16b
sshr v0.4s, v0.4s, #31
sqadd v16.4s, v16.4s, v0.4s
srshl v16.4s, v16.4s, v4.4s
and v1.16b, v4.16b, v20.16b
sshr v1.4s, v1.4s, #31
sqadd v20.4s, v20.4s, v1.4s
srshl v20.4s, v20.4s, v4.4s
and v2.16b, v4.16b, v24.16b
sshr v2.4s, v2.4s, #31
sqadd v24.4s, v24.4s, v2.4s
srshl v24.4s, v24.4s, v4.4s
and v3.16b, v4.16b, v28.16b
sshr v3.4s, v3.4s, #31
sqadd v28.4s, v28.4s, v3.4s
srshl v28.4s, v28.4s, v4.4s
// Broadcast w16 into v5 and add it to the shifted result
// (presumably the output zero point -- TODO confirm).
dup v5.4s, w16
add v16.4s, v16.4s, v5.4s

View File

@ -329,31 +329,103 @@ IndirectGemmInt8_24x4_dp:
// Requantization tail for accumulators v8-v31 (AArch64 NEON).
// v3 is the fixed-point multiplier for sqrdmulh; w19 is broadcast into v4
// and used as the per-lane shift amount.
sqrdmulh v31.4s, v31.4s, v3.4s
dup v4.4s, w19
// NOTE(review): diff rendering without +/- markers. The twenty-four sqrshl
// lines below look like the removed (old) side of the hunk; the commit
// replaces each with a sign fix-up followed by srshl. If both the old and
// the new lines were assembled the shift would be applied twice -- confirm
// against the real file.
sqrshl v8.4s, v8.4s, v4.4s
sqrshl v9.4s, v9.4s, v4.4s
sqrshl v10.4s, v10.4s, v4.4s
sqrshl v11.4s, v11.4s, v4.4s
sqrshl v12.4s, v12.4s, v4.4s
sqrshl v13.4s, v13.4s, v4.4s
sqrshl v14.4s, v14.4s, v4.4s
sqrshl v15.4s, v15.4s, v4.4s
sqrshl v16.4s, v16.4s, v4.4s
sqrshl v17.4s, v17.4s, v4.4s
sqrshl v18.4s, v18.4s, v4.4s
sqrshl v19.4s, v19.4s, v4.4s
sqrshl v20.4s, v20.4s, v4.4s
sqrshl v21.4s, v21.4s, v4.4s
sqrshl v22.4s, v22.4s, v4.4s
sqrshl v23.4s, v23.4s, v4.4s
sqrshl v24.4s, v24.4s, v4.4s
sqrshl v25.4s, v25.4s, v4.4s
sqrshl v26.4s, v26.4s, v4.4s
sqrshl v27.4s, v27.4s, v4.4s
sqrshl v28.4s, v28.4s, v4.4s
sqrshl v29.4s, v29.4s, v4.4s
sqrshl v30.4s, v30.4s, v4.4s
sqrshl v31.4s, v31.4s, v4.4s
// Rounding shift of accumulators v8-v31 by the per-lane amount in v4, with a
// sign fix-up so that negative ties round away from zero. One group per
// accumulator vN, using scratch vT (v0-v3 in rotation):
//   and   vT, v4, vN      sign bit set only where shift AND value are negative
//   sshr  vT, vT, #31     -> mask of -1 / 0 per 32-bit lane
//   sqadd vN, vN, vT      nudge negative values down by one (saturating)
//   srshl vN, vN, v4      rounding shift (negative shift = shift right)
// The mask must be built with a bitwise AND (a byte-wise add does not yield
// the "both negative" sign bit), the same scratch register must be used by
// all of and/sshr/sqadd in a group, and every srshl must target the
// accumulator its group just adjusted.
and v0.16b, v4.16b, v8.16b
sshr v0.4s, v0.4s, #31
sqadd v8.4s, v8.4s, v0.4s
srshl v8.4s, v8.4s, v4.4s
and v1.16b, v4.16b, v9.16b
sshr v1.4s, v1.4s, #31
sqadd v9.4s, v9.4s, v1.4s
srshl v9.4s, v9.4s, v4.4s
and v2.16b, v4.16b, v10.16b
sshr v2.4s, v2.4s, #31
sqadd v10.4s, v10.4s, v2.4s
srshl v10.4s, v10.4s, v4.4s
and v3.16b, v4.16b, v11.16b
sshr v3.4s, v3.4s, #31
sqadd v11.4s, v11.4s, v3.4s
srshl v11.4s, v11.4s, v4.4s
and v0.16b, v4.16b, v12.16b
sshr v0.4s, v0.4s, #31
sqadd v12.4s, v12.4s, v0.4s
srshl v12.4s, v12.4s, v4.4s
and v1.16b, v4.16b, v13.16b
sshr v1.4s, v1.4s, #31
sqadd v13.4s, v13.4s, v1.4s
srshl v13.4s, v13.4s, v4.4s
and v2.16b, v4.16b, v14.16b
sshr v2.4s, v2.4s, #31
sqadd v14.4s, v14.4s, v2.4s
srshl v14.4s, v14.4s, v4.4s
and v3.16b, v4.16b, v15.16b
sshr v3.4s, v3.4s, #31
sqadd v15.4s, v15.4s, v3.4s
srshl v15.4s, v15.4s, v4.4s
and v0.16b, v4.16b, v16.16b
sshr v0.4s, v0.4s, #31
sqadd v16.4s, v16.4s, v0.4s
srshl v16.4s, v16.4s, v4.4s
and v1.16b, v4.16b, v17.16b
sshr v1.4s, v1.4s, #31
sqadd v17.4s, v17.4s, v1.4s
srshl v17.4s, v17.4s, v4.4s
and v2.16b, v4.16b, v18.16b
sshr v2.4s, v2.4s, #31
sqadd v18.4s, v18.4s, v2.4s
srshl v18.4s, v18.4s, v4.4s
and v3.16b, v4.16b, v19.16b
sshr v3.4s, v3.4s, #31
sqadd v19.4s, v19.4s, v3.4s
srshl v19.4s, v19.4s, v4.4s
and v0.16b, v4.16b, v20.16b
sshr v0.4s, v0.4s, #31
sqadd v20.4s, v20.4s, v0.4s
srshl v20.4s, v20.4s, v4.4s
and v1.16b, v4.16b, v21.16b
sshr v1.4s, v1.4s, #31
sqadd v21.4s, v21.4s, v1.4s
srshl v21.4s, v21.4s, v4.4s
and v2.16b, v4.16b, v22.16b
sshr v2.4s, v2.4s, #31
sqadd v22.4s, v22.4s, v2.4s
srshl v22.4s, v22.4s, v4.4s
and v3.16b, v4.16b, v23.16b
sshr v3.4s, v3.4s, #31
sqadd v23.4s, v23.4s, v3.4s
srshl v23.4s, v23.4s, v4.4s
and v0.16b, v4.16b, v24.16b
sshr v0.4s, v0.4s, #31
sqadd v24.4s, v24.4s, v0.4s
srshl v24.4s, v24.4s, v4.4s
and v1.16b, v4.16b, v25.16b
sshr v1.4s, v1.4s, #31
sqadd v25.4s, v25.4s, v1.4s
srshl v25.4s, v25.4s, v4.4s
and v2.16b, v4.16b, v26.16b
sshr v2.4s, v2.4s, #31
sqadd v26.4s, v26.4s, v2.4s
srshl v26.4s, v26.4s, v4.4s
and v3.16b, v4.16b, v27.16b
sshr v3.4s, v3.4s, #31
sqadd v27.4s, v27.4s, v3.4s
srshl v27.4s, v27.4s, v4.4s
and v0.16b, v4.16b, v28.16b
sshr v0.4s, v0.4s, #31
sqadd v28.4s, v28.4s, v0.4s
srshl v28.4s, v28.4s, v4.4s
and v1.16b, v4.16b, v29.16b
sshr v1.4s, v1.4s, #31
sqadd v29.4s, v29.4s, v1.4s
srshl v29.4s, v29.4s, v4.4s
and v2.16b, v4.16b, v30.16b
sshr v2.4s, v2.4s, #31
sqadd v30.4s, v30.4s, v2.4s
srshl v30.4s, v30.4s, v4.4s
and v3.16b, v4.16b, v31.16b
sshr v3.4s, v3.4s, #31
sqadd v31.4s, v31.4s, v3.4s
srshl v31.4s, v31.4s, v4.4s
// Broadcast w16 into v5 and add it to the shifted results
// (presumably the output zero point -- TODO confirm).
dup v5.4s, w16
add v8.4s, v8.4s, v5.4s
add v9.4s, v9.4s, v5.4s