Int8 requantization: add a sign fix-up ahead of every rounding shift.

For each 32-bit accumulator vector x and shift vector s the added code emits
    t = (x AND s) >> 31        arithmetic shift: -1 in lanes where x and s both
                               have the sign bit set, 0 elsewhere
    x = saturating_add(x, t)
    x = rounding_shift_left(x, s)
On arm64 the former saturating sqrshl is replaced by the non-saturating srshl.
IndirectGemmInt8_4x4.S additionally replaces saddlp with sadalp for v18, so the
pairwise sum is accumulated into v18 like the neighbouring sadalp on v17.

Review notes:
  * Presumably s is negative whenever a right shift is requested, which would
    make the fix-up round ties away from zero - confirm against the C reference.
  * Scratch registers: q3-q7 (arm32), v0-v3 and v16-v23 (arm64). AAPCS treats
    q4-q7 as callee-saved; confirm the prologues preserve them and that none of
    the scratch registers holds a live value at these points.
  * Indentation and blank context lines were reconstructed from a
    whitespace-collapsed copy of this patch; verify against the tree or apply
    with --ignore-whitespace. Index hashes are those of the original patch.

diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm32/ConvDwInt8Center.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm32/ConvDwInt8Center.S
index 2f75feaa19b..eba7d8e6054 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm32/ConvDwInt8Center.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm32/ConvDwInt8Center.S
@@ -90,9 +90,21 @@ ConvDwInt8Center:
     vqrdmulh.s32 q1, q1, q10
     vqrdmulh.s32 q2, q2, q10
     vqrdmulh.s32 q3, q3, q10
+    vand q4, q0, q11                /* sign bit survives only where value and shift are both negative */
+    vshr.s32 q4, q4, #31            /* widen that bit to a full-lane -1 / 0 mask */
+    vqadd.s32 q0, q0, q4            /* subtract 1 in masked lanes before the rounding shift */
     vrshl.s32 q0, q0, q11
+    vand q5, q1, q11
+    vshr.s32 q5, q5, #31
+    vqadd.s32 q1, q1, q5
     vrshl.s32 q1, q1, q11
+    vand q6, q2, q11
+    vshr.s32 q6, q6, #31
+    vqadd.s32 q2, q2, q6
     vrshl.s32 q2, q2, q11
+    vand q7, q3, q11
+    vshr.s32 q7, q7, #31
+    vqadd.s32 q3, q3, q7
     vrshl.s32 q3, q3, q11
     vadd.i32 q0, q0, q12
     vadd.i32 q1, q1, q12
@@ -173,6 +185,9 @@ ConvDwInt8Center:

     vshl.s32 q0, q0, q9
     vqrdmulh.s32 q0, q0, q10
+    vand q4, q0, q11
+    vshr.s32 q4, q4, #31
+    vqadd.s32 q0, q0, q4
     vrshl.s32 q0, q0, q11
     vadd.i32 q0, q0, q12
     vmax.s32 q0, q0, q13
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S
index 49f3e34ff22..7dc621c7d5e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S
@@ -166,7 +166,13 @@ IndirectGemmInt8_2x4:

     ldr r10, [sp, #40]
     vdup.32 q5, r10
+    vand q3, q5, q8                 /* q5 is the shift operand of the vrshl below, q8 the value */
+    vshr.s32 q3, q3, #31
+    vqadd.s32 q8, q8, q3
     vrshl.s32 q8, q8, q5
+    vand q4, q5, q12
+    vshr.s32 q4, q4, #31
+    vqadd.s32 q12, q12, q4
     vrshl.s32 q12, q12, q5

     ldr r10, [sp, #28]
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S
index 50378de953e..e381b49ff82 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S
@@ -159,22 +159,72 @@ ConvDwInt8Center:
     sqrdmulh v13.4s, v13.4s, v27.4s
     sqrdmulh v14.4s, v14.4s, v27.4s
     sqrdmulh v15.4s, v15.4s, v27.4s
-    sqrshl v0.4s, v0.4s, v28.4s
-    sqrshl v1.4s, v1.4s, v28.4s
-    sqrshl v2.4s, v2.4s, v28.4s
-    sqrshl v3.4s, v3.4s, v28.4s
-    sqrshl v4.4s, v4.4s, v28.4s
-    sqrshl v5.4s, v5.4s, v28.4s
-    sqrshl v6.4s, v6.4s, v28.4s
-    sqrshl v7.4s, v7.4s, v28.4s
-    sqrshl v8.4s, v8.4s, v28.4s
-    sqrshl v9.4s, v9.4s, v28.4s
-    sqrshl v10.4s, v10.4s, v28.4s
-    sqrshl v11.4s, v11.4s, v28.4s
-    sqrshl v12.4s, v12.4s, v28.4s
-    sqrshl v13.4s, v13.4s, v28.4s
-    sqrshl v14.4s, v14.4s, v28.4s
-    sqrshl v15.4s, v15.4s, v28.4s
+
+    and v16.16b, v28.16b, v0.16b    /* v28 is the shift operand of the srshl below; v16-v23 are scratch masks */
+    sshr v16.4s, v16.4s, #31        /* -1 where value and shift are both negative, else 0 */
+    sqadd v0.4s, v0.4s, v16.4s
+    srshl v0.4s, v0.4s, v28.4s
+    and v17.16b, v28.16b, v1.16b
+    sshr v17.4s, v17.4s, #31
+    sqadd v1.4s, v1.4s, v17.4s
+    srshl v1.4s, v1.4s, v28.4s
+    and v18.16b, v28.16b, v2.16b
+    sshr v18.4s, v18.4s, #31
+    sqadd v2.4s, v2.4s, v18.4s
+    srshl v2.4s, v2.4s, v28.4s
+    and v19.16b, v28.16b, v3.16b
+    sshr v19.4s, v19.4s, #31
+    sqadd v3.4s, v3.4s, v19.4s
+    srshl v3.4s, v3.4s, v28.4s
+    and v20.16b, v28.16b, v4.16b
+    sshr v20.4s, v20.4s, #31
+    sqadd v4.4s, v4.4s, v20.4s
+    srshl v4.4s, v4.4s, v28.4s
+    and v21.16b, v28.16b, v5.16b
+    sshr v21.4s, v21.4s, #31
+    sqadd v5.4s, v5.4s, v21.4s
+    srshl v5.4s, v5.4s, v28.4s
+    and v22.16b, v28.16b, v6.16b
+    sshr v22.4s, v22.4s, #31
+    sqadd v6.4s, v6.4s, v22.4s
+    srshl v6.4s, v6.4s, v28.4s
+    and v23.16b, v28.16b, v7.16b
+    sshr v23.4s, v23.4s, #31
+    sqadd v7.4s, v7.4s, v23.4s
+    srshl v7.4s, v7.4s, v28.4s
+    and v16.16b, v28.16b, v8.16b
+    sshr v16.4s, v16.4s, #31
+    sqadd v8.4s, v8.4s, v16.4s
+    srshl v8.4s, v8.4s, v28.4s
+    and v17.16b, v28.16b, v9.16b
+    sshr v17.4s, v17.4s, #31
+    sqadd v9.4s, v9.4s, v17.4s
+    srshl v9.4s, v9.4s, v28.4s
+    and v18.16b, v28.16b, v10.16b
+    sshr v18.4s, v18.4s, #31
+    sqadd v10.4s, v10.4s, v18.4s
+    srshl v10.4s, v10.4s, v28.4s
+    and v19.16b, v28.16b, v11.16b
+    sshr v19.4s, v19.4s, #31
+    sqadd v11.4s, v11.4s, v19.4s
+    srshl v11.4s, v11.4s, v28.4s
+    and v20.16b, v28.16b, v12.16b
+    sshr v20.4s, v20.4s, #31
+    sqadd v12.4s, v12.4s, v20.4s
+    srshl v12.4s, v12.4s, v28.4s
+    and v21.16b, v28.16b, v13.16b
+    sshr v21.4s, v21.4s, #31
+    sqadd v13.4s, v13.4s, v21.4s
+    srshl v13.4s, v13.4s, v28.4s
+    and v22.16b, v28.16b, v14.16b
+    sshr v22.4s, v22.4s, #31
+    sqadd v14.4s, v14.4s, v22.4s
+    srshl v14.4s, v14.4s, v28.4s
+    and v23.16b, v28.16b, v15.16b
+    sshr v23.4s, v23.4s, #31
+    sqadd v15.4s, v15.4s, v23.4s
+    srshl v15.4s, v15.4s, v28.4s
+
     add v0.4s, v0.4s, v29.4s
     add v1.4s, v1.4s, v29.4s
     add v2.4s, v2.4s, v29.4s
@@ -407,14 +457,40 @@ ConvDwInt8Center:
     sqrdmulh v5.4s, v5.4s, v27.4s
     sqrdmulh v6.4s, v6.4s, v27.4s
     sqrdmulh v7.4s, v7.4s, v27.4s
-    sqrshl v0.4s, v0.4s, v28.4s
-    sqrshl v1.4s, v1.4s, v28.4s
-    sqrshl v2.4s, v2.4s, v28.4s
-    sqrshl v3.4s, v3.4s, v28.4s
-    sqrshl v4.4s, v4.4s, v28.4s
-    sqrshl v5.4s, v5.4s, v28.4s
-    sqrshl v6.4s, v6.4s, v28.4s
-    sqrshl v7.4s, v7.4s, v28.4s
+
+    and v16.16b, v28.16b, v0.16b
+    sshr v16.4s, v16.4s, #31
+    sqadd v0.4s, v0.4s, v16.4s
+    srshl v0.4s, v0.4s, v28.4s
+    and v17.16b, v28.16b, v1.16b
+    sshr v17.4s, v17.4s, #31
+    sqadd v1.4s, v1.4s, v17.4s
+    srshl v1.4s, v1.4s, v28.4s
+    and v18.16b, v28.16b, v2.16b
+    sshr v18.4s, v18.4s, #31
+    sqadd v2.4s, v2.4s, v18.4s
+    srshl v2.4s, v2.4s, v28.4s
+    and v19.16b, v28.16b, v3.16b
+    sshr v19.4s, v19.4s, #31
+    sqadd v3.4s, v3.4s, v19.4s
+    srshl v3.4s, v3.4s, v28.4s
+    and v20.16b, v28.16b, v4.16b
+    sshr v20.4s, v20.4s, #31
+    sqadd v4.4s, v4.4s, v20.4s
+    srshl v4.4s, v4.4s, v28.4s
+    and v21.16b, v28.16b, v5.16b
+    sshr v21.4s, v21.4s, #31
+    sqadd v5.4s, v5.4s, v21.4s
+    srshl v5.4s, v5.4s, v28.4s
+    and v22.16b, v28.16b, v6.16b
+    sshr v22.4s, v22.4s, #31
+    sqadd v6.4s, v6.4s, v22.4s
+    srshl v6.4s, v6.4s, v28.4s
+    and v23.16b, v28.16b, v7.16b
+    sshr v23.4s, v23.4s, #31
+    sqadd v7.4s, v7.4s, v23.4s
+    srshl v7.4s, v7.4s, v28.4s
+
     add v0.4s, v0.4s, v29.4s
     add v1.4s, v1.4s, v29.4s
     add v2.4s, v2.4s, v29.4s
@@ -526,7 +602,12 @@ ConvDwInt8Center:

     sqshl v0.4s, v0.4s, v26.4s
     sqrdmulh v0.4s, v0.4s, v27.4s
-    sqrshl v0.4s, v0.4s, v28.4s
+
+    and v16.16b, v28.16b, v0.16b
+    sshr v16.4s, v16.4s, #31
+    sqadd v0.4s, v0.4s, v16.4s
+    srshl v0.4s, v0.4s, v28.4s
+
     add v0.4s, v0.4s, v29.4s
     smax v0.4s, v0.4s, v30.4s
     smin v0.4s, v0.4s, v31.4s
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/IndirectGemmInt8_4x4.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/IndirectGemmInt8_4x4.S
index 1843fed2836..5ac9a0f3652 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/IndirectGemmInt8_4x4.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/IndirectGemmInt8_4x4.S
@@ -151,7 +151,7 @@ IndirectGemmInt8_4x4:
     sadalp v17.4s, v9.8h
     smull v14.8h, v1.8b, v6.8b
     smull v15.8h, v1.8b, v7.8b
-    saddlp v18.4s, v10.8h
+    sadalp v18.4s, v10.8h           /* accumulate into v18 instead of overwriting it */
     smlal2 v14.8h, v1.16b, v6.16b
     smlal2 v15.8h, v1.16b, v7.16b

@@ -234,10 +234,22 @@ IndirectGemmInt8_4x4:
     sqrdmulh v28.4s, v28.4s, v3.4s

     dup v4.4s, w19
-    sqrshl v16.4s, v16.4s, v4.4s
-    sqrshl v20.4s, v20.4s, v4.4s
-    sqrshl v24.4s, v24.4s, v4.4s
-    sqrshl v28.4s, v28.4s, v4.4s
+    and v0.16b, v4.16b, v16.16b     /* v4 = shift broadcast from w19; v0-v3 are scratch masks */
+    sshr v0.4s, v0.4s, #31
+    sqadd v16.4s, v16.4s, v0.4s
+    srshl v16.4s, v16.4s, v4.4s
+    and v1.16b, v4.16b, v20.16b
+    sshr v1.4s, v1.4s, #31
+    sqadd v20.4s, v20.4s, v1.4s
+    srshl v20.4s, v20.4s, v4.4s
+    and v2.16b, v4.16b, v24.16b
+    sshr v2.4s, v2.4s, #31
+    sqadd v24.4s, v24.4s, v2.4s
+    srshl v24.4s, v24.4s, v4.4s
+    and v3.16b, v4.16b, v28.16b
+    sshr v3.4s, v3.4s, #31
+    sqadd v28.4s, v28.4s, v3.4s
+    srshl v28.4s, v28.4s, v4.4s

     dup v5.4s, w16
     add v16.4s, v16.4s, v5.4s
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/IndirectGemmInt8_24x4_dp.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/IndirectGemmInt8_24x4_dp.S
index ca7c73e62e5..7e57c022a3c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/IndirectGemmInt8_24x4_dp.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/IndirectGemmInt8_24x4_dp.S
@@ -329,31 +329,103 @@ IndirectGemmInt8_24x4_dp:
     sqrdmulh v31.4s, v31.4s, v3.4s

     dup v4.4s, w19
-    sqrshl v8.4s, v8.4s, v4.4s
-    sqrshl v9.4s, v9.4s, v4.4s
-    sqrshl v10.4s, v10.4s, v4.4s
-    sqrshl v11.4s, v11.4s, v4.4s
-    sqrshl v12.4s, v12.4s, v4.4s
-    sqrshl v13.4s, v13.4s, v4.4s
-    sqrshl v14.4s, v14.4s, v4.4s
-    sqrshl v15.4s, v15.4s, v4.4s
-    sqrshl v16.4s, v16.4s, v4.4s
-    sqrshl v17.4s, v17.4s, v4.4s
-    sqrshl v18.4s, v18.4s, v4.4s
-    sqrshl v19.4s, v19.4s, v4.4s
-    sqrshl v20.4s, v20.4s, v4.4s
-    sqrshl v21.4s, v21.4s, v4.4s
-    sqrshl v22.4s, v22.4s, v4.4s
-    sqrshl v23.4s, v23.4s, v4.4s
-    sqrshl v24.4s, v24.4s, v4.4s
-    sqrshl v25.4s, v25.4s, v4.4s
-    sqrshl v26.4s, v26.4s, v4.4s
-    sqrshl v27.4s, v27.4s, v4.4s
-    sqrshl v28.4s, v28.4s, v4.4s
-    sqrshl v29.4s, v29.4s, v4.4s
-    sqrshl v30.4s, v30.4s, v4.4s
-    sqrshl v31.4s, v31.4s, v4.4s
-
+    and v0.16b, v4.16b, v8.16b      /* bitwise AND (not add): sign bit set only where shift and value are both negative */
+    sshr v0.4s, v0.4s, #31          /* -1 / 0 mask per lane; scratch cycles v0, v1, v2, v3 */
+    sqadd v8.4s, v8.4s, v0.4s
+    srshl v8.4s, v8.4s, v4.4s
+    and v1.16b, v4.16b, v9.16b      /* mask must land in v1, the register sshr/sqadd read next */
+    sshr v1.4s, v1.4s, #31
+    sqadd v9.4s, v9.4s, v1.4s
+    srshl v9.4s, v9.4s, v4.4s
+    and v2.16b, v4.16b, v10.16b
+    sshr v2.4s, v2.4s, #31
+    sqadd v10.4s, v10.4s, v2.4s
+    srshl v10.4s, v10.4s, v4.4s
+    and v3.16b, v4.16b, v11.16b
+    sshr v3.4s, v3.4s, #31
+    sqadd v11.4s, v11.4s, v3.4s
+    srshl v11.4s, v11.4s, v4.4s
+    and v0.16b, v4.16b, v12.16b
+    sshr v0.4s, v0.4s, #31
+    sqadd v12.4s, v12.4s, v0.4s
+    srshl v12.4s, v12.4s, v4.4s
+    and v1.16b, v4.16b, v13.16b
+    sshr v1.4s, v1.4s, #31
+    sqadd v13.4s, v13.4s, v1.4s
+    srshl v13.4s, v13.4s, v4.4s
+    and v2.16b, v4.16b, v14.16b
+    sshr v2.4s, v2.4s, #31
+    sqadd v14.4s, v14.4s, v2.4s
+    srshl v14.4s, v14.4s, v4.4s
+    and v3.16b, v4.16b, v15.16b
+    sshr v3.4s, v3.4s, #31
+    sqadd v15.4s, v15.4s, v3.4s
+    srshl v15.4s, v15.4s, v4.4s
+    and v0.16b, v4.16b, v16.16b
+    sshr v0.4s, v0.4s, #31
+    sqadd v16.4s, v16.4s, v0.4s
+    srshl v16.4s, v16.4s, v4.4s
+    and v1.16b, v4.16b, v17.16b
+    sshr v1.4s, v1.4s, #31
+    sqadd v17.4s, v17.4s, v1.4s
+    srshl v17.4s, v17.4s, v4.4s
+    and v2.16b, v4.16b, v18.16b
+    sshr v2.4s, v2.4s, #31
+    sqadd v18.4s, v18.4s, v2.4s
+    srshl v18.4s, v18.4s, v4.4s
+    and v3.16b, v4.16b, v19.16b
+    sshr v3.4s, v3.4s, #31
+    sqadd v19.4s, v19.4s, v3.4s
+    srshl v19.4s, v19.4s, v4.4s
+    and v0.16b, v4.16b, v20.16b
+    sshr v0.4s, v0.4s, #31
+    sqadd v20.4s, v20.4s, v0.4s
+    srshl v20.4s, v20.4s, v4.4s
+    and v1.16b, v4.16b, v21.16b
+    sshr v1.4s, v1.4s, #31
+    sqadd v21.4s, v21.4s, v1.4s
+    srshl v21.4s, v21.4s, v4.4s
+    and v2.16b, v4.16b, v22.16b
+    sshr v2.4s, v2.4s, #31
+    sqadd v22.4s, v22.4s, v2.4s
+    srshl v22.4s, v22.4s, v4.4s     /* shifts v22 itself; v10 was already shifted above */
+    and v3.16b, v4.16b, v23.16b
+    sshr v3.4s, v3.4s, #31
+    sqadd v23.4s, v23.4s, v3.4s
+    srshl v23.4s, v23.4s, v4.4s
+    and v0.16b, v4.16b, v24.16b
+    sshr v0.4s, v0.4s, #31
+    sqadd v24.4s, v24.4s, v0.4s
+    srshl v24.4s, v24.4s, v4.4s
+    and v1.16b, v4.16b, v25.16b
+    sshr v1.4s, v1.4s, #31
+    sqadd v25.4s, v25.4s, v1.4s
+    srshl v25.4s, v25.4s, v4.4s
+    and v2.16b, v4.16b, v26.16b
+    sshr v2.4s, v2.4s, #31
+    sqadd v26.4s, v26.4s, v2.4s
+    srshl v26.4s, v26.4s, v4.4s
+    and v3.16b, v4.16b, v27.16b
+    sshr v3.4s, v3.4s, #31
+    sqadd v27.4s, v27.4s, v3.4s
+    srshl v27.4s, v27.4s, v4.4s
+    and v0.16b, v4.16b, v28.16b
+    sshr v0.4s, v0.4s, #31
+    sqadd v28.4s, v28.4s, v0.4s
+    srshl v28.4s, v28.4s, v4.4s
+    and v1.16b, v4.16b, v29.16b
+    sshr v1.4s, v1.4s, #31
+    sqadd v29.4s, v29.4s, v1.4s
+    srshl v29.4s, v29.4s, v4.4s
+    and v2.16b, v4.16b, v30.16b
+    sshr v2.4s, v2.4s, #31
+    sqadd v30.4s, v30.4s, v2.4s
+    srshl v30.4s, v30.4s, v4.4s
+    and v3.16b, v4.16b, v31.16b
+    sshr v3.4s, v3.4s, #31
+    sqadd v31.4s, v31.4s, v3.4s
+    srshl v31.4s, v31.4s, v4.4s
+
     dup v5.4s, w16
     add v8.4s, v8.4s, v5.4s
     add v9.4s, v9.4s, v5.4s