forked from OSSInnovation/mindspore
optimization for depth wise convolution
This commit is contained in:
parent
0921c33f99
commit
48e2f85593
|
@ -0,0 +1,161 @@
|
|||
#ifdef __arm__
#ifndef __aarch64__

.text
.align 5
.global ConvDwFp32Center
#ifndef __APPLE__
.type ConvDwFp32Center, %function
#endif

// void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height,
//                       size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                       size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
//                       size_t relu, size_t relu6);
// r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: width, #56: kernel_h, #60: kernel_w,
// #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step, #84: in_kw_step
// #88: relu, #92: relu6
//
// Depthwise convolution inner kernel over one 4-channel block: for each output
// pixel, acc = bias + sum over (kernel_h x kernel_w) of src * weight, with an
// optional relu/relu6 clamp.  The W loop is unrolled 4x when >= 4 pixels remain.
ConvDwFp32Center:
    // at return, clang generates "push {lr} ... pop {pc}" while gcc generates "bx lr"
    // according to https://stackoverflow.com/questions/53625807 -- we use the clang form.
    // r4-r8, r10-r11 and q4-q7 must be preserved per AAPCS
    // (https://static.docs.arm.com/ihi0042/i/aapcs32.pdf).
    push {r0-r8, r10, r11, lr}
    vpush {q4-q7}                   // BUGFIX: "v4-v7" is AArch64 naming; ARM32 NEON uses q registers
    // BUGFIX: add back only the vpush size (64B).  The #48.. stack-argument offsets
    // and the [sp]/[sp,#4]/[sp,#8] spilled dst/src/weight slots all assume sp points
    // at the saved r0 (48 bytes below the incoming stack args), not at the call frame top.
    add sp, sp, #64

    ldr r4, [sp, #48]               // height

    vld1.32 {q13}, [r3]             // bias for this 4-channel block
    vmov.i32 q14, #6
    vcvt.f32.s32 q14, q14           // 6.0f upper bound for relu6
    veor q15, q15, q15              // 0.0f lower bound for relu

LoopH:
    ldr r1, [sp, #4]                // src_w
    ldr r5, [sp, #52]               // width (remaining pixels in this row)
    ldr r0, [sp]                    // dst_w
    cmp r5, #4
    blt LoopW
LoopW4:
    ldr r11, [sp, #76]              // in_sw_step (BUGFIX: "mov" cannot take a memory operand)
    mov r8, r1                      // src_kh
    ldr r2, [sp, #8]                // weight_kh
    ldr r6, [sp, #56]               // kernel_h
    // BUGFIX: every accumulator starts from the bias, not only q0.
    vmov q0, q13
    vmov q1, q13
    vmov q2, q13
    vmov q3, q13
LoopKh4:
    ldr r12, [sp, #84]              // in_kw_step (BUGFIX: was loading in_kh_step at #80)
    ldr r7, [sp, #60]               // kernel_w
    mov lr, r8                      // src_kw
LoopKw4:
    mov r10, lr                     // walk 4 horizontally adjacent pixels from src_kw
    vld1.32 {q12}, [r2]!
    vld1.32 {q4}, [r10]
    add r10, r10, r11
    vmla.f32 q0, q4, q12
    vld1.32 {q5}, [r10]
    add r10, r10, r11
    vmla.f32 q1, q5, q12
    vld1.32 {q6}, [r10]
    add r10, r10, r11
    vmla.f32 q2, q6, q12
    vld1.32 {q7}, [r10]
    add r10, r10, r11
    vmla.f32 q3, q7, q12
    subs r7, r7, #1
    add lr, lr, r12                 // next kernel column
    bne LoopKw4
    ldr r12, [sp, #80]              // in_kh_step
    add r8, r8, r12                 // next kernel row
    subs r6, r6, #1
    bne LoopKh4
    ldr r12, [sp, #92]              // relu6 flag
    cmp r12, #0
    bne Relu64
    ldr r12, [sp, #88]              // relu flag
    cmp r12, #0
    bne Relu4
    b Write4
Relu64:
    vmin.f32 q0, q0, q14
    vmin.f32 q1, q1, q14
    vmin.f32 q2, q2, q14
    vmin.f32 q3, q3, q14
Relu4:                              // relu6 falls through: min(6) then max(0)
    vmax.f32 q0, q0, q15
    vmax.f32 q1, q1, q15
    vmax.f32 q2, q2, q15
    vmax.f32 q3, q3, q15
Write4:
    ldr r12, [sp, #68]              // block_channel
    vst1.32 {q0}, [r0]
    add r0, r0, r12
    vst1.32 {q1}, [r0]
    add r0, r0, r12
    vst1.32 {q2}, [r0]
    add r0, r0, r12
    vst1.32 {q3}, [r0]
    add r0, r0, r12
    mov r12, #4
    mul r11, r11, r12               // 4 * in_sw_step
    add r1, r1, r11                 // src_w advances past the 4 consumed pixels
    sub r5, r5, #4
    cmp r5, #0                      // BUGFIX: "cmp r5, r5, #0" is not a valid instruction
    ble LoopWEnd                    // BUGFIX: row finished -> advance row pointers, don't exit the kernel
    cmp r5, #4
    bge LoopW4                      // BUGFIX: stay in the 4-wide loop (was "bge LoopW")
LoopW:                              // single-pixel remainder loop
    mov r8, r1                      // src_kh
    ldr r2, [sp, #8]                // weight_kh
    ldr r6, [sp, #56]               // kernel_h
    vmov q0, q13                    // acc = bias
LoopKh:
    ldr r12, [sp, #84]              // in_kw_step
    ldr r7, [sp, #60]               // kernel_w
    mov r10, r8                     // src_kw
LoopKw:
    vld1.32 {q1}, [r10]
    add r10, r10, r12
    vld1.32 {q12}, [r2]!
    vmla.f32 q0, q1, q12
    subs r7, r7, #1
    bne LoopKw
    ldr r12, [sp, #80]              // in_kh_step
    add r8, r8, r12
    subs r6, r6, #1
    bne LoopKh
    ldr r12, [sp, #92]              // relu6 flag
    cmp r12, #0
    bne Relu6
    ldr r12, [sp, #88]              // relu flag
    cmp r12, #0
    bne Relu
    b Write
Relu6:
    vmin.f32 q0, q0, q14
Relu:
    vmax.f32 q0, q0, q15
Write:
    ldr r12, [sp, #68]              // block_channel
    vst1.32 {q0}, [r0]
    add r0, r0, r12
    ldr r12, [sp, #76]              // in_sw_step
    add r1, r1, r12
    subs r5, r5, #1
    bne LoopW
LoopWEnd:                           // row tail: bump the per-row base pointers kept in the spill slots
    ldr r3, [sp, #64]               // out_h_step
    ldr r12, [sp]
    add r12, r12, r3
    str r12, [sp]                   // dst row base += out_h_step
    ldr r3, [sp, #72]               // in_sh_step
    ldr r12, [sp, #4]
    add r12, r12, r3
    str r12, [sp, #4]               // src row base += in_sh_step
    subs r4, r4, #1
    bne LoopH

    sub sp, sp, #64                 // undo the offset adjustment before restoring registers
    vpop {q4-q7}
    pop {r0-r8, r10, r11, pc}
#endif
#endif
|
|
@ -0,0 +1,207 @@
|
|||
#ifdef __arm__
#ifndef __aarch64__

.text
.align 5
.global ConvDwInt8Center
#ifndef __APPLE__
.type ConvDwInt8Center, %function
#endif

// void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height,
//                       size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                       size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
//                       int out_multiplier, int left_shift, int right_shift, int32_t out_zp,
//                       int32_t acc_min, int32_t acc_max);
// r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: width, #56: kernel_h, #60: kernel_w,
// #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step, #84: in_kw_step
// #88: out_multiplier, #92: left_shift, #96: right_shift, #100: out_zp, #104: acc_min, #108: acc_max
//
// Quantized depthwise convolution inner kernel for one 4-channel block.
// acc(i32) = bias + sum(src(i16) * weight(i16)), then requantize:
// saturating shifts/multiply, add zero point, clamp, narrow to int8.
// Constant registers: q9 left_shift, q10 out_multiplier, q11 right_shift,
// q12 out_zp, q13 acc_min, q14 acc_max, q15 bias.
ConvDwInt8Center:
    // at return, clang generates "push {lr} ... pop {pc}" while gcc generates "bx lr"
    // according to https://stackoverflow.com/questions/53625807 -- we use the clang form.
    // r4-r8, r10-r11 and q4-q7 must be preserved per AAPCS
    // (https://static.docs.arm.com/ihi0042/i/aapcs32.pdf).
    push {r0-r8, r10, r11, lr}
    vpush {q4-q7}
    // BUGFIX: add back only the vpush size (64B); the #48.. stack-argument offsets
    // and the [sp]/[sp,#4]/[sp,#8] spilled dst/src/weight slots assume sp points at
    // the saved r0, 48 bytes below the incoming stack arguments.
    add sp, sp, #64

    ldr r4, [sp, #48]               // height

    ldr r12, [sp, #92]
    vdup.32 q9, r12                 // left_shift

    ldr r11, [sp, #88]
    vdup.32 q10, r11                // out_multiplier

    ldr r10, [sp, #96]
    vdup.32 q11, r10                // right_shift

    ldr r8, [sp, #100]
    vdup.32 q12, r8                 // out_zp

    ldr r7, [sp, #104]
    vdup.32 q13, r7                 // acc_min

    ldr r6, [sp, #108]
    vdup.32 q14, r6                 // acc_max

    vld1.32 {q15}, [r3]             // bias for this 4-channel block

LoopH:
    ldr r1, [sp, #4]                // src_w
    ldr r5, [sp, #52]               // width (remaining pixels in this row)
    ldr r0, [sp]                    // dst_w
    // BUGFIX: the 4-pixel unrolled path must only run while >= 4 pixels remain;
    // it previously ran unconditionally and decremented width by 1 per 4 pixels.
    cmp r5, #4
    blt LoopW
LoopW4:
    ldr r11, [sp, #76]              // in_sw_step (BUGFIX: "mov" cannot take a memory operand)
    mov r8, r1                      // src_kh
    ldr r2, [sp, #8]                // weight_kh
    ldr r6, [sp, #56]               // kernel_h
    // BUGFIX: every accumulator starts from the bias, not only q0.
    vmov q0, q15
    vmov q1, q15
    vmov q2, q15
    vmov q3, q15
LoopKh4:
    ldr r12, [sp, #84]              // in_kw_step (BUGFIX: was loading in_kh_step at #80)
    ldr r7, [sp, #60]               // kernel_w
    mov lr, r8                      // src_kw
LoopKw4:
    mov r10, lr                     // BUGFIX: restart from src_kw for each kernel column
    // BUGFIX: load weights into d16 (caller-saved, otherwise unused); the original
    // d24 aliases the low half of q12 and corrupted the out_zp constant.
    vld1.16 {d16}, [r2]!
    vld1.16 {d8}, [r10]
    add r10, r10, r11
    vmlal.s16 q0, d8, d16
    vld1.16 {d10}, [r10]
    add r10, r10, r11
    vmlal.s16 q1, d10, d16
    vld1.16 {d12}, [r10]
    add r10, r10, r11
    vmlal.s16 q2, d12, d16
    vld1.16 {d14}, [r10]
    add r10, r10, r11
    vmlal.s16 q3, d14, d16
    subs r7, r7, #1
    add lr, lr, r12                 // BUGFIX: advance by in_kw_step per kernel column
    bne LoopKw4
    ldr r12, [sp, #80]              // in_kh_step
    add r8, r8, r12
    subs r6, r6, #1
    bne LoopKh4

    // Requantize all four pixels: ((acc << ls) *~ mult) >>r rs, + zp, clamp.
    vshl.s32 q0, q0, q9
    vshl.s32 q1, q1, q9
    vshl.s32 q2, q2, q9
    vshl.s32 q3, q3, q9
    vqrdmulh.s32 q0, q0, q10
    vqrdmulh.s32 q1, q1, q10
    vqrdmulh.s32 q2, q2, q10
    vqrdmulh.s32 q3, q3, q10
    vrshl.s32 q0, q0, q11
    vrshl.s32 q1, q1, q11
    vrshl.s32 q2, q2, q11
    vrshl.s32 q3, q3, q11
    vadd.i32 q0, q0, q12
    vadd.i32 q1, q1, q12
    vadd.i32 q2, q2, q12
    vadd.i32 q3, q3, q12
    vmax.s32 q0, q0, q13
    vmax.s32 q1, q1, q13
    vmax.s32 q2, q2, q13
    vmax.s32 q3, q3, q13
    vmin.s32 q0, q0, q14
    vmin.s32 q1, q1, q14
    vmin.s32 q2, q2, q14
    vmin.s32 q3, q3, q14

    // Saturating narrow 32 -> 16 -> 8 bit.
    vqmovn.s32 d0, q0
    vqmovn.s32 d2, q1
    vqmovn.s32 d4, q2
    vqmovn.s32 d6, q3
    vqmovn.s16 d0, q0
    vqmovn.s16 d2, q1
    vqmovn.s16 d4, q2
    vqmovn.s16 d6, q3

    // Store the 4 int8 channels of each of the 4 output pixels.
    ldr r12, [sp, #68]              // block_channel
    mov r3, r0
    vst1.8 {d0[0]}, [r3]!
    vst1.8 {d0[1]}, [r3]!
    vst1.8 {d0[2]}, [r3]!
    vst1.8 {d0[3]}, [r3]!
    add r0, r0, r12
    mov r3, r0
    vst1.8 {d2[0]}, [r3]!
    vst1.8 {d2[1]}, [r3]!
    vst1.8 {d2[2]}, [r3]!
    vst1.8 {d2[3]}, [r3]!
    add r0, r0, r12
    mov r3, r0
    vst1.8 {d4[0]}, [r3]!
    vst1.8 {d4[1]}, [r3]!
    vst1.8 {d4[2]}, [r3]!
    vst1.8 {d4[3]}, [r3]!
    add r0, r0, r12
    mov r3, r0
    vst1.8 {d6[0]}, [r3]!
    vst1.8 {d6[1]}, [r3]!
    vst1.8 {d6[2]}, [r3]!
    vst1.8 {d6[3]}, [r3]!
    add r0, r0, r12

    mov r12, #4
    mul r11, r11, r12               // 4 * in_sw_step
    add r1, r1, r11                 // src_w advances past the 4 consumed pixels
    // BUGFIX: four pixels were consumed -- decrement by 4 and fall back to the
    // single-pixel loop for any remainder (was "subs r5, r5, #1").
    sub r5, r5, #4
    cmp r5, #0
    ble LoopWEnd
    cmp r5, #4
    bge LoopW4
LoopW:                              // single-pixel remainder loop
    mov r8, r1                      // src_kh
    ldr r2, [sp, #8]                // weight_kh
    ldr r6, [sp, #56]               // kernel_h
    vmov q0, q15                    // acc = bias
LoopKh:
    ldr r12, [sp, #84]              // in_kw_step
    ldr r7, [sp, #60]               // kernel_w
    mov r10, r8                     // src_kw
LoopKw:
    vld1.16 {d2}, [r10]
    add r10, r10, r12
    vld1.16 {d16}, [r2]!            // BUGFIX: d24 would corrupt out_zp (aliases q12)
    vmlal.s16 q0, d2, d16
    subs r7, r7, #1
    bne LoopKw
    ldr r12, [sp, #80]              // in_kh_step
    add r8, r8, r12
    subs r6, r6, #1
    bne LoopKh

    vshl.s32 q0, q0, q9
    vqrdmulh.s32 q0, q0, q10
    vrshl.s32 q0, q0, q11
    vadd.i32 q0, q0, q12
    vmax.s32 q0, q0, q13
    vmin.s32 q0, q0, q14

    vqmovn.s32 d0, q0
    vqmovn.s16 d0, q0

    ldr r12, [sp, #68]              // block_channel
    mov r3, r0
    vst1.8 {d0[0]}, [r3]!
    vst1.8 {d0[1]}, [r3]!
    vst1.8 {d0[2]}, [r3]!
    vst1.8 {d0[3]}, [r3]!
    add r0, r0, r12
    ldr r12, [sp, #76]              // in_sw_step
    add r1, r1, r12
    subs r5, r5, #1
    bne LoopW
LoopWEnd:                           // row tail: bump the per-row base pointers kept in the spill slots
    ldr r3, [sp, #64]               // out_h_step
    ldr r12, [sp]
    add r12, r12, r3
    str r12, [sp]                   // dst row base += out_h_step
    ldr r3, [sp, #72]               // in_sh_step
    ldr r12, [sp, #4]
    add r12, r12, r3
    str r12, [sp, #4]               // src row base += in_sh_step
    subs r4, r4, #1
    bne LoopH

    sub sp, sp, #64                 // undo the offset adjustment before restoring registers
    vpop {q4-q7}
    pop {r0-r8, r10, r11, pc}
#endif
#endif
|
|
@ -0,0 +1,69 @@
|
|||
#ifdef __arm__
#ifndef __aarch64__

.text
.align 5
.global DeconvDwFp32Center
#ifndef __APPLE__
.type DeconvDwFp32Center, %function
#endif

// void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
//                         size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step
// #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
//
// Depthwise transposed convolution (scatter form): each 4-channel source pixel is
// multiplied by every kernel weight and ACCUMULATED into a kernel_h x kernel_w
// window of dst (read-modify-write).  Stack-argument offsets (#48 and up) are
// relative to sp after the 12-register push; [sp], [sp,#4], [sp,#8] are the
// spilled dst/src/weight pointers, updated in place as row bases.
DeconvDwFp32Center:
    // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
    // according to https://stackoverflow.com/questions/53625807
    // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
    // clang's rule seems more simple, though there are no subroutine calls here
    // r4-r8 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
    // (q4-q7 are never written here, so no vpush is required)
    push {r0-r8, r10, r11, lr}

    ldr r10, [sp, #80]              // in_kw_step (dst column stride inside the kernel window)
    ldr r11, [sp, #76]              // in_kh_step (dst row stride inside the kernel window)

LoopH:
    ldr r0, [sp]                    // dst_w  (current row base, from spilled r0)
    ldr r1, [sp, #4]                // src_w  (current row base, from spilled r1)
    ldr r4, [sp, #48]               // width (source pixels left in this row)
LoopW:
    mov r6, r0                      // dst_kh (top of this pixel's scatter window)
    ldr r2, [sp, #8]                // weight_kh (restart weights for each source pixel)
    ldr r5, [sp, #52]               // kernel_h
    vld1.32 {q1}, [r1]              // the 4-channel source pixel, reused for the whole window
LoopKh:
    mov r7, r6                      // dst_kw
    ldr r12, [sp, #56]              // kernel_w
LoopKw:
    vld1.32 {q0}, [r7]              // dst[kh][kw] += src * weight (read-modify-write)
    vld1.32 {q2}, [r2]!
    vmla.f32 q0, q1, q2
    vst1.32 {q0}, [r7]
    add r7, r7, r10                 // next kernel column in dst
    subs r12, r12, #1
    bne LoopKw
    add r6, r6, r11                 // next kernel row in dst
    subs r5, r5, #1
    bne LoopKh
    ldr r12, [sp, #72]              // in_sw_step
    add r0, r0, r12                 // dst window advances one output stride
    ldr r8, [sp, #64]               // block_channel
    add r1, r1, r8                  // next source pixel of this 4-channel block
    subs r4, r4, #1
    bne LoopW
    ldr r8, [sp, #68]               // in_sh_step
    ldr r12, [sp]
    add r12, r12, r8
    str r12, [sp]                   // dst row base += in_sh_step
    ldr r8, [sp, #60]               // out_h_step
    ldr r12, [sp, #4]
    add r12, r12, r8
    str r12, [sp, #4]               // src row base += out_h_step
    subs r3, r3, #1                 // height--
    bne LoopH

    pop {r0-r8, r10, r11, pc}
#endif
#endif
|
|
@ -0,0 +1,69 @@
|
|||
#ifdef __arm__
#ifndef __aarch64__

.text
.align 5
.global DeconvDwInt8Center
#ifndef __APPLE__
.type DeconvDwInt8Center, %function
#endif

// void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
//                         size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step
// #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
//
// Quantized depthwise transposed convolution (scatter form): each 4-channel int16
// source pixel is widening-multiply-accumulated (vmlal.s16 -> int32) into a
// kernel_h x kernel_w window of the int32 dst.  Stack-argument offsets (#48 and
// up) are relative to sp after the 12-register push; [sp], [sp,#4], [sp,#8] are
// the spilled dst/src/weight pointers, updated in place as row bases.
DeconvDwInt8Center:
    // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
    // according to https://stackoverflow.com/questions/53625807
    // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
    // clang's rule seems more simple, though there are no subroutine calls here
    // r4-r8 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
    // (only q0/q1 and d24 are written -- all caller-saved -- so no vpush is required)
    push {r0-r8, r10, r11, lr}

    ldr r10, [sp, #80]              // in_kw_step (dst column stride inside the kernel window)
    ldr r11, [sp, #76]              // in_kh_step (dst row stride inside the kernel window)

LoopH:
    ldr r0, [sp]                    // dst_w  (current row base, from spilled r0)
    ldr r1, [sp, #4]                // src_w  (current row base, from spilled r1)
    ldr r4, [sp, #48]               // width (source pixels left in this row)
LoopW:
    mov r6, r0                      // dst_kh (top of this pixel's scatter window)
    ldr r2, [sp, #8]                // weight_kh (restart weights for each source pixel)
    ldr r5, [sp, #52]               // kernel_h
    vld1.16 {d2}, [r1]              // the 4-channel int16 source pixel, reused for the window
LoopKh:
    mov r7, r6                      // dst_kw
    ldr r12, [sp, #56]              // kernel_w
LoopKw:
    vld1.32 {q0}, [r7]              // dst[kh][kw] (int32x4) += src * weight, widening
    vld1.16 {d24}, [r2]!
    vmlal.s16 q0, d2, d24
    vst1.32 {q0}, [r7]
    add r7, r7, r10                 // next kernel column in dst
    subs r12, r12, #1
    bne LoopKw
    add r6, r6, r11                 // next kernel row in dst
    subs r5, r5, #1
    bne LoopKh
    ldr r12, [sp, #72]              // in_sw_step
    add r0, r0, r12                 // dst window advances one output stride
    ldr r8, [sp, #64]               // block_channel
    add r1, r1, r8                  // next source pixel of this 4-channel block
    subs r4, r4, #1
    bne LoopW
    ldr r8, [sp, #68]               // in_sh_step
    ldr r12, [sp]
    add r12, r12, r8
    str r12, [sp]                   // dst row base += in_sh_step
    ldr r8, [sp, #60]               // out_h_step
    ldr r12, [sp, #4]
    add r12, r12, r8
    str r12, [sp, #4]               // src row base += out_h_step
    subs r3, r3, #1                 // height--
    bne LoopH

    pop {r0-r8, r10, r11, pc}
#endif
#endif
|
|
@ -32,24 +32,238 @@ ConvDwFp32Center:
|
|||
ldr x14, [sp, #48]
|
||||
ldr x15, [sp, #56]
|
||||
|
||||
ld1 {v5.4s}, [x3]
|
||||
ld1 {v24.4s}, [x3]
|
||||
movi v26.4s, #6
|
||||
scvtf v26.4s, v26.4s
|
||||
dup v27.4s, wzr
|
||||
|
||||
LoopH:
|
||||
mov x23, x1
|
||||
mov x24, x5
|
||||
mov x3, x0
|
||||
cmp x24, #8
|
||||
blt LoopW
|
||||
cmp x24, #16
|
||||
blt LoopW8
|
||||
|
||||
LoopW16:
|
||||
mov x19, #16
|
||||
mul x19, x19, x11
|
||||
mov x16, x23
|
||||
mov x17, x2
|
||||
mov x20, x6
|
||||
mov v0.16b, v24.16b
|
||||
mov v1.16b, v24.16b
|
||||
mov v2.16b, v24.16b
|
||||
mov v3.16b, v24.16b
|
||||
mov v4.16b, v24.16b
|
||||
mov v5.16b, v24.16b
|
||||
mov v6.16b, v24.16b
|
||||
mov v7.16b, v24.16b
|
||||
mov v8.16b, v24.16b
|
||||
mov v9.16b, v24.16b
|
||||
mov v10.16b, v24.16b
|
||||
mov v11.16b, v24.16b
|
||||
mov v12.16b, v24.16b
|
||||
mov v13.16b, v24.16b
|
||||
mov v14.16b, v24.16b
|
||||
mov v15.16b, v24.16b
|
||||
LoopKh16:
|
||||
mov x18, x7
|
||||
mov x21, x16
|
||||
LoopKw16:
|
||||
mov x22, x21
|
||||
ld1 {v25.4s}, [x17], #16
|
||||
ld1 {v16.4s}, [x22], x11
|
||||
ld1 {v17.4s}, [x22], x11
|
||||
fmla v0.4s, v16.4s, v25.4s
|
||||
fmla v1.4s, v17.4s, v25.4s
|
||||
ld1 {v18.4s}, [x22], x11
|
||||
ld1 {v19.4s}, [x22], x11
|
||||
fmla v2.4s, v18.4s, v25.4s
|
||||
fmla v3.4s, v19.4s, v25.4s
|
||||
ld1 {v20.4s}, [x22], x11
|
||||
ld1 {v21.4s}, [x22], x11
|
||||
fmla v4.4s, v20.4s, v25.4s
|
||||
fmla v5.4s, v21.4s, v25.4s
|
||||
ld1 {v22.4s}, [x22], x11
|
||||
ld1 {v23.4s}, [x22], x11
|
||||
fmla v6.4s, v22.4s, v25.4s
|
||||
fmla v7.4s, v23.4s, v25.4s
|
||||
ld1 {v16.4s}, [x22], x11
|
||||
ld1 {v17.4s}, [x22], x11
|
||||
fmla v8.4s, v16.4s, v25.4s
|
||||
fmla v9.4s, v17.4s, v25.4s
|
||||
ld1 {v18.4s}, [x22], x11
|
||||
ld1 {v19.4s}, [x22], x11
|
||||
fmla v10.4s, v18.4s, v25.4s
|
||||
fmla v11.4s, v19.4s, v25.4s
|
||||
ld1 {v20.4s}, [x22], x11
|
||||
ld1 {v21.4s}, [x22], x11
|
||||
fmla v12.4s, v20.4s, v25.4s
|
||||
fmla v13.4s, v21.4s, v25.4s
|
||||
ld1 {v22.4s}, [x22], x11
|
||||
ld1 {v23.4s}, [x22], x11
|
||||
fmla v14.4s, v22.4s, v25.4s
|
||||
fmla v15.4s, v23.4s, v25.4s
|
||||
subs x18, x18, #1
|
||||
add x21, x21, x13
|
||||
bne LoopKw16
|
||||
add x16, x16, x12
|
||||
subs x20, x20, #1
|
||||
bne LoopKh16
|
||||
cbnz x15, Relu616
|
||||
cbnz x14, Relu16
|
||||
b Write16
|
||||
Relu616:
|
||||
fmin v0.4s, v0.4s, v26.4s
|
||||
fmin v1.4s, v1.4s, v26.4s
|
||||
fmin v2.4s, v2.4s, v26.4s
|
||||
fmin v3.4s, v3.4s, v26.4s
|
||||
fmin v4.4s, v4.4s, v26.4s
|
||||
fmin v5.4s, v5.4s, v26.4s
|
||||
fmin v6.4s, v6.4s, v26.4s
|
||||
fmin v7.4s, v7.4s, v26.4s
|
||||
fmin v8.4s, v8.4s, v26.4s
|
||||
fmin v9.4s, v9.4s, v26.4s
|
||||
fmin v10.4s, v10.4s, v26.4s
|
||||
fmin v11.4s, v11.4s, v26.4s
|
||||
fmin v12.4s, v12.4s, v26.4s
|
||||
fmin v13.4s, v13.4s, v26.4s
|
||||
fmin v14.4s, v14.4s, v26.4s
|
||||
fmin v15.4s, v15.4s, v26.4s
|
||||
Relu16:
|
||||
fmax v0.4s, v0.4s, v27.4s
|
||||
fmax v1.4s, v1.4s, v27.4s
|
||||
fmax v2.4s, v2.4s, v27.4s
|
||||
fmax v3.4s, v3.4s, v27.4s
|
||||
fmax v4.4s, v4.4s, v27.4s
|
||||
fmax v5.4s, v5.4s, v27.4s
|
||||
fmax v6.4s, v6.4s, v27.4s
|
||||
fmax v7.4s, v7.4s, v27.4s
|
||||
fmax v8.4s, v8.4s, v27.4s
|
||||
fmax v9.4s, v9.4s, v27.4s
|
||||
fmax v10.4s, v10.4s, v27.4s
|
||||
fmax v11.4s, v11.4s, v27.4s
|
||||
fmax v12.4s, v12.4s, v27.4s
|
||||
fmax v13.4s, v13.4s, v27.4s
|
||||
fmax v14.4s, v14.4s, v27.4s
|
||||
fmax v15.4s, v15.4s, v27.4s
|
||||
Write16:
|
||||
st1 {v0.4s}, [x3], x9
|
||||
st1 {v1.4s}, [x3], x9
|
||||
st1 {v2.4s}, [x3], x9
|
||||
st1 {v3.4s}, [x3], x9
|
||||
st1 {v4.4s}, [x3], x9
|
||||
st1 {v5.4s}, [x3], x9
|
||||
st1 {v6.4s}, [x3], x9
|
||||
st1 {v7.4s}, [x3], x9
|
||||
st1 {v8.4s}, [x3], x9
|
||||
st1 {v9.4s}, [x3], x9
|
||||
st1 {v10.4s}, [x3], x9
|
||||
st1 {v11.4s}, [x3], x9
|
||||
st1 {v12.4s}, [x3], x9
|
||||
st1 {v13.4s}, [x3], x9
|
||||
st1 {v14.4s}, [x3], x9
|
||||
st1 {v15.4s}, [x3], x9
|
||||
add x23, x23, x19
|
||||
sub x24, x24, #16
|
||||
cmp x24, #0
|
||||
ble LoopWEnd
|
||||
cmp x24, #8
|
||||
blt LoopW
|
||||
cmp x24, #16
|
||||
bge LoopW16
|
||||
LoopW8:
|
||||
mov x19, #8
|
||||
mul x19, x19, x11
|
||||
mov x16, x23
|
||||
mov x17, x2
|
||||
mov x20, x6
|
||||
mov v0.16b, v24.16b
|
||||
mov v1.16b, v24.16b
|
||||
mov v2.16b, v24.16b
|
||||
mov v3.16b, v24.16b
|
||||
mov v4.16b, v24.16b
|
||||
mov v5.16b, v24.16b
|
||||
mov v6.16b, v24.16b
|
||||
mov v7.16b, v24.16b
|
||||
LoopKh8:
|
||||
mov x18, x7
|
||||
mov x21, x16
|
||||
LoopKw8:
|
||||
mov x22, x21
|
||||
ld1 {v25.4s}, [x17], #16
|
||||
ld1 {v16.4s}, [x22], x11
|
||||
ld1 {v17.4s}, [x22], x11
|
||||
fmla v0.4s, v16.4s, v25.4s
|
||||
fmla v1.4s, v17.4s, v25.4s
|
||||
ld1 {v18.4s}, [x22], x11
|
||||
ld1 {v19.4s}, [x22], x11
|
||||
fmla v2.4s, v18.4s, v25.4s
|
||||
fmla v3.4s, v19.4s, v25.4s
|
||||
ld1 {v20.4s}, [x22], x11
|
||||
ld1 {v21.4s}, [x22], x11
|
||||
fmla v4.4s, v20.4s, v25.4s
|
||||
fmla v5.4s, v21.4s, v25.4s
|
||||
ld1 {v22.4s}, [x22], x11
|
||||
ld1 {v23.4s}, [x22], x11
|
||||
fmla v6.4s, v22.4s, v25.4s
|
||||
fmla v7.4s, v23.4s, v25.4s
|
||||
subs x18, x18, #1
|
||||
add x21, x21, x13
|
||||
bne LoopKw8
|
||||
add x16, x16, x12
|
||||
subs x20, x20, #1
|
||||
bne LoopKh8
|
||||
cbnz x15, Relu68
|
||||
cbnz x14, Relu8
|
||||
b Write8
|
||||
Relu68:
|
||||
fmin v0.4s, v0.4s, v26.4s
|
||||
fmin v1.4s, v1.4s, v26.4s
|
||||
fmin v2.4s, v2.4s, v26.4s
|
||||
fmin v3.4s, v3.4s, v26.4s
|
||||
fmin v4.4s, v4.4s, v26.4s
|
||||
fmin v5.4s, v5.4s, v26.4s
|
||||
fmin v6.4s, v6.4s, v26.4s
|
||||
fmin v7.4s, v7.4s, v26.4s
|
||||
Relu8:
|
||||
fmax v0.4s, v0.4s, v27.4s
|
||||
fmax v1.4s, v1.4s, v27.4s
|
||||
fmax v2.4s, v2.4s, v27.4s
|
||||
fmax v3.4s, v3.4s, v27.4s
|
||||
fmax v4.4s, v4.4s, v27.4s
|
||||
fmax v5.4s, v5.4s, v27.4s
|
||||
fmax v6.4s, v6.4s, v27.4s
|
||||
fmax v7.4s, v7.4s, v27.4s
|
||||
Write8:
|
||||
st1 {v0.4s}, [x3], x9
|
||||
st1 {v1.4s}, [x3], x9
|
||||
st1 {v2.4s}, [x3], x9
|
||||
st1 {v3.4s}, [x3], x9
|
||||
st1 {v4.4s}, [x3], x9
|
||||
st1 {v5.4s}, [x3], x9
|
||||
st1 {v6.4s}, [x3], x9
|
||||
st1 {v7.4s}, [x3], x9
|
||||
add x23, x23, x19
|
||||
sub x24, x24, #8
|
||||
cmp x24, #0
|
||||
ble LoopWEnd
|
||||
cmp x24, #8
|
||||
bge LoopW8
|
||||
LoopW:
|
||||
mov x16, x23
|
||||
mov x17, x2
|
||||
mov x20, x6
|
||||
mov v0.16b, v5.16b
|
||||
mov v0.16b, v24.16b
|
||||
LoopKh:
|
||||
mov x18, x7
|
||||
mov x22, x16
|
||||
LoopKw:
|
||||
ld1 {v1.4s}, [x22], x13
|
||||
ld1 {v2.4s}, [x17], #16
|
||||
fmla v0.4s, v1.4s, v2.4s
|
||||
ld1 {v16.4s}, [x22], x13
|
||||
ld1 {v25.4s}, [x17], #16
|
||||
fmla v0.4s, v16.4s, v25.4s
|
||||
subs x18, x18, #1
|
||||
bne LoopKw
|
||||
add x16, x16, x12
|
||||
|
@ -59,17 +273,15 @@ ConvDwFp32Center:
|
|||
cbnz x14, Relu
|
||||
b Write
|
||||
Relu6:
|
||||
movi v4.4s, #6
|
||||
scvtf v4.4s, v4.4s
|
||||
fmin v0.4s, v0.4s, v4.4s
|
||||
fmin v0.4s, v0.4s, v26.4s
|
||||
Relu:
|
||||
dup v3.4s, wzr
|
||||
fmax v0.4s, v0.4s, v3.4s
|
||||
fmax v0.4s, v0.4s, v27.4s
|
||||
Write:
|
||||
st1 {v0.4s}, [x3], x9
|
||||
add x23, x23, x11
|
||||
subs x24, x24, #1
|
||||
bne LoopW
|
||||
LoopWEnd:
|
||||
add x0, x0, x8
|
||||
add x1, x1, x10
|
||||
subs x4, x4, #1
|
||||
|
|
|
@ -0,0 +1,558 @@
|
|||
#ifdef __aarch64__
|
||||
|
||||
.text
|
||||
.align 5
|
||||
.global ConvDwInt8Center
|
||||
#ifndef __APPLE__
|
||||
.type ConvDwInt8Center, %function
|
||||
#endif
|
||||
|
||||
// void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, size_t width,
|
||||
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
|
||||
// size_t in_kh_step, size_t in_kw_step, int out_multiplier, int left_shift,
|
||||
// int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max);
|
||||
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w,
|
||||
// x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
|
||||
// x14: out_multiplier, #56: left_shift, #64: right_shift, #72:out_zp, #80: acc_min, #88: acc_max
|
||||
ConvDwInt8Center:
|
||||
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
|
||||
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
|
||||
// x19 ~ x29 should be also preserved
|
||||
// whereas our coding style do not permit such amount of parameters
|
||||
sub sp, sp, #48
|
||||
stp x19, x20, [sp], #16
|
||||
stp x21, x22, [sp], #16
|
||||
stp x23, x24, [sp], #16
|
||||
|
||||
ldr x8, [sp]
|
||||
ldr x9, [sp, #8]
|
||||
ldr x10, [sp, #16]
|
||||
ldr x11, [sp, #24]
|
||||
ldr x12, [sp, #32]
|
||||
ldr x13, [sp, #40]
|
||||
|
||||
ldr w14, [sp, #56]
|
||||
dup v26.4s, w14
|
||||
|
||||
ldr x15, [sp, #48]
|
||||
dup v27.4s, w15
|
||||
|
||||
ldr w16, [sp, #64]
|
||||
dup v28.4s, w16
|
||||
|
||||
ldr w17, [sp, #72]
|
||||
dup v29.4s, w17
|
||||
|
||||
ldr w18, [sp, #80]
|
||||
dup v30.4s, w18
|
||||
|
||||
ldr w19, [sp, #88]
|
||||
dup v31.4s, w19
|
||||
|
||||
ld1 {v24.4s}, [x3]
|
||||
|
||||
LoopH:
|
||||
mov x23, x1
|
||||
mov x24, x5
|
||||
mov x3, x0
|
||||
cmp x24, #8
|
||||
blt LoopW
|
||||
cmp x24, #16
|
||||
blt LoopW8
|
||||
|
||||
LoopW16:
|
||||
mov x19, #16
|
||||
mul x19, x19, x11
|
||||
mov x16, x23
|
||||
mov x17, x2
|
||||
mov x20, x6
|
||||
mov v0.16b, v24.16b
|
||||
mov v1.16b, v24.16b
|
||||
mov v2.16b, v24.16b
|
||||
mov v3.16b, v24.16b
|
||||
mov v4.16b, v24.16b
|
||||
mov v5.16b, v24.16b
|
||||
mov v6.16b, v24.16b
|
||||
mov v7.16b, v24.16b
|
||||
mov v8.16b, v24.16b
|
||||
mov v9.16b, v24.16b
|
||||
mov v10.16b, v24.16b
|
||||
mov v11.16b, v24.16b
|
||||
mov v12.16b, v24.16b
|
||||
mov v13.16b, v24.16b
|
||||
mov v14.16b, v24.16b
|
||||
mov v15.16b, v24.16b
|
||||
LoopKh16:
|
||||
mov x18, x7
|
||||
mov x21, x16
|
||||
LoopKw16:
|
||||
mov x22, x21
|
||||
ld1 {v25.4h}, [x17], #8
|
||||
ld1 {v16.4h}, [x22], x13
|
||||
ld1 {v17.4h}, [x22], x13
|
||||
smlal v0.4s, v16.4h, v25.4h
|
||||
smlal v1.4s, v17.4h, v25.4h
|
||||
ld1 {v18.4h}, [x22], x13
|
||||
ld1 {v19.4h}, [x22], x13
|
||||
smlal v2.4s, v18.4h, v25.4h
|
||||
smlal v3.4s, v19.4h, v25.4h
|
||||
ld1 {v20.4h}, [x22], x13
|
||||
ld1 {v21.4h}, [x22], x13
|
||||
smlal v4.4s, v20.4h, v25.4h
|
||||
smlal v5.4s, v21.4h, v25.4h
|
||||
ld1 {v22.4h}, [x22], x13
|
||||
ld1 {v23.4h}, [x22], x13
|
||||
smlal v6.4s, v22.4h, v25.4h
|
||||
smlal v7.4s, v23.4h, v25.4h
|
||||
ld1 {v16.4h}, [x22], x13
|
||||
ld1 {v17.4h}, [x22], x13
|
||||
smlal v8.4s, v16.4h, v25.4h
|
||||
smlal v9.4s, v17.4h, v25.4h
|
||||
ld1 {v18.4h}, [x22], x13
|
||||
ld1 {v19.4h}, [x22], x13
|
||||
smlal v10.4s, v18.4h, v25.4h
|
||||
smlal v11.4s, v19.4h, v25.4h
|
||||
ld1 {v20.4h}, [x22], x13
|
||||
ld1 {v21.4h}, [x22], x13
|
||||
smlal v12.4s, v20.4h, v25.4h
|
||||
smlal v13.4s, v21.4h, v25.4h
|
||||
ld1 {v22.4h}, [x22], x13
|
||||
ld1 {v23.4h}, [x22], x13
|
||||
smlal v14.4s, v22.4h, v25.4h
|
||||
smlal v15.4s, v23.4h, v25.4h
|
||||
subs x18, x18, #1
|
||||
add x21, x21, x13
|
||||
bne LoopKw16
|
||||
add x16, x16, x12
|
||||
subs x20, x20, #1
|
||||
bne LoopKh16
|
||||
|
||||
sqshl v0.4s, v0.4s ,v26.4s
|
||||
sqshl v1.4s, v1.4s ,v26.4s
|
||||
sqshl v2.4s, v2.4s ,v26.4s
|
||||
sqshl v3.4s, v3.4s ,v26.4s
|
||||
sqshl v4.4s, v4.4s ,v26.4s
|
||||
sqshl v5.4s, v5.4s ,v26.4s
|
||||
sqshl v6.4s, v6.4s ,v26.4s
|
||||
sqshl v7.4s, v7.4s ,v26.4s
|
||||
sqshl v8.4s, v8.4s ,v26.4s
|
||||
sqshl v9.4s, v9.4s ,v26.4s
|
||||
sqshl v10.4s, v10.4s ,v26.4s
|
||||
sqshl v11.4s, v11.4s ,v26.4s
|
||||
sqshl v12.4s, v12.4s ,v26.4s
|
||||
sqshl v13.4s, v13.4s ,v26.4s
|
||||
sqshl v14.4s, v14.4s ,v26.4s
|
||||
sqshl v15.4s, v15.4s ,v26.4s
|
||||
sqrdmulh v0.4s, v0.4s ,v27.4s
|
||||
sqrdmulh v1.4s, v1.4s ,v27.4s
|
||||
sqrdmulh v2.4s, v2.4s ,v27.4s
|
||||
sqrdmulh v3.4s, v3.4s ,v27.4s
|
||||
sqrdmulh v4.4s, v4.4s ,v27.4s
|
||||
sqrdmulh v5.4s, v5.4s ,v27.4s
|
||||
sqrdmulh v6.4s, v6.4s ,v27.4s
|
||||
sqrdmulh v7.4s, v7.4s ,v27.4s
|
||||
sqrdmulh v8.4s, v8.4s ,v27.4s
|
||||
sqrdmulh v9.4s, v9.4s ,v27.4s
|
||||
sqrdmulh v10.4s, v10.4s ,v27.4s
|
||||
sqrdmulh v11.4s, v11.4s ,v27.4s
|
||||
sqrdmulh v12.4s, v12.4s ,v27.4s
|
||||
sqrdmulh v13.4s, v13.4s ,v27.4s
|
||||
sqrdmulh v14.4s, v14.4s ,v27.4s
|
||||
sqrdmulh v15.4s, v15.4s ,v27.4s
|
||||
sqrshl v0.4s, v0.4s ,v28.4s
|
||||
sqrshl v1.4s, v1.4s ,v28.4s
|
||||
sqrshl v2.4s, v2.4s ,v28.4s
|
||||
sqrshl v3.4s, v3.4s ,v28.4s
|
||||
sqrshl v4.4s, v4.4s ,v28.4s
|
||||
sqrshl v5.4s, v5.4s ,v28.4s
|
||||
sqrshl v6.4s, v6.4s ,v28.4s
|
||||
sqrshl v7.4s, v7.4s ,v28.4s
|
||||
sqrshl v8.4s, v8.4s ,v28.4s
|
||||
sqrshl v9.4s, v9.4s ,v28.4s
|
||||
sqrshl v10.4s, v10.4s ,v28.4s
|
||||
sqrshl v11.4s, v11.4s ,v28.4s
|
||||
sqrshl v12.4s, v12.4s ,v28.4s
|
||||
sqrshl v13.4s, v13.4s ,v28.4s
|
||||
sqrshl v14.4s, v14.4s ,v28.4s
|
||||
sqrshl v15.4s, v15.4s ,v28.4s
|
||||
add v0.4s, v0.4s ,v29.4s
|
||||
add v1.4s, v1.4s ,v29.4s
|
||||
add v2.4s, v2.4s ,v29.4s
|
||||
add v3.4s, v3.4s ,v29.4s
|
||||
add v4.4s, v4.4s ,v29.4s
|
||||
add v5.4s, v5.4s ,v29.4s
|
||||
add v6.4s, v6.4s ,v29.4s
|
||||
add v7.4s, v7.4s ,v29.4s
|
||||
add v8.4s, v8.4s ,v29.4s
|
||||
add v9.4s, v9.4s ,v29.4s
|
||||
add v10.4s, v10.4s ,v29.4s
|
||||
add v11.4s, v11.4s ,v29.4s
|
||||
add v12.4s, v12.4s ,v29.4s
|
||||
add v13.4s, v13.4s ,v29.4s
|
||||
add v14.4s, v14.4s ,v29.4s
|
||||
add v15.4s, v15.4s ,v29.4s
|
||||
smax v0.4s, v0.4s ,v30.4s
|
||||
smax v1.4s, v1.4s ,v30.4s
|
||||
smax v2.4s, v2.4s ,v30.4s
|
||||
smax v3.4s, v3.4s ,v30.4s
|
||||
smax v4.4s, v4.4s ,v30.4s
|
||||
smax v5.4s, v5.4s ,v30.4s
|
||||
smax v6.4s, v6.4s ,v30.4s
|
||||
smax v7.4s, v7.4s ,v30.4s
|
||||
smax v8.4s, v8.4s ,v30.4s
|
||||
smax v9.4s, v9.4s ,v30.4s
|
||||
smax v10.4s, v10.4s ,v30.4s
|
||||
smax v11.4s, v11.4s ,v30.4s
|
||||
smax v12.4s, v12.4s ,v30.4s
|
||||
smax v13.4s, v13.4s ,v30.4s
|
||||
smax v14.4s, v14.4s ,v30.4s
|
||||
smax v15.4s, v15.4s ,v30.4s
|
||||
smin v0.4s, v0.4s ,v31.4s
|
||||
smin v1.4s, v1.4s ,v31.4s
|
||||
smin v2.4s, v2.4s ,v31.4s
|
||||
smin v3.4s, v3.4s ,v31.4s
|
||||
smin v4.4s, v4.4s ,v31.4s
|
||||
smin v5.4s, v5.4s ,v31.4s
|
||||
smin v6.4s, v6.4s ,v31.4s
|
||||
smin v7.4s, v7.4s ,v31.4s
|
||||
smin v8.4s, v8.4s ,v31.4s
|
||||
smin v9.4s, v9.4s ,v31.4s
|
||||
smin v10.4s, v10.4s ,v31.4s
|
||||
smin v11.4s, v11.4s ,v31.4s
|
||||
smin v12.4s, v12.4s ,v31.4s
|
||||
smin v13.4s, v13.4s ,v31.4s
|
||||
smin v14.4s, v14.4s ,v31.4s
|
||||
smin v15.4s, v15.4s ,v31.4s
|
||||
|
||||
sqxtn v0.4h, v0.4s
|
||||
sqxtn v1.4h, v1.4s
|
||||
sqxtn v2.4h, v2.4s
|
||||
sqxtn v3.4h, v3.4s
|
||||
sqxtn v4.4h, v4.4s
|
||||
sqxtn v5.4h, v5.4s
|
||||
sqxtn v6.4h, v6.4s
|
||||
sqxtn v7.4h, v7.4s
|
||||
sqxtn v8.4h, v8.4s
|
||||
sqxtn v9.4h, v9.4s
|
||||
sqxtn v10.4h, v10.4s
|
||||
sqxtn v11.4h, v11.4s
|
||||
sqxtn v12.4h, v12.4s
|
||||
sqxtn v13.4h, v13.4s
|
||||
sqxtn v14.4h, v14.4s
|
||||
sqxtn v15.4h, v15.4s
|
||||
sqxtn v0.8b, v0.8h
|
||||
sqxtn v1.8b, v1.8h
|
||||
sqxtn v2.8b, v2.8h
|
||||
sqxtn v3.8b, v3.8h
|
||||
sqxtn v4.8b, v4.8h
|
||||
sqxtn v5.8b, v5.8h
|
||||
sqxtn v6.8b, v6.8h
|
||||
sqxtn v7.8b, v7.8h
|
||||
sqxtn v8.8b, v8.8h
|
||||
sqxtn v9.8b, v9.8h
|
||||
sqxtn v10.8b, v10.8h
|
||||
sqxtn v11.8b, v11.8h
|
||||
sqxtn v12.8b, v12.8h
|
||||
sqxtn v13.8b, v13.8h
|
||||
sqxtn v14.8b, v14.8h
|
||||
sqxtn v15.8b, v15.8h
|
||||
|
||||
add x17, x3, #1
|
||||
add x18, x3, #2
|
||||
add x21, x3, #3
|
||||
st1 {v0.b}[0], [x3], x9
|
||||
st1 {v0.b}[1], [x17], x9
|
||||
st1 {v0.b}[2], [x18], x9
|
||||
st1 {v0.b}[3], [x21], x9
|
||||
|
||||
st1 {v1.b}[0], [x3], x9
|
||||
st1 {v1.b}[1], [x17], x9
|
||||
st1 {v1.b}[2], [x18], x9
|
||||
st1 {v1.b}[3], [x21], x9
|
||||
|
||||
st1 {v2.b}[0], [x3], x9
|
||||
st1 {v2.b}[1], [x17], x9
|
||||
st1 {v2.b}[2], [x18], x9
|
||||
st1 {v2.b}[3], [x21], x9
|
||||
|
||||
st1 {v3.b}[0], [x3], x9
|
||||
st1 {v3.b}[1], [x17], x9
|
||||
st1 {v3.b}[2], [x18], x9
|
||||
st1 {v3.b}[3], [x21], x9
|
||||
|
||||
st1 {v4.b}[0], [x3], x9
|
||||
st1 {v4.b}[1], [x17], x9
|
||||
st1 {v4.b}[2], [x18], x9
|
||||
st1 {v4.b}[3], [x21], x9
|
||||
|
||||
st1 {v5.b}[0], [x3], x9
|
||||
st1 {v5.b}[1], [x17], x9
|
||||
st1 {v5.b}[2], [x18], x9
|
||||
st1 {v5.b}[3], [x21], x9
|
||||
|
||||
st1 {v6.b}[0], [x3], x9
|
||||
st1 {v6.b}[1], [x17], x9
|
||||
st1 {v6.b}[2], [x18], x9
|
||||
st1 {v6.b}[3], [x21], x9
|
||||
|
||||
st1 {v7.b}[0], [x3], x9
|
||||
st1 {v7.b}[1], [x17], x9
|
||||
st1 {v7.b}[2], [x18], x9
|
||||
st1 {v7.b}[3], [x21], x9
|
||||
|
||||
st1 {v8.b}[0], [x3], x9
|
||||
st1 {v8.b}[1], [x17], x9
|
||||
st1 {v8.b}[2], [x18], x9
|
||||
st1 {v8.b}[3], [x21], x9
|
||||
|
||||
st1 {v9.b}[0], [x3], x9
|
||||
st1 {v9.b}[1], [x17], x9
|
||||
st1 {v9.b}[2], [x18], x9
|
||||
st1 {v9.b}[3], [x21], x9
|
||||
|
||||
st1 {v10.b}[0], [x3], x9
|
||||
st1 {v10.b}[1], [x17], x9
|
||||
st1 {v10.b}[2], [x18], x9
|
||||
st1 {v10.b}[3], [x21], x9
|
||||
|
||||
st1 {v11.b}[0], [x3], x9
|
||||
st1 {v11.b}[1], [x17], x9
|
||||
st1 {v11.b}[2], [x18], x9
|
||||
st1 {v11.b}[3], [x21], x9
|
||||
|
||||
st1 {v12.b}[0], [x3], x9
|
||||
st1 {v12.b}[1], [x17], x9
|
||||
st1 {v12.b}[2], [x18], x9
|
||||
st1 {v12.b}[3], [x21], x9
|
||||
|
||||
st1 {v13.b}[0], [x3], x9
|
||||
st1 {v13.b}[1], [x17], x9
|
||||
st1 {v13.b}[2], [x18], x9
|
||||
st1 {v13.b}[3], [x21], x9
|
||||
|
||||
st1 {v14.b}[0], [x3], x9
|
||||
st1 {v14.b}[1], [x17], x9
|
||||
st1 {v14.b}[2], [x18], x9
|
||||
st1 {v14.b}[3], [x21], x9
|
||||
|
||||
st1 {v15.b}[0], [x3], x9
|
||||
st1 {v15.b}[1], [x17], x9
|
||||
st1 {v15.b}[2], [x18], x9
|
||||
st1 {v15.b}[3], [x21], x9
|
||||
|
||||
add x23, x23, x19
|
||||
sub x24, x24, #16
|
||||
cmp x24, #0
|
||||
ble LoopWEnd
|
||||
cmp x24, #8
|
||||
blt LoopW
|
||||
cmp x24, #16
|
||||
bge LoopW16
|
||||
LoopW8:
|
||||
mov x19, #8
|
||||
mul x19, x19, x11
|
||||
mov x16, x23
|
||||
mov x17, x2
|
||||
mov x20, x6
|
||||
mov v0.16b, v24.16b
|
||||
mov v1.16b, v24.16b
|
||||
mov v2.16b, v24.16b
|
||||
mov v3.16b, v24.16b
|
||||
mov v4.16b, v24.16b
|
||||
mov v5.16b, v24.16b
|
||||
mov v6.16b, v24.16b
|
||||
mov v7.16b, v24.16b
|
||||
LoopKh8:
|
||||
mov x18, x7
|
||||
mov x21, x16
|
||||
LoopKw8:
|
||||
mov x22, x21
|
||||
ld1 {v25.4h}, [x17], #8
|
||||
ld1 {v16.4h}, [x22], x13
|
||||
ld1 {v17.4h}, [x22], x13
|
||||
smlal v0.4s, v16.4h, v25.4h
|
||||
smlal v1.4s, v17.4h, v25.4h
|
||||
ld1 {v18.4h}, [x22], x13
|
||||
ld1 {v19.4h}, [x22], x13
|
||||
smlal v2.4s, v18.4h, v25.4h
|
||||
smlal v3.4s, v19.4h, v25.4h
|
||||
ld1 {v20.4h}, [x22], x13
|
||||
ld1 {v21.4h}, [x22], x13
|
||||
smlal v4.4s, v20.4h, v25.4h
|
||||
smlal v5.4s, v21.4h, v25.4h
|
||||
ld1 {v22.4h}, [x22], x13
|
||||
ld1 {v23.4h}, [x22], x13
|
||||
smlal v6.4s, v22.4h, v25.4h
|
||||
smlal v7.4s, v23.4h, v25.4h
|
||||
subs x18, x18, #1
|
||||
add x21, x21, x13
|
||||
bne LoopKw8
|
||||
add x16, x16, x12
|
||||
subs x20, x20, #1
|
||||
bne LoopKh8
|
||||
|
||||
sqshl v0.4s, v0.4s ,v26.4s
|
||||
sqshl v1.4s, v1.4s ,v26.4s
|
||||
sqshl v2.4s, v2.4s ,v26.4s
|
||||
sqshl v3.4s, v3.4s ,v26.4s
|
||||
sqshl v4.4s, v4.4s ,v26.4s
|
||||
sqshl v5.4s, v5.4s ,v26.4s
|
||||
sqshl v6.4s, v6.4s ,v26.4s
|
||||
sqshl v7.4s, v7.4s ,v26.4s
|
||||
sqrdmulh v0.4s, v0.4s ,v27.4s
|
||||
sqrdmulh v1.4s, v1.4s ,v27.4s
|
||||
sqrdmulh v2.4s, v2.4s ,v27.4s
|
||||
sqrdmulh v3.4s, v3.4s ,v27.4s
|
||||
sqrdmulh v4.4s, v4.4s ,v27.4s
|
||||
sqrdmulh v5.4s, v5.4s ,v27.4s
|
||||
sqrdmulh v6.4s, v6.4s ,v27.4s
|
||||
sqrdmulh v7.4s, v7.4s ,v27.4s
|
||||
sqrshl v0.4s, v0.4s ,v28.4s
|
||||
sqrshl v1.4s, v1.4s ,v28.4s
|
||||
sqrshl v2.4s, v2.4s ,v28.4s
|
||||
sqrshl v3.4s, v3.4s ,v28.4s
|
||||
sqrshl v4.4s, v4.4s ,v28.4s
|
||||
sqrshl v5.4s, v5.4s ,v28.4s
|
||||
sqrshl v6.4s, v6.4s ,v28.4s
|
||||
sqrshl v7.4s, v7.4s ,v28.4s
|
||||
add v0.4s, v0.4s ,v29.4s
|
||||
add v1.4s, v1.4s ,v29.4s
|
||||
add v2.4s, v2.4s ,v29.4s
|
||||
add v3.4s, v3.4s ,v29.4s
|
||||
add v4.4s, v4.4s ,v29.4s
|
||||
add v5.4s, v5.4s ,v29.4s
|
||||
add v6.4s, v6.4s ,v29.4s
|
||||
add v7.4s, v7.4s ,v29.4s
|
||||
smax v0.4s, v0.4s ,v30.4s
|
||||
smax v1.4s, v1.4s ,v30.4s
|
||||
smax v2.4s, v2.4s ,v30.4s
|
||||
smax v3.4s, v3.4s ,v30.4s
|
||||
smax v4.4s, v4.4s ,v30.4s
|
||||
smax v5.4s, v5.4s ,v30.4s
|
||||
smax v6.4s, v6.4s ,v30.4s
|
||||
smax v7.4s, v7.4s ,v30.4s
|
||||
smin v0.4s, v0.4s ,v31.4s
|
||||
smin v1.4s, v1.4s ,v31.4s
|
||||
smin v2.4s, v2.4s ,v31.4s
|
||||
smin v3.4s, v3.4s ,v31.4s
|
||||
smin v4.4s, v4.4s ,v31.4s
|
||||
smin v5.4s, v5.4s ,v31.4s
|
||||
smin v6.4s, v6.4s ,v31.4s
|
||||
smin v7.4s, v7.4s ,v31.4s
|
||||
|
||||
sqxtn v0.4h, v0.4s
|
||||
sqxtn v1.4h, v1.4s
|
||||
sqxtn v2.4h, v2.4s
|
||||
sqxtn v3.4h, v3.4s
|
||||
sqxtn v4.4h, v4.4s
|
||||
sqxtn v5.4h, v5.4s
|
||||
sqxtn v6.4h, v6.4s
|
||||
sqxtn v7.4h, v7.4s
|
||||
sqxtn v0.8b, v0.8h
|
||||
sqxtn v1.8b, v1.8h
|
||||
sqxtn v2.8b, v2.8h
|
||||
sqxtn v3.8b, v3.8h
|
||||
sqxtn v4.8b, v4.8h
|
||||
sqxtn v5.8b, v5.8h
|
||||
sqxtn v6.8b, v6.8h
|
||||
sqxtn v7.8b, v7.8h
|
||||
|
||||
add x17, x3, #1
|
||||
add x18, x3, #2
|
||||
add x21, x3, #3
|
||||
st1 {v0.b}[0], [x3], x9
|
||||
st1 {v0.b}[1], [x17], x9
|
||||
st1 {v0.b}[2], [x18], x9
|
||||
st1 {v0.b}[3], [x21], x9
|
||||
|
||||
st1 {v1.b}[0], [x3], x9
|
||||
st1 {v1.b}[1], [x17], x9
|
||||
st1 {v1.b}[2], [x18], x9
|
||||
st1 {v1.b}[3], [x21], x9
|
||||
|
||||
st1 {v2.b}[0], [x3], x9
|
||||
st1 {v2.b}[1], [x17], x9
|
||||
st1 {v2.b}[2], [x18], x9
|
||||
st1 {v2.b}[3], [x21], x9
|
||||
|
||||
st1 {v3.b}[0], [x3], x9
|
||||
st1 {v3.b}[1], [x17], x9
|
||||
st1 {v3.b}[2], [x18], x9
|
||||
st1 {v3.b}[3], [x21], x9
|
||||
|
||||
st1 {v4.b}[0], [x3], x9
|
||||
st1 {v4.b}[1], [x17], x9
|
||||
st1 {v4.b}[2], [x18], x9
|
||||
st1 {v4.b}[3], [x21], x9
|
||||
|
||||
st1 {v5.b}[0], [x3], x9
|
||||
st1 {v5.b}[1], [x17], x9
|
||||
st1 {v5.b}[2], [x18], x9
|
||||
st1 {v5.b}[3], [x21], x9
|
||||
|
||||
st1 {v6.b}[0], [x3], x9
|
||||
st1 {v6.b}[1], [x17], x9
|
||||
st1 {v6.b}[2], [x18], x9
|
||||
st1 {v6.b}[3], [x21], x9
|
||||
|
||||
st1 {v7.b}[0], [x3], x9
|
||||
st1 {v7.b}[1], [x17], x9
|
||||
st1 {v7.b}[2], [x18], x9
|
||||
st1 {v7.b}[3], [x21], x9
|
||||
|
||||
add x23, x23, x19
|
||||
sub x24, x24, #8
|
||||
cmp x24, #0
|
||||
ble LoopWEnd
|
||||
cmp x24, #8
|
||||
bge LoopW8
|
||||
LoopW:
|
||||
mov x16, x23
|
||||
mov x17, x2
|
||||
mov x20, x6
|
||||
mov v0.16b, v24.16b
|
||||
LoopKh:
|
||||
mov x18, x7
|
||||
mov x22, x16
|
||||
LoopKw:
|
||||
ld1 {v16.4h}, [x22], x13
|
||||
ld1 {v25.4h}, [x17], #8
|
||||
smlal v0.4s, v16.4h, v25.4h
|
||||
subs x18, x18, #1
|
||||
bne LoopKw
|
||||
add x16, x16, x12
|
||||
subs x20, x20, #1
|
||||
bne LoopKh
|
||||
|
||||
sqshl v0.4s, v0.4s ,v26.4s
|
||||
sqrdmulh v0.4s, v0.4s ,v27.4s
|
||||
sqrshl v0.4s, v0.4s ,v28.4s
|
||||
add v0.4s, v0.4s ,v29.4s
|
||||
smax v0.4s, v0.4s ,v30.4s
|
||||
smin v0.4s, v0.4s ,v31.4s
|
||||
|
||||
sqxtn v0.4h, v0.4s
|
||||
sqxtn v0.8b, v0.8h
|
||||
|
||||
mov x17, x3
|
||||
st1 {v0.b}[0], [x17], #1
|
||||
st1 {v0.b}[1], [x17], #1
|
||||
st1 {v0.b}[2], [x17], #1
|
||||
st1 {v0.b}[3], [x17], #1
|
||||
add x3, x3, x9
|
||||
|
||||
add x23, x23, x11
|
||||
subs x24, x24, #1
|
||||
bne LoopW
|
||||
LoopWEnd:
|
||||
add x0, x0, x8
|
||||
add x1, x1, x10
|
||||
subs x4, x4, #1
|
||||
bne LoopH
|
||||
|
||||
sub sp, sp, #48
|
||||
ldp x19, x20, [sp], #16
|
||||
ldp x21, x22, [sp], #16
|
||||
ldp x23, x24, [sp], #16
|
||||
ret
|
||||
#endif
|
|
@ -35,12 +35,12 @@ DeconvDwFp32Center:
|
|||
mov x18, x15
|
||||
mov x19, x2
|
||||
mov x20, x5
|
||||
dup v0.4s, wzr
|
||||
ld1 {v1.4s}, [x16], x8
|
||||
LoopKh:
|
||||
mov x21, x18
|
||||
mov x13, x6
|
||||
LoopKw:
|
||||
ld1 {v1.4s}, [x16]
|
||||
ld1 {v0.4s}, [x21]
|
||||
ld1 {v2.4s}, [x19], #16
|
||||
fmla v0.4s, v1.4s, v2.4s
|
||||
st1 {v0.4s}, [x21], x12
|
||||
|
@ -50,7 +50,6 @@ DeconvDwFp32Center:
|
|||
subs x20, x20, #1
|
||||
bne LoopKh
|
||||
add x15, x15, x10
|
||||
add x16, x16, x8
|
||||
subs x17, x17, #1
|
||||
bne LoopW
|
||||
add x0, x0, x9
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
#ifdef __aarch64__

.text
.align 5
.global DeconvDwInt8Center
#ifndef __APPLE__
.type DeconvDwInt8Center, %function
#endif

// void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                         size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// Register args: x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
// Stack args:    x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
// Depthwise deconvolution "center" kernel, 4 channels per step: for every input pixel (height x width)
// the kernel_h x kernel_w window of the int32 dst is accumulated with src * weight (widening int16 multiply).
// NOTE(review): for deconvolution the "in_*" strides address dst and out_h_step/block_channel address src.
DeconvDwInt8Center:
    // x19 ~ x22 are callee-saved (AAPCS64); spill them into a real frame and keep sp
    // lowered while they are live — AArch64 Linux has no red zone, so data stored
    // below sp may be clobbered by a signal handler. Stack args are then at sp + 32.
    sub sp, sp, #32
    stp x19, x20, [sp]
    stp x21, x22, [sp, #16]

    ldr x8, [sp, #32]                   // block_channel
    ldr x9, [sp, #40]                   // in_sh_step
    ldr x10, [sp, #48]                  // in_sw_step
    ldr x11, [sp, #56]                  // in_kh_step
    ldr x12, [sp, #64]                  // in_kw_step

    LoopH:
        mov x15, x0                     // dst for this input row
        mov x16, x1                     // src for this input row
        mov x17, x4                     // width counter
        LoopW:
            mov x18, x15                // dst for this input pixel
                                        // NOTE(review): x18 is the platform register under AAPCS64 —
                                        // consider a different scratch register for portability
            mov x19, x2                 // weight rewinds for every input pixel
            mov x20, x5                 // kernel_h counter
            // Load the 4-channel group of the current input pixel once; the
            // post-index already advances src by block_channel for the next pixel.
            ld1 {v1.4h}, [x16], x8
            LoopKh:
                mov x21, x18
                mov x13, x6             // kernel_w counter
                LoopKw:
                    ld1 {v0.4s}, [x21]          // load int32 accumulator from dst
                    ld1 {v2.4h}, [x19], #8      // next 4 int16 weights
                    smlal v0.4s, v1.4h, v2.4h   // dst += (int32)src * weight
                    st1 {v0.4s}, [x21], x12     // store back, step in_kw_step
                    subs x13, x13, #1
                    bne LoopKw
                add x18, x18, x11               // next kernel row in dst
                subs x20, x20, #1
                bne LoopKh
            add x15, x15, x10                   // dst steps one output column
            // BUGFIX: removed "add x16, x16, x8" here — src is already advanced by
            // the post-indexed ld1 above; the pair advanced src by 2 * block_channel
            // per iteration and skipped every other input pixel (the fp16 variant
            // DeconvDwFp16Center uses the post-index alone).
            subs x17, x17, #1
            bne LoopW
        add x0, x0, x9                          // dst += in_sh_step
        add x1, x1, x7                          // src += out_h_step
        subs x3, x3, #1
        bne LoopH

    ldp x19, x20, [sp]
    ldp x21, x22, [sp, #16]
    add sp, sp, #32
    ret
#endif
|
|
@ -0,0 +1,294 @@
|
|||
#ifdef __aarch64__

.text
.align 5
.global ConvDwFp16Center
#ifndef __APPLE__
.type ConvDwFp16Center, %function
#endif

// void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
//                       size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
//                       size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step,
//                       size_t in_kw_step, size_t relu, size_t relu6);
// Register args: x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w
// Stack args:    x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step,
//                x13: in_kw_step, x14: relu, x15: relu6
// fp16 depthwise convolution "center" kernel, 8 channels per vector; the width loop is tiled by
// 16 / 8 / 1 output pixels, with optional ReLU / ReLU6 fused before the store.
ConvDwFp16Center:
    // BUGFIX (ABI): this function uses v0 ~ v15 as accumulators, and the low 64 bits of
    // v8 ~ v15 (d8 ~ d15) are callee-saved under AAPCS64, so they must be spilled along
    // with x19 ~ x24. Keep sp lowered while the frame is live (no red zone on Linux
    // AArch64); the eight stack arguments therefore sit at sp + 112 .. sp + 168.
    sub sp, sp, #112
    stp x19, x20, [sp]
    stp x21, x22, [sp, #16]
    stp x23, x24, [sp, #32]
    stp d8, d9, [sp, #48]
    stp d10, d11, [sp, #64]
    stp d12, d13, [sp, #80]
    stp d14, d15, [sp, #96]

    ldr x8, [sp, #112]                  // out_h_step
    ldr x9, [sp, #120]                  // block_channel
    ldr x10, [sp, #128]                 // in_sh_step
    ldr x11, [sp, #136]                 // in_sw_step
    ldr x12, [sp, #144]                 // in_kh_step
    ldr x13, [sp, #152]                 // in_kw_step
    ldr x14, [sp, #160]                 // relu flag
    ldr x15, [sp, #168]                 // relu6 flag

    ld1 {v24.8h}, [x3]                  // v24 = bias (accumulator init)
    movi v26.8h, #0x46, lsl #8          // v26 = 8 x fp16 6.0 (0x4600), ReLU6 upper bound
    dup v27.4s, wzr                     // v27 = 0.0, ReLU lower bound

    LoopH:
        mov x23, x1                     // src for this output row
        mov x24, x5                     // width counter
        mov x3, x0                      // dst for this output row (x3 = bias ptr no longer needed)
        cmp x24, #8
        blt LoopW                       // width < 8  -> scalar-vector tail loop
        cmp x24, #16
        blt LoopW8                      // 8 <= width < 16 -> 8-pixel tile

    LoopW16:                            // 16 output pixels per iteration
        mov x19, #16
        mul x19, x19, x11               // x19 = 16 * in_sw_step = src advance per tile
        mov x16, x23                    // src window origin
        mov x17, x2                     // weight rewinds per tile
        mov x20, x6                     // kernel_h counter
        mov v0.16b, v24.16b             // init 16 accumulators with bias
        mov v1.16b, v24.16b
        mov v2.16b, v24.16b
        mov v3.16b, v24.16b
        mov v4.16b, v24.16b
        mov v5.16b, v24.16b
        mov v6.16b, v24.16b
        mov v7.16b, v24.16b
        mov v8.16b, v24.16b
        mov v9.16b, v24.16b
        mov v10.16b, v24.16b
        mov v11.16b, v24.16b
        mov v12.16b, v24.16b
        mov v13.16b, v24.16b
        mov v14.16b, v24.16b
        mov v15.16b, v24.16b
        LoopKh16:
            mov x18, x7                 // kernel_w counter
                                        // NOTE(review): x18 is the AAPCS64 platform register —
                                        // consider a different scratch register for portability
            mov x21, x16
            LoopKw16:
                mov x22, x21            // x22 walks 16 pixels, stride in_sw_step
                ld1 {v25.8h}, [x17], #16        // one weight vector, shared by all 16 pixels
                ld1 {v16.8h}, [x22], x11
                ld1 {v17.8h}, [x22], x11
                fmla v0.8h, v16.8h, v25.8h
                fmla v1.8h, v17.8h, v25.8h
                ld1 {v18.8h}, [x22], x11
                ld1 {v19.8h}, [x22], x11
                fmla v2.8h, v18.8h, v25.8h
                fmla v3.8h, v19.8h, v25.8h
                ld1 {v20.8h}, [x22], x11
                ld1 {v21.8h}, [x22], x11
                fmla v4.8h, v20.8h, v25.8h
                fmla v5.8h, v21.8h, v25.8h
                ld1 {v22.8h}, [x22], x11
                ld1 {v23.8h}, [x22], x11
                fmla v6.8h, v22.8h, v25.8h
                fmla v7.8h, v23.8h, v25.8h
                ld1 {v16.8h}, [x22], x11
                ld1 {v17.8h}, [x22], x11
                fmla v8.8h, v16.8h, v25.8h
                fmla v9.8h, v17.8h, v25.8h
                ld1 {v18.8h}, [x22], x11
                ld1 {v19.8h}, [x22], x11
                fmla v10.8h, v18.8h, v25.8h
                fmla v11.8h, v19.8h, v25.8h
                ld1 {v20.8h}, [x22], x11
                ld1 {v21.8h}, [x22], x11
                fmla v12.8h, v20.8h, v25.8h
                fmla v13.8h, v21.8h, v25.8h
                ld1 {v22.8h}, [x22], x11
                ld1 {v23.8h}, [x22], x11
                fmla v14.8h, v22.8h, v25.8h
                fmla v15.8h, v23.8h, v25.8h
                subs x18, x18, #1
                add x21, x21, x13       // next kernel column (in_kw_step)
                bne LoopKw16
            add x16, x16, x12           // next kernel row (in_kh_step)
            subs x20, x20, #1
            bne LoopKh16
        cbnz x15, Relu616               // fused activation selection
        cbnz x14, Relu16
        b Write16
    Relu616:                            // clamp to [0, 6]
        fmin v0.8h, v0.8h, v26.8h
        fmin v1.8h, v1.8h, v26.8h
        fmin v2.8h, v2.8h, v26.8h
        fmin v3.8h, v3.8h, v26.8h
        fmin v4.8h, v4.8h, v26.8h
        fmin v5.8h, v5.8h, v26.8h
        fmin v6.8h, v6.8h, v26.8h
        fmin v7.8h, v7.8h, v26.8h
        fmin v8.8h, v8.8h, v26.8h
        fmin v9.8h, v9.8h, v26.8h
        fmin v10.8h, v10.8h, v26.8h
        fmin v11.8h, v11.8h, v26.8h
        fmin v12.8h, v12.8h, v26.8h
        fmin v13.8h, v13.8h, v26.8h
        fmin v14.8h, v14.8h, v26.8h
        fmin v15.8h, v15.8h, v26.8h
    Relu16:                             // clamp to [0, +inf)
        fmax v0.8h, v0.8h, v27.8h
        fmax v1.8h, v1.8h, v27.8h
        fmax v2.8h, v2.8h, v27.8h
        fmax v3.8h, v3.8h, v27.8h
        fmax v4.8h, v4.8h, v27.8h
        fmax v5.8h, v5.8h, v27.8h
        fmax v6.8h, v6.8h, v27.8h
        fmax v7.8h, v7.8h, v27.8h
        fmax v8.8h, v8.8h, v27.8h
        fmax v9.8h, v9.8h, v27.8h
        fmax v10.8h, v10.8h, v27.8h
        fmax v11.8h, v11.8h, v27.8h
        fmax v12.8h, v12.8h, v27.8h
        fmax v13.8h, v13.8h, v27.8h
        fmax v14.8h, v14.8h, v27.8h
        fmax v15.8h, v15.8h, v27.8h
    Write16:                            // 16 stores, dst stride block_channel
        st1 {v0.8h}, [x3], x9
        st1 {v1.8h}, [x3], x9
        st1 {v2.8h}, [x3], x9
        st1 {v3.8h}, [x3], x9
        st1 {v4.8h}, [x3], x9
        st1 {v5.8h}, [x3], x9
        st1 {v6.8h}, [x3], x9
        st1 {v7.8h}, [x3], x9
        st1 {v8.8h}, [x3], x9
        st1 {v9.8h}, [x3], x9
        st1 {v10.8h}, [x3], x9
        st1 {v11.8h}, [x3], x9
        st1 {v12.8h}, [x3], x9
        st1 {v13.8h}, [x3], x9
        st1 {v14.8h}, [x3], x9
        st1 {v15.8h}, [x3], x9
        add x23, x23, x19               // src += 16 * in_sw_step
        sub x24, x24, #16
        cmp x24, #0
        ble LoopWEnd
        cmp x24, #8
        blt LoopW
        cmp x24, #16
        bge LoopW16

    LoopW8:                             // 8 output pixels per iteration
        mov x19, #8
        mul x19, x19, x11               // x19 = 8 * in_sw_step
        mov x16, x23
        mov x17, x2
        mov x20, x6
        mov v0.16b, v24.16b             // init 8 accumulators with bias
        mov v1.16b, v24.16b
        mov v2.16b, v24.16b
        mov v3.16b, v24.16b
        mov v4.16b, v24.16b
        mov v5.16b, v24.16b
        mov v6.16b, v24.16b
        mov v7.16b, v24.16b
        LoopKh8:
            mov x18, x7
            mov x21, x16
            LoopKw8:
                mov x22, x21
                ld1 {v25.8h}, [x17], #16
                ld1 {v16.8h}, [x22], x11
                ld1 {v17.8h}, [x22], x11
                fmla v0.8h, v16.8h, v25.8h
                fmla v1.8h, v17.8h, v25.8h
                ld1 {v18.8h}, [x22], x11
                ld1 {v19.8h}, [x22], x11
                fmla v2.8h, v18.8h, v25.8h
                fmla v3.8h, v19.8h, v25.8h
                ld1 {v20.8h}, [x22], x11
                ld1 {v21.8h}, [x22], x11
                fmla v4.8h, v20.8h, v25.8h
                fmla v5.8h, v21.8h, v25.8h
                ld1 {v22.8h}, [x22], x11
                ld1 {v23.8h}, [x22], x11
                fmla v6.8h, v22.8h, v25.8h
                fmla v7.8h, v23.8h, v25.8h
                subs x18, x18, #1
                add x21, x21, x13
                bne LoopKw8
            add x16, x16, x12
            subs x20, x20, #1
            bne LoopKh8
        cbnz x15, Relu68
        cbnz x14, Relu8
        b Write8
    Relu68:
        fmin v0.8h, v0.8h, v26.8h
        fmin v1.8h, v1.8h, v26.8h
        fmin v2.8h, v2.8h, v26.8h
        fmin v3.8h, v3.8h, v26.8h
        fmin v4.8h, v4.8h, v26.8h
        fmin v5.8h, v5.8h, v26.8h
        fmin v6.8h, v6.8h, v26.8h
        fmin v7.8h, v7.8h, v26.8h
    Relu8:
        fmax v0.8h, v0.8h, v27.8h
        fmax v1.8h, v1.8h, v27.8h
        fmax v2.8h, v2.8h, v27.8h
        fmax v3.8h, v3.8h, v27.8h
        fmax v4.8h, v4.8h, v27.8h
        fmax v5.8h, v5.8h, v27.8h
        fmax v6.8h, v6.8h, v27.8h
        fmax v7.8h, v7.8h, v27.8h
    Write8:
        st1 {v0.8h}, [x3], x9
        st1 {v1.8h}, [x3], x9
        st1 {v2.8h}, [x3], x9
        st1 {v3.8h}, [x3], x9
        st1 {v4.8h}, [x3], x9
        st1 {v5.8h}, [x3], x9
        st1 {v6.8h}, [x3], x9
        st1 {v7.8h}, [x3], x9
        add x23, x23, x19               // src += 8 * in_sw_step
        sub x24, x24, #8
        cmp x24, #0
        ble LoopWEnd
        cmp x24, #8
        bge LoopW8

    LoopW:                              // tail: one output pixel per iteration
        mov x16, x23
        mov x17, x2
        mov x20, x6
        mov v0.16b, v24.16b
        LoopKh:
            mov x18, x7
            mov x22, x16
            LoopKw:
                ld1 {v16.8h}, [x22], x13
                ld1 {v25.8h}, [x17], #16
                fmla v0.8h, v16.8h, v25.8h
                subs x18, x18, #1
                bne LoopKw
            add x16, x16, x12
            subs x20, x20, #1
            bne LoopKh
        cbnz x15, Relu6
        cbnz x14, Relu
        b Write
    Relu6:
        fmin v0.8h, v0.8h, v26.8h
    Relu:
        fmax v0.8h, v0.8h, v27.8h
    Write:
        st1 {v0.8h}, [x3], x9
        add x23, x23, x11               // src += in_sw_step
        subs x24, x24, #1
        bne LoopW

    LoopWEnd:
        add x0, x0, x8                  // dst += out_h_step
        add x1, x1, x10                 // src += in_sh_step
        subs x4, x4, #1
        bne LoopH

    ldp x19, x20, [sp]
    ldp x21, x22, [sp, #16]
    ldp x23, x24, [sp, #32]
    ldp d8, d9, [sp, #48]
    ldp d10, d11, [sp, #64]
    ldp d12, d13, [sp, #80]
    ldp d14, d15, [sp, #96]
    add sp, sp, #112
    ret
#endif
|
|
@ -0,0 +1,64 @@
|
|||
#ifdef __aarch64__

.text
.align 5
.global DeconvDwFp16Center
#ifndef __APPLE__
.type DeconvDwFp16Center, %function
#endif

// void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height,
//                         size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
//                         size_t block_channel, size_t in_sh_step, size_t in_sw_step,
//                         size_t in_kh_step, size_t in_kw_step);
// Register args: x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
// Stack args:    x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
// fp16 depthwise deconvolution "center" kernel, 8 channels per vector: for every input pixel the
// kernel_h x kernel_w window of dst is accumulated with src * weight.
// NOTE(review): for deconvolution the "in_*" strides address dst and out_h_step/block_channel address src.
DeconvDwFp16Center:
    // x19 ~ x22 are callee-saved (AAPCS64); spill them into a real frame and keep sp
    // lowered while they are live — AArch64 Linux has no red zone, so data stored
    // below sp may be clobbered by a signal handler. Stack args are then at sp + 32.
    sub sp, sp, #32
    stp x19, x20, [sp]
    stp x21, x22, [sp, #16]

    ldr x8, [sp, #32]                   // block_channel
    ldr x9, [sp, #40]                   // in_sh_step
    ldr x10, [sp, #48]                  // in_sw_step
    ldr x11, [sp, #56]                  // in_kh_step
    ldr x12, [sp, #64]                  // in_kw_step

    LoopH:
        mov x15, x0                     // dst for this input row
        mov x16, x1                     // src for this input row
        mov x17, x4                     // width counter
        LoopW:
            mov x18, x15                // dst for this input pixel
                                        // NOTE(review): x18 is the platform register under AAPCS64 —
                                        // consider a different scratch register for portability
            mov x19, x2                 // weight rewinds for every input pixel
            mov x20, x5                 // kernel_h counter
            // One 8-channel vector of the current input pixel; the post-index
            // advances src by block_channel for the next pixel.
            ld1 {v1.8h}, [x16], x8
            LoopKh:
                mov x21, x18
                mov x13, x6             // kernel_w counter
                LoopKw:
                    ld1 {v0.8h}, [x21]          // load dst accumulator
                    ld1 {v2.8h}, [x19], #16     // next weight vector
                    fmla v0.8h, v1.8h, v2.8h    // dst += src * weight
                    st1 {v0.8h}, [x21], x12     // store back, step in_kw_step
                    subs x13, x13, #1
                    bne LoopKw
                add x18, x18, x11               // next kernel row in dst
                subs x20, x20, #1
                bne LoopKh
            add x15, x15, x10                   // dst steps one output column
            subs x17, x17, #1
            bne LoopW
        add x0, x0, x9                          // dst += in_sh_step
        add x1, x1, x7                          // src += out_h_step
        subs x3, x3, #1
        bne LoopH

    ldp x19, x20, [sp]
    ldp x21, x22, [sp, #16]
    add sp, sp, #32
    ret
#endif
|
|
@ -0,0 +1,44 @@
|
|||
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "src/runtime/kernel/arm/opclib/op_base.h"
#include "src/runtime/kernel/arm/opclib/conv_parameter.h"

// C declarations for fp16 depthwise convolution helpers implemented in
// AArch64 assembly; guarded with extern "C" so C++ callers link unmangled names.
#ifdef __cplusplus
extern "C" {
#endif

#ifdef ENABLE_ARM64
// Assembly "center" kernel for fp16 depthwise convolution: accumulates a
// kernel_h x kernel_w window per output pixel, bias-initialized, with optional
// fused relu/relu6. All *_step strides are in bytes (callers multiply by
// sizeof(float16_t)).
void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
                      size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
                      size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step,
                      size_t in_kw_step, size_t relu, size_t relu6);
// Assembly "center" kernel for fp16 depthwise deconvolution: scatters each
// input pixel into a kernel_h x kernel_w window of dst. Strides in bytes.
void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
                        size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
                        size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
#endif

#ifdef __cplusplus
}
#endif

#endif /* MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_ */
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
#include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h"
|
||||
#include <arm_neon.h>
|
||||
#include "src/runtime/kernel/arm/opclib/fp16/common_func.h"
|
||||
|
||||
/*conv depthwise fp16 begin*/
|
||||
void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
|
||||
|
@ -79,6 +80,7 @@ void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *
|
|||
} // height loop
|
||||
}
|
||||
|
||||
#ifndef ENABLE_ARM64
|
||||
void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
|
||||
int height, int width, int kernel_h, int kernel_w, int out_h_step, int block_channel,
|
||||
int in_sh_step, int in_sw_step, int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6) {
|
||||
|
@ -97,12 +99,17 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *
|
|||
const float16_t *src_kw = src_kh;
|
||||
const float16_t *weight_kw = weight_kh;
|
||||
for (int kw = 0; kw < kernel_w; kw++) {
|
||||
#ifdef ENABLE_ARM64
|
||||
float16x8_t src_8 = vld1q_f16(src_kw);
|
||||
float16x8_t weight_8 = vld1q_f16(weight_kw);
|
||||
float16x8_t dst_8 = vld1q_f16(dst_w);
|
||||
dst_8 = vfmaq_f16(dst_8, src_8, weight_8);
|
||||
vst1q_f16(dst_w, dst_8);
|
||||
|
||||
#else
|
||||
for (int c = 0; c < C8NUM; c++) {
|
||||
dst_w[c] += src_kw[c] * weight_kw[c];
|
||||
}
|
||||
#endif
|
||||
src_kw += in_kw_step;
|
||||
weight_kw += C8NUM;
|
||||
} // kernel_w loop
|
||||
|
@ -122,6 +129,7 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *
|
|||
src_h += in_sh_step;
|
||||
} // dst_height loop
|
||||
}
|
||||
#endif
|
||||
|
||||
// conv depthwise fp16: sliding window
|
||||
void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
|
||||
|
@ -149,11 +157,19 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
|
|||
int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
|
||||
float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
|
||||
|
||||
#ifdef ENABLE_ARM64
|
||||
ConvDwFp16Center(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_,
|
||||
sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
|
||||
sliding->out_h_step_ * sizeof(float16_t), sliding->block_channel_ * sizeof(float16_t),
|
||||
sliding->in_sh_step_ * sizeof(float16_t), sliding->in_sw_step_ * sizeof(float16_t),
|
||||
sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t),
|
||||
conv_param->is_relu_, conv_param->is_relu6_);
|
||||
#else
|
||||
DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_,
|
||||
sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
|
||||
sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_,
|
||||
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
|
||||
#endif
|
||||
}
|
||||
} // output C8 loop
|
||||
src += sliding->in_step_;
|
||||
|
@ -214,6 +230,7 @@ void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float
|
|||
} // height loop
|
||||
}
|
||||
|
||||
#ifndef ENABLE_ARM64
|
||||
void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height, int width,
|
||||
int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step,
|
||||
int in_sw_step, int in_kh_step, int in_kw_step) {
|
||||
|
@ -229,12 +246,17 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float
|
|||
float16_t *dst_kw = dst_kh;
|
||||
const float16_t *weight_kw = weight_kh;
|
||||
for (int kw = 0; kw < kernel_w; kw++) {
|
||||
#ifdef ENABLE_ARM64
|
||||
float16x8_t src_8 = vld1q_f16(src_w);
|
||||
float16x8_t weight_8 = vld1q_f16(weight_kw);
|
||||
float16x8_t dst_8 = vld1q_f16(dst_kw);
|
||||
dst_8 = vfmaq_f16(dst_8, src_8, weight_8);
|
||||
vst1q_f16(dst_kw, dst_8);
|
||||
|
||||
#else
|
||||
for (int c = 0; c < C8NUM; c++) {
|
||||
dst_kw[c] += src_w[c] * weight_kw[c];
|
||||
}
|
||||
#endif
|
||||
dst_kw += in_kw_step;
|
||||
weight_kw += C8NUM;
|
||||
} // kernel_w loop
|
||||
|
@ -248,6 +270,7 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float
|
|||
src_h += out_h_step;
|
||||
} // dst_height loop
|
||||
}
|
||||
#endif
|
||||
|
||||
void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel,
|
||||
const ConvParameter *conv_param) {
|
||||
|
@ -289,11 +312,18 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f
|
|||
float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_;
|
||||
const float16_t *in_t =
|
||||
src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
|
||||
|
||||
#ifdef ENABLE_ARM64
|
||||
DeconvDwFp16Center(out_t, in_t, weight, sliding->bottom_ - sliding->top_,
|
||||
sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
|
||||
sliding->out_h_step_ * sizeof(float16_t), sliding->block_channel_ * sizeof(float16_t),
|
||||
sliding->in_sh_step_ * sizeof(float16_t), sliding->in_sw_step_ * sizeof(float16_t),
|
||||
sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t));
|
||||
#else
|
||||
DeconvDepthwiseCenterFp16(out_t, in_t, weight, sliding->bottom_ - sliding->top_,
|
||||
sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
|
||||
sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_,
|
||||
sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_);
|
||||
#endif
|
||||
}
|
||||
DeconvDepthwisePostFuncFp16(dst_data, bias, sliding->block_channel_, conv_param);
|
||||
} // output C8 loop
|
||||
|
|
|
@ -38,6 +38,15 @@ void MatrixSub(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stri
|
|||
void MatrixMultiAdd(float *c11, float *c12, float *c21, float *c22, float *x_ptr, size_t row, size_t col,
|
||||
size_t c_stride, size_t x_stride);
|
||||
|
||||
#ifdef ENABLE_ARM
|
||||
void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
|
||||
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
|
||||
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
|
||||
void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
|
||||
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
|
||||
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_ARM64
|
||||
void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size);
|
||||
void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size);
|
||||
|
@ -49,12 +58,6 @@ void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc,
|
|||
void C4BiasAddRelu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride);
|
||||
void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
|
||||
void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
|
||||
void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
|
||||
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
|
||||
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
|
||||
void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
|
||||
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
|
||||
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "src/runtime/kernel/arm/opclib/op_base.h"
#include "src/runtime/kernel/arm/opclib/conv_parameter.h"

// C declarations for int8/int16 convolution helpers implemented in ARM
// assembly; guarded with extern "C" so C++ callers link unmangled names.
#ifdef __cplusplus
extern "C" {
#endif

#ifdef ENABLE_ARM
// Indirect GEMM: int16 inputs/weights accumulated into int32, 8x4 tile.
void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8,
                               size_t oc4, size_t offset);

#ifdef ENABLE_ARM64
// Quantized indirect GEMM, 4x4 tile (AArch64): includes requantization
// (multiplier/shifts), zero-point and activation clamping.
void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize,
                          size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
                          size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before,
                          size_t shift_after);
#elif defined(ENABLE_ARM32)
// Quantized indirect GEMM, 2x4 tile (ARM32 counterpart of the 4x4 kernel).
void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize,
                          size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
                          size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before,
                          size_t shift_after);
#endif
#endif

#ifdef ENABLE_ARM
// Assembly "center" kernel for int8 depthwise deconvolution: scatters each
// int16 input pixel into a kernel_h x kernel_w window of the int32 dst.
// Strides are in bytes.
void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
                        size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
                        size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// Assembly "center" kernel for int8 depthwise convolution with fused
// requantization (multiplier/shifts, zero point) and accumulator clamping.
void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height,
                      size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
                      size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int out_multiplier,
                      int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max);
#endif

#ifdef __cplusplus
}
#endif

#endif /* MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_ */
|
||||
|
|
@ -17,6 +17,7 @@
|
|||
#include "src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.h"
|
||||
#include <string.h>
|
||||
#include "src/runtime/kernel/arm/opclib/quantization/fixed_point.h"
|
||||
#include "src/runtime/kernel/arm/opclib/int8/common_func.h"
|
||||
|
||||
/*conv depthwise int8 begin*/
|
||||
void DepthwiseBorderPixelInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height,
|
||||
|
@ -85,6 +86,7 @@ void DepthwiseBorderInt8(int8_t *dst, const int16_t *src, const int16_t *weight,
|
|||
} // height loop
|
||||
}
|
||||
|
||||
#ifndef ENABLE_ARM64
|
||||
void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height,
|
||||
int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step,
|
||||
int in_sw_step, int in_kh_step, int in_kw_step, int out_multiplier, int left_shift,
|
||||
|
@ -133,6 +135,7 @@ void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight,
|
|||
src_h += in_sh_step;
|
||||
} // dst_height loop
|
||||
}
|
||||
#endif
|
||||
|
||||
void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
|
||||
const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) {
|
||||
|
@ -158,7 +161,17 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
|
|||
int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
|
||||
const int16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * C4NUM;
|
||||
int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * C4NUM;
|
||||
|
||||
#ifdef ENABLE_ARM64
|
||||
ConvDwInt8Center(
|
||||
out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_,
|
||||
conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(int8_t),
|
||||
sliding->block_channel_ * sizeof(int8_t), sliding->in_sh_step_ * sizeof(int16_t),
|
||||
sliding->in_sw_step_ * sizeof(int16_t), sliding->in_kh_step_ * sizeof(int16_t),
|
||||
sliding->in_kw_step_ * sizeof(int16_t), conv_param->conv_quant_arg_.quant_multiplier_[0],
|
||||
conv_param->conv_quant_arg_.left_shift_[0], conv_param->conv_quant_arg_.right_shift_[0],
|
||||
conv_param->conv_quant_arg_.quant_args_[2][0].zp_, conv_param->conv_quant_arg_.out_act_min_[0],
|
||||
conv_param->conv_quant_arg_.out_act_max_[0]);
|
||||
#else
|
||||
DepthwiseCenterInt8(
|
||||
out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_,
|
||||
conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_,
|
||||
|
@ -166,6 +179,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
|
|||
conv_param->conv_quant_arg_.quant_multiplier_[0], conv_param->conv_quant_arg_.left_shift_[0],
|
||||
conv_param->conv_quant_arg_.right_shift_[0], conv_param->conv_quant_arg_.quant_args_[2][0].zp_,
|
||||
conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0]);
|
||||
#endif
|
||||
}
|
||||
} // output C4 loop
|
||||
src += sliding->in_step_;
|
||||
|
@ -222,6 +236,7 @@ void DeconvDepthwiseBorderInt8(int32_t *dst, const int16_t *src, const int16_t *
|
|||
} // height loop
|
||||
}
|
||||
|
||||
#ifndef ENABLE_ARM64
|
||||
void DeconvDepthwiseCenterInt8(int32_t *dst, const int16_t *src, const int16_t *weight, int height, int width,
|
||||
int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step,
|
||||
int in_sw_step, int in_kh_step, int in_kw_step) {
|
||||
|
@ -253,6 +268,7 @@ void DeconvDepthwiseCenterInt8(int32_t *dst, const int16_t *src, const int16_t *
|
|||
src_h += out_h_step;
|
||||
} // dst_height loop
|
||||
}
|
||||
#endif
|
||||
|
||||
void DeconvDepthwisePostFuncInt8(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel,
|
||||
const ConvParameter *conv_param, int out_multiplier, int left_shift, int right_shift,
|
||||
|
@ -302,11 +318,18 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in
|
|||
int32_t *out_t = output_buffer + oh_h_start * sliding->in_h_step_ + oh_w_start * C4NUM;
|
||||
const int16_t *in_t =
|
||||
src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
|
||||
|
||||
#ifdef ENABLE_ARM64
|
||||
DeconvDwInt8Center(out_t, in_t, weight, sliding->bottom_ - sliding->top_,
|
||||
sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
|
||||
sliding->out_h_step_ * sizeof(int16_t), sliding->block_channel_ * sizeof(int16_t),
|
||||
sliding->in_sh_step_ * sizeof(int32_t), sliding->in_sw_step_ * sizeof(int32_t),
|
||||
sliding->in_kh_step_ * sizeof(int32_t), sliding->in_kw_step_ * sizeof(int32_t));
|
||||
#else
|
||||
DeconvDepthwiseCenterInt8(out_t, in_t, weight, sliding->bottom_ - sliding->top_,
|
||||
sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
|
||||
sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_,
|
||||
sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_);
|
||||
#endif
|
||||
}
|
||||
DeconvDepthwisePostFuncInt8(
|
||||
dst_data, output_buffer, bias, sliding->block_channel_, conv_param,
|
||||
|
|
|
@ -17,25 +17,7 @@
|
|||
#include "src/runtime/kernel/arm/opclib/int8/conv_int8.h"
|
||||
#include <string.h>
|
||||
#include "src/runtime/kernel/arm/opclib/winograd_transform.h"
|
||||
|
||||
extern "C" {
|
||||
#ifdef ENABLE_ARM
|
||||
void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8,
|
||||
size_t oc4, size_t offset);
|
||||
|
||||
#ifdef ENABLE_ARM64
|
||||
void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize,
|
||||
size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
|
||||
size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before,
|
||||
size_t shift_after);
|
||||
#elif defined(ENABLE_ARM32)
|
||||
void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize,
|
||||
size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
|
||||
size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before,
|
||||
size_t shift_after);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
#include "src/runtime/kernel/arm/opclib/int8/common_func.h"
|
||||
|
||||
void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
|
||||
int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum,
|
||||
|
|
Loading…
Reference in New Issue