optimization for depthwise convolution

This commit is contained in:
lixian 2020-08-06 21:04:15 +08:00
parent 0921c33f99
commit 48e2f85593
16 changed files with 1886 additions and 44 deletions

View File

@ -0,0 +1,161 @@
#ifdef __arm__
#ifndef __aarch64__
.text
.align 5
.global ConvDwFp32Center
#ifndef __APPLE__
.type ConvDwFp32Center, %function
#endif
// void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
// size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
// r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: width, #56: kernel_h, #60: kernel_w,
// #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step,#84: in_kw_step
// #88: relu, #92: relu6
ConvDwFp32Center:
// at return, clang generates "push {lr}" / "pop {pc}" while gcc generates "bx lr",
// according to https://stackoverflow.com/questions/53625807
// even if we jumped through the link register instead of saving it, we would still have to save it around subroutine calls anyway
// clang's rule seems simpler, though there are no subroutine calls here
// r4-r11 and q4-q7 must be preserved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
push {r0-r8, r10, r11, lr}
vpush {q4-q7}
add sp, sp, #64 // step over the 64-byte q4-q7 spill: [sp] is now the saved r0, [sp, #48] the first stack argument
ldr r4, [sp, #48]
vld1.32 {q13}, [r3]
vmov.i32 q14, #6
vcvt.f32.s32 q14, q14
veor q15, q15, q15
LoopH:
ldr r1, [sp, #4] // src_w
ldr r5, [sp, #52] // width
ldr r0, [sp] // dst_w
cmp r5, #4
blt LoopW
LoopW4:
ldr r11, [sp, #76] // in_sw_step
mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh
ldr r6, [sp, #56] // kernel_h
vmov q0, q13
vmov q1, q13
vmov q2, q13
vmov q3, q13
LoopKh4:
ldr r12, [sp, #84] // in_kw_step
ldr r7, [sp, #60] // kernel_w
mov lr, r8 // src_kw
LoopKw4:
mov r10, lr
vld1.32 {q12}, [r2]!
vld1.32 {q4}, [r10]
add r10, r10, r11
vmla.f32 q0, q4, q12
vld1.32 {q5}, [r10]
add r10, r10, r11
vmla.f32 q1, q5, q12
vld1.32 {q6}, [r10]
add r10, r10, r11
vmla.f32 q2, q6, q12
vld1.32 {q7}, [r10]
add r10, r10, r11
vmla.f32 q3, q7, q12
subs r7, r7, #1
add lr, lr, r12
bne LoopKw4
ldr r12, [sp, #80]
add r8, r8, r12
subs r6, r6, #1
bne LoopKh4
ldr r12, [sp, #92]
cmp r12, #0
bne Relu64
ldr r12, [sp, #88]
cmp r12, #0
bne Relu4
b Write4
Relu64:
vmin.f32 q0, q0, q14
vmin.f32 q1, q1, q14
vmin.f32 q2, q2, q14
vmin.f32 q3, q3, q14
Relu4:
vmax.f32 q0, q0, q15
vmax.f32 q1, q1, q15
vmax.f32 q2, q2, q15
vmax.f32 q3, q3, q15
Write4:
ldr r12, [sp, #68]
vst1.32 {q0}, [r0]
add r0, r0, r12
vst1.32 {q1}, [r0]
add r0, r0, r12
vst1.32 {q2}, [r0]
add r0, r0, r12
vst1.32 {q3}, [r0]
add r0, r0, r12
mov r12, #4
mul r11, r11, r12
add r1, r1, r11
sub r5, r5, #4
cmp r5, #0
ble LoopWEnd
cmp r5, #4
bge LoopW4
LoopW:
mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh
ldr r6, [sp, #56] // kernel_h
vmov q0, q13
LoopKh:
ldr r12, [sp, #84] //in_kw_step
ldr r7, [sp, #60] // kernel_w
mov r10, r8 // src_kw
LoopKw:
vld1.32 {q1}, [r10]
add r10, r10, r12
vld1.32 {q12}, [r2]!
vmla.f32 q0, q1, q12
subs r7, r7, #1
bne LoopKw
ldr r12, [sp, #80]
add r8, r8, r12
subs r6, r6, #1
bne LoopKh
ldr r12, [sp, #92]
cmp r12, #0
bne Relu6
ldr r12, [sp, #88]
cmp r12, #0
bne Relu
b Write
Relu6:
vmin.f32 q0, q0, q14
Relu:
vmax.f32 q0, q0, q15
Write:
ldr r12, [sp, #68]
vst1.32 {q0}, [r0]
add r0, r0, r12
ldr r12, [sp, #76]
add r1, r1, r12
subs r5, r5, #1
bne LoopW
LoopWEnd:
ldr r3, [sp, #64]
ldr r12, [sp]
add r12, r12, r3
str r12, [sp]
ldr r3, [sp, #72]
ldr r12, [sp, #4]
add r12, r12, r3
str r12, [sp, #4]
subs r4, r4, #1
bne LoopH
sub sp, sp, #64
vpop {q4-q7}
pop {r0-r8, r10, r11, pc}
#endif
#endif
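For reference, the computation this kernel vectorizes is the plain sliding-window accumulation below. This is a hedged C sketch with illustrative names (strides are taken in float elements here, while the assembly receives byte strides); the project's actual scalar fallback is its DepthwiseCenter C path.

#include <stddef.h>

// Scalar sketch of ConvDwFp32Center over one 4-float channel block (illustrative only).
void ConvDwFp32CenterRef(float *dst, const float *src, const float *weight, const float *bias,
                         size_t height, size_t width, size_t kernel_h, size_t kernel_w,
                         size_t out_h_step, size_t block_channel, size_t in_sh_step,
                         size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
                         size_t relu, size_t relu6) {
  for (size_t oh = 0; oh < height; oh++) {
    const float *src_w = src + oh * in_sh_step;
    float *dst_w = dst + oh * out_h_step;
    for (size_t ow = 0; ow < width; ow++) {
      for (int c = 0; c < 4; c++) {
        float acc = bias[c];  // vmov q0, q13
        for (size_t kh = 0; kh < kernel_h; kh++) {
          for (size_t kw = 0; kw < kernel_w; kw++) {
            acc += src_w[kh * in_kh_step + kw * in_kw_step + c] *
                   weight[(kh * kernel_w + kw) * 4 + c];  // vmla.f32
          }
        }
        if (relu6 && acc > 6.0f) acc = 6.0f;            // vmin against q14
        if ((relu || relu6) && acc < 0.0f) acc = 0.0f;  // vmax against q15
        dst_w[c] = acc;
      }
      src_w += in_sw_step;
      dst_w += block_channel;
    }
  }
}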

View File

@ -0,0 +1,207 @@
#ifdef __arm__
#ifndef __aarch64__
.text
.align 5
.global ConvDwInt8Center
#ifndef __APPLE__
.type ConvDwInt8Center, %function
#endif
// void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
// size_t in_kh_step, size_t in_kw_step, int out_multiplier, int left_shift,
// int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max);
// r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: width, #56: kernel_h, #60: kernel_w,
// #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step,#84: in_kw_step
// #88: out_multiplier, #92: left_shift, #96: right_shift, #100: out_zp, #104: acc_min, #108: acc_max
ConvDwInt8Center:
// at return, clang generates "push {lr}" / "pop {pc}" while gcc generates "bx lr",
// according to https://stackoverflow.com/questions/53625807
// even if we jumped through the link register instead of saving it, we would still have to save it around subroutine calls anyway
// clang's rule seems simpler, though there are no subroutine calls here
// r4-r11 and q4-q7 must be preserved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
push {r0-r8, r10, r11, lr}
vpush {q4-q7}
add sp, sp, #64 // step over the 64-byte q4-q7 spill: [sp] is now the saved r0, [sp, #48] the first stack argument
ldr r4, [sp, #48]
ldr r12, [sp, #92]
vdup.32 q9, r12
ldr r11, [sp, #88]
vdup.32 q10, r11
ldr r10, [sp, #96]
vdup.32 q11, r10
ldr r8, [sp, #100]
vdup.32 q12, r8
ldr r7, [sp, #104]
vdup.32 q13, r7
ldr r6, [sp, #108]
vdup.32 q14, r6
vld1.32 {q15}, [r3]
LoopH:
ldr r1, [sp, #4] // src_w
ldr r5, [sp, #52] // width
ldr r0, [sp] // dst_w
cmp r5, #4
blt LoopW
LoopW4:
ldr r11, [sp, #76] // in_sw_step
mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh
ldr r6, [sp, #56] // kernel_h
vmov q0, q15
vmov q1, q15
vmov q2, q15
vmov q3, q15
LoopKh4:
ldr r12, [sp, #84] // in_kw_step
ldr r7, [sp, #60] // kernel_w
mov lr, r8 // src_kw
LoopKw4:
mov r10, lr
vld1.16 {d24}, [r2]!
vld1.16 {d8}, [r10]
add r10, r10, r11
vmlal.s16 q0, d8, d24
vld1.16 {d10}, [r10]
add r10, r10, r11
vmlal.s16 q1, d10, d24
vld1.16 {d12}, [r10]
add r10, r10, r11
vmlal.s16 q2, d12, d24
vld1.16 {d14}, [r10]
add r10, r10, r11
vmlal.s16 q3, d14, d24
add lr, lr, r12
subs r7, r7, #1
bne LoopKw4
ldr r12, [sp, #80]
add r8, r8, r12
subs r6, r6, #1
bne LoopKh4
vshl.s32 q0, q0, q9
vshl.s32 q1, q1, q9
vshl.s32 q2, q2, q9
vshl.s32 q3, q3, q9
vqrdmulh.s32 q0, q0, q10
vqrdmulh.s32 q1, q1, q10
vqrdmulh.s32 q2, q2, q10
vqrdmulh.s32 q3, q3, q10
vrshl.s32 q0, q0, q11
vrshl.s32 q1, q1, q11
vrshl.s32 q2, q2, q11
vrshl.s32 q3, q3, q11
vadd.i32 q0, q0, q12
vadd.i32 q1, q1, q12
vadd.i32 q2, q2, q12
vadd.i32 q3, q3, q12
vmax.s32 q0, q0, q13
vmax.s32 q1, q1, q13
vmax.s32 q2, q2, q13
vmax.s32 q3, q3, q13
vmin.s32 q0, q0, q14
vmin.s32 q1, q1, q14
vmin.s32 q2, q2, q14
vmin.s32 q3, q3, q14
vqmovn.s32 d0, q0
vqmovn.s32 d2, q1
vqmovn.s32 d4, q2
vqmovn.s32 d6, q3
vqmovn.s16 d0, q0
vqmovn.s16 d2, q1
vqmovn.s16 d4, q2
vqmovn.s16 d6, q3
mov r3, r0
ldr r12, [sp, #68]
vst1.8 {d0[0]}, [r3]!
vst1.8 {d0[1]}, [r3]!
vst1.8 {d0[2]}, [r3]!
vst1.8 {d0[3]}, [r3]!
add r0, r0, r12
mov r3, r0
vst1.8 {d2[0]}, [r3]!
vst1.8 {d2[1]}, [r3]!
vst1.8 {d2[2]}, [r3]!
vst1.8 {d2[3]}, [r3]!
add r0, r0, r12
mov r3, r0
vst1.8 {d4[0]}, [r3]!
vst1.8 {d4[1]}, [r3]!
vst1.8 {d4[2]}, [r3]!
vst1.8 {d4[3]}, [r3]!
add r0, r0, r12
mov r3, r0
vst1.8 {d6[0]}, [r3]!
vst1.8 {d6[1]}, [r3]!
vst1.8 {d6[2]}, [r3]!
vst1.8 {d6[3]}, [r3]!
add r0, r0, r12
mov r3, r0
mov r12, #4
mul r11, r11, r12
add r1, r1, r11
sub r5, r5, #4
cmp r5, #0
ble LoopWEnd
cmp r5, #4
bge LoopW4
LoopW:
mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh
ldr r6, [sp, #56] // kernel_h
vmov q0, q15
LoopKh:
ldr r12, [sp, #84] //in_kw_step
ldr r7, [sp, #60] // kernel_w
mov r10, r8 // src_kw
LoopKw:
vld1.16 {d2}, [r10]
add r10, r10, r12
vld1.16 {d24}, [r2]!
vmlal.s16 q0, d2, d24
subs r7, r7, #1
bne LoopKw
ldr r12, [sp, #80]
add r8, r8, r12
subs r6, r6, #1
bne LoopKh
vshl.s32 q0, q0, q9
vqrdmulh.s32 q0, q0, q10
vrshl.s32 q0, q0, q11
vadd.i32 q0, q0, q12
vmax.s32 q0, q0, q13
vmin.s32 q0, q0, q14
vqmovn.s32 d0, q0
vqmovn.s16 d0, q0
mov r3, r0
ldr r12, [sp, #68]
vst1.8 {d0[0]}, [r3]!
vst1.8 {d0[1]}, [r3]!
vst1.8 {d0[2]}, [r3]!
vst1.8 {d0[3]}, [r3]!
add r0, r0, r12
ldr r12, [sp, #76]
add r1, r1, r12
subs r5, r5, #1
bne LoopW
LoopWEnd:
ldr r3, [sp, #64]
ldr r12, [sp]
add r12, r12, r3
str r12, [sp]
ldr r3, [sp, #72]
ldr r12, [sp, #4]
add r12, r12, r3
str r12, [sp, #4]
subs r4, r4, #1
bne LoopH
sub sp, sp, #64
vpop {q4-q7}
pop {r0-r8, r10, r11, pc}
#endif
#endif
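The fixed-point epilogue above (vshl, vqrdmulh, vrshl, add out_zp, clamp, narrow) is the usual gemmlowp-style requantization. Below is a hedged scalar sketch of a single lane with an illustrative helper name; it assumes right_shift is passed as a non-positive shift count, which is what the rounding-shift instruction expects, and it ignores the rare saturation corner cases of the saturating NEON ops.

#include <stdint.h>

// Illustrative per-lane requantization matching the NEON epilogue (not part of the patch).
static int8_t RequantizeOne(int32_t acc, int left_shift, int32_t out_multiplier, int right_shift,
                            int32_t out_zp, int32_t acc_min, int32_t acc_max) {
  int32_t shifted = (int32_t)((uint32_t)acc << left_shift);             // vshl.s32 by q9
  int64_t v = ((int64_t)shifted * out_multiplier + (1ll << 30)) >> 31;  // vqrdmulh.s32 by q10
  if (right_shift < 0) {                                                // vrshl.s32 by q11: a negative
    int s = -right_shift;                                               // count is a rounding right shift
    v = (v + (1ll << (s - 1))) >> s;
  }
  v += out_zp;                                                          // vadd.i32 q12
  if (v < acc_min) v = acc_min;                                         // vmax.s32 q13
  if (v > acc_max) v = acc_max;                                         // vmin.s32 q14
  return (int8_t)v;                                                     // vqmovn.s32 + vqmovn.s16
}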

View File

@ -0,0 +1,69 @@
#ifdef __arm__
#ifndef __aarch64__
.text
.align 5
.global DeconvDwFp32Center
#ifndef __APPLE__
.type DeconvDwFp32Center, %function
#endif
// void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
// size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// r0: dst, r1: src, r2: weight, r3: height, #48: width, #52: kernel_h, #56: kernel_w, #60: out_h_step
// #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
DeconvDwFp32Center:
// at return, clang generates "push {lr}" / "pop {pc}" while gcc generates "bx lr",
// according to https://stackoverflow.com/questions/53625807
// even if we jumped through the link register instead of saving it, we would still have to save it around subroutine calls anyway
// clang's rule seems simpler, though there are no subroutine calls here
// r4-r11 must be preserved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf (q4-q7 are not touched here)
push {r0-r8, r10, r11, lr}
ldr r10, [sp, #80] // in_kw_step
ldr r11, [sp, #76] // in_kh_step
LoopH:
ldr r0, [sp] // dst_w
ldr r1, [sp, #4] // src_w
ldr r4, [sp, #48] // width
LoopW:
mov r6, r0 // dst_kh
ldr r2, [sp, #8] // weight_kh
ldr r5, [sp, #52] // kernel_h
vld1.32 {q1}, [r1]
LoopKh:
mov r7, r6 // dst_kw
ldr r12, [sp, #56] // kernel_w
LoopKw:
vld1.32 {q0}, [r7]
vld1.32 {q2}, [r2]!
vmla.f32 q0, q1, q2
vst1.32 {q0}, [r7]
add r7, r7, r10
subs r12, r12, #1
bne LoopKw
add r6, r6, r11
subs r5, r5, #1
bne LoopKh
ldr r12, [sp, #72]
add r0, r0, r12
ldr r8, [sp, #64]
add r1, r1, r8
subs r4, r4, #1
bne LoopW
ldr r8, [sp, #68]
ldr r12, [sp]
add r12, r12, r8
str r12, [sp]
ldr r8, [sp, #60]
ldr r12, [sp, #4]
add r12, r12, r8
str r12, [sp, #4]
subs r3, r3, #1
bne LoopH
pop {r0-r8, r10, r11, pc}
#endif
#endif
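Unlike the convolution kernels above, the deconvolution center scatters each input pixel into a kernel_h x kernel_w window of the output, so the inner loop read-modify-writes dst (vld1 / vmla / vst1). Roughly, per 4-float channel block (a hedged C sketch; names illustrative, strides in float elements):

#include <stddef.h>

// Scalar sketch of DeconvDwFp32Center (illustrative only).
void DeconvDwFp32CenterRef(float *dst, const float *src, const float *weight, size_t height,
                           size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
                           size_t block_channel, size_t in_sh_step, size_t in_sw_step,
                           size_t in_kh_step, size_t in_kw_step) {
  for (size_t ih = 0; ih < height; ih++) {
    for (size_t iw = 0; iw < width; iw++) {
      const float *src_w = src + ih * out_h_step + iw * block_channel;  // src walks the "out" strides
      float *dst_w = dst + ih * in_sh_step + iw * in_sw_step;           // dst walks the "in" strides
      const float *w = weight;
      for (size_t kh = 0; kh < kernel_h; kh++) {
        for (size_t kw = 0; kw < kernel_w; kw++) {
          for (int c = 0; c < 4; c++) {
            dst_w[kh * in_kh_step + kw * in_kw_step + c] += src_w[c] * w[c];  // vld1 / vmla / vst1
          }
          w += 4;
        }
      }
    }
  }
}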

View File

@ -0,0 +1,69 @@
#ifdef __arm__
#ifndef __aarch64__
.text
.align 5
.global DeconvDwInt8Center
#ifndef __APPLE__
.type DeconvDwInt8Center, %function
#endif
// void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
// size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// r0: dst, r1: src, r2: weight, r3: height, #48: width, #52: kernel_h, #56: kernel_w, #60: out_h_step
// #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
DeconvDwInt8Center:
// at return, clang generates "push {lr}" / "pop {pc}" while gcc generates "bx lr",
// according to https://stackoverflow.com/questions/53625807
// even if we jumped through the link register instead of saving it, we would still have to save it around subroutine calls anyway
// clang's rule seems simpler, though there are no subroutine calls here
// r4-r11 must be preserved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf (q4-q7 are not touched here)
push {r0-r8, r10, r11, lr}
ldr r10, [sp, #80] // in_kw_step
ldr r11, [sp, #76] // in_kh_step
LoopH:
ldr r0, [sp] // dst_w
ldr r1, [sp, #4] // src_w
ldr r4, [sp, #48] // width
LoopW:
mov r6, r0 // dst_kh
ldr r2, [sp, #8] // weight_kh
ldr r5, [sp, #52] // kernel_h
vld1.16 {d2}, [r1]
LoopKh:
mov r7, r6 // dst_kw
ldr r12, [sp, #56] // kernel_w
LoopKw:
vld1.32 {q0}, [r7]
vld1.16 {d24}, [r2]!
vmlal.s16 q0, d2, d24
vst1.32 {q0}, [r7]
add r7, r7, r10
subs r12, r12, #1
bne LoopKw
add r6, r6, r11
subs r5, r5, #1
bne LoopKh
ldr r12, [sp, #72]
add r0, r0, r12
ldr r8, [sp, #64]
add r1, r1, r8
subs r4, r4, #1
bne LoopW
ldr r8, [sp, #68]
ldr r12, [sp]
add r12, r12, r8
str r12, [sp]
ldr r8, [sp, #60]
ldr r12, [sp, #4]
add r12, r12, r8
str r12, [sp, #4]
subs r3, r3, #1
bne LoopH
pop {r0-r8, r10, r11, pc}
#endif
#endif

View File

@ -32,24 +32,238 @@ ConvDwFp32Center:
ldr x14, [sp, #48]
ldr x15, [sp, #56]
ld1 {v24.4s}, [x3]
movi v26.4s, #6
scvtf v26.4s, v26.4s
dup v27.4s, wzr
LoopH:
mov x23, x1
mov x24, x5
mov x3, x0
cmp x24, #8
blt LoopW
cmp x24, #16
blt LoopW8
LoopW16:
mov x19, #16
mul x19, x19, x11
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v2.16b, v24.16b
mov v3.16b, v24.16b
mov v4.16b, v24.16b
mov v5.16b, v24.16b
mov v6.16b, v24.16b
mov v7.16b, v24.16b
mov v8.16b, v24.16b
mov v9.16b, v24.16b
mov v10.16b, v24.16b
mov v11.16b, v24.16b
mov v12.16b, v24.16b
mov v13.16b, v24.16b
mov v14.16b, v24.16b
mov v15.16b, v24.16b
LoopKh16:
mov x18, x7
mov x21, x16
LoopKw16:
mov x22, x21
ld1 {v25.4s}, [x17], #16
ld1 {v16.4s}, [x22], x11
ld1 {v17.4s}, [x22], x11
fmla v0.4s, v16.4s, v25.4s
fmla v1.4s, v17.4s, v25.4s
ld1 {v18.4s}, [x22], x11
ld1 {v19.4s}, [x22], x11
fmla v2.4s, v18.4s, v25.4s
fmla v3.4s, v19.4s, v25.4s
ld1 {v20.4s}, [x22], x11
ld1 {v21.4s}, [x22], x11
fmla v4.4s, v20.4s, v25.4s
fmla v5.4s, v21.4s, v25.4s
ld1 {v22.4s}, [x22], x11
ld1 {v23.4s}, [x22], x11
fmla v6.4s, v22.4s, v25.4s
fmla v7.4s, v23.4s, v25.4s
ld1 {v16.4s}, [x22], x11
ld1 {v17.4s}, [x22], x11
fmla v8.4s, v16.4s, v25.4s
fmla v9.4s, v17.4s, v25.4s
ld1 {v18.4s}, [x22], x11
ld1 {v19.4s}, [x22], x11
fmla v10.4s, v18.4s, v25.4s
fmla v11.4s, v19.4s, v25.4s
ld1 {v20.4s}, [x22], x11
ld1 {v21.4s}, [x22], x11
fmla v12.4s, v20.4s, v25.4s
fmla v13.4s, v21.4s, v25.4s
ld1 {v22.4s}, [x22], x11
ld1 {v23.4s}, [x22], x11
fmla v14.4s, v22.4s, v25.4s
fmla v15.4s, v23.4s, v25.4s
subs x18, x18, #1
add x21, x21, x13
bne LoopKw16
add x16, x16, x12
subs x20, x20, #1
bne LoopKh16
cbnz x15, Relu616
cbnz x14, Relu16
b Write16
Relu616:
fmin v0.4s, v0.4s, v26.4s
fmin v1.4s, v1.4s, v26.4s
fmin v2.4s, v2.4s, v26.4s
fmin v3.4s, v3.4s, v26.4s
fmin v4.4s, v4.4s, v26.4s
fmin v5.4s, v5.4s, v26.4s
fmin v6.4s, v6.4s, v26.4s
fmin v7.4s, v7.4s, v26.4s
fmin v8.4s, v8.4s, v26.4s
fmin v9.4s, v9.4s, v26.4s
fmin v10.4s, v10.4s, v26.4s
fmin v11.4s, v11.4s, v26.4s
fmin v12.4s, v12.4s, v26.4s
fmin v13.4s, v13.4s, v26.4s
fmin v14.4s, v14.4s, v26.4s
fmin v15.4s, v15.4s, v26.4s
Relu16:
fmax v0.4s, v0.4s, v27.4s
fmax v1.4s, v1.4s, v27.4s
fmax v2.4s, v2.4s, v27.4s
fmax v3.4s, v3.4s, v27.4s
fmax v4.4s, v4.4s, v27.4s
fmax v5.4s, v5.4s, v27.4s
fmax v6.4s, v6.4s, v27.4s
fmax v7.4s, v7.4s, v27.4s
fmax v8.4s, v8.4s, v27.4s
fmax v9.4s, v9.4s, v27.4s
fmax v10.4s, v10.4s, v27.4s
fmax v11.4s, v11.4s, v27.4s
fmax v12.4s, v12.4s, v27.4s
fmax v13.4s, v13.4s, v27.4s
fmax v14.4s, v14.4s, v27.4s
fmax v15.4s, v15.4s, v27.4s
Write16:
st1 {v0.4s}, [x3], x9
st1 {v1.4s}, [x3], x9
st1 {v2.4s}, [x3], x9
st1 {v3.4s}, [x3], x9
st1 {v4.4s}, [x3], x9
st1 {v5.4s}, [x3], x9
st1 {v6.4s}, [x3], x9
st1 {v7.4s}, [x3], x9
st1 {v8.4s}, [x3], x9
st1 {v9.4s}, [x3], x9
st1 {v10.4s}, [x3], x9
st1 {v11.4s}, [x3], x9
st1 {v12.4s}, [x3], x9
st1 {v13.4s}, [x3], x9
st1 {v14.4s}, [x3], x9
st1 {v15.4s}, [x3], x9
add x23, x23, x19
sub x24, x24, #16
cmp x24, #0
ble LoopWEnd
cmp x24, #8
blt LoopW
cmp x24, #16
bge LoopW16
LoopW8:
mov x19, #8
mul x19, x19, x11
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v2.16b, v24.16b
mov v3.16b, v24.16b
mov v4.16b, v24.16b
mov v5.16b, v24.16b
mov v6.16b, v24.16b
mov v7.16b, v24.16b
LoopKh8:
mov x18, x7
mov x21, x16
LoopKw8:
mov x22, x21
ld1 {v25.4s}, [x17], #16
ld1 {v16.4s}, [x22], x11
ld1 {v17.4s}, [x22], x11
fmla v0.4s, v16.4s, v25.4s
fmla v1.4s, v17.4s, v25.4s
ld1 {v18.4s}, [x22], x11
ld1 {v19.4s}, [x22], x11
fmla v2.4s, v18.4s, v25.4s
fmla v3.4s, v19.4s, v25.4s
ld1 {v20.4s}, [x22], x11
ld1 {v21.4s}, [x22], x11
fmla v4.4s, v20.4s, v25.4s
fmla v5.4s, v21.4s, v25.4s
ld1 {v22.4s}, [x22], x11
ld1 {v23.4s}, [x22], x11
fmla v6.4s, v22.4s, v25.4s
fmla v7.4s, v23.4s, v25.4s
subs x18, x18, #1
add x21, x21, x13
bne LoopKw8
add x16, x16, x12
subs x20, x20, #1
bne LoopKh8
cbnz x15, Relu68
cbnz x14, Relu8
b Write8
Relu68:
fmin v0.4s, v0.4s, v26.4s
fmin v1.4s, v1.4s, v26.4s
fmin v2.4s, v2.4s, v26.4s
fmin v3.4s, v3.4s, v26.4s
fmin v4.4s, v4.4s, v26.4s
fmin v5.4s, v5.4s, v26.4s
fmin v6.4s, v6.4s, v26.4s
fmin v7.4s, v7.4s, v26.4s
Relu8:
fmax v0.4s, v0.4s, v27.4s
fmax v1.4s, v1.4s, v27.4s
fmax v2.4s, v2.4s, v27.4s
fmax v3.4s, v3.4s, v27.4s
fmax v4.4s, v4.4s, v27.4s
fmax v5.4s, v5.4s, v27.4s
fmax v6.4s, v6.4s, v27.4s
fmax v7.4s, v7.4s, v27.4s
Write8:
st1 {v0.4s}, [x3], x9
st1 {v1.4s}, [x3], x9
st1 {v2.4s}, [x3], x9
st1 {v3.4s}, [x3], x9
st1 {v4.4s}, [x3], x9
st1 {v5.4s}, [x3], x9
st1 {v6.4s}, [x3], x9
st1 {v7.4s}, [x3], x9
add x23, x23, x19
sub x24, x24, #8
cmp x24, #0
ble LoopWEnd
cmp x24, #8
bge LoopW8
LoopW:
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
LoopKh:
mov x18, x7
mov x22, x16
LoopKw:
ld1 {v1.4s}, [x22], x13
ld1 {v2.4s}, [x17], #16
fmla v0.4s, v1.4s, v2.4s
ld1 {v16.4s}, [x22], x13
ld1 {v25.4s}, [x17], #16
fmla v0.4s, v16.4s, v25.4s
subs x18, x18, #1
bne LoopKw
add x16, x16, x12
@ -59,17 +273,15 @@ ConvDwFp32Center:
cbnz x14, Relu
b Write
Relu6:
fmin v0.4s, v0.4s, v26.4s
Relu:
fmax v0.4s, v0.4s, v27.4s
Write:
st1 {v0.4s}, [x3], x9
add x23, x23, x11
subs x24, x24, #1
bne LoopW
LoopWEnd:
add x0, x0, x8
add x1, x1, x10
subs x4, x4, #1
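The point of this hunk is a three-level tiling of the output width: while at least 16 pixels remain they are processed together with v0 - v15 all serving as accumulators (which is why the bias vector moves from v5 to v24), then an 8-wide pass, and finally the original one-pixel loop mops up. In outline (hedged C pseudocode; process_16_pixels() and friends are placeholders for the unrolled bodies):

// Width-loop dispatch implemented by LoopW16 / LoopW8 / LoopW above (sketch).
while (w_remaining >= 16) { process_16_pixels(); w_remaining -= 16; }
while (w_remaining >= 8)  { process_8_pixels();  w_remaining -= 8;  }
while (w_remaining >= 1)  { process_1_pixel();   w_remaining -= 1;  }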

View File

@ -0,0 +1,558 @@
#ifdef __aarch64__
.text
.align 5
.global ConvDwInt8Center
#ifndef __APPLE__
.type ConvDwInt8Center, %function
#endif
// void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
// size_t in_kh_step, size_t in_kw_step, int out_multiplier, int left_shift,
// int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max);
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
// x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
// #48: out_multiplier, #56: left_shift, #64: right_shift, #72: out_zp, #80: acc_min, #88: acc_max
ConvDwInt8Center:
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should also be preserved
// whereas our coding style does not permit such an amount of parameters
sub sp, sp, #112
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp d8, d9, [sp], #16
stp d10, d11, [sp], #16
stp d12, d13, [sp], #16
stp d14, d15, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]
ldr x13, [sp, #40]
ldr w14, [sp, #56]
dup v26.4s, w14
ldr w15, [sp, #48]
dup v27.4s, w15
ldr w16, [sp, #64]
dup v28.4s, w16
ldr w17, [sp, #72]
dup v29.4s, w17
ldr w18, [sp, #80]
dup v30.4s, w18
ldr w19, [sp, #88]
dup v31.4s, w19
ld1 {v24.4s}, [x3]
LoopH:
mov x23, x1
mov x24, x5
mov x3, x0
cmp x24, #8
blt LoopW
cmp x24, #16
blt LoopW8
LoopW16:
mov x19, #16
mul x19, x19, x11
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v2.16b, v24.16b
mov v3.16b, v24.16b
mov v4.16b, v24.16b
mov v5.16b, v24.16b
mov v6.16b, v24.16b
mov v7.16b, v24.16b
mov v8.16b, v24.16b
mov v9.16b, v24.16b
mov v10.16b, v24.16b
mov v11.16b, v24.16b
mov v12.16b, v24.16b
mov v13.16b, v24.16b
mov v14.16b, v24.16b
mov v15.16b, v24.16b
LoopKh16:
mov x18, x7
mov x21, x16
LoopKw16:
mov x22, x21
ld1 {v25.4h}, [x17], #8
// step across the 16 output pixels by in_sw_step (x11); x13 (in_kw_step) advances the kernel base below
ld1 {v16.4h}, [x22], x11
ld1 {v17.4h}, [x22], x11
smlal v0.4s, v16.4h, v25.4h
smlal v1.4s, v17.4h, v25.4h
ld1 {v18.4h}, [x22], x11
ld1 {v19.4h}, [x22], x11
smlal v2.4s, v18.4h, v25.4h
smlal v3.4s, v19.4h, v25.4h
ld1 {v20.4h}, [x22], x11
ld1 {v21.4h}, [x22], x11
smlal v4.4s, v20.4h, v25.4h
smlal v5.4s, v21.4h, v25.4h
ld1 {v22.4h}, [x22], x11
ld1 {v23.4h}, [x22], x11
smlal v6.4s, v22.4h, v25.4h
smlal v7.4s, v23.4h, v25.4h
ld1 {v16.4h}, [x22], x11
ld1 {v17.4h}, [x22], x11
smlal v8.4s, v16.4h, v25.4h
smlal v9.4s, v17.4h, v25.4h
ld1 {v18.4h}, [x22], x11
ld1 {v19.4h}, [x22], x11
smlal v10.4s, v18.4h, v25.4h
smlal v11.4s, v19.4h, v25.4h
ld1 {v20.4h}, [x22], x11
ld1 {v21.4h}, [x22], x11
smlal v12.4s, v20.4h, v25.4h
smlal v13.4s, v21.4h, v25.4h
ld1 {v22.4h}, [x22], x11
ld1 {v23.4h}, [x22], x11
smlal v14.4s, v22.4h, v25.4h
smlal v15.4s, v23.4h, v25.4h
subs x18, x18, #1
add x21, x21, x13
bne LoopKw16
add x16, x16, x12
subs x20, x20, #1
bne LoopKh16
sqshl v0.4s, v0.4s ,v26.4s
sqshl v1.4s, v1.4s ,v26.4s
sqshl v2.4s, v2.4s ,v26.4s
sqshl v3.4s, v3.4s ,v26.4s
sqshl v4.4s, v4.4s ,v26.4s
sqshl v5.4s, v5.4s ,v26.4s
sqshl v6.4s, v6.4s ,v26.4s
sqshl v7.4s, v7.4s ,v26.4s
sqshl v8.4s, v8.4s ,v26.4s
sqshl v9.4s, v9.4s ,v26.4s
sqshl v10.4s, v10.4s ,v26.4s
sqshl v11.4s, v11.4s ,v26.4s
sqshl v12.4s, v12.4s ,v26.4s
sqshl v13.4s, v13.4s ,v26.4s
sqshl v14.4s, v14.4s ,v26.4s
sqshl v15.4s, v15.4s ,v26.4s
sqrdmulh v0.4s, v0.4s ,v27.4s
sqrdmulh v1.4s, v1.4s ,v27.4s
sqrdmulh v2.4s, v2.4s ,v27.4s
sqrdmulh v3.4s, v3.4s ,v27.4s
sqrdmulh v4.4s, v4.4s ,v27.4s
sqrdmulh v5.4s, v5.4s ,v27.4s
sqrdmulh v6.4s, v6.4s ,v27.4s
sqrdmulh v7.4s, v7.4s ,v27.4s
sqrdmulh v8.4s, v8.4s ,v27.4s
sqrdmulh v9.4s, v9.4s ,v27.4s
sqrdmulh v10.4s, v10.4s ,v27.4s
sqrdmulh v11.4s, v11.4s ,v27.4s
sqrdmulh v12.4s, v12.4s ,v27.4s
sqrdmulh v13.4s, v13.4s ,v27.4s
sqrdmulh v14.4s, v14.4s ,v27.4s
sqrdmulh v15.4s, v15.4s ,v27.4s
sqrshl v0.4s, v0.4s ,v28.4s
sqrshl v1.4s, v1.4s ,v28.4s
sqrshl v2.4s, v2.4s ,v28.4s
sqrshl v3.4s, v3.4s ,v28.4s
sqrshl v4.4s, v4.4s ,v28.4s
sqrshl v5.4s, v5.4s ,v28.4s
sqrshl v6.4s, v6.4s ,v28.4s
sqrshl v7.4s, v7.4s ,v28.4s
sqrshl v8.4s, v8.4s ,v28.4s
sqrshl v9.4s, v9.4s ,v28.4s
sqrshl v10.4s, v10.4s ,v28.4s
sqrshl v11.4s, v11.4s ,v28.4s
sqrshl v12.4s, v12.4s ,v28.4s
sqrshl v13.4s, v13.4s ,v28.4s
sqrshl v14.4s, v14.4s ,v28.4s
sqrshl v15.4s, v15.4s ,v28.4s
add v0.4s, v0.4s ,v29.4s
add v1.4s, v1.4s ,v29.4s
add v2.4s, v2.4s ,v29.4s
add v3.4s, v3.4s ,v29.4s
add v4.4s, v4.4s ,v29.4s
add v5.4s, v5.4s ,v29.4s
add v6.4s, v6.4s ,v29.4s
add v7.4s, v7.4s ,v29.4s
add v8.4s, v8.4s ,v29.4s
add v9.4s, v9.4s ,v29.4s
add v10.4s, v10.4s ,v29.4s
add v11.4s, v11.4s ,v29.4s
add v12.4s, v12.4s ,v29.4s
add v13.4s, v13.4s ,v29.4s
add v14.4s, v14.4s ,v29.4s
add v15.4s, v15.4s ,v29.4s
smax v0.4s, v0.4s ,v30.4s
smax v1.4s, v1.4s ,v30.4s
smax v2.4s, v2.4s ,v30.4s
smax v3.4s, v3.4s ,v30.4s
smax v4.4s, v4.4s ,v30.4s
smax v5.4s, v5.4s ,v30.4s
smax v6.4s, v6.4s ,v30.4s
smax v7.4s, v7.4s ,v30.4s
smax v8.4s, v8.4s ,v30.4s
smax v9.4s, v9.4s ,v30.4s
smax v10.4s, v10.4s ,v30.4s
smax v11.4s, v11.4s ,v30.4s
smax v12.4s, v12.4s ,v30.4s
smax v13.4s, v13.4s ,v30.4s
smax v14.4s, v14.4s ,v30.4s
smax v15.4s, v15.4s ,v30.4s
smin v0.4s, v0.4s ,v31.4s
smin v1.4s, v1.4s ,v31.4s
smin v2.4s, v2.4s ,v31.4s
smin v3.4s, v3.4s ,v31.4s
smin v4.4s, v4.4s ,v31.4s
smin v5.4s, v5.4s ,v31.4s
smin v6.4s, v6.4s ,v31.4s
smin v7.4s, v7.4s ,v31.4s
smin v8.4s, v8.4s ,v31.4s
smin v9.4s, v9.4s ,v31.4s
smin v10.4s, v10.4s ,v31.4s
smin v11.4s, v11.4s ,v31.4s
smin v12.4s, v12.4s ,v31.4s
smin v13.4s, v13.4s ,v31.4s
smin v14.4s, v14.4s ,v31.4s
smin v15.4s, v15.4s ,v31.4s
sqxtn v0.4h, v0.4s
sqxtn v1.4h, v1.4s
sqxtn v2.4h, v2.4s
sqxtn v3.4h, v3.4s
sqxtn v4.4h, v4.4s
sqxtn v5.4h, v5.4s
sqxtn v6.4h, v6.4s
sqxtn v7.4h, v7.4s
sqxtn v8.4h, v8.4s
sqxtn v9.4h, v9.4s
sqxtn v10.4h, v10.4s
sqxtn v11.4h, v11.4s
sqxtn v12.4h, v12.4s
sqxtn v13.4h, v13.4s
sqxtn v14.4h, v14.4s
sqxtn v15.4h, v15.4s
sqxtn v0.8b, v0.8h
sqxtn v1.8b, v1.8h
sqxtn v2.8b, v2.8h
sqxtn v3.8b, v3.8h
sqxtn v4.8b, v4.8h
sqxtn v5.8b, v5.8h
sqxtn v6.8b, v6.8h
sqxtn v7.8b, v7.8h
sqxtn v8.8b, v8.8h
sqxtn v9.8b, v9.8h
sqxtn v10.8b, v10.8h
sqxtn v11.8b, v11.8h
sqxtn v12.8b, v12.8h
sqxtn v13.8b, v13.8h
sqxtn v14.8b, v14.8h
sqxtn v15.8b, v15.8h
add x17, x3, #1
add x18, x3, #2
add x21, x3, #3
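// x3/x17/x18/x21 address int8 lanes 0-3 of the current output pixel; every store
// post-increments by block_channel (x9), stepping all four lane pointers to the next pixel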
st1 {v0.b}[0], [x3], x9
st1 {v0.b}[1], [x17], x9
st1 {v0.b}[2], [x18], x9
st1 {v0.b}[3], [x21], x9
st1 {v1.b}[0], [x3], x9
st1 {v1.b}[1], [x17], x9
st1 {v1.b}[2], [x18], x9
st1 {v1.b}[3], [x21], x9
st1 {v2.b}[0], [x3], x9
st1 {v2.b}[1], [x17], x9
st1 {v2.b}[2], [x18], x9
st1 {v2.b}[3], [x21], x9
st1 {v3.b}[0], [x3], x9
st1 {v3.b}[1], [x17], x9
st1 {v3.b}[2], [x18], x9
st1 {v3.b}[3], [x21], x9
st1 {v4.b}[0], [x3], x9
st1 {v4.b}[1], [x17], x9
st1 {v4.b}[2], [x18], x9
st1 {v4.b}[3], [x21], x9
st1 {v5.b}[0], [x3], x9
st1 {v5.b}[1], [x17], x9
st1 {v5.b}[2], [x18], x9
st1 {v5.b}[3], [x21], x9
st1 {v6.b}[0], [x3], x9
st1 {v6.b}[1], [x17], x9
st1 {v6.b}[2], [x18], x9
st1 {v6.b}[3], [x21], x9
st1 {v7.b}[0], [x3], x9
st1 {v7.b}[1], [x17], x9
st1 {v7.b}[2], [x18], x9
st1 {v7.b}[3], [x21], x9
st1 {v8.b}[0], [x3], x9
st1 {v8.b}[1], [x17], x9
st1 {v8.b}[2], [x18], x9
st1 {v8.b}[3], [x21], x9
st1 {v9.b}[0], [x3], x9
st1 {v9.b}[1], [x17], x9
st1 {v9.b}[2], [x18], x9
st1 {v9.b}[3], [x21], x9
st1 {v10.b}[0], [x3], x9
st1 {v10.b}[1], [x17], x9
st1 {v10.b}[2], [x18], x9
st1 {v10.b}[3], [x21], x9
st1 {v11.b}[0], [x3], x9
st1 {v11.b}[1], [x17], x9
st1 {v11.b}[2], [x18], x9
st1 {v11.b}[3], [x21], x9
st1 {v12.b}[0], [x3], x9
st1 {v12.b}[1], [x17], x9
st1 {v12.b}[2], [x18], x9
st1 {v12.b}[3], [x21], x9
st1 {v13.b}[0], [x3], x9
st1 {v13.b}[1], [x17], x9
st1 {v13.b}[2], [x18], x9
st1 {v13.b}[3], [x21], x9
st1 {v14.b}[0], [x3], x9
st1 {v14.b}[1], [x17], x9
st1 {v14.b}[2], [x18], x9
st1 {v14.b}[3], [x21], x9
st1 {v15.b}[0], [x3], x9
st1 {v15.b}[1], [x17], x9
st1 {v15.b}[2], [x18], x9
st1 {v15.b}[3], [x21], x9
add x23, x23, x19
sub x24, x24, #16
cmp x24, #0
ble LoopWEnd
cmp x24, #8
blt LoopW
cmp x24, #16
bge LoopW16
LoopW8:
mov x19, #8
mul x19, x19, x11
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v2.16b, v24.16b
mov v3.16b, v24.16b
mov v4.16b, v24.16b
mov v5.16b, v24.16b
mov v6.16b, v24.16b
mov v7.16b, v24.16b
LoopKh8:
mov x18, x7
mov x21, x16
LoopKw8:
mov x22, x21
ld1 {v25.4h}, [x17], #8
// step across the 8 output pixels by in_sw_step (x11)
ld1 {v16.4h}, [x22], x11
ld1 {v17.4h}, [x22], x11
smlal v0.4s, v16.4h, v25.4h
smlal v1.4s, v17.4h, v25.4h
ld1 {v18.4h}, [x22], x11
ld1 {v19.4h}, [x22], x11
smlal v2.4s, v18.4h, v25.4h
smlal v3.4s, v19.4h, v25.4h
ld1 {v20.4h}, [x22], x11
ld1 {v21.4h}, [x22], x11
smlal v4.4s, v20.4h, v25.4h
smlal v5.4s, v21.4h, v25.4h
ld1 {v22.4h}, [x22], x11
ld1 {v23.4h}, [x22], x11
smlal v6.4s, v22.4h, v25.4h
smlal v7.4s, v23.4h, v25.4h
subs x18, x18, #1
add x21, x21, x13
bne LoopKw8
add x16, x16, x12
subs x20, x20, #1
bne LoopKh8
sqshl v0.4s, v0.4s ,v26.4s
sqshl v1.4s, v1.4s ,v26.4s
sqshl v2.4s, v2.4s ,v26.4s
sqshl v3.4s, v3.4s ,v26.4s
sqshl v4.4s, v4.4s ,v26.4s
sqshl v5.4s, v5.4s ,v26.4s
sqshl v6.4s, v6.4s ,v26.4s
sqshl v7.4s, v7.4s ,v26.4s
sqrdmulh v0.4s, v0.4s ,v27.4s
sqrdmulh v1.4s, v1.4s ,v27.4s
sqrdmulh v2.4s, v2.4s ,v27.4s
sqrdmulh v3.4s, v3.4s ,v27.4s
sqrdmulh v4.4s, v4.4s ,v27.4s
sqrdmulh v5.4s, v5.4s ,v27.4s
sqrdmulh v6.4s, v6.4s ,v27.4s
sqrdmulh v7.4s, v7.4s ,v27.4s
sqrshl v0.4s, v0.4s ,v28.4s
sqrshl v1.4s, v1.4s ,v28.4s
sqrshl v2.4s, v2.4s ,v28.4s
sqrshl v3.4s, v3.4s ,v28.4s
sqrshl v4.4s, v4.4s ,v28.4s
sqrshl v5.4s, v5.4s ,v28.4s
sqrshl v6.4s, v6.4s ,v28.4s
sqrshl v7.4s, v7.4s ,v28.4s
add v0.4s, v0.4s ,v29.4s
add v1.4s, v1.4s ,v29.4s
add v2.4s, v2.4s ,v29.4s
add v3.4s, v3.4s ,v29.4s
add v4.4s, v4.4s ,v29.4s
add v5.4s, v5.4s ,v29.4s
add v6.4s, v6.4s ,v29.4s
add v7.4s, v7.4s ,v29.4s
smax v0.4s, v0.4s ,v30.4s
smax v1.4s, v1.4s ,v30.4s
smax v2.4s, v2.4s ,v30.4s
smax v3.4s, v3.4s ,v30.4s
smax v4.4s, v4.4s ,v30.4s
smax v5.4s, v5.4s ,v30.4s
smax v6.4s, v6.4s ,v30.4s
smax v7.4s, v7.4s ,v30.4s
smin v0.4s, v0.4s ,v31.4s
smin v1.4s, v1.4s ,v31.4s
smin v2.4s, v2.4s ,v31.4s
smin v3.4s, v3.4s ,v31.4s
smin v4.4s, v4.4s ,v31.4s
smin v5.4s, v5.4s ,v31.4s
smin v6.4s, v6.4s ,v31.4s
smin v7.4s, v7.4s ,v31.4s
sqxtn v0.4h, v0.4s
sqxtn v1.4h, v1.4s
sqxtn v2.4h, v2.4s
sqxtn v3.4h, v3.4s
sqxtn v4.4h, v4.4s
sqxtn v5.4h, v5.4s
sqxtn v6.4h, v6.4s
sqxtn v7.4h, v7.4s
sqxtn v0.8b, v0.8h
sqxtn v1.8b, v1.8h
sqxtn v2.8b, v2.8h
sqxtn v3.8b, v3.8h
sqxtn v4.8b, v4.8h
sqxtn v5.8b, v5.8h
sqxtn v6.8b, v6.8h
sqxtn v7.8b, v7.8h
add x17, x3, #1
add x18, x3, #2
add x21, x3, #3
st1 {v0.b}[0], [x3], x9
st1 {v0.b}[1], [x17], x9
st1 {v0.b}[2], [x18], x9
st1 {v0.b}[3], [x21], x9
st1 {v1.b}[0], [x3], x9
st1 {v1.b}[1], [x17], x9
st1 {v1.b}[2], [x18], x9
st1 {v1.b}[3], [x21], x9
st1 {v2.b}[0], [x3], x9
st1 {v2.b}[1], [x17], x9
st1 {v2.b}[2], [x18], x9
st1 {v2.b}[3], [x21], x9
st1 {v3.b}[0], [x3], x9
st1 {v3.b}[1], [x17], x9
st1 {v3.b}[2], [x18], x9
st1 {v3.b}[3], [x21], x9
st1 {v4.b}[0], [x3], x9
st1 {v4.b}[1], [x17], x9
st1 {v4.b}[2], [x18], x9
st1 {v4.b}[3], [x21], x9
st1 {v5.b}[0], [x3], x9
st1 {v5.b}[1], [x17], x9
st1 {v5.b}[2], [x18], x9
st1 {v5.b}[3], [x21], x9
st1 {v6.b}[0], [x3], x9
st1 {v6.b}[1], [x17], x9
st1 {v6.b}[2], [x18], x9
st1 {v6.b}[3], [x21], x9
st1 {v7.b}[0], [x3], x9
st1 {v7.b}[1], [x17], x9
st1 {v7.b}[2], [x18], x9
st1 {v7.b}[3], [x21], x9
add x23, x23, x19
sub x24, x24, #8
cmp x24, #0
ble LoopWEnd
cmp x24, #8
bge LoopW8
LoopW:
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
LoopKh:
mov x18, x7
mov x22, x16
LoopKw:
ld1 {v16.4h}, [x22], x13
ld1 {v25.4h}, [x17], #8
smlal v0.4s, v16.4h, v25.4h
subs x18, x18, #1
bne LoopKw
add x16, x16, x12
subs x20, x20, #1
bne LoopKh
sqshl v0.4s, v0.4s ,v26.4s
sqrdmulh v0.4s, v0.4s ,v27.4s
sqrshl v0.4s, v0.4s ,v28.4s
add v0.4s, v0.4s ,v29.4s
smax v0.4s, v0.4s ,v30.4s
smin v0.4s, v0.4s ,v31.4s
sqxtn v0.4h, v0.4s
sqxtn v0.8b, v0.8h
mov x17, x3
st1 {v0.b}[0], [x17], #1
st1 {v0.b}[1], [x17], #1
st1 {v0.b}[2], [x17], #1
st1 {v0.b}[3], [x17], #1
add x3, x3, x9
add x23, x23, x11
subs x24, x24, #1
bne LoopW
LoopWEnd:
add x0, x0, x8
add x1, x1, x10
subs x4, x4, #1
bne LoopH
sub sp, sp, #112
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp d8, d9, [sp], #16
ldp d10, d11, [sp], #16
ldp d12, d13, [sp], #16
ldp d14, d15, [sp], #16
ret
#endif

View File

@ -35,12 +35,12 @@ DeconvDwFp32Center:
mov x18, x15
mov x19, x2
mov x20, x5
ld1 {v1.4s}, [x16], x8
LoopKh:
mov x21, x18
mov x13, x6
LoopKw:
ld1 {v0.4s}, [x21]
ld1 {v2.4s}, [x19], #16
fmla v0.4s, v1.4s, v2.4s
st1 {v0.4s}, [x21], x12
@ -50,7 +50,6 @@ DeconvDwFp32Center:
subs x20, x20, #1
bne LoopKh
add x15, x15, x10
subs x17, x17, #1
bne LoopW
add x0, x0, x9

View File

@ -0,0 +1,65 @@
#ifdef __aarch64__
.text
.align 5
.global DeconvDwInt8Center
#ifndef __APPLE__
.type DeconvDwInt8Center, %function
#endif
// void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
// size_t in_kh_step, size_t in_kw_step);
// x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
// x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
DeconvDwInt8Center:
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should also be preserved
// whereas our coding style does not permit such an amount of parameters
sub sp, sp, #32
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]
LoopH:
mov x15, x0
mov x16, x1
mov x17, x4
LoopW:
mov x18, x15
mov x19, x2
mov x20, x5
ld1 {v1.4h}, [x16], x8
LoopKh:
mov x21, x18
mov x13, x6
LoopKw:
ld1 {v0.4s}, [x21]
ld1 {v2.4h}, [x19], #8
smlal v0.4s, v1.4h, v2.4h
st1 {v0.4s}, [x21], x12
subs x13, x13, #1
bne LoopKw
add x18, x18, x11
subs x20, x20, #1
bne LoopKh
add x15, x15, x10 // x16 was already advanced by the post-indexed load at the top of LoopW
subs x17, x17, #1
bne LoopW
add x0, x0, x9
add x1, x1, x7
subs x3, x3, #1
bne LoopH
sub sp, sp, #32
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

View File

@ -0,0 +1,294 @@
#ifdef __aarch64__
.text
.align 5
.global ConvDwFp16Center
#ifndef __APPLE__
.type ConvDwFp16Center, %function
#endif
// void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
// size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
// x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
// x14: relu, x15: relu6
ConvDwFp16Center:
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should also be preserved
// whereas our coding style does not permit such an amount of parameters
sub sp, sp, #112
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp d8, d9, [sp], #16
stp d10, d11, [sp], #16
stp d12, d13, [sp], #16
stp d14, d15, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]
ldr x13, [sp, #40]
ldr x14, [sp, #48]
ldr x15, [sp, #56]
ld1 {v24.8h}, [x3]
movi v26.8h, #0x46, lsl #8
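// 0x46 << 8 = 0x4600 in every half-word, which is 6.0 in IEEE fp16 (the Relu6 bound)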
dup v27.4s, wzr
LoopH:
mov x23, x1
mov x24, x5
mov x3, x0
cmp x24, #8
blt LoopW
cmp x24, #16
blt LoopW8
LoopW16:
mov x19, #16
mul x19, x19, x11
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v2.16b, v24.16b
mov v3.16b, v24.16b
mov v4.16b, v24.16b
mov v5.16b, v24.16b
mov v6.16b, v24.16b
mov v7.16b, v24.16b
mov v8.16b, v24.16b
mov v9.16b, v24.16b
mov v10.16b, v24.16b
mov v11.16b, v24.16b
mov v12.16b, v24.16b
mov v13.16b, v24.16b
mov v14.16b, v24.16b
mov v15.16b, v24.16b
LoopKh16:
mov x18, x7
mov x21, x16
LoopKw16:
mov x22, x21
ld1 {v25.8h}, [x17], #16
ld1 {v16.8h}, [x22], x11
ld1 {v17.8h}, [x22], x11
fmla v0.8h, v16.8h, v25.8h
fmla v1.8h, v17.8h, v25.8h
ld1 {v18.8h}, [x22], x11
ld1 {v19.8h}, [x22], x11
fmla v2.8h, v18.8h, v25.8h
fmla v3.8h, v19.8h, v25.8h
ld1 {v20.8h}, [x22], x11
ld1 {v21.8h}, [x22], x11
fmla v4.8h, v20.8h, v25.8h
fmla v5.8h, v21.8h, v25.8h
ld1 {v22.8h}, [x22], x11
ld1 {v23.8h}, [x22], x11
fmla v6.8h, v22.8h, v25.8h
fmla v7.8h, v23.8h, v25.8h
ld1 {v16.8h}, [x22], x11
ld1 {v17.8h}, [x22], x11
fmla v8.8h, v16.8h, v25.8h
fmla v9.8h, v17.8h, v25.8h
ld1 {v18.8h}, [x22], x11
ld1 {v19.8h}, [x22], x11
fmla v10.8h, v18.8h, v25.8h
fmla v11.8h, v19.8h, v25.8h
ld1 {v20.8h}, [x22], x11
ld1 {v21.8h}, [x22], x11
fmla v12.8h, v20.8h, v25.8h
fmla v13.8h, v21.8h, v25.8h
ld1 {v22.8h}, [x22], x11
ld1 {v23.8h}, [x22], x11
fmla v14.8h, v22.8h, v25.8h
fmla v15.8h, v23.8h, v25.8h
subs x18, x18, #1
add x21, x21, x13
bne LoopKw16
add x16, x16, x12
subs x20, x20, #1
bne LoopKh16
cbnz x15, Relu616
cbnz x14, Relu16
b Write16
Relu616:
fmin v0.8h, v0.8h, v26.8h
fmin v1.8h, v1.8h, v26.8h
fmin v2.8h, v2.8h, v26.8h
fmin v3.8h, v3.8h, v26.8h
fmin v4.8h, v4.8h, v26.8h
fmin v5.8h, v5.8h, v26.8h
fmin v6.8h, v6.8h, v26.8h
fmin v7.8h, v7.8h, v26.8h
fmin v8.8h, v8.8h, v26.8h
fmin v9.8h, v9.8h, v26.8h
fmin v10.8h, v10.8h, v26.8h
fmin v11.8h, v11.8h, v26.8h
fmin v12.8h, v12.8h, v26.8h
fmin v13.8h, v13.8h, v26.8h
fmin v14.8h, v14.8h, v26.8h
fmin v15.8h, v15.8h, v26.8h
Relu16:
fmax v0.8h, v0.8h, v27.8h
fmax v1.8h, v1.8h, v27.8h
fmax v2.8h, v2.8h, v27.8h
fmax v3.8h, v3.8h, v27.8h
fmax v4.8h, v4.8h, v27.8h
fmax v5.8h, v5.8h, v27.8h
fmax v6.8h, v6.8h, v27.8h
fmax v7.8h, v7.8h, v27.8h
fmax v8.8h, v8.8h, v27.8h
fmax v9.8h, v9.8h, v27.8h
fmax v10.8h, v10.8h, v27.8h
fmax v11.8h, v11.8h, v27.8h
fmax v12.8h, v12.8h, v27.8h
fmax v13.8h, v13.8h, v27.8h
fmax v14.8h, v14.8h, v27.8h
fmax v15.8h, v15.8h, v27.8h
Write16:
st1 {v0.8h}, [x3], x9
st1 {v1.8h}, [x3], x9
st1 {v2.8h}, [x3], x9
st1 {v3.8h}, [x3], x9
st1 {v4.8h}, [x3], x9
st1 {v5.8h}, [x3], x9
st1 {v6.8h}, [x3], x9
st1 {v7.8h}, [x3], x9
st1 {v8.8h}, [x3], x9
st1 {v9.8h}, [x3], x9
st1 {v10.8h}, [x3], x9
st1 {v11.8h}, [x3], x9
st1 {v12.8h}, [x3], x9
st1 {v13.8h}, [x3], x9
st1 {v14.8h}, [x3], x9
st1 {v15.8h}, [x3], x9
add x23, x23, x19
sub x24, x24, #16
cmp x24, #0
ble LoopWEnd
cmp x24, #8
blt LoopW
cmp x24, #16
bge LoopW16
LoopW8:
mov x19, #8
mul x19, x19, x11
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v2.16b, v24.16b
mov v3.16b, v24.16b
mov v4.16b, v24.16b
mov v5.16b, v24.16b
mov v6.16b, v24.16b
mov v7.16b, v24.16b
LoopKh8:
mov x18, x7
mov x21, x16
LoopKw8:
mov x22, x21
ld1 {v25.8h}, [x17], #16
ld1 {v16.8h}, [x22], x11
ld1 {v17.8h}, [x22], x11
fmla v0.8h, v16.8h, v25.8h
fmla v1.8h, v17.8h, v25.8h
ld1 {v18.8h}, [x22], x11
ld1 {v19.8h}, [x22], x11
fmla v2.8h, v18.8h, v25.8h
fmla v3.8h, v19.8h, v25.8h
ld1 {v20.8h}, [x22], x11
ld1 {v21.8h}, [x22], x11
fmla v4.8h, v20.8h, v25.8h
fmla v5.8h, v21.8h, v25.8h
ld1 {v22.8h}, [x22], x11
ld1 {v23.8h}, [x22], x11
fmla v6.8h, v22.8h, v25.8h
fmla v7.8h, v23.8h, v25.8h
subs x18, x18, #1
add x21, x21, x13
bne LoopKw8
add x16, x16, x12
subs x20, x20, #1
bne LoopKh8
cbnz x15, Relu68
cbnz x14, Relu8
b Write8
Relu68:
fmin v0.8h, v0.8h, v26.8h
fmin v1.8h, v1.8h, v26.8h
fmin v2.8h, v2.8h, v26.8h
fmin v3.8h, v3.8h, v26.8h
fmin v4.8h, v4.8h, v26.8h
fmin v5.8h, v5.8h, v26.8h
fmin v6.8h, v6.8h, v26.8h
fmin v7.8h, v7.8h, v26.8h
Relu8:
fmax v0.8h, v0.8h, v27.8h
fmax v1.8h, v1.8h, v27.8h
fmax v2.8h, v2.8h, v27.8h
fmax v3.8h, v3.8h, v27.8h
fmax v4.8h, v4.8h, v27.8h
fmax v5.8h, v5.8h, v27.8h
fmax v6.8h, v6.8h, v27.8h
fmax v7.8h, v7.8h, v27.8h
Write8:
st1 {v0.8h}, [x3], x9
st1 {v1.8h}, [x3], x9
st1 {v2.8h}, [x3], x9
st1 {v3.8h}, [x3], x9
st1 {v4.8h}, [x3], x9
st1 {v5.8h}, [x3], x9
st1 {v6.8h}, [x3], x9
st1 {v7.8h}, [x3], x9
add x23, x23, x19
sub x24, x24, #8
cmp x24, #0
ble LoopWEnd
cmp x24, #8
bge LoopW8
LoopW:
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
LoopKh:
mov x18, x7
mov x22, x16
LoopKw:
ld1 {v16.8h}, [x22], x13
ld1 {v25.8h}, [x17], #16
fmla v0.8h, v16.8h, v25.8h
subs x18, x18, #1
bne LoopKw
add x16, x16, x12
subs x20, x20, #1
bne LoopKh
cbnz x15, Relu6
cbnz x14, Relu
b Write
Relu6:
fmin v0.8h, v0.8h, v26.8h
Relu:
fmax v0.8h, v0.8h, v27.8h
Write:
st1 {v0.8h}, [x3], x9
add x23, x23, x11
subs x24, x24, #1
bne LoopW
LoopWEnd:
add x0, x0, x8
add x1, x1, x10
subs x4, x4, #1
bne LoopH
sub sp, sp, #112
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp d8, d9, [sp], #16
ldp d10, d11, [sp], #16
ldp d12, d13, [sp], #16
ldp d14, d15, [sp], #16
ret
#endif
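The innermost fused multiply-add above is the 8-lane fp16 analogue of what this commit's C fallback does with intrinsics (see the vld1q_f16/vfmaq_f16 sequence in conv_depthwise_fp16 further below). A minimal sketch of one kernel tap, assuming an ARMv8.2-A FP16-capable toolchain:

#include <arm_neon.h>

// One kernel tap for one 8-channel fp16 block (illustrative).
static inline void DwTapFp16(float16_t *acc8, const float16_t *src_kw, const float16_t *weight_kw) {
  float16x8_t src_8 = vld1q_f16(src_kw);        // ld1 {v16.8h}, [x22], x13
  float16x8_t weight_8 = vld1q_f16(weight_kw);  // ld1 {v25.8h}, [x17], #16
  float16x8_t dst_8 = vld1q_f16(acc8);
  dst_8 = vfmaq_f16(dst_8, src_8, weight_8);    // fmla v0.8h, v16.8h, v25.8h
  vst1q_f16(acc8, dst_8);
}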

View File

@ -0,0 +1,64 @@
#ifdef __aarch64__
.text
.align 5
.global DeconvDwFp16Center
#ifndef __APPLE__
.type DeconvDwFp16Center, %function
#endif
// void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
// size_t in_kh_step, size_t in_kw_step);
// x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
// x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
DeconvDwFp16Center:
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should also be preserved
// whereas our coding style does not permit such an amount of parameters
sub sp, sp, #32
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]
LoopH:
mov x15, x0
mov x16, x1
mov x17, x4
LoopW:
mov x18, x15
mov x19, x2
mov x20, x5
ld1 {v1.8h}, [x16], x8
LoopKh:
mov x21, x18
mov x13, x6
LoopKw:
ld1 {v0.8h}, [x21]
ld1 {v2.8h}, [x19], #16
fmla v0.8h, v1.8h, v2.8h
st1 {v0.8h}, [x21], x12
subs x13, x13, #1
bne LoopKw
add x18, x18, x11
subs x20, x20, #1
bne LoopKh
add x15, x15, x10
subs x17, x17, #1
bne LoopW
add x0, x0, x9
add x1, x1, x7
subs x3, x3, #1
bne LoopH
sub sp, sp, #32
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

View File

@ -0,0 +1,44 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "src/runtime/kernel/arm/opclib/op_base.h"
#include "src/runtime/kernel/arm/opclib/conv_parameter.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef ENABLE_ARM64
void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step,
size_t in_kw_step, size_t relu, size_t relu6);
void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
#endif
#ifdef __cplusplus
}
#endif
#endif /* MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_ */

View File

@ -16,6 +16,7 @@
#include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h"
#include <arm_neon.h>
#include "src/runtime/kernel/arm/opclib/fp16/common_func.h"
/*conv depthwise fp16 begin*/
void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
@ -79,6 +80,7 @@ void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *
} // height loop
}
#ifndef ENABLE_ARM64
void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
int height, int width, int kernel_h, int kernel_w, int out_h_step, int block_channel,
int in_sh_step, int in_sw_step, int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6) {
@ -97,12 +99,17 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *
const float16_t *src_kw = src_kh;
const float16_t *weight_kw = weight_kh;
for (int kw = 0; kw < kernel_w; kw++) {
#ifdef ENABLE_ARM64
float16x8_t src_8 = vld1q_f16(src_kw);
float16x8_t weight_8 = vld1q_f16(weight_kw);
float16x8_t dst_8 = vld1q_f16(dst_w);
dst_8 = vfmaq_f16(dst_8, src_8, weight_8);
vst1q_f16(dst_w, dst_8);
#else
for (int c = 0; c < C8NUM; c++) {
dst_w[c] += src_kw[c] * weight_kw[c];
}
#endif
src_kw += in_kw_step;
weight_kw += C8NUM;
} // kernel_w loop
@ -122,6 +129,7 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *
src_h += in_sh_step;
} // dst_height loop
}
#endif
// conv depthwise fp16: sliding window
void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
@ -149,11 +157,19 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
#ifdef ENABLE_ARM64
ConvDwFp16Center(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_,
sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
sliding->out_h_step_ * sizeof(float16_t), sliding->block_channel_ * sizeof(float16_t),
sliding->in_sh_step_ * sizeof(float16_t), sliding->in_sw_step_ * sizeof(float16_t),
sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t),
conv_param->is_relu_, conv_param->is_relu6_);
#else
DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_,
sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_,
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
#endif
}
} // output C8 loop
src += sliding->in_step_;
@ -214,6 +230,7 @@ void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float
} // height loop
}
#ifndef ENABLE_ARM64
void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height, int width,
int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step,
int in_sw_step, int in_kh_step, int in_kw_step) {
@ -229,12 +246,17 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float
float16_t *dst_kw = dst_kh;
const float16_t *weight_kw = weight_kh;
for (int kw = 0; kw < kernel_w; kw++) {
#ifdef ENABLE_ARM64
float16x8_t src_8 = vld1q_f16(src_w);
float16x8_t weight_8 = vld1q_f16(weight_kw);
float16x8_t dst_8 = vld1q_f16(dst_kw);
dst_8 = vfmaq_f16(dst_8, src_8, weight_8);
vst1q_f16(dst_kw, dst_8);
#else
for (int c = 0; c < C8NUM; c++) {
dst_kw[c] += src_w[c] * weight_kw[c];
}
#endif
dst_kw += in_kw_step;
weight_kw += C8NUM;
} // kernel_w loop
@ -248,6 +270,7 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float
src_h += out_h_step;
} // dst_height loop
}
#endif
void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel,
const ConvParameter *conv_param) {
@ -289,11 +312,18 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f
float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_;
const float16_t *in_t =
src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
#ifdef ENABLE_ARM64
DeconvDwFp16Center(out_t, in_t, weight, sliding->bottom_ - sliding->top_,
sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
sliding->out_h_step_ * sizeof(float16_t), sliding->block_channel_ * sizeof(float16_t),
sliding->in_sh_step_ * sizeof(float16_t), sliding->in_sw_step_ * sizeof(float16_t),
sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t));
#else
DeconvDepthwiseCenterFp16(out_t, in_t, weight, sliding->bottom_ - sliding->top_,
sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_,
sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_);
#endif
}
DeconvDepthwisePostFuncFp16(dst_data, bias, sliding->block_channel_, conv_param);
} // output C8 loop

View File

@ -38,6 +38,15 @@ void MatrixSub(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stri
void MatrixMultiAdd(float *c11, float *c12, float *c21, float *c22, float *x_ptr, size_t row, size_t col,
size_t c_stride, size_t x_stride);
#ifdef ENABLE_ARM
void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
#endif
#ifdef ENABLE_ARM64
void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size);
void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size);
@ -49,12 +58,6 @@ void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc,
void C4BiasAddRelu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride);
void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
#endif
#ifdef __cplusplus

View File

@ -0,0 +1,62 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "src/runtime/kernel/arm/opclib/op_base.h"
#include "src/runtime/kernel/arm/opclib/conv_parameter.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef ENABLE_ARM
void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8,
size_t oc4, size_t offset);
#ifdef ENABLE_ARM64
void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize,
size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before,
size_t shift_after);
#elif defined(ENABLE_ARM32)
void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize,
size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before,
size_t shift_after);
#endif
#endif
#ifdef ENABLE_ARM
void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height,
size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int out_multiplier,
int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max);
#endif
#ifdef __cplusplus
}
#endif
#endif /* MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_ */

View File

@ -17,6 +17,7 @@
#include "src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.h"
#include <string.h>
#include "src/runtime/kernel/arm/opclib/quantization/fixed_point.h"
#include "src/runtime/kernel/arm/opclib/int8/common_func.h"
/*conv depthwise int8 begin*/
void DepthwiseBorderPixelInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height,
@ -85,6 +86,7 @@ void DepthwiseBorderInt8(int8_t *dst, const int16_t *src, const int16_t *weight,
} // height loop
}
#ifndef ENABLE_ARM64
void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height,
int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step,
int in_sw_step, int in_kh_step, int in_kw_step, int out_multiplier, int left_shift,
@ -133,6 +135,7 @@ void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight,
src_h += in_sh_step;
} // dst_height loop
}
#endif
void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) {
@ -158,7 +161,17 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
const int16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * C4NUM;
int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * C4NUM;
#ifdef ENABLE_ARM64
ConvDwInt8Center(
out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_,
conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(int8_t),
sliding->block_channel_ * sizeof(int8_t), sliding->in_sh_step_ * sizeof(int16_t),
sliding->in_sw_step_ * sizeof(int16_t), sliding->in_kh_step_ * sizeof(int16_t),
sliding->in_kw_step_ * sizeof(int16_t), conv_param->conv_quant_arg_.quant_multiplier_[0],
conv_param->conv_quant_arg_.left_shift_[0], conv_param->conv_quant_arg_.right_shift_[0],
conv_param->conv_quant_arg_.quant_args_[2][0].zp_, conv_param->conv_quant_arg_.out_act_min_[0],
conv_param->conv_quant_arg_.out_act_max_[0]);
#else
DepthwiseCenterInt8(
out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_,
conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_,
@ -166,6 +179,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
conv_param->conv_quant_arg_.quant_multiplier_[0], conv_param->conv_quant_arg_.left_shift_[0],
conv_param->conv_quant_arg_.right_shift_[0], conv_param->conv_quant_arg_.quant_args_[2][0].zp_,
conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0]);
#endif
}
} // output C4 loop
src += sliding->in_step_;
@ -222,6 +236,7 @@ void DeconvDepthwiseBorderInt8(int32_t *dst, const int16_t *src, const int16_t *
} // height loop
}
#ifndef ENABLE_ARM64
void DeconvDepthwiseCenterInt8(int32_t *dst, const int16_t *src, const int16_t *weight, int height, int width,
int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step,
int in_sw_step, int in_kh_step, int in_kw_step) {
@ -253,6 +268,7 @@ void DeconvDepthwiseCenterInt8(int32_t *dst, const int16_t *src, const int16_t *
src_h += out_h_step;
} // dst_height loop
}
#endif
void DeconvDepthwisePostFuncInt8(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel,
const ConvParameter *conv_param, int out_multiplier, int left_shift, int right_shift,
@ -302,11 +318,18 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in
int32_t *out_t = output_buffer + oh_h_start * sliding->in_h_step_ + oh_w_start * C4NUM;
const int16_t *in_t =
src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
#ifdef ENABLE_ARM64
DeconvDwInt8Center(out_t, in_t, weight, sliding->bottom_ - sliding->top_,
sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
sliding->out_h_step_ * sizeof(int16_t), sliding->block_channel_ * sizeof(int16_t),
sliding->in_sh_step_ * sizeof(int32_t), sliding->in_sw_step_ * sizeof(int32_t),
sliding->in_kh_step_ * sizeof(int32_t), sliding->in_kw_step_ * sizeof(int32_t));
#else
DeconvDepthwiseCenterInt8(out_t, in_t, weight, sliding->bottom_ - sliding->top_,
sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_,
sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_);
#endif
}
DeconvDepthwisePostFuncInt8(
dst_data, output_buffer, bias, sliding->block_channel_, conv_param,

View File

@ -17,25 +17,7 @@
#include "src/runtime/kernel/arm/opclib/int8/conv_int8.h"
#include <string.h>
#include "src/runtime/kernel/arm/opclib/winograd_transform.h"
extern "C" {
#ifdef ENABLE_ARM
void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8,
size_t oc4, size_t offset);
#ifdef ENABLE_ARM64
void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize,
size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before,
size_t shift_after);
#elif defined(ENABLE_ARM32)
void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize,
size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before,
size_t shift_after);
#endif
#endif
}
#include "src/runtime/kernel/arm/opclib/int8/common_func.h"
void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum,