!3961 unroll loop for fp32 depthwise convolution to 16
Merge pull request !3961 from lixian/master
This commit is contained in:
commit c2cd2df9e7
@@ -0,0 +1,161 @@
#ifdef __arm__
#ifndef __aarch64__

.text
.align 5
.global ConvDwFp32Center
#ifndef __APPLE__
.type ConvDwFp32Center, %function
#endif

// void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
//                       size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
//                       size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
// r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: width, #56: kernel_h, #60: kernel_w,
// #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step, #84: in_kw_step
// #88: relu, #92: relu6
ConvDwFp32Center:
// at return, clang generates "push {lr}; pop {pc}" while gcc generates "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to the link register instead of saving it, we still have to save it in subroutine calls anyway
// clang's rule is simpler, though there are no subroutine calls here
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
push {r0-r8, r10, r11, lr}
vpush {q4-q7}
add sp, sp, #64

ldr r4, [sp, #48]

vld1.32 {q13}, [r3]
vmov.i32 q14, #6
vcvt.f32.s32 q14, q14
veor q15, q15, q15

LoopH:
ldr r1, [sp, #4] // src_w
ldr r5, [sp, #52] // width
ldr r0, [sp] // dst_w
cmp r5, #4
blt LoopW
LoopW4:
ldr r11, [sp, #76] // in_sw_step
mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh
ldr r6, [sp, #56] // kernel_h
vmov q0, q13
vmov q1, q13
vmov q2, q13
vmov q3, q13
LoopKh4:
ldr r12, [sp, #84] // in_kw_step
ldr r7, [sp, #60] // kernel_w
mov lr, r8 // src_kw
LoopKw4:
mov r10, lr
vld1.32 {q12}, [r2]!
vld1.32 {q4}, [r10]
add r10, r10, r11
vmla.f32 q0, q4, q12
vld1.32 {q5}, [r10]
add r10, r10, r11
vmla.f32 q1, q5, q12
vld1.32 {q6}, [r10]
add r10, r10, r11
vmla.f32 q2, q6, q12
vld1.32 {q7}, [r10]
add r10, r10, r11
vmla.f32 q3, q7, q12
subs r7, r7, #1
add lr, lr, r12
bne LoopKw4
ldr r12, [sp, #80]
add r8, r8, r12
subs r6, r6, #1
bne LoopKh4
ldr r12, [sp, #92]
cmp r12, #0
bne Relu64
ldr r12, [sp, #88]
cmp r12, #0
bne Relu4
b Write4
Relu64:
vmin.f32 q0, q0, q14
vmin.f32 q1, q1, q14
vmin.f32 q2, q2, q14
vmin.f32 q3, q3, q14
Relu4:
vmax.f32 q0, q0, q15
vmax.f32 q1, q1, q15
vmax.f32 q2, q2, q15
vmax.f32 q3, q3, q15
Write4:
ldr r12, [sp, #68]
vst1.32 {q0}, [r0]
add r0, r0, r12
vst1.32 {q1}, [r0]
add r0, r0, r12
vst1.32 {q2}, [r0]
add r0, r0, r12
vst1.32 {q3}, [r0]
add r0, r0, r12
mov r12, #4
mul r11, r11, r12
add r1, r1, r11
sub r5, r5, #4
cmp r5, #0
ble LoopWEnd
cmp r5, #4
bge LoopW4
LoopW:
mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh
ldr r6, [sp, #56] // kernel_h
vmov q0, q13
LoopKh:
ldr r12, [sp, #84] // in_kw_step
ldr r7, [sp, #60] // kernel_w
mov r10, r8 // src_kw
LoopKw:
vld1.32 {q1}, [r10]
add r10, r10, r12
vld1.32 {q12}, [r2]!
vmla.f32 q0, q1, q12
subs r7, r7, #1
bne LoopKw
ldr r12, [sp, #80]
add r8, r8, r12
subs r6, r6, #1
bne LoopKh
ldr r12, [sp, #92]
cmp r12, #0
bne Relu6
ldr r12, [sp, #88]
cmp r12, #0
bne Relu
b Write
Relu6:
vmin.f32 q0, q0, q14
Relu:
vmax.f32 q0, q0, q15
Write:
ldr r12, [sp, #68]
vst1.32 {q0}, [r0]
add r0, r0, r12
ldr r12, [sp, #76]
add r1, r1, r12
subs r5, r5, #1
bne LoopW
LoopWEnd:
ldr r3, [sp, #64]
ldr r12, [sp]
add r12, r12, r3
str r12, [sp]
ldr r3, [sp, #72]
ldr r12, [sp, #4]
add r12, r12, r3
str r12, [sp, #4]
subs r4, r4, #1
bne LoopH

sub sp, sp, #64
vpop {q4-q7}
pop {r0-r8, r10, r11, pc}
#endif
#endif
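For reference, the scalar computation that ConvDwFp32Center vectorizes and unrolls can be sketched in plain C as below. This is an illustrative sketch only, not code from this commit: the helper name is mine, it assumes one C4 block of 4 floats per pixel, and it takes the strides as byte offsets, the way the callers pass them.

#include <stddef.h>

// Illustrative-only C model of ConvDwFp32Center: 4-float channel block, byte strides.
static void ConvDwFp32CenterRef(float *dst, const float *src, const float *weight, const float *bias,
                                size_t height, size_t width, size_t kernel_h, size_t kernel_w,
                                size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
                                size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6) {
  for (size_t oh = 0; oh < height; oh++) {
    char *dst_w = (char *)dst + oh * out_h_step;
    const char *src_w = (const char *)src + oh * in_sh_step;
    for (size_t ow = 0; ow < width; ow++) {
      float acc[4] = {bias[0], bias[1], bias[2], bias[3]};
      const float *w = weight;  // weight layout assumed [kh][kw][4], matching the post-incremented loads above
      for (size_t kh = 0; kh < kernel_h; kh++) {
        for (size_t kw = 0; kw < kernel_w; kw++) {
          const float *s = (const float *)(src_w + kh * in_kh_step + kw * in_kw_step);
          for (int c = 0; c < 4; c++) {
            acc[c] += s[c] * w[c];
          }
          w += 4;
        }
      }
      float *d = (float *)dst_w;
      for (int c = 0; c < 4; c++) {
        float v = acc[c];
        if (relu6) v = (v > 6.0f) ? 6.0f : v;           // Relu6 also falls through to the max-with-zero below
        if (relu || relu6) v = (v < 0.0f) ? 0.0f : v;   // Relu
        d[c] = v;
      }
      dst_w += block_channel;  // next output pixel
      src_w += in_sw_step;     // next input column
    }
  }
}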
@@ -0,0 +1,207 @@
#ifdef __arm__
#ifndef __aarch64__

.text
.align 5
.global ConvDwInt8Center
#ifndef __APPLE__
.type ConvDwInt8Center, %function
#endif

// void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, size_t width,
//                       size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
//                       size_t in_kh_step, size_t in_kw_step, int out_multiplier, int left_shift,
//                       int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max);
// r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: width, #56: kernel_h, #60: kernel_w,
// #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step, #84: in_kw_step
// #88: out_multiplier, #92: left_shift, #96: right_shift, #100: out_zp, #104: acc_min, #108: acc_max
ConvDwInt8Center:
// at return, clang generates "push {lr}; pop {pc}" while gcc generates "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to the link register instead of saving it, we still have to save it in subroutine calls anyway
// clang's rule is simpler, though there are no subroutine calls here
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
push {r0-r8, r10, r11, lr}
vpush {q4-q7}
add sp, sp, #64

ldr r4, [sp, #48]

ldr r12, [sp, #92]
vdup.32 q9, r12

ldr r11, [sp, #88]
vdup.32 q10, r11

ldr r10, [sp, #96]
vdup.32 q11, r10

ldr r8, [sp, #100]
vdup.32 q12, r8

ldr r7, [sp, #104]
vdup.32 q13, r7

ldr r6, [sp, #108]
vdup.32 q14, r6

vld1.32 {q15}, [r3]

LoopH:
ldr r1, [sp, #4] // src_w
ldr r5, [sp, #52] // width
ldr r0, [sp] // dst_w
cmp r5, #4
blt LoopW
LoopW4:
ldr r11, [sp, #76] // in_sw_step
mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh
ldr r6, [sp, #56] // kernel_h
vmov q0, q15
vmov q1, q15
vmov q2, q15
vmov q3, q15
LoopKh4:
ldr r12, [sp, #84] // in_kw_step
ldr r7, [sp, #60] // kernel_w
mov lr, r8 // src_kw
LoopKw4:
mov r10, lr
vld1.16 {d16}, [r2]!
vld1.16 {d8}, [r10]
add r10, r10, r11
vmlal.s16 q0, d8, d16
vld1.16 {d10}, [r10]
add r10, r10, r11
vmlal.s16 q1, d10, d16
vld1.16 {d12}, [r10]
add r10, r10, r11
vmlal.s16 q2, d12, d16
vld1.16 {d14}, [r10]
add r10, r10, r11
vmlal.s16 q3, d14, d16
subs r7, r7, #1
add lr, lr, r12
bne LoopKw4
ldr r12, [sp, #80]
add r8, r8, r12
subs r6, r6, #1
bne LoopKh4

vshl.s32 q0, q0, q9
vshl.s32 q1, q1, q9
vshl.s32 q2, q2, q9
vshl.s32 q3, q3, q9
vqrdmulh.s32 q0, q0, q10
vqrdmulh.s32 q1, q1, q10
vqrdmulh.s32 q2, q2, q10
vqrdmulh.s32 q3, q3, q10
vrshl.s32 q0, q0, q11
vrshl.s32 q1, q1, q11
vrshl.s32 q2, q2, q11
vrshl.s32 q3, q3, q11
vadd.i32 q0, q0, q12
vadd.i32 q1, q1, q12
vadd.i32 q2, q2, q12
vadd.i32 q3, q3, q12
vmax.s32 q0, q0, q13
vmax.s32 q1, q1, q13
vmax.s32 q2, q2, q13
vmax.s32 q3, q3, q13
vmin.s32 q0, q0, q14
vmin.s32 q1, q1, q14
vmin.s32 q2, q2, q14
vmin.s32 q3, q3, q14

vqmovn.s32 d0, q0
vqmovn.s32 d2, q1
vqmovn.s32 d4, q2
vqmovn.s32 d6, q3
vqmovn.s16 d0, q0
vqmovn.s16 d2, q1
vqmovn.s16 d4, q2
vqmovn.s16 d6, q3

mov r3, r0
ldr r12, [sp, #68]
vst1.8 {d0[0]}, [r3]!
vst1.8 {d0[1]}, [r3]!
vst1.8 {d0[2]}, [r3]!
vst1.8 {d0[3]}, [r3]!
add r0, r0, r12
mov r3, r0
vst1.8 {d2[0]}, [r3]!
vst1.8 {d2[1]}, [r3]!
vst1.8 {d2[2]}, [r3]!
vst1.8 {d2[3]}, [r3]!
add r0, r0, r12
mov r3, r0
vst1.8 {d4[0]}, [r3]!
vst1.8 {d4[1]}, [r3]!
vst1.8 {d4[2]}, [r3]!
vst1.8 {d4[3]}, [r3]!
add r0, r0, r12
mov r3, r0
vst1.8 {d6[0]}, [r3]!
vst1.8 {d6[1]}, [r3]!
vst1.8 {d6[2]}, [r3]!
vst1.8 {d6[3]}, [r3]!
add r0, r0, r12
mov r3, r0
mov r12, #4
mul r11, r11, r12
add r1, r1, r11
sub r5, r5, #4
cmp r5, #0
ble LoopWEnd
cmp r5, #4
bge LoopW4
LoopW:
mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh
ldr r6, [sp, #56] // kernel_h
vmov q0, q15
LoopKh:
ldr r12, [sp, #84] // in_kw_step
ldr r7, [sp, #60] // kernel_w
mov r10, r8 // src_kw
LoopKw:
vld1.16 {d2}, [r10]
add r10, r10, r12
vld1.16 {d16}, [r2]!
vmlal.s16 q0, d2, d16
subs r7, r7, #1
bne LoopKw
ldr r12, [sp, #80]
add r8, r8, r12
subs r6, r6, #1
bne LoopKh

vshl.s32 q0, q0, q9
vqrdmulh.s32 q0, q0, q10
vrshl.s32 q0, q0, q11
vadd.i32 q0, q0, q12
vmax.s32 q0, q0, q13
vmin.s32 q0, q0, q14

vqmovn.s32 d0, q0
vqmovn.s16 d0, q0

mov r3, r0
ldr r12, [sp, #68]
vst1.8 {d0[0]}, [r3]!
vst1.8 {d0[1]}, [r3]!
vst1.8 {d0[2]}, [r3]!
vst1.8 {d0[3]}, [r3]!
add r0, r0, r12
ldr r12, [sp, #76]
add r1, r1, r12
subs r5, r5, #1
bne LoopW
LoopWEnd:
ldr r3, [sp, #64]
ldr r12, [sp]
add r12, r12, r3
str r12, [sp]
ldr r3, [sp, #72]
ldr r12, [sp, #4]
add r12, r12, r3
str r12, [sp, #4]
subs r4, r4, #1
bne LoopH

sub sp, sp, #64
vpop {q4-q7}
pop {r0-r8, r10, r11, pc}
#endif
#endif
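After the multiply-accumulate loops, the int8 kernels requantize each 32-bit accumulator before narrowing and storing: a left shift by the duplicated left_shift, a saturating rounding doubling high multiply by out_multiplier (vqrdmulh/sqrdmulh), a rounding right shift (vrshl/sqrshl with a negative shift value), adding the output zero point, clamping to [acc_min, acc_max], and narrowing to int8. A scalar C sketch of that per-lane sequence follows; it is an illustration only, the helper name is mine, and saturation on the left shift is omitted.

#include <stdint.h>

// Scalar model of the per-lane requantize sequence used above; not project code.
static int8_t RequantizeLane(int32_t acc, int32_t out_multiplier, int32_t left_shift, int32_t right_shift,
                             int32_t out_zp, int32_t acc_min, int32_t acc_max) {
  int64_t shifted = (int64_t)acc << left_shift;                    // vshl/sqshl (saturation omitted in this sketch)
  int64_t prod = (shifted * out_multiplier + (1LL << 30)) >> 31;   // vqrdmulh: rounding doubling high half
  int32_t shift = -right_shift;                                    // vrshl with a negative vector value shifts right
  int64_t r = (shift > 0) ? ((prod + (1LL << (shift - 1))) >> shift) : prod;  // rounding right shift
  r += out_zp;                                                     // vadd with the duplicated zero point
  if (r < acc_min) r = acc_min;                                    // vmax/smax
  if (r > acc_max) r = acc_max;                                    // vmin/smin
  return (int8_t)r;                                                // vqmovn.s32 + vqmovn.s16 narrow to int8
}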
@@ -0,0 +1,69 @@
#ifdef __arm__
#ifndef __aarch64__

.text
.align 5
.global DeconvDwFp32Center
#ifndef __APPLE__
.type DeconvDwFp32Center, %function
#endif

// void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
//                         size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// r0: dst, r1: src, r2: weight, r3: height, #48: width, #52: kernel_h, #56: kernel_w, #60: out_h_step
// #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
DeconvDwFp32Center:
// at return, clang generates "push {lr}; pop {pc}" while gcc generates "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to the link register instead of saving it, we still have to save it in subroutine calls anyway
// clang's rule is simpler, though there are no subroutine calls here
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
push {r0-r8, r10, r11, lr}

ldr r10, [sp, #80] // in_kw_step
ldr r11, [sp, #76] // in_kh_step

LoopH:
ldr r0, [sp] // dst_w
ldr r1, [sp, #4] // src_w
ldr r4, [sp, #48] // width
LoopW:
mov r6, r0 // dst_kh
ldr r2, [sp, #8] // weight_kh
ldr r5, [sp, #52] // kernel_h
vld1.32 {q1}, [r1]
LoopKh:
mov r7, r6 // dst_kw
ldr r12, [sp, #56] // kernel_w
LoopKw:
vld1.32 {q0}, [r7]
vld1.32 {q2}, [r2]!
vmla.f32 q0, q1, q2
vst1.32 {q0}, [r7]
add r7, r7, r10
subs r12, r12, #1
bne LoopKw
add r6, r6, r11
subs r5, r5, #1
bne LoopKh
ldr r12, [sp, #72]
add r0, r0, r12
ldr r8, [sp, #64]
add r1, r1, r8
subs r4, r4, #1
bne LoopW
ldr r8, [sp, #68]
ldr r12, [sp]
add r12, r12, r8
str r12, [sp]
ldr r8, [sp, #60]
ldr r12, [sp, #4]
add r12, r12, r8
str r12, [sp, #4]
subs r3, r3, #1
bne LoopH

pop {r0-r8, r10, r11, pc}
#endif
#endif
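DeconvDwFp32Center is the scatter counterpart of the convolution above: each source pixel is multiplied into a kernel_h x kernel_w window of the output and accumulated in place. A plain-C sketch of that update is shown below; it is an illustration only, with the same byte-stride and 4-float-block assumptions as the earlier sketch, and the helper name is mine.

#include <stddef.h>

// Illustrative-only C model of DeconvDwFp32Center; not project code.
static void DeconvDwFp32CenterRef(float *dst, const float *src, const float *weight, size_t height, size_t width,
                                  size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
                                  size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step) {
  for (size_t h = 0; h < height; h++) {
    char *dst_w = (char *)dst + h * in_sh_step;              // output row, strided as in the assembly
    const char *src_w = (const char *)src + h * out_h_step;  // input row
    for (size_t w = 0; w < width; w++) {
      const float *s = (const float *)src_w;                 // one 4-float block of the input
      const float *wt = weight;
      for (size_t kh = 0; kh < kernel_h; kh++) {
        for (size_t kw = 0; kw < kernel_w; kw++) {
          float *d = (float *)(dst_w + kh * in_kh_step + kw * in_kw_step);
          for (int c = 0; c < 4; c++) {
            d[c] += s[c] * wt[c];                            // accumulate into the output window
          }
          wt += 4;
        }
      }
      dst_w += in_sw_step;
      src_w += block_channel;
    }
  }
}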
@@ -0,0 +1,69 @@
#ifdef __arm__
#ifndef __aarch64__

.text
.align 5
.global DeconvDwInt8Center
#ifndef __APPLE__
.type DeconvDwInt8Center, %function
#endif

// void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
//                         size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// r0: dst, r1: src, r2: weight, r3: height, #48: width, #52: kernel_h, #56: kernel_w, #60: out_h_step
// #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
DeconvDwInt8Center:
// at return, clang generates "push {lr}; pop {pc}" while gcc generates "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to the link register instead of saving it, we still have to save it in subroutine calls anyway
// clang's rule is simpler, though there are no subroutine calls here
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
push {r0-r8, r10, r11, lr}

ldr r10, [sp, #80] // in_kw_step
ldr r11, [sp, #76] // in_kh_step

LoopH:
ldr r0, [sp] // dst_w
ldr r1, [sp, #4] // src_w
ldr r4, [sp, #48] // width
LoopW:
mov r6, r0 // dst_kh
ldr r2, [sp, #8] // weight_kh
ldr r5, [sp, #52] // kernel_h
vld1.16 {d2}, [r1]
LoopKh:
mov r7, r6 // dst_kw
ldr r12, [sp, #56] // kernel_w
LoopKw:
vld1.32 {q0}, [r7]
vld1.16 {d24}, [r2]!
vmlal.s16 q0, d2, d24
vst1.32 {q0}, [r7]
add r7, r7, r10
subs r12, r12, #1
bne LoopKw
add r6, r6, r11
subs r5, r5, #1
bne LoopKh
ldr r12, [sp, #72]
add r0, r0, r12
ldr r8, [sp, #64]
add r1, r1, r8
subs r4, r4, #1
bne LoopW
ldr r8, [sp, #68]
ldr r12, [sp]
add r12, r12, r8
str r12, [sp]
ldr r8, [sp, #60]
ldr r12, [sp, #4]
add r12, r12, r8
str r12, [sp, #4]
subs r3, r3, #1
bne LoopH

pop {r0-r8, r10, r11, pc}
#endif
#endif
@@ -32,24 +32,238 @@ ConvDwFp32Center:
ldr x14, [sp, #48]
ldr x15, [sp, #56]

ld1 {v5.4s}, [x3]
ld1 {v24.4s}, [x3]
movi v26.4s, #6
scvtf v26.4s, v26.4s
dup v27.4s, wzr

LoopH:
mov x23, x1
mov x24, x5
mov x3, x0
cmp x24, #8
blt LoopW
cmp x24, #16
blt LoopW8

LoopW16:
mov x19, #16
mul x19, x19, x11
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v2.16b, v24.16b
mov v3.16b, v24.16b
mov v4.16b, v24.16b
mov v5.16b, v24.16b
mov v6.16b, v24.16b
mov v7.16b, v24.16b
mov v8.16b, v24.16b
mov v9.16b, v24.16b
mov v10.16b, v24.16b
mov v11.16b, v24.16b
mov v12.16b, v24.16b
mov v13.16b, v24.16b
mov v14.16b, v24.16b
mov v15.16b, v24.16b
LoopKh16:
mov x18, x7
mov x21, x16
LoopKw16:
mov x22, x21
ld1 {v25.4s}, [x17], #16
ld1 {v16.4s}, [x22], x11
ld1 {v17.4s}, [x22], x11
fmla v0.4s, v16.4s, v25.4s
fmla v1.4s, v17.4s, v25.4s
ld1 {v18.4s}, [x22], x11
ld1 {v19.4s}, [x22], x11
fmla v2.4s, v18.4s, v25.4s
fmla v3.4s, v19.4s, v25.4s
ld1 {v20.4s}, [x22], x11
ld1 {v21.4s}, [x22], x11
fmla v4.4s, v20.4s, v25.4s
fmla v5.4s, v21.4s, v25.4s
ld1 {v22.4s}, [x22], x11
ld1 {v23.4s}, [x22], x11
fmla v6.4s, v22.4s, v25.4s
fmla v7.4s, v23.4s, v25.4s
ld1 {v16.4s}, [x22], x11
ld1 {v17.4s}, [x22], x11
fmla v8.4s, v16.4s, v25.4s
fmla v9.4s, v17.4s, v25.4s
ld1 {v18.4s}, [x22], x11
ld1 {v19.4s}, [x22], x11
fmla v10.4s, v18.4s, v25.4s
fmla v11.4s, v19.4s, v25.4s
ld1 {v20.4s}, [x22], x11
ld1 {v21.4s}, [x22], x11
fmla v12.4s, v20.4s, v25.4s
fmla v13.4s, v21.4s, v25.4s
ld1 {v22.4s}, [x22], x11
ld1 {v23.4s}, [x22], x11
fmla v14.4s, v22.4s, v25.4s
fmla v15.4s, v23.4s, v25.4s
subs x18, x18, #1
add x21, x21, x13
bne LoopKw16
add x16, x16, x12
subs x20, x20, #1
bne LoopKh16
cbnz x15, Relu616
cbnz x14, Relu16
b Write16
Relu616:
fmin v0.4s, v0.4s, v26.4s
fmin v1.4s, v1.4s, v26.4s
fmin v2.4s, v2.4s, v26.4s
fmin v3.4s, v3.4s, v26.4s
fmin v4.4s, v4.4s, v26.4s
fmin v5.4s, v5.4s, v26.4s
fmin v6.4s, v6.4s, v26.4s
fmin v7.4s, v7.4s, v26.4s
fmin v8.4s, v8.4s, v26.4s
fmin v9.4s, v9.4s, v26.4s
fmin v10.4s, v10.4s, v26.4s
fmin v11.4s, v11.4s, v26.4s
fmin v12.4s, v12.4s, v26.4s
fmin v13.4s, v13.4s, v26.4s
fmin v14.4s, v14.4s, v26.4s
fmin v15.4s, v15.4s, v26.4s
Relu16:
fmax v0.4s, v0.4s, v27.4s
fmax v1.4s, v1.4s, v27.4s
fmax v2.4s, v2.4s, v27.4s
fmax v3.4s, v3.4s, v27.4s
fmax v4.4s, v4.4s, v27.4s
fmax v5.4s, v5.4s, v27.4s
fmax v6.4s, v6.4s, v27.4s
fmax v7.4s, v7.4s, v27.4s
fmax v8.4s, v8.4s, v27.4s
fmax v9.4s, v9.4s, v27.4s
fmax v10.4s, v10.4s, v27.4s
fmax v11.4s, v11.4s, v27.4s
fmax v12.4s, v12.4s, v27.4s
fmax v13.4s, v13.4s, v27.4s
fmax v14.4s, v14.4s, v27.4s
fmax v15.4s, v15.4s, v27.4s
Write16:
st1 {v0.4s}, [x3], x9
st1 {v1.4s}, [x3], x9
st1 {v2.4s}, [x3], x9
st1 {v3.4s}, [x3], x9
st1 {v4.4s}, [x3], x9
st1 {v5.4s}, [x3], x9
st1 {v6.4s}, [x3], x9
st1 {v7.4s}, [x3], x9
st1 {v8.4s}, [x3], x9
st1 {v9.4s}, [x3], x9
st1 {v10.4s}, [x3], x9
st1 {v11.4s}, [x3], x9
st1 {v12.4s}, [x3], x9
st1 {v13.4s}, [x3], x9
st1 {v14.4s}, [x3], x9
st1 {v15.4s}, [x3], x9
add x23, x23, x19
sub x24, x24, #16
cmp x24, #0
ble LoopWEnd
cmp x24, #8
blt LoopW
cmp x24, #16
bge LoopW16
LoopW8:
mov x19, #8
mul x19, x19, x11
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v2.16b, v24.16b
mov v3.16b, v24.16b
mov v4.16b, v24.16b
mov v5.16b, v24.16b
mov v6.16b, v24.16b
mov v7.16b, v24.16b
LoopKh8:
mov x18, x7
mov x21, x16
LoopKw8:
mov x22, x21
ld1 {v25.4s}, [x17], #16
ld1 {v16.4s}, [x22], x11
ld1 {v17.4s}, [x22], x11
fmla v0.4s, v16.4s, v25.4s
fmla v1.4s, v17.4s, v25.4s
ld1 {v18.4s}, [x22], x11
ld1 {v19.4s}, [x22], x11
fmla v2.4s, v18.4s, v25.4s
fmla v3.4s, v19.4s, v25.4s
ld1 {v20.4s}, [x22], x11
ld1 {v21.4s}, [x22], x11
fmla v4.4s, v20.4s, v25.4s
fmla v5.4s, v21.4s, v25.4s
ld1 {v22.4s}, [x22], x11
ld1 {v23.4s}, [x22], x11
fmla v6.4s, v22.4s, v25.4s
fmla v7.4s, v23.4s, v25.4s
subs x18, x18, #1
add x21, x21, x13
bne LoopKw8
add x16, x16, x12
subs x20, x20, #1
bne LoopKh8
cbnz x15, Relu68
cbnz x14, Relu8
b Write8
Relu68:
fmin v0.4s, v0.4s, v26.4s
fmin v1.4s, v1.4s, v26.4s
fmin v2.4s, v2.4s, v26.4s
fmin v3.4s, v3.4s, v26.4s
fmin v4.4s, v4.4s, v26.4s
fmin v5.4s, v5.4s, v26.4s
fmin v6.4s, v6.4s, v26.4s
fmin v7.4s, v7.4s, v26.4s
Relu8:
fmax v0.4s, v0.4s, v27.4s
fmax v1.4s, v1.4s, v27.4s
fmax v2.4s, v2.4s, v27.4s
fmax v3.4s, v3.4s, v27.4s
fmax v4.4s, v4.4s, v27.4s
fmax v5.4s, v5.4s, v27.4s
fmax v6.4s, v6.4s, v27.4s
fmax v7.4s, v7.4s, v27.4s
Write8:
st1 {v0.4s}, [x3], x9
st1 {v1.4s}, [x3], x9
st1 {v2.4s}, [x3], x9
st1 {v3.4s}, [x3], x9
st1 {v4.4s}, [x3], x9
st1 {v5.4s}, [x3], x9
st1 {v6.4s}, [x3], x9
st1 {v7.4s}, [x3], x9
add x23, x23, x19
sub x24, x24, #8
cmp x24, #0
ble LoopWEnd
cmp x24, #8
bge LoopW8
LoopW:
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v5.16b
mov v0.16b, v24.16b
LoopKh:
mov x18, x7
mov x22, x16
LoopKw:
ld1 {v1.4s}, [x22], x13
ld1 {v2.4s}, [x17], #16
fmla v0.4s, v1.4s, v2.4s
ld1 {v16.4s}, [x22], x13
ld1 {v25.4s}, [x17], #16
fmla v0.4s, v16.4s, v25.4s
subs x18, x18, #1
bne LoopKw
add x16, x16, x12

@@ -59,17 +273,15 @@ ConvDwFp32Center:
cbnz x14, Relu
b Write
Relu6:
movi v4.4s, #6
scvtf v4.4s, v4.4s
fmin v0.4s, v0.4s, v4.4s
fmin v0.4s, v0.4s, v26.4s
Relu:
dup v3.4s, wzr
fmax v0.4s, v0.4s, v3.4s
fmax v0.4s, v0.4s, v27.4s
Write:
st1 {v0.4s}, [x3], x9
add x23, x23, x11
subs x24, x24, #1
bne LoopW
LoopWEnd:
add x0, x0, x8
add x1, x1, x10
subs x4, x4, #1
@@ -0,0 +1,558 @@
#ifdef __aarch64__

.text
.align 5
.global ConvDwInt8Center
#ifndef __APPLE__
.type ConvDwInt8Center, %function
#endif

// void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, size_t width,
//                       size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
//                       size_t in_kh_step, size_t in_kw_step, int out_multiplier, int left_shift,
//                       int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max);
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
// x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
// #48: out_multiplier, #56: left_shift, #64: right_shift, #72: out_zp, #80: acc_min, #88: acc_max
ConvDwInt8Center:
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should also be preserved
// whereas our coding style does not permit such an amount of parameters
sub sp, sp, #48
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16

ldr x8, [sp]
ldr x9, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]
ldr x13, [sp, #40]

ldr w14, [sp, #56]
dup v26.4s, w14

ldr x15, [sp, #48]
dup v27.4s, w15

ldr w16, [sp, #64]
dup v28.4s, w16

ldr w17, [sp, #72]
dup v29.4s, w17

ldr w18, [sp, #80]
dup v30.4s, w18

ldr w19, [sp, #88]
dup v31.4s, w19

ld1 {v24.4s}, [x3]

LoopH:
mov x23, x1
mov x24, x5
mov x3, x0
cmp x24, #8
blt LoopW
cmp x24, #16
blt LoopW8

LoopW16:
mov x19, #16
mul x19, x19, x11
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v2.16b, v24.16b
mov v3.16b, v24.16b
mov v4.16b, v24.16b
mov v5.16b, v24.16b
mov v6.16b, v24.16b
mov v7.16b, v24.16b
mov v8.16b, v24.16b
mov v9.16b, v24.16b
mov v10.16b, v24.16b
mov v11.16b, v24.16b
mov v12.16b, v24.16b
mov v13.16b, v24.16b
mov v14.16b, v24.16b
mov v15.16b, v24.16b
LoopKh16:
mov x18, x7
mov x21, x16
LoopKw16:
mov x22, x21
ld1 {v25.4h}, [x17], #8
ld1 {v16.4h}, [x22], x11
ld1 {v17.4h}, [x22], x11
smlal v0.4s, v16.4h, v25.4h
smlal v1.4s, v17.4h, v25.4h
ld1 {v18.4h}, [x22], x11
ld1 {v19.4h}, [x22], x11
smlal v2.4s, v18.4h, v25.4h
smlal v3.4s, v19.4h, v25.4h
ld1 {v20.4h}, [x22], x11
ld1 {v21.4h}, [x22], x11
smlal v4.4s, v20.4h, v25.4h
smlal v5.4s, v21.4h, v25.4h
ld1 {v22.4h}, [x22], x11
ld1 {v23.4h}, [x22], x11
smlal v6.4s, v22.4h, v25.4h
smlal v7.4s, v23.4h, v25.4h
ld1 {v16.4h}, [x22], x11
ld1 {v17.4h}, [x22], x11
smlal v8.4s, v16.4h, v25.4h
smlal v9.4s, v17.4h, v25.4h
ld1 {v18.4h}, [x22], x11
ld1 {v19.4h}, [x22], x11
smlal v10.4s, v18.4h, v25.4h
smlal v11.4s, v19.4h, v25.4h
ld1 {v20.4h}, [x22], x11
ld1 {v21.4h}, [x22], x11
smlal v12.4s, v20.4h, v25.4h
smlal v13.4s, v21.4h, v25.4h
ld1 {v22.4h}, [x22], x11
ld1 {v23.4h}, [x22], x11
smlal v14.4s, v22.4h, v25.4h
smlal v15.4s, v23.4h, v25.4h
subs x18, x18, #1
add x21, x21, x13
bne LoopKw16
add x16, x16, x12
subs x20, x20, #1
bne LoopKh16

sqshl v0.4s, v0.4s, v26.4s
sqshl v1.4s, v1.4s, v26.4s
sqshl v2.4s, v2.4s, v26.4s
sqshl v3.4s, v3.4s, v26.4s
sqshl v4.4s, v4.4s, v26.4s
sqshl v5.4s, v5.4s, v26.4s
sqshl v6.4s, v6.4s, v26.4s
sqshl v7.4s, v7.4s, v26.4s
sqshl v8.4s, v8.4s, v26.4s
sqshl v9.4s, v9.4s, v26.4s
sqshl v10.4s, v10.4s, v26.4s
sqshl v11.4s, v11.4s, v26.4s
sqshl v12.4s, v12.4s, v26.4s
sqshl v13.4s, v13.4s, v26.4s
sqshl v14.4s, v14.4s, v26.4s
sqshl v15.4s, v15.4s, v26.4s
sqrdmulh v0.4s, v0.4s, v27.4s
sqrdmulh v1.4s, v1.4s, v27.4s
sqrdmulh v2.4s, v2.4s, v27.4s
sqrdmulh v3.4s, v3.4s, v27.4s
sqrdmulh v4.4s, v4.4s, v27.4s
sqrdmulh v5.4s, v5.4s, v27.4s
sqrdmulh v6.4s, v6.4s, v27.4s
sqrdmulh v7.4s, v7.4s, v27.4s
sqrdmulh v8.4s, v8.4s, v27.4s
sqrdmulh v9.4s, v9.4s, v27.4s
sqrdmulh v10.4s, v10.4s, v27.4s
sqrdmulh v11.4s, v11.4s, v27.4s
sqrdmulh v12.4s, v12.4s, v27.4s
sqrdmulh v13.4s, v13.4s, v27.4s
sqrdmulh v14.4s, v14.4s, v27.4s
sqrdmulh v15.4s, v15.4s, v27.4s
sqrshl v0.4s, v0.4s, v28.4s
sqrshl v1.4s, v1.4s, v28.4s
sqrshl v2.4s, v2.4s, v28.4s
sqrshl v3.4s, v3.4s, v28.4s
sqrshl v4.4s, v4.4s, v28.4s
sqrshl v5.4s, v5.4s, v28.4s
sqrshl v6.4s, v6.4s, v28.4s
sqrshl v7.4s, v7.4s, v28.4s
sqrshl v8.4s, v8.4s, v28.4s
sqrshl v9.4s, v9.4s, v28.4s
sqrshl v10.4s, v10.4s, v28.4s
sqrshl v11.4s, v11.4s, v28.4s
sqrshl v12.4s, v12.4s, v28.4s
sqrshl v13.4s, v13.4s, v28.4s
sqrshl v14.4s, v14.4s, v28.4s
sqrshl v15.4s, v15.4s, v28.4s
add v0.4s, v0.4s, v29.4s
add v1.4s, v1.4s, v29.4s
add v2.4s, v2.4s, v29.4s
add v3.4s, v3.4s, v29.4s
add v4.4s, v4.4s, v29.4s
add v5.4s, v5.4s, v29.4s
add v6.4s, v6.4s, v29.4s
add v7.4s, v7.4s, v29.4s
add v8.4s, v8.4s, v29.4s
add v9.4s, v9.4s, v29.4s
add v10.4s, v10.4s, v29.4s
add v11.4s, v11.4s, v29.4s
add v12.4s, v12.4s, v29.4s
add v13.4s, v13.4s, v29.4s
add v14.4s, v14.4s, v29.4s
add v15.4s, v15.4s, v29.4s
smax v0.4s, v0.4s, v30.4s
smax v1.4s, v1.4s, v30.4s
smax v2.4s, v2.4s, v30.4s
smax v3.4s, v3.4s, v30.4s
smax v4.4s, v4.4s, v30.4s
smax v5.4s, v5.4s, v30.4s
smax v6.4s, v6.4s, v30.4s
smax v7.4s, v7.4s, v30.4s
smax v8.4s, v8.4s, v30.4s
smax v9.4s, v9.4s, v30.4s
smax v10.4s, v10.4s, v30.4s
smax v11.4s, v11.4s, v30.4s
smax v12.4s, v12.4s, v30.4s
smax v13.4s, v13.4s, v30.4s
smax v14.4s, v14.4s, v30.4s
smax v15.4s, v15.4s, v30.4s
smin v0.4s, v0.4s, v31.4s
smin v1.4s, v1.4s, v31.4s
smin v2.4s, v2.4s, v31.4s
smin v3.4s, v3.4s, v31.4s
smin v4.4s, v4.4s, v31.4s
smin v5.4s, v5.4s, v31.4s
smin v6.4s, v6.4s, v31.4s
smin v7.4s, v7.4s, v31.4s
smin v8.4s, v8.4s, v31.4s
smin v9.4s, v9.4s, v31.4s
smin v10.4s, v10.4s, v31.4s
smin v11.4s, v11.4s, v31.4s
smin v12.4s, v12.4s, v31.4s
smin v13.4s, v13.4s, v31.4s
smin v14.4s, v14.4s, v31.4s
smin v15.4s, v15.4s, v31.4s

sqxtn v0.4h, v0.4s
sqxtn v1.4h, v1.4s
sqxtn v2.4h, v2.4s
sqxtn v3.4h, v3.4s
sqxtn v4.4h, v4.4s
sqxtn v5.4h, v5.4s
sqxtn v6.4h, v6.4s
sqxtn v7.4h, v7.4s
sqxtn v8.4h, v8.4s
sqxtn v9.4h, v9.4s
sqxtn v10.4h, v10.4s
sqxtn v11.4h, v11.4s
sqxtn v12.4h, v12.4s
sqxtn v13.4h, v13.4s
sqxtn v14.4h, v14.4s
sqxtn v15.4h, v15.4s
sqxtn v0.8b, v0.8h
sqxtn v1.8b, v1.8h
sqxtn v2.8b, v2.8h
sqxtn v3.8b, v3.8h
sqxtn v4.8b, v4.8h
sqxtn v5.8b, v5.8h
sqxtn v6.8b, v6.8h
sqxtn v7.8b, v7.8h
sqxtn v8.8b, v8.8h
sqxtn v9.8b, v9.8h
sqxtn v10.8b, v10.8h
sqxtn v11.8b, v11.8h
sqxtn v12.8b, v12.8h
sqxtn v13.8b, v13.8h
sqxtn v14.8b, v14.8h
sqxtn v15.8b, v15.8h

add x17, x3, #1
add x18, x3, #2
add x21, x3, #3
st1 {v0.b}[0], [x3], x9
st1 {v0.b}[1], [x17], x9
st1 {v0.b}[2], [x18], x9
st1 {v0.b}[3], [x21], x9

st1 {v1.b}[0], [x3], x9
st1 {v1.b}[1], [x17], x9
st1 {v1.b}[2], [x18], x9
st1 {v1.b}[3], [x21], x9

st1 {v2.b}[0], [x3], x9
st1 {v2.b}[1], [x17], x9
st1 {v2.b}[2], [x18], x9
st1 {v2.b}[3], [x21], x9

st1 {v3.b}[0], [x3], x9
st1 {v3.b}[1], [x17], x9
st1 {v3.b}[2], [x18], x9
st1 {v3.b}[3], [x21], x9

st1 {v4.b}[0], [x3], x9
st1 {v4.b}[1], [x17], x9
st1 {v4.b}[2], [x18], x9
st1 {v4.b}[3], [x21], x9

st1 {v5.b}[0], [x3], x9
st1 {v5.b}[1], [x17], x9
st1 {v5.b}[2], [x18], x9
st1 {v5.b}[3], [x21], x9

st1 {v6.b}[0], [x3], x9
st1 {v6.b}[1], [x17], x9
st1 {v6.b}[2], [x18], x9
st1 {v6.b}[3], [x21], x9

st1 {v7.b}[0], [x3], x9
st1 {v7.b}[1], [x17], x9
st1 {v7.b}[2], [x18], x9
st1 {v7.b}[3], [x21], x9

st1 {v8.b}[0], [x3], x9
st1 {v8.b}[1], [x17], x9
st1 {v8.b}[2], [x18], x9
st1 {v8.b}[3], [x21], x9

st1 {v9.b}[0], [x3], x9
st1 {v9.b}[1], [x17], x9
st1 {v9.b}[2], [x18], x9
st1 {v9.b}[3], [x21], x9

st1 {v10.b}[0], [x3], x9
st1 {v10.b}[1], [x17], x9
st1 {v10.b}[2], [x18], x9
st1 {v10.b}[3], [x21], x9

st1 {v11.b}[0], [x3], x9
st1 {v11.b}[1], [x17], x9
st1 {v11.b}[2], [x18], x9
st1 {v11.b}[3], [x21], x9

st1 {v12.b}[0], [x3], x9
st1 {v12.b}[1], [x17], x9
st1 {v12.b}[2], [x18], x9
st1 {v12.b}[3], [x21], x9

st1 {v13.b}[0], [x3], x9
st1 {v13.b}[1], [x17], x9
st1 {v13.b}[2], [x18], x9
st1 {v13.b}[3], [x21], x9

st1 {v14.b}[0], [x3], x9
st1 {v14.b}[1], [x17], x9
st1 {v14.b}[2], [x18], x9
st1 {v14.b}[3], [x21], x9

st1 {v15.b}[0], [x3], x9
st1 {v15.b}[1], [x17], x9
st1 {v15.b}[2], [x18], x9
st1 {v15.b}[3], [x21], x9

add x23, x23, x19
sub x24, x24, #16
cmp x24, #0
ble LoopWEnd
cmp x24, #8
blt LoopW
cmp x24, #16
bge LoopW16
LoopW8:
mov x19, #8
mul x19, x19, x11
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v2.16b, v24.16b
mov v3.16b, v24.16b
mov v4.16b, v24.16b
mov v5.16b, v24.16b
mov v6.16b, v24.16b
mov v7.16b, v24.16b
LoopKh8:
mov x18, x7
mov x21, x16
LoopKw8:
mov x22, x21
ld1 {v25.4h}, [x17], #8
ld1 {v16.4h}, [x22], x11
ld1 {v17.4h}, [x22], x11
smlal v0.4s, v16.4h, v25.4h
smlal v1.4s, v17.4h, v25.4h
ld1 {v18.4h}, [x22], x11
ld1 {v19.4h}, [x22], x11
smlal v2.4s, v18.4h, v25.4h
smlal v3.4s, v19.4h, v25.4h
ld1 {v20.4h}, [x22], x11
ld1 {v21.4h}, [x22], x11
smlal v4.4s, v20.4h, v25.4h
smlal v5.4s, v21.4h, v25.4h
ld1 {v22.4h}, [x22], x11
ld1 {v23.4h}, [x22], x11
smlal v6.4s, v22.4h, v25.4h
smlal v7.4s, v23.4h, v25.4h
subs x18, x18, #1
add x21, x21, x13
bne LoopKw8
add x16, x16, x12
subs x20, x20, #1
bne LoopKh8

sqshl v0.4s, v0.4s, v26.4s
sqshl v1.4s, v1.4s, v26.4s
sqshl v2.4s, v2.4s, v26.4s
sqshl v3.4s, v3.4s, v26.4s
sqshl v4.4s, v4.4s, v26.4s
sqshl v5.4s, v5.4s, v26.4s
sqshl v6.4s, v6.4s, v26.4s
sqshl v7.4s, v7.4s, v26.4s
sqrdmulh v0.4s, v0.4s, v27.4s
sqrdmulh v1.4s, v1.4s, v27.4s
sqrdmulh v2.4s, v2.4s, v27.4s
sqrdmulh v3.4s, v3.4s, v27.4s
sqrdmulh v4.4s, v4.4s, v27.4s
sqrdmulh v5.4s, v5.4s, v27.4s
sqrdmulh v6.4s, v6.4s, v27.4s
sqrdmulh v7.4s, v7.4s, v27.4s
sqrshl v0.4s, v0.4s, v28.4s
sqrshl v1.4s, v1.4s, v28.4s
sqrshl v2.4s, v2.4s, v28.4s
sqrshl v3.4s, v3.4s, v28.4s
sqrshl v4.4s, v4.4s, v28.4s
sqrshl v5.4s, v5.4s, v28.4s
sqrshl v6.4s, v6.4s, v28.4s
sqrshl v7.4s, v7.4s, v28.4s
add v0.4s, v0.4s, v29.4s
add v1.4s, v1.4s, v29.4s
add v2.4s, v2.4s, v29.4s
add v3.4s, v3.4s, v29.4s
add v4.4s, v4.4s, v29.4s
add v5.4s, v5.4s, v29.4s
add v6.4s, v6.4s, v29.4s
add v7.4s, v7.4s, v29.4s
smax v0.4s, v0.4s, v30.4s
smax v1.4s, v1.4s, v30.4s
smax v2.4s, v2.4s, v30.4s
smax v3.4s, v3.4s, v30.4s
smax v4.4s, v4.4s, v30.4s
smax v5.4s, v5.4s, v30.4s
smax v6.4s, v6.4s, v30.4s
smax v7.4s, v7.4s, v30.4s
smin v0.4s, v0.4s, v31.4s
smin v1.4s, v1.4s, v31.4s
smin v2.4s, v2.4s, v31.4s
smin v3.4s, v3.4s, v31.4s
smin v4.4s, v4.4s, v31.4s
smin v5.4s, v5.4s, v31.4s
smin v6.4s, v6.4s, v31.4s
smin v7.4s, v7.4s, v31.4s

sqxtn v0.4h, v0.4s
sqxtn v1.4h, v1.4s
sqxtn v2.4h, v2.4s
sqxtn v3.4h, v3.4s
sqxtn v4.4h, v4.4s
sqxtn v5.4h, v5.4s
sqxtn v6.4h, v6.4s
sqxtn v7.4h, v7.4s
sqxtn v0.8b, v0.8h
sqxtn v1.8b, v1.8h
sqxtn v2.8b, v2.8h
sqxtn v3.8b, v3.8h
sqxtn v4.8b, v4.8h
sqxtn v5.8b, v5.8h
sqxtn v6.8b, v6.8h
sqxtn v7.8b, v7.8h

add x17, x3, #1
add x18, x3, #2
add x21, x3, #3
st1 {v0.b}[0], [x3], x9
st1 {v0.b}[1], [x17], x9
st1 {v0.b}[2], [x18], x9
st1 {v0.b}[3], [x21], x9

st1 {v1.b}[0], [x3], x9
st1 {v1.b}[1], [x17], x9
st1 {v1.b}[2], [x18], x9
st1 {v1.b}[3], [x21], x9

st1 {v2.b}[0], [x3], x9
st1 {v2.b}[1], [x17], x9
st1 {v2.b}[2], [x18], x9
st1 {v2.b}[3], [x21], x9

st1 {v3.b}[0], [x3], x9
st1 {v3.b}[1], [x17], x9
st1 {v3.b}[2], [x18], x9
st1 {v3.b}[3], [x21], x9

st1 {v4.b}[0], [x3], x9
st1 {v4.b}[1], [x17], x9
st1 {v4.b}[2], [x18], x9
st1 {v4.b}[3], [x21], x9

st1 {v5.b}[0], [x3], x9
st1 {v5.b}[1], [x17], x9
st1 {v5.b}[2], [x18], x9
st1 {v5.b}[3], [x21], x9

st1 {v6.b}[0], [x3], x9
st1 {v6.b}[1], [x17], x9
st1 {v6.b}[2], [x18], x9
st1 {v6.b}[3], [x21], x9

st1 {v7.b}[0], [x3], x9
st1 {v7.b}[1], [x17], x9
st1 {v7.b}[2], [x18], x9
st1 {v7.b}[3], [x21], x9

add x23, x23, x19
sub x24, x24, #8
cmp x24, #0
ble LoopWEnd
cmp x24, #8
bge LoopW8
LoopW:
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
LoopKh:
mov x18, x7
mov x22, x16
LoopKw:
ld1 {v16.4h}, [x22], x13
ld1 {v25.4h}, [x17], #8
smlal v0.4s, v16.4h, v25.4h
subs x18, x18, #1
bne LoopKw
add x16, x16, x12
subs x20, x20, #1
bne LoopKh

sqshl v0.4s, v0.4s, v26.4s
sqrdmulh v0.4s, v0.4s, v27.4s
sqrshl v0.4s, v0.4s, v28.4s
add v0.4s, v0.4s, v29.4s
smax v0.4s, v0.4s, v30.4s
smin v0.4s, v0.4s, v31.4s

sqxtn v0.4h, v0.4s
sqxtn v0.8b, v0.8h

mov x17, x3
st1 {v0.b}[0], [x17], #1
st1 {v0.b}[1], [x17], #1
st1 {v0.b}[2], [x17], #1
st1 {v0.b}[3], [x17], #1
add x3, x3, x9

add x23, x23, x11
subs x24, x24, #1
bne LoopW
LoopWEnd:
add x0, x0, x8
add x1, x1, x10
subs x4, x4, #1
bne LoopH

sub sp, sp, #48
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ret
#endif
@@ -35,12 +35,12 @@ DeconvDwFp32Center:
mov x18, x15
mov x19, x2
mov x20, x5
dup v0.4s, wzr
ld1 {v1.4s}, [x16], x8
LoopKh:
mov x21, x18
mov x13, x6
LoopKw:
ld1 {v1.4s}, [x16]
ld1 {v0.4s}, [x21]
ld1 {v2.4s}, [x19], #16
fmla v0.4s, v1.4s, v2.4s
st1 {v0.4s}, [x21], x12

@@ -50,7 +50,6 @@ DeconvDwFp32Center:
subs x20, x20, #1
bne LoopKh
add x15, x15, x10
add x16, x16, x8
subs x17, x17, #1
bne LoopW
add x0, x0, x9
@@ -0,0 +1,65 @@
#ifdef __aarch64__

.text
.align 5
.global DeconvDwInt8Center
#ifndef __APPLE__
.type DeconvDwInt8Center, %function
#endif

// void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
//                         size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
// x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
DeconvDwInt8Center:
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should also be preserved
// whereas our coding style does not permit such an amount of parameters
sub sp, sp, #32
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16

ldr x8, [sp]
ldr x9, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]

LoopH:
mov x15, x0
mov x16, x1
mov x17, x4
LoopW:
mov x18, x15
mov x19, x2
mov x20, x5
ld1 {v1.4h}, [x16]
LoopKh:
mov x21, x18
mov x13, x6
LoopKw:
ld1 {v0.4s}, [x21]
ld1 {v2.4h}, [x19], #8
smlal v0.4s, v1.4h, v2.4h
st1 {v0.4s}, [x21], x12
subs x13, x13, #1
bne LoopKw
add x18, x18, x11
subs x20, x20, #1
bne LoopKh
add x15, x15, x10
add x16, x16, x8
subs x17, x17, #1
bne LoopW
add x0, x0, x9
add x1, x1, x7
subs x3, x3, #1
bne LoopH

sub sp, sp, #32
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif
@@ -0,0 +1,294 @@
#ifdef __aarch64__

.text
.align 5
.global ConvDwFp16Center
#ifndef __APPLE__
.type ConvDwFp16Center, %function
#endif

// void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, size_t height, size_t width,
//                       size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
//                       size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
// x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
// x14: relu, x15: relu6
ConvDwFp16Center:
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should also be preserved
// whereas our coding style does not permit such an amount of parameters
sub sp, sp, #48
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16

ldr x8, [sp]
ldr x9, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]
ldr x13, [sp, #40]
ldr x14, [sp, #48]
ldr x15, [sp, #56]

ld1 {v24.8h}, [x3]
movi v26.8h, #0x46, lsl #8
dup v27.4s, wzr

LoopH:
mov x23, x1
mov x24, x5
mov x3, x0
cmp x24, #8
blt LoopW
cmp x24, #16
blt LoopW8

LoopW16:
mov x19, #16
mul x19, x19, x11
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v2.16b, v24.16b
mov v3.16b, v24.16b
mov v4.16b, v24.16b
mov v5.16b, v24.16b
mov v6.16b, v24.16b
mov v7.16b, v24.16b
mov v8.16b, v24.16b
mov v9.16b, v24.16b
mov v10.16b, v24.16b
mov v11.16b, v24.16b
mov v12.16b, v24.16b
mov v13.16b, v24.16b
mov v14.16b, v24.16b
mov v15.16b, v24.16b
LoopKh16:
mov x18, x7
mov x21, x16
LoopKw16:
mov x22, x21
ld1 {v25.8h}, [x17], #16
ld1 {v16.8h}, [x22], x11
ld1 {v17.8h}, [x22], x11
fmla v0.8h, v16.8h, v25.8h
fmla v1.8h, v17.8h, v25.8h
ld1 {v18.8h}, [x22], x11
ld1 {v19.8h}, [x22], x11
fmla v2.8h, v18.8h, v25.8h
fmla v3.8h, v19.8h, v25.8h
ld1 {v20.8h}, [x22], x11
ld1 {v21.8h}, [x22], x11
fmla v4.8h, v20.8h, v25.8h
fmla v5.8h, v21.8h, v25.8h
ld1 {v22.8h}, [x22], x11
ld1 {v23.8h}, [x22], x11
fmla v6.8h, v22.8h, v25.8h
fmla v7.8h, v23.8h, v25.8h
ld1 {v16.8h}, [x22], x11
ld1 {v17.8h}, [x22], x11
fmla v8.8h, v16.8h, v25.8h
fmla v9.8h, v17.8h, v25.8h
ld1 {v18.8h}, [x22], x11
ld1 {v19.8h}, [x22], x11
fmla v10.8h, v18.8h, v25.8h
fmla v11.8h, v19.8h, v25.8h
ld1 {v20.8h}, [x22], x11
ld1 {v21.8h}, [x22], x11
fmla v12.8h, v20.8h, v25.8h
fmla v13.8h, v21.8h, v25.8h
ld1 {v22.8h}, [x22], x11
ld1 {v23.8h}, [x22], x11
fmla v14.8h, v22.8h, v25.8h
fmla v15.8h, v23.8h, v25.8h
subs x18, x18, #1
add x21, x21, x13
bne LoopKw16
add x16, x16, x12
subs x20, x20, #1
bne LoopKh16
cbnz x15, Relu616
cbnz x14, Relu16
b Write16
Relu616:
fmin v0.8h, v0.8h, v26.8h
fmin v1.8h, v1.8h, v26.8h
fmin v2.8h, v2.8h, v26.8h
fmin v3.8h, v3.8h, v26.8h
fmin v4.8h, v4.8h, v26.8h
fmin v5.8h, v5.8h, v26.8h
fmin v6.8h, v6.8h, v26.8h
fmin v7.8h, v7.8h, v26.8h
fmin v8.8h, v8.8h, v26.8h
fmin v9.8h, v9.8h, v26.8h
fmin v10.8h, v10.8h, v26.8h
fmin v11.8h, v11.8h, v26.8h
fmin v12.8h, v12.8h, v26.8h
fmin v13.8h, v13.8h, v26.8h
fmin v14.8h, v14.8h, v26.8h
fmin v15.8h, v15.8h, v26.8h
Relu16:
fmax v0.8h, v0.8h, v27.8h
fmax v1.8h, v1.8h, v27.8h
fmax v2.8h, v2.8h, v27.8h
fmax v3.8h, v3.8h, v27.8h
fmax v4.8h, v4.8h, v27.8h
fmax v5.8h, v5.8h, v27.8h
fmax v6.8h, v6.8h, v27.8h
fmax v7.8h, v7.8h, v27.8h
fmax v8.8h, v8.8h, v27.8h
fmax v9.8h, v9.8h, v27.8h
fmax v10.8h, v10.8h, v27.8h
fmax v11.8h, v11.8h, v27.8h
fmax v12.8h, v12.8h, v27.8h
fmax v13.8h, v13.8h, v27.8h
fmax v14.8h, v14.8h, v27.8h
fmax v15.8h, v15.8h, v27.8h
Write16:
st1 {v0.8h}, [x3], x9
st1 {v1.8h}, [x3], x9
st1 {v2.8h}, [x3], x9
st1 {v3.8h}, [x3], x9
st1 {v4.8h}, [x3], x9
st1 {v5.8h}, [x3], x9
st1 {v6.8h}, [x3], x9
st1 {v7.8h}, [x3], x9
st1 {v8.8h}, [x3], x9
st1 {v9.8h}, [x3], x9
st1 {v10.8h}, [x3], x9
st1 {v11.8h}, [x3], x9
st1 {v12.8h}, [x3], x9
st1 {v13.8h}, [x3], x9
st1 {v14.8h}, [x3], x9
st1 {v15.8h}, [x3], x9
add x23, x23, x19
sub x24, x24, #16
cmp x24, #0
ble LoopWEnd
cmp x24, #8
blt LoopW
cmp x24, #16
bge LoopW16
LoopW8:
mov x19, #8
mul x19, x19, x11
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v2.16b, v24.16b
mov v3.16b, v24.16b
mov v4.16b, v24.16b
mov v5.16b, v24.16b
mov v6.16b, v24.16b
mov v7.16b, v24.16b
LoopKh8:
mov x18, x7
mov x21, x16
LoopKw8:
mov x22, x21
ld1 {v25.8h}, [x17], #16
ld1 {v16.8h}, [x22], x11
ld1 {v17.8h}, [x22], x11
fmla v0.8h, v16.8h, v25.8h
fmla v1.8h, v17.8h, v25.8h
ld1 {v18.8h}, [x22], x11
ld1 {v19.8h}, [x22], x11
fmla v2.8h, v18.8h, v25.8h
fmla v3.8h, v19.8h, v25.8h
ld1 {v20.8h}, [x22], x11
ld1 {v21.8h}, [x22], x11
fmla v4.8h, v20.8h, v25.8h
fmla v5.8h, v21.8h, v25.8h
ld1 {v22.8h}, [x22], x11
ld1 {v23.8h}, [x22], x11
fmla v6.8h, v22.8h, v25.8h
fmla v7.8h, v23.8h, v25.8h
subs x18, x18, #1
add x21, x21, x13
bne LoopKw8
add x16, x16, x12
subs x20, x20, #1
bne LoopKh8
cbnz x15, Relu68
cbnz x14, Relu8
b Write8
Relu68:
fmin v0.8h, v0.8h, v26.8h
fmin v1.8h, v1.8h, v26.8h
fmin v2.8h, v2.8h, v26.8h
fmin v3.8h, v3.8h, v26.8h
fmin v4.8h, v4.8h, v26.8h
fmin v5.8h, v5.8h, v26.8h
fmin v6.8h, v6.8h, v26.8h
fmin v7.8h, v7.8h, v26.8h
Relu8:
fmax v0.8h, v0.8h, v27.8h
fmax v1.8h, v1.8h, v27.8h
fmax v2.8h, v2.8h, v27.8h
fmax v3.8h, v3.8h, v27.8h
fmax v4.8h, v4.8h, v27.8h
fmax v5.8h, v5.8h, v27.8h
fmax v6.8h, v6.8h, v27.8h
fmax v7.8h, v7.8h, v27.8h
Write8:
st1 {v0.8h}, [x3], x9
st1 {v1.8h}, [x3], x9
st1 {v2.8h}, [x3], x9
st1 {v3.8h}, [x3], x9
st1 {v4.8h}, [x3], x9
st1 {v5.8h}, [x3], x9
st1 {v6.8h}, [x3], x9
st1 {v7.8h}, [x3], x9
add x23, x23, x19
sub x24, x24, #8
cmp x24, #0
ble LoopWEnd
cmp x24, #8
bge LoopW8
LoopW:
mov x16, x23
mov x17, x2
mov x20, x6
mov v0.16b, v24.16b
LoopKh:
mov x18, x7
mov x22, x16
LoopKw:
ld1 {v16.8h}, [x22], x13
ld1 {v25.8h}, [x17], #16
fmla v0.8h, v16.8h, v25.8h
subs x18, x18, #1
bne LoopKw
add x16, x16, x12
subs x20, x20, #1
bne LoopKh
cbnz x15, Relu6
cbnz x14, Relu
b Write
Relu6:
fmin v0.8h, v0.8h, v26.8h
Relu:
fmax v0.8h, v0.8h, v27.8h
Write:
st1 {v0.8h}, [x3], x9
add x23, x23, x11
subs x24, x24, #1
bne LoopW
LoopWEnd:
add x0, x0, x8
add x1, x1, x10
subs x4, x4, #1
bne LoopH

sub sp, sp, #48
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ret
#endif
@@ -0,0 +1,64 @@
#ifdef __aarch64__

.text
.align 5
.global DeconvDwFp16Center
#ifndef __APPLE__
.type DeconvDwFp16Center, %function
#endif

// void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
//                         size_t in_kh_step, size_t in_kw_step);
// x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
// x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
DeconvDwFp16Center:
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should also be preserved
// whereas our coding style does not permit such an amount of parameters
sub sp, sp, #32
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16

ldr x8, [sp]
ldr x9, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]

LoopH:
mov x15, x0
mov x16, x1
mov x17, x4
LoopW:
mov x18, x15
mov x19, x2
mov x20, x5
ld1 {v1.8h}, [x16], x8
LoopKh:
mov x21, x18
mov x13, x6
LoopKw:
ld1 {v0.8h}, [x21]
ld1 {v2.8h}, [x19], #16
fmla v0.8h, v1.8h, v2.8h
st1 {v0.8h}, [x21], x12
subs x13, x13, #1
bne LoopKw
add x18, x18, x11
subs x20, x20, #1
bne LoopKh
add x15, x15, x10
subs x17, x17, #1
bne LoopW
add x0, x0, x9
add x1, x1, x7
subs x3, x3, #1
bne LoopH

sub sp, sp, #32
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif
@@ -0,0 +1,44 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "src/runtime/kernel/arm/opclib/op_base.h"
#include "src/runtime/kernel/arm/opclib/conv_parameter.h"

#ifdef __cplusplus
extern "C" {
#endif

#ifdef ENABLE_ARM64
void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
                      size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
                      size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step,
                      size_t in_kw_step, size_t relu, size_t relu6);
void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
                        size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
                        size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
#endif

#ifdef __cplusplus
}
#endif

#endif /* MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_ */
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
#include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h"
|
||||
#include <arm_neon.h>
|
||||
#include "src/runtime/kernel/arm/opclib/fp16/common_func.h"
|
||||
|
||||
/*conv depthwise fp16 begin*/
|
||||
void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
|
||||
|
@ -79,6 +80,7 @@ void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *
|
|||
} // height loop
|
||||
}
|
||||
|
||||
#ifndef ENABLE_ARM64
|
||||
void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
|
||||
int height, int width, int kernel_h, int kernel_w, int out_h_step, int block_channel,
|
||||
int in_sh_step, int in_sw_step, int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6) {
|
||||
|
@ -97,12 +99,17 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *
|
|||
const float16_t *src_kw = src_kh;
|
||||
const float16_t *weight_kw = weight_kh;
|
||||
for (int kw = 0; kw < kernel_w; kw++) {
|
||||
#ifdef ENABLE_ARM64
|
||||
float16x8_t src_8 = vld1q_f16(src_kw);
|
||||
float16x8_t weight_8 = vld1q_f16(weight_kw);
|
||||
float16x8_t dst_8 = vld1q_f16(dst_w);
|
||||
dst_8 = vfmaq_f16(dst_8, src_8, weight_8);
|
||||
vst1q_f16(dst_w, dst_8);
|
||||
|
||||
#else
|
||||
for (int c = 0; c < C8NUM; c++) {
|
||||
dst_w[c] += src_kw[c] * weight_kw[c];
|
||||
}
|
||||
#endif
|
||||
src_kw += in_kw_step;
|
||||
weight_kw += C8NUM;
|
||||
} // kernel_w loop
|
||||
|
@@ -122,6 +129,7 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *
    src_h += in_sh_step;
  } // dst_height loop
}
#endif

// conv depthwise fp16: sliding window
void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,

@@ -149,11 +157,19 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
      int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
      const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
      float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;

#ifdef ENABLE_ARM64
      ConvDwFp16Center(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_,
                       sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
                       sliding->out_h_step_ * sizeof(float16_t), sliding->block_channel_ * sizeof(float16_t),
                       sliding->in_sh_step_ * sizeof(float16_t), sliding->in_sw_step_ * sizeof(float16_t),
                       sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t),
                       conv_param->is_relu_, conv_param->is_relu6_);
#else
      DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_,
                          sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
                          sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_,
                          sliding->in_kh_step_, sliding->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
#endif
    }
  } // output C8 loop
  src += sliding->in_step_;
@@ -214,6 +230,7 @@ void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float
  } // height loop
}

#ifndef ENABLE_ARM64
void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height, int width,
                               int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step,
                               int in_sw_step, int in_kh_step, int in_kw_step) {

@@ -229,12 +246,17 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float
        float16_t *dst_kw = dst_kh;
        const float16_t *weight_kw = weight_kh;
        for (int kw = 0; kw < kernel_w; kw++) {
#ifdef ENABLE_ARM64
          float16x8_t src_8 = vld1q_f16(src_w);
          float16x8_t weight_8 = vld1q_f16(weight_kw);
          float16x8_t dst_8 = vld1q_f16(dst_kw);
          dst_8 = vfmaq_f16(dst_8, src_8, weight_8);
          vst1q_f16(dst_kw, dst_8);

#else
          for (int c = 0; c < C8NUM; c++) {
            dst_kw[c] += src_w[c] * weight_kw[c];
          }
#endif
          dst_kw += in_kw_step;
          weight_kw += C8NUM;
        } // kernel_w loop

@@ -248,6 +270,7 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float
    src_h += out_h_step;
  } // dst_height loop
}
#endif

void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel,
                                 const ConvParameter *conv_param) {

@@ -289,11 +312,18 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f
      float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_;
      const float16_t *in_t =
        src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;

#ifdef ENABLE_ARM64
      DeconvDwFp16Center(out_t, in_t, weight, sliding->bottom_ - sliding->top_,
                         sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
                         sliding->out_h_step_ * sizeof(float16_t), sliding->block_channel_ * sizeof(float16_t),
                         sliding->in_sh_step_ * sizeof(float16_t), sliding->in_sw_step_ * sizeof(float16_t),
                         sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t));
#else
      DeconvDepthwiseCenterFp16(out_t, in_t, weight, sliding->bottom_ - sliding->top_,
                                sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
                                sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_,
                                sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_);
#endif
    }
    DeconvDepthwisePostFuncFp16(dst_data, bias, sliding->block_channel_, conv_param);
  } // output C8 loop
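
ConvDwC8Fp16 and DeconvDwC8Fp16 follow the sliding-window split visible above: border rows and columns go through the Border functions, which clip the kernel window, while the Center kernels run only where the whole receptive field lies inside the input and therefore need no bounds checks. A minimal sketch of how such a no-check output range can be derived along one axis (the arithmetic only; the actual SlidingWindowParam fields are not reproduced here):

#include <stdio.h>

static int ceil_div(int a, int b) { return (a + b - 1) / b; }

/* Compute the bounds-check-free output range [begin, end) along one axis:
   oh * stride - pad >= 0          gives the first "center" index,
   oh * stride - pad + kernel <= in gives the last one. */
static void center_range(int out_size, int in_size, int kernel, int stride, int pad,
                         int *begin, int *end) {
  int b = ceil_div(pad, stride);
  int e = (in_size + pad - kernel) / stride + 1;
  if (b < 0) b = 0;
  if (e > out_size) e = out_size;
  if (e < b) e = b; /* empty center region: everything is border */
  *begin = b;
  *end = e;
}

int main(void) {
  int top, bottom;
  /* 3x3 depthwise kernel, stride 1, pad 1, 8 input rows -> 8 output rows */
  center_range(8, 8, 3, 1, 1, &top, &bottom);
  printf("center rows: [%d, %d)\n", top, bottom); /* expect [1, 7) */
  return 0;
}
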
@@ -38,6 +38,15 @@ void MatrixSub(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stri
void MatrixMultiAdd(float *c11, float *c12, float *c21, float *c22, float *x_ptr, size_t row, size_t col,
                    size_t c_stride, size_t x_stride);

#ifdef ENABLE_ARM
void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
                      size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
                      size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
                        size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
                        size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
#endif

#ifdef ENABLE_ARM64
void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size);
void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size);

@@ -49,12 +58,6 @@ void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc,
void C4BiasAddRelu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride);
void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
                      size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
                      size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
                        size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
                        size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
#endif

#ifdef __cplusplus
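
With this hunk the ConvDwFp32Center and DeconvDwFp32Center prototypes move from the ENABLE_ARM64-only block into the shared ENABLE_ARM block, and the trailing size_t relu / relu6 arguments select the fused activation. A plain-C reference of what those two flags mean for each output value (a sketch of the semantics, not the assembly kernel itself):

#include <stdio.h>

/* Apply the activation selected by the relu / relu6 flags to one output value. */
static float apply_activation(float v, size_t relu, size_t relu6) {
  if (relu6 != 0) {
    v = v > 6.0f ? 6.0f : v; /* cap at 6, then fall through to the lower clamp */
  }
  if (relu != 0 || relu6 != 0) {
    v = v < 0.0f ? 0.0f : v; /* clamp negatives to zero */
  }
  return v;
}

int main(void) {
  printf("%.1f %.1f %.1f\n",
         apply_activation(-1.5f, 1, 0),  /* relu  -> 0.0 */
         apply_activation(7.5f, 0, 1),   /* relu6 -> 6.0 */
         apply_activation(7.5f, 0, 0));  /* none  -> 7.5 */
  return 0;
}
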
@@ -0,0 +1,62 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "src/runtime/kernel/arm/opclib/op_base.h"
#include "src/runtime/kernel/arm/opclib/conv_parameter.h"

#ifdef __cplusplus
extern "C" {
#endif

#ifdef ENABLE_ARM
void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8,
                               size_t oc4, size_t offset);

#ifdef ENABLE_ARM64
void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize,
                          size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
                          size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before,
                          size_t shift_after);
#elif defined(ENABLE_ARM32)
void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize,
                          size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
                          size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before,
                          size_t shift_after);
#endif
#endif

#ifdef ENABLE_ARM
void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
                        size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
                        size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height,
                      size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
                      size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int out_multiplier,
                      int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max);
#endif

#ifdef __cplusplus
}
#endif

#endif /* MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_ */
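
ConvDwInt8Center ends with the requantization constants (out_multiplier, left_shift, right_shift, out_zp, acc_min, acc_max). A minimal sketch of the gemmlowp-style fixed-point requantization such parameters usually drive, assuming the convention that right_shift is a non-negative bit count; the helper names below are illustrative, not the ones in opclib's fixed_point.h:

#include <stdint.h>
#include <stdio.h>

/* Rounding-doubling high multiply: round((a * b) / 2^31), saturating the one overflow case. */
static int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN) {
    return INT32_MAX;
  }
  int64_t ab = (int64_t)a * (int64_t)b;
  int64_t nudge = ab >= 0 ? (1LL << 30) : (1 - (1LL << 30));
  return (int32_t)((ab + nudge) >> 31);
}

/* Rounding arithmetic shift right by `exponent` bits (0 <= exponent <= 31). */
static int32_t rounding_divide_by_pot(int32_t x, int exponent) {
  int32_t mask = (int32_t)((1LL << exponent) - 1);
  int32_t remainder = x & mask;
  int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

/* Requantize one int32 accumulator to an int8 output value. */
static int8_t requantize(int32_t acc, int32_t out_multiplier, int left_shift, int right_shift,
                         int32_t out_zp, int32_t acc_min, int32_t acc_max) {
  int32_t v = sat_rounding_doubling_high_mul(acc * (1 << left_shift), out_multiplier);
  v = rounding_divide_by_pot(v, right_shift) + out_zp;
  if (v < acc_min) v = acc_min;
  if (v > acc_max) v = acc_max;
  return (int8_t)v;
}

int main(void) {
  /* multiplier 2^30 encodes 0.5 in Q31; with right_shift = 1 the total scale is 0.25 */
  printf("%d\n", requantize(100, 1073741824, 0, 1, 5, -128, 127)); /* 100 * 0.25 + 5 = 30 */
  return 0;
}
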
@@ -17,6 +17,7 @@
#include "src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.h"
#include <string.h>
#include "src/runtime/kernel/arm/opclib/quantization/fixed_point.h"
#include "src/runtime/kernel/arm/opclib/int8/common_func.h"

/*conv depthwise int8 begin*/
void DepthwiseBorderPixelInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height,

@@ -85,6 +86,7 @@ void DepthwiseBorderInt8(int8_t *dst, const int16_t *src, const int16_t *weight,
  } // height loop
}

#ifndef ENABLE_ARM64
void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height,
                         int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step,
                         int in_sw_step, int in_kh_step, int in_kw_step, int out_multiplier, int left_shift,

@@ -133,6 +135,7 @@ void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight,
    src_h += in_sh_step;
  } // dst_height loop
}
#endif

void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
                const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) {

@@ -158,7 +161,17 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
      int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
      const int16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * C4NUM;
      int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * C4NUM;

#ifdef ENABLE_ARM64
      ConvDwInt8Center(
        out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_,
        conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(int8_t),
        sliding->block_channel_ * sizeof(int8_t), sliding->in_sh_step_ * sizeof(int16_t),
        sliding->in_sw_step_ * sizeof(int16_t), sliding->in_kh_step_ * sizeof(int16_t),
        sliding->in_kw_step_ * sizeof(int16_t), conv_param->conv_quant_arg_.quant_multiplier_[0],
        conv_param->conv_quant_arg_.left_shift_[0], conv_param->conv_quant_arg_.right_shift_[0],
        conv_param->conv_quant_arg_.quant_args_[2][0].zp_, conv_param->conv_quant_arg_.out_act_min_[0],
        conv_param->conv_quant_arg_.out_act_max_[0]);
#else
      DepthwiseCenterInt8(
        out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_,
        conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_,

@@ -166,6 +179,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
        conv_param->conv_quant_arg_.quant_multiplier_[0], conv_param->conv_quant_arg_.left_shift_[0],
        conv_param->conv_quant_arg_.right_shift_[0], conv_param->conv_quant_arg_.quant_args_[2][0].zp_,
        conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0]);
#endif
    }
  } // output C4 loop
  src += sliding->in_step_;
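
The quant_multiplier_, left_shift_, right_shift_ and zp_ values passed above come straight from conv_quant_arg_, and out_act_min_ / out_act_max_ carry the fused activation as a clamp in the quantized domain. A small sketch of how such int8 bounds are commonly derived for none / ReLU / ReLU6 from the output scale and zero point (a typical scheme, not necessarily the exact opclib computation):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

static int32_t clamp_i32(int32_t v, int32_t lo, int32_t hi) { return v < lo ? lo : (v > hi ? hi : v); }

/* Derive int8 activation bounds for none / relu / relu6 given output scale and zero point. */
static void act_bounds_int8(int is_relu, int is_relu6, float out_scale, int32_t out_zp,
                            int32_t *act_min, int32_t *act_max) {
  *act_min = -128;
  *act_max = 127;
  if (is_relu || is_relu6) {
    *act_min = clamp_i32(out_zp, -128, 127); /* real value 0 maps to the zero point */
  }
  if (is_relu6) {
    *act_max = clamp_i32(out_zp + (int32_t)lroundf(6.0f / out_scale), -128, 127); /* real value 6 */
  }
}

int main(void) {
  int32_t lo, hi;
  act_bounds_int8(0, 1, 0.1f, -10, &lo, &hi);  /* ReLU6, scale 0.1, zero point -10 */
  printf("act_min=%d act_max=%d\n", lo, hi);   /* expect -10 and 50 */
  return 0;
}
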
@@ -222,6 +236,7 @@ void DeconvDepthwiseBorderInt8(int32_t *dst, const int16_t *src, const int16_t *
  } // height loop
}

#ifndef ENABLE_ARM64
void DeconvDepthwiseCenterInt8(int32_t *dst, const int16_t *src, const int16_t *weight, int height, int width,
                               int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step,
                               int in_sw_step, int in_kh_step, int in_kw_step) {

@@ -253,6 +268,7 @@ void DeconvDepthwiseCenterInt8(int32_t *dst, const int16_t *src, const int16_t *
    src_h += out_h_step;
  } // dst_height loop
}
#endif

void DeconvDepthwisePostFuncInt8(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel,
                                 const ConvParameter *conv_param, int out_multiplier, int left_shift, int right_shift,

@@ -302,11 +318,18 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in
      int32_t *out_t = output_buffer + oh_h_start * sliding->in_h_step_ + oh_w_start * C4NUM;
      const int16_t *in_t =
        src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;

#ifdef ENABLE_ARM64
      DeconvDwInt8Center(out_t, in_t, weight, sliding->bottom_ - sliding->top_,
                         sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
                         sliding->out_h_step_ * sizeof(int16_t), sliding->block_channel_ * sizeof(int16_t),
                         sliding->in_sh_step_ * sizeof(int32_t), sliding->in_sw_step_ * sizeof(int32_t),
                         sliding->in_kh_step_ * sizeof(int32_t), sliding->in_kw_step_ * sizeof(int32_t));
#else
      DeconvDepthwiseCenterInt8(out_t, in_t, weight, sliding->bottom_ - sliding->top_,
                                sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
                                sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_,
                                sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_);
#endif
    }
    DeconvDepthwisePostFuncInt8(
      dst_data, output_buffer, bias, sliding->block_channel_, conv_param,
@@ -17,25 +17,7 @@
#include "src/runtime/kernel/arm/opclib/int8/conv_int8.h"
#include <string.h>
#include "src/runtime/kernel/arm/opclib/winograd_transform.h"

extern "C" {
#ifdef ENABLE_ARM
void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8,
                               size_t oc4, size_t offset);

#ifdef ENABLE_ARM64
void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize,
                          size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
                          size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before,
                          size_t shift_after);
#elif defined(ENABLE_ARM32)
void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize,
                          size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
                          size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before,
                          size_t shift_after);
#endif
#endif
}
#include "src/runtime/kernel/arm/opclib/int8/common_func.h"

void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
                      int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum,