fix compile bugs on arm32 and add BiasAdd/Relu/Dw arm64 kernels

This commit is contained in:
lixian 2020-07-31 21:07:33 +08:00
parent b6726e4a69
commit e52aa2b12f
12 changed files with 1019 additions and 118 deletions

View File

@ -1,4 +1,5 @@
#ifdef __aarch64__
#ifdef __arm__
#ifndef __aarch64__
.text
.align 5
@ -10,15 +11,16 @@
// void IndirectGemmFp32_8x4(float *output, float *input, float *weight, float *bias,
// size_t kSize, size_t ic4, size_t oc8, size_t offset, size_t mode, size_t writeC4, size_t relu, size_t relu6);
// r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset
// r8:mode, r10: writeMode, x10: relu, r10:relu6
// r8: mode; writeMode, relu and relu6 are stack arguments, each loaded into r10 when needed
// mode = 0 for general convolution, where one conv unit is a row
// mode = 1 for winograd/common gemm, where the total channels of one input is a row
IndirectGemmFp32_8x4:
.macro INIT_BIAS
veor q10, q10, q10
cbz x3, InitBias
vld1.32 q10, [x3]
cmp r3, #0
beq InitBias
vld1.32 q10, [r3]
InitBias:
vmov q11, q10
vmov q12, q10
@ -27,10 +29,11 @@ IndirectGemmFp32_8x4:
vmov q15, q10
.endm
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// r19 ~ r29 should be also preserved
// whereas our coding style do not permit such amount of parameters
// at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
// clang's rule seems more simple, though there are no subroutine calls here
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
push {r4-r8, r10, r11, lr}
vpush {q4-q7}
add sp, sp, #160
@ -41,7 +44,8 @@ IndirectGemmFp32_8x4:
ldr r7, [sp, #12]
ldr r8, [sp, #16]
cbnz r8, LoopOc
cmp r8, #0
bne LoopOc
// step is one for common convolution, where ic8 should multiply by kernel size
// step is (a+b-1) for F(a,b) in winograd
mul r5, r4, r5
@ -57,17 +61,18 @@ IndirectGemmFp32_8x4:
INIT_BIAS
// load input for output 1-2
vld1.32 {q0, q1, q2, q3}, [x12]!
vld1.32 {q0, q1}, [r12]!
vld1.32 {q2, q3}, [r12]!
// load weight
vld1.32 {q4, q5}, [x2]!
vld1.32 {q4, q5}, [r2]!
// step for output 1-2
vmul.f32 q8, q4, d0[0]
vmul.f32 q9, q4, d2[0]
vmla.f32 q8, q5, d0[1]
vmla.f32 q9, q5, d2[1]
vld1.32 {q6, q7}, [x2]!
vld1.32 {q6, q7}, [r2]!
subs x10, x5, #1
subs r10, r5, #1
beq LoopIcEnd
LoopIc:
@ -146,9 +151,11 @@ IndirectGemmFp32_8x4:
vmla.f32 q15, q7, d7[1]
ldr r10, [sp, #28]
cbnz r10, Relu6
cmp r10, #0
bne Relu6
ldr r10, [sp, #24]
cbnz x10, Relu
cmp r10, #0
bne Relu
b WriteStart
Relu6:
vmov.i32 q14, #6
@ -174,7 +181,8 @@ IndirectGemmFp32_8x4:
WriteStart:
ldr r10, [sp, #20]
cbnz x10, WriteC4
cmp r10, #0
bne WriteC4
cmp r6, #1
beq Write1
cmp r6, #2
@ -183,97 +191,97 @@ IndirectGemmFp32_8x4:
beq Write3
b Write4
Write1:
str s0, [r11]
add r11, r11, x7
str s4, [r11]
add r11, r11, x7
str s8, [r11]
add r11, r11, x7
str s12, [r11]
add r11, r11, x7
str s16, [r11]
add r11, r11, x7
str s20, [r11]
add r11, r11, x7
str s24, [r11]
add r11, r11, x7
str s28, [r11]
vst1.32 d0[0], [r11]
add r11, r11, r7
vst1.32 d2[0], [r11]
add r11, r11, r7
vst1.32 d4[0], [r11]
add r11, r11, r7
vst1.32 d6[0], [r11]
add r11, r11, r7
vst1.32 d8[0], [r11]
add r11, r11, r7
vst1.32 d10[0], [r11]
add r11, r11, r7
vst1.32 d12[0], [r11]
add r11, r11, r7
vst1.32 d14[0], [r11]
add r0, r0, #4
b WriteEnd
Write2:
str d0, [r11]
add r11, r11, x7
str d2, [r11]
add r11, r11, x7
str d4, [r11]
add r11, r11, x7
str d6, [r11]
add r11, r11, x7
str d8, [r11]
add r11, r11, x7
str d10, [r11]
add r11, r11, x7
str d12, [r11]
add r11, r11, x7
str d14, [r11]
vst1.32 d0, [r11]
add r11, r11, r7
vst1.32 d2, [r11]
add r11, r11, r7
vst1.32 d4, [r11]
add r11, r11, r7
vst1.32 d6, [r11]
add r11, r11, r7
vst1.32 d8, [r11]
add r11, r11, r7
vst1.32 d10, [r11]
add r11, r11, r7
vst1.32 d12, [r11]
add r11, r11, r7
vst1.32 d14, [r11]
add r0, r0, #8
b WriteEnd
Write3:
add r12, r11, #8
str d0, [r11]
add r11, r11, x7
str s2, [r12]
vst1.32 d0, [r11]
add r11, r11, r7
vst1.32 d1[0], [r12]
add r12, r12, r7
str d2, [r11]
add r11, r11, x7
str s6, [r12]
vst1.32 d2, [r11]
add r11, r11, r7
vst1.32 d3[0], [r12]
add r12, r12, r7
str d4, [r11]
add r11, r11, x7
str s10, [r12]
vst1.32 d4, [r11]
add r11, r11, r7
vst1.32 d5[0], [r12]
add r12, r12, r7
str d6, [r11]
add r11, r11, x7
str s14, [r12]
vst1.32 d6, [r11]
add r11, r11, r7
vst1.32 d7[0], [r12]
add r12, r12, r7
str d8, [r11]
add r11, r11, x7
str s18, [r12]
vst1.32 d8, [r11]
add r11, r11, r7
vst1.32 d9[0], [r12]
add r12, r12, r7
str d10, [r11]
add r11, r11, x7
str s22, [r12]
vst1.32 d10, [r11]
add r11, r11, r7
vst1.32 d11[0], [r12]
add r12, r12, r7
str d12, [r11]
add r11, r11, x7
str s26, [r12]
vst1.32 d12, [r11]
add r11, r11, r7
vst1.32 d13[0], [r12]
add r12, r12, r7
str d14, [r11]
str s30, [r12]
vst1.32 d14, [r11]
vst1.32 d15[0], [r12]
add r0, r0, #12
b WriteEnd
WriteC4:
vst1.32 q0, [r11], x7
vst1.32 q1, [r11], x7
vst1.32 q2, [r11], x7
vst1.32 q3, [r11], x7
vst1.32 q4, [r11], x7
vst1.32 q5, [r11], x7
vst1.32 q6, [r11], x7
vst1.32 q0, [r11], r7
vst1.32 q1, [r11], r7
vst1.32 q2, [r11], r7
vst1.32 q3, [r11], r7
vst1.32 q4, [r11], r7
vst1.32 q5, [r11], r7
vst1.32 q6, [r11], r7
vst1.32 q7, [r11]
add r0, r0, #16
b WriteEnd
Write4:
// prefetching is not preferred while writing results in spite of cache misses
// you could try prfm pstl2strm
// you could try prfm pstl2strm
// there are almost no benefits observed though
vst1.32 q0, [r11], x7
vst1.32 q1, [r11], x7
vst1.32 q2, [r11], x7
vst1.32 q3, [r11], x7
vst1.32 q4, [r11], x7
vst1.32 q5, [r11], x7
vst1.32 q6, [r11], x7
vst1.32 q0, [r11], r7
vst1.32 q1, [r11], r7
vst1.32 q2, [r11], r7
vst1.32 q3, [r11], r7
vst1.32 q4, [r11], r7
vst1.32 q5, [r11], r7
vst1.32 q6, [r11], r7
vst1.32 q7, [r11]
add r0, r0, #16
@ -283,7 +291,8 @@ IndirectGemmFp32_8x4:
bne LoopKsize
subs r6, r6, #4
cbz r3, NoStepFowrard
cmp r3, #0
beq NoStepFowrard
add r3, r3, #16
NoStepFowrard:
bgt LoopOc
@ -292,3 +301,4 @@ IndirectGemmFp32_8x4:
vpop {q4-q7}
pop {r4-r8, r10, r11, pc}
#endif
#endif

View File

@ -1,4 +1,5 @@
#ifdef __aarch64__
#ifdef __arm__
#ifndef __aarch64__
.text
.align 5
@ -10,8 +11,8 @@
// void IndirectGemmInt8_2x4(int8_t *output, int8_t *input, int8_t *weight, int32_t *bias, size_t ksize, size_t ic4,
// size_t oc, size_t offset, int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp, size_t out_multiplier,
// size_t shift_before, size_t shift_after);
// x0: output, x1: input, r2: weight, x3: bias, x4: kSize, x5: ic4, x6: oc, x7: offset
// x8: input_sum, x10: act_min, x11: act_max, x10: out_zp, x11: out_multiplier, x10: shift_before, x11: shift_after
// r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset
// r8: input_sum; act_min, act_max, out_zp, out_multiplier, shift_before and shift_after are stack arguments loaded into r10/r11 as needed
IndirectGemmInt8_2x4:
.macro INIT_BIAS
@ -73,7 +74,7 @@ IndirectGemmInt8_2x4:
vmlal.s8 q6, d1, d9
vmlal.s8 q7, d1, d11
subs x10, x5, #1
subs r10, r5, #1
beq LoopIcEnd
LoopIc:
@ -107,7 +108,7 @@ IndirectGemmInt8_2x4:
vmlal.s8 q6, d1, d9
vmlal.s8 q7, d1, d11
subs x10, x10, #1
subs r10, r10, #1
bne LoopIc
LoopIcEnd:
@ -131,15 +132,23 @@ IndirectGemmInt8_2x4:
vld1.32 q0[], [r10]!
vld1.32 q1[], [r10]!
// pairwise add
vpadd.i32 q8, q8, q9
vpadd.i32 q10, q10, q11
vpadd.i32 q12, q12, q13
vpadd.i32 q14, q14, q15
vpadd.i32 q8, q8, q10
vpadd.i32 q12, q12, q14
vpadd.i32 d16, d16, d17
vpadd.i32 d18, d18, d19
vpadd.i32 d20, d20, d21
vpadd.i32 d22, d22, d23
vpadd.i32 d24, d24, d25
vpadd.i32 d26, d26, d27
vpadd.i32 d28, d28, d29
vpadd.i32 d30, d30, d31
vpadd.i32 d16, d16, d18
vpadd.i32 d17, d20, d22
vpadd.i32 d24, d24, d26
vpadd.i32 d25, d28, d30
vsub.i32 q8, q8, q0
vsub.i32 q12, q12, q1
cbz r3, NoBias
cmp r3, #0
beq NoBias
vld1.32 q2, [r3]
vadd.i32 q8, q8, q2
vadd.i32 q12, q12, q2
@ -182,34 +191,34 @@ IndirectGemmInt8_2x4:
// prefetching is not preferred while writing results in spite of cache misses
// you could try prfm pstl2strm
WriteStart:
cmp x6, #1
cmp r6, #1
beq Write1
cmp x6, #2
cmp r6, #2
beq Write2
cmp x6, #3
cmp r6, #3
beq Write3
b Write4
Write1:
vst1.8 {d0[0]}, [x11], x7
vst1.8 {d0[1]}, [x11]
vst1.8 {d0[0]}, [r11], r7
vst1.8 {d0[1]}, [r11]
add r0, r0, #1
b WriteEnd
Write2:
vst1.16 {d0[0]}, [x11], x7
vst1.16 {d0[1]}, [x11]
vst1.16 {d0[0]}, [r11], r7
vst1.16 {d0[1]}, [r11]
add r0, r0, #2
b WriteEnd
Write3:
add x14, x11, #2
vst1.16 {d0[0]}, [x11], x7
vst1.16 {d0[1]}, [x11]
vst1.8 {d0[0]}, [x14], x7
vst1.8 {d0[1]}, [x14]
add r14, r11, #2
vst1.16 {d0[0]}, [r11], r7
vst1.16 {d0[1]}, [r11]
vst1.8 {d0[0]}, [r14], r7
vst1.8 {d0[1]}, [r14]
add r0, r0, #3
b WriteEnd
Write4:
vst1.32 {d0[0]}, [x11], x7
vst1.32 {d0[1]}, [x11]
vst1.32 {d0[0]}, [r11], r7
vst1.32 {d0[1]}, [r11]
add r0, r0, #4
WriteEnd:
@ -218,7 +227,8 @@ IndirectGemmInt8_2x4:
bne LoopKsize
subs r6, r6, #4
cbz r3, NoStepFowrard
cmp r3, #0
beq NoStepFowrard
add r3, r3, #16
NoStepFowrard:
bgt LoopOc
@ -227,3 +237,4 @@ IndirectGemmInt8_2x4:
vpop {q4-q7}
pop {r4-r8, r10, r11, pc}
#endif
#endif

View File

@ -0,0 +1,131 @@
#ifdef __aarch64__
.text
.align 5
//.p2align 5,,15
.global C4BiasAdd
#ifndef __APPLE__
.type C4BiasAdd, %function
#endif
//void C4BiasAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride)
//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
C4BiasAdd:
// Adds a per-channel bias to a C4 (NC4HW4) input buffer and scatters the
// valid lanes (min(remaining oc, 4)) of each pixel to a strided destination.
// x0: dst, x1: input (C4 layout), x2: bias, x3: oc, x4: plane_size,
// x5: stride — byte distance between consecutive dst pixels (caller passes
// stride * sizeof(float)).
LoopOc:
// v4 = bias vector for the current 4-channel slice
ld1 {v4.4s}, [x2], #16
mov x6, x4
mov x7, x0
cmp x6, #4
blt Loop1
Loop4:
// main path: process 4 pixels (4x4 floats) per iteration
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
fadd v0.4s, v0.4s, v4.4s
fadd v1.4s, v1.4s, v4.4s
fadd v2.4s, v2.4s, v4.4s
fadd v3.4s, v3.4s, v4.4s
// dispatch on how many channels of this slice are valid (oc tail)
cmp x3, #4
bge Write4x4
cmp x3, #3
beq Write3x4
cmp x3, #2
beq Write2x4
Write1x4:
// one valid channel: store lane 0 of each pixel
str s0, [x7]
add x7, x7, x5
str s1, [x7]
add x7, x7, x5
str s2, [x7]
add x7, x7, x5
str s3, [x7]
add x7, x7, x5
b WriteEndx4
Write2x4:
// two valid channels: pair lanes 0-1 and store with stp
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x5
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x5
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x5
b WriteEndx4
Write3x4:
// three valid channels: lanes 0-1 via stp, lane 2 via a second cursor (x8 = x7 + 8)
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
st1 {v0.s}[2], [x8], x5
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x5
st1 {v1.s}[2], [x8], x5
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x5
st1 {v2.s}[2], [x8], x5
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x5
st1 {v3.s}[2], [x8], x5
b WriteEndx4
Write4x4:
// all four channels valid: full vector stores
st1 {v0.4s}, [x7], x5
st1 {v1.4s}, [x7], x5
st1 {v2.4s}, [x7], x5
st1 {v3.4s}, [x7], x5
WriteEndx4:
subs x6, x6, #4
beq LoopOcEnd
cmp x6, #4
blt Loop1
b Loop4
Loop1:
// tail path: one pixel at a time
ld1 {v0.4s}, [x1], #16
fadd v0.4s, v0.4s, v4.4s
cmp x3, #4
bge Write4
cmp x3, #3
beq Write3
cmp x3, #2
beq Write2
Write1:
str s0, [x7]
add x7, x7, x5
b WriteEnd
Write2:
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
b WriteEnd
Write3:
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
st1 {v0.s}[2], [x8], x5
b WriteEnd
Write4:
st1 {v0.4s}, [x7], x5
WriteEnd:
subs x6, x6, #1
bne Loop1
LoopOcEnd:
// advance to the next C4 slice: oc -= 4, dst base moves 4 floats
subs x3, x3, #4
add x0, x0, #16
bgt LoopOc
ret
#endif

View File

@ -0,0 +1,137 @@
#ifdef __aarch64__
.text
.align 5
//.p2align 5,,15
.global C4BiasAddRelu
#ifndef __APPLE__
.type C4BiasAddRelu, %function
#endif
//void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride)
//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
C4BiasAddRelu:
// Same structure as C4BiasAdd, but additionally applies ReLU
// (max(x + bias, 0)) before storing.
// x0: dst, x1: input (C4), x2: bias, x3: oc, x4: plane_size,
// x5: stride in bytes between consecutive dst pixels.
// v5 = 0.0f vector, the ReLU lower bound
dup v5.4s, wzr
LoopOc:
// v4 = bias for the current 4-channel slice
ld1 {v4.4s}, [x2], #16
mov x6, x4
mov x7, x0
cmp x6, #4
blt Loop1
Loop4:
// main path: 4 pixels per iteration, bias + ReLU
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
fadd v0.4s, v0.4s, v4.4s
fadd v1.4s, v1.4s, v4.4s
fadd v2.4s, v2.4s, v4.4s
fadd v3.4s, v3.4s, v4.4s
fmax v0.4s, v0.4s, v5.4s
fmax v1.4s, v1.4s, v5.4s
fmax v2.4s, v2.4s, v5.4s
fmax v3.4s, v3.4s, v5.4s
// dispatch on the number of valid channels in this slice
cmp x3, #4
bge Write4x4
cmp x3, #3
beq Write3x4
cmp x3, #2
beq Write2x4
Write1x4:
str s0, [x7]
add x7, x7, x5
str s1, [x7]
add x7, x7, x5
str s2, [x7]
add x7, x7, x5
str s3, [x7]
add x7, x7, x5
b WriteEndx4
Write2x4:
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x5
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x5
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x5
b WriteEndx4
Write3x4:
// lanes 0-1 via stp, lane 2 through a second cursor x8
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
st1 {v0.s}[2], [x8], x5
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x5
st1 {v1.s}[2], [x8], x5
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x5
st1 {v2.s}[2], [x8], x5
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x5
st1 {v3.s}[2], [x8], x5
b WriteEndx4
Write4x4:
st1 {v0.4s}, [x7], x5
st1 {v1.4s}, [x7], x5
st1 {v2.4s}, [x7], x5
st1 {v3.4s}, [x7], x5
WriteEndx4:
subs x6, x6, #4
beq LoopOcEnd
cmp x6, #4
blt Loop1
b Loop4
Loop1:
// tail path: one pixel at a time
ld1 {v0.4s}, [x1], #16
fadd v0.4s, v0.4s, v4.4s
fmax v0.4s, v0.4s, v5.4s
cmp x3, #4
bge Write4
cmp x3, #3
beq Write3
cmp x3, #2
beq Write2
Write1:
str s0, [x7]
add x7, x7, x5
b WriteEnd
Write2:
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
b WriteEnd
Write3:
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
st1 {v0.s}[2], [x8], x5
b WriteEnd
Write4:
st1 {v0.4s}, [x7], x5
WriteEnd:
subs x6, x6, #1
bne Loop1
LoopOcEnd:
// next C4 slice
subs x3, x3, #4
add x0, x0, #16
bgt LoopOc
ret
#endif

View File

@ -0,0 +1,146 @@
#ifdef __aarch64__
.text
.align 5
//.p2align 5,,15
.global C4BiasAddRelu6
#ifndef __APPLE__
.type C4BiasAddRelu6, %function
#endif
//void C4BiasAddRelu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride)
//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
C4BiasAddRelu6:
// Same structure as C4BiasAdd, but applies ReLU6
// (min(max(x + bias, 0), 6)) before storing.
// x0: dst, x1: input (C4), x2: bias, x3: oc, x4: plane_size,
// x5: stride in bytes between consecutive dst pixels.
// v5 = 0.0f lower bound
dup v5.4s, wzr
// v6 = 6.0f upper bound (integer 6 converted to float)
movi v6.4s, #6
scvtf v6.4s, v6.4s
LoopOc:
// v4 = bias for the current 4-channel slice
ld1 {v4.4s}, [x2], #16
mov x6, x4
mov x7, x0
cmp x6, #4
blt Loop1
Loop4:
// main path: 4 pixels per iteration, bias + clamp to [0, 6]
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
fadd v0.4s, v0.4s, v4.4s
fadd v1.4s, v1.4s, v4.4s
fadd v2.4s, v2.4s, v4.4s
fadd v3.4s, v3.4s, v4.4s
fmax v0.4s, v0.4s, v5.4s
fmax v1.4s, v1.4s, v5.4s
fmax v2.4s, v2.4s, v5.4s
fmax v3.4s, v3.4s, v5.4s
fmin v0.4s, v0.4s, v6.4s
fmin v1.4s, v1.4s, v6.4s
fmin v2.4s, v2.4s, v6.4s
fmin v3.4s, v3.4s, v6.4s
// dispatch on the number of valid channels in this slice
cmp x3, #4
bge Write4x4
cmp x3, #3
beq Write3x4
cmp x3, #2
beq Write2x4
Write1x4:
str s0, [x7]
add x7, x7, x5
str s1, [x7]
add x7, x7, x5
str s2, [x7]
add x7, x7, x5
str s3, [x7]
add x7, x7, x5
b WriteEndx4
Write2x4:
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x5
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x5
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x5
b WriteEndx4
Write3x4:
// lanes 0-1 via stp, lane 2 through a second cursor x8
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
st1 {v0.s}[2], [x8], x5
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x5
st1 {v1.s}[2], [x8], x5
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x5
st1 {v2.s}[2], [x8], x5
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x5
st1 {v3.s}[2], [x8], x5
b WriteEndx4
Write4x4:
st1 {v0.4s}, [x7], x5
st1 {v1.4s}, [x7], x5
st1 {v2.4s}, [x7], x5
st1 {v3.4s}, [x7], x5
WriteEndx4:
subs x6, x6, #4
beq LoopOcEnd
cmp x6, #4
blt Loop1
b Loop4
Loop1:
// tail path: one pixel at a time
ld1 {v0.4s}, [x1], #16
fadd v0.4s, v0.4s, v4.4s
fmax v0.4s, v0.4s, v5.4s
fmin v0.4s, v0.4s, v6.4s
cmp x3, #4
bge Write4
cmp x3, #3
beq Write3
cmp x3, #2
beq Write2
Write1:
str s0, [x7]
add x7, x7, x5
b WriteEnd
Write2:
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
b WriteEnd
Write3:
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x5
st1 {v0.s}[2], [x8], x5
b WriteEnd
Write4:
st1 {v0.4s}, [x7], x5
WriteEnd:
subs x6, x6, #1
bne Loop1
LoopOcEnd:
// next C4 slice
subs x3, x3, #4
add x0, x0, #16
bgt LoopOc
ret
#endif

View File

@ -0,0 +1,132 @@
#ifdef __aarch64__
.text
.align 5
//.p2align 5,,15
.global C4Relu
#ifndef __APPLE__
.type C4Relu, %function
#endif
//void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride)
//x0: dst, x1: input, x2: oc, x3: plane_size, x4: stride
C4Relu:
// Applies ReLU (max(x, 0)) to a C4 (NC4HW4) input buffer and scatters the
// valid lanes (min(remaining oc, 4)) of each pixel to a strided destination.
// x0: dst, x1: input (C4), x2: oc, x3: plane_size,
// x4: stride in bytes between consecutive dst pixels.
// v5 = 0.0f vector, the ReLU lower bound
dup v5.4s, wzr
LoopOc:
mov x6, x3
mov x7, x0
cmp x6, #4
blt Loop1
Loop4:
// main path: 4 pixels (4x4 floats) per iteration
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
fmax v0.4s, v0.4s, v5.4s
fmax v1.4s, v1.4s, v5.4s
fmax v2.4s, v2.4s, v5.4s
fmax v3.4s, v3.4s, v5.4s
// dispatch on the number of valid channels in this slice (oc tail)
cmp x2, #4
bge Write4x4
cmp x2, #3
beq Write3x4
cmp x2, #2
beq Write2x4
Write1x4:
str s0, [x7]
add x7, x7, x4
str s1, [x7]
add x7, x7, x4
str s2, [x7]
add x7, x7, x4
str s3, [x7]
add x7, x7, x4
b WriteEndx4
Write2x4:
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x4
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x4
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x4
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x4
b WriteEndx4
Write3x4:
// lanes 0-1 via stp, lane 2 through a second cursor x8
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x4
st1 {v0.s}[2], [x8], x4
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x4
st1 {v1.s}[2], [x8], x4
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x4
st1 {v2.s}[2], [x8], x4
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x4
st1 {v3.s}[2], [x8], x4
b WriteEndx4
Write4x4:
st1 {v0.4s}, [x7], x4
st1 {v1.4s}, [x7], x4
st1 {v2.4s}, [x7], x4
st1 {v3.4s}, [x7], x4
WriteEndx4:
subs x6, x6, #4
beq LoopOcEnd
cmp x6, #4
blt Loop1
b Loop4
Loop1:
// tail path: one pixel at a time
ld1 {v0.4s}, [x1], #16
// fix: removed stray "fadd v0.4s, v0.4s, v4.4s" — v4 is never initialized
// in C4Relu (the function takes no bias), and the Loop4 path correctly
// performs no add; the fadd injected garbage into the tail pixels.
fmax v0.4s, v0.4s, v5.4s
cmp x2, #4
bge Write4
cmp x2, #3
beq Write3
cmp x2, #2
beq Write2
Write1:
str s0, [x7]
add x7, x7, x4
b WriteEnd
Write2:
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x4
b WriteEnd
Write3:
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x4
st1 {v0.s}[2], [x8], x4
b WriteEnd
Write4:
st1 {v0.4s}, [x7], x4
WriteEnd:
subs x6, x6, #1
bne Loop1
LoopOcEnd:
// next C4 slice: oc -= 4, dst base moves 4 floats
subs x2, x2, #4
add x0, x0, #16
bgt LoopOc
ret
#endif

View File

@ -0,0 +1,140 @@
#ifdef __aarch64__
.text
.align 5
//.p2align 5,,15
.global C4Relu6
#ifndef __APPLE__
.type C4Relu6, %function
#endif
//void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride)
//x0: dst, x1: input, x2: oc, x3: plane_size, x4: stride
C4Relu6:
// Applies ReLU6 (min(max(x, 0), 6)) to a C4 (NC4HW4) input buffer and
// scatters the valid lanes (min(remaining oc, 4)) of each pixel to a
// strided destination.
// x0: dst, x1: input (C4), x2: oc, x3: plane_size,
// x4: stride in bytes between consecutive dst pixels.
// v5 = 0.0f lower bound
dup v5.4s, wzr
// v6 = 6.0f upper bound (integer 6 converted to float)
movi v6.4s, #6
scvtf v6.4s, v6.4s
LoopOc:
mov x6, x3
mov x7, x0
cmp x6, #4
blt Loop1
Loop4:
// main path: 4 pixels per iteration, clamp to [0, 6]
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
fmax v0.4s, v0.4s, v5.4s
fmax v1.4s, v1.4s, v5.4s
fmax v2.4s, v2.4s, v5.4s
fmax v3.4s, v3.4s, v5.4s
fmin v0.4s, v0.4s, v6.4s
fmin v1.4s, v1.4s, v6.4s
fmin v2.4s, v2.4s, v6.4s
fmin v3.4s, v3.4s, v6.4s
// dispatch on the number of valid channels in this slice (oc tail)
cmp x2, #4
bge Write4x4
cmp x2, #3
beq Write3x4
cmp x2, #2
beq Write2x4
Write1x4:
str s0, [x7]
add x7, x7, x4
str s1, [x7]
add x7, x7, x4
str s2, [x7]
add x7, x7, x4
str s3, [x7]
add x7, x7, x4
b WriteEndx4
Write2x4:
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x4
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x4
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x4
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x4
b WriteEndx4
Write3x4:
// lanes 0-1 via stp, lane 2 through a second cursor x8
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x4
st1 {v0.s}[2], [x8], x4
dup s17, v1.s[1]
stp s1, s17, [x7]
add x7, x7, x4
st1 {v1.s}[2], [x8], x4
dup s18, v2.s[1]
stp s2, s18, [x7]
add x7, x7, x4
st1 {v2.s}[2], [x8], x4
dup s19, v3.s[1]
stp s3, s19, [x7]
add x7, x7, x4
st1 {v3.s}[2], [x8], x4
b WriteEndx4
Write4x4:
st1 {v0.4s}, [x7], x4
st1 {v1.4s}, [x7], x4
st1 {v2.4s}, [x7], x4
st1 {v3.4s}, [x7], x4
WriteEndx4:
subs x6, x6, #4
beq LoopOcEnd
cmp x6, #4
blt Loop1
b Loop4
Loop1:
// tail path: one pixel at a time
ld1 {v0.4s}, [x1], #16
// fix: removed stray "fadd v0.4s, v0.4s, v4.4s" — v4 is never initialized
// in C4Relu6 (the function takes no bias), and the Loop4 path correctly
// performs no add; the fadd injected garbage into the tail pixels.
fmax v0.4s, v0.4s, v5.4s
fmin v0.4s, v0.4s, v6.4s
cmp x2, #4
bge Write4
cmp x2, #3
beq Write3
cmp x2, #2
beq Write2
Write1:
str s0, [x7]
add x7, x7, x4
b WriteEnd
Write2:
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x4
b WriteEnd
Write3:
add x8, x7, #8
dup s16, v0.s[1]
stp s0, s16, [x7]
add x7, x7, x4
st1 {v0.s}[2], [x8], x4
b WriteEnd
Write4:
st1 {v0.4s}, [x7], x4
WriteEnd:
subs x6, x6, #1
bne Loop1
LoopOcEnd:
// next C4 slice: oc -= 4, dst base moves 4 floats
subs x2, x2, #4
add x0, x0, #16
bgt LoopOc
ret
#endif

View File

@ -0,0 +1,96 @@
#ifdef __aarch64__
.text
.align 5
.global ConvDwFp32Center
#ifndef __APPLE__
.type ConvDwFp32Center, %function
#endif
// void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
// size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
// x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
// x14: relu, x15: relu6
ConvDwFp32Center:
// Depthwise convolution fp32 center kernel for one C4 channel block:
// for every output pixel, accumulates kernel_h x kernel_w weighted taps
// into a 4-float accumulator, applies optional ReLU/ReLU6, and stores.
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
// NOTE(review): the post-indexed stp sequence leaves sp back at its entry
// value, so x19-x24 are saved BELOW sp; AArch64 Linux has no red zone, so a
// signal handler could clobber these slots — confirm this is acceptable.
sub sp, sp, #48
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
// stack arguments (sp is back at its call-time value here)
ldr x8, [sp]          // out_h_step
ldr x9, [sp, #8]      // block_channel
ldr x10, [sp, #16]    // in_sh_step
ldr x11, [sp, #24]    // in_sw_step
ldr x12, [sp, #32]    // in_kh_step
ldr x13, [sp, #40]    // in_kw_step
ldr x14, [sp, #48]    // relu flag
ldr x15, [sp, #56]    // relu6 flag
// scale element steps to byte strides (sizeof(float) == 4)
mov x16, #4
mul x8, x8, x16
mul x9, x9, x16
mul x10, x10, x16
mul x11, x11, x16
mul x12, x12, x16
mul x13, x13, x16
// x19 = kernel_w * 16 bytes: one weight row of C4 floats
mov x16, #16
mul x19, x7, x16
ld1 {v5.4s}, [x3]     // v5 = bias for this channel block
LoopH:
mov x23, x1           // src cursor for this output row
mov x24, x5           // remaining output columns
mov x3, x0            // x3 (bias ptr, already consumed) reused as dst cursor
LoopW:
mov x16, x23
mov x17, x2
mov x20, x6           // remaining kernel rows
// NOTE(review): the accumulator starts from the current dst value plus
// bias; this assumes dst was pre-initialized by the caller — confirm.
ld1 {v0.4s}, [x3]
fadd v0.4s, v0.4s, v5.4s
LoopKh:
// NOTE(review): x18 is the platform-reserved register on some ABIs
// (Apple, Windows); usable on Linux but not portable.
mov x18, x7           // remaining kernel columns
mov x21, x17
mov x22, x16
LoopKw:
ld1 {v1.4s}, [x22], x13   // src tap, advance by in_kw_step
ld1 {v2.4s}, [x21], #16   // weight tap
fmla v0.4s, v1.4s, v2.4s
subs x18, x18, #1
bne LoopKw
add x16, x16, x12     // next src kernel row
add x17, x17, x19     // next weight row
subs x20, x20, #1
bne LoopKh
// activation: Relu6 clamps to 6 then falls through to the fmax with 0
cbnz x15, Relu6
cbnz x14, Relu
b Write
Relu6:
movi v4.4s, #6
scvtf v4.4s, v4.4s
fmin v0.4s, v0.4s, v4.4s
Relu:
dup v3.4s, wzr
fmax v0.4s, v0.4s, v3.4s
Write:
st1 {v0.4s}, [x3], x9
add x23, x23, x11     // src advances by in_sw_step per output column
subs x24, x24, #1
bne LoopW
add x0, x0, x8        // dst advances by out_h_step per output row
add x1, x1, x10       // src advances by in_sh_step per output row
subs x4, x4, #1
bne LoopH
// restore callee-saved registers from below the entry sp
sub sp, sp, #48
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ret
#endif

View File

@ -0,0 +1,77 @@
#ifdef __aarch64__
.text
.align 5
.global DeconvDwFp32Center
#ifndef __APPLE__
.type DeconvDwFp32Center, %function
#endif
// void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
// size_t in_kh_step, size_t in_kw_step);
// x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
// x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
DeconvDwFp32Center:
// Depthwise deconvolution fp32 center kernel for one C4 channel block:
// scatters each src pixel, weighted by the kernel, into the dst window
// (read-modify-write accumulate), mirroring the C fallback where dst rows
// advance by in_sh_step and src rows by out_h_step.
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
// NOTE(review): the post-indexed stp sequence leaves sp back at its entry
// value, so x19-x22 are saved BELOW sp; AArch64 Linux has no red zone, so a
// signal handler could clobber these slots — confirm this is acceptable.
sub sp, sp, #32
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
// stack arguments (sp is back at its call-time value here)
ldr x8, [sp]          // block_channel
ldr x9, [sp, #8]      // in_sh_step
ldr x10, [sp, #16]    // in_sw_step
ldr x11, [sp, #24]    // in_kh_step
ldr x12, [sp, #32]    // in_kw_step
// scale element steps to byte strides (sizeof(float) == 4)
mov x13, #4
mul x7, x7, x13
mul x8, x8, x13
mul x9, x9, x13
mul x10, x10, x13
mul x11, x11, x13
mul x12, x12, x13
// x14 = kernel_w * 16 bytes: one weight row of C4 floats
mov x13, #16
mul x14, x6, x13
LoopH:
mov x15, x0           // dst cursor for this src row
mov x16, x1           // src cursor
mov x17, x4           // remaining src columns
LoopW:
// NOTE(review): x18 is the platform-reserved register on some ABIs
// (Apple, Windows); usable on Linux but not portable.
mov x18, x15
mov x19, x2
mov x20, x5           // remaining kernel rows
LoopKh:
mov x21, x18
mov x22, x19
mov x13, x6           // remaining kernel columns
LoopKw:
// dst[tap] += src[pixel] * weight[tap]
ld1 {v0.4s}, [x21]
ld1 {v1.4s}, [x16]
ld1 {v2.4s}, [x22], #16
fmla v0.4s, v1.4s, v2.4s
st1 {v0.4s}, [x21], x12
subs x13, x13, #1
bne LoopKw
add x18, x18, x11     // next dst kernel row
add x19, x19, x14     // next weight row
subs x20, x20, #1
bne LoopKh
add x15, x15, x10     // dst advances by in_sw_step per src column
add x16, x16, x8      // src advances by block_channel per src column
subs x17, x17, #1
bne LoopW
add x0, x0, x9        // dst advances by in_sh_step per src row
add x1, x1, x7        // src advances by out_h_step per src row
subs x3, x3, #1
bne LoopH
// restore callee-saved registers from below the entry sp
sub sp, sp, #32
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

View File

@ -81,20 +81,19 @@ void PostConvFuncFp32(const float *c4_out_ptr, float *out_ptr, const float *bias
}
}
#else
int oc4 = UP_DIV(output_channel, C4NUM);
if (bias_ptr != nullptr) {
if (is_relu) {
BiasAddRelu(bias_ptr, out_ptr, oc4, plane_size);
C4BiasAddRelu(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float));
} else if (is_relu6) {
BiasAddRelu6(bias_ptr, out_ptr, oc4, plane_size);
C4BiasAddRelu6(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float));
} else {
BiasAdd(bias_ptr, out_ptr, oc4, plane_size);
C4BiasAdd(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float));
}
} else {
if (is_relu) {
Relu(out_ptr, oc4 * plane_size);
C4Relu(out_ptr, c4_out_ptr, output_channel, plane_size, stride * sizeof(float));
} else if (is_relu6) {
Relu6(out_ptr, oc4 * plane_size);
C4Relu6(out_ptr, c4_out_ptr, output_channel, plane_size, stride * sizeof(float));
} else {
// do nothing
}

View File

@ -42,6 +42,17 @@ void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size);
void BiasAddRelu(const float *bias, float *data, size_t oc4, size_t plan_size);
void Relu6(float *data, size_t element4);
void Relu(float *data, size_t element4);
void C4BiasAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride);
void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride);
void C4BiasAddRelu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride);
void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
#endif
#ifdef __cplusplus

View File

@ -15,6 +15,7 @@
*/
#include "src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h"
#include "src/runtime/kernel/arm/opclib/fp32/common_func.h"
#ifdef ENABLE_ARM64
#include <arm_neon.h>
#endif
@ -122,6 +123,10 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl
void DepthwiseCenter(float *dst, const float *src, const float *weight, const float *bias, int height, int width,
int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, int in_sw_step,
int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6) {
#ifdef ENABLE_ARM64
ConvDwFp32Center(dst, src, weight, bias, height, width, kernel_h, kernel_w, out_h_step, block_channel,
in_sh_step, in_sw_step, in_kh_step, in_kw_step, is_relu, is_relu6);
#else
float *dst_h = dst;
const float *src_h = src;
for (int oh = 0; oh < height; oh++) {
@ -163,6 +168,7 @@ void DepthwiseCenter(float *dst, const float *src, const float *weight, const fl
dst_h += out_h_step;
src_h += in_sh_step;
} // dst_height loop
#endif
}
// conv depthwise fp32: sliding window
@ -262,6 +268,10 @@ void DeconvDepthwiseBorder(float *dst, const float *src, const float *weight, in
void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, int height, int width, int kernel_h,
int kernel_w, int out_h_step, int block_channel, int in_sh_step, int in_sw_step,
int in_kh_step, int in_kw_step) {
#ifdef ENABLE_ARM64
DeconvDwFp32Center(dst, src, weight, height, width, kernel_h, kernel_w, out_h_step, block_channel,
in_sh_step, in_sw_step, in_kh_step, in_kw_step);
#else
float *dst_h = dst;
const float *src_h = src;
for (int oh = 0; oh < height; oh++) {
@ -297,6 +307,7 @@ void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, in
dst_h += in_sh_step;
src_h += out_h_step;
} // dst_height loop
#endif
}
void DeconvDepthwisePostFunc(float *dst, const float *bias, int block_channel, const ConvParameter *conv_param) {