fix compile bugs on arm32 and add BiasAdd/Relu/Dw arm64 kernels
This commit is contained in:
parent
b6726e4a69
commit
e52aa2b12f
|
@ -1,4 +1,5 @@
|
||||||
#ifdef __aarch64__
|
#ifdef __arm__
|
||||||
|
#ifndef __aarch64__
|
||||||
|
|
||||||
.text
|
.text
|
||||||
.align 5
|
.align 5
|
||||||
|
@ -10,15 +11,16 @@
|
||||||
// void IndirectGemmFp32_8x4(float *output, float *input, float *weight, float *bias,
|
// void IndirectGemmFp32_8x4(float *output, float *input, float *weight, float *bias,
|
||||||
// size_t kSize, size_t ic4, size_t oc8, size_t offset, size_t mode, size_t writeC4, size_t relu, size_t relu6);
|
// size_t kSize, size_t ic4, size_t oc8, size_t offset, size_t mode, size_t writeC4, size_t relu, size_t relu6);
|
||||||
// r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset
|
// r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset
|
||||||
// r8:mode, r10: writeMode, x10: relu, r10:relu6
|
// r8:mode, r10: writeMode, r10: relu, r10:relu6
|
||||||
// mode = 0 for general convolution, where one conv unit is a row
|
// mode = 0 for general convolution, where one conv unit is a row
|
||||||
// mode = 1 for winograd/common gemm, where the total channels of one input is a row
|
// mode = 1 for winograd/common gemm, where the total channels of one input is a row
|
||||||
IndirectGemmFp32_8x4:
|
IndirectGemmFp32_8x4:
|
||||||
|
|
||||||
.macro INIT_BIAS
|
.macro INIT_BIAS
|
||||||
veor q10, q10, q10
|
veor q10, q10, q10
|
||||||
cbz x3, InitBias
|
cmp r3, #0
|
||||||
vld1.32 q10, [x3]
|
beq InitBias
|
||||||
|
vld1.32 q10, [r3]
|
||||||
InitBias:
|
InitBias:
|
||||||
vmov q11, q10
|
vmov q11, q10
|
||||||
vmov q12, q10
|
vmov q12, q10
|
||||||
|
@ -27,10 +29,11 @@ IndirectGemmFp32_8x4:
|
||||||
vmov q15, q10
|
vmov q15, q10
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
|
// at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
|
||||||
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
|
// according to https://stackoverflow.com/questions/53625807
|
||||||
// r19 ~ r29 should be also preserved
|
// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
|
||||||
// whereas our coding style do not permit such amount of parameters
|
// clang's rule seems more simple, though there are no subroutine calls here
|
||||||
|
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
|
||||||
push {r4-r8, r10, r11, lr}
|
push {r4-r8, r10, r11, lr}
|
||||||
vpush {q4-q7}
|
vpush {q4-q7}
|
||||||
add sp, sp, #160
|
add sp, sp, #160
|
||||||
|
@ -41,7 +44,8 @@ IndirectGemmFp32_8x4:
|
||||||
ldr r7, [sp, #12]
|
ldr r7, [sp, #12]
|
||||||
ldr r8, [sp, #16]
|
ldr r8, [sp, #16]
|
||||||
|
|
||||||
cbnz r8, LoopOc
|
cmp r8, #0
|
||||||
|
bne LoopOc
|
||||||
// step is one for common convolution, where ic8 should multiply by kernel size
|
// step is one for common convolution, where ic8 should multiply by kernel size
|
||||||
// step is (a+b-1) for F(a,b) in winograd
|
// step is (a+b-1) for F(a,b) in winograd
|
||||||
mul r5, r4, r5
|
mul r5, r4, r5
|
||||||
|
@ -57,17 +61,18 @@ IndirectGemmFp32_8x4:
|
||||||
INIT_BIAS
|
INIT_BIAS
|
||||||
|
|
||||||
// load input for output 1-2
|
// load input for output 1-2
|
||||||
vld1.32 {q0, q1, q2, q3}, [x12]!
|
vld1.32 {q0, q1}, [r12]!
|
||||||
|
vld1.32 {q2, q3}, [r12]!
|
||||||
// load weight
|
// load weight
|
||||||
vld1.32 {q4, q5}, [x2]!
|
vld1.32 {q4, q5}, [r2]!
|
||||||
// step for output 1-2
|
// step for output 1-2
|
||||||
vmul.f32 q8, q4, d0[0]
|
vmul.f32 q8, q4, d0[0]
|
||||||
vmul.f32 q9, q4, d2[0]
|
vmul.f32 q9, q4, d2[0]
|
||||||
vmla.f32 q8, q5, d0[1]
|
vmla.f32 q8, q5, d0[1]
|
||||||
vmla.f32 q9, q5, d2[1]
|
vmla.f32 q9, q5, d2[1]
|
||||||
vld1.32 {q6, q7}, [x2]!
|
vld1.32 {q6, q7}, [r2]!
|
||||||
|
|
||||||
subs x10, x5, #1
|
subs r10, r5, #1
|
||||||
beq LoopIcEnd
|
beq LoopIcEnd
|
||||||
|
|
||||||
LoopIc:
|
LoopIc:
|
||||||
|
@ -146,9 +151,11 @@ IndirectGemmFp32_8x4:
|
||||||
vmla.f32 q15, q7, d7[1]
|
vmla.f32 q15, q7, d7[1]
|
||||||
|
|
||||||
ldr r10, [sp, #28]
|
ldr r10, [sp, #28]
|
||||||
cbnz r10, Relu6
|
cmp r10, #0
|
||||||
|
bne Relu6
|
||||||
ldr r10, [sp, #24]
|
ldr r10, [sp, #24]
|
||||||
cbnz x10, Relu
|
cmp r10, #0
|
||||||
|
bne Relu
|
||||||
b WriteStart
|
b WriteStart
|
||||||
Relu6:
|
Relu6:
|
||||||
vmov.i32 q14, #6
|
vmov.i32 q14, #6
|
||||||
|
@ -174,7 +181,8 @@ IndirectGemmFp32_8x4:
|
||||||
|
|
||||||
WriteStart:
|
WriteStart:
|
||||||
ldr r10, [sp, #20]
|
ldr r10, [sp, #20]
|
||||||
cbnz x10, WriteC4
|
cmp r10, #0
|
||||||
|
bne WriteC4
|
||||||
cmp r6, #1
|
cmp r6, #1
|
||||||
beq Write1
|
beq Write1
|
||||||
cmp r6, #2
|
cmp r6, #2
|
||||||
|
@ -183,97 +191,97 @@ IndirectGemmFp32_8x4:
|
||||||
beq Write3
|
beq Write3
|
||||||
b Write4
|
b Write4
|
||||||
Write1:
|
Write1:
|
||||||
str s0, [r11]
|
vst1.32 d0[0], [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str s4, [r11]
|
vst1.32 d2[0], [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str s8, [r11]
|
vst1.32 d4[0], [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str s12, [r11]
|
vst1.32 d6[0], [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str s16, [r11]
|
vst1.32 d8[0], [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str s20, [r11]
|
vst1.32 d10[0], [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str s24, [r11]
|
vst1.32 d12[0], [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str s28, [r11]
|
vst1.32 d14[0], [r11]
|
||||||
add r0, r0, #4
|
add r0, r0, #4
|
||||||
b WriteEnd
|
b WriteEnd
|
||||||
Write2:
|
Write2:
|
||||||
str d0, [r11]
|
vst1.32 d0, [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str d2, [r11]
|
vst1.32 d2, [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str d4, [r11]
|
vst1.32 d4, [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str d6, [r11]
|
vst1.32 d6, [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str d8, [r11]
|
vst1.32 d8, [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str d10, [r11]
|
vst1.32 d10, [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str d12, [r11]
|
vst1.32 d12, [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str d14, [r11]
|
vst1.32 d14, [r11]
|
||||||
add r0, r0, #8
|
add r0, r0, #8
|
||||||
b WriteEnd
|
b WriteEnd
|
||||||
Write3:
|
Write3:
|
||||||
add r12, r11, #8
|
add r12, r11, #8
|
||||||
str d0, [r11]
|
vst1.32 d0, [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str s2, [r12]
|
vst1.32 d1[0], [r12]
|
||||||
add r12, r12, r7
|
add r12, r12, r7
|
||||||
str d2, [r11]
|
vst1.32 d2, [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str s6, [r12]
|
vst1.32 d3[0], [r12]
|
||||||
add r12, r12, r7
|
add r12, r12, r7
|
||||||
str d4, [r11]
|
vst1.32 d4, [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str s10, [r12]
|
vst1.32 d5[0], [r12]
|
||||||
add r12, r12, r7
|
add r12, r12, r7
|
||||||
str d6, [r11]
|
vst1.32 d6, [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str s14, [r12]
|
vst1.32 d7[0], [r12]
|
||||||
add r12, r12, r7
|
add r12, r12, r7
|
||||||
str d8, [r11]
|
vst1.32 d8, [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str s18, [r12]
|
vst1.32 d9[0], [r12]
|
||||||
add r12, r12, r7
|
add r12, r12, r7
|
||||||
str d10, [r11]
|
vst1.32 d10, [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str s22, [r12]
|
vst1.32 d11[0], [r12]
|
||||||
add r12, r12, r7
|
add r12, r12, r7
|
||||||
str d12, [r11]
|
vst1.32 d12, [r11]
|
||||||
add r11, r11, x7
|
add r11, r11, r7
|
||||||
str s26, [r12]
|
vst1.32 d13[0], [r12]
|
||||||
add r12, r12, r7
|
add r12, r12, r7
|
||||||
str d14, [r11]
|
vst1.32 d14, [r11]
|
||||||
str s30, [r12]
|
vst1.32 d15[0], [r12]
|
||||||
add r0, r0, #12
|
add r0, r0, #12
|
||||||
b WriteEnd
|
b WriteEnd
|
||||||
WriteC4:
|
WriteC4:
|
||||||
vst1.32 q0, [r11], x7
|
vst1.32 q0, [r11], r7
|
||||||
vst1.32 q1, [r11], x7
|
vst1.32 q1, [r11], r7
|
||||||
vst1.32 q2, [r11], x7
|
vst1.32 q2, [r11], r7
|
||||||
vst1.32 q3, [r11], x7
|
vst1.32 q3, [r11], r7
|
||||||
vst1.32 q4, [r11], x7
|
vst1.32 q4, [r11], r7
|
||||||
vst1.32 q5, [r11], x7
|
vst1.32 q5, [r11], r7
|
||||||
vst1.32 q6, [r11], x7
|
vst1.32 q6, [r11], r7
|
||||||
vst1.32 q7, [r11]
|
vst1.32 q7, [r11]
|
||||||
add r0, r0, #16
|
add r0, r0, #16
|
||||||
b WriteEnd
|
b WriteEnd
|
||||||
Write4:
|
Write4:
|
||||||
// prefetching is not prefered while writing results in spite of cache missings
|
// prefetching is not prefered while writing results in spite of cache missings
|
||||||
// you could try prfm pstl2strm
|
// you could try prfm pstl2vst1.32m
|
||||||
// there are almost no benefits observed though
|
// there are almost no benefits observed though
|
||||||
vst1.32 q0, [r11], x7
|
vst1.32 q0, [r11], r7
|
||||||
vst1.32 q1, [r11], x7
|
vst1.32 q1, [r11], r7
|
||||||
vst1.32 q2, [r11], x7
|
vst1.32 q2, [r11], r7
|
||||||
vst1.32 q3, [r11], x7
|
vst1.32 q3, [r11], r7
|
||||||
vst1.32 q4, [r11], x7
|
vst1.32 q4, [r11], r7
|
||||||
vst1.32 q5, [r11], x7
|
vst1.32 q5, [r11], r7
|
||||||
vst1.32 q6, [r11], x7
|
vst1.32 q6, [r11], r7
|
||||||
vst1.32 q7, [r11]
|
vst1.32 q7, [r11]
|
||||||
add r0, r0, #16
|
add r0, r0, #16
|
||||||
|
|
||||||
|
@ -283,7 +291,8 @@ IndirectGemmFp32_8x4:
|
||||||
bne LoopKsize
|
bne LoopKsize
|
||||||
|
|
||||||
subs r6, r6, #4
|
subs r6, r6, #4
|
||||||
cbz r3, NoStepFowrard
|
cmp r3, #0
|
||||||
|
beq NoStepFowrard
|
||||||
add r3, r3, #16
|
add r3, r3, #16
|
||||||
NoStepFowrard:
|
NoStepFowrard:
|
||||||
bgt LoopOc
|
bgt LoopOc
|
||||||
|
@ -292,3 +301,4 @@ IndirectGemmFp32_8x4:
|
||||||
vpop {q4-q7}
|
vpop {q4-q7}
|
||||||
pop {r4-r8, r10, r11, pc}
|
pop {r4-r8, r10, r11, pc}
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
#ifdef __aarch64__
|
#ifdef __arm__
|
||||||
|
#ifndef __aarch64__
|
||||||
|
|
||||||
.text
|
.text
|
||||||
.align 5
|
.align 5
|
||||||
|
@ -10,8 +11,8 @@
|
||||||
// void IndirectGemmInt8_2x4(int8_t *output, int8_t *input, int8_t *weight, int32_t *bias, size_t ksize, size_t ic4,
|
// void IndirectGemmInt8_2x4(int8_t *output, int8_t *input, int8_t *weight, int32_t *bias, size_t ksize, size_t ic4,
|
||||||
// size_t oc, size_t offset, int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp, size_t out_multiplier,
|
// size_t oc, size_t offset, int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp, size_t out_multiplier,
|
||||||
// size_t shift_before, size_t shift_after);
|
// size_t shift_before, size_t shift_after);
|
||||||
// x0: output, x1: input, r2: weight, x3: bias, x4: kSize, x5: ic4, x6: oc, x7: offset
|
// r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset
|
||||||
// x8: input_sum, x10: act_min, x11: act_max, x10: out_zp, x11: out_multiplier, x10: shift_before, x11: shift_after
|
// r8: input_sum, r10: act_min, r11: act_max, r10: out_zp, r11: out_multiplier, r10: shift_before, r11: shift_after
|
||||||
IndirectGemmInt8_2x4:
|
IndirectGemmInt8_2x4:
|
||||||
|
|
||||||
.macro INIT_BIAS
|
.macro INIT_BIAS
|
||||||
|
@ -73,7 +74,7 @@ IndirectGemmInt8_2x4:
|
||||||
vmlal.s8 q6, d1, d9
|
vmlal.s8 q6, d1, d9
|
||||||
vmlal.s8 q7, d1, d11
|
vmlal.s8 q7, d1, d11
|
||||||
|
|
||||||
subs x10, x5, #1
|
subs r10, r5, #1
|
||||||
beq LoopIcEnd
|
beq LoopIcEnd
|
||||||
|
|
||||||
LoopIc:
|
LoopIc:
|
||||||
|
@ -107,7 +108,7 @@ IndirectGemmInt8_2x4:
|
||||||
vmlal.s8 q6, d1, d9
|
vmlal.s8 q6, d1, d9
|
||||||
vmlal.s8 q7, d1, d11
|
vmlal.s8 q7, d1, d11
|
||||||
|
|
||||||
subs x10, x10, #1
|
subs r10, r10, #1
|
||||||
bne LoopIc
|
bne LoopIc
|
||||||
|
|
||||||
LoopIcEnd:
|
LoopIcEnd:
|
||||||
|
@ -131,15 +132,23 @@ IndirectGemmInt8_2x4:
|
||||||
vld1.32 q0[], [r10]!
|
vld1.32 q0[], [r10]!
|
||||||
vld1.32 q1[], [r10]!
|
vld1.32 q1[], [r10]!
|
||||||
// pairwise add
|
// pairwise add
|
||||||
vpadd.i32 q8, q8, q9
|
vpadd.i32 d16, d16, d17
|
||||||
vpadd.i32 q10, q10, q11
|
vpadd.i32 d18, d18, d19
|
||||||
vpadd.i32 q12, q12, q13
|
vpadd.i32 d20, d20, d21
|
||||||
vpadd.i32 q14, q14, q15
|
vpadd.i32 d22, d22, d23
|
||||||
vpadd.i32 q8, q8, q10
|
vpadd.i32 d24, d24, d25
|
||||||
vpadd.i32 q12, q12, q14
|
vpadd.i32 d26, d26, d27
|
||||||
|
vpadd.i32 d28, d28, d29
|
||||||
|
vpadd.i32 d30, d30, d31
|
||||||
|
|
||||||
|
vpadd.i32 d16, d16, d18
|
||||||
|
vpadd.i32 d17, d20, d22
|
||||||
|
vpadd.i32 d24, d24, d26
|
||||||
|
vpadd.i32 d25, d28, d30
|
||||||
vsub.i32 q8, q8, q0
|
vsub.i32 q8, q8, q0
|
||||||
vsub.i32 q12, q12, q1
|
vsub.i32 q12, q12, q1
|
||||||
cbz r3, NoBias
|
cmp r3, #0
|
||||||
|
beq NoBias
|
||||||
vld1.32 q2, [r3]
|
vld1.32 q2, [r3]
|
||||||
vadd.i32 q8, q8, q2
|
vadd.i32 q8, q8, q2
|
||||||
vadd.i32 q12, q12, q2
|
vadd.i32 q12, q12, q2
|
||||||
|
@ -182,34 +191,34 @@ IndirectGemmInt8_2x4:
|
||||||
// prefetching is not prefered while writing results in spite of cache missings
|
// prefetching is not prefered while writing results in spite of cache missings
|
||||||
// you could try prfm pstl2strm
|
// you could try prfm pstl2strm
|
||||||
WriteStart:
|
WriteStart:
|
||||||
cmp x6, #1
|
cmp r6, #1
|
||||||
beq Write1
|
beq Write1
|
||||||
cmp x6, #2
|
cmp r6, #2
|
||||||
beq Write2
|
beq Write2
|
||||||
cmp x6, #3
|
cmp r6, #3
|
||||||
beq Write3
|
beq Write3
|
||||||
b Write4
|
b Write4
|
||||||
Write1:
|
Write1:
|
||||||
vst1.8 {d0[0]}, [x11], x7
|
vst1.8 {d0[0]}, [r11], r7
|
||||||
vst1.8 {d0[1]}, [x11]
|
vst1.8 {d0[1]}, [r11]
|
||||||
add r0, r0, #1
|
add r0, r0, #1
|
||||||
b WriteEnd
|
b WriteEnd
|
||||||
Write2:
|
Write2:
|
||||||
vst1.16 {d0[0]}, [x11], x7
|
vst1.16 {d0[0]}, [r11], r7
|
||||||
vst1.16 {d0[1]}, [x11]
|
vst1.16 {d0[1]}, [r11]
|
||||||
add r0, r0, #2
|
add r0, r0, #2
|
||||||
b WriteEnd
|
b WriteEnd
|
||||||
Write3:
|
Write3:
|
||||||
add x14, x11, #2
|
add r14, r11, #2
|
||||||
vst1.16 {d0[0]}, [x11], x7
|
vst1.16 {d0[0]}, [r11], r7
|
||||||
vst1.16 {d0[1]}, [x11]
|
vst1.16 {d0[1]}, [r11]
|
||||||
vst1.8 {d0[0]}, [x14], x7
|
vst1.8 {d0[0]}, [r14], r7
|
||||||
vst1.8 {d0[1]}, [x14]
|
vst1.8 {d0[1]}, [r14]
|
||||||
add r0, r0, #3
|
add r0, r0, #3
|
||||||
b WriteEnd
|
b WriteEnd
|
||||||
Write4:
|
Write4:
|
||||||
vst1.32 {d0[0]}, [x11], x7
|
vst1.32 {d0[0]}, [r11], r7
|
||||||
vst1.32 {d0[1]}, [x11]
|
vst1.32 {d0[1]}, [r11]
|
||||||
add r0, r0, #4
|
add r0, r0, #4
|
||||||
|
|
||||||
WriteEnd:
|
WriteEnd:
|
||||||
|
@ -218,7 +227,8 @@ IndirectGemmInt8_2x4:
|
||||||
bne LoopKsize
|
bne LoopKsize
|
||||||
|
|
||||||
subs r6, r6, #4
|
subs r6, r6, #4
|
||||||
cbz r3, NoStepFowrard
|
cmp r3, #0
|
||||||
|
beq NoStepFowrard
|
||||||
add r3, r3, #16
|
add r3, r3, #16
|
||||||
NoStepFowrard:
|
NoStepFowrard:
|
||||||
bgt LoopOc
|
bgt LoopOc
|
||||||
|
@ -227,3 +237,4 @@ IndirectGemmInt8_2x4:
|
||||||
vpop {q4-q7}
|
vpop {q4-q7}
|
||||||
pop {r4-r8, r10, r11, pc}
|
pop {r4-r8, r10, r11, pc}
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
|
@ -0,0 +1,131 @@
|
||||||
|
#ifdef __aarch64__
|
||||||
|
|
||||||
|
.text
|
||||||
|
.align 5
|
||||||
|
//.p2align 5,,15
|
||||||
|
.global C4BiasAdd
|
||||||
|
#ifndef __APPLE__
|
||||||
|
.type C4BiasAdd, %function
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//void C4BiasAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride)
|
||||||
|
//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
|
||||||
|
|
||||||
|
C4BiasAdd:
|
||||||
|
|
||||||
|
LoopOc:
|
||||||
|
ld1 {v4.4s}, [x2], #16
|
||||||
|
mov x6, x4
|
||||||
|
mov x7, x0
|
||||||
|
cmp x6, #4
|
||||||
|
blt Loop1
|
||||||
|
|
||||||
|
Loop4:
|
||||||
|
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
|
||||||
|
fadd v0.4s, v0.4s, v4.4s
|
||||||
|
fadd v1.4s, v1.4s, v4.4s
|
||||||
|
fadd v2.4s, v2.4s, v4.4s
|
||||||
|
fadd v3.4s, v3.4s, v4.4s
|
||||||
|
|
||||||
|
cmp x3, #4
|
||||||
|
bge Write4x4
|
||||||
|
cmp x3, #3
|
||||||
|
beq Write3x4
|
||||||
|
cmp x3, #2
|
||||||
|
beq Write2x4
|
||||||
|
|
||||||
|
Write1x4:
|
||||||
|
str s0, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
str s1, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
str s2, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
str s3, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
b WriteEndx4
|
||||||
|
Write2x4:
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
dup s17, v1.s[1]
|
||||||
|
stp s1, s17, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
dup s18, v2.s[1]
|
||||||
|
stp s2, s18, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
dup s19, v3.s[1]
|
||||||
|
stp s3, s19, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
b WriteEndx4
|
||||||
|
Write3x4:
|
||||||
|
add x8, x7, #8
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v0.s}[2], [x8], x5
|
||||||
|
dup s17, v1.s[1]
|
||||||
|
stp s1, s17, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v1.s}[2], [x8], x5
|
||||||
|
dup s18, v2.s[1]
|
||||||
|
stp s2, s18, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v2.s}[2], [x8], x5
|
||||||
|
dup s19, v3.s[1]
|
||||||
|
stp s3, s19, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v3.s}[2], [x8], x5
|
||||||
|
b WriteEndx4
|
||||||
|
Write4x4:
|
||||||
|
st1 {v0.4s}, [x7], x5
|
||||||
|
st1 {v1.4s}, [x7], x5
|
||||||
|
st1 {v2.4s}, [x7], x5
|
||||||
|
st1 {v3.4s}, [x7], x5
|
||||||
|
|
||||||
|
WriteEndx4:
|
||||||
|
subs x6, x6, #4
|
||||||
|
beq LoopOcEnd
|
||||||
|
cmp x6, #4
|
||||||
|
blt Loop1
|
||||||
|
b Loop4
|
||||||
|
|
||||||
|
Loop1:
|
||||||
|
ld1 {v0.4s}, [x1], #16
|
||||||
|
fadd v0.4s, v0.4s, v4.4s
|
||||||
|
|
||||||
|
cmp x3, #4
|
||||||
|
bge Write4
|
||||||
|
cmp x3, #3
|
||||||
|
beq Write3
|
||||||
|
cmp x3, #2
|
||||||
|
beq Write2
|
||||||
|
|
||||||
|
Write1:
|
||||||
|
str s0, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
b WriteEnd
|
||||||
|
Write2:
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
b WriteEnd
|
||||||
|
Write3:
|
||||||
|
add x8, x7, #8
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v0.s}[2], [x8], x5
|
||||||
|
b WriteEnd
|
||||||
|
Write4:
|
||||||
|
st1 {v0.4s}, [x7], x5
|
||||||
|
WriteEnd:
|
||||||
|
subs x6, x6, #1
|
||||||
|
bne Loop1
|
||||||
|
LoopOcEnd:
|
||||||
|
subs x3, x3, #4
|
||||||
|
add x0, x0, #16
|
||||||
|
bgt LoopOc
|
||||||
|
|
||||||
|
ret
|
||||||
|
#endif
|
|
@ -0,0 +1,137 @@
|
||||||
|
#ifdef __aarch64__
|
||||||
|
|
||||||
|
.text
|
||||||
|
.align 5
|
||||||
|
//.p2align 5,,15
|
||||||
|
.global C4BiasAddRelu
|
||||||
|
#ifndef __APPLE__
|
||||||
|
.type C4BiasAddRelu, %function
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride)
|
||||||
|
//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
|
||||||
|
|
||||||
|
C4BiasAddRelu:
|
||||||
|
dup v5.4s, wzr
|
||||||
|
LoopOc:
|
||||||
|
ld1 {v4.4s}, [x2], #16
|
||||||
|
mov x6, x4
|
||||||
|
mov x7, x0
|
||||||
|
cmp x6, #4
|
||||||
|
blt Loop1
|
||||||
|
|
||||||
|
Loop4:
|
||||||
|
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
|
||||||
|
fadd v0.4s, v0.4s, v4.4s
|
||||||
|
fadd v1.4s, v1.4s, v4.4s
|
||||||
|
fadd v2.4s, v2.4s, v4.4s
|
||||||
|
fadd v3.4s, v3.4s, v4.4s
|
||||||
|
|
||||||
|
fmax v0.4s, v0.4s, v5.4s
|
||||||
|
fmax v1.4s, v1.4s, v5.4s
|
||||||
|
fmax v2.4s, v2.4s, v5.4s
|
||||||
|
fmax v3.4s, v3.4s, v5.4s
|
||||||
|
|
||||||
|
cmp x3, #4
|
||||||
|
bge Write4x4
|
||||||
|
cmp x3, #3
|
||||||
|
beq Write3x4
|
||||||
|
cmp x3, #2
|
||||||
|
beq Write2x4
|
||||||
|
|
||||||
|
Write1x4:
|
||||||
|
str s0, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
str s1, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
str s2, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
str s3, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
b WriteEndx4
|
||||||
|
Write2x4:
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
dup s17, v1.s[1]
|
||||||
|
stp s1, s17, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
dup s18, v2.s[1]
|
||||||
|
stp s2, s18, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
dup s19, v3.s[1]
|
||||||
|
stp s3, s19, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
b WriteEndx4
|
||||||
|
Write3x4:
|
||||||
|
add x8, x7, #8
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v0.s}[2], [x8], x5
|
||||||
|
dup s17, v1.s[1]
|
||||||
|
stp s1, s17, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v1.s}[2], [x8], x5
|
||||||
|
dup s18, v2.s[1]
|
||||||
|
stp s2, s18, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v2.s}[2], [x8], x5
|
||||||
|
dup s19, v3.s[1]
|
||||||
|
stp s3, s19, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v3.s}[2], [x8], x5
|
||||||
|
b WriteEndx4
|
||||||
|
Write4x4:
|
||||||
|
st1 {v0.4s}, [x7], x5
|
||||||
|
st1 {v1.4s}, [x7], x5
|
||||||
|
st1 {v2.4s}, [x7], x5
|
||||||
|
st1 {v3.4s}, [x7], x5
|
||||||
|
|
||||||
|
WriteEndx4:
|
||||||
|
subs x6, x6, #4
|
||||||
|
beq LoopOcEnd
|
||||||
|
cmp x6, #4
|
||||||
|
blt Loop1
|
||||||
|
b Loop4
|
||||||
|
|
||||||
|
Loop1:
|
||||||
|
ld1 {v0.4s}, [x1], #16
|
||||||
|
fadd v0.4s, v0.4s, v4.4s
|
||||||
|
fmax v0.4s, v0.4s, v5.4s
|
||||||
|
|
||||||
|
cmp x3, #4
|
||||||
|
bge Write4
|
||||||
|
cmp x3, #3
|
||||||
|
beq Write3
|
||||||
|
cmp x3, #2
|
||||||
|
beq Write2
|
||||||
|
|
||||||
|
Write1:
|
||||||
|
str s0, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
b WriteEnd
|
||||||
|
Write2:
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
b WriteEnd
|
||||||
|
Write3:
|
||||||
|
add x8, x7, #8
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v0.s}[2], [x8], x5
|
||||||
|
b WriteEnd
|
||||||
|
Write4:
|
||||||
|
st1 {v0.4s}, [x7], x5
|
||||||
|
WriteEnd:
|
||||||
|
subs x6, x6, #1
|
||||||
|
bne Loop1
|
||||||
|
LoopOcEnd:
|
||||||
|
subs x3, x3, #4
|
||||||
|
add x0, x0, #16
|
||||||
|
bgt LoopOc
|
||||||
|
|
||||||
|
ret
|
||||||
|
#endif
|
|
@ -0,0 +1,146 @@
|
||||||
|
#ifdef __aarch64__
|
||||||
|
|
||||||
|
.text
|
||||||
|
.align 5
|
||||||
|
//.p2align 5,,15
|
||||||
|
.global C4BiasAddRelu6
|
||||||
|
#ifndef __APPLE__
|
||||||
|
.type C4BiasAddRelu6, %function
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//void C4BiC4BiasAddRelu6asAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride)
|
||||||
|
//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
|
||||||
|
|
||||||
|
C4BiasAddRelu6:
|
||||||
|
dup v5.4s, wzr
|
||||||
|
movi v6.4s, #6
|
||||||
|
scvtf v6.4s, v6.4s
|
||||||
|
|
||||||
|
LoopOc:
|
||||||
|
ld1 {v4.4s}, [x2], #16
|
||||||
|
mov x6, x4
|
||||||
|
mov x7, x0
|
||||||
|
cmp x6, #4
|
||||||
|
blt Loop1
|
||||||
|
|
||||||
|
Loop4:
|
||||||
|
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
|
||||||
|
fadd v0.4s, v0.4s, v4.4s
|
||||||
|
fadd v1.4s, v1.4s, v4.4s
|
||||||
|
fadd v2.4s, v2.4s, v4.4s
|
||||||
|
fadd v3.4s, v3.4s, v4.4s
|
||||||
|
|
||||||
|
fmax v0.4s, v0.4s, v5.4s
|
||||||
|
fmax v1.4s, v1.4s, v5.4s
|
||||||
|
fmax v2.4s, v2.4s, v5.4s
|
||||||
|
fmax v3.4s, v3.4s, v5.4s
|
||||||
|
|
||||||
|
fmin v0.4s, v0.4s, v6.4s
|
||||||
|
fmin v1.4s, v1.4s, v6.4s
|
||||||
|
fmin v2.4s, v2.4s, v6.4s
|
||||||
|
fmin v3.4s, v3.4s, v6.4s
|
||||||
|
|
||||||
|
cmp x3, #4
|
||||||
|
bge Write4x4
|
||||||
|
cmp x3, #3
|
||||||
|
beq Write3x4
|
||||||
|
cmp x3, #2
|
||||||
|
beq Write2x4
|
||||||
|
|
||||||
|
Write1x4:
|
||||||
|
str s0, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
str s1, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
str s2, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
str s3, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
b WriteEndx4
|
||||||
|
Write2x4:
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
dup s17, v1.s[1]
|
||||||
|
stp s1, s17, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
dup s18, v2.s[1]
|
||||||
|
stp s2, s18, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
dup s19, v3.s[1]
|
||||||
|
stp s3, s19, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
b WriteEndx4
|
||||||
|
Write3x4:
|
||||||
|
add x8, x7, #8
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v0.s}[2], [x8], x5
|
||||||
|
dup s17, v1.s[1]
|
||||||
|
stp s1, s17, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v1.s}[2], [x8], x5
|
||||||
|
dup s18, v2.s[1]
|
||||||
|
stp s2, s18, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v2.s}[2], [x8], x5
|
||||||
|
dup s19, v3.s[1]
|
||||||
|
stp s3, s19, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v3.s}[2], [x8], x5
|
||||||
|
b WriteEndx4
|
||||||
|
Write4x4:
|
||||||
|
st1 {v0.4s}, [x7], x5
|
||||||
|
st1 {v1.4s}, [x7], x5
|
||||||
|
st1 {v2.4s}, [x7], x5
|
||||||
|
st1 {v3.4s}, [x7], x5
|
||||||
|
|
||||||
|
WriteEndx4:
|
||||||
|
subs x6, x6, #4
|
||||||
|
beq LoopOcEnd
|
||||||
|
cmp x6, #4
|
||||||
|
blt Loop1
|
||||||
|
b Loop4
|
||||||
|
|
||||||
|
Loop1:
|
||||||
|
ld1 {v0.4s}, [x1], #16
|
||||||
|
fadd v0.4s, v0.4s, v4.4s
|
||||||
|
fmax v0.4s, v0.4s, v5.4s
|
||||||
|
fmin v0.4s, v0.4s, v6.4s
|
||||||
|
|
||||||
|
cmp x3, #4
|
||||||
|
bge Write4
|
||||||
|
cmp x3, #3
|
||||||
|
beq Write3
|
||||||
|
cmp x3, #2
|
||||||
|
beq Write2
|
||||||
|
|
||||||
|
Write1:
|
||||||
|
str s0, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
b WriteEnd
|
||||||
|
Write2:
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
b WriteEnd
|
||||||
|
Write3:
|
||||||
|
add x8, x7, #8
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x5
|
||||||
|
st1 {v0.s}[2], [x8], x5
|
||||||
|
b WriteEnd
|
||||||
|
Write4:
|
||||||
|
st1 {v0.4s}, [x7], x5
|
||||||
|
WriteEnd:
|
||||||
|
subs x6, x6, #1
|
||||||
|
bne Loop1
|
||||||
|
LoopOcEnd:
|
||||||
|
subs x3, x3, #4
|
||||||
|
add x0, x0, #16
|
||||||
|
bgt LoopOc
|
||||||
|
|
||||||
|
ret
|
||||||
|
#endif
|
|
@ -0,0 +1,132 @@
|
||||||
|
#ifdef __aarch64__
|
||||||
|
|
||||||
|
.text
|
||||||
|
.align 5
|
||||||
|
//.p2align 5,,15
|
||||||
|
.global C4Relu
|
||||||
|
#ifndef __APPLE__
|
||||||
|
.type C4Relu, %function
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride)
|
||||||
|
//x0: dst, x1: input, x2: oc, x3: plane_size, x4: stride
|
||||||
|
|
||||||
|
C4Relu:
|
||||||
|
dup v5.4s, wzr
|
||||||
|
LoopOc:
|
||||||
|
mov x6, x3
|
||||||
|
mov x7, x0
|
||||||
|
cmp x6, #4
|
||||||
|
blt Loop1
|
||||||
|
|
||||||
|
Loop4:
|
||||||
|
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
|
||||||
|
|
||||||
|
fmax v0.4s, v0.4s, v5.4s
|
||||||
|
fmax v1.4s, v1.4s, v5.4s
|
||||||
|
fmax v2.4s, v2.4s, v5.4s
|
||||||
|
fmax v3.4s, v3.4s, v5.4s
|
||||||
|
|
||||||
|
cmp x2, #4
|
||||||
|
bge Write4x4
|
||||||
|
cmp x2, #3
|
||||||
|
beq Write3x4
|
||||||
|
cmp x2, #2
|
||||||
|
beq Write2x4
|
||||||
|
|
||||||
|
Write1x4:
|
||||||
|
str s0, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
str s1, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
str s2, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
str s3, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
b WriteEndx4
|
||||||
|
Write2x4:
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
dup s17, v1.s[1]
|
||||||
|
stp s1, s17, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
dup s18, v2.s[1]
|
||||||
|
stp s2, s18, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
dup s19, v3.s[1]
|
||||||
|
stp s3, s19, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
b WriteEndx4
|
||||||
|
Write3x4:
|
||||||
|
add x8, x7, #8
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
st1 {v0.s}[2], [x8], x4
|
||||||
|
dup s17, v1.s[1]
|
||||||
|
stp s1, s17, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
st1 {v1.s}[2], [x8], x4
|
||||||
|
dup s18, v2.s[1]
|
||||||
|
stp s2, s18, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
st1 {v2.s}[2], [x8], x4
|
||||||
|
dup s19, v3.s[1]
|
||||||
|
stp s3, s19, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
st1 {v3.s}[2], [x8], x4
|
||||||
|
b WriteEndx4
|
||||||
|
Write4x4:
|
||||||
|
st1 {v0.4s}, [x7], x4
|
||||||
|
st1 {v1.4s}, [x7], x4
|
||||||
|
st1 {v2.4s}, [x7], x4
|
||||||
|
st1 {v3.4s}, [x7], x4
|
||||||
|
|
||||||
|
WriteEndx4:
|
||||||
|
subs x6, x6, #4
|
||||||
|
beq LoopOcEnd
|
||||||
|
cmp x6, #4
|
||||||
|
blt Loop1
|
||||||
|
b Loop4
|
||||||
|
|
||||||
|
Loop1:
|
||||||
|
ld1 {v0.4s}, [x1], #16
|
||||||
|
fadd v0.4s, v0.4s, v4.4s
|
||||||
|
fmax v0.4s, v0.4s, v5.4s
|
||||||
|
|
||||||
|
cmp x2, #4
|
||||||
|
bge Write4
|
||||||
|
cmp x2, #3
|
||||||
|
beq Write3
|
||||||
|
cmp x2, #2
|
||||||
|
beq Write2
|
||||||
|
|
||||||
|
Write1:
|
||||||
|
str s0, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
b WriteEnd
|
||||||
|
Write2:
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
b WriteEnd
|
||||||
|
Write3:
|
||||||
|
add x8, x7, #8
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
st1 {v0.s}[2], [x8], x4
|
||||||
|
b WriteEnd
|
||||||
|
Write4:
|
||||||
|
st1 {v0.4s}, [x7], x4
|
||||||
|
WriteEnd:
|
||||||
|
subs x6, x6, #1
|
||||||
|
bne Loop1
|
||||||
|
LoopOcEnd:
|
||||||
|
subs x2, x2, #4
|
||||||
|
add x0, x0, #16
|
||||||
|
bgt LoopOc
|
||||||
|
|
||||||
|
ret
|
||||||
|
#endif
|
|
@ -0,0 +1,140 @@
|
||||||
|
#ifdef __aarch64__
|
||||||
|
|
||||||
|
.text
|
||||||
|
.align 5
|
||||||
|
//.p2align 5,,15
|
||||||
|
.global C4Relu6
|
||||||
|
#ifndef __APPLE__
|
||||||
|
.type C4Relu6, %function
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//void C4Relu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride)
|
||||||
|
//x0: dst, x1: input, x2: oc, x2: plane_size, x3: stride
|
||||||
|
|
||||||
|
C4Relu6:
|
||||||
|
dup v5.4s, wzr
|
||||||
|
movi v6.4s, #6
|
||||||
|
scvtf v6.4s, v6.4s
|
||||||
|
|
||||||
|
LoopOc:
|
||||||
|
mov x6, x3
|
||||||
|
mov x7, x0
|
||||||
|
cmp x6, #4
|
||||||
|
blt Loop1
|
||||||
|
|
||||||
|
Loop4:
|
||||||
|
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
|
||||||
|
fmax v0.4s, v0.4s, v5.4s
|
||||||
|
fmax v1.4s, v1.4s, v5.4s
|
||||||
|
fmax v2.4s, v2.4s, v5.4s
|
||||||
|
fmax v3.4s, v3.4s, v5.4s
|
||||||
|
|
||||||
|
fmin v0.4s, v0.4s, v6.4s
|
||||||
|
fmin v1.4s, v1.4s, v6.4s
|
||||||
|
fmin v2.4s, v2.4s, v6.4s
|
||||||
|
fmin v3.4s, v3.4s, v6.4s
|
||||||
|
|
||||||
|
cmp x2, #4
|
||||||
|
bge Write4x4
|
||||||
|
cmp x2, #3
|
||||||
|
beq Write3x4
|
||||||
|
cmp x2, #2
|
||||||
|
beq Write2x4
|
||||||
|
|
||||||
|
Write1x4:
|
||||||
|
str s0, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
str s1, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
str s2, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
str s3, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
b WriteEndx4
|
||||||
|
Write2x4:
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
dup s17, v1.s[1]
|
||||||
|
stp s1, s17, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
dup s18, v2.s[1]
|
||||||
|
stp s2, s18, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
dup s19, v3.s[1]
|
||||||
|
stp s3, s19, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
b WriteEndx4
|
||||||
|
Write3x4:
|
||||||
|
add x8, x7, #8
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
st1 {v0.s}[2], [x8], x4
|
||||||
|
dup s17, v1.s[1]
|
||||||
|
stp s1, s17, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
st1 {v1.s}[2], [x8], x4
|
||||||
|
dup s18, v2.s[1]
|
||||||
|
stp s2, s18, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
st1 {v2.s}[2], [x8], x4
|
||||||
|
dup s19, v3.s[1]
|
||||||
|
stp s3, s19, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
st1 {v3.s}[2], [x8], x4
|
||||||
|
b WriteEndx4
|
||||||
|
Write4x4:
|
||||||
|
st1 {v0.4s}, [x7], x4
|
||||||
|
st1 {v1.4s}, [x7], x4
|
||||||
|
st1 {v2.4s}, [x7], x4
|
||||||
|
st1 {v3.4s}, [x7], x4
|
||||||
|
|
||||||
|
WriteEndx4:
|
||||||
|
subs x6, x6, #4
|
||||||
|
beq LoopOcEnd
|
||||||
|
cmp x6, #4
|
||||||
|
blt Loop1
|
||||||
|
b Loop4
|
||||||
|
|
||||||
|
Loop1:
|
||||||
|
ld1 {v0.4s}, [x1], #16
|
||||||
|
fadd v0.4s, v0.4s, v4.4s
|
||||||
|
fmax v0.4s, v0.4s, v5.4s
|
||||||
|
fmin v0.4s, v0.4s, v6.4s
|
||||||
|
|
||||||
|
cmp x2, #4
|
||||||
|
bge Write4
|
||||||
|
cmp x2, #3
|
||||||
|
beq Write3
|
||||||
|
cmp x2, #2
|
||||||
|
beq Write2
|
||||||
|
|
||||||
|
Write1:
|
||||||
|
str s0, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
b WriteEnd
|
||||||
|
Write2:
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
b WriteEnd
|
||||||
|
Write3:
|
||||||
|
add x8, x7, #8
|
||||||
|
dup s16, v0.s[1]
|
||||||
|
stp s0, s16, [x7]
|
||||||
|
add x7, x7, x4
|
||||||
|
st1 {v0.s}[2], [x8], x4
|
||||||
|
b WriteEnd
|
||||||
|
Write4:
|
||||||
|
st1 {v0.4s}, [x7], x4
|
||||||
|
WriteEnd:
|
||||||
|
subs x6, x6, #1
|
||||||
|
bne Loop1
|
||||||
|
LoopOcEnd:
|
||||||
|
subs x2, x2, #4
|
||||||
|
add x0, x0, #16
|
||||||
|
bgt LoopOc
|
||||||
|
|
||||||
|
ret
|
||||||
|
#endif
|
|
@ -0,0 +1,96 @@
|
||||||
|
#ifdef __aarch64__
|
||||||
|
|
||||||
|
.text
|
||||||
|
.align 5
|
||||||
|
.global ConvDwFp32Center
|
||||||
|
#ifndef __APPLE__
|
||||||
|
.type ConvDwFp32Center, %function
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
|
||||||
|
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
|
||||||
|
// size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
|
||||||
|
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w,
|
||||||
|
// x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
|
||||||
|
// x14: relu, x15: relu6
|
||||||
|
ConvDwFp32Center:
|
||||||
|
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
|
||||||
|
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
|
||||||
|
// x19 ~ x29 should be also preserved
|
||||||
|
// whereas our coding style do not permit such amount of parameters
|
||||||
|
sub sp, sp, #48
|
||||||
|
stp x19, x20, [sp], #16
|
||||||
|
stp x21, x22, [sp], #16
|
||||||
|
stp x23, x24, [sp], #16
|
||||||
|
|
||||||
|
ldr x8, [sp]
|
||||||
|
ldr x9, [sp, #8]
|
||||||
|
ldr x10, [sp, #16]
|
||||||
|
ldr x11, [sp, #24]
|
||||||
|
ldr x12, [sp, #32]
|
||||||
|
ldr x13, [sp, #40]
|
||||||
|
ldr x14, [sp, #48]
|
||||||
|
ldr x15, [sp, #56]
|
||||||
|
|
||||||
|
mov x16, #4
|
||||||
|
mul x8, x8, x16
|
||||||
|
mul x9, x9, x16
|
||||||
|
mul x10, x10, x16
|
||||||
|
mul x11, x11, x16
|
||||||
|
mul x12, x12, x16
|
||||||
|
mul x13, x13, x16
|
||||||
|
mov x16, #16
|
||||||
|
mul x19, x7, x16
|
||||||
|
|
||||||
|
ld1 {v5.4s}, [x3]
|
||||||
|
|
||||||
|
LoopH:
|
||||||
|
mov x23, x1
|
||||||
|
mov x24, x5
|
||||||
|
mov x3, x0
|
||||||
|
LoopW:
|
||||||
|
mov x16, x23
|
||||||
|
mov x17, x2
|
||||||
|
mov x20, x6
|
||||||
|
ld1 {v0.4s}, [x3]
|
||||||
|
fadd v0.4s, v0.4s, v5.4s
|
||||||
|
LoopKh:
|
||||||
|
mov x18, x7
|
||||||
|
mov x21, x17
|
||||||
|
mov x22, x16
|
||||||
|
LoopKw:
|
||||||
|
ld1 {v1.4s}, [x22], x13
|
||||||
|
ld1 {v2.4s}, [x21], #16
|
||||||
|
fmla v0.4s, v1.4s, v2.4s
|
||||||
|
subs x18, x18, #1
|
||||||
|
bne LoopKw
|
||||||
|
add x16, x16, x12
|
||||||
|
add x17, x17, x19
|
||||||
|
subs x20, x20, #1
|
||||||
|
bne LoopKh
|
||||||
|
cbnz x15, Relu6
|
||||||
|
cbnz x14, Relu
|
||||||
|
b Write
|
||||||
|
Relu6:
|
||||||
|
movi v4.4s, #6
|
||||||
|
scvtf v4.4s, v4.4s
|
||||||
|
fmin v0.4s, v0.4s, v4.4s
|
||||||
|
Relu:
|
||||||
|
dup v3.4s, wzr
|
||||||
|
fmax v0.4s, v0.4s, v3.4s
|
||||||
|
Write:
|
||||||
|
st1 {v0.4s}, [x3], x9
|
||||||
|
add x23, x23, x11
|
||||||
|
subs x24, x24, #1
|
||||||
|
bne LoopW
|
||||||
|
add x0, x0, x8
|
||||||
|
add x1, x1, x10
|
||||||
|
subs x4, x4, #1
|
||||||
|
bne LoopH
|
||||||
|
|
||||||
|
sub sp, sp, #48
|
||||||
|
ldp x19, x20, [sp], #16
|
||||||
|
ldp x21, x22, [sp], #16
|
||||||
|
ldp x23, x24, [sp], #16
|
||||||
|
ret
|
||||||
|
#endif
|
|
@ -0,0 +1,77 @@
|
||||||
|
#ifdef __aarch64__
|
||||||
|
|
||||||
|
.text
|
||||||
|
.align 5
|
||||||
|
.global DeconvDwFp32Center
|
||||||
|
#ifndef __APPLE__
|
||||||
|
.type DeconvDwFp32Center, %function
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
|
||||||
|
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
|
||||||
|
// size_t in_kh_step, size_t in_kw_step);
|
||||||
|
// x0: dst, x1: src, x2: weight, x3: height, x4: weight, x5: kernel_h, x6: kernel_w, x7: out_h_step
|
||||||
|
// x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
|
||||||
|
DeconvDwFp32Center:
|
||||||
|
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
|
||||||
|
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
|
||||||
|
// x19 ~ x29 should be also preserved
|
||||||
|
// whereas our coding style do not permit such amount of parameters
|
||||||
|
sub sp, sp, #32
|
||||||
|
stp x19, x20, [sp], #16
|
||||||
|
stp x21, x22, [sp], #16
|
||||||
|
|
||||||
|
ldr x8, [sp]
|
||||||
|
ldr x9, [sp, #8]
|
||||||
|
ldr x10, [sp, #16]
|
||||||
|
ldr x11, [sp, #24]
|
||||||
|
ldr x12, [sp, #32]
|
||||||
|
|
||||||
|
mov x13, #4
|
||||||
|
mul x7, x7, x13
|
||||||
|
mul x8, x8, x13
|
||||||
|
mul x9, x9, x13
|
||||||
|
mul x10, x10, x13
|
||||||
|
mul x11, x11, x13
|
||||||
|
mul x12, x12, x13
|
||||||
|
mov x13, #16
|
||||||
|
mul x14, x6, x13
|
||||||
|
|
||||||
|
LoopH:
|
||||||
|
mov x15, x0
|
||||||
|
mov x16, x1
|
||||||
|
mov x17, x4
|
||||||
|
LoopW:
|
||||||
|
mov x18, x15
|
||||||
|
mov x19, x2
|
||||||
|
mov x20, x5
|
||||||
|
LoopKh:
|
||||||
|
mov x21, x18
|
||||||
|
mov x22, x19
|
||||||
|
mov x13, x6
|
||||||
|
LoopKw:
|
||||||
|
ld1 {v0.4s}, [x21]
|
||||||
|
ld1 {v1.4s}, [x16]
|
||||||
|
ld1 {v2.4s}, [x22], #16
|
||||||
|
fmla v0.4s, v1.4s, v2.4s
|
||||||
|
st1 {v0.4s}, [x21], x12
|
||||||
|
subs x13, x13, #1
|
||||||
|
bne LoopKw
|
||||||
|
add x18, x18, x11
|
||||||
|
add x19, x19, x14
|
||||||
|
subs x20, x20, #1
|
||||||
|
bne LoopKh
|
||||||
|
add x15, x15, x10
|
||||||
|
add x16, x16, x8
|
||||||
|
subs x17, x17, #1
|
||||||
|
bne LoopW
|
||||||
|
add x0, x0, x9
|
||||||
|
add x1, x1, x7
|
||||||
|
subs x3, x3, #1
|
||||||
|
bne LoopH
|
||||||
|
|
||||||
|
sub sp, sp, #32
|
||||||
|
ldp x19, x20, [sp], #16
|
||||||
|
ldp x21, x22, [sp], #16
|
||||||
|
ret
|
||||||
|
#endif
|
|
@ -81,20 +81,19 @@ void PostConvFuncFp32(const float *c4_out_ptr, float *out_ptr, const float *bias
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
int oc4 = UP_DIV(output_channel, C4NUM);
|
|
||||||
if (bias_ptr != nullptr) {
|
if (bias_ptr != nullptr) {
|
||||||
if (is_relu) {
|
if (is_relu) {
|
||||||
BiasAddRelu(bias_ptr, out_ptr, oc4, plane_size);
|
C4BiasAddRelu(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float));
|
||||||
} else if (is_relu6) {
|
} else if (is_relu6) {
|
||||||
BiasAddRelu6(bias_ptr, out_ptr, oc4, plane_size);
|
C4BiasAddRelu6(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float));
|
||||||
} else {
|
} else {
|
||||||
BiasAdd(bias_ptr, out_ptr, oc4, plane_size);
|
C4BiasAdd(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (is_relu) {
|
if (is_relu) {
|
||||||
Relu(out_ptr, oc4 * plane_size);
|
C4Relu(out_ptr, c4_out_ptr, output_channel, plane_size, stride * sizeof(float));
|
||||||
} else if (is_relu6) {
|
} else if (is_relu6) {
|
||||||
Relu6(out_ptr, oc4 * plane_size);
|
C4Relu6(out_ptr, c4_out_ptr, output_channel, plane_size, stride * sizeof(float));
|
||||||
} else {
|
} else {
|
||||||
// do nothing
|
// do nothing
|
||||||
}
|
}
|
||||||
|
|
|
@ -42,6 +42,17 @@ void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size);
|
||||||
void BiasAddRelu(const float *bias, float *data, size_t oc4, size_t plan_size);
|
void BiasAddRelu(const float *bias, float *data, size_t oc4, size_t plan_size);
|
||||||
void Relu6(float *data, size_t element4);
|
void Relu6(float *data, size_t element4);
|
||||||
void Relu(float *data, size_t element4);
|
void Relu(float *data, size_t element4);
|
||||||
|
void C4BiasAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride);
|
||||||
|
void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride);
|
||||||
|
void C4BiasAddRelu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride);
|
||||||
|
void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
|
||||||
|
void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
|
||||||
|
void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
|
||||||
|
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
|
||||||
|
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
|
||||||
|
void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
|
||||||
|
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
|
||||||
|
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h"
|
#include "src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h"
|
||||||
|
#include "src/runtime/kernel/arm/opclib/fp32/common_func.h"
|
||||||
#ifdef ENABLE_ARM64
|
#ifdef ENABLE_ARM64
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
#endif
|
#endif
|
||||||
|
@ -122,6 +123,10 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl
|
||||||
void DepthwiseCenter(float *dst, const float *src, const float *weight, const float *bias, int height, int width,
|
void DepthwiseCenter(float *dst, const float *src, const float *weight, const float *bias, int height, int width,
|
||||||
int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, int in_sw_step,
|
int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, int in_sw_step,
|
||||||
int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6) {
|
int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6) {
|
||||||
|
#ifdef ENABLE_ARM64
|
||||||
|
ConvDwFp32Center(dst, src, weight, bias, height, width, kernel_h, kernel_w, out_h_step, block_channel,
|
||||||
|
in_sh_step, in_sw_step, in_kh_step, in_kw_step, is_relu, is_relu6);
|
||||||
|
#else
|
||||||
float *dst_h = dst;
|
float *dst_h = dst;
|
||||||
const float *src_h = src;
|
const float *src_h = src;
|
||||||
for (int oh = 0; oh < height; oh++) {
|
for (int oh = 0; oh < height; oh++) {
|
||||||
|
@ -163,6 +168,7 @@ void DepthwiseCenter(float *dst, const float *src, const float *weight, const fl
|
||||||
dst_h += out_h_step;
|
dst_h += out_h_step;
|
||||||
src_h += in_sh_step;
|
src_h += in_sh_step;
|
||||||
} // dst_height loop
|
} // dst_height loop
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// conv depthwise fp32: sliding window
|
// conv depthwise fp32: sliding window
|
||||||
|
@ -262,6 +268,10 @@ void DeconvDepthwiseBorder(float *dst, const float *src, const float *weight, in
|
||||||
void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, int height, int width, int kernel_h,
|
void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, int height, int width, int kernel_h,
|
||||||
int kernel_w, int out_h_step, int block_channel, int in_sh_step, int in_sw_step,
|
int kernel_w, int out_h_step, int block_channel, int in_sh_step, int in_sw_step,
|
||||||
int in_kh_step, int in_kw_step) {
|
int in_kh_step, int in_kw_step) {
|
||||||
|
#ifdef ENABLE_ARM64
|
||||||
|
DeconvDwFp32Center(dst, src, weight, height, width, kernel_h, kernel_w, out_h_step, block_channel,
|
||||||
|
in_sh_step, in_sw_step, in_kh_step, in_kw_step);
|
||||||
|
#else
|
||||||
float *dst_h = dst;
|
float *dst_h = dst;
|
||||||
const float *src_h = src;
|
const float *src_h = src;
|
||||||
for (int oh = 0; oh < height; oh++) {
|
for (int oh = 0; oh < height; oh++) {
|
||||||
|
@ -297,6 +307,7 @@ void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, in
|
||||||
dst_h += in_sh_step;
|
dst_h += in_sh_step;
|
||||||
src_h += out_h_step;
|
src_h += out_h_step;
|
||||||
} // dst_height loop
|
} // dst_height loop
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void DeconvDepthwisePostFunc(float *dst, const float *bias, int block_channel, const ConvParameter *conv_param) {
|
void DeconvDepthwisePostFunc(float *dst, const float *bias, int block_channel, const ConvParameter *conv_param) {
|
||||||
|
|
Loading…
Reference in New Issue