forked from mindspore-Ecosystem/mindspore
!4739 [MS][LITE][Develop]add fp32 sliding window kernel
Merge pull request !4739 from lixian/master
This commit is contained in:
commit
0ec5a57072
|
@ -258,8 +258,7 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::tensor::Ten
|
|||
kernel =
|
||||
new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
|
||||
} else if (use_sw) {
|
||||
// kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
|
||||
kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
|
||||
kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
|
||||
} else {
|
||||
kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
|
||||
}
|
||||
|
|
|
@ -18,7 +18,9 @@ ConvDwFp32Center:
|
|||
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
|
||||
// x19 ~ x29 should also be preserved
|
||||
// whereas our coding style does not permit such a number of parameters
|
||||
sub sp, sp, #48
|
||||
sub sp, sp, #176
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
stp x21, x22, [sp], #16
|
||||
stp x23, x24, [sp], #16
|
||||
|
@ -287,7 +289,9 @@ ConvDwFp32Center:
|
|||
subs x4, x4, #1
|
||||
bne LoopH
|
||||
|
||||
sub sp, sp, #48
|
||||
sub sp, sp, #176
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ldp x21, x22, [sp], #16
|
||||
ldp x23, x24, [sp], #16
|
||||
|
|
|
@ -19,7 +19,9 @@ ConvDwInt8Center:
|
|||
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
|
||||
// x19 ~ x29 should also be preserved
|
||||
// whereas our coding style does not permit such a number of parameters
|
||||
sub sp, sp, #48
|
||||
sub sp, sp, #176
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
stp x21, x22, [sp], #16
|
||||
stp x23, x24, [sp], #16
|
||||
|
@ -631,7 +633,9 @@ ConvDwInt8Center:
|
|||
subs x4, x4, #1
|
||||
bne LoopH
|
||||
|
||||
sub sp, sp, #48
|
||||
sub sp, sp, #176
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ldp x21, x22, [sp], #16
|
||||
ldp x23, x24, [sp], #16
|
||||
|
|
|
@ -0,0 +1,446 @@
|
|||
#ifdef __aarch64__
|
||||
|
||||
.text
|
||||
.align 5
|
||||
.global ConvSwFp32Center
|
||||
#ifndef __APPLE__
|
||||
.type ConvSwFp32Center, %function
|
||||
#endif
|
||||
|
||||
// ConvSwFp32Center: fp32 sliding-window convolution kernel. For every (row, column) position it
// accumulates a 4-lane result over kernel_h x kernel_w x ic4 input blocks, starting from the bias
// vector, optionally clamps it (relu / relu6) and stores it to dst.
// The first eight arguments arrive in x0-x7; the remaining nine are read from the caller's stack.
// void ConvSwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
|
||||
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t ic4, size_t in_sh_step,
|
||||
// size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
|
||||
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
|
||||
// x8: out_h_step, x9: block_channel, x10: ic4, x11: in_sh_step, x12: in_sw_step, x13: in_kh_step, x14: in_kw_step
|
||||
// x26: relu, x16: relu6 (both are re-read from the stack after each tile, see [sp, #56] / [sp, #64])
|
||||
ConvSwFp32Center:
|
||||
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
|
||||
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
|
||||
// x19 ~ x29 should also be preserved
|
||||
// whereas our coding style does not permit such a number of parameters
|
||||
// Save area is 208 bytes: 2 x 64 bytes for v8-v15 plus 5 x 16 bytes for x19-x28.
// The post-indexed stores below add 208 back to sp, so sp ends at its entry value and
// [sp + n] addresses the caller's stack arguments again.
// NOTE(review): the saved registers therefore sit below sp for the whole function body -
// confirm this is acceptable on every target (memory below sp is not reserved by AAPCS64).
sub sp, sp, #208
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
stp x21, x22, [sp], #16
|
||||
stp x23, x24, [sp], #16
|
||||
stp x25, x26, [sp], #16
|
||||
stp x27, x28, [sp], #16
|
||||
|
||||
// Load stack arguments 9..15 (out_h_step .. in_kw_step), one 8-byte slot each.
ldr x8, [sp]
|
||||
ldr x9, [sp, #8]
|
||||
ldr x10, [sp, #16]
|
||||
ldr x11, [sp, #24]
|
||||
ldr x12, [sp, #32]
|
||||
ldr x13, [sp, #40]
|
||||
ldr x14, [sp, #48]
|
||||
// x15 = kernel_h * kernel_w * ic4 * 16: byte distance between the four weight rows
// that the inner loops load with "ld1 {..}, [x16], x15".
mul x15, x6, x7
|
||||
mul x15, x10, x15
|
||||
mov x16, #16
|
||||
mul x15, x15, x16
|
||||
|
||||
// Constants kept for the whole call: v25 = bias (4 floats), v26 = 6.0f in every lane
// (integer 6 converted with scvtf), v27 = 0 in every lane.
// x3 is free after this load and is reused as the running dst pointer.
ld1 {v25.4s}, [x3]
|
||||
movi v26.4s, #6
|
||||
scvtf v26.4s, v26.4s
|
||||
dup v27.4s, wzr
|
||||
|
||||
// LoopH: start of one output row. Sets up the per-row cursors and picks the widest tile that fits:
// x17 = src cursor for this row, x18 = columns left, x3 = dst cursor for this row.
// width < 8 -> LoopW (1 column), 8 <= width < 16 -> LoopW8, otherwise falls through to LoopW16.
// NOTE(review): width == 0 would enter LoopW, whose "subs/bne" counter would wrap - presumably
// callers guarantee width >= 1; confirm.
// NOTE(review): x18 is the AAPCS64 platform register and this file also builds for __APPLE__;
// confirm using it as a scratch counter is safe on all targets.
LoopH:
|
||||
mov x17, x1
|
||||
mov x18, x5
|
||||
mov x3, x0
|
||||
cmp x18, #8
|
||||
blt LoopW
|
||||
cmp x18, #16
|
||||
blt LoopW8
|
||||
|
||||
// LoopW16: computes 16 adjacent output columns at once.
// v0-v15 are the 16 accumulators (one per column), each initialised with the bias vector v25.
// x19 = 16 * in_sw_step (bytes to advance src after the tile), x20 = src cursor for the kernel row,
// x21 = weight cursor (walks linearly through the weights, never rewound inside the tile),
// x22 = remaining kernel rows.
LoopW16:
|
||||
mov x19, #16
|
||||
mul x19, x19, x12
|
||||
mov x20, x17
|
||||
mov x21, x2
|
||||
mov x22, x6
|
||||
mov v0.16b, v25.16b
|
||||
mov v1.16b, v25.16b
|
||||
mov v2.16b, v25.16b
|
||||
mov v3.16b, v25.16b
|
||||
mov v4.16b, v25.16b
|
||||
mov v5.16b, v25.16b
|
||||
mov v6.16b, v25.16b
|
||||
mov v7.16b, v25.16b
|
||||
mov v8.16b, v25.16b
|
||||
mov v9.16b, v25.16b
|
||||
mov v10.16b, v25.16b
|
||||
mov v11.16b, v25.16b
|
||||
mov v12.16b, v25.16b
|
||||
mov v13.16b, v25.16b
|
||||
mov v14.16b, v25.16b
|
||||
mov v15.16b, v25.16b
|
||||
// Kernel-height loop: x23 = remaining kernel columns, x24 = src cursor within this kernel row.
LoopKh16:
|
||||
mov x23, x7
|
||||
mov x24, x20
|
||||
// Kernel-width loop: x25 = src cursor within this kernel tap, x27 = remaining ic4 blocks.
LoopKw16:
|
||||
mov x25, x24
|
||||
mov x27, x10
|
||||
// Input-channel loop: one iteration consumes 4 input values per column and a 4x4 weight tile.
LoopIc16:
|
||||
mov x26, x25
|
||||
mov x16, x21
|
||||
// Load four weight rows that lie x15 bytes apart, then transpose the 4x4 tile
// (zip1/zip2 on .4s followed by trn1/trn2 on .2d) so that v28+k holds element k of every row.
// The input loads for the first columns are interleaved with the transpose.
ld1 {v28.4s}, [x16], x15
|
||||
ld1 {v29.4s}, [x16], x15
|
||||
ld1 {v30.4s}, [x16], x15
|
||||
ld1 {v31.4s}, [x16], x15
|
||||
zip1 v20.4s, v28.4s, v29.4s
|
||||
zip2 v21.4s, v28.4s, v29.4s
|
||||
zip1 v22.4s, v30.4s, v31.4s
|
||||
zip2 v23.4s, v30.4s, v31.4s
|
||||
ld1 {v16.4s}, [x26], x12
|
||||
ld1 {v17.4s}, [x26], x12
|
||||
trn1 v28.2d, v20.2d, v22.2d
|
||||
trn2 v29.2d, v20.2d, v22.2d
|
||||
trn1 v30.2d, v21.2d, v23.2d
|
||||
trn2 v31.2d, v21.2d, v23.2d
|
||||
ld1 {v18.4s}, [x26], x12
|
||||
ld1 {v19.4s}, [x26], x12
|
||||
// From here on: each column accumulator gets four fmla's (one per input lane); input vectors
// for the next pair of columns are loaded (x26 += in_sw_step) while the current pair is computed.
// v16-v23 are recycled as input registers once the transpose no longer needs v20-v23.
fmla v0.4s, v28.4s, v16.s[0]
|
||||
fmla v1.4s, v28.4s, v17.s[0]
|
||||
fmla v0.4s, v29.4s, v16.s[1]
|
||||
fmla v1.4s, v29.4s, v17.s[1]
|
||||
fmla v0.4s, v30.4s, v16.s[2]
|
||||
fmla v1.4s, v30.4s, v17.s[2]
|
||||
fmla v0.4s, v31.4s, v16.s[3]
|
||||
fmla v1.4s, v31.4s, v17.s[3]
|
||||
ld1 {v20.4s}, [x26], x12
|
||||
ld1 {v21.4s}, [x26], x12
|
||||
fmla v2.4s, v28.4s, v18.s[0]
|
||||
fmla v3.4s, v28.4s, v19.s[0]
|
||||
fmla v2.4s, v29.4s, v18.s[1]
|
||||
fmla v3.4s, v29.4s, v19.s[1]
|
||||
fmla v2.4s, v30.4s, v18.s[2]
|
||||
fmla v3.4s, v30.4s, v19.s[2]
|
||||
fmla v2.4s, v31.4s, v18.s[3]
|
||||
fmla v3.4s, v31.4s, v19.s[3]
|
||||
ld1 {v22.4s}, [x26], x12
|
||||
ld1 {v23.4s}, [x26], x12
|
||||
fmla v4.4s, v28.4s, v20.s[0]
|
||||
fmla v5.4s, v28.4s, v21.s[0]
|
||||
fmla v4.4s, v29.4s, v20.s[1]
|
||||
fmla v5.4s, v29.4s, v21.s[1]
|
||||
fmla v4.4s, v30.4s, v20.s[2]
|
||||
fmla v5.4s, v30.4s, v21.s[2]
|
||||
fmla v4.4s, v31.4s, v20.s[3]
|
||||
fmla v5.4s, v31.4s, v21.s[3]
|
||||
ld1 {v16.4s}, [x26], x12
|
||||
ld1 {v17.4s}, [x26], x12
|
||||
fmla v6.4s, v28.4s, v22.s[0]
|
||||
fmla v7.4s, v28.4s, v23.s[0]
|
||||
fmla v6.4s, v29.4s, v22.s[1]
|
||||
fmla v7.4s, v29.4s, v23.s[1]
|
||||
fmla v6.4s, v30.4s, v22.s[2]
|
||||
fmla v7.4s, v30.4s, v23.s[2]
|
||||
fmla v6.4s, v31.4s, v22.s[3]
|
||||
fmla v7.4s, v31.4s, v23.s[3]
|
||||
ld1 {v18.4s}, [x26], x12
|
||||
ld1 {v19.4s}, [x26], x12
|
||||
fmla v8.4s, v28.4s, v16.s[0]
|
||||
fmla v9.4s, v28.4s, v17.s[0]
|
||||
fmla v8.4s, v29.4s, v16.s[1]
|
||||
fmla v9.4s, v29.4s, v17.s[1]
|
||||
fmla v8.4s, v30.4s, v16.s[2]
|
||||
fmla v9.4s, v30.4s, v17.s[2]
|
||||
fmla v8.4s, v31.4s, v16.s[3]
|
||||
fmla v9.4s, v31.4s, v17.s[3]
|
||||
ld1 {v20.4s}, [x26], x12
|
||||
ld1 {v21.4s}, [x26], x12
|
||||
fmla v10.4s, v28.4s, v18.s[0]
|
||||
fmla v11.4s, v28.4s, v19.s[0]
|
||||
fmla v10.4s, v29.4s, v18.s[1]
|
||||
fmla v11.4s, v29.4s, v19.s[1]
|
||||
fmla v10.4s, v30.4s, v18.s[2]
|
||||
fmla v11.4s, v30.4s, v19.s[2]
|
||||
fmla v10.4s, v31.4s, v18.s[3]
|
||||
fmla v11.4s, v31.4s, v19.s[3]
|
||||
ld1 {v22.4s}, [x26], x12
|
||||
ld1 {v23.4s}, [x26], x12
|
||||
fmla v12.4s, v28.4s, v20.s[0]
|
||||
fmla v13.4s, v28.4s, v21.s[0]
|
||||
fmla v12.4s, v29.4s, v20.s[1]
|
||||
fmla v13.4s, v29.4s, v21.s[1]
|
||||
fmla v12.4s, v30.4s, v20.s[2]
|
||||
fmla v13.4s, v30.4s, v21.s[2]
|
||||
fmla v12.4s, v31.4s, v20.s[3]
|
||||
fmla v13.4s, v31.4s, v21.s[3]
|
||||
fmla v14.4s, v28.4s, v22.s[0]
|
||||
fmla v15.4s, v28.4s, v23.s[0]
|
||||
fmla v14.4s, v29.4s, v22.s[1]
|
||||
fmla v15.4s, v29.4s, v23.s[1]
|
||||
fmla v14.4s, v30.4s, v22.s[2]
|
||||
fmla v15.4s, v30.4s, v23.s[2]
|
||||
fmla v14.4s, v31.4s, v22.s[3]
|
||||
fmla v15.4s, v31.4s, v23.s[3]
|
||||
// Advance weight and src cursors by one 4-float block (16 bytes) and loop over ic4.
add x21, x21, #16
|
||||
add x25, x25, #16
|
||||
subs x27, x27, #1
|
||||
bgt LoopIc16
|
||||
// Next kernel column: the "add" does not touch flags, so bne still tests the subs result.
subs x23, x23, #1
|
||||
add x24, x24, x14
|
||||
bne LoopKw16
|
||||
// Next kernel row.
add x20, x20, x13
|
||||
subs x22, x22, #1
|
||||
bne LoopKh16
|
||||
// Activation select: relu6 flag is at [sp, #64], relu flag at [sp, #56].
// relu6 takes priority and falls through into the relu clamp; neither flag -> plain store.
ldr x16, [sp, #64]
|
||||
cbnz x16, Relu616
|
||||
ldr x26, [sp, #56]
|
||||
cbnz x26, Relu16
|
||||
b Write16
|
||||
// Upper clamp: min(acc, 6.0).
Relu616:
|
||||
fmin v0.4s, v0.4s, v26.4s
|
||||
fmin v1.4s, v1.4s, v26.4s
|
||||
fmin v2.4s, v2.4s, v26.4s
|
||||
fmin v3.4s, v3.4s, v26.4s
|
||||
fmin v4.4s, v4.4s, v26.4s
|
||||
fmin v5.4s, v5.4s, v26.4s
|
||||
fmin v6.4s, v6.4s, v26.4s
|
||||
fmin v7.4s, v7.4s, v26.4s
|
||||
fmin v8.4s, v8.4s, v26.4s
|
||||
fmin v9.4s, v9.4s, v26.4s
|
||||
fmin v10.4s, v10.4s, v26.4s
|
||||
fmin v11.4s, v11.4s, v26.4s
|
||||
fmin v12.4s, v12.4s, v26.4s
|
||||
fmin v13.4s, v13.4s, v26.4s
|
||||
fmin v14.4s, v14.4s, v26.4s
|
||||
fmin v15.4s, v15.4s, v26.4s
|
||||
// Lower clamp: max(acc, 0).
Relu16:
|
||||
fmax v0.4s, v0.4s, v27.4s
|
||||
fmax v1.4s, v1.4s, v27.4s
|
||||
fmax v2.4s, v2.4s, v27.4s
|
||||
fmax v3.4s, v3.4s, v27.4s
|
||||
fmax v4.4s, v4.4s, v27.4s
|
||||
fmax v5.4s, v5.4s, v27.4s
|
||||
fmax v6.4s, v6.4s, v27.4s
|
||||
fmax v7.4s, v7.4s, v27.4s
|
||||
fmax v8.4s, v8.4s, v27.4s
|
||||
fmax v9.4s, v9.4s, v27.4s
|
||||
fmax v10.4s, v10.4s, v27.4s
|
||||
fmax v11.4s, v11.4s, v27.4s
|
||||
fmax v12.4s, v12.4s, v27.4s
|
||||
fmax v13.4s, v13.4s, v27.4s
|
||||
fmax v14.4s, v14.4s, v27.4s
|
||||
fmax v15.4s, v15.4s, v27.4s
|
||||
// Store the 16 results; dst advances by block_channel (x9) per column.
Write16:
|
||||
st1 {v0.4s}, [x3], x9
|
||||
st1 {v1.4s}, [x3], x9
|
||||
st1 {v2.4s}, [x3], x9
|
||||
st1 {v3.4s}, [x3], x9
|
||||
st1 {v4.4s}, [x3], x9
|
||||
st1 {v5.4s}, [x3], x9
|
||||
st1 {v6.4s}, [x3], x9
|
||||
st1 {v7.4s}, [x3], x9
|
||||
st1 {v8.4s}, [x3], x9
|
||||
st1 {v9.4s}, [x3], x9
|
||||
st1 {v10.4s}, [x3], x9
|
||||
st1 {v11.4s}, [x3], x9
|
||||
st1 {v12.4s}, [x3], x9
|
||||
st1 {v13.4s}, [x3], x9
|
||||
st1 {v14.4s}, [x3], x9
|
||||
st1 {v15.4s}, [x3], x9
|
||||
// Advance src by 16 columns and re-dispatch on the columns left:
// none -> LoopWEnd, < 8 -> LoopW, >= 16 -> LoopW16 again, otherwise fall through to LoopW8.
add x17, x17, x19
|
||||
sub x18, x18, #16
|
||||
cmp x18, #0
|
||||
ble LoopWEnd
|
||||
cmp x18, #8
|
||||
blt LoopW
|
||||
cmp x18, #16
|
||||
bge LoopW16
|
||||
// LoopW8: computes 8 adjacent output columns at once; same structure as the 16-column tile
// but with accumulators v0-v7 only.
// x19 = 8 * in_sw_step, x20 = src cursor for the kernel row, x21 = weight cursor,
// x22 = remaining kernel rows.
LoopW8:
|
||||
mov x19, #8
|
||||
mul x19, x19, x12
|
||||
mov x20, x17
|
||||
mov x21, x2
|
||||
mov x22, x6
|
||||
mov v0.16b, v25.16b
|
||||
mov v1.16b, v25.16b
|
||||
mov v2.16b, v25.16b
|
||||
mov v3.16b, v25.16b
|
||||
mov v4.16b, v25.16b
|
||||
mov v5.16b, v25.16b
|
||||
mov v6.16b, v25.16b
|
||||
mov v7.16b, v25.16b
|
||||
// Kernel-height loop: x23 = remaining kernel columns, x24 = src cursor within this kernel row.
LoopKh8:
|
||||
mov x23, x7
|
||||
mov x24, x20
|
||||
// Kernel-width loop: x25 = src cursor within this kernel tap, x27 = remaining ic4 blocks.
LoopKw8:
|
||||
mov x25, x24
|
||||
mov x27, x10
|
||||
// Input-channel loop: load and transpose a 4x4 weight tile, then accumulate 8 columns.
LoopIc8:
|
||||
mov x26, x25
|
||||
mov x16, x21
|
||||
ld1 {v28.4s}, [x16], x15
|
||||
ld1 {v29.4s}, [x16], x15
|
||||
ld1 {v30.4s}, [x16], x15
|
||||
ld1 {v31.4s}, [x16], x15
|
||||
zip1 v20.4s, v28.4s, v29.4s
|
||||
zip2 v21.4s, v28.4s, v29.4s
|
||||
zip1 v22.4s, v30.4s, v31.4s
|
||||
zip2 v23.4s, v30.4s, v31.4s
|
||||
ld1 {v16.4s}, [x26], x12
|
||||
ld1 {v17.4s}, [x26], x12
|
||||
trn1 v28.2d, v20.2d, v22.2d
|
||||
trn2 v29.2d, v20.2d, v22.2d
|
||||
trn1 v30.2d, v21.2d, v23.2d
|
||||
trn2 v31.2d, v21.2d, v23.2d
|
||||
ld1 {v18.4s}, [x26], x12
|
||||
ld1 {v19.4s}, [x26], x12
|
||||
// v16-v23 hold the 8 input vectors (one per column); v20-v23 are reloaded as inputs
// after the transpose above has consumed them.
fmla v0.4s, v28.4s, v16.s[0]
|
||||
fmla v1.4s, v28.4s, v17.s[0]
|
||||
fmla v0.4s, v29.4s, v16.s[1]
|
||||
fmla v1.4s, v29.4s, v17.s[1]
|
||||
fmla v0.4s, v30.4s, v16.s[2]
|
||||
fmla v1.4s, v30.4s, v17.s[2]
|
||||
fmla v0.4s, v31.4s, v16.s[3]
|
||||
fmla v1.4s, v31.4s, v17.s[3]
|
||||
ld1 {v20.4s}, [x26], x12
|
||||
ld1 {v21.4s}, [x26], x12
|
||||
fmla v2.4s, v28.4s, v18.s[0]
|
||||
fmla v3.4s, v28.4s, v19.s[0]
|
||||
fmla v2.4s, v29.4s, v18.s[1]
|
||||
fmla v3.4s, v29.4s, v19.s[1]
|
||||
fmla v2.4s, v30.4s, v18.s[2]
|
||||
fmla v3.4s, v30.4s, v19.s[2]
|
||||
fmla v2.4s, v31.4s, v18.s[3]
|
||||
fmla v3.4s, v31.4s, v19.s[3]
|
||||
ld1 {v22.4s}, [x26], x12
|
||||
ld1 {v23.4s}, [x26], x12
|
||||
fmla v4.4s, v28.4s, v20.s[0]
|
||||
fmla v5.4s, v28.4s, v21.s[0]
|
||||
fmla v4.4s, v29.4s, v20.s[1]
|
||||
fmla v5.4s, v29.4s, v21.s[1]
|
||||
fmla v4.4s, v30.4s, v20.s[2]
|
||||
fmla v5.4s, v30.4s, v21.s[2]
|
||||
fmla v4.4s, v31.4s, v20.s[3]
|
||||
fmla v5.4s, v31.4s, v21.s[3]
|
||||
fmla v6.4s, v28.4s, v22.s[0]
|
||||
fmla v7.4s, v28.4s, v23.s[0]
|
||||
fmla v6.4s, v29.4s, v22.s[1]
|
||||
fmla v7.4s, v29.4s, v23.s[1]
|
||||
fmla v6.4s, v30.4s, v22.s[2]
|
||||
fmla v7.4s, v30.4s, v23.s[2]
|
||||
fmla v6.4s, v31.4s, v22.s[3]
|
||||
fmla v7.4s, v31.4s, v23.s[3]
|
||||
// Advance weight and src cursors by one 4-float block (16 bytes) and loop over ic4.
add x21, x21, #16
|
||||
add x25, x25, #16
|
||||
subs x27, x27, #1
|
||||
bgt LoopIc8
|
||||
// Next kernel column (flags come from the subs; the add leaves them untouched).
subs x23, x23, #1
|
||||
add x24, x24, x14
|
||||
bne LoopKw8
|
||||
// Next kernel row.
add x20, x20, x13
|
||||
subs x22, x22, #1
|
||||
bne LoopKh8
|
||||
// Activation select: relu6 ([sp, #64]) first, then relu ([sp, #56]); relu6 falls through into relu.
ldr x16, [sp, #64]
|
||||
cbnz x16, Relu68
|
||||
ldr x26, [sp, #56]
|
||||
cbnz x26, Relu8
|
||||
b Write8
|
||||
// Upper clamp: min(acc, 6.0).
Relu68:
|
||||
fmin v0.4s, v0.4s, v26.4s
|
||||
fmin v1.4s, v1.4s, v26.4s
|
||||
fmin v2.4s, v2.4s, v26.4s
|
||||
fmin v3.4s, v3.4s, v26.4s
|
||||
fmin v4.4s, v4.4s, v26.4s
|
||||
fmin v5.4s, v5.4s, v26.4s
|
||||
fmin v6.4s, v6.4s, v26.4s
|
||||
fmin v7.4s, v7.4s, v26.4s
|
||||
// Lower clamp: max(acc, 0).
Relu8:
|
||||
fmax v0.4s, v0.4s, v27.4s
|
||||
fmax v1.4s, v1.4s, v27.4s
|
||||
fmax v2.4s, v2.4s, v27.4s
|
||||
fmax v3.4s, v3.4s, v27.4s
|
||||
fmax v4.4s, v4.4s, v27.4s
|
||||
fmax v5.4s, v5.4s, v27.4s
|
||||
fmax v6.4s, v6.4s, v27.4s
|
||||
fmax v7.4s, v7.4s, v27.4s
|
||||
// Store the 8 results; dst advances by block_channel (x9) per column.
Write8:
|
||||
st1 {v0.4s}, [x3], x9
|
||||
st1 {v1.4s}, [x3], x9
|
||||
st1 {v2.4s}, [x3], x9
|
||||
st1 {v3.4s}, [x3], x9
|
||||
st1 {v4.4s}, [x3], x9
|
||||
st1 {v5.4s}, [x3], x9
|
||||
st1 {v6.4s}, [x3], x9
|
||||
st1 {v7.4s}, [x3], x9
|
||||
// Advance src by 8 columns: none left -> LoopWEnd, >= 8 left -> LoopW8 again,
// otherwise fall through to the single-column loop.
add x17, x17, x19
|
||||
sub x18, x18, #8
|
||||
cmp x18, #0
|
||||
ble LoopWEnd
|
||||
cmp x18, #8
|
||||
bge LoopW8
|
||||
// LoopW: computes one output column per iteration (tail of a row, or rows narrower than 8).
// v0 is the single accumulator, initialised with the bias vector v25.
// x20 = src cursor for the kernel row, x21 = weight cursor, x22 = remaining kernel rows.
LoopW:
|
||||
mov x20, x17
|
||||
mov x21, x2
|
||||
mov x22, x6
|
||||
mov v0.16b, v25.16b
|
||||
// Kernel-height loop: x23 = remaining kernel columns, x24 = src cursor within this kernel row.
LoopKh:
|
||||
mov x23, x7
|
||||
mov x24, x20
|
||||
// Kernel-width loop: x25 = src cursor within this kernel tap, x27 = remaining ic4 blocks.
LoopKw:
|
||||
mov x25, x24
|
||||
mov x27, x10
|
||||
// Input-channel loop: load four weight rows (x15 bytes apart), transpose the 4x4 tile and
// accumulate the 4 input values of v16 against it.
LoopIc:
|
||||
mov x26, x25
|
||||
mov x16, x21
|
||||
ld1 {v28.4s}, [x16], x15
|
||||
ld1 {v29.4s}, [x16], x15
|
||||
ld1 {v30.4s}, [x16], x15
|
||||
ld1 {v31.4s}, [x16], x15
|
||||
zip1 v20.4s, v28.4s, v29.4s
|
||||
zip2 v21.4s, v28.4s, v29.4s
|
||||
zip1 v22.4s, v30.4s, v31.4s
|
||||
zip2 v23.4s, v30.4s, v31.4s
|
||||
ld1 {v16.4s}, [x26], x12
|
||||
trn1 v28.2d, v20.2d, v22.2d
|
||||
trn2 v29.2d, v20.2d, v22.2d
|
||||
trn1 v30.2d, v21.2d, v23.2d
|
||||
trn2 v31.2d, v21.2d, v23.2d
|
||||
fmla v0.4s, v28.4s, v16.s[0]
|
||||
fmla v0.4s, v29.4s, v16.s[1]
|
||||
fmla v0.4s, v30.4s, v16.s[2]
|
||||
fmla v0.4s, v31.4s, v16.s[3]
|
||||
// Advance weight and src cursors by one 4-float block (16 bytes) and loop over ic4.
add x21, x21, #16
|
||||
add x25, x25, #16
|
||||
subs x27, x27, #1
|
||||
bgt LoopIc
|
||||
// Next kernel column (flags come from the subs; the add leaves them untouched).
subs x23, x23, #1
|
||||
add x24, x24, x14
|
||||
bne LoopKw
|
||||
// Next kernel row.
add x20, x20, x13
|
||||
subs x22, x22, #1
|
||||
bne LoopKh
|
||||
// Activation select: relu6 ([sp, #64]) first, then relu ([sp, #56]); relu6 falls through into relu.
ldr x16, [sp, #64]
|
||||
cbnz x16, Relu6
|
||||
ldr x26, [sp, #56]
|
||||
cbnz x26, Relu
|
||||
b Write
|
||||
// Upper clamp: min(acc, 6.0).
Relu6:
|
||||
fmin v0.4s, v0.4s, v26.4s
|
||||
// Lower clamp: max(acc, 0).
Relu:
|
||||
fmax v0.4s, v0.4s, v27.4s
|
||||
// Store one result, step src by in_sw_step and dst by block_channel, repeat until no columns remain.
Write:
|
||||
st1 {v0.4s}, [x3], x9
|
||||
add x17, x17, x12
|
||||
subs x18, x18, #1
|
||||
bne LoopW
|
||||
// LoopWEnd: end of one output row. Advance dst by out_h_step (x8) and src by in_sh_step (x11),
// then continue with the next row until height (x4) reaches zero.
// NOTE(review): height == 0 on entry would make this counter wrap - presumably callers
// guarantee height >= 1; confirm.
LoopWEnd:
|
||||
add x0, x0, x8
|
||||
add x1, x1, x11
|
||||
subs x4, x4, #1
|
||||
bne LoopH
|
||||
|
||||
// Epilogue: step sp back down to the 208-byte save area written by the prologue and reload
// v8-v15 and x19-x28; the post-indexed loads leave sp at its entry value before returning.
sub sp, sp, #208
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ldp x21, x22, [sp], #16
|
||||
ldp x23, x24, [sp], #16
|
||||
ldp x25, x26, [sp], #16
|
||||
ldp x27, x28, [sp], #16
|
||||
ret
|
||||
#endif
|
|
@ -71,6 +71,11 @@ void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_
|
|||
|
||||
void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod,
|
||||
size_t plane_size, size_t stride, size_t relu_type);
|
||||
|
||||
// Sliding-window fp32 convolution over the "center" region; implemented in AArch64 assembly.
// The ConvSWFp32 call site passes out_h_step, block_channel and the in_*_step arguments
// multiplied by sizeof(float), i.e. as byte strides; ic4 is passed unscaled.
// relu / relu6 are treated as flags: any non-zero value enables the corresponding clamp.
void ConvSwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height,
|
||||
size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
|
||||
size_t ic4, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
|
||||
size_t relu, size_t relu6);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
#include "nnacl/fp32/conv.h"
|
||||
#include <string.h>
|
||||
#include "nnacl/fp32/common_func.h"
|
||||
#include "nnacl/winograd_transform.h"
|
||||
|
||||
void SWBorderPixel(float *dst, const float *src, const float *weight, const float *bias, int height, int width,
|
||||
|
@ -83,6 +84,7 @@ void SWBorder(float *dst, const float *src, const float *weight, const float *bi
|
|||
} // height loop
|
||||
}
|
||||
|
||||
#ifndef ENABLE_ARM64
|
||||
void SWCenter(float *dst, const float *src, const float *weight, const float *bias, int height, int width, int kernel_h,
|
||||
int kernel_w, int out_h_step, int block_channel, int ic4, int in_sh_step, int in_sw_step, int in_kh_step,
|
||||
int in_kw_step, bool is_relu, bool is_relu6) {
|
||||
|
@ -135,6 +137,7 @@ void SWCenter(float *dst, const float *src, const float *weight, const float *bi
|
|||
src_h += in_sh_step;
|
||||
} // dst_height loop
|
||||
}
|
||||
#endif
|
||||
|
||||
// fp32 sliding window
|
||||
void ConvSWFp32(const float *input_data, const float *packed_weight, const float *bias_data, float *tmp_out_block,
|
||||
|
@ -172,11 +175,23 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
|
|||
src_data + in_h_start * slidingWindow_param->in_h_step_ + in_w_start * slidingWindow_param->ic4_channel_;
|
||||
float *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ +
|
||||
slidingWindow_param->left_ * slidingWindow_param->block_channel_;
|
||||
#ifdef ENABLE_ARM64
|
||||
ConvSwFp32Center(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
|
||||
slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_,
|
||||
conv_param->kernel_w_, slidingWindow_param->out_h_step_ * sizeof(float),
|
||||
slidingWindow_param->block_channel_ * sizeof(float), ic4,
|
||||
slidingWindow_param->in_sh_step_ * sizeof(float),
|
||||
slidingWindow_param->in_sw_step_ * sizeof(float),
|
||||
slidingWindow_param->in_kh_step_ * sizeof(float),
|
||||
slidingWindow_param->in_kw_step_ * sizeof(float),
|
||||
conv_param->is_relu_, conv_param->is_relu6_);
|
||||
#else
|
||||
SWCenter(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
|
||||
slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, conv_param->kernel_w_,
|
||||
slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4,
|
||||
slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_,
|
||||
conv_param->kernel_w_, slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4,
|
||||
slidingWindow_param->in_sh_step_, slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_,
|
||||
slidingWindow_param->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
|
||||
#endif
|
||||
}
|
||||
} // output C4 loop
|
||||
src += slidingWindow_param->in_step_;
|
||||
|
|
Loading…
Reference in New Issue