forked from mindspore-Ecosystem/mindspore
!4523 [MS][LITE] optimize arm cpu fp32/fp16 op: add assembly file for deconv depthwise border
Merge pull request !4523 from yangruoqi713/deconv_dw
This commit is contained in:
commit
1745c1c1d7
|
@ -0,0 +1,39 @@
|
|||
#ifdef __aarch64__

.text
.align 5
.global DeconvDwFp32Border
#ifndef __APPLE__
.type DeconvDwFp32Border, %function
#endif

// void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_t height, size_t width,
//                         size_t in_kh_step, size_t in_kw_step, size_t kernel_w)
//
// Accumulates one 4-float source vector into a height x width window of dst:
//     dst[h][w][0..3] += src[0..3] * weight[h][w][0..3]
//
// x0: dst         first destination vector (read, updated and written back)
// x1: src         4 floats, loaded once and reused for every (h, w)
// x2: weight      first weight vector; consecutive w positions are 16 bytes apart
// x3: height      number of rows
// x4: width       number of columns
// x5: in_kh_step  distance between two dst rows, in BYTES
// x6: in_kw_step  distance between two dst columns, in BYTES
// x7: kernel_w    distance between two weight rows, in BYTES (the caller passes
//                 kernel_w * C4NUM * sizeof(float), not the raw kernel width)
//
// The function returns immediately when height or width is zero.
DeconvDwFp32Border:
    // registers v8 ~ v15 and x19 ~ x29 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // this routine only touches x3, x13 ~ x17 and v0 ~ v2, so nothing has to be saved or restored

    // both loops test their counter at the bottom; a zero count would wrap around and
    // run far past the buffers, so bail out before touching any memory
    cbz x3, End
    cbz x4, End

    ld1 {v1.4s}, [x1]               // v1 = src vector, constant for the whole call

    mov x13, x0                     // x13 = dst at the start of the current row
    mov x14, x2                     // x14 = weight at the start of the current row
    LoopH:
        mov x15, x13                // x15 = dst cursor inside the row
        mov x16, x14                // x16 = weight cursor inside the row
        mov x17, x4                 // x17 = columns left in this row
        LoopW:
            ld1 {v0.4s}, [x15]              // current dst value
            ld1 {v2.4s}, [x16], #16         // weight vector, then step to the next one
            fmla v0.4s, v1.4s, v2.4s        // dst += src * weight
            st1 {v0.4s}, [x15], x6          // write back, then step dst by in_kw_step
            subs x17, x17, #1
            bne LoopW
        subs x3, x3, #1             // sets the flags consumed by "bne LoopH" (add leaves flags alone)
        add x13, x13, x5            // next dst row
        add x14, x14, x7            // next weight row
        bne LoopH
    End:
        ret
#endif
|
|
@ -0,0 +1,39 @@
|
|||
#ifdef __aarch64__

.text
.align 5
.global DeconvDwFp16Border
#ifndef __APPLE__
.type DeconvDwFp16Border, %function
#endif

// void DeconvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height,
//                         size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w)
//
// Accumulates one 8-half source vector into a height x width window of dst:
//     dst[h][w][0..7] += src[0..7] * weight[h][w][0..7]
//
// x0: dst         first destination vector (read, updated and written back)
// x1: src         8 half-precision values, loaded once and reused for every (h, w)
// x2: weight      first weight vector; consecutive w positions are 16 bytes apart
// x3: height      number of rows
// x4: width       number of columns
// x5: in_kh_step  distance between two dst rows, in BYTES
// x6: in_kw_step  distance between two dst columns, in BYTES
// x7: kernel_w    distance between two weight rows, in BYTES (the caller passes
//                 kernel_w * C8NUM * sizeof(float16_t), not the raw kernel width)
//
// The function returns immediately when height or width is zero.
DeconvDwFp16Border:
    // registers v8 ~ v15 and x19 ~ x29 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // this routine only touches x3, x13 ~ x17 and v0 ~ v2, so nothing has to be saved or restored

    // both loops test their counter at the bottom; a zero count would wrap around and
    // run far past the buffers, so bail out before touching any memory
    cbz x3, End
    cbz x4, End

    ld1 {v1.8h}, [x1]               // v1 = src vector, constant for the whole call

    mov x13, x0                     // x13 = dst at the start of the current row
    mov x14, x2                     // x14 = weight at the start of the current row
    LoopH:
        mov x15, x13                // x15 = dst cursor inside the row
        mov x16, x14                // x16 = weight cursor inside the row
        mov x17, x4                 // x17 = columns left in this row
        LoopW:
            ld1 {v0.8h}, [x15]              // current dst value
            ld1 {v2.8h}, [x16], #16         // weight vector, then step to the next one
            fmla v0.8h, v1.8h, v2.8h        // dst += src * weight
            st1 {v0.8h}, [x15], x6          // write back, then step dst by in_kw_step
            subs x17, x17, #1
            bne LoopW
        subs x3, x3, #1             // sets the flags consumed by "bne LoopH" (add leaves flags alone)
        add x13, x13, x5            // next dst row
        add x14, x14, x7            // next weight row
        bne LoopH
    End:
        ret
#endif
|
|
@ -35,6 +35,8 @@ void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *wei
|
|||
size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
|
||||
size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
|
||||
size_t relu, size_t relu6);
|
||||
// ARM64 assembly kernel: for each of height x width positions, dst[0..7] += src[0..7] * weight[0..7].
// in_kh_step and in_kw_step are dst strides in BYTES, and kernel_w is the weight row stride in BYTES;
// the caller scales element counts by sizeof(float16_t) before passing them.
void DeconvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
|
||||
size_t in_kh_step, size_t in_kw_step, size_t kernel_w);
|
||||
void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
|
||||
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
|
||||
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
|
||||
|
|
|
@ -184,7 +184,7 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
|
|||
|
||||
/*deconv depthwise fp16 begin*/
|
||||
void DeconvDepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height,
|
||||
int width, int in_kh_step, int in_kw_step, int kernel_w) {
|
||||
int width, int in_kh_step, int in_kw_step, int kernel_w_step) {
|
||||
float16_t *dst_kh = dst;
|
||||
const float16_t *weight_kh = weight;
|
||||
for (int kh = 0; kh < height; kh++) {
|
||||
|
@ -201,7 +201,7 @@ void DeconvDepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const
|
|||
weight_kw += C8NUM;
|
||||
} // kernel_w loop
|
||||
dst_kh += in_kh_step;
|
||||
weight_kh += kernel_w * C8NUM;
|
||||
weight_kh += kernel_w_step;
|
||||
} // kernel_h loop
|
||||
}
|
||||
|
||||
|
@ -224,9 +224,14 @@ void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float
|
|||
|
||||
const float16_t *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C8NUM;
|
||||
float16_t *dst_kernel = dst_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_;
|
||||
|
||||
#ifdef ENABLE_ARM64
|
||||
DeconvDwFp16Border(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw,
|
||||
sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t),
|
||||
conv_param->kernel_w_ * C8NUM * sizeof(float16_t));
|
||||
#else
|
||||
DeconvDepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw,
|
||||
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_);
|
||||
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM);
|
||||
#endif
|
||||
src_kernel += sliding->block_channel_;
|
||||
} // width loop
|
||||
src_h += sliding->out_h_step_;
|
||||
|
|
|
@ -61,6 +61,10 @@ void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_
|
|||
|
||||
void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
|
||||
size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6);
|
||||
|
||||
// ARM64 assembly kernel: for each of height x width positions, dst[0..3] += src[0..3] * weight[0..3].
// in_kh_step and in_kw_step are dst strides in BYTES, and kernel_w is the weight row stride in BYTES;
// the caller scales element counts by sizeof(float) before passing them.
void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_t height, size_t width,
|
||||
size_t in_kh_step, size_t in_kw_step, size_t kernel_w);
|
||||
|
||||
void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod,
|
||||
size_t plane_size, size_t stride, size_t relu_type);
|
||||
#endif
|
||||
|
|
|
@ -634,7 +634,7 @@ void ConvDw3x3Fp32(float *output_data, const float *input_data, const float *wei
|
|||
|
||||
/*deconv depthwise fp32 begin*/
|
||||
void DeconvDepthwiseBorderPixel(float *dst, const float *src, const float *weight, int height, int width,
|
||||
int in_kh_step, int in_kw_step, int kernel_w) {
|
||||
int in_kh_step, int in_kw_step, int kernel_w_step) {
|
||||
float *dst_kh = dst;
|
||||
const float *weight_kh = weight;
|
||||
for (int kh = 0; kh < height; kh++) {
|
||||
|
@ -656,7 +656,7 @@ void DeconvDepthwiseBorderPixel(float *dst, const float *src, const float *weigh
|
|||
weight_kw += C4NUM;
|
||||
} // kernel_w loop
|
||||
dst_kh += in_kh_step;
|
||||
weight_kh += kernel_w * C4NUM;
|
||||
weight_kh += kernel_w_step;
|
||||
} // kernel_h loop
|
||||
}
|
||||
|
||||
|
@ -678,9 +678,14 @@ void DeconvDepthwiseBorder(float *dst, const float *src, const float *weight, in
|
|||
|
||||
const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C4NUM;
|
||||
float *dst_kernel = dst_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_;
|
||||
|
||||
#ifdef ENABLE_ARM64
|
||||
DeconvDwFp32Border(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw,
|
||||
sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float),
|
||||
conv_param->kernel_w_ * C4NUM * sizeof(float));
|
||||
#else
|
||||
DeconvDepthwiseBorderPixel(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw,
|
||||
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_);
|
||||
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM);
|
||||
#endif
|
||||
src_kernel += sliding->block_channel_;
|
||||
} // width loop
|
||||
src_h += sliding->out_h_step_;
|
||||
|
|
Loading…
Reference in New Issue