From 951a237a1d4a58ea3218783703381e1b10dfc99c Mon Sep 17 00:00:00 2001 From: yangruoqi713 Date: Fri, 14 Aug 2020 11:25:31 +0800 Subject: [PATCH] [MS][LITE] optimize arm cpu fp32 op: conv depthwise --- .../arm/fp16/convolution_depthwise_fp16.cc | 2 +- .../arm/fp16/deconvolution_depthwise_fp16.cc | 2 +- .../nnacl/assembly/arm64/ConvDwFp32Border.S | 56 +++++++++++++++++++ .../nnacl/assembly/arm64/ConvDwFp32Center.S | 2 +- .../kernel/arm/nnacl/fp32/common_func.h | 3 + .../kernel/arm/nnacl/fp32/conv_depthwise.c | 25 ++++----- mindspore/lite/test/models_caffe.cfg | 2 +- 7 files changed, 75 insertions(+), 17 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Border.S diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc index 1e0c8546b7..cf07cb9c84 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc @@ -139,7 +139,7 @@ int ConvolutionDepthwiseFp16CPUKernel::ReSize() { } ConvolutionBaseCPUKernel::Init(); - InitSlidingParam(sliding_, conv_param_, C8NUM); + InitSlidingParamConvDw(sliding_, conv_param_, C8NUM); auto ret = InitBuffer(); if (ret != 0) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc index ee3feb8294..d5570e621e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc @@ -55,7 +55,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() { conv_param_->output_channel_ = in_tensors_.front()->shape().at(kNHWC_C); // init sliding_ window param - InitSlidingParam(sliding_, conv_param_, C8NUM); + InitSlidingParamConvDw(sliding_, conv_param_, C8NUM); return RET_OK; } diff 
--git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Border.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Border.S new file mode 100644 index 0000000000..151d054ad0 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Border.S @@ -0,0 +1,56 @@ +#ifdef __aarch64__ + +.text +.align 5 +.global ConvDwFp32Border +#ifndef __APPLE__ +.type ConvDwFp32Border, %function +#endif + +// void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, +// size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6) + +// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step, +// x8: kernel_w, x9: relu, x10: relu6 +ConvDwFp32Border: + // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to + // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers + // x19 ~ x29 should also be preserved + // whereas our coding style does not permit such an amount of parameters + ldr x8, [sp] + ldr x9, [sp, #8] + ldr x10, [sp, #16] + + ld1 {v0.4s}, [x3] // bias + movi v1.4s, #6 // relu 6 + scvtf v1.4s, v1.4s + dup v2.4s, wzr // relu + + mov x13, x1 + mov x14, x2 + LoopH: + mov x15, x13 + mov x16, x14 + mov x17, x5 + LoopW: + ld1 {v3.4s}, [x15], x7 + ld1 {v4.4s}, [x16], #16 + fmla v0.4s, v3.4s, v4.4s + subs x17, x17, #1 + bne LoopW + subs x4, x4, #1 + add x13, x13, x6 + add x14, x14, x8 + bne LoopH + cbnz x10, Relu6 + cbnz x9, Relu + b Write + Relu6: + fmin v0.4s, v0.4s, v1.4s + Relu: + fmax v0.4s, v0.4s, v2.4s + Write: + st1 {v0.4s}, [x0] + + ret +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Center.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Center.S index 6b51afbe05..9a11ce84db 100644 ---
a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Center.S +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Center.S @@ -10,7 +10,7 @@ // void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, // size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, // size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); -// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w, +// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w, // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step // x14: relu, x15: relu6 ConvDwFp32Center: diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h index 110ffbdbe3..90b0d76215 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h @@ -58,6 +58,9 @@ void C4BiasAddRelu(float *dst, const float *input, const float *bias, size_t oc, void C4BiasAddRelu6(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride); void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); + +void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, + size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6); #endif #ifdef __cplusplus diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv_depthwise.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv_depthwise.c index f689c28b6f..a6806f1ba8 100644 --- 
a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv_depthwise.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv_depthwise.c @@ -86,8 +86,9 @@ void AppendSlidingParamConvDw(SlidingWindowParam *sliding, const ConvParameter * } /*conv depthwise fp32 begin*/ +#ifndef ENABLE_ARM64 void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, const float *bias, int height, int width, - int in_kh_step, int in_kw_step, int kernel_w, bool is_relu, bool is_relu6) { + int in_kh_step, int in_kw_step, int kernel_w_step, bool is_relu, bool is_relu6) { const float *src_kh = src; const float *weight_kh = weight; for (int c = 0; c < C4NUM; c++) { @@ -97,22 +98,14 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con const float *src_kw = src_kh; const float *weight_kw = weight_kh; for (int kw = 0; kw < width; kw++) { -#ifdef ENABLE_ARM64 - float32x4_t src_4 = vld1q_f32(src_kw); - float32x4_t weight_4 = vld1q_f32(weight_kw); - float32x4_t dst_4 = vld1q_f32(dst); - dst_4 = vfmaq_f32(dst_4, src_4, weight_4); - vst1q_f32(dst, dst_4); -#else for (int c = 0; c < C4NUM; c++) { dst[c] += src_kw[c] * weight_kw[c]; } -#endif src_kw += in_kw_step; weight_kw += C4NUM; } // kernel_w loop src_kh += in_kh_step; - weight_kh += kernel_w * C4NUM; + weight_kh += kernel_w_step; } // kernel_h loop for (int c = 0; c < C4NUM; c++) { dst[c] += bias[c]; @@ -120,6 +113,7 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con dst[c] = (is_relu6) ? 
(MSMIN(6, MSMAX(0, dst[c]))) : (dst[c]); } } +#endif void DepthwiseBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom, int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) { @@ -140,10 +134,15 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl const float *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_; const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C4NUM; +#ifdef ENABLE_ARM64 + ConvDwFp32Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, + sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float), + conv_param->kernel_w_ * C4NUM * sizeof(float), conv_param->is_relu_, conv_param->is_relu6_); +#else DepthwiseBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, - sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_, conv_param->is_relu_, - conv_param->is_relu6_); - + sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM, + conv_param->is_relu_, conv_param->is_relu6_); +#endif dst_kernel += sliding->block_channel_; } // width loop dst_h += sliding->out_h_step_; diff --git a/mindspore/lite/test/models_caffe.cfg b/mindspore/lite/test/models_caffe.cfg index dddb7b378a..78c9b71082 100644 --- a/mindspore/lite/test/models_caffe.cfg +++ b/mindspore/lite/test/models_caffe.cfg @@ -5,7 +5,7 @@ gender_res_large_deploy glasses hat isface -#ml_bank_detect_0312 +ml_bank_detect_0312 ml_face_div_parsing ml_hardware_eyeclose ml_ocr_detect_20200305