!4441 [MS][LITE] optimize arm cpu fp32 op: add assembly function for conv depthwise border calculating

Merge pull request !4441 from yangruoqi713/lite
2020-08-14 18:38:10 +08:00 · 2020-08-14 18:38:10 +08:00 · df46f27053
parent 32208e367e 951a237a1d
commit df46f27053
7 changed files with 75 additions and 17 deletions
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
@ -139,7 +139,7 @@ int ConvolutionDepthwiseFp16CPUKernel::ReSize() {
  }

  ConvolutionBaseCPUKernel::Init();
-  InitSlidingParam(sliding_, conv_param_, C8NUM);
+  InitSlidingParamConvDw(sliding_, conv_param_, C8NUM);

  auto ret = InitBuffer();
  if (ret != 0) {
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
@ -55,7 +55,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() {
  conv_param_->output_channel_ = in_tensors_.front()->shape().at(kNHWC_C);

  // init sliding_ window param
-  InitSlidingParam(sliding_, conv_param_, C8NUM);
+  InitSlidingParamConvDw(sliding_, conv_param_, C8NUM);
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Border.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Border.S
@ -0,0 +1,56 @@
+#ifdef __aarch64__
+
+.text
+.align 5
+.global ConvDwFp32Border
+#ifndef __APPLE__
+.type ConvDwFp32Border, %function
+#endif
+
+// void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
+//                       size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6)
+
+// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step,
+// x8: kernel_w, x9: relu, x10: relu6
+ConvDwFp32Border:
+    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
+    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
+    // x19 ~ x29 should be also preserved
+    // whereas our coding style do not permit such amount of parameters
+    ldr x8, [sp]
+    ldr x9, [sp, #8]
+    ldr x10, [sp, #16]
+
+    ld1 {v0.4s}, [x3] // bias
+    movi v1.4s, #6    // relu 6
+    scvtf v1.4s, v1.4s
+    dup v2.4s, wzr    // relu
+
+    mov x13, x1
+    mov x14, x2
+    LoopH:
+        mov x15, x13
+        mov x16, x14
+        mov x17, x5
+        LoopW:
+            ld1 {v3.4s}, [x15], x7
+            ld1 {v4.4s}, [x16], #16
+            fmla v0.4s, v3.4s, v4.4s
+            subs x17, x17, #1
+            bne LoopW
+        subs x4, x4, #1
+        add x13, x13, x6
+        add x14, x14, x8
+        bne LoopH
+    cbnz x10, Relu6
+    cbnz x9, Relu
+    b Write
+    Relu6:
+        fmin v0.4s, v0.4s, v1.4s
+    Relu:
+        fmax v0.4s, v0.4s, v2.4s
+    Write:
+        st1 {v0.4s}, [x0]
+
+    ret
+#endif
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Center.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Center.S
@ -10,7 +10,7 @@
 // void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
 //                      size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
 //                      size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
-// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w, 
+// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
 // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
 // x14: relu, x15: relu6
 ConvDwFp32Center:
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h
@ -58,6 +58,9 @@ void C4BiasAddRelu(float *dst, const float *input, const float *bias, size_t oc,
 void C4BiasAddRelu6(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride);
 void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
 void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
+
+void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
+                      size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6);
 #endif

 #ifdef __cplusplus
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv_depthwise.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv_depthwise.c
@ -86,8 +86,9 @@ void AppendSlidingParamConvDw(SlidingWindowParam *sliding, const ConvParameter *
 }

 /*conv depthwise fp32 begin*/
+#ifndef ENABLE_ARM64
 void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, const float *bias, int height, int width,
-                          int in_kh_step, int in_kw_step, int kernel_w, bool is_relu, bool is_relu6) {
+                          int in_kh_step, int in_kw_step, int kernel_w_step, bool is_relu, bool is_relu6) {
  const float *src_kh = src;
  const float *weight_kh = weight;
  for (int c = 0; c < C4NUM; c++) {
@ -97,22 +98,14 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con
    const float *src_kw = src_kh;
    const float *weight_kw = weight_kh;
    for (int kw = 0; kw < width; kw++) {
-#ifdef ENABLE_ARM64
-      float32x4_t src_4 = vld1q_f32(src_kw);
-      float32x4_t weight_4 = vld1q_f32(weight_kw);
-      float32x4_t dst_4 = vld1q_f32(dst);
-      dst_4 = vfmaq_f32(dst_4, src_4, weight_4);
-      vst1q_f32(dst, dst_4);
-#else
      for (int c = 0; c < C4NUM; c++) {
        dst[c] += src_kw[c] * weight_kw[c];
      }
-#endif
      src_kw += in_kw_step;
      weight_kw += C4NUM;
    }  // kernel_w loop
    src_kh += in_kh_step;
-    weight_kh += kernel_w * C4NUM;
+    weight_kh += kernel_w_step;
  }  // kernel_h loop
  for (int c = 0; c < C4NUM; c++) {
    dst[c] += bias[c];
@ -120,6 +113,7 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con
    dst[c] = (is_relu6) ? (MSMIN(6, MSMAX(0, dst[c]))) : (dst[c]);
  }
 }
+#endif

 void DepthwiseBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom,
                     int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
@ -140,10 +134,15 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl
      const float *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_;
      const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C4NUM;

+#ifdef ENABLE_ARM64
+      ConvDwFp32Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
+                       sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float),
+                       conv_param->kernel_w_ * C4NUM * sizeof(float), conv_param->is_relu_, conv_param->is_relu6_);
+#else
      DepthwiseBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
-                           sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_, conv_param->is_relu_,
-                           conv_param->is_relu6_);
-
+                           sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM,
+                           conv_param->is_relu_, conv_param->is_relu6_);
+#endif
      dst_kernel += sliding->block_channel_;
    }  // width loop
    dst_h += sliding->out_h_step_;
--- a/mindspore/lite/test/models_caffe.cfg
+++ b/mindspore/lite/test/models_caffe.cfg
@ -5,7 +5,7 @@ gender_res_large_deploy
 glasses
 hat
 isface
-#ml_bank_detect_0312
+ml_bank_detect_0312
 ml_face_div_parsing
 ml_hardware_eyeclose
 ml_ocr_detect_20200305