!4547 [MS][LITE]deconvolution fp16 post function

Merge pull request !4547 from ling/deconv
2020-08-17 15:29:50 +08:00 · 2020-08-17 15:29:50 +08:00 · 31ff088789
parent c9062e1428 fb6b50e8c2
commit 31ff088789
6 changed files with 70 additions and 70 deletions
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
@ -124,7 +124,7 @@ int DeConvFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
 }

 int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
-  int oc = MSMIN(thread_stride_ * C8NUM, conv_param_->output_channel_ - task_id * thread_stride_ * C8NUM);
+  int oc = MSMIN(thread_stride_, UP_DIV(conv_param_->output_channel_, C8NUM) - task_id * thread_stride_);
  int oc_res = MSMIN(thread_stride_ * C8NUM, conv_param_->output_channel_ - task_id * thread_stride_ * C8NUM);
  if (oc <= 0) {
    return RET_OK;
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/PostFuncBiasReluC8.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/PostFuncBiasReluC8.S
@ -204,6 +204,9 @@ Loop_C1:
  beq End
  mov w13, w5
  ld1 {v16.4s, v17.4s}, [x2], #32
+  mov x25,  #4
+  mul x24, x10, x25
+  add x0, x0, x24

  cmp x4, #1
  beq Loop_C1_1
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/MatmulFp16.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/MatmulFp16.S
@ -863,7 +863,10 @@ End2:
  subs w7, w7, #8 // rhs col - 8
  add x1, x1, x15 // rhs ptr + stride
  add x3, x3, #16 // bias ptr + stride
+  ldrb w13, [sp, #8]
+  cbz w13, NoDstStep
  add x2, x2, #16 // dst ptr + stride
+NoDstStep:
  bgt L1

 End1:
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/PostFuncBiasReluC8Fp16.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/PostFuncBiasReluC8Fp16.S
@ -16,21 +16,20 @@

 // v0 ~ v7 value
 // v16  bias data
-// x24  x25  weite loop tmp buf
+// x22 x23  x24  x25  write loop tmp buf
 // v26  relu6  #6;    v27 relu #0
 // w10  oc8 loop control
 // w13  hw  loop control

 PostFuncBiasReluC8Fp16:
-  movi v26.8h, #6
-  scvtf v26.8h, v26.8h
+  movi v26.8h, #0x46, lsl #8
  dup v27.8h, wzr
  mov w10, #0

 Loop_C8:
  cmp w10, w3
  beq Loop_C1
-  mov x25,  #4
+  mov x25, #2
  mul x24, x10, x25
  add x25, x0, x24
  add w10, w10, #8
@ -118,6 +117,7 @@ Write_4x8:
  st1 {v1.8h}, [x25], x6
  st1 {v2.8h}, [x25], x6
  st1 {v3.8h}, [x25], x6
+  b Loop_4x8

 Loop_1x8:
  cmp w7, #2
@ -159,6 +159,9 @@ Loop_C1:
  beq End
  mov w13, w5
  ld1 {v16.8h}, [x2], #16
+  mov x25,  #2
+  mul x24, x10, x25
+  add x22, x0, x24

  cmp x4, #1
  beq Loop_C1_1
@ -189,7 +192,7 @@ Loop_C1_1_Relu6:
  fadd v0.8h, v0.8h, v16.8h
  fmin v0.8h, v0.8h, v26.8h
  fmax v0.8h, v0.8h, v27.8h
-  st1 {v1.h}[0], [x0], x6
+  st1 {v0.h}[0], [x22], x6
  b Loop_C1_1_Relu6
 Loop_C1_1_Relu:
  cmp w13, #0
@ -198,7 +201,7 @@ Loop_C1_1_Relu:
  ld1 {v0.8h}, [x1], #16
  fadd v0.8h, v0.8h, v16.8h
  fmax v0.8h, v0.8h, v27.8h
-  st1 {v1.h}[0], [x0], x6
+  st1 {v0.h}[0], [x22], x6
  b Loop_C1_1_Relu
 Loop_C1_1_Write:
  cmp w13, #0
@ -206,7 +209,7 @@ Loop_C1_1_Write:
  sub w13, w13, #1
  ld1 {v0.8h}, [x1], #16
  fadd v0.8h, v0.8h, v16.8h
-  st1 {v1.h}[0], [x0], x6
+  st1 {v0.h}[0], [x22], x6
  b Loop_C1_1_Write

 Loop_C1_2:
@ -224,8 +227,8 @@ Loop_C1_2_Relu6:
  fadd v0.8h, v0.8h, v16.8h
  fmin v0.8h, v0.8h, v26.8h
  fmax v0.8h, v0.8h, v27.8h
-  st1 {v1.h}[0], [x0], x6
-  st1 {v1.h}[1], [x24], x6
+  st1 {v0.h}[0], [x22], x6
+  st1 {v0.h}[1], [x24], x6
  b Loop_C1_2_Relu6
 Loop_C1_2_Relu:
  cmp w13, #0
@ -234,8 +237,8 @@ Loop_C1_2_Relu:
  ld1 {v0.8h}, [x1], #16
  fadd v0.8h, v0.8h, v16.8h
  fmax v0.8h, v0.8h, v27.8h
-  st1 {v1.h}[0], [x0], x6
-  st1 {v1.h}[1], [x24], x6
+  st1 {v0.h}[0], [x22], x6
+  st1 {v0.h}[1], [x24], x6
  b Loop_C1_2_Relu
 Loop_C1_2_Write:
  cmp w13, #0
@ -243,14 +246,14 @@ Loop_C1_2_Write:
  sub w13, w13, #1
  ld1 {v0.8h}, [x1], #16
  fadd v0.8h, v0.8h, v16.8h
-  st1 {v1.h}[0], [x0], x6
-  st1 {v1.h}[1], [x24], x6
+  st1 {v0.h}[0], [x22], x6
+  st1 {v0.h}[1], [x24], x6
  b Loop_C1_2_Write


 Loop_C1_3:
-  add x24, x0, #2
-  add x25, x0, #4
+  add x24, x22, #2
+  add x25, x22, #4
  cmp w7, #2
  beq Loop_C1_3_Relu6
  cmp w7, #1
@ -264,9 +267,9 @@ Loop_C1_3_Relu6:
  fadd v0.8h, v0.8h, v16.8h
  fmin v0.8h, v0.8h, v26.8h
  fmax v0.8h, v0.8h, v27.8h
-  st1 {v1.h}[0], [x0], x6
-  st1 {v1.h}[1], [x24], x6
-  st1 {v1.h}[2], [x25], x6
+  st1 {v0.h}[0], [x22], x6
+  st1 {v0.h}[1], [x24], x6
+  st1 {v0.h}[2], [x25], x6
  b Loop_C1_3_Relu6
 Loop_C1_3_Relu:
  cmp w13, #0
@ -275,9 +278,9 @@ Loop_C1_3_Relu:
  ld1 {v0.8h}, [x1], #16
  fadd v0.8h, v0.8h, v16.8h
  fmax v0.8h, v0.8h, v27.8h
-  st1 {v1.h}[0], [x0], x6
-  st1 {v1.h}[1], [x24], x6
-  st1 {v1.h}[2], [x25], x6
+  st1 {v0.h}[0], [x22], x6
+  st1 {v0.h}[1], [x24], x6
+  st1 {v0.h}[2], [x25], x6
  b Loop_C1_3_Relu
 Loop_C1_3_Write:
  cmp w13, #0
@ -285,9 +288,9 @@ Loop_C1_3_Write:
  sub w13, w13, #1
  ld1 {v0.8h}, [x1], #16
  fadd v0.8h, v0.8h, v16.8h
-  st1 {v1.h}[0], [x0], x6
-  st1 {v1.h}[1], [x24], x6
-  st1 {v1.h}[2], [x25], x6
+  st1 {v0.h}[0], [x22], x6
+  st1 {v0.h}[1], [x24], x6
+  st1 {v0.h}[2], [x25], x6
  b Loop_C1_3_Write

 Loop_C1_4:
@ -304,7 +307,7 @@ Loop_C1_4_Relu6:
  fadd v0.8h, v0.8h, v16.8h
  fmin v0.8h, v0.8h, v26.8h
  fmax v0.8h, v0.8h, v27.8h
-  st1 {v0.4h}, [x0], x6
+  st1 {v0.4h}, [x22], x6
  b Loop_C1_4_Relu6
 Loop_C1_4_Relu:
  cmp w13, #0
@ -313,7 +316,7 @@ Loop_C1_4_Relu:
  ld1 {v0.8h}, [x1], #16
  fadd v0.8h, v0.8h, v16.8h
  fmax v0.8h, v0.8h, v27.8h
-  st1 {v0.4h}, [x0], x6
+  st1 {v0.4h}, [x22], x6
-  b Loop_C1_4_Relu6
+  b Loop_C1_4_Relu // stay in the Relu loop; Loop_C1_4_Relu6 would also fmin-clamp later pixels against v26 (6.0)
 Loop_C1_4_Write:
  cmp w13, #0
@ -321,11 +324,11 @@ Loop_C1_4_Write:
  sub w13, w13, #1
  ld1 {v0.8h}, [x1], #16
  fadd v0.8h, v0.8h, v16.8h
-  st1 {v0.4h}, [x0], x6
+  st1 {v0.4h}, [x22], x6
  b Loop_C1_4_Write

 Loop_C1_5:
-  add x25, x0, #16
+  add x25, x22, #8
  cmp w7, #2
  beq Loop_C1_5_Relu6
  cmp w7, #1
@ -339,9 +342,8 @@ Loop_C1_5_Relu6:
  fadd v0.8h, v0.8h, v16.8h
  fmin v0.8h, v0.8h, v26.8h
  fmax v0.8h, v0.8h, v27.8h
-  st1 {v0.4h}, [x0], x6
-  str h1, [x25]
-  add x25, x25, x6
+  st1 {v0.4h}, [x22], x6
+  st1 {v0.h}[4], [x25], x6
  b Loop_C1_5_Relu6
 Loop_C1_5_Relu:
  cmp w13, #0
@ -350,9 +352,8 @@ Loop_C1_5_Relu:
  ld1 {v0.8h}, [x1], #16
  fadd v0.8h, v0.8h, v16.8h
  fmax v0.8h, v0.8h, v27.8h
-  st1 {v0.4h}, [x0], x6
-  str h1, [x25]
-  add x25, x25, x6
+  st1 {v0.4h}, [x22], x6
+  st1 {v0.h}[4], [x25], x6
  b Loop_C1_5_Relu
 Loop_C1_5_Write:
  cmp w13, #0
@ -360,14 +361,13 @@ Loop_C1_5_Write:
  sub w13, w13, #1
  ld1 {v0.8h}, [x1], #16
  fadd v0.8h, v0.8h, v16.8h
-  st1 {v0.4h}, [x0], x6
-  str h1, [x25]
-  add x25, x25, x6
+  st1 {v0.4h}, [x22], x6
+  st1 {v0.h}[4], [x25], x6
  b Loop_C1_5_Write

 Loop_C1_6:
-  add x23, x0, #8
-  add x24, x0, #10
+  add x23, x22, #8
+  add x24, x22, #10
  cmp w7, #2
  beq Loop_C1_6_Relu6
  cmp w7, #1
@ -381,9 +381,9 @@ Loop_C1_6_Relu6:
  fadd v0.8h, v0.8h, v16.8h
  fmin v0.8h, v0.8h, v26.8h
  fmax v0.8h, v0.8h, v27.8h
-  st1 {v0.4h}, [x0], x6
-  st1 {v1.h}[4], [x23], x6
-  st1 {v1.h}[5], [x24], x6
+  st1 {v0.4h}, [x22], x6
+  st1 {v0.h}[4], [x23], x6
+  st1 {v0.h}[5], [x24], x6
  b Loop_C1_6_Relu6
 Loop_C1_6_Relu:
  cmp w13, #0
@ -392,9 +392,9 @@ Loop_C1_6_Relu:
  ld1 {v0.8h}, [x1], #16
  fadd v0.8h, v0.8h, v16.8h
  fmax v0.8h, v0.8h, v27.8h
-  st1 {v0.4h}, [x0], x6
-  st1 {v1.h}[4], [x23], x6
-  st1 {v1.h}[5], [x24], x6
+  st1 {v0.4h}, [x22], x6
+  st1 {v0.h}[4], [x23], x6
+  st1 {v0.h}[5], [x24], x6
  b Loop_C1_6_Relu
 Loop_C1_6_Write:
  cmp w13, #0
@ -402,15 +402,15 @@ Loop_C1_6_Write:
  sub w13, w13, #1
  ld1 {v0.8h}, [x1], #16
  fadd v0.8h, v0.8h, v16.8h
-  st1 {v0.4h}, [x0], x6
-  st1 {v1.h}[4], [x23], x6
-  st1 {v1.h}[5], [x24], x6
+  st1 {v0.4h}, [x22], x6
+  st1 {v0.h}[4], [x23], x6
+  st1 {v0.h}[5], [x24], x6
  b Loop_C1_6_Write

 Loop_C1_7:
-  add x23, x0, #8
-  add x24, x0, #10
-  add x25, x0, #12
+  add x23, x22, #8
+  add x24, x22, #10
+  add x25, x22, #12
  cmp w7, #2
  beq Loop_C1_7_Relu6
  cmp w7, #1
@ -424,10 +424,10 @@ Loop_C1_7_Relu6:
  fadd v0.8h, v0.8h, v16.8h
  fmin v0.8h, v0.8h, v26.8h
  fmax v0.8h, v0.8h, v27.8h
-  st1 {v0.4h}, [x0], x6
-  st1 {v1.h}[4], [x23], x6
-  st1 {v1.h}[5], [x24], x6
-  st1 {v1.h}[6], [x25], x6
+  st1 {v0.4h}, [x22], x6
+  st1 {v0.h}[4], [x23], x6
+  st1 {v0.h}[5], [x24], x6
+  st1 {v0.h}[6], [x25], x6
  b Loop_C1_7_Relu6
 Loop_C1_7_Relu:
  cmp w13, #0
@ -436,10 +436,10 @@ Loop_C1_7_Relu:
  ld1 {v0.8h}, [x1], #16
  fadd v0.8h, v0.8h, v16.8h
  fmax v0.8h, v0.8h, v27.8h
-  st1 {v0.4h}, [x0], x6
-  st1 {v1.h}[4], [x23], x6
-  st1 {v1.h}[5], [x24], x6
-  st1 {v1.h}[6], [x25], x6
+  st1 {v0.4h}, [x22], x6
+  st1 {v0.h}[4], [x23], x6
+  st1 {v0.h}[5], [x24], x6
+  st1 {v0.h}[6], [x25], x6
  b Loop_C1_7_Relu
 Loop_C1_7_Write:
  cmp w13, #0
@ -447,11 +447,10 @@ Loop_C1_7_Write:
  sub w13, w13, #1
  ld1 {v0.8h}, [x1], #16
  fadd v0.8h, v0.8h, v16.8h
-  fmax v0.8h, v0.8h, v27.8h
-  st1 {v0.4h}, [x0], x6
-  st1 {v1.h}[4], [x23], x6
-  st1 {v1.h}[5], [x24], x6
-  st1 {v1.h}[6], [x25], x6
+  st1 {v0.4h}, [x22], x6
+  st1 {v0.h}[4], [x23], x6
+  st1 {v0.h}[5], [x24], x6
+  st1 {v0.h}[6], [x25], x6
  b Loop_C1_7_Write

 End:
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/deconv_fp16.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/deconv_fp16.c
@ -37,16 +37,12 @@ void PostConvFuncCommFp16(float16_t *out_ptr, const float16_t *src_ptr_, const f

 void PostConvFuncFp16C8(const float16_t *c8_out_ptr, float16_t *out_ptr, const float16_t *bias_ptr,
                        size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6) {
-#ifdef DEBUG_CODE
-  PostConvFuncCommFp16(out_ptr, c8_out_ptr, bias_ptr, output_channel, plane_size, stride, is_relu, is_relu6, C8NUM);
-#else
  size_t oc8mod = output_channel % C8NUM;
  size_t oc8div = output_channel - oc8mod;
  size_t stride_size = stride * sizeof(float16_t);
  size_t relu_type = is_relu ? 1 : 0;
  relu_type = is_relu6 ? 2 : relu_type;
  PostFuncBiasReluC8Fp16(out_ptr, c8_out_ptr, bias_ptr, oc8div, oc8mod, plane_size, stride_size, relu_type);
-#endif
  return;
 }

--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/matmul_fp16.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/matmul_fp16.c
@ -74,7 +74,6 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
 void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
                int depth, int row, int col, int stride, bool write_nhwc) {
  MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, write_nhwc);
-  //  MatMul16x8(a, b, c, bias, (int)act_type, depth, row, col, stride, write_nhwc);
  return;
 }