!16305 [MS][LITE][CPU] arm32 fp16 support npu and weight quant

From: @lzkcode Reviewed-by: @zhanghaibo5,@zhang_xue_tong Signed-off-by: @zhang_xue_tong
2021-05-17 09:23:43 +08:00 · 2021-05-17 09:23:43 +08:00 · 5e7e39f925
parent 6c374dd21a db7bd742a7
commit 5e7e39f925
7 changed files with 56 additions and 55 deletions
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/activation_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/activation_fp16.c
@ -56,7 +56,7 @@ int Relu6Fp16(const float16_t *data, float16_t *dst, int ele_num) {
 int LReluFp16(const float16_t *src, float16_t *dst, int ele_num, float16_t alpha) {
  int i = 0;
 #ifdef ENABLE_NEON
-  int ele_c8 = UP_ROUND(ele_num, C8NUM);
+  int ele_c8 = DOWN_ROUND(ele_num, C8NUM);
  for (; i < ele_c8; i += C8NUM) {
    float16x8_t src_tmp = vld1q_f16(src + i);
    float16x8_t mul_tmp = vmulq_n_f16(src_tmp, alpha);
@ -187,7 +187,7 @@ int GeluFp16(const float16_t *src, int length, float16_t *dst, bool approximate)
  if (approximate) {
    // dst = 0.5 * x * (1 + tanh((2 / pi) ^ 0.5 * (x + 0.044715x^3)))
 #ifdef ENABLE_NEON
-    int C8 = UP_ROUND(length, C8NUM);
+    int C8 = DOWN_ROUND(length, C8NUM);
    for (; i < C8; i += C8NUM) {
      float16x8_t in = vld1q_f16(src + i);
      float16x8_t res =
@ -202,7 +202,7 @@ int GeluFp16(const float16_t *src, int length, float16_t *dst, bool approximate)
    }
  } else {
 #ifdef ENABLE_NEON
-    int C8 = UP_ROUND(length, C8NUM);
+    int C8 = DOWN_ROUND(length, C8NUM);
    for (; i < C8; i += C8NUM) {
      float16x8_t in = vld1q_f16(src + i);
      const float16x8_t res = 0.5f * in * (1.0f + MS_ERFX8_F16(in / (float16_t)1.4142135623730951f));
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/power_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/power_fp16.c
@ -70,7 +70,7 @@ void PowerBroadCastFp16(const float16_t *input, const float16_t *exponent, float
  }
  int i = 0;
 #ifdef ENABLE_NEON
-  int len_c8 = UP_ROUND(len, C8NUM);
+  int len_c8 = DOWN_ROUND(len, C8NUM);
  float16x8_t scale_8 = vmovq_n_f16(scale);
  float16x8_t shift_8 = vmovq_n_f16(shift);
  for (; i < len_c8; i += C8NUM) {
@ -88,7 +88,7 @@ void PowerSingleFp16(const float16_t *input, const float16_t *exponent, float16_
  int i = 0;
  PowerScalarFunFp16 PowerScalarFunFp16_ = NULL;
 #ifdef ENABLE_NEON
-  int len_c8 = UP_ROUND(len, C8NUM);
+  int len_c8 = DOWN_ROUND(len, C8NUM);
  float16x8_t scale_8 = vmovq_n_f16(scale);
  float16x8_t shift_8 = vmovq_n_f16(shift);
  for (; i < len_c8; i += C8NUM) {
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c
@ -228,7 +228,7 @@ int Gelu(const float *src, int length, float *dst, bool approximate) {
  if (approximate) {
    // dst = 0.5 * x * (1 + tanh((2 / pi) ^ 0.5 * (x + 0.044715x^3)))
 #if defined(ENABLE_AVX)
-    int C8 = UP_ROUND(length, C8NUM);
+    int C8 = DOWN_ROUND(length, C8NUM);
    for (; i < C8; i += C8NUM) {
      MS_FLOAT32X8 in = MS_LD256_F32(src + i);
      MS_FLOAT32X8 res = 0.5 * in * (1.0 + MS_TANHX8_F32((0.79788456080287f + 0.035677408136f * in * in) * in));
@ -236,7 +236,7 @@ int Gelu(const float *src, int length, float *dst, bool approximate) {
    }
 #endif
 #if defined(ENABLE_SSE) || defined(ENABLE_ARM)
-    int C4 = UP_ROUND(length, C4NUM);
+    int C4 = DOWN_ROUND(length, C4NUM);
    for (; i < C4; i += C4NUM) {
      MS_FLOAT32X4 in = MS_LDQ_F32(src + i);
      MS_FLOAT32X4 res = 0.5 * in * (1.0 + MS_TANHX4_F32((0.79788456080287f + 0.035677408136f * in * in) * in));
@ -248,7 +248,7 @@ int Gelu(const float *src, int length, float *dst, bool approximate) {
    }
  } else {
 #if defined(ENABLE_AVX) || defined(ENABLE_SSE) || defined(ENABLE_ARM)
-    int C4 = UP_ROUND(length, C4NUM);
+    int C4 = DOWN_ROUND(length, C4NUM);
    for (; i < C4; i += C4NUM) {
      MS_FLOAT32X4 in = MS_LDQ_F32(src + i);
      MS_FLOAT32X4 res = 0.5 * in * (1.0 + MS_ERFX4_F32(in / 1.4142135623730951f));
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c
@ -158,9 +158,9 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
            if (ow_block < ow_block_num[oc_block - 1]) {  // ow is not enough and process one ow
              ow_block = 1;
            }
-            kernel[oc_block - 1][ow_block > 1](dst_w + ow * sw_param->block_channel_, src_w, weight, bias, kernel_h,
-                                               kernel_w, act_type, ow_block, oc_block, oc_algin, ic_algin, in_kw_step,
-                                               in_kh_step, in_sw_step, 0);
+            kernel[oc_block - 1][ow_block / ow_block_num[oc_block - 1]](
+              dst_w + ow * sw_param->block_channel_, src_w, weight, bias, kernel_h, kernel_w, act_type, ow_block,
+              oc_block, oc_algin, ic_algin, in_kw_step, in_kh_step, in_sw_step, 0);
            src_w += ow_block * in_sw_step;
          }
          // ow in left
@ -177,11 +177,11 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
 void SWConv3x32Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                      size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                      size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_sw_step *= 4;
-  in_kw_step *= 4;
-  oc_algin *= 4;
-  kw_remainder *= 4;
+  in_kh_step *= sizeof(float);
+  in_sw_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  oc_algin *= sizeof(float);
+  kw_remainder *= sizeof(float);
  asm volatile(
    "cmpq $0, %2\n"
    "je 0f\n"
@ -315,10 +315,10 @@ void SWConv3x32Kernel(float *dst, const float *src, const float *weight, const f
 void SWConv1x32Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                      size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                      size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_kw_step *= 4;
-  oc_algin *= 4;
-  kw_remainder *= 4;
+  in_kh_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  oc_algin *= sizeof(float);
+  kw_remainder *= sizeof(float);
  asm volatile(
    "cmpq $0, %2\n"
    "je 0f\n"
@ -397,13 +397,13 @@ void SWConv1x32Kernel(float *dst, const float *src, const float *weight, const f
 void SWConv4x24Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                      size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                      size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_kw_step *= 4;
-  in_sw_step *= 4;
-  kw_remainder *= 4;
+  in_kh_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  in_sw_step *= sizeof(float);
+  kw_remainder *= sizeof(float);
  size_t src_3_step = 3 * in_sw_step;
  float *dst_3 = dst + 3 * oc_algin;
-  oc_algin *= 4;
+  oc_algin *= sizeof(float);
  asm volatile(
    "cmpq $0, %2\n"
    "je 0f\n"
@ -545,10 +545,10 @@ void SWConv4x24Kernel(float *dst, const float *src, const float *weight, const f
 void SWConv1x24Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                      size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                      size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_kw_step *= 4;
-  kw_remainder *= 4;
-  oc_algin *= 4;
+  in_kh_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  kw_remainder *= sizeof(float);
+  oc_algin *= sizeof(float);
  asm volatile(
    "cmpq $0, %2\n"
    "je 0f\n"
@ -621,13 +621,13 @@ void SWConv1x24Kernel(float *dst, const float *src, const float *weight, const f
 void SWConv6x16Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                      size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                      size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_kw_step *= 4;
-  in_sw_step *= 4;
-  kw_remainder *= 4;
+  in_kh_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  in_sw_step *= sizeof(float);
+  kw_remainder *= sizeof(float);
  size_t src_3_step = 3 * in_sw_step;
  float *dst_3 = dst + 3 * oc_algin;
-  oc_algin *= 4;
+  oc_algin *= sizeof(float);
  asm volatile(
    "cmpq $0, %2\n"
    "je 0f\n"
@ -772,10 +772,10 @@ void SWConv6x16Kernel(float *dst, const float *src, const float *weight, const f
 void SWConv1x16Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                      size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                      size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_kw_step *= 4;
-  kw_remainder *= 4;
-  oc_algin *= 4;
+  in_kh_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  kw_remainder *= sizeof(float);
+  oc_algin *= sizeof(float);
  asm volatile(
    "cmpq $0, %2\n"
    "je 0f\n"
@ -842,15 +842,15 @@ void SWConv1x16Kernel(float *dst, const float *src, const float *weight, const f
 void SWConv12x8Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                      size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                      size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_sw_step *= 4;
-  in_kw_step *= 4;
-  kw_remainder *= 4;
+  in_kh_step *= sizeof(float);
+  in_sw_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  kw_remainder *= sizeof(float);
  size_t src_3_step = 3 * in_sw_step;
  float *dst_3 = dst + 3 * oc_algin;
  float *dst_5 = dst + 5 * oc_algin;
  float *dst_9 = dst + 9 * oc_algin;
-  oc_algin *= 4;
+  oc_algin *= sizeof(float);
  asm volatile(
    "cmpq $0, %0\n"
    "je 0f\n"
@ -1001,12 +1001,12 @@ void SWConv12x8Kernel(float *dst, const float *src, const float *weight, const f
 void SWConv4x8Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                     size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                     size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_sw_step *= 4;
-  in_kw_step *= 4;
+  in_kh_step *= sizeof(float);
+  in_sw_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
  size_t src_step = 3 * in_sw_step;
  float *dst_3 = dst + 3 * oc_algin;
-  oc_algin *= 4;
+  oc_algin *= sizeof(float);
  asm volatile(
    "cmpq $0, %2\n"
    "je 0f\n"
@ -1091,10 +1091,10 @@ void SWConv4x8Kernel(float *dst, const float *src, const float *weight, const fl
 void SWConv1x8Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                     size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                     size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_kw_step *= 4;
-  kw_remainder *= 4;
-  oc_algin *= 4;
+  in_kh_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  kw_remainder *= sizeof(float);
+  oc_algin *= sizeof(float);
  asm volatile(
    "cmpq $0, %2\n"
    "je 0f\n"
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.c
@ -67,7 +67,7 @@ void PowerBroadCast(const float *input, const float *exponent, float *output, in
  }
  int i = 0;
 #if defined(ENABLE_AVX) || defined(ENABLE_SSE) || defined(ENABLE_ARM)
-  int len_c4 = UP_ROUND(len, C4NUM);
+  int len_c4 = DOWN_ROUND(len, C4NUM);
  MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale);
  MS_FLOAT32X4 shift_4 = MS_MOVQ_F32(shift);
  for (; i < len_c4; i += C4NUM) {
@ -84,7 +84,7 @@ void PowerSingle(const float *input, const float *exponent, float *output, int l
  int i = 0;
  PowerScalarFun PowerScalarFun_ = NULL;
 #if defined(ENABLE_AVX) || defined(ENABLE_SSE) || defined(ENABLE_ARM)
-  int len_c4 = UP_ROUND(len, C4NUM);
+  int len_c4 = DOWN_ROUND(len, C4NUM);
  MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale);
  MS_FLOAT32X4 shift_4 = MS_MOVQ_F32(shift);
  for (; i < len_c4; i += C4NUM) {
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h
@ -39,7 +39,8 @@
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
 #define UP_ROUND(x, y) (((x) + (y) - (1)) / (y) * (y))
 #define UP_ROUND_DIV(x, y) (x % y == 0 ? (x / y) : (x / y) + 1)
-#define DOWN_DIV(x, y) (((x) - (y) + (1)) / (y))
+#define DOWN_DIV(x, y) ((x) / (y))
+#define DOWN_ROUND(x, y) ((x) / (y) * (y))

 #define MSVALID(left, x, right) (MSMIN((MSMAX(left, x)), right))

--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow_fp32.cc
@ -186,8 +186,8 @@ int ConvolutionSWCPUKernel::Run() {
    FreeTmpBuffer();
    return error_code;
  }
-  auto out_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
  if (oc_res_ != 0) {
+    auto out_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
    PackNHWCXToNHWCFp32(output_data_, out_data, conv_param_->output_batch_,
                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_, oc_tile_);
  }