!20493 [MS][LITE][CPU] x86 init time and operator optimization

Merge pull request !20493 from liuzhongkai/init_time
i-robot 2021-07-20 10:49:53 +00:00 committed by Gitee
commit 62b39b5d7e
6 changed files with 92 additions and 14 deletions


@@ -27,15 +27,12 @@ int LayerNormMeanAndSquareFp16(const float16_t *src, int num, float16_t *mean, f
  float square_sum = 0.0f;
  for (; index <= num - C8NUM; index += C8NUM) {
    float16x8_t srcv = vld1q_f16(src + index);
    float16x8_t squarev = vmulq_f16(srcv, srcv);
    for (int i = 0; i < C8NUM; ++i) {
      square_sum += srcv[i] * srcv[i];
    }
    float16x4_t sum2 = vadd_f16(vget_low_f16(srcv), vget_high_f16(srcv));
    float32x4_t sum_f32 = vcvt_f32_f16(sum2);
    sum += MS_ADDVQ_F32(sum_f32);
    float16x4_t square2 = vadd_f16(vget_low_f16(squarev), vget_high_f16(squarev));
    float32x4_t square_f32 = vcvt_f32_f16(square2);
    square_sum += MS_ADDVQ_F32(square_f32);
  }
  for (; index < num; index++) {
    sum += src[index];
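
This hunk drops the scalar square accumulation in favor of the same pairwise reduction already used for the running sum: square in fp16, fold the eight fp16 lanes to four with vadd_f16, widen to fp32 with vcvt_f32_f16, then add across lanes. A minimal standalone sketch of that pattern, assuming ARMv8.2-A fp16 arithmetic and assuming MS_ADDVQ_F32 is the project's wrapper for vaddvq_f32:

#include <arm_neon.h>

// Accumulate the sum and the sum of squares of an fp16 array in fp32.
static void SumAndSquareSumFp16(const float16_t *src, int num, float *sum, float *square_sum) {
  int index = 0;
  for (; index <= num - 8; index += 8) {
    float16x8_t srcv = vld1q_f16(src + index);
    float16x8_t squarev = vmulq_f16(srcv, srcv);
    // Fold 8 fp16 lanes to 4, widen to fp32, then reduce the 4 lanes to a scalar.
    *sum += vaddvq_f32(vcvt_f32_f16(vadd_f16(vget_low_f16(srcv), vget_high_f16(srcv))));
    *square_sum += vaddvq_f32(vcvt_f32_f16(vadd_f16(vget_low_f16(squarev), vget_high_f16(squarev))));
  }
  for (; index < num; index++) {  // scalar tail for the last num % 8 elements
    *sum += (float)src[index];
    *square_sum += (float)src[index] * (float)src[index];
  }
}

Widening to fp32 before the horizontal add keeps the accumulators in fp32, which avoids the precision loss a long fp16 accumulation would suffer.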


@@ -166,9 +166,83 @@ void PackNHWCToNHWCXFp32(const void *src, void *dst, int batch, int plane, int c
  }
}
void PackNHWCTo1HWCNXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
#ifdef ENABLE_AVX
// PackNHWCToNXHWCXFp32 is the assembly-optimized counterpart of SWPackNHWCToNXHWCXFp32
void PackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                          float *tmp_weight, const float *src) {
  // pack weight NHWC to 1HWCxN32 1HWCxN24 1HWCxN16 1HWCxN8
  // pack weight NHWC to N32HWC32 N24HWC24 N16HWC16 N8HWC8
  // output_channel: batch
  int ic8 = DOWN_ROUND(input_channel, C8NUM);
  int oc_block8 = DOWN_DIV(output_channel, C8NUM);
  int oc_block = 0;
  int oc = 0;
  int oc_remainder_step = 0;
  if (oc_block8 != oc_block_num) {
    oc_block8 = oc_block8 / C4NUM * C4NUM;
    oc_remainder_step = (oc_block_num - oc_block8) * C8NUM;
  }
  int plane = kernel_w * kernel_h;
  if (plane == 1) {  // conv 1x1 weight pack
    for (; oc < oc_block8; oc += (oc_block / C8NUM)) {
      oc_block = MSMIN(C4NUM, oc_block8 - oc) * C8NUM;  // max_tile = 32 ==> 24 ==> 16 ==> 8
      for (int oc_tmp = 0; oc_tmp < oc_block; oc_tmp += C8NUM) {
        int ic = 0;
        for (; ic < ic8; ic += C8NUM) {
          Transpose8X8Fp32Avx(src + ic, tmp_weight + ic * oc_block + oc_tmp, input_channel, oc_block);
        }
        for (; ic < input_channel; ++ic) {
          for (int j = 0; j < C8NUM; ++j) {
            tmp_weight[ic * oc_block + oc_tmp + j] = src[ic + input_channel * j];
          }
        }
        src += C8NUM * input_channel;
      }
      tmp_weight += oc_block * input_channel;
    }
    oc = output_channel - oc_block8 * C8NUM;
    for (int oc_remainder = 0; oc_remainder < oc; ++oc_remainder) {
      for (int ic = 0; ic < input_channel; ++ic) {
        tmp_weight[oc_remainder + oc_remainder_step * ic] = src[ic + oc_remainder * input_channel];
      }
    }
  } else {
    for (; oc < oc_block8; oc += (oc_block / C8NUM)) {
      oc_block = MSMIN(C4NUM, oc_block8 - oc) * C8NUM;  // max_tile = 32 ==> 24 ==> 16 ==> 8
      for (int oc_tmp = 0; oc_tmp < oc_block; oc_tmp += C8NUM) {
        for (int hw = 0; hw < plane; ++hw) {
          int ic = 0;
          for (; ic < ic8; ic += C8NUM) {
            Transpose8X8Fp32Avx(src + hw * input_channel + ic,
                                tmp_weight + hw * oc_block * input_channel + ic * oc_block + oc_tmp,
                                input_channel * plane, oc_block);
          }
          for (; ic < input_channel; ++ic) {
            for (int j = 0; j < C8NUM; ++j) {
              tmp_weight[ic * oc_block + oc_tmp + j + hw * oc_block * input_channel] =
                src[ic + input_channel * j * plane + hw * input_channel];
            }
          }
        }
        src += C8NUM * plane * input_channel;
      }
      tmp_weight += oc_block * input_channel * plane;
    }
    oc = output_channel - oc_block8 * C8NUM;
    for (int oc_remainder = 0; oc_remainder < oc; ++oc_remainder) {
      for (int hw = 0; hw < plane; ++hw) {
        for (int ic = 0; ic < input_channel; ++ic) {
          tmp_weight[oc_remainder + oc_remainder_step * ic + hw * input_channel * oc_remainder_step] =
            src[ic + (oc_remainder * plane + hw) * input_channel];
        }
      }
    }
  }
}
#ifdef ENABLE_DEBUG
void SWPackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                            float *tmp_weight, const float *src) {
  // pack weight NHWC to N32HWC32 N24HWC24 N16HWC16 N8HWC8
  int oc_block = 0;
  for (int i = 0; i < oc_block_num; i += oc_block) {
    oc_block = MSMIN(C4NUM, oc_block_num - i);  // max_tile = 4
@@ -189,6 +263,9 @@ void PackNHWCTo1HWCNXFp32(int kernel_h, int kernel_w, int output_channel, int oc
    }
  }
}
#endif
#endif
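
For the 1x1 (plane == 1) path above, the packed layout stores each block of output channels with the input channel as the outer dimension and the block's eight output channels innermost, so the AVX sliding-window kernel can load one 256-bit weight vector per input channel. A simplified scalar sketch of the N8HWC8 case, assuming output_channel is a multiple of 8 (the real routine picks 32/24/16/8-wide blocks and uses Transpose8X8Fp32Avx for the full 8x8 tiles):

// Pack 1x1-conv weights from OC x IC (NHWC with plane == 1) into blocks of
// 8 output channels laid out as [block][input_channel][8].
void PackOC8For1x1Sketch(const float *src, float *dst, int output_channel, int input_channel) {
  for (int ob = 0; ob < output_channel / 8; ++ob) {
    for (int ic = 0; ic < input_channel; ++ic) {
      for (int j = 0; j < 8; ++j) {  // 8 consecutive output channels, contiguous in dst
        dst[(ob * input_channel + ic) * 8 + j] = src[(ob * 8 + j) * input_channel + ic];
      }
    }
  }
}

The oc_remainder loops in the real routine handle the tail when output_channel is not a multiple of 8, writing the leftover channels with stride oc_remainder_step instead.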
void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel) {
  int c8 = UP_DIV(channel, C8NUM);
  int c8_channel = c8 * C8NUM;


@@ -30,8 +30,6 @@ void PackNHWCToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int
void PackNCHWToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNHWCXFp32(const void *src, void *dst, int batch, int plane, int channel, int oc_tile);
void PackNHWCTo1HWCNXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                          float *tmp_weight, const float *src);
void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel);
// Note: If not multithreaded, please set task_id = 0 and thread_count = 0;
void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count);
@@ -61,6 +59,12 @@ void Transpose8X8Fp32Arm64(const float *src_ptr, float *dst_ptr, int src_stride,
void Transpose8X8Fp32Arm32(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#endif
#ifdef ENABLE_AVX
void PackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                          float *tmp_weight, const float *src);
#ifdef ENABLE_DEBUG
void SWPackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc_block_num, int input_channel,
                            float *tmp_weight, const float *src);
#endif
void Transpose8X8Fp32Avx(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#endif
#if defined(ENABLE_SSE) && !defined(ENABLE_AVX)


@@ -3895,7 +3895,7 @@ int SelectOutputUnit(const ConvParameter *conv_param) {
bool CheckIfUseWinograd(int *output_unit, const ConvParameter *conv_param) {
  if (conv_param->kernel_w_ == conv_param->kernel_h_ && conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1 &&
      conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1) {
      conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1 && conv_param->input_channel_ != 1) {
    *output_unit = SelectOutputUnit(conv_param);
    if (*output_unit > 1) {
      return true;
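
The newly added input_channel_ != 1 condition keeps single-input-channel convolutions off the Winograd path, where the input and weight transforms would likely cost more than the multiplications they save. A hedged sketch of how such a check is typically consumed at a call site (the kernel type and helper names here are hypothetical):

int output_unit = 0;
ConvKernel *kernel = NULL;  // hypothetical kernel handle
if (CheckIfUseWinograd(&output_unit, conv_param)) {
  kernel = CreateWinogradConvKernel(conv_param, output_unit);  // hypothetical helper
} else {
  kernel = CreateCommonConvKernel(conv_param);  // hypothetical helper, e.g. sliding window
}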


@@ -50,7 +50,7 @@ int ConvolutionDepthwiseSWCPUKernelX86::InitWeightBias() {
    MS_LOG(ERROR) << "Malloc packed_weight_ failed!";
    return RET_NULL_PTR;
  }
  PackNHWCTo1HWCNXFp32(weight_tensor->Height(), weight_tensor->Width(), weight_tensor->Batch(), oc_algin,
  PackNHWCToNXHWCXFp32(weight_tensor->Height(), weight_tensor->Width(), weight_tensor->Batch(), oc_algin,
                       weight_tensor->Channel(), packed_weight_, origin_weight_);
  if (in_tensors_.size() == kInputSize2) {
    auto bias_size = oc_algin * oc_tile_;
@@ -194,7 +194,7 @@ void ConvolutionDepthwiseSWCPUKernelX86::FreePackedInputOutput() {
void ConvolutionDepthwiseSWCPUKernelX86::PackWeight() {
  auto weight_tensor = in_tensors_.at(kWeightIndex);
  int oc_algin = UP_DIV(weight_tensor->Batch(), oc_tile_);
  PackNHWCTo1HWCNXFp32(weight_tensor->Height(), weight_tensor->Width(), weight_tensor->Batch(), oc_algin,
  PackNHWCToNXHWCXFp32(weight_tensor->Height(), weight_tensor->Width(), weight_tensor->Batch(), oc_algin,
                       weight_tensor->Channel(), packed_weight_, origin_weight_);
}
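
At both call sites oc_algin is UP_DIV(weight_tensor->Batch(), oc_tile_), i.e. the number of oc_tile_-wide output-channel blocks the packed buffer is sized for; a final partial block is only partially filled, which is why the sliding-window kernel below memsets the buffer before packing. A small worked sketch of that sizing arithmetic, assuming oc_tile_ is C8NUM (8) as on AVX:

#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))

int main(void) {
  int output_channel = 30;  // e.g. weight_tensor->Batch()
  int oc_tile = 8;          // assumed oc_tile_ for the AVX sliding-window kernel
  int oc_algin = UP_DIV(output_channel, oc_tile);
  // 4 blocks of 8 channels: 32 slots in the packed buffer, the last 2 are padding.
  printf("blocks=%d padded_oc=%d\n", oc_algin, oc_algin * oc_tile);
  return 0;
}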


@@ -45,7 +45,7 @@ int ConvolutionSWCPUKernel::InitWeightBias() {
    return RET_NULL_PTR;
  }
  memset(packed_weight_, 0, pack_weight_size * sizeof(float));
  PackNHWCTo1HWCNXFp32(kernel_h, kernel_w, output_channel, oc_block_num, input_channel, packed_weight_,
  PackNHWCToNXHWCXFp32(kernel_h, kernel_w, output_channel, oc_block_num, input_channel, packed_weight_,
                       ori_weight_data_);
  if (in_tensors_.size() == kInputSize2) {
    packed_bias_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_tile_ * sizeof(float)));