From 247e24d8d43a470def665afa24e1c92ce3436f99 Mon Sep 17 00:00:00 2001
From: greatpan
Date: Thu, 25 Aug 2022 16:54:34 +0800
Subject: [PATCH] conv im2col support avx512

---
 .jenkins/check/config/filter_cppcheck.txt     |   1 +
 .../device/cpu/kernel/nnacl/CMakeLists.txt    |   4 +-
 .../nnacl/fp32/conv_im2col_avx512_fp32.c      |  92 ++++++++++++++
 .../nnacl/fp32/conv_im2col_avx512_fp32.h      |  38 ++++++
 .../cpu/kernel/nnacl/fp32/conv_im2col_fp32.c  |  64 ++++++++++
 .../cpu/kernel/nnacl/fp32/conv_im2col_fp32.h  |  33 +++++
 .../device/cpu/kernel/nnacl/fp32/div_fp32.c   |   4 -
 .../plugin/device/cpu/kernel/nnacl/op_base.h  |   1 +
 .../lite/src/litert/kernel/cpu/CMakeLists.txt |   4 +-
 .../fp32/convolution_im2col_avx512_fp32.cc    | 119 ++++++++++++++++++
 .../cpu/fp32/convolution_im2col_avx512_fp32.h |  39 ++++++
 .../cpu/fp32/convolution_im2col_avx_fp32.cc   |  12 ++
 .../cpu/fp32/convolution_im2col_base_fp32.cc  |  10 +-
 .../cpu/fp32/convolution_im2col_fp32.cc       |  29 ++++-
 14 files changed, 437 insertions(+), 13 deletions(-)
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_avx512_fp32.c
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_avx512_fp32.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_fp32.c
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_fp32.h
 create mode 100644 mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_avx512_fp32.cc
 create mode 100644 mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_avx512_fp32.h

diff --git a/.jenkins/check/config/filter_cppcheck.txt b/.jenkins/check/config/filter_cppcheck.txt
index 397bbb3bc85..eab3d75d040 100644
--- a/.jenkins/check/config/filter_cppcheck.txt
+++ b/.jenkins/check/config/filter_cppcheck.txt
@@ -61,3 +61,4 @@
 "mindspore/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/experimental"  "unreadVariable"
 "mindspore/mindspore/lite/python/src/pybind_module.cc"  "syntaxError"
 "mindspore/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_fp32.cc"  "knownConditionTrueFalse"
+"mindspore/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_fp32.cc"  "shadowVariable"
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/CMakeLists.txt b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/CMakeLists.txt
index b1f79fea8e9..8ca042818e8 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/CMakeLists.txt
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/CMakeLists.txt
@@ -92,7 +92,9 @@ file(GLOB KERNEL_SRC
     ${NNACL_DIR}/experimental/*.c
 )
 
-set(KERNEL_AVX512_FILE ${NNACL_DIR}/fp32/matmul_avx512_fp32.c)
+set(KERNEL_AVX512_FILE ${NNACL_DIR}/fp32/matmul_avx512_fp32.c
+    ${NNACL_DIR}/fp32/conv_im2col_avx512_fp32.c
+)
 list(REMOVE_ITEM KERNEL_SRC ${KERNEL_AVX512_FILE})
 
 set(KERNEL_AVX_FILE ${NNACL_DIR}/fp32/conv_sw_avx_fp32.c
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_avx512_fp32.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_avx512_fp32.c
new file mode 100644
index 00000000000..1f85816de2f
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_avx512_fp32.c
@@ -0,0 +1,92 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/fp32/conv_im2col_avx512_fp32.h"
+#include "nnacl/fp32/conv_im2col_fp32.h"
+#include "nnacl/fp32/matmul_avx512_fp32.h"
+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
+
+// fp32 conv common
+void ConvIm2ColAVX512Fp32(const float *input_data, float *packed_input, const float *packed_weight,
+                          const float *bias_data, float *output_data, int task_id, const ConvParameter *conv_param,
+                          int cal_num) {
+  if (conv_param->thread_num_ == 0) {
+    return;
+  }
+  int output_hw = conv_param->output_h_ * conv_param->output_w_;
+  int out_channel_align = UP_ROUND(conv_param->output_channel_, C16NUM);
+
+  int block_per_thread = UP_DIV(UP_DIV(output_hw, cal_num), conv_param->thread_num_);
+  int start_block = block_per_thread * task_id;
+  int start_hw = start_block * cal_num;
+  int end_hw = MSMIN(output_hw, (start_block + block_per_thread) * cal_num);
+  if (start_hw >= end_hw) {
+    return;
+  }
+  int out_stride = out_channel_align * cal_num;
+  int deep = conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_;
+  packed_input += task_id * deep * cal_num;
+  size_t input_size = deep * cal_num * sizeof(float);
+
+  for (int b = 0; b < conv_param->input_batch_; b++) {
+    int in_offset = b * conv_param->input_channel_ * conv_param->input_h_ * conv_param->input_w_;
+    int out_offset = b * out_channel_align * output_hw + start_hw * out_channel_align;
+    for (int i = start_hw; i < end_hw; i += cal_num, out_offset += out_stride) {
+      int real_cal_row = MSMIN(output_hw - i, cal_num);
+      memset(packed_input, 0, input_size);
+      Im2ColDataPackUnitFp32(input_data + in_offset, conv_param, packed_input, real_cal_row, i);
+
+      float *gemm_output = output_data + out_offset;
+      MatMulAvx512Fp32(packed_input, packed_weight, gemm_output, bias_data, (size_t)conv_param->act_type_, deep,
+                       out_channel_align, out_channel_align, real_cal_row);
+    }
+  }
+}
+
+// fp32 conv common
+void ConvIm2ColAVX512Fp32CutByBatch(const float *input_data, float *packed_input, const float *packed_weight,
+                                    const float *bias_data, float *output_data, int task_id,
+                                    const ConvParameter *conv_param, int cal_num) {
+  if (conv_param->thread_num_ == 0) {
+    return;
+  }
+  int output_hw = conv_param->output_h_ * conv_param->output_w_;
+  int out_channel_align = UP_ROUND(conv_param->output_channel_, C16NUM);
+
+  int block_batch_per_thread = UP_DIV(conv_param->input_batch_, conv_param->thread_num_);
+  int start_batch = block_batch_per_thread * task_id;
+  int end_batch = MSMIN(conv_param->input_batch_, (start_batch + block_batch_per_thread));
+
+  int out_stride = out_channel_align * cal_num;
+  int deep = conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_;
+  packed_input += task_id * deep * cal_num;
+
+  size_t input_size = deep * cal_num * sizeof(float);
+
+  for (int b = start_batch; b < end_batch; b++) {
+    int in_offset = b * conv_param->input_channel_ * conv_param->input_h_ * conv_param->input_w_;
+    int out_offset = b * out_channel_align * output_hw;
+    for (int i = 0; i < output_hw; i += cal_num, out_offset += out_stride) {
+      int real_cal_row = MSMIN(output_hw - i, cal_num);
+      memset(packed_input, 0, input_size);
+      Im2ColDataPackUnitFp32(input_data + in_offset, conv_param, packed_input, real_cal_row, i);
+
+      float *gemm_output = output_data + out_offset;
+      MatMulAvx512Fp32(packed_input, packed_weight, gemm_output, bias_data, (size_t)conv_param->act_type_, deep,
+                       out_channel_align, out_channel_align, real_cal_row);
+    }
+  }
+}
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_avx512_fp32.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_avx512_fp32.h
new file mode 100644
index 00000000000..1af6e4566f9
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_avx512_fp32.h
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_CONV_IM2COL_AVX512_H_
+#define MINDSPORE_NNACL_FP32_CONV_IM2COL_AVX512_H_
+
+#include "nnacl/conv_parameter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ConvIm2ColAVX512Fp32(const float *input_data, float *packed_input, const float *packed_weight,
+                          const float *bias_data, float *output_data, int task_id, const ConvParameter *conv_param,
+                          int cal_num);
+
+void ConvIm2ColAVX512Fp32CutByBatch(const float *input_data, float *packed_input, const float *packed_weight,
+                                    const float *bias_data, float *output_data, int task_id,
+                                    const ConvParameter *conv_param, int cal_num);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_NNACL_FP32_CONV_IM2COL_AVX512_H_
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_fp32.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_fp32.c
new file mode 100644
index 00000000000..eeab3ae1c6c
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_fp32.c
@@ -0,0 +1,64 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/fp32/conv_im2col_fp32.h"
+
+void Im2ColDataPackUnitFp32(const float *input_data, const ConvParameter *conv_param, float *packed_input,
+                            int real_cal_num, int block_index) {
+  // input format : nhwc
+  int kernel_h = conv_param->kernel_h_;
+  int kernel_w = conv_param->kernel_w_;
+  int kernel_plane = kernel_h * kernel_w;
+  int dilation_h = conv_param->dilation_h_;
+  int dilation_w = conv_param->dilation_w_;
+  int out_w = conv_param->output_w_;
+  if (dilation_h == 0 || dilation_w == 0 || out_w == 0) {
+    return;
+  }
+  int in_channel = conv_param->input_channel_;
+  int in_w = conv_param->input_w_;
+  for (int i = 0; i < real_cal_num; i++) {
+    int block_start = block_index + i;
+    int input_h = block_start / out_w * conv_param->stride_h_ - conv_param->pad_u_;
+    int input_w = block_start % out_w * conv_param->stride_w_ - conv_param->pad_l_;
+    if (conv_param->input_h_ - input_h < 0 || in_w - input_w < 0) {
+      continue;
+    }
+    int input_stride = (input_h * in_w + input_w) * in_channel;
+    int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h));
+    int kh_e = MSMIN(kernel_h, UP_DIV(conv_param->input_h_ - input_h, dilation_h));
+    int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w));
+    int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w));
+    if (dilation_w == 1 && dilation_h == 1) {
+      for (int j = kh_s; j < kh_e; j++) {
+        int input_y_stride = j * in_w * in_channel + input_stride;
+        int input_x_stride = input_y_stride + kw_s * in_channel;
+        int input_plane_offset = (j * kernel_w + kw_s) * in_channel + i * in_channel * kernel_plane;
+        memcpy(packed_input + input_plane_offset, input_data + input_x_stride,
+               (kw_e - kw_s) * in_channel * sizeof(float));
+      }  // kernel_h loop
+    } else {
+      for (int j = kh_s; j < kh_e; j++) {
+        int input_y_stride = j * dilation_h * in_w * in_channel + input_stride;
+        for (int k = kw_s; k < kw_e; ++k) {
+          int input_x_stride = input_y_stride + k * dilation_w * in_channel;
+          int input_plane_offset = (j * kernel_w + k) * in_channel + i * in_channel * kernel_plane;
+          memcpy(packed_input + input_plane_offset, input_data + input_x_stride, in_channel * sizeof(float));
+        }
+      }  // kernel_h loop
+    }
+  }  // tile num loop
+}
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_fp32.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_fp32.h
new file mode 100644
index 00000000000..10eb2724b60
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/conv_im2col_fp32.h
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_CONV_IM2COL_H_
+#define MINDSPORE_NNACL_FP32_CONV_IM2COL_H_
+
+#include "nnacl/conv_parameter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Im2ColDataPackUnitFp32(const float *input_data, const ConvParameter *conv_param, float *packed_input,
+                            int real_cal_num, int block_index);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_NNACL_FP32_CONV_IM2COL_H_
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/div_fp32.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/div_fp32.c
index f6fa5994e8c..60a27df1943 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/div_fp32.c
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/div_fp32.c
@@ -29,10 +29,6 @@ int ElementOptDiv(const float *in0, const float *in1, float *out, int size, cons
       out[index] = in0[0] / in1[index];
     }
   } else {
-    if (in1[0] == 0) {
-      return NNACL_ERRCODE_DIVISOR_ZERO;
-    }
-
     SIMD_RUN_NO_SCALAR(ElementOptDivNum1, index, in0, in1, out, size);
     for (; index < size; index++) {
       out[index] = in0[index] / in1[0];
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h
index 26221249a23..23f3f3ca3f4 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h
@@ -52,6 +52,7 @@
 #define C56NUM 56
 #define C64NUM 64
 #define C128NUM 128
+#define C150NUM 150
 #define C256NUM 256
 #define C1500NUM 1500
 #define TILE_NUM 8
diff --git a/mindspore/lite/src/litert/kernel/cpu/CMakeLists.txt b/mindspore/lite/src/litert/kernel/cpu/CMakeLists.txt
index 9af8c654999..1cd9a69386b 100644
--- a/mindspore/lite/src/litert/kernel/cpu/CMakeLists.txt
+++ b/mindspore/lite/src/litert/kernel/cpu/CMakeLists.txt
@@ -59,7 +59,9 @@ if(NOT("${X86_64_SIMD}" STREQUAL "avx" OR "${X86_64_SIMD}" STREQUAL "avx512"))
 endif()
 
 if(NOT("${X86_64_SIMD}" STREQUAL "avx512"))
-    set(KERNEL_SRC_AVX512_FILE ${CMAKE_CURRENT_SOURCE_DIR}/fp32/matmul_fp32_avx512.cc)
+    set(KERNEL_SRC_AVX512_FILE ${CMAKE_CURRENT_SOURCE_DIR}/fp32/convolution_im2col_avx512_fp32.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/fp32/matmul_fp32_avx512.cc
+    )
     list(REMOVE_ITEM KERNEL_SRC ${KERNEL_SRC_AVX512_FILE})
 endif()
 
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_avx512_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_avx512_fp32.cc
new file mode 100644
index 00000000000..6b513fd5f54
--- /dev/null
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_avx512_fp32.cc
@@ -0,0 +1,119 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/cpu/fp32/convolution_im2col_avx512_fp32.h"
+#include "nnacl/fp32/conv_im2col_avx512_fp32.h"
+
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_NULL_PTR;
+using mindspore::lite::RET_OK;
+
+namespace mindspore::kernel {
+void ConvolutionIm2ColAVX512CPUKernel::InitGlobalVariable() {
+  oc_tile_ = C16NUM;
+  row_tile_ = C150NUM;
+
+  rowMajor2ColNMajorFunc = RowMajor2Col64Major;
+}
+
+int ConvolutionIm2ColAVX512CPUKernel::InitTmpBuffer() {
+  MS_ASSERT(ctx_->allocator != nullptr);
+  CHECK_NULL_RETURN(out_tensors_[0]);
+  CHECK_NULL_RETURN(out_tensors_[0]->MutableData());
+
+  int unit_size =
+    conv_param_->kernel_h_ * conv_param_->kernel_w_ * conv_param_->input_channel_ * row_tile_ * thread_count_;
+
+  if (packed_input_ != nullptr) {
+    ctx_->allocator->Free(packed_input_);
+    packed_input_ = nullptr;
+  }
+  packed_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(unit_size * sizeof(float)));
+  if (packed_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc packed input failed.";
+    return RET_ERROR;
+  }
+
+  if (conv_param_->output_channel_ % oc_tile_ != 0) {
+    output_need_align_ = true;
+    if (tmp_output_ != nullptr) {
+      ctx_->allocator->Free(tmp_output_);
+    }
+
+    // avx512 need to malloc dst aligned to C16NUM
+    int oc_algin = UP_ROUND(conv_param_->output_channel_, oc_tile_);
+    int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc_algin;
+    tmp_output_ =
+      reinterpret_cast<float *>(ctx_->allocator->Malloc(pack_output_size * static_cast<size_t>(sizeof(float))));
+    if (tmp_output_ == nullptr) {
+      MS_LOG(ERROR) << "malloc tmp output data failed.";
+      return RET_NULL_PTR;
+    }
+  }
+
+  return RET_OK;
+}
+
+int ConvolutionIm2ColAVX512CPUKernel::RunImpl(int task_id) {
+  auto ori_input_data = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->data());
+  if (out_tensors_[0]->format() != NC4HW4) {
+    if (use_batch_cut_flag_) {
+      ConvIm2ColAVX512Fp32CutByBatch(ori_input_data, packed_input_, reinterpret_cast<float *>(packed_weight_),
+                                     reinterpret_cast<float *>(bias_data_), tmp_output_, task_id, conv_param_,
+                                     row_tile_);
+    } else {
+      ConvIm2ColAVX512Fp32(ori_input_data, packed_input_, reinterpret_cast<float *>(packed_weight_),
+                           reinterpret_cast<float *>(bias_data_), tmp_output_, task_id, conv_param_, row_tile_);
+    }
+  } else {
+    MS_LOG(ERROR) << "ConvolutionIm2ColAVX512CPUKernel do not support NC4HW4 output-format's avx512 version";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ConvolutionIm2ColAVX512CPUKernel::Run() {
+  auto ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    FreeTmpBuffer();
+    return RET_ERROR;
+  }
+  auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData());
+  if (!output_need_align_) {
+    tmp_output_ = output_addr;
+  }
+  if (RepackWeight() != RET_OK) {
+    FreeTmpBuffer();
+    MS_LOG(ERROR) << "Repack weight failed.";
+    return RET_ERROR;
+  }
+  ret = ParallelLaunch(this->ms_context_, ConvolutionIm2ColImpl, this, thread_count_);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "conv error error_code[" << ret << "]";
+  }
+
+  if (output_need_align_) {
+    PackNHWCXToNHWCFp32(tmp_output_, output_addr, conv_param_->output_batch_,
+                        conv_param_->output_w_ * conv_param_->output_h_, conv_param_->output_channel_, oc_tile_);
+  } else {
+    tmp_output_ = nullptr;
+  }
+
+  FreeTmpBuffer();
+  return ret;
+}
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_avx512_fp32.h b/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_avx512_fp32.h
new file mode 100644
index 00000000000..f16bee7717f
--- /dev/null
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_avx512_fp32.h
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_CONVOLUTION_IM2COL_AVX512_FP32_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_CONVOLUTION_IM2COL_AVX512_FP32_H_
+
+#include <vector>
+#include "src/litert/kernel/cpu/fp32/convolution_im2col_base_fp32.h"
+
+namespace mindspore::kernel {
+class ConvolutionIm2ColAVX512CPUKernel : public ConvolutionIm2ColBaseCPUKernel {
+ public:
+  ConvolutionIm2ColAVX512CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                                   const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
+                                   float *origin_weight, float *origin_bias)
+      : ConvolutionIm2ColBaseCPUKernel(parameter, inputs, outputs, ctx, origin_weight, origin_bias) {}
+  ~ConvolutionIm2ColAVX512CPUKernel() override {}
+
+  void InitGlobalVariable() override;
+  int InitTmpBuffer() override;
+  int RunImpl(int task_id) override;
+  int Run() override;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_CONVOLUTION_IM2COL_AVX512_FP32_H_
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_avx_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_avx_fp32.cc
index acc4ab63630..8a07f8e8d53 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_avx_fp32.cc
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_avx_fp32.cc
@@ -37,12 +37,20 @@ int ConvolutionIm2ColAVXCPUKernel::InitTmpBuffer() {
   int unit_size =
     conv_param_->kernel_h_ * conv_param_->kernel_w_ * conv_param_->input_channel_ * row_tile_ * thread_count_;
 
+  if (packed_input_ != nullptr) {
+    ctx_->allocator->Free(packed_input_);
+    packed_input_ = nullptr;
+  }
   packed_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(unit_size * sizeof(float)));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed input failed.";
     return RET_ERROR;
   }
 
+  if (col_major_input_ != nullptr) {
+    ctx_->allocator->Free(col_major_input_);
+    col_major_input_ = nullptr;
+  }
   col_major_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(unit_size * sizeof(float)));
   if (col_major_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc col_major_input_ failed.";
@@ -54,6 +62,10 @@ int ConvolutionIm2ColAVXCPUKernel::InitTmpBuffer() {
     int oc_algin = UP_DIV(conv_param_->output_channel_, oc_tile_);
     int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc_tile_ *
                            oc_algin;
+    if (tmp_output_ != nullptr) {
+      ctx_->allocator->Free(tmp_output_);
+      tmp_output_ = nullptr;
+    }
     tmp_output_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(pack_output_size * sizeof(float)));
     if (tmp_output_ == nullptr) {
       MS_LOG(ERROR) << "Malloc tmp_output_ buffer is failed.";
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_base_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_base_fp32.cc
index 083b5f25998..951c0860724 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_base_fp32.cc
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_base_fp32.cc
@@ -25,6 +25,7 @@
 
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_INFER_INVALID;
+using mindspore::lite::RET_NULL_PTR;
 using mindspore::lite::RET_OK;
 
 namespace mindspore::kernel {
@@ -44,13 +45,20 @@ int ConvolutionIm2ColBaseCPUKernel::InitTmpBuffer() {
 
   int unit_size =
     conv_param_->kernel_h_ * conv_param_->kernel_w_ * conv_param_->input_channel_ * row_tile_ * thread_count_;
-
+  if (packed_input_ != nullptr) {
+    ctx_->allocator->Free(packed_input_);
+    packed_input_ = nullptr;
+  }
   packed_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(unit_size * sizeof(float)));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed input failed.";
     return RET_ERROR;
   }
 
+  if (col_major_input_ != nullptr) {
+    ctx_->allocator->Free(col_major_input_);
+    col_major_input_ = nullptr;
+  }
   col_major_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(unit_size * sizeof(float)));
   if (col_major_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc col_major_input_ failed.";
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_fp32.cc
index 060a17f5c37..409a1ee7cc2 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_fp32.cc
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_im2col_fp32.cc
@@ -16,6 +16,9 @@
 
 #include "src/litert/kernel/cpu/fp32/convolution_im2col_fp32.h"
 #include "src/litert/kernel/cpu/fp32/convolution_im2col_base_fp32.h"
+#if defined(ENABLE_AVX512)
+#include "src/litert/kernel/cpu/fp32/convolution_im2col_avx512_fp32.h"
+#endif
 #if defined(ENABLE_AVX)
 #include "src/litert/kernel/cpu/fp32/convolution_im2col_avx_fp32.h"
 #endif
@@ -31,12 +34,22 @@
 #if defined(ENABLE_ARM64)
 #include "src/litert/kernel/cpu/fp32/convolution_im2col_arm64_fp32.h"
 #endif
+#include "nnacl/intrinsics/ms_simd_cpu_info.h"
 
 namespace mindspore::kernel {
 LiteKernel *CreateConvolutionIm2ColCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                              const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                                              float *origin_weight, float *origin_bias) {
   LiteKernel *kernel = nullptr;
+#if defined(ENABLE_AVX512)
+  if (kernel == nullptr && outputs.front()->format() != NC4HW4) {
+    AVX512_HARDWARE_SELF_AWARENESS_BEGIN;
+    kernel = new (std::nothrow)
+      kernel::ConvolutionIm2ColAVX512CPUKernel(parameter, inputs, outputs, ctx, origin_weight, origin_bias);
+    AVX512_HARDWARE_SELF_AWARENESS_END;
+  }
+#endif
+
 #if defined(ENABLE_AVX)
   if (kernel == nullptr) {
     kernel = new (std::nothrow)
@@ -45,21 +58,25 @@ LiteKernel *CreateConvolutionIm2ColCPUKernel(OpParameter *parameter, const std::
 #endif
 
 #if defined(ENABLE_SSE)
-  if (kernel == nullptr) {
+  if (kernel == nullptr && outputs.front()->format() != NC4HW4) {
     kernel = new (std::nothrow)
      kernel::ConvolutionIm2ColSSECPUKernel(parameter, inputs, outputs, ctx, origin_weight, origin_bias);
   }
 #endif
 
 #if defined(ENABLE_ARM64)
-  kernel = new (std::nothrow)
-    kernel::ConvolutionIm2ColARM64CPUKernel(parameter, inputs, outputs, ctx, origin_weight, origin_bias);
+  if (kernel == nullptr) {
+    kernel = new (std::nothrow)
+      kernel::ConvolutionIm2ColARM64CPUKernel(parameter, inputs, outputs, ctx, origin_weight, origin_bias);
+  }
 #elif defined(ENABLE_ARM32)
-  kernel = new (std::nothrow)
-    kernel::ConvolutionIm2ColARM32CPUKernel(parameter, inputs, outputs, ctx, origin_weight, origin_bias);
+  if (kernel == nullptr && outputs.front()->format() != NC4HW4) {
+    kernel = new (std::nothrow)
+      kernel::ConvolutionIm2ColARM32CPUKernel(parameter, inputs, outputs, ctx, origin_weight, origin_bias);
+  }
 #endif
 
-  if (kernel == nullptr) {
+  if (kernel == nullptr && outputs.front()->format() != NC4HW4) {
     kernel = new (std::nothrow)
      kernel::ConvolutionIm2ColBaseCPUKernel(parameter, inputs, outputs, ctx, origin_weight, origin_bias);
   }