forked from mindspore-Ecosystem/mindspore
!8277 [MS][LITE][CPU]optimize int8 reduce mean && extract functions
From: @fuzhiye Reviewed-by: @zhang_xue_tong Signed-off-by:
commit 4330052001

@@ -20,6 +20,160 @@
#include "nnacl/quantization/fixed_point.h"
#include "nnacl/common_func.h"

int ReduceMeanN(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}
int ReduceMeanH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}
int ReduceMeanW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}
int ReduceMeanC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}
int ReduceMeanNH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}
int ReduceMeanNW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}
int ReduceMeanNC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}
int ReduceMeanHW(int n, int plane, int count, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg,
                 int32_t bias) {
  int stride = plane * UP_ROUND(c, C4NUM);
  for (int batch = 0; batch < n; ++batch) {
    int8_t *in_ptr = in_data + batch * stride;
    int8_t *out_ptr = out_data + batch * c;
    for (int i = 0; i < count; ++i) {
      int32_t sum_array = 0;
      int j = 0;
#ifdef ENABLE_ARM64
      for (; j < plane; j += 16) {
        int8x16_t in_data_vec = vld1q_s8(in_ptr);
        sum_array += vaddlvq_s8(in_data_vec);
        in_ptr += 16;
      }
      for (; j < plane; j += 8) {
        int8x8_t in_data_vec = vld1_s8(in_ptr);
        sum_array += vaddlv_s8(in_data_vec);
        in_ptr += 8;
      }
      for (; j < plane; j += 4) {
        int32x4_t in_data_vec;
        in_data_vec[0] = in_ptr[0];
        in_data_vec[1] = in_ptr[1];
        in_data_vec[2] = in_ptr[2];
        in_data_vec[3] = in_ptr[3];
        sum_array += vaddvq_s32(in_data_vec);
        in_ptr += 4;
      }
#elif ENABLE_ARM32
      int32x4_t accum = vmovq_n_s32(0);
      for (; j < plane; j += 16) {
        int32x4_t in_data_vec1;
        int32x4_t in_data_vec2;
        int32x4_t in_data_vec3;
        int32x4_t in_data_vec4;
        in_data_vec1[0] = in_ptr[0];
        in_data_vec1[1] = in_ptr[1];
        in_data_vec1[2] = in_ptr[2];
        in_data_vec1[3] = in_ptr[3];
        in_data_vec2[0] = in_ptr[4];
        in_data_vec2[1] = in_ptr[5];
        in_data_vec2[2] = in_ptr[6];
        in_data_vec2[3] = in_ptr[7];
        in_data_vec3[0] = in_ptr[8];
        in_data_vec3[1] = in_ptr[9];
        in_data_vec3[2] = in_ptr[10];
        in_data_vec3[3] = in_ptr[11];
        in_data_vec4[0] = in_ptr[12];
        in_data_vec4[1] = in_ptr[13];
        in_data_vec4[2] = in_ptr[14];
        in_data_vec4[3] = in_ptr[15];
        accum = vaddq_s32(accum, in_data_vec1);
        accum = vaddq_s32(accum, in_data_vec2);
        accum = vaddq_s32(accum, in_data_vec3);
        accum = vaddq_s32(accum, in_data_vec4);
        in_ptr += 16;
      }
      for (; j < plane; j += 8) {
        int32x4_t in_data_vec1;
        int32x4_t in_data_vec2;
        in_data_vec1[0] = in_ptr[0];
        in_data_vec1[1] = in_ptr[1];
        in_data_vec1[2] = in_ptr[2];
        in_data_vec1[3] = in_ptr[3];
        in_data_vec2[0] = in_ptr[4];
        in_data_vec2[1] = in_ptr[5];
        in_data_vec2[2] = in_ptr[6];
        in_data_vec2[3] = in_ptr[7];
        accum = vaddq_s32(accum, in_data_vec1);
        accum = vaddq_s32(accum, in_data_vec2);
        in_ptr += 8;
      }
      for (; j < plane; j += 4) {
        int32x4_t in_data_vec;
        in_data_vec[0] = in_ptr[0];
        in_data_vec[1] = in_ptr[1];
        in_data_vec[2] = in_ptr[2];
        in_data_vec[3] = in_ptr[3];
        accum = vaddq_s32(accum, in_data_vec);
        in_ptr += 4;
      }
      sum_array += accum[0];
      sum_array += accum[1];
      sum_array += accum[2];
      sum_array += accum[3];
#endif
      for (; j < plane; j++) {
        sum_array += in_ptr[0];
        in_ptr++;
      }
      int32_t mean =
        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(sum_array * (1 << (unsigned int)quant_arg.left_shift_),
                                                              quant_arg.multiplier_),
                            quant_arg.right_shift_);
      mean += bias;
      mean = MSMIN(mean, INT8_MAX);
      mean = MSMAX(mean, INT8_MIN);
      out_ptr[0] = mean;
      out_ptr++;
    }
  }
  return NNACL_OK;
}
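For orientation, the fixed-point path above can be checked against a plain floating-point reference. The sketch below is not part of the commit; the function name and parameters are hypothetical, and it assumes the quant args are prepared so that multiplier_/left_shift_/right_shift_ approximate in_scale / (out_scale * plane) and bias equals out_zp - in_zp * in_scale / out_scale, as the kernel code later in this commit computes.

/* Hedged float reference for one output element of ReduceMeanHW:
 * mean of `plane` int8 inputs, requantized into the output scale. */
#include <stdint.h>

static int8_t ReduceMeanRef(int32_t sum, int plane, float in_scale, int in_zp, float out_scale, int out_zp) {
  float mean_real = (sum - (float)plane * in_zp) * in_scale / (float)plane; /* mean in real units */
  float requant = mean_real / out_scale + (float)out_zp;                    /* back to the int8 domain */
  int32_t rounded = (int32_t)(requant >= 0.0f ? requant + 0.5f : requant - 0.5f);
  if (rounded > INT8_MAX) rounded = INT8_MAX;
  if (rounded < INT8_MIN) rounded = INT8_MIN;
  return (int8_t)rounded;
}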

int ReduceMeanHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}

int ReduceMeanWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}

int ReduceMeanNHW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}

int ReduceMeanNHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}

int ReduceMeanNWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}

int ReduceMeanHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}

int ReduceMeanNHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}

// Get x such that (x - zp_in) * scale_in = mean
// Assuming n axes are reduced, this handles the first n-1 reductions. One call per reduction.
int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,

@@ -21,6 +21,23 @@
extern "C" {
#endif

int ReduceMeanN(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanHW(int n, int plane, int count, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg,
                 int32_t bias);
int ReduceMeanHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNHW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);

int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                   int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceMeanLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,

@@ -929,13 +929,144 @@ void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int
  return;
}

void PackNHWCToNCHWInt8(const void *src, void *dst, int batch, int plane, int channel) {
  for (int n = 0; n < batch; n++) {
    for (int c = 0; c < channel; c++) {
      for (int hw = 0; hw < plane; hw++) {
        int nhwc_index = n * channel * plane + hw * channel + c;
        int nchw_index = n * channel * plane + c * plane + hw;
        ((int8_t *)dst)[nchw_index] = ((int8_t *)src)[nhwc_index];
void PackNHWCToNCHWInt8(const void *src, void *dst, int batches, int plane, int channel) {
  int hw8 = plane / C8NUM * C8NUM;
  int c8 = channel / C8NUM * C8NUM;
  int batch = plane * channel;
  for (int n = 0; n < batches; n++) {
    const int8_t *src_batch = (const int8_t *)src + n * batch;
    int8_t *dst_batch = (int8_t *)dst + n * batch;
    int hw = 0;
    for (; hw < hw8; hw += C8NUM) {
      int c = 0;
      for (; c < c8; c += C8NUM) {
        const int8_t *src_ptr = src_batch + hw * channel + c;
        int8_t *dst_ptr = dst_batch + c * plane + hw;
#ifdef ENABLE_ARM64
        size_t srcStride = channel * sizeof(int8_t);
        size_t dstStride = plane * sizeof(int8_t);
        asm volatile(
          "mov x10, %[src_ptr]\n"
          "mov x11, %[dst_ptr]\n"

          "ld1 {v0.8b}, [x10], %[srcStride]\n"
          "ld1 {v1.8b}, [x10], %[srcStride]\n"
          "ld1 {v2.8b}, [x10], %[srcStride]\n"
          "ld1 {v3.8b}, [x10], %[srcStride]\n"

          "trn1 v4.8b, v0.8b, v1.8b\n"
          "trn2 v5.8b, v0.8b, v1.8b\n"
          "trn1 v6.8b, v2.8b, v3.8b\n"
          "trn2 v7.8b, v2.8b, v3.8b\n"

          "ld1 {v0.8b}, [x10], %[srcStride]\n"
          "ld1 {v1.8b}, [x10], %[srcStride]\n"
          "ld1 {v2.8b}, [x10], %[srcStride]\n"
          "ld1 {v3.8b}, [x10], %[srcStride]\n"

          "trn1 v8.4h, v4.4h, v6.4h\n"
          "trn2 v9.4h, v4.4h, v6.4h\n"
          "trn1 v10.4h, v5.4h, v7.4h\n"
          "trn2 v11.4h, v5.4h, v7.4h\n"

          "trn1 v4.8b, v0.8b, v1.8b\n"
          "trn2 v5.8b, v0.8b, v1.8b\n"
          "trn1 v6.8b, v2.8b, v3.8b\n"
          "trn2 v7.8b, v2.8b, v3.8b\n"

          "trn1 v12.4h, v4.4h, v6.4h\n"
          "trn2 v13.4h, v4.4h, v6.4h\n"
          "trn1 v14.4h, v5.4h, v7.4h\n"
          "trn2 v15.4h, v5.4h, v7.4h\n"

          "trn1 v0.2s, v8.2s, v12.2s\n"
          "trn2 v4.2s, v8.2s, v12.2s\n"
          "trn1 v1.2s, v10.2s, v14.2s\n"
          "trn2 v5.2s, v10.2s, v14.2s\n"
          "trn1 v2.2s, v9.2s, v13.2s\n"
          "trn2 v6.2s, v9.2s, v13.2s\n"
          "trn1 v3.2s, v11.2s, v15.2s\n"
          "trn2 v7.2s, v11.2s, v15.2s\n"

          "st1 {v0.8b}, [x11], %[dstStride]\n"
          "st1 {v1.8b}, [x11], %[dstStride]\n"
          "st1 {v2.8b}, [x11], %[dstStride]\n"
          "st1 {v3.8b}, [x11], %[dstStride]\n"
          "st1 {v4.8b}, [x11], %[dstStride]\n"
          "st1 {v5.8b}, [x11], %[dstStride]\n"
          "st1 {v6.8b}, [x11], %[dstStride]\n"
          "st1 {v7.8b}, [x11], %[dstStride]\n"
          :
          :
          [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
          : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
            "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
            "v30", "v31");
#elif ENABLE_ARM32
        size_t srcStride = channel * sizeof(int8_t);
        size_t dstStride = plane * sizeof(int8_t);
        asm volatile(
          "mov r10, %[src_ptr]\n"
          "mov r12, %[dst_ptr]\n"

          "vld1.8 {d0}, [r10], %[srcStride]\n"
          "vld1.8 {d1}, [r10], %[srcStride]\n"
          "vld1.8 {d2}, [r10], %[srcStride]\n"
          "vld1.8 {d3}, [r10], %[srcStride]\n"
          "vld1.8 {d4}, [r10], %[srcStride]\n"
          "vld1.8 {d5}, [r10], %[srcStride]\n"
          "vld1.8 {d6}, [r10], %[srcStride]\n"
          "vld1.8 {d7}, [r10], %[srcStride]\n"

          "vtrn.8 d0, d1\n"
          "vtrn.8 d2, d3\n"
          "vtrn.8 d4, d5\n"
          "vtrn.8 d6, d7\n"

          "vtrn.16 d0, d2\n"
          "vtrn.16 d1, d3\n"
          "vtrn.16 d4, d6\n"
          "vtrn.16 d5, d7\n"

          "vtrn.32 d0, d4\n"
          "vtrn.32 d1, d5\n"
          "vtrn.32 d2, d6\n"
          "vtrn.32 d3, d7\n"

          "vst1.8 {d0}, [r12], %[dstStride]\n"
          "vst1.8 {d1}, [r12], %[dstStride]\n"
          "vst1.8 {d2}, [r12], %[dstStride]\n"
          "vst1.8 {d3}, [r12], %[dstStride]\n"
          "vst1.8 {d4}, [r12], %[dstStride]\n"
          "vst1.8 {d5}, [r12], %[dstStride]\n"
          "vst1.8 {d6}, [r12], %[dstStride]\n"
          "vst1.8 {d7}, [r12], %[dstStride]\n"
          :
          :
          [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
          : "r10", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
            "q15");
#else
        for (int tr = 0; tr < C8NUM; tr++) {
          for (int tc = 0; tc < C8NUM; tc++) {
            dst_ptr[tc * plane + tr] = src_ptr[tr * channel + tc];
          }
        }
#endif
      }
      for (; c < channel; c++) {
        const int8_t *src_ptr = src_batch + hw * channel + c;
        int8_t *dst_ptr = dst_batch + c * plane + hw;
        for (size_t i = 0; i < C8NUM; i++) {
          dst_ptr[i] = src_ptr[i * channel];
        }
      }
    }
    for (; hw < plane; hw++) {
      const int8_t *src_ptr = src_batch + hw * channel;
      int8_t *dst_ptr = dst_batch + hw;
      for (size_t i = 0; i < channel; i++) {
        dst_ptr[i * plane] = src_ptr[i];
      }
    }
  }
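A quick way to sanity-check the tiled/NEON paths above is to compare them with the plain index remap dst[n][c][hw] = src[n][hw][c]. The harness below is only a sketch, assumes nothing beyond the PackNHWCToNCHWInt8 signature shown in this hunk, and deliberately uses sizes that are not multiples of 8 so the scalar tail paths are exercised too.

/* Hedged check harness for PackNHWCToNCHWInt8 (illustrative only). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const int batches = 2, plane = 13, channel = 11;
  int8_t *src = (int8_t *)malloc(batches * plane * channel);
  int8_t *dst = (int8_t *)malloc(batches * plane * channel);
  for (int i = 0; i < batches * plane * channel; ++i) src[i] = (int8_t)(i * 7);
  PackNHWCToNCHWInt8(src, dst, batches, plane, channel);
  int mismatch = 0;
  for (int n = 0; n < batches; ++n)
    for (int hw = 0; hw < plane; ++hw)
      for (int c = 0; c < channel; ++c)
        mismatch += dst[(n * channel + c) * plane + hw] != src[(n * plane + hw) * channel + c];
  printf("mismatches: %d\n", mismatch); /* expected: 0 */
  free(src);
  free(dst);
  return 0;
}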

@@ -108,7 +108,6 @@ int ReduceBaseCPUKernel::Init() {
  }

  mode_ = reduce_param->mode_;
  memcpy(axes_, reduce_param->axes_, sizeof(reduce_param->axes_));
  reduce_to_end_ = reduce_param->reduce_to_end_;

  auto ret = CheckInputsOutputs();

@@ -201,8 +201,8 @@ kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &i
  return nullptr;
}

void FreeMemoryFp16(std::vector<kernel::LiteKernel *> group_convs, std::vector<lite::Tensor *> new_inputs,
                    std::vector<lite::Tensor *> new_outputs) {
void FreeMemoryFp16(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
                    const std::vector<lite::Tensor *> &new_outputs) {
  for (auto sub_conv : group_convs) {
    if (sub_conv != nullptr) {
      delete sub_conv;

@@ -220,49 +220,131 @@ void FreeMemoryFp16(std::vector<kernel::LiteKernel *> group_convs, std::vector<l
  }
}

lite::Tensor *CreateInputTensor(TypeId data_type, std::vector<int> in_shape, bool infered_flag) {
  auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
  if (in_tensor == nullptr) {
    MS_LOG(ERROR) << "new in_tensor failed.";
    return nullptr;
  }
  if (infered_flag) {
    auto ret = in_tensor->MallocData();
    if (ret != RET_OK) {
      delete in_tensor;
      MS_LOG(ERROR) << "in tensor malloc failed.";
      return nullptr;
    }
  }
  return in_tensor;
}

lite::Tensor *CreateFilterTensor(TypeId data_type, std::vector<int> filter_shape,
                                 const std::vector<lite::Tensor *> &inputs, int copy_length, int index) {
  auto filter_tensor =
    new (std::nothrow) lite::Tensor(data_type, filter_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
  if (filter_tensor == nullptr) {
    MS_LOG(ERROR) << "new filter_tensor failed.";
    return nullptr;
  }
  auto ret = filter_tensor->MallocData();
  if (ret != RET_OK) {
    delete filter_tensor;
    MS_LOG(ERROR) << "filter_tensor malloc failed.";
    return nullptr;
  }
  if (data_type == kNumberTypeFloat16) {
    auto *origin_weight = reinterpret_cast<float16_t *>(inputs.at(kWeightIndex)->data_c());
    memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float16_t));
  } else {
    MS_ASSERT(data_type == kNumberTypeFloat32);
    auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
    memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float));
  }
  return filter_tensor;
}

lite::Tensor *CreateBiasTensor(TypeId data_type, std::vector<int> bias_shape, const std::vector<lite::Tensor *> &inputs,
                               int new_out_channel, int index) {
  auto *origin_bias = inputs.at(kBiasIndex)->data_c();
  auto bias_tensor =
    new (std::nothrow) lite::Tensor(data_type, bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
  if (bias_tensor == nullptr) {
    MS_LOG(ERROR) << "new bias_tensor failed.";
    return nullptr;
  }
  auto ret = bias_tensor->MallocData();
  if (ret != RET_OK) {
    delete bias_tensor;
    MS_LOG(ERROR) << "bias_tensor malloc failed.";
    return nullptr;
  }
  if (data_type == kNumberTypeFloat16) {
    auto bias_data = reinterpret_cast<float16_t *>(origin_bias);
    memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float16_t));
  } else {
    MS_ASSERT(data_type == kNumberTypeFloat32);
    auto bias_data = reinterpret_cast<float *>(origin_bias);
    memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float));
  }
  return bias_tensor;
}

lite::Tensor *CreateOutputTensor(std::vector<int> out_shape, const std::vector<lite::Tensor *> &outputs,
                                 bool infered_flag, int index) {
  auto out_tensor = new (std::nothrow) lite::Tensor();
  if (out_tensor == nullptr) {
    MS_LOG(ERROR) << "new tmp_out_tensor failed.";
    return nullptr;
  }
  out_tensor->set_data_type(outputs.at(index)->data_type());
  out_tensor->SetFormat(outputs.at(index)->GetFormat());
  if (infered_flag) {
    out_tensor->set_shape(out_shape);
    auto ret = out_tensor->MallocData();
    if (ret != RET_OK) {
      delete out_tensor;
      MS_LOG(ERROR) << "out_tensor malloc data failed.";
      return nullptr;
    }
  }
  return out_tensor;
}

kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
                                                  const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
                                                  int group) {
  std::vector<kernel::LiteKernel *> group_convs;
  std::vector<int> in_shape;
  std::vector<int> filter_shape;
  std::vector<int> bias_shape;
  std::vector<int> out_shape;

  int out_unit;
  bool has_bias = inputs.size() == 3;
  bool use_winograd = false;
  bool infered_flag = (primitive != nullptr && primitive->GetInferFlag());
  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
  int out_channel = inputs.at(kWeightIndex)->Batch();

  // update new shape info for each sub kernel
  int new_in_channel = inputs.at(kWeightIndex)->Channel();
  int new_out_channel = 0;
  if (group == 0) {
    MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
    return nullptr;
  } else {
    new_out_channel = out_channel / group;
    new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
  }
  int kernel_h = conv_param->kernel_h_;
  int kernel_w = conv_param->kernel_w_;
  int input_num = inputs.size();
  int output_num = outputs.size();
  bool has_bias = input_num == 3;
  bool use_winograd = false;
  int out_unit;
  bool infered_flag = (primitive != nullptr && primitive->GetInferFlag());

  std::vector<int> in_shape;
  std::vector<int> out_shape;
  if (infered_flag) {
    int batch = inputs.front()->Batch();
    int in_h = inputs.front()->Height();
    int in_w = inputs.front()->Width();
    conv_param->input_channel_ = new_in_channel;
    conv_param->output_channel_ = new_out_channel;
    CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param);
    in_shape = {batch, in_h, in_w, new_in_channel};
    in_shape = {batch, inputs.front()->Height(), inputs.front()->Width(), new_in_channel};
    out_shape = {batch, conv_param->output_h_, conv_param->output_w_, new_out_channel};
  }
  std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
  std::vector<int> bias_shape = {new_out_channel};

  filter_shape = {new_out_channel, kernel_h, kernel_w, new_in_channel};
  bias_shape = {new_out_channel};

  // new group conv op
  std::vector<kernel::LiteKernel *> group_convs;
  // create tensors for every sub conv kernel
  for (int i = 0; i < group; ++i) {
    std::vector<lite::Tensor *> new_inputs;
    std::vector<lite::Tensor *> new_outputs;

@@ -272,116 +354,56 @@ kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor
      MS_LOG(ERROR) << "Get new conv parameter failed.";
      return nullptr;
    }
    // get new input for each group
    auto in_tensor =
      new (std::nothrow) lite::Tensor(inputs.front()->data_type(), in_shape, Format_NHWC, lite::Tensor::Category::VAR);

    // create new input for each group
    auto in_tensor = CreateInputTensor(inputs.front()->data_type(), in_shape, infered_flag);
    if (in_tensor == nullptr) {
      delete new_conv_parameter;
      FreeMemoryFp16(group_convs, new_inputs, new_outputs);
      MS_LOG(ERROR) << "new in_tensor failed.";
      MS_LOG(ERROR) << "create input tensor failed.";
      return nullptr;
    }
    if (infered_flag) {
      auto ret = in_tensor->MallocData();
      if (ret != RET_OK) {
        delete new_conv_parameter;
        delete in_tensor;
        FreeMemoryFp16(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "in tensor malloc failed.";
        return nullptr;
      }
    }
    new_inputs.emplace_back(in_tensor);

    // new weight
    auto filter_tensor = new (std::nothrow) lite::Tensor(inputs.at(kWeightIndex)->data_type(), filter_shape,
                                                         Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
    // create new weight
    int copy_length = conv_param->kernel_h_ * conv_param->kernel_w_ * new_in_channel * new_out_channel;
    auto filter_tensor = CreateFilterTensor(inputs.at(kWeightIndex)->data_type(), filter_shape, inputs, copy_length, i);
    if (filter_tensor == nullptr) {
      delete new_conv_parameter;
      FreeMemoryFp16(group_convs, new_inputs, new_outputs);
      MS_LOG(ERROR) << "new filter_tensor failed.";
      MS_LOG(ERROR) << "create filter tensor failed.";
      return nullptr;
    }
    auto ret = filter_tensor->MallocData();
    if (ret != RET_OK) {
      delete new_conv_parameter;
      delete filter_tensor;
      FreeMemoryFp16(group_convs, new_inputs, new_outputs);
      MS_LOG(ERROR) << "filter_tensor malloc failed.";
      return nullptr;
    }
    int copy_length = kernel_h * kernel_w * new_in_channel * new_out_channel;
    auto filter_data_type = inputs.at(kWeightIndex)->data_type();
    if (filter_data_type == kNumberTypeFloat16) {
      auto *origin_weight = reinterpret_cast<float16_t *>(inputs.at(kWeightIndex)->data_c());
      memcpy(filter_tensor->data_c(), origin_weight + i * copy_length, copy_length * sizeof(float16_t));
    } else {
      MS_ASSERT(filter_data_type == kNumberTypeFloat32);
      auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
      memcpy(filter_tensor->data_c(), origin_weight + i * copy_length, copy_length * sizeof(float));
    }
    new_inputs.emplace_back(filter_tensor);

    // if has bias, set new bias
    // if has bias, create new bias
    if (has_bias) {
      auto *origin_bias = inputs.at(kBiasIndex)->data_c();
      auto bias_data_type = inputs.at(kBiasIndex)->data_type();
      auto bias_tensor = new (std::nothrow)
        lite::Tensor(inputs.at(kBiasIndex)->data_type(), bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
      auto bias_tensor = CreateBiasTensor(inputs.at(kBiasIndex)->data_type(), bias_shape, inputs, new_out_channel, i);
      if (bias_tensor == nullptr) {
        delete new_conv_parameter;
        FreeMemoryFp16(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "new bias_tensor failed.";
        MS_LOG(ERROR) << "create bias_tensor failed.";
        return nullptr;
      }
      ret = bias_tensor->MallocData();
      if (ret != RET_OK) {
        delete new_conv_parameter;
        delete bias_tensor;
        FreeMemoryFp16(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "bias_tensor malloc failed.";
        return nullptr;
      }
      if (bias_data_type == kNumberTypeFloat16) {
        auto bias_data = reinterpret_cast<float16_t *>(origin_bias);
        memcpy(bias_tensor->data_c(), bias_data + i * new_out_channel, new_out_channel * sizeof(float16_t));
      } else {
        MS_ASSERT(bias_data_type == kNumberTypeFloat32);
        auto bias_data = reinterpret_cast<float *>(origin_bias);
        memcpy(bias_tensor->data_c(), bias_data + i * new_out_channel, new_out_channel * sizeof(float));
      }
      new_inputs.emplace_back(bias_tensor);
    }

    // set new output tensor
    for (int j = 0; j < output_num; ++j) {
      auto tmp_out_tensor = new (std::nothrow) lite::Tensor();
      if (tmp_out_tensor == nullptr) {
    // create new output tensors
    for (size_t j = 0; j < outputs.size(); ++j) {
      auto out_tensor = CreateOutputTensor(out_shape, outputs, infered_flag, j);
      if (out_tensor == nullptr) {
        delete new_conv_parameter;
        FreeMemoryFp16(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "new tmp_out_tensor failed.";
        MS_LOG(ERROR) << "new out_tensor failed.";
        return nullptr;
      }
      tmp_out_tensor->set_data_type(outputs.at(j)->data_type());
      tmp_out_tensor->SetFormat(outputs.at(j)->GetFormat());
      if (infered_flag) {
        tmp_out_tensor->set_shape(out_shape);
        ret = tmp_out_tensor->MallocData();
        if (ret != RET_OK) {
          delete new_conv_parameter;
          delete tmp_out_tensor;
          FreeMemoryFp16(group_convs, new_inputs, new_outputs);
          MS_LOG(ERROR) << "tmp_out_tensor malloc data failed.";
          return nullptr;
        }
      }
      new_outputs.emplace_back(tmp_out_tensor);
      new_outputs.emplace_back(out_tensor);
    }

    group_convs.emplace_back(CpuConvFp16KernelSelect(new_inputs, new_outputs,
                                                     reinterpret_cast<OpParameter *>(new_conv_parameter), ctx,
                                                     primitive, use_winograd, out_unit));
  }

  return new (std::nothrow)
    GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, group);
}

@@ -169,8 +169,8 @@ ConvParameter *CreateNewConvParameter(ConvParameter *parameter) {
  return conv_parameter;
}

void FreeMemoryFp32(std::vector<kernel::LiteKernel *> group_convs, std::vector<lite::Tensor *> new_inputs,
                    std::vector<lite::Tensor *> new_outputs) {
void FreeMemoryFp32(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
                    const std::vector<lite::Tensor *> &new_outputs) {
  for (auto sub_conv : group_convs) {
    if (sub_conv != nullptr) {
      delete sub_conv;

@@ -188,6 +188,87 @@ void FreeMemoryFp32(std::vector<kernel::LiteKernel *> group_convs, std::vector<l
  }
}

lite::Tensor *CreateInputTensorFp32(TypeId data_type, std::vector<int> in_shape, bool infered_flag) {
  auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
  if (in_tensor == nullptr) {
    MS_LOG(ERROR) << "new in_tensor failed.";
    return nullptr;
  }
  if (infered_flag) {
    auto ret = in_tensor->MallocData();
    if (ret != RET_OK) {
      delete in_tensor;
      MS_LOG(ERROR) << "in tensor malloc failed.";
      return nullptr;
    }
  }
  return in_tensor;
}

lite::Tensor *CreateFilterTensorFp32(TypeId data_type, std::vector<int> filter_shape,
                                     const std::vector<lite::Tensor *> &inputs, int copy_length, int index) {
  auto filter_tensor =
    new (std::nothrow) lite::Tensor(data_type, filter_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
  if (filter_tensor == nullptr) {
    MS_LOG(ERROR) << "new filter_tensor failed.";
    return nullptr;
  }
  auto ret = filter_tensor->MallocData();
  if (ret != RET_OK) {
    delete filter_tensor;
    MS_LOG(ERROR) << "filter_tensor malloc failed.";
    return nullptr;
  }

  MS_ASSERT(data_type == kNumberTypeFloat32);
  auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
  memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float));
  return filter_tensor;
}

lite::Tensor *CreateBiasTensorFp32(TypeId data_type, std::vector<int> bias_shape,
                                   const std::vector<lite::Tensor *> &inputs, int new_out_channel, int index) {
  auto *origin_bias = inputs.at(kBiasIndex)->data_c();
  auto bias_tensor =
    new (std::nothrow) lite::Tensor(data_type, bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
  if (bias_tensor == nullptr) {
    MS_LOG(ERROR) << "new bias_tensor failed.";
    return nullptr;
  }
  auto ret = bias_tensor->MallocData();
  if (ret != RET_OK) {
    delete bias_tensor;
    MS_LOG(ERROR) << "bias_tensor malloc failed.";
    return nullptr;
  }
  MS_ASSERT(data_type == kNumberTypeFloat32);
  auto bias_data = reinterpret_cast<float *>(origin_bias);
  memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float));

  return bias_tensor;
}

lite::Tensor *CreateOutputTensorFp32(std::vector<int> out_shape, const std::vector<lite::Tensor *> &outputs,
                                     bool infered_flag, int index) {
  auto out_tensor = new (std::nothrow) lite::Tensor();
  if (out_tensor == nullptr) {
    MS_LOG(ERROR) << "new tmp_out_tensor failed.";
    return nullptr;
  }
  out_tensor->set_data_type(outputs.at(index)->data_type());
  out_tensor->SetFormat(outputs.at(index)->GetFormat());
  if (infered_flag) {
    out_tensor->set_shape(out_shape);
    auto ret = out_tensor->MallocData();
    if (ret != RET_OK) {
      delete out_tensor;
      MS_LOG(ERROR) << "out_tensor malloc data failed.";
      return nullptr;
    }
  }
  return out_tensor;
}

kernel::LiteKernel *CpuConvFp32KernelSelect(const std::vector<lite::Tensor *> &inputs,
                                            const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
                                            const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,

@@ -208,31 +289,22 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
                                                  const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
                                                  int group) {
  std::vector<kernel::LiteKernel *> group_convs;
  std::vector<int> in_shape;
  std::vector<int> filter_shape;
  std::vector<int> bias_shape;
  std::vector<int> out_shape;

  int out_unit;
  bool has_bias = inputs.size() == 3;
  bool use_winograd = false;
  bool infered_flag = primitive != nullptr && primitive->GetInferFlag();
  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
  int out_channel = inputs.at(kWeightIndex)->Batch();

  std::vector<int> in_shape;
  std::vector<int> out_shape;
  int new_in_channel = inputs.at(kWeightIndex)->Channel();
  int new_out_channel = 0;
  if (group == 0) {
    MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
    return nullptr;
  } else {
    new_out_channel = out_channel / group;
    new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
  }
  int kernel_h = conv_param->kernel_h_;
  int kernel_w = conv_param->kernel_w_;
  int input_num = inputs.size();
  int output_num = outputs.size();
  bool has_bias = input_num == 3;
  bool use_winograd = false;
  int out_unit;
  bool infered_flag = primitive != nullptr && primitive->GetInferFlag();

  if (infered_flag) {
    int batch = inputs.front()->Batch();
    int in_h = inputs.front()->Height();

@@ -243,11 +315,11 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
    in_shape = {batch, in_h, in_w, new_in_channel};
    out_shape = {batch, conv_param->output_h_, conv_param->output_w_, new_out_channel};
  }
  std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
  std::vector<int> bias_shape = {new_out_channel};

  filter_shape = {new_out_channel, kernel_h, kernel_w, new_in_channel};
  bias_shape = {new_out_channel};
  auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());

  // create sub kernels
  std::vector<kernel::LiteKernel *> group_convs;
  for (int i = 0; i < group; ++i) {
    std::vector<lite::Tensor *> new_inputs;
    std::vector<lite::Tensor *> new_outputs;

@@ -257,100 +329,58 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
      MS_LOG(ERROR) << "Get new conv parameter failed.";
      return nullptr;
    }
    // get new input for each group
    auto in_tensor =
      new (std::nothrow) lite::Tensor(inputs.front()->data_type(), in_shape, Format_NHWC, lite::Tensor::Category::VAR);

    // create new input for each group
    auto in_tensor = CreateInputTensorFp32(inputs.front()->data_type(), in_shape, infered_flag);
    if (in_tensor == nullptr) {
      delete new_conv_parameter;
      FreeMemoryFp32(group_convs, new_inputs, new_outputs);
      MS_LOG(ERROR) << "new in_tensor failed.";
      MS_LOG(ERROR) << "create input tensor failed.";
      return nullptr;
    }
    if (infered_flag) {
      auto ret = in_tensor->MallocData();
      if (ret != RET_OK) {
        delete new_conv_parameter;
        delete in_tensor;
        FreeMemoryFp32(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "in tensor malloc failed.";
        return nullptr;
      }
    }
    new_inputs.emplace_back(in_tensor);

    // new weight
    auto filter_tensor = new (std::nothrow) lite::Tensor(inputs.at(kWeightIndex)->data_type(), filter_shape,
                                                         Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
    // create new weight
    int copy_length = conv_param->kernel_h_ * conv_param->kernel_w_ * new_in_channel * new_out_channel;
    auto filter_tensor =
      CreateFilterTensorFp32(inputs.at(kWeightIndex)->data_type(), filter_shape, inputs, copy_length, i);
    if (filter_tensor == nullptr) {
      delete new_conv_parameter;
      FreeMemoryFp32(group_convs, new_inputs, new_outputs);
      MS_LOG(ERROR) << "new filter_tensor failed.";
      MS_LOG(ERROR) << "create filter tensor failed.";
      return nullptr;
    }
    auto ret = filter_tensor->MallocData();
    if (ret != RET_OK) {
      delete new_conv_parameter;
      delete filter_tensor;
      FreeMemoryFp32(group_convs, new_inputs, new_outputs);
      MS_LOG(ERROR) << "filter_tensor malloc failed.";
      return nullptr;
    }
    int copy_length = kernel_h * kernel_w * new_in_channel * new_out_channel;
    memcpy(filter_tensor->data_c(), origin_weight + i * copy_length, copy_length * sizeof(float));
    new_inputs.emplace_back(filter_tensor);

    // if has bias, set new bias
    // if has bias, create new bias
    if (has_bias) {
      auto *origin_bias = reinterpret_cast<float *>(inputs.at(kBiasIndex)->data_c());
      auto bias_tensor = new (std::nothrow)
        lite::Tensor(inputs.at(kBiasIndex)->data_type(), bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
      auto bias_tensor =
        CreateBiasTensorFp32(inputs.at(kBiasIndex)->data_type(), bias_shape, inputs, new_out_channel, i);
      if (bias_tensor == nullptr) {
        delete new_conv_parameter;
        FreeMemoryFp32(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "new bias_tensor failed.";
        MS_LOG(ERROR) << "create bias_tensor failed.";
        return nullptr;
      }
      ret = bias_tensor->MallocData();
      if (ret != RET_OK) {
        delete new_conv_parameter;
        delete bias_tensor;
        FreeMemoryFp32(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "bias_tensor malloc failed.";
        return nullptr;
      }
      memcpy(bias_tensor->data_c(), origin_bias + i * new_out_channel, new_out_channel * sizeof(float));
      new_inputs.emplace_back(bias_tensor);
    }

    // set new output tensor
    for (int j = 0; j < output_num; ++j) {
      auto tmp_out_tensor = new (std::nothrow) lite::Tensor();
      if (tmp_out_tensor == nullptr) {
    // create new output tensor
    for (size_t j = 0; j < outputs.size(); ++j) {
      auto out_tensor = CreateOutputTensorFp32(out_shape, outputs, infered_flag, j);
      if (out_tensor == nullptr) {
        delete new_conv_parameter;
        FreeMemoryFp32(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "new tmp_out_tensor failed.";
        MS_LOG(ERROR) << "new out_tensor failed.";
        return nullptr;
      }
      tmp_out_tensor->set_data_type(outputs.at(j)->data_type());
      tmp_out_tensor->SetFormat(outputs.at(j)->GetFormat());
      if (infered_flag) {
        tmp_out_tensor->set_shape(out_shape);
        ret = tmp_out_tensor->MallocData();
        if (ret != RET_OK) {
          delete new_conv_parameter;
          delete tmp_out_tensor;
          FreeMemoryFp32(group_convs, new_inputs, new_outputs);
          MS_LOG(ERROR) << "tmp_out_tensor malloc data failed.";
          return nullptr;
        }
      }
      new_outputs.emplace_back(tmp_out_tensor);
      new_outputs.emplace_back(out_tensor);
    }

    group_convs.emplace_back(CpuConvFp32KernelSelect(new_inputs, new_outputs,
                                                     reinterpret_cast<OpParameter *>(new_conv_parameter), ctx,
                                                     primitive, use_winograd, out_unit));
  }

  return new (std::nothrow)
    GroupConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, group);
}

@@ -19,6 +19,7 @@
#include "src/kernel_registry.h"
#include "src/runtime/runtime_api.h"
#include "nnacl/quantization/quantize.h"
#include "nnacl/pack.h"
#include "include/errorcode.h"

using mindspore::lite::KernelRegistrar;

@@ -38,11 +39,77 @@ using mindspore::schema::PrimitiveType_Mean;
using mindspore::schema::PrimitiveType_Reduce;

namespace mindspore::kernel {
void ReduceInt8CPUKernel::OneAxis() {
  auto axis_info = axes_[0];
  if (axis_info == 0) {
    pattern_ = kernel::N;
  } else if (axis_info == 1) {
    pattern_ = kernel::H;
  } else if (axis_info == 2) {
    pattern_ = kernel::W;
  } else {
    pattern_ = kernel::C;
  }
}

void ReduceInt8CPUKernel::TwoAxes() {
  auto axis_info1 = axes_[0];
  auto axis_info2 = axes_[1];
  auto axis_sum = axis_info1 + axis_info2;
  if (axis_sum == 1) {
    pattern_ = kernel::NH;
  } else if (axis_sum == 2) {
    pattern_ = kernel::NW;
  } else if (axis_sum == 3) {
    if (axis_info1 == 0) {
      pattern_ = kernel::NC;
    } else {
      pattern_ = kernel::HW;
    }
  } else if (axis_sum == 4) {
    pattern_ = kernel::HC;
  } else {
    MS_ASSERT(axis_sum == 5);
    pattern_ = kernel::WC;
  }
}

void ReduceInt8CPUKernel::ThreeAxes() {
  auto axis_info1 = axes_[0];
  auto axis_info2 = axes_[1];
  auto axis_info3 = axes_[2];
  auto axis_sum = axis_info1 + axis_info2 + axis_info3;
  if (axis_sum == 3) {
    pattern_ = kernel::NHW;
  } else if (axis_sum == 4) {
    pattern_ = kernel::NHC;
  } else if (axis_sum == 5) {
    pattern_ = kernel::NWC;
  } else {
    MS_ASSERT(axis_sum == 6);
    pattern_ = kernel::HWC;
  }
}

void ReduceInt8CPUKernel::Match4DReducePattern() {
  if (num_axes_ == 1) {
    OneAxis();
  } else if (num_axes_ == 2) {
    TwoAxes();
  } else if (num_axes_ == 3) {
    ThreeAxes();
  } else {
    MS_ASSERT(num_axes_ == 4);
    pattern_ = kernel::NHWC;
  }
}
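A hedged illustration of the dispatch implemented by the three helpers above (axes are NHWC indices 0..3; the axis sums are unambiguous only because the axes are distinct values in [0, 4)):

/* axes {2}       -> W    (single axis 2)
 * axes {0, 1}    -> NH   (sum == 1)
 * axes {1, 2}    -> HW   (sum == 3, first axis != 0)
 * axes {0, 3}    -> NC   (sum == 3, first axis == 0)
 * axes {1, 3}    -> HC   (sum == 4)
 * axes {1, 2, 3} -> HWC  (sum == 6)
 */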

int ReduceInt8CPUKernel::Init() {
  auto ret = ReduceBaseCPUKernel::Init();
  if (ret != RET_OK) {
    return ret;
  }
  Match4DReducePattern();
  if (!this->in_tensors_[0]->shape().empty()) {
    this->valid_shape_ = true;
    ret = CalculateQuantArgs();

@@ -96,6 +163,64 @@ int ReduceInt8CPUKernel::Init() {
  return ReSize();
}

void ReduceInt8CPUKernel::ReduceMean4DCalQuantParam() {
  int reduce_num = 1;
  auto in_shape = in_tensors_.front()->shape();
  switch (pattern_) {
    case N:
      reduce_num = in_shape[0];
      break;
    case H:
      reduce_num = in_shape[1];
      break;
    case W:
      reduce_num = in_shape[2];
      break;
    case C:
      reduce_num = in_shape[3];
      break;
    case NH:
      reduce_num = in_shape[0] * in_shape[1];
      break;
    case NW:
      reduce_num = in_shape[0] * in_shape[2];
      break;
    case NC:
      reduce_num = in_shape[0] * in_shape[3];
      break;
    case HW:
      reduce_num = in_shape[1] * in_shape[2];
      break;
    case HC:
      reduce_num = in_shape[1] * in_shape[3];
      break;
    case WC:
      reduce_num = in_shape[2] * in_shape[3];
      break;
    case NHW:
      reduce_num = in_shape[0] * in_shape[1] * in_shape[2];
      break;
    case NHC:
      reduce_num = in_shape[0] * in_shape[1] * in_shape[3];
      break;
    case NWC:
      reduce_num = in_shape[0] * in_shape[2] * in_shape[3];
      break;
    case HWC:
      reduce_num = in_shape[1] * in_shape[2] * in_shape[3];
      break;
    case NHWC:
      reduce_num = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3];
      break;
  }
  bias_ = quant_arg_.out_zp_ - quant_arg_.in_zp_ * quant_arg_.in_scale_ / quant_arg_.out_scale_;
  int shift;
  double reciprocal = quant_arg_.in_scale_ / (quant_arg_.out_scale_ * reduce_num);
  QuantizeMultiplierSmallerThanOne(reciprocal, &reduce_mean_quant_param_.multiplier_, &shift);
  reduce_mean_quant_param_.left_shift_ = shift < 0 ? -shift : 0;
  reduce_mean_quant_param_.right_shift_ = shift > 0 ? shift : 0;
}
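QuantizeMultiplierSmallerThanOne is not shown in this diff; the sketch below illustrates the usual gemmlowp/TFLite-style decomposition it appears to perform, so that real ≈ quantized * 2^-31 * 2^-shift and the caller can split the shift into left_shift_/right_shift_ exactly as the code above does. This is an assumption about its contract, not the library's source.

/* Hedged sketch of a Q31 multiplier/shift decomposition (illustrative only). */
#include <math.h>
#include <stdint.h>

static void DecomposeMultiplier(double real_multiplier, int32_t *quantized, int *shift) {
  int exponent = 0;
  const double q = frexp(real_multiplier, &exponent); /* real = q * 2^exponent, q in [0.5, 1) */
  int64_t q_fixed = (int64_t)round(q * (1ll << 31));
  if (q_fixed == (1ll << 31)) { /* rounding pushed q up to 1.0 */
    q_fixed /= 2;
    ++exponent;
  }
  *quantized = (int32_t)q_fixed;
  *shift = -exponent; /* real ≈ quantized * 2^-31 * 2^-shift; positive means right shift */
}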

int ReduceInt8CPUKernel::CalculateQuantArgs() {
  lite::Tensor *input = in_tensors_.at(0);
  lite::Tensor *output = out_tensors_.at(0);

@@ -117,18 +242,24 @@ int ReduceInt8CPUKernel::CalculateQuantArgs() {
  // (quant_out - zp_out) * scale_out = sum((quant_in - zp) * scale_in) * (1/num) for each axis in axes
  // quant_out = sum(quant_in - zp) * (scale_in/scale_out) * (1/num)
  if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean)) {
    for (auto i = 0; i < num_axes_; i++) {
      auto axis = axes_[i];
      double reciprocal = 1.0 / in_tensors_.at(0)->shape()[axis];
      QuantMulArg *qm = new (std::nothrow) QuantMulArg;
      if (qm == nullptr) {
        MS_LOG(ERROR) << "Reduce new QuantMulArg failed.";
        return RET_NULL_PTR;
    if (input->shape().size() == 4 && pattern_ == kernel::HW) {
      // special case, can use the pattern implementation
      ReduceMean4DCalQuantParam();
      pattern_impl_ = true;
    } else {
      for (auto i = 0; i < num_axes_; i++) {
        auto axis = axes_[i];
        double reciprocal = 1.0 / in_tensors_.at(0)->shape()[axis];
        QuantMulArg *qm = new (std::nothrow) QuantMulArg;
        if (qm == nullptr) {
          MS_LOG(ERROR) << "Reduce new QuantMulArg failed.";
          return RET_NULL_PTR;
        }
        QuantizeMultiplierSmallerThanOne(reciprocal, &qm->multiplier_, &shift);
        qm->left_shift_ = shift < 0 ? -shift : 0;
        qm->right_shift_ = shift > 0 ? shift : 0;
        mean_multipliers_.push_back(qm);
      }
      QuantizeMultiplierSmallerThanOne(reciprocal, &qm->multiplier_, &shift);
      qm->left_shift_ = shift < 0 ? -shift : 0;
      qm->right_shift_ = shift > 0 ? shift : 0;
      mean_multipliers_.push_back(qm);
    }
  }

@@ -230,6 +361,16 @@ int ReduceInt8Impl(void *cdata, int task_id) {
  return RET_OK;
}

int ReduceMeanPatternInt8Impl(void *cdata, int task_id) {
  auto reduce = reinterpret_cast<ReduceInt8CPUKernel *>(cdata);
  auto error_code = reduce->Reduce4DExecute(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Reduce Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

void ReduceInt8CPUKernel::GetQuantArgs(size_t i) {
  MS_ASSERT(i < static_cast<size_t>(num_axis_));
  if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean)) {

@@ -250,6 +391,78 @@ void ReduceInt8CPUKernel::GetQuantArgs(size_t i) {
  }
}

int ReduceInt8CPUKernel::Reduce4DExecute(int task_id) {
  auto input = in_tensors_.at(0);
  auto in_data = reinterpret_cast<int8_t *>(input->data_c());
  auto in_shape = input->shape();
  MS_ASSERT(in_shape.size() == 4);
  int n = in_shape.at(0);
  int h = in_shape.at(1);
  int w = in_shape.at(2);
  int c = in_shape.at(3);
  auto output_data = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c());
  switch (pattern_) {
    case N:
      return ReduceMeanN(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case H:
      return ReduceMeanH(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case W:
      return ReduceMeanW(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case C:
      return ReduceMeanC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case NH:
      return ReduceMeanNH(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case NW:
      return ReduceMeanNW(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case NC:
      return ReduceMeanNC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case HW: {
      // data has been converted into NCHW format for efficiency
      int num = UP_DIV(c, ctx_->thread_num_);
      int count = c - task_id * num;
      count = count > num ? num : count;
      int plane = h * w;
      return ReduceMeanHW(n, plane, count, c, nchw_in_data_ + task_id * num * plane, output_data + task_id * num,
                          reduce_mean_quant_param_, bias_);
    }
    case HC:
      return ReduceMeanHC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case WC:
      return ReduceMeanWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case NHW:
      return ReduceMeanNHW(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case NHC:
      return ReduceMeanNHC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case NWC:
      return ReduceMeanNWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case HWC:
      return ReduceMeanHWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case NHWC:
      return ReduceMeanNHWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
  }
  return RET_OK;
}
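The HW branch above parallelizes over channels of the packed NCHW buffer rather than over batches. A hedged illustration of the slicing arithmetic, with made-up sizes:

/* With c = 10 channels and thread_num = 4, UP_DIV gives num = 3, so the four
 * tasks handle 3, 3, 3 and 1 channels and read disjoint plane-sized slices. */
#include <stdio.h>

int main(void) {
  const int c = 10, thread_num = 4, plane = 32 * 32;
  const int num = (c + thread_num - 1) / thread_num; /* UP_DIV(c, thread_num) */
  for (int task_id = 0; task_id < thread_num; ++task_id) {
    int count = c - task_id * num;
    count = count > num ? num : count;
    printf("task %d: %d channels, in offset %d, out offset %d\n", task_id, count, task_id * num * plane,
           task_id * num);
  }
  return 0;
}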

int ReduceInt8CPUKernel::Fast4DReduceMeanHWImpl() {
  auto input = in_tensors_.at(0);
  auto input_data = reinterpret_cast<int8_t *>(input->data_c());
  nchw_in_data_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(input->ElementsNum()));
  if (nchw_in_data_ == nullptr) {
    MS_LOG(ERROR) << "malloc nchw_in_data_ failed.";
    return RET_ERROR;
  }
  PackNHWCToNCHWInt8(reinterpret_cast<void *>(input_data), reinterpret_cast<void *>(nchw_in_data_), input->Batch(),
                     input->Height() * input->Width(), input->Channel());
  auto ret = ParallelLaunch(this->context_->thread_pool_, ReduceMeanPatternInt8Impl, this, context_->thread_num_);
  if (ret != RET_OK) {
    ctx_->allocator->Free(nchw_in_data_);
    MS_LOG(ERROR) << "Reduce run error, error_code[" << ret << "]";
    return RET_ERROR;
  }
  ctx_->allocator->Free(nchw_in_data_);
  return RET_OK;
}

int ReduceInt8CPUKernel::Run() {
  if (!this->valid_shape_) {
    auto ret = CalculateQuantArgs();

@@ -257,6 +470,11 @@ int ReduceInt8CPUKernel::Run() {
      return ret;
    }
  }
  // currently only the reduce-mean 4D HW case has a fast pattern path; otherwise fall back to the reference impl
  if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean) && pattern_impl_ && pattern_ == kernel::HW) {
    return Fast4DReduceMeanHWImpl();
  }

  auto ret = MallocTmpBuffer();
  if (ret != RET_OK) {
    FreeTmpBuffer();

@@ -313,6 +531,7 @@ int ReduceInt8CPUKernel::CallReduceUnit(int task_id) {
  }
  return ret;
}

kernel::LiteKernel *CpuReduceInt8KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                               const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                               const lite::InnerContext *ctx, const kernel::KernelKey &desc,

@@ -28,6 +28,7 @@
using mindspore::schema::ReduceMode;

namespace mindspore::kernel {
enum Four_DIMENSION_REDUCE_TEMPLATE { N, H, W, C, NH, NW, NC, HW, HC, WC, NHW, NHC, NWC, HWC, NHWC };
class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
  typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                         int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);

@@ -38,7 +39,7 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
  ReduceInt8CPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
                      const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                      const mindspore::lite::PrimitiveC *primitive)
      : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
      : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive), ctx_(ctx) {}
  ~ReduceInt8CPUKernel() {
    for (auto qm : mean_multipliers_) {
      delete qm;

@@ -59,6 +60,8 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
  int Init() override;
  int ReSize() override;
  int Run() override;
  int Fast4DReduceMeanHWImpl();
  int Reduce4DExecute(int task_id);
  int CallReduceUnit(int task_id);
  int ReduceLastAxis(int task_id);

@@ -68,22 +71,31 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
 private:
  int MallocTmpBuffer();
  void FreeTmpBuffer();

  void Match4DReducePattern();
  void OneAxis();
  void TwoAxes();
  void ThreeAxes();
  void ReduceMean4DCalQuantParam();
  int CalculateQuantArgs();
  void GetQuantArgs(size_t i);

 private:
  ReduceParameter *param_ = nullptr;
  ReduceQuantArg quant_arg_;
  int8_t *nchw_in_data_ = nullptr;
  int32_t bias_;

 private:
  const lite::InnerContext *ctx_;
  int32_t *begin_src_data_ = nullptr;
  int8_t *last_dst_data_ = nullptr;
  std::vector<int32_t *> data_buffers_;
  const int32_t *src_data_ = nullptr;
  int32_t *dst_data_ = nullptr;
  bool valid_shape_ = false;

  bool pattern_impl_ = false;
  Four_DIMENSION_REDUCE_TEMPLATE pattern_;
  QuantMulArg reduce_mean_quant_param_;  // used in the reduce-mean 4D situation
  Reducer reducer_ = nullptr;
  LastReducer last_reducer_ = nullptr;
  std::vector<QuantMulArg *> mean_multipliers_;