commit e103ba3aab
@@ -1,102 +0,0 @@
-/**
- * Copyright 2022 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/experimental/base_matmul.h"
-#include "nnacl/experimental/fp32_funcs.h"
-
-typedef struct BaseMatmulStru {
-  KernelBase *base;
-  size_t deep;
-  size_t row;
-  size_t col;
-  size_t thread_num;
-  uint8_t *a_ptr;
-  uint8_t *b_ptr;
-  uint8_t *c_ptr;
-  uint8_t *bias;
-  uint8_t *tmp_ptr;
-  float min;
-  float max;
-  size_t row_unit;
-  size_t row_tile;
-} BaseMatmulStru;
-
-int BaseMatmulRun(void *param, int task_id, float lhs_scale, float rhs_scale) {
-  BaseMatmulStru *mm = (BaseMatmulStru *)param;
-  if (mm == NULL) {
-    return -1;
-  }
-
-  size_t pack_uint = mm->base->funcs->pack * mm->base->funcs->byte;
-
-  for (size_t i = task_id; i < mm->row_unit; i += mm->thread_num) {
-    int xStart = i * mm->row_tile;
-    uint8_t *a = mm->a_ptr + xStart * pack_uint;
-    uint8_t *tmp = mm->tmp_ptr + mm->row_tile * mm->deep * task_id * mm->base->funcs->byte;
-    mm->base->funcs->PackLeft(tmp, a, mm->row_tile, mm->deep, mm->row);
-    mm->base->funcs->Matmul(mm->c_ptr + xStart * pack_uint, tmp, mm->b_ptr, mm->bias, mm->row_tile, mm->deep, mm->col,
-                            mm->row * mm->base->funcs->pack, mm->min, mm->max);
-  }
-  return 0;
-}
-
-void BaseMatmul(uint8_t *a_ptr, uint8_t *b_ptr, uint8_t *bias, uint8_t *c_ptr, int row, int deep, int col,
-                ActType act_type, int thread_num, KernelBase *base) {
-  BaseMatmulStru basemm;
-  if (a_ptr == NULL || b_ptr == NULL || c_ptr == NULL) {
-    return;
-  }
-  basemm.base = base;
-  basemm.deep = deep;
-  basemm.col = col;
-  basemm.row = row;
-  basemm.a_ptr = a_ptr;
-  basemm.b_ptr = b_ptr;
-  basemm.c_ptr = c_ptr;
-  basemm.bias = bias;
-  basemm.thread_num = thread_num;
-
-  int byte = basemm.base->funcs->byte;
-  int pack = basemm.base->funcs->pack;
-  int row_tile, deep_tile, col_tile;
-  basemm.base->funcs->InitMatmulTileCount(&row_tile, &deep_tile, &col_tile);
-
-  basemm.row_tile = row_tile;
-  if (row_tile == 0) {
-    return;
-  }
-  basemm.row_unit = row / row_tile;
-
-  if (bias != NULL || act_type != ActType_No) {
-    GetPostParameters(act_type, &basemm.min, &basemm.max);
-  }
-
-  basemm.tmp_ptr = (uint8_t *)basemm.base->env->alloc(basemm.base->env->allocator,
-                                                      thread_num * UP_ROUND(deep, deep_tile) * row_tile * byte);
-  basemm.base->env->parallelLaunch(basemm.base->env->threadPool, BaseMatmulRun, &basemm, thread_num);
-
-  size_t row_remain = row - basemm.row_unit * row_tile;
-  if (row_remain != 0) {
-    int32_t start_row = basemm.row_unit * row_tile;
-    uint8_t *a_remain_ptr = a_ptr + start_row * pack * byte;
-    basemm.base->funcs->PackLeft(basemm.tmp_ptr, a_remain_ptr, row_remain, deep, row);
-    basemm.base->funcs->MatMulRes(c_ptr + start_row * pack * byte, basemm.tmp_ptr, b_ptr, bias, row_remain, basemm.deep,
-                                  basemm.col, basemm.row * basemm.base->funcs->pack, basemm.min, basemm.max);
-  }
-
-  basemm.base->env->free(basemm.base->env->allocator, basemm.tmp_ptr);
-  return;
-}
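The deleted BaseMatmul above (and the ExperimentalMatmul that replaces it later in this commit) uses one row-tiling scheme: full row_tile-sized blocks are distributed across threads, and whatever rows are left over are packed and multiplied after the parallel launch. A standalone sketch of that split, illustrative only and not code from this commit:

#include <stdio.h>

/* Split `row` into full tiles plus a remainder, as BaseMatmul does. */
static void split_rows(int row, int row_tile, int *row_unit, int *row_remain) {
  *row_unit = row / row_tile;               /* full tiles, shared across threads */
  *row_remain = row - *row_unit * row_tile; /* leftover rows, handled serially */
}

int main(void) {
  int unit, remain;
  split_rows(100, 16, &unit, &remain); /* e.g. 100 rows with a C16NUM row tile */
  printf("units=%d remain=%d\n", unit, remain); /* prints: units=6 remain=4 */
  return 0;
}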
@@ -1,33 +0,0 @@
-/**
- * Copyright 2022 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_EXPERIMENT_CORE_FUNCS_H_
-#define MINDSPORE_NNACL_EXPERIMENT_CORE_FUNCS_H_
-
-typedef struct CoreFuncs {
-  int pack;
-  int byte;
-  void (*InitMatmulTileCount)(int *row_tile, int *deep_tile, int *col_tile);
-  void (*PackNcX)(const void *src, void *dst, int batch, int plane, int channel);
-  void (*UnPackNcX)(const void *src, void *dst, int batch, int plane, int channel);
-  void (*PackLeft)(void *dst, void *src, size_t row, size_t deep, size_t src_stride);
-  void (*PackRight)(const void *src, void *dst, int batch, int plane, int channel);
-  void (*Matmul)(void *c_ptr, void *a_ptr, void *b_ptr, void *bias, size_t row, size_t deep, size_t col,
-                 size_t dst_stride, float min, float max);
-  void (*MatMulRes)(void *c_ptr, void *a_ptr, void *b_ptr, void *bias, size_t row, size_t deep, size_t col,
-                    size_t dst_stride, float min, float max);
-} CoreFuncs;
-
-#endif  // MINDSPORE_NNACL_EXPERIMENT_CORE_FUNCS_H_
@@ -0,0 +1,70 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_EXPERIMENT_MS_CORE_H_
+#define MINDSPORE_NNACL_EXPERIMENT_MS_CORE_H_
+
+#include <float.h>
+#include "nnacl/op_base.h"
+#include "nnacl/exp_parameter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct CoreFuncs {
+  int pack;
+  int byte;
+  int (*ExpFusion)(const void *src_data, void *dst_data, const ExpParameter *param, int task_id);
+  void (*PackNcX)(const void *src, void *dst, int batch, int plane, int channel);
+  void (*UnPackNcX)(const void *src, void *dst, int batch, int plane, int channel);
+  void (*PostParam)(ActType act, float *min, float *max);
+
+  void (*ExpMatmulTile)(int *row_tile, int *deep_tile, int *col_tile);
+  void (*ExpMatmulPackIn)(void *dst, void *src, size_t row, size_t deep, size_t src_stride);
+  void (*ExpMatmulBlock)(void *c_ptr, void *a_ptr, void *b_ptr, void *bias, size_t row, size_t deep, size_t col,
+                         size_t dst_stride, float min, float max);
+  void (*ExpMatMulRemain)(void *c_ptr, void *a_ptr, void *b_ptr, void *bias, size_t row, size_t deep, size_t col,
+                          size_t dst_stride, float min, float max);
+
+  void (*OptMatmulTile)(int *row_tile, int *col_tile);
+} CoreFuncs;
+
+/* x86 */
+void InitCore(CoreFuncs *funcs_);
+
+/* arm64 fp32 */
+void InitFp32Core(CoreFuncs *funcs_);
+
+/* arm64 fp16 */
+void InitFp16Core(CoreFuncs *funcs_);
+
+/* arm32 */
+void InitArm32Core(CoreFuncs *funcs_);
+
+/* avx */
+void InitAvxCore(CoreFuncs *funcs_);
+
+/* avx512 */
+void InitAvx512Core(CoreFuncs *funcs_);
+
+/* sse */
+void InitSseCore(CoreFuncs *funcs_);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // MINDSPORE_NNACL_EXPERIMENT_MS_CORE_H_
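The new CoreFuncs table above is filled in layers: InitCore installs portable defaults, and each ISA-specific Init*Core overrides only the entries it accelerates (see the rewritten GetCoreFuncs further down). A freestanding sketch of that layering pattern, with illustrative names standing in for the real table; the tile and pack values mirror InitCore and InitAvxCore from this diff:

#include <stdio.h>

typedef struct FuncTable {
  int pack;
  void (*tile)(int *row_tile, int *col_tile);
} FuncTable;

static void generic_tile(int *r, int *c) { *r = 12; *c = 8; }
static void avx_tile(int *r, int *c) { *r = 6; *c = 16; }

static void init_generic(FuncTable *t) { /* portable defaults */
  t->pack = 4;
  t->tile = generic_tile;
}

static void init_avx(FuncTable *t) { /* override only what AVX improves */
  t->pack = 8;
  t->tile = avx_tile;
}

int main(void) {
  FuncTable t;
  init_generic(&t);
  init_avx(&t); /* layered per-ISA, exactly like GetCoreFuncs */
  int r, c;
  t.tile(&r, &c);
  printf("pack=%d tile=%dx%d\n", t.pack, r, c); /* pack=8 tile=6x16 */
  return 0;
}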
@@ -13,20 +13,17 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef MINDSPORE_NNACL_EXPERIMENT_FP32_FUNCS_H_
-#define MINDSPORE_NNACL_EXPERIMENT_FP32_FUNCS_H_
-
-#include "nnacl/kernel.h"
+#ifdef ENABLE_ARM32
+#include "nnacl/experimental/ms_core.h"
+void InitOptMatmulTileArm32(int *row_tile, int *col_tile) {
+  *row_tile = C12NUM;
+  *col_tile = C4NUM;
+}
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void InitFp32Funcs(CoreFuncs *funcs_);
-
-void GetPostParameters(ActType act, float *min, float *max);
-
-#ifdef __cplusplus
+void InitArm32Core(CoreFuncs *funcs_) {
+  funcs_->pack = C4NUM;
+  funcs_->byte = sizeof(float);
+  funcs_->OptMatmulTile = InitOptMatmulTileArm32;
 }
 #endif
-#endif  // MINDSPORE_NNACL_EXPERIMENT_FP32_FUNCS_H_
@@ -13,18 +13,14 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef MINDSPORE_NNACL_EXPERIMENT_FP16_FUNCS_H_
-#define MINDSPORE_NNACL_EXPERIMENT_FP16_FUNCS_H_
-
-#include "nnacl/kernel.h"
+#ifdef ENABLE_FP16
+#include "nnacl/experimental/ms_core.h"
+#include "nnacl/fp16/exp_fp16.h"
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void InitFp16Funcs(CoreFuncs *funcs_);
-
-#ifdef __cplusplus
+void InitFp16Core(CoreFuncs *funcs_) {
+  funcs_->pack = C8NUM;
+  funcs_->byte = sizeof(float16_t);
+  funcs_->ExpFusion = ExpFusionFp16;
 }
 #endif
-#endif  // MINDSPORE_NNACL_EXPERIMENT_FP32_FUNCS_H_
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
-#include "nnacl/experimental/fp16_funcs.h"
-
-void InitFp16Funcs(CoreFuncs *funcs_) {
 #ifdef ENABLE_ARM64
-  funcs_->pack = C8NUM;
-  funcs_->byte = sizeof(float16_t);
-#endif
+#include "nnacl/experimental/ms_core.h"
+
+void InitFp32Core(CoreFuncs *funcs_) {
+  funcs_->pack = C4NUM;
+  funcs_->byte = sizeof(float);
 }
+#endif
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef ENABLE_AVX
+#include "nnacl/experimental/ms_core.h"
+void InitOptMatmulTileAvx(int *row_tile, int *col_tile) {
+  *row_tile = C6NUM;
+  *col_tile = C16NUM;
+}
+
+void InitAvxCore(CoreFuncs *funcs_) {
+  funcs_->pack = C8NUM;
+  funcs_->byte = sizeof(float);
+  funcs_->OptMatmulTile = InitOptMatmulTileAvx;
+}
+#endif
@@ -0,0 +1,24 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef ENABLE_AVX512
+#include "nnacl/experimental/ms_core.h"
+
+void InitAvx512Core(CoreFuncs *funcs_) {
+  funcs_->pack = C16NUM;
+  funcs_->byte = sizeof(float);
+}
+#endif
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef ENABLE_SSE
+#include "nnacl/experimental/ms_core.h"
+void InitOptMatmulTileSse(int *row_tile, int *col_tile) {
+  *row_tile = C4NUM;
+  *col_tile = C8NUM;
+}
+
+void InitSseCore(CoreFuncs *funcs_) {
+  funcs_->pack = C4NUM;
+  funcs_->byte = sizeof(float);
+  funcs_->OptMatmulTile = InitOptMatmulTileSse;
+}
+#endif
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "nnacl/experimental/fp32_funcs.h"
-#include <float.h>
+#include "nnacl/experimental/ms_core.h"
 #include "nnacl/op_base.h"
+#include "nnacl/fp32/pack_fp32.h"
 #include "nnacl/fp32/exp_fp32.h"
 
 void GetPostParameters(ActType act, float *min, float *max) {
 #define RELU6_VALUE 6.0f
@@ -35,13 +35,13 @@ void GetPostParameters(ActType act, float *min, float *max) {
   return;
 }
 
-void InitBaseMMFp32TileCount(int *row_tile, int *deep_tile, int *col_tile) {
+void InitExpMMFp32TileCount(int *row_tile, int *deep_tile, int *col_tile) {
   *row_tile = C16NUM;
   *col_tile = C4NUM;
   *deep_tile = 1;
 }
 
-void PackMatmulA(void *dst_ptr, void *src_ptr, size_t row, size_t deep, size_t src_stride) {
+void PackExpMatmulIn(void *dst_ptr, void *src_ptr, size_t row, size_t deep, size_t src_stride) {
   /* src_stride : total row */
   float *dst = (float *)dst_ptr;
   float *src = (float *)src_ptr;
@@ -54,8 +54,8 @@ void PackMatmulA(void *dst_ptr, void *src_ptr, size_t row, size_t deep, size_t s
   }
 }
 
-static void DoBaseMatmul(float *c_ptr, const float *a_ptr, const float *b_ptr, const float *bias, size_t row,
-                         size_t deep, size_t col, size_t dst_stride, float min, float max) {
+static void ExpMatmul(float *c_ptr, const float *a_ptr, const float *b_ptr, const float *bias, size_t row, size_t deep,
+                      size_t col, size_t dst_stride, float min, float max) {
   /* dst_stride : total_row * pack */
   for (size_t r = 0; r < row; r++) {
     for (size_t c = 0; c < col; c++) {
@@ -79,32 +79,39 @@ static void DoBaseMatmul(float *c_ptr, const float *a_ptr, const float *b_ptr, c
   }
 }
 
-void BaseMatMul(void *c_ptr, void *a_ptr, void *b_ptr, void *bias_ptr, size_t row, size_t deep, size_t col,
-                size_t dst_stride, float min, float max) {
+void ExpMatMulBlock(void *c_ptr, void *a_ptr, void *b_ptr, void *bias_ptr, size_t row, size_t deep, size_t col,
+                    size_t dst_stride, float min, float max) {
   float *c = (float *)c_ptr;
   float *a = (float *)a_ptr;
   float *b = (float *)b_ptr;
   float *bias = (float *)bias_ptr;
-  return DoBaseMatmul(c, a, b, bias, row, deep, col, dst_stride, min, max);
+  return ExpMatmul(c, a, b, bias, row, deep, col, dst_stride, min, max);
 }
 
-void BaseMatMulRes(void *c_ptr, void *a_ptr, void *b_ptr, void *bias_ptr, size_t row, size_t deep, size_t col,
-                   size_t dst_stride, float min, float max) {
+void ExpMatmulRemain(void *c_ptr, void *a_ptr, void *b_ptr, void *bias_ptr, size_t row, size_t deep, size_t col,
+                     size_t dst_stride, float min, float max) {
   float *c = (float *)c_ptr;
   float *a = (float *)a_ptr;
   float *b = (float *)b_ptr;
   float *bias = (float *)bias_ptr;
-  return DoBaseMatmul(c, a, b, bias, row, deep, col, dst_stride, min, max);
+  return ExpMatmul(c, a, b, bias, row, deep, col, dst_stride, min, max);
 }
 
-void InitFp32Funcs(CoreFuncs *funcs_) {
+void InitOptMatmulTile(int *row_tile, int *col_tile) {
+  *row_tile = C12NUM;
+  *col_tile = C8NUM;
+}
+
+void InitCore(CoreFuncs *funcs_) {
   funcs_->pack = C4NUM;
   funcs_->byte = sizeof(float);
-  funcs_->InitMatmulTileCount = InitBaseMMFp32TileCount;
-  funcs_->PackNcX = NULL;
-  funcs_->UnPackNcX = NULL;
-  funcs_->PackLeft = PackMatmulA;
-  funcs_->PackRight = NULL;
-  funcs_->Matmul = BaseMatMul;
-  funcs_->MatMulRes = BaseMatMulRes;
+  funcs_->ExpMatmulTile = InitExpMMFp32TileCount;
+  funcs_->PackNcX = PackNCHWToNC4HW4Fp32;
+  funcs_->UnPackNcX = PackNC4HW4ToNCHWFp32;
+  funcs_->ExpMatmulPackIn = PackExpMatmulIn;
+  funcs_->ExpMatmulBlock = ExpMatMulBlock;
+  funcs_->ExpMatMulRemain = ExpMatmulRemain;
+  funcs_->ExpFusion = ExpFusionFp32;
+  funcs_->OptMatmulTile = InitOptMatmulTile;
+  funcs_->PostParam = GetPostParameters;
 }
@@ -1249,8 +1249,8 @@ void UnPackC4Uint(const void *src, void *dst, size_t plane, size_t channel) {
     size_t c_div = c / C4NUM;
     size_t c_mod = c % C4NUM;
     for (size_t p = 0; p < plane; p++) {
-      int src_offset = c_div * plane * C4NUM + plane * C4NUM + c_mod;
-      int dst_offset = p * channel + c;
+      int src_offset = c_div * plane * C4NUM + p * C4NUM + c_mod;
+      int dst_offset = c * plane + p;
       fp32_dst[dst_offset] = fp32_src[src_offset];
     }
   }
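The UnPackC4Uint hunk above fixes two indexing bugs: the source offset used the constant term plane * C4NUM where the per-plane term p * C4NUM was intended (so every plane read the same packed row), and the destination was indexed p * channel + c (NHWC order) instead of c * plane + p (NCHW order). In the NC4HW4 layout, channel c lives at lane c % 4 of channel group c / 4. A minimal standalone check of the corrected source index, assuming C4NUM is 4:

#include <assert.h>

/* Corrected NC4HW4 source index for element (channel c, position p). */
static int nc4hw4_src_offset(int c, int p, int plane) {
  return (c / 4) * plane * 4 + p * 4 + (c % 4);
}

int main(void) {
  /* plane = 3: channel 5 is lane 1 of group 1 -> offsets 13, 17, 21. */
  assert(nc4hw4_src_offset(5, 0, 3) == 13);
  assert(nc4hw4_src_offset(5, 1, 3) == 17);
  assert(nc4hw4_src_offset(5, 2, 3) == 21);
  return 0;
}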
@@ -13,20 +13,21 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 #include "nnacl/kernel.h"
 #include "nnacl/tensor_c.h"
 #include "nnacl/op_base.h"
-#include "nnacl/experimental/fp32_funcs.h"
-#include "nnacl/experimental/fp16_funcs.h"
+#include "nnacl/experimental/ms_core.h"
 #ifdef _MSC_VER
 #include "nnacl/experimental/conv.h"
 #include "nnacl/kernel/exp.h"
 #endif
 
 static KernelCreator g_kernelCreatorRegistry[PrimType_MAX][Format_MAX][16];
+#define REGIST_DT(DT) (DT - kNumberTypeBegin - 1)
 
 void RegKernelCreator(int opType, int format, int dataType, KernelCreator creator) {
-  g_kernelCreatorRegistry[opType][format][dataType - kNumberTypeBegin - 1] = creator;
+  g_kernelCreatorRegistry[opType][format][REGIST_DT(dataType)] = creator;
 }
 
 void Init_MSC_VER_kernels(void) {
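The REGIST_DT macro introduced above only names the index arithmetic that was previously written out long-hand at every call site, so the `- kNumberTypeBegin - 1` offset now lives in one place. A toy illustration with made-up enum values (the real TypeId constants come from op_base.h, not from this sketch):

#include <assert.h>

/* Illustrative TypeId values only; the real ones are defined elsewhere. */
enum { kNumberTypeBegin = 29, kNumberTypeFloat32 = 43 };
#define REGIST_DT(DT) (DT - kNumberTypeBegin - 1)

int main(void) {
  /* With these values Float32 lands at slot 13 of the 16-slot type axis. */
  assert(REGIST_DT(kNumberTypeFloat32) == 13);
  return 0;
}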
@@ -35,14 +36,13 @@ void Init_MSC_VER_kernels(void) {
    * register here first time */
   static bool inited = false;
   if (inited == false) {
-    g_kernelCreatorRegistry[PrimType_Conv2DFusion][Format_NC4HW4][kNumberTypeFloat32 - kNumberTypeBegin - 1] =
-      CreateConv;
-    g_kernelCreatorRegistry[PrimType_ExpFusion][Format_NHWC][kNumberTypeFloat32 - kNumberTypeBegin - 1] = CreateExp;
-    g_kernelCreatorRegistry[PrimType_ExpFusion][Format_NHWC][kNumberTypeFloat16 - kNumberTypeBegin - 1] = CreateExp;
-    g_kernelCreatorRegistry[PrimType_ExpFusion][Format_NCHW][kNumberTypeFloat32 - kNumberTypeBegin - 1] = CreateExp;
-    g_kernelCreatorRegistry[PrimType_ExpFusion][Format_NCHW][kNumberTypeFloat16 - kNumberTypeBegin - 1] = CreateExp;
-    g_kernelCreatorRegistry[PrimType_ExpFusion][Format_NC4HW4][kNumberTypeFloat32 - kNumberTypeBegin - 1] = CreateExp;
-    g_kernelCreatorRegistry[PrimType_ExpFusion][Format_NC8HW8][kNumberTypeFloat16 - kNumberTypeBegin - 1] = CreateExp;
+    g_kernelCreatorRegistry[PrimType_Conv2DFusion][Format_NC4HW4][REGIST_DT(kNumberTypeFloat32)] = CreateConv;
+    g_kernelCreatorRegistry[PrimType_ExpFusion][Format_NHWC][REGIST_DT(kNumberTypeFloat32)] = CreateExp;
+    g_kernelCreatorRegistry[PrimType_ExpFusion][Format_NHWC][REGIST_DT(kNumberTypeFloat16)] = CreateExp;
+    g_kernelCreatorRegistry[PrimType_ExpFusion][Format_NCHW][REGIST_DT(kNumberTypeFloat32)] = CreateExp;
+    g_kernelCreatorRegistry[PrimType_ExpFusion][Format_NCHW][REGIST_DT(kNumberTypeFloat16)] = CreateExp;
+    g_kernelCreatorRegistry[PrimType_ExpFusion][Format_NC4HW4][REGIST_DT(kNumberTypeFloat32)] = CreateExp;
+    g_kernelCreatorRegistry[PrimType_ExpFusion][Format_NC8HW8][REGIST_DT(kNumberTypeFloat16)] = CreateExp;
     inited = true;
   }
 #endif
@@ -51,18 +51,18 @@ void Init_MSC_VER_kernels(void) {
 
 bool SupportKernelC(int opType, int format, int dataType) {
   Init_MSC_VER_kernels();
-  KernelCreator creator = g_kernelCreatorRegistry[opType][format][dataType - kNumberTypeBegin - 1];
+  KernelCreator creator = g_kernelCreatorRegistry[opType][format][REGIST_DT(dataType)];
   return creator != NULL;
 }
 
 KernelBase *CreateKernel(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize, int data_type,
                          FormatC format) {
   Init_MSC_VER_kernels();
-  KernelCreator creator = g_kernelCreatorRegistry[param->type_][format][data_type - kNumberTypeBegin - 1];
+  KernelCreator creator = g_kernelCreatorRegistry[param->type_][format][REGIST_DT(data_type)];
   if (creator == NULL) {
     return NULL;
   }
-  return creator(param, in, insize, out, outsize);
+  return creator(param, in, insize, out, outsize, data_type, format);
 }
 
 ExecEnv *GetExecEnv() {
@@ -71,14 +71,51 @@ ExecEnv *GetExecEnv() {
 }
 
 CoreFuncs *GetCoreFuncs(bool use_fp16) {
-  static CoreFuncs fp23funcs;
-  InitFp32Funcs(&fp23funcs);
-  static CoreFuncs fp16funcs;
-  InitFp16Funcs(&fp16funcs);
-
-  if (use_fp16) {
-    return &fp16funcs;
-  }
-
-  return &fp23funcs;
+  static CoreFuncs core;
+  InitCore(&core);
+
+#ifdef ENABLE_AVX512
+  static CoreFuncs core_avx512;
+  InitCore(&core_avx512);
+  InitSseCore(&core_avx512);
+  InitAvxCore(&core_avx512);
+  InitAvx512Core(&core_avx512);
+  return &core_avx512;
+#endif
+
+#ifdef ENABLE_AVX
+  static CoreFuncs core_avx;
+  InitCore(&core_avx);
+  InitSseCore(&core_avx);
+  InitAvxCore(&core_avx);
+  return &core_avx;
+#endif
+
+#ifdef ENABLE_SSE
+  static CoreFuncs core_sse;
+  InitCore(&core_sse);
+  InitSseCore(&core_sse);
+  return &core_sse;
+#endif
+
+#ifdef ENABLE_ARM32
+  static CoreFuncs core_arm32;
+  InitCore(&core_arm32);
+  InitArm32Core(&core_arm32);
+  return &core_arm32;
+#endif
+
+#ifdef ENABLE_ARM64
+  static CoreFuncs core_fp32;
+  InitCore(&core_fp32);
+  InitFp32Core(&core_fp32);
+  static CoreFuncs core_fp16;
+  InitCore(&core_fp16);
+#ifdef ENABLE_FP16
+  InitFp16Core(&core_fp16);
+#endif
+  return use_fp16 ? &core_fp16 : &core_fp32;
+#endif
+
+  return &core;
 }
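Backend selection in the rewritten GetCoreFuncs above is resolved by the preprocessor: the first enabled #ifdef block returns its layered table, the plain `core` table is the x86 fallback, and `use_fp16` only influences the ARM64 branch. A hypothetical call site, a sketch using only functions from this diff:

/* Hypothetical caller; CoreFuncs and GetCoreFuncs come from this commit. */
#include "nnacl/kernel.h"

void example_query_tiles(void) {
  CoreFuncs *funcs = GetCoreFuncs(false); /* fp32 table for this build */
  int row_tile, deep_tile, col_tile;
  funcs->ExpMatmulTile(&row_tile, &deep_tile, &col_tile); /* backend tiles */
}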
@@ -15,9 +15,10 @@
  */
 #ifndef MINDSPORE_NNACL_KERNEL_H_
 #define MINDSPORE_NNACL_KERNEL_H_
 
 #include "nnacl/op_base.h"
+#include "nnacl/infer/common_infer.h"
-#include "nnacl/experimental/core_funcs.h"
+#include "nnacl/experimental/ms_core.h"
 
 typedef struct ExecEnv {
   void *allocator;
@@ -55,7 +56,8 @@ typedef struct KernelBase {
 }
 #endif
 
-typedef KernelBase *(*KernelCreator)(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize);
+typedef KernelBase *(*KernelCreator)(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize,
+                                     int data_type, FormatC format);
 void RegKernelCreator(int opType, int format, int dataType, KernelCreator func);
 CoreFuncs *GetCoreFuncs(bool use_fp16);
 
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/kernel/convolution.h"
+#include "nnacl/kernel/convolution_1x1.h"
+#include "nnacl/tensor_c.h"
+#include "nnacl/op_base.h"
+
+KernelBase *CreateConvolution(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize,
+                              int data_type, FormatC format) {
+  return CreateConv1x1(param, in, insize, out, outsize, data_type, format);
+}
+
+REG_KERNEL_CREATOR(PrimType_Conv2DFusion, Format_NC4HW4, kNumberTypeFloat32, CreateConvolution);
+REG_KERNEL_CREATOR(PrimType_Conv2DFusion, Format_NC8HW8, kNumberTypeFloat16, CreateConvolution);
@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef MINDSPORE_NNACL_EXPERIMENT_CONV1X1_H_
-#define MINDSPORE_NNACL_EXPERIMENT_CONV1X1_H_
+#ifndef MINDSPORE_NNACL_KERNEL_CONVOLUTION_H_
+#define MINDSPORE_NNACL_KERNEL_CONVOLUTION_H_
 
 #include "nnacl/op_base.h"
 #include "nnacl/tensor_c.h"
@@ -24,9 +24,10 @@
 extern "C" {
 #endif
 
-KernelBase *CreateConv1x1(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize);
+KernelBase *CreateConvolution(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize,
+                              int data_type, FormatC format);
 
 #ifdef __cplusplus
 }
 #endif
-#endif  // MINDSPORE_NNACL_EXPERIMENT_CONV1X1_H_
+#endif  // MINDSPORE_NNACL_KERNEL_CONVOLUTION_1X1_H_
@@ -14,34 +14,37 @@
  * limitations under the License.
  */
 
-#include "nnacl/experimental/conv1x1.h"
+#include "nnacl/kernel/convolution_1x1.h"
 #include <stdint.h>
 #include "nnacl/conv_parameter.h"
 #include "nnacl/tensor_c.h"
 #include "nnacl/op_base.h"
-#include "nnacl/experimental/base_matmul.h"
-
-typedef struct Conv1x1Stru {
-  KernelBase base;
-  uint8_t *bias_;
-  uint8_t *weight_;
-} Conv1x1Stru;
-
-int conv1x1_resize(struct KernelBase *self) { return 0; }
 
-int conv1x1_prepare(struct KernelBase *self) {
+int conv1x1_exp_resize(struct KernelBase *self) {
   Conv1x1Stru *conv = (Conv1x1Stru *)self;
   ConvParameter *param = (ConvParameter *)conv->base.param;
+  conv->exp_.row = param->input_h_ * param->input_w_;
+  conv->exp_.deep = param->input_channel_;
+  conv->exp_.col = param->output_channel_;
+  conv->exp_.thread_num = param->op_parameter_.thread_num_;
+  if (conv->bias_ != NULL || param->act_type_ != ActType_No) {
+    conv->exp_.base->funcs->PostParam(param->act_type_, &conv->exp_.min, &conv->exp_.max);
+  }
+  return 0;
+}
 
-  conv->base.funcs = GetCoreFuncs(conv->base.in[0].data_type_ == kNumberTypeFloat16);
+int conv1x1_exp_prepare(struct KernelBase *self) {
+  Conv1x1Stru *conv = (Conv1x1Stru *)self;
+  ConvParameter *param = (ConvParameter *)conv->base.param;
+  conv->exp_.base = &conv->base;
 
   int row_tile, deep_tile, col_tile;
-  conv->base.funcs->InitMatmulTileCount(&row_tile, &deep_tile, &col_tile);
+  conv->base.funcs->ExpMatmulTile(&row_tile, &deep_tile, &col_tile);
 
   conv->weight_ = (uint8_t *)(conv->base.env->alloc(
     conv->base.env->allocator,
     UP_ROUND(param->output_channel_, col_tile) * UP_ROUND(param->input_channel_, deep_tile) * row_tile));
-  conv->base.funcs->PackRight(conv->base.in[1].data_, conv->weight_, 1, param->input_channel_, param->output_channel_);
+  conv->base.funcs->PackNcX(conv->base.in[1].data_, conv->weight_, 1, param->input_channel_, param->output_channel_);
 
   if (conv->base.insize < kInputSize2) {
     conv->bias_ = NULL;
@@ -57,27 +60,21 @@ int conv1x1_prepare(struct KernelBase *self) {
   return 0;
 }
 
-int conv1x1_release(struct KernelBase *self) {
+int conv1x1_exp_release(struct KernelBase *self) {
   Conv1x1Stru *conv = (Conv1x1Stru *)self;
   conv->base.env->free(conv->base.env->allocator, conv->bias_);
   conv->base.env->free(conv->base.env->allocator, conv->weight_);
   return 0;
 }
 
-int conv1x1_compute(struct KernelBase *self) {
+int conv1x1_exp_compute(struct KernelBase *self) {
   Conv1x1Stru *conv = (Conv1x1Stru *)self;
-  ConvParameter *param = (ConvParameter *)conv->base.param;
-
-  BaseMatmul(conv->base.in[0].data_, conv->weight_, conv->bias_, conv->base.out[0].data_,
-             param->input_h_ * param->input_w_, param->input_channel_, param->output_channel_, param->act_type_,
-             param->op_parameter_.thread_num_, &conv->base);
+  ExperimentalMatmul(conv->base.in[0].data_, conv->weight_, conv->bias_, conv->base.out[0].data_, &conv->exp_);
   return 0;
 }
 
-KernelBase *CreateConv1x1(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize) {
-  if (in[0].format_ != Format_NC4HW4) {
-    return NULL;
-  }
+KernelBase *CreateConv1x1(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize, int data_type,
+                          FormatC format) {
   Conv1x1Stru *conv1x1 = (Conv1x1Stru *)malloc(sizeof(Conv1x1Stru));
   conv1x1->base.param = param;
   conv1x1->base.in = in;
@@ -85,10 +82,14 @@ KernelBase *CreateConv1x1(OpParameter *param, TensorC *in, size_t insize, Tensor
   conv1x1->base.out = out;
   conv1x1->base.outsize = outsize;
   conv1x1->base.env = GetExecEnv();
-  conv1x1->base.prepare = conv1x1_prepare;
-  conv1x1->base.resize = conv1x1_resize;
-  conv1x1->base.release = conv1x1_release;
-  conv1x1->base.compute = conv1x1_compute;
+  conv1x1->base.funcs = GetCoreFuncs(data_type == kNumberTypeFloat16);
+
+  if (format == Format_NC4HW4) {
+    conv1x1->base.prepare = conv1x1_exp_prepare;
+    conv1x1->base.resize = conv1x1_exp_resize;
+    conv1x1->base.release = conv1x1_exp_release;
+    conv1x1->base.compute = conv1x1_exp_compute;
+  }
 
   return (KernelBase *)conv1x1;
 }
@@ -0,0 +1,43 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_KERNEL_CONVOLUTION_1X1_H_
+#define MINDSPORE_NNACL_KERNEL_CONVOLUTION_1X1_H_
+
+#include "nnacl/op_base.h"
+#include "nnacl/tensor_c.h"
+#include "nnacl/kernel.h"
+#include "nnacl/kernel/matmul_optimize.h"
+#include "nnacl/kernel/matmul_experimental.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct Conv1x1Stru {
+  KernelBase base;
+  uint8_t *bias_;
+  uint8_t *weight_;
+  MatmulOptStru opt_;
+  MatmulExpStru exp_;
+} Conv1x1Stru;
+
+KernelBase *CreateConv1x1(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize, int data_type,
+                          FormatC format);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // MINDSPORE_NNACL_KERNEL_CONVOLUTION_1X1_H_
@@ -19,14 +19,6 @@
 #include "nnacl/exp_parameter.h"
 #include "nnacl/tensor_c.h"
 #include "nnacl/op_base.h"
-#include "nnacl/fp32/exp_fp32.h"
-#ifdef ENABLE_FP16
-#include "nnacl/fp16/exp_fp16.h"
-#endif
-
-typedef struct ExpStru {
-  KernelBase base;
-} ExpStru;
 
 int exp_resize(struct KernelBase *self) {
   ExpStru *exp = (ExpStru *)self;
@@ -65,14 +57,8 @@ int exp_do_compute(void *param, int task_id, float lhs_scale, float rhs_scale) {
   ExpStru *exp_stru = (ExpStru *)param;
   ExpParameter *exp_param = (ExpParameter *)exp_stru->base.param;
 
-  int ret = NNACL_ERR;
-  if (exp_stru->base.out[0].data_type_ == kNumberTypeFloat32) {
-    ret = ExpFusionFp32(exp_stru->base.in[0].data_, exp_stru->base.out[0].data_, exp_param, task_id);
-#ifdef ENABLE_FP16
-  } else if (exp_stru->base.out[0].data_type_ == kNumberTypeFloat16) {
-    ret = ExpFusionFp16(exp_stru->base.in[0].data_, exp_stru->base.out[0].data_, exp_param, task_id);
-#endif
-  }
+  int ret =
+    exp_stru->base.funcs->ExpFusion(exp_stru->base.in[0].data_, exp_stru->base.out[0].data_, exp_param, task_id);
 
   return ret;
 }
@@ -81,7 +67,8 @@ int exp_compute(struct KernelBase *self) {
   return self->env->parallelLaunch(self->env->threadPool, exp_do_compute, self, self->param->thread_num_);
 }
 
-KernelBase *CreateExp(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize) {
+KernelBase *CreateExp(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize, int data_type,
+                      FormatC format) {
   ExpStru *exp = (ExpStru *)malloc(sizeof(ExpStru));
   exp->base.param = param;
   exp->base.in = in;
@@ -93,6 +80,7 @@ KernelBase *CreateExp(OpParameter *param, TensorC *in, size_t insize, TensorC *o
   exp->base.resize = exp_resize;
   exp->base.release = exp_release;
   exp->base.compute = exp_compute;
+  exp->base.funcs = GetCoreFuncs(data_type == kNumberTypeFloat16);
 
   return (KernelBase *)exp;
 }
@@ -24,7 +24,12 @@
 extern "C" {
 #endif
 
-KernelBase *CreateExp(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize);
+typedef struct ExpStru {
+  KernelBase base;
+} ExpStru;
+
+KernelBase *CreateExp(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize, int data_type,
+                      FormatC format);
 
 #ifdef __cplusplus
 }
@@ -103,7 +103,8 @@ int gather_d_compute(struct KernelBase *self) {
   return status;
 }
 
-KernelBase *CreateGatherD(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize) {
+KernelBase *CreateGatherD(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize, int data_type,
+                          FormatC format) {
   GatherDStru *gather_d = (GatherDStru *)malloc(sizeof(GatherDStru));
   gather_d->base.param = param;
   gather_d->base.in = in;
@@ -24,7 +24,8 @@
 extern "C" {
 #endif
 
-KernelBase *CreateGatherD(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize);
+KernelBase *CreateGatherD(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize, int data_type,
+                          FormatC format);
 
 #ifdef __cplusplus
 }
@@ -100,7 +100,8 @@ static int groupnorm_compute(struct KernelBase *self) {
   return self->env->parallelLaunch(self->env->threadPool, groupnorm_do_compute, self, self->param->thread_num_);
 }
 
-KernelBase *CreateGroupNorm(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize) {
+KernelBase *CreateGroupNorm(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize, int data_type,
+                            FormatC format) {
   GroupNormStru *groupnorm = (GroupNormStru *)malloc(sizeof(GroupNormStru));
   if (groupnorm == NULL) {
     return NULL;
@@ -26,7 +26,8 @@
 extern "C" {
 #endif
 
-KernelBase *CreateGroupNorm(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize);
+KernelBase *CreateGroupNorm(OpParameter *param, TensorC *in, size_t insize, TensorC *out, size_t outsize, int data_type,
+                            FormatC format);
 
 #ifdef __cplusplus
 }
@@ -0,0 +1,75 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/kernel/matmul_experimental.h"
+
+int ExpMatmulRun(void *param, int task_id, float lhs_scale, float rhs_scale) {
+  MatmulExpStru *matmul = (MatmulExpStru *)param;
+  if (matmul == NULL) {
+    return -1;
+  }
+
+  size_t pack_uint = matmul->base->funcs->pack * matmul->base->funcs->byte;
+
+  for (size_t i = task_id; i < matmul->row_unit; i += matmul->thread_num) {
+    int xStart = i * matmul->row_tile;
+    uint8_t *a = matmul->a_ptr + xStart * pack_uint;
+    uint8_t *tmp = matmul->tmp_ptr + matmul->row_tile * matmul->deep * task_id * matmul->base->funcs->byte;
+    matmul->base->funcs->ExpMatmulPackIn(tmp, a, matmul->row_tile, matmul->deep, matmul->row);
+    matmul->base->funcs->ExpMatmulBlock(matmul->c_ptr + xStart * pack_uint, tmp, matmul->b_ptr, matmul->bias,
+                                        matmul->row_tile, matmul->deep, matmul->col,
+                                        matmul->row * matmul->base->funcs->pack, matmul->min, matmul->max);
+  }
+  return 0;
+}
+
+void ExperimentalMatmul(uint8_t *a_ptr, uint8_t *b_ptr, uint8_t *bias, uint8_t *c_ptr, MatmulExpStru *matmul) {
+  if (a_ptr == NULL || b_ptr == NULL || c_ptr == NULL) {
+    return;
+  }
+
+  matmul->a_ptr = a_ptr;
+  matmul->b_ptr = b_ptr;
+  matmul->c_ptr = c_ptr;
+  matmul->bias = bias;
+
+  int byte = matmul->base->funcs->byte;
+  int pack = matmul->base->funcs->pack;
+  int row_tile, deep_tile, col_tile;
+  matmul->base->funcs->ExpMatmulTile(&row_tile, &deep_tile, &col_tile);
+
+  matmul->row_tile = row_tile;
+  if (row_tile == 0) {
+    return;
+  }
+  matmul->row_unit = matmul->row / row_tile;
+
+  size_t tmp_size = matmul->thread_num * UP_ROUND(matmul->deep, deep_tile) * row_tile * byte;
+  matmul->tmp_ptr = (uint8_t *)matmul->base->env->alloc(matmul->base->env->allocator, tmp_size);
+  matmul->base->env->parallelLaunch(matmul->base->env->threadPool, ExpMatmulRun, matmul, matmul->thread_num);
+
+  size_t row_remain = matmul->row - matmul->row_unit * row_tile;
+  if (row_remain != 0) {
+    int32_t start_row = matmul->row_unit * row_tile;
+    uint8_t *a_remain_ptr = a_ptr + start_row * pack * byte;
+    matmul->base->funcs->ExpMatmulPackIn(matmul->tmp_ptr, a_remain_ptr, row_remain, matmul->deep, matmul->row);
+    matmul->base->funcs->ExpMatMulRemain(c_ptr + start_row * pack * byte, matmul->tmp_ptr, b_ptr, bias, row_remain,
+                                         matmul->deep, matmul->col, matmul->row * pack, matmul->min, matmul->max);
+  }
+
+  matmul->base->env->free(matmul->base->env->allocator, matmul->tmp_ptr);
+  return;
+}
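ExpMatmulRun above hands row-tile i to thread i % thread_num, and each task packs into its own row_tile * deep slice of tmp_ptr, so workers never share scratch memory. A standalone sketch of the tile-to-thread assignment, illustrative only:

#include <stdio.h>

int main(void) {
  int row_unit = 7, thread_num = 3;
  for (int task_id = 0; task_id < thread_num; task_id++) {
    printf("thread %d:", task_id);
    for (int i = task_id; i < row_unit; i += thread_num) {
      printf(" tile%d", i); /* same stride as ExpMatmulRun's loop */
    }
    printf("\n");
  }
  return 0; /* thread 0: tiles 0,3,6; thread 1: 1,4; thread 2: 2,5 */
}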
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_EXPERIMENT_MATMUL_EXPERIMENTAL_H_
+#define MINDSPORE_NNACL_EXPERIMENT_MATMUL_EXPERIMENTAL_H_
+
+#include "nnacl/kernel.h"
+#include "nnacl/experimental/ms_core.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct MatmulExpStru {
+  KernelBase *base;
+  size_t deep;
+  size_t row;
+  size_t col;
+  size_t thread_num;
+  uint8_t *a_ptr;
+  uint8_t *b_ptr;
+  uint8_t *c_ptr;
+  uint8_t *bias;
+  uint8_t *tmp_ptr;
+  float min;
+  float max;
+  size_t row_unit;
+  size_t row_tile;
+} MatmulExpStru;
+
+void ExperimentalMatmul(uint8_t *a_ptr, uint8_t *b_ptr, uint8_t *bias, uint8_t *c_ptr, MatmulExpStru *matmul);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // MINDSPORE_NNACL_EXPERIMENT_MATMUL_EXPERIMENTAL_H_
@@ -0,0 +1,22 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/kernel/matmul_optimize.h"
+
+void MatmulOpt_prepare(MatmulOptStru *matmul) {
+  matmul->base->funcs->OptMatmulTile(&matmul->row_tile, &matmul->col_tile);
+  return;
+}
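matmul_optimize.c is only a stub at this point: MatmulOpt_prepare queries the backend's row/col tiles through the OptMatmulTile entries that the ms_core_*.c files install, and nothing in this commit consumes MatmulOptStru yet. A hypothetical use, assuming an already-initialized KernelBase:

/* Hypothetical call; MatmulOptStru and MatmulOpt_prepare come from this diff. */
#include "nnacl/kernel/matmul_optimize.h"

void example_opt_tiles(KernelBase *base) {
  MatmulOptStru opt;
  opt.base = base;
  MatmulOpt_prepare(&opt); /* x86 InitCore path yields row_tile=12, col_tile=8 */
}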
@@ -13,19 +13,27 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef MINDSPORE_NNACL_EXPERIMENT_BASE_MATMUL_H_
-#define MINDSPORE_NNACL_EXPERIMENT_BASE_MATMUL_H_
+#ifndef MINDSPORE_NNACL_EXPERIMENT_MATMUL_OPTIMIZE_H_
+#define MINDSPORE_NNACL_EXPERIMENT_MATMUL_OPTIMIZE_H_
 
 #include "nnacl/kernel.h"
+#include "nnacl/matmul_parameter.h"
+#include "nnacl/experimental/ms_core.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
+typedef struct MatmulOptStru {
+  KernelBase *base;
+  MatMulParameter param;
+  int row_tile;
+  int col_tile;
+} MatmulOptStru;
 
-void BaseMatmul(uint8_t *a_ptr, uint8_t *b_ptr, uint8_t *bias, uint8_t *c_ptr, int row, int deep, int col,
-                ActType act_type, int thread_num, KernelBase *base);
+void MatmulOpt_prepare(MatmulOptStru *matmul);
 
 #ifdef __cplusplus
 }
 #endif
-#endif  // MINDSPORE_NNACL_EXPERIMENT_BASE_MATMUL_H_
+#endif  // MINDSPORE_NNACL_EXPERIMENT_MATMUL_OPTIMIZE_H_
@@ -26,7 +26,7 @@
 
 namespace mindspore::lite::pass {
 #ifdef ENABLE_RUNTIME_NCX_PASS
-std::set<schema::PrimitiveType> ncxhwx_kernels = {};
+std::set<schema::PrimitiveType> ncxhwx_kernels = {schema::PrimitiveType_Conv2DFusion};
 
 bool RuntimeNCXPassVaild(kernel::SubGraphKernel *subgraph) {
   if (subgraph->subgraph_type() == kernel::kNotSubGraph) {
@@ -1,2 +1,2 @@
 Note: This is the mindspore Lite inference framework size threshold. Offline review is required before modify this value!!!
-1022266
+1300000
@@ -176,6 +176,7 @@ getCommonFile() {
     mindspore/lite/src/expression/ops_utils.h
     mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/tensor_c_utils.h
    mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/tensorlist_c_utils.h
+    mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/experimental/ms_core.h
     mindspore/core/utils/log_adapter.h
     mindspore/core/ir/api_tensor_impl.h
     mindspore/lite/src/runtime/cxx_api/tensor/tensor_impl.h
@@ -227,6 +228,13 @@ getCommonFile() {
     mindspore/lite/src/expression/ops_utils.cc
     mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/tensor_c_utils.c
     mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/tensorlist_c_utils.c
+    mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/experimental/ms_core_x86.c
+    mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/experimental/ms_core_sse.c
+    mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/experimental/ms_core_avx.c
+    mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/experimental/ms_core_avx512.c
+    mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/experimental/ms_core_arm64_fp32.c
+    mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/experimental/ms_core_arm64_fp16.c
+    mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/experimental/ms_core_arm32.c
   )
   all_files=("${src_files[@]}" "${regist_files[@]}" "${common_files[@]}" "${runtime_files_cc[@]}"
              "${others_files_c[@]}" "${assembly_files[@]}" "${mindrt_files[@]}"