!25455 [MSLITE] code clean

Merge pull request !25455 from ling/pr
This commit is contained in:
i-robot 2021-10-28 01:48:46 +00:00 committed by Gitee
commit 59f03e8e21
40 changed files with 413 additions and 353 deletions

View File

@ -258,35 +258,36 @@ void PackNHWCToNXHWCXFp32(int kernel_h, int kernel_w, int output_channel, int oc
tmp_weight[oc_remainder + oc_remainder_step * ic] = src[ic + oc_remainder * input_channel];
}
}
} else {
for (; oc < oc_block8; oc += (oc_block / C8NUM)) {
oc_block = MSMIN(C4NUM, oc_block8 - oc) * C8NUM; // max_tile = 32 ==> 24 ==> 16 ==> 8
for (int oc_tmp = 0; oc_tmp < oc_block; oc_tmp += C8NUM) {
for (int hw = 0; hw < plane; ++hw) {
int ic = 0;
for (; ic < ic8; ic += C8NUM) {
Transpose8X8Fp32Avx(src + hw * input_channel + ic,
tmp_weight + hw * oc_block * input_channel + ic * oc_block + oc_tmp,
input_channel * plane, oc_block);
}
for (; ic < input_channel; ++ic) {
for (int j = 0; j < C8NUM; ++j) {
tmp_weight[ic * oc_block + oc_tmp + j + hw * oc_block * input_channel] =
src[ic + input_channel * j * plane + hw * input_channel];
}
}
}
src += C8NUM * plane * input_channel;
}
tmp_weight += oc_block * input_channel * plane;
}
oc = output_channel - oc_block8 * C8NUM;
for (int oc_remainder = 0; oc_remainder < oc; ++oc_remainder) {
return;
}
for (; oc < oc_block8; oc += (oc_block / C8NUM)) {
oc_block = MSMIN(C4NUM, oc_block8 - oc) * C8NUM; // max_tile = 32 ==> 24 ==> 16 ==> 8
for (int oc_tmp = 0; oc_tmp < oc_block; oc_tmp += C8NUM) {
for (int hw = 0; hw < plane; ++hw) {
for (int ic = 0; ic < input_channel; ++ic) {
tmp_weight[oc_remainder + oc_remainder_step * ic + hw * input_channel * oc_remainder_step] =
src[ic + (oc_remainder * plane + hw) * input_channel];
int ic = 0;
for (; ic < ic8; ic += C8NUM) {
Transpose8X8Fp32Avx(src + hw * input_channel + ic,
tmp_weight + hw * oc_block * input_channel + ic * oc_block + oc_tmp,
input_channel * plane, oc_block);
}
for (; ic < input_channel; ++ic) {
for (int j = 0; j < C8NUM; ++j) {
tmp_weight[ic * oc_block + oc_tmp + j + hw * oc_block * input_channel] =
src[ic + input_channel * j * plane + hw * input_channel];
}
}
}
src += C8NUM * plane * input_channel;
}
tmp_weight += oc_block * input_channel * plane;
}
oc = output_channel - oc_block8 * C8NUM;
for (int oc_remainder = 0; oc_remainder < oc; ++oc_remainder) {
for (int hw = 0; hw < plane; ++hw) {
for (int ic = 0; ic < input_channel; ++ic) {
tmp_weight[oc_remainder + oc_remainder_step * ic + hw * input_channel * oc_remainder_step] =
src[ic + (oc_remainder * plane + hw) * input_channel];
}
}
}

View File

@ -19,15 +19,12 @@
#include "nnacl/errorcode.h"
#include "nnacl/op_base.h"
int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id,
float minf, float maxf) {
int win_w = pooling_param->window_w_;
int win_h = pooling_param->window_h_;
int AvgPoolingBatch(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param, int task_id,
float minf, float maxf) {
int win_w = pooling_param->window_w_, win_h = pooling_param->window_h_;
int in_w = pooling_param->input_w_, in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_, output_h = pooling_param->output_h_;
int channel = pooling_param->input_channel_;
int in_w = pooling_param->input_w_;
int in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_;
int output_h = pooling_param->output_h_;
int out_plane = output_w * output_h;
int out_tile_count = UP_DIV(out_plane, TILE_NUM);
NNACL_CHECK_ZERO_RETURN_ERR(output_w);
@ -42,190 +39,218 @@ int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter
MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf);
#endif
for (int batch = 0; batch < pooling_param->output_batch_; batch++) {
const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel;
float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel;
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) {
int cal_start_index = thread_id * TILE_NUM;
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
for (int i = 0; i < real_cal_num; i++) {
int index = cal_start_index + i;
int out_w_index = index % output_w;
int out_h_index = index / output_w;
int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_;
int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_;
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) {
int cal_start_index = thread_id * TILE_NUM;
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
for (int i = 0; i < real_cal_num; i++) {
int index = cal_start_index + i;
int out_w_index = index % output_w;
int out_h_index = index / output_w;
int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_;
int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_;
const float *src_plane_ptr = src_b_ptr;
float *dst_plane_ptr = dst_b_ptr + index * channel;
const float *src_plane_ptr = src_b_ptr;
float *dst_plane_ptr = dst_b_ptr + index * channel;
int real_win_h_start = MSMAX(0, -in_h_index);
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
int real_win_w_start = MSMAX(0, -in_w_index);
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
int ci = 0;
int real_win_h_start = MSMAX(0, -in_h_index);
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
int real_win_w_start = MSMAX(0, -in_w_index);
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
int ci = 0;
#ifdef ENABLE_AVX
for (; ci < c8; ci += C8NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X8 tmp_avg = MS_MOV256_F32(0);
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg = MS_ADD256_F32(tmp_avg, MS_LD256_F32(src_win_ptr));
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = MS_DIV256_F32(tmp_avg, MS_MOV256_F32(real_count));
tmp_avg = MS_MAX256_F32(tmp_avg, min_value_8);
tmp_avg = MS_MIN256_F32(tmp_avg, max_value_8);
MS_ST256_F32(dst_c_ptr, tmp_avg);
} // ic8-1 loop
for (; ci < c8; ci += C8NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X8 tmp_avg = MS_MOV256_F32(0);
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg = MS_ADD256_F32(tmp_avg, MS_LD256_F32(src_win_ptr));
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = MS_DIV256_F32(tmp_avg, MS_MOV256_F32(real_count));
tmp_avg = MS_MAX256_F32(tmp_avg, min_value_8);
tmp_avg = MS_MIN256_F32(tmp_avg, max_value_8);
MS_ST256_F32(dst_c_ptr, tmp_avg);
} // ic8-1 loop
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
for (; ci < c4; ci += C4NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X4 tmp_avg = MS_MOVQ_F32(0);
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg = MS_ADDQ_F32(tmp_avg, MS_LDQ_F32(src_win_ptr));
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = MS_DIVQ_F32(tmp_avg, MS_MOVQ_F32(real_count));
tmp_avg = MS_MAXQ_F32(tmp_avg, min_value);
tmp_avg = MS_MINQ_F32(tmp_avg, max_value);
MS_STQ_F32(dst_c_ptr, tmp_avg);
} // ic4-1 loop
for (; ci < c4; ci += C4NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X4 tmp_avg = MS_MOVQ_F32(0);
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg = MS_ADDQ_F32(tmp_avg, MS_LDQ_F32(src_win_ptr));
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = MS_DIVQ_F32(tmp_avg, MS_MOVQ_F32(real_count));
tmp_avg = MS_MAXQ_F32(tmp_avg, min_value);
tmp_avg = MS_MINQ_F32(tmp_avg, max_value);
MS_STQ_F32(dst_c_ptr, tmp_avg);
} // ic4-1 loop
#endif
for (; ci < channel; ci++) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
float tmp_avg = 0;
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg += src_win_ptr[0];
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = tmp_avg / (float)real_count;
tmp_avg = fmaxf(tmp_avg, minf);
tmp_avg = fminf(tmp_avg, maxf);
dst_c_ptr[0] = tmp_avg;
} // channel_res loop
} // real_cal_num loop
} // out_plane loop
} // out_batch loop
for (; ci < channel; ci++) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
float tmp_avg = 0;
int real_count = 0;
for (int h = real_win_h_start; h < real_win_h_end; h++) {
for (int w = real_win_w_start; w < real_win_w_end; w++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
tmp_avg += src_win_ptr[0];
++real_count;
} // win_w loop
} // win_h loop
if (real_count == 0) {
return NNACL_ERR;
}
tmp_avg = tmp_avg / (float)real_count;
tmp_avg = fmaxf(tmp_avg, minf);
tmp_avg = fminf(tmp_avg, maxf);
dst_c_ptr[0] = tmp_avg;
} // channel_res loop
} // real_cal_num loop
} // out_plane loop
return NNACL_OK;
}
int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id,
               float minf, float maxf) {
  // Average pooling entry point: slices the NHWC input/output by batch and
  // delegates each batch to AvgPoolingBatch, propagating the first error.
  const int in_w = pooling_param->input_w_;
  const int in_h = pooling_param->input_h_;
  const int out_w = pooling_param->output_w_;
  const int out_h = pooling_param->output_h_;
  const int channel = pooling_param->input_channel_;
  const int batch_num = pooling_param->output_batch_;
  const int in_batch_stride = in_h * in_w * channel;    // elements per input batch
  const int out_batch_stride = out_h * out_w * channel;  // elements per output batch
  for (int b = 0; b < batch_num; ++b) {
    const float *src = input_ptr + b * in_batch_stride;
    float *dst = output_ptr + b * out_batch_stride;
    int ret = AvgPoolingBatch(src, dst, pooling_param, task_id, minf, maxf);
    if (ret != NNACL_OK) {
      return ret;
    }
  }
  return NNACL_OK;
}
// Max pooling over a single NHWC batch. Work is tiled over the output plane
// (output_h * output_w) in TILE_NUM-sized chunks; this thread handles every
// thread_num_-th tile starting at task_id. Channels are processed SIMD-first
// (8-wide under AVX, then 4-wide under NEON/SSE) with a scalar tail, and every
// result is clamped to [minf, maxf].
// Returns NNACL_OK, or NNACL_ERR if output_w is zero.
int MaxPoolingBatch(const float *src_b_ptr, float *dst_b_ptr, const PoolingParameter *pooling_param, int task_id,
float minf, float maxf) {
int in_w = pooling_param->input_w_, in_h = pooling_param->input_h_;
int win_w = pooling_param->window_w_, win_h = pooling_param->window_h_;
int output_w = pooling_param->output_w_, output_h = pooling_param->output_h_;
int channel = pooling_param->input_channel_;
int out_plane = output_w * output_h;
int out_tile_count = UP_DIV(out_plane, TILE_NUM);
NNACL_CHECK_ZERO_RETURN_ERR(output_w);
#ifdef ENABLE_AVX
// Largest multiple of 8 channels handled by the AVX path.
int c8 = channel / C8NUM * C8NUM;
MS_FLOAT32X8 min_value_8 = MS_MOV256_F32(minf);
MS_FLOAT32X8 max_value_8 = MS_MOV256_F32(maxf);
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
// Largest multiple of 4 channels handled by the 128-bit SIMD path.
int c4 = channel / C4NUM * C4NUM;
MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf);
MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf);
#endif
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) {
int cal_start_index = thread_id * TILE_NUM;
// Last tile may be partial.
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
for (int i = 0; i < real_cal_num; i++) {
int index = cal_start_index + i;
int out_w_index = index % output_w;
int out_h_index = index / output_w;
// Top-left corner of the pooling window in input coordinates (may be
// negative because of padding).
int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_;
int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_;
const float *src_plane_ptr = src_b_ptr;
float *dst_plane_ptr = dst_b_ptr + index * channel;
// Clip the window to the valid input region (drop padded rows/cols).
int real_win_h_start = MSMAX(0, -in_h_index);
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
int real_win_w_start = MSMAX(0, -in_w_index);
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
int ci = 0;
#ifdef ENABLE_AVX
// 8-channel-wide max over the clipped window.
for (; ci < c8; ci += C8NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X8 tmp_max = MS_MOV256_F32(-FLT_MAX);
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = MS_MAX256_F32(tmp_max, MS_LD256_F32(src_win_ptr));
} // win_w loop
} // win_h loop
// Clamp to [minf, maxf] (fused activation bounds).
tmp_max = MS_MAX256_F32(tmp_max, min_value_8);
tmp_max = MS_MIN256_F32(tmp_max, max_value_8);
MS_ST256_F32(dst_c_ptr, tmp_max);
} // ic8 loop
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
// 4-channel-wide max for the remaining channels.
for (; ci < c4; ci += C4NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X4 tmp_max = MS_MOVQ_F32(-FLT_MAX);
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = MS_MAXQ_F32(tmp_max, MS_LDQ_F32(src_win_ptr));
} // win_w loop
} // win_h loop
tmp_max = MS_MAXQ_F32(tmp_max, min_value);
tmp_max = MS_MINQ_F32(tmp_max, max_value);
MS_STQ_F32(dst_c_ptr, tmp_max);
} // ic4 loop
#endif
// Scalar tail for channels not covered by SIMD.
for (; ci < channel; ci++) {
float *dst_c_ptr = dst_plane_ptr + ci;
const float *src_c_ptr = src_plane_ptr + ci;
float tmp_max = -FLT_MAX;
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = fmaxf(tmp_max, src_win_ptr[0]);
} // win_w loop
} // win_h loop
tmp_max = fmaxf(tmp_max, minf);
tmp_max = fminf(tmp_max, maxf);
dst_c_ptr[0] = tmp_max;
} // channel_res loop
} // real_cal_num loop
} // out_plane loop
return NNACL_OK;
}
int MaxPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id,
float minf, float maxf) {
int win_w = pooling_param->window_w_;
int win_h = pooling_param->window_h_;
int channel = pooling_param->input_channel_;
int in_w = pooling_param->input_w_;
int in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_;
int output_h = pooling_param->output_h_;
int channel = pooling_param->input_channel_;
int output_batch = pooling_param->output_batch_;
int out_plane = output_w * output_h;
int out_tile_count = UP_DIV(out_plane, TILE_NUM);
NNACL_CHECK_ZERO_RETURN_ERR(output_w);
#ifdef ENABLE_AVX
int c8 = channel / C8NUM * C8NUM;
MS_FLOAT32X8 min_value_8 = MS_MOV256_F32(minf);
MS_FLOAT32X8 max_value_8 = MS_MOV256_F32(maxf);
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
int c4 = channel / C4NUM * C4NUM;
MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf);
MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf);
#endif
for (int batch = 0; batch < output_batch; batch++) {
const float *src_b_ptr = input_ptr + batch * in_h * in_w * channel;
float *dst_b_ptr = output_ptr + batch * output_h * output_w * channel;
for (int thread_id = task_id; thread_id < out_tile_count; thread_id += pooling_param->thread_num_) {
int cal_start_index = thread_id * TILE_NUM;
int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
for (int i = 0; i < real_cal_num; i++) {
int index = cal_start_index + i;
int out_w_index = index % output_w;
int out_h_index = index / output_w;
int in_w_index = out_w_index * pooling_param->stride_w_ - pooling_param->pad_l_;
int in_h_index = out_h_index * pooling_param->stride_h_ - pooling_param->pad_u_;
const float *src_plane_ptr = src_b_ptr;
float *dst_plane_ptr = dst_b_ptr + index * channel;
int real_win_h_start = MSMAX(0, -in_h_index);
int real_win_h_end = MSMIN(win_h, in_h - in_h_index);
int real_win_w_start = MSMAX(0, -in_w_index);
int real_win_w_end = MSMIN(win_w, in_w - in_w_index);
int ci = 0;
#ifdef ENABLE_AVX
for (; ci < c8; ci += C8NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X8 tmp_max = MS_MOV256_F32(-FLT_MAX);
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = MS_MAX256_F32(tmp_max, MS_LD256_F32(src_win_ptr));
} // win_w loop
} // win_h loop
tmp_max = MS_MAX256_F32(tmp_max, min_value_8);
tmp_max = MS_MIN256_F32(tmp_max, max_value_8);
MS_ST256_F32(dst_c_ptr, tmp_max);
} // ic8 loop
#endif
#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
for (; ci < c4; ci += C4NUM) {
const float *src_c_ptr = src_plane_ptr + ci;
float *dst_c_ptr = dst_plane_ptr + ci;
MS_FLOAT32X4 tmp_max = MS_MOVQ_F32(-FLT_MAX);
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = MS_MAXQ_F32(tmp_max, MS_LDQ_F32(src_win_ptr));
} // win_w loop
} // win_h loop
tmp_max = MS_MAXQ_F32(tmp_max, min_value);
tmp_max = MS_MINQ_F32(tmp_max, max_value);
MS_STQ_F32(dst_c_ptr, tmp_max);
} // ic4 loop
#endif
for (; ci < channel; ci++) {
float *dst_c_ptr = dst_plane_ptr + ci;
const float *src_c_ptr = src_plane_ptr + ci;
float tmp_max = -FLT_MAX;
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
tmp_max = fmaxf(tmp_max, src_win_ptr[0]);
} // win_w loop
} // win_h loop
tmp_max = fmaxf(tmp_max, minf);
tmp_max = fminf(tmp_max, maxf);
dst_c_ptr[0] = tmp_max;
} // channel_res loop
} // real_cal_num loop
} // out_plane loop
} // out_batch loop
int ret = MaxPoolingBatch(src_b_ptr, dst_b_ptr, pooling_param, task_id, minf, maxf);
if (ret != NNACL_OK) {
return ret;
}
}
return NNACL_OK;
}

View File

@ -76,6 +76,7 @@
#define THIRD_INPUT 2
#define FOURTH_INPUT 3
#define FIFTH_INPUT 4
#define SIXTH_INPUT 5
#define DIMENSION_1D 1
#define DIMENSION_2D 2

View File

@ -165,6 +165,10 @@ int LiteSession::ConvertTensorsData(const lite::Model *model, size_t tensor_inde
auto ret = DecompressTensor(*src_tensor, dst_tensor);
if (ret == RET_NO_CHANGE) {
if (src_tensor->data()->size() < dst_tensor->Size()) {
MS_LOG(ERROR) << "Tensor data shape invalid";
return RET_ERROR;
}
dst_tensor->set_data(const_cast<unsigned char *>(src_tensor->data()->data()));
dst_tensor->set_own_data(false);
} else if (ret != RET_OK) {

View File

@ -28,12 +28,32 @@ namespace mindspore::kernel {
namespace {
constexpr size_t kPadCommonInputSize = 2;
} // namespace
int PadFp16CPUKernel::RunImpl(int task_id) {
int PadFp16CPUKernel::RunImpl(int task_id) const {
PadFp16(input_, output_, in_, out_, pad_param_->paddings_, task_id, op_parameter_->thread_num_);
return RET_OK;
}
int PadFp16CPUKernel::RunMirrorPadImpl(int task_id) {
void PadFp16CPUKernel::RunMirrorPadImplFast(const MirrorPadBlock &block, const float16_t *input_data,
                                            float16_t *output_data) const {
  // Walk the 5-D output region described by `block`; at the innermost level,
  // mirror-pad one contiguous run of block.size_[5] elements starting at the
  // accumulated output offset.
  for (int i0 = 0; i0 < block.size_[0]; ++i0) {
    const int base0 = block.out_offset_ + i0 * block.out_stride_[0];
    for (int i1 = 0; i1 < block.size_[1]; ++i1) {
      const int base1 = base0 + i1 * block.out_stride_[1];
      for (int i2 = 0; i2 < block.size_[2]; ++i2) {
        const int base2 = base1 + i2 * block.out_stride_[2];
        for (int i3 = 0; i3 < block.size_[3]; ++i3) {
          const int base3 = base2 + i3 * block.out_stride_[3];
          for (int i4 = 0; i4 < block.size_[4]; ++i4) {
            const int out_index = base3 + i4 * block.out_stride_[4];
            MirrorPadFp16(input_data, output_data, in_, pad_param_, out_index, out_index + block.size_[5]);
          }
        }
      }
    }
  }
}
int PadFp16CPUKernel::RunMirrorPadImpl(int task_id) const {
auto input = in_tensors_.at(0);
CHECK_NULL_RETURN(input);
auto output = out_tensors_.at(0);
@ -51,23 +71,7 @@ int PadFp16CPUKernel::RunMirrorPadImpl(int task_id) {
/* calculate region part */
for (size_t i = task_id; i < mirror_pad_block_.size(); i += op_parameter_->thread_num_) {
auto block = mirror_pad_block_[i];
for (int a = 0; a < block.size_[0]; a++) {
int out_a_index = block.out_offset_ + a * block.out_stride_[0];
for (int b = 0; b < block.size_[1]; b++) {
int out_b_index = out_a_index + b * block.out_stride_[1];
for (int c = 0; c < block.size_[2]; ++c) {
int out_c_index = out_b_index + c * block.out_stride_[2];
for (int d = 0; d < block.size_[3]; ++d) {
int out_d_index = out_c_index + d * block.out_stride_[3];
for (int e = 0; e < block.size_[4]; ++e) {
int output_index = out_d_index + e * block.out_stride_[4];
MirrorPadFp16(input_data, output_data, in_, pad_param_, output_index, output_index + block.size_[5]);
}
}
}
}
}
RunMirrorPadImplFast(block, input_data, output_data);
}
return RET_OK;
}

View File

@ -30,8 +30,11 @@ class PadFp16CPUKernel : public PadCPUKernel {
~PadFp16CPUKernel() {}
int Run() override;
int RunImpl(int task_id) override;
int RunMirrorPadImpl(int task_id) override;
int RunImpl(int task_id) const override;
int RunMirrorPadImpl(int task_id) const override;
private:
void RunMirrorPadImplFast(const MirrorPadBlock &block, const float16_t *input_data, float16_t *output_data) const;
private:
float16_t *input_ = nullptr;

View File

@ -25,7 +25,7 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_PReLUFusion;
namespace mindspore::kernel {
int PReluFp16CPUKernel::DoExcute(int task_id) {
int PReluFp16CPUKernel::DoExcute(int task_id) const {
int thread_num = param_->op_parameter_.thread_num_;
if (thread_num == 0) {
MS_LOG(ERROR) << "thread_num is 0!";

View File

@ -27,7 +27,7 @@ class PReluFp16CPUKernel : public PReluCPUKernel {
: PReluCPUKernel(parameter, inputs, outputs, ctx) {}
~PReluFp16CPUKernel() = default;
int DoExcute(int task_id) override;
int DoExcute(int task_id) const override;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_PRELU_FP16_H_

View File

@ -102,7 +102,7 @@ void GatherNdCPUKernel::InitOffset() {
}
}
int GatherNdCPUKernel::DoGatherNd(int task_id) {
int GatherNdCPUKernel::DoGatherNd(int task_id) const {
int count = MSMIN(thread_sz_stride_, count_ - task_id * thread_sz_stride_);
if (count <= 0) {
return RET_OK;
@ -116,8 +116,8 @@ int GatherNdCPUKernel::DoGatherNd(int task_id) {
return RET_OK;
}
int GatherNdRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto g_kernel = reinterpret_cast<GatherNdCPUKernel *>(cdata);
int GatherNdRun(const void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<const GatherNdCPUKernel *>(cdata);
auto ret = g_kernel->DoGatherNd(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "GatherNdRun error task_id[" << task_id << "] error_code[" << ret << "]";

View File

@ -37,7 +37,7 @@ class GatherNdCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoGatherNd(int task_id);
int DoGatherNd(int task_id) const;
private:
void InitOffset();

View File

@ -42,7 +42,7 @@ int GatherCPUKernel::Prepare() {
int GatherCPUKernel::ReSize() { return RET_OK; }
int GatherCPUKernel::DoGather(int task_id) {
int GatherCPUKernel::DoGather(int task_id) const {
auto input_tensor = in_tensors_.at(0);
auto indices_tensor = in_tensors_.at(1);
auto out_tensor = out_tensors_.at(0);
@ -81,8 +81,8 @@ int GatherCPUKernel::DoGather(int task_id) {
return error_code;
}
int GatherRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto gather_kernel = reinterpret_cast<GatherCPUKernel *>(cdata);
int GatherRun(const void *cdata, int task_id, float, float) {
auto gather_kernel = reinterpret_cast<const GatherCPUKernel *>(cdata);
auto error_code = gather_kernel->DoGather(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "GatherRun error task_id[" << task_id << "] error_code[" << error_code << "]";

View File

@ -34,7 +34,7 @@ class GatherCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoGather(int task_id);
int DoGather(int task_id) const;
private:
int *indices_data_ = nullptr;

View File

@ -35,7 +35,7 @@ int GluCPUKernel::MallocTmpBuffer() {
FreeTmpBuffer();
auto in_tensor = in_tensors_.front();
for (int i = 0; i < kSplitNum; i++) {
split_ptr_[i] = reinterpret_cast<int8_t *>(ms_context_->allocator->Malloc(in_tensor->Size() / kSplitNum));
split_ptr_[i] = ms_context_->allocator->Malloc(in_tensor->Size() / kSplitNum);
if (split_ptr_[i] == nullptr) {
MS_LOG(ERROR) << "GluCPUKernel malloc split ptr failed.";
return RET_ERROR;
@ -96,8 +96,7 @@ int GluCPUKernel::ReSize() {
return RET_OK;
}
int GluCPUKernel::Split(int task_id) {
input_ptr_ = in_tensors_.front()->data();
int GluCPUKernel::Split(int task_id) const {
MS_CHECK_INT_MUL_NOT_OVERFLOW(task_id, thread_n_stride_, RET_ERROR);
int num_unit_thread = MSMIN(thread_n_stride_, num_unit_ - task_id * thread_n_stride_);
if (num_unit_thread <= 0) {
@ -105,8 +104,8 @@ int GluCPUKernel::Split(int task_id) {
}
int thread_offset = task_id * thread_n_stride_;
auto ret =
DoSplit(input_ptr_, reinterpret_cast<void **>(split_ptr_.data()), in_tensors_.front()->shape().data(),
thread_offset, num_unit_thread, &split_param_, lite::DataTypeSize(in_tensors_.front()->data_type()));
DoSplit(input_ptr_, const_cast<void **>(split_ptr_.data()), in_tensors_.front()->shape().data(), thread_offset,
num_unit_thread, &split_param_, lite::DataTypeSize(in_tensors_.front()->data_type()));
if (ret != RET_OK) {
MS_LOG(ERROR) << "Split error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR;
@ -114,7 +113,7 @@ int GluCPUKernel::Split(int task_id) {
return RET_OK;
}
int GluCPUKernel::Sigmoid(int task_id) {
int GluCPUKernel::Sigmoid(int task_id) const {
auto input_addr = reinterpret_cast<float *>(split_ptr_.at(1));
auto output_addr = reinterpret_cast<float *>(sigmoid_ptr_);
auto length = in_tensors_.at(0)->ElementsNum() / kGluBranchNum;
@ -128,7 +127,7 @@ int GluCPUKernel::Sigmoid(int task_id) {
return ::Sigmoid(input_addr + stride * task_id, count, output_addr + stride * task_id);
}
int GluCPUKernel::Mul(int task_id) {
int GluCPUKernel::Mul(int task_id) const {
auto input_addr0 = reinterpret_cast<float *>(split_ptr_.at(0));
auto input_addr1 = reinterpret_cast<float *>(sigmoid_ptr_);
auto output_addr = reinterpret_cast<float *>(out_tensors_.at(0)->data());
@ -144,22 +143,24 @@ int GluCPUKernel::Mul(int task_id) {
return ElementMul(input_addr0 + offset, input_addr1 + offset, output_addr + offset, count);
}
static int SplitRun(void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<GluCPUKernel *>(cdata);
static int SplitRun(const void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<const GluCPUKernel *>(cdata);
return g_kernel->Split(task_id);
}
static int SigmoidRun(void *cdata, int task_id, float, float) {
auto activation_kernel = reinterpret_cast<GluCPUKernel *>(cdata);
static int SigmoidRun(const void *cdata, int task_id, float, float) {
auto activation_kernel = reinterpret_cast<const GluCPUKernel *>(cdata);
return activation_kernel->Sigmoid(task_id);
}
static int MulRun(void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<GluCPUKernel *>(cdata);
static int MulRun(const void *cdata, int task_id, float, float) {
auto g_kernel = reinterpret_cast<const GluCPUKernel *>(cdata);
return g_kernel->Mul(task_id);
}
int GluCPUKernel::Run() {
input_ptr_ = in_tensors_.front()->data();
auto ret = MallocTmpBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Malloc tmp buffer failed";

View File

@ -43,9 +43,11 @@ class GluCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int Split(int task_id);
int Sigmoid(int task_id);
int Mul(int task_id);
int Split(int task_id) const;
int Sigmoid(int task_id) const;
int Mul(int task_id) const;
private:
void FreeTmpBuffer();
int MallocTmpBuffer();
@ -54,8 +56,8 @@ class GluCPUKernel : public InnerKernel {
GluParameter *glu_param_ = nullptr;
void *input_ptr_ = nullptr;
int8_t *sigmoid_ptr_ = nullptr;
std::vector<int8_t *> split_ptr_;
int split_sizes_[kSplitNum];
std::vector<void *> split_ptr_;
int split_sizes_[kSplitNum] = {0};
int thread_n_stride_ = 0;
int usable_thread_num_ = 0;
int num_unit_ = 0;

View File

@ -50,7 +50,7 @@ class GruCPUKernel : public InnerKernel {
const int weight_r_index = 2;
const int bias_index = 3;
float *buffer_[4];
float *buffer_[4] = {nullptr};
const int gate_num = 3;
const int packed_input_index = 0;
const int input_gate_index = 1;

View File

@ -45,7 +45,7 @@ int InstanceNormCPUKernel::ReSize() {
return RET_OK;
}
int InstanceNormCPUKernel::DoInstanceNorm(int task_id) {
int InstanceNormCPUKernel::DoInstanceNorm(int task_id) const {
int ret = 0;
if (in_tensors_[0]->format() == NC4HW4) { // arm64 x86-avx x86-sse x86
#ifdef ENABLE_AVX
@ -63,8 +63,8 @@ int InstanceNormCPUKernel::DoInstanceNorm(int task_id) {
return RET_OK;
}
int InstanceNormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<InstanceNormCPUKernel *>(cdata);
int InstanceNormRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const InstanceNormCPUKernel *>(cdata);
auto ret = kernel->DoInstanceNorm(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "InstanceNormRun error task_id[" << task_id << "] error_code[" << ret << "]";

View File

@ -35,7 +35,9 @@ class InstanceNormCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoInstanceNorm(int task_id);
int DoInstanceNorm(int task_id) const;
private:
void FreeTmpBuffer() {
if (tmp_src_data_ != nullptr) {
ms_context_->allocator->Free(tmp_src_data_);

View File

@ -89,7 +89,7 @@ int L2NormCPUKernel::ReSize() {
return RET_OK;
}
int L2NormCPUKernel::CalcSquareSum(int task_id) {
int L2NormCPUKernel::CalcSquareSum(int task_id) const {
int unit = UP_DIV(l2_norm_param_->data_num_, op_parameter_->thread_num_);
if (INT_MUL_OVERFLOW(task_id, unit)) {
MS_LOG(ERROR) << "int mul overflow.";
@ -100,7 +100,7 @@ int L2NormCPUKernel::CalcSquareSum(int task_id) {
return CalcThreadSquareSum(input_ptr_, tmp_sum_ + task_id, begin, end);
}
int L2NormCPUKernel::DivSqrtSum(int task_id) {
int L2NormCPUKernel::DivSqrtSum(int task_id) const {
int unit = UP_DIV(l2_norm_param_->data_num_, op_parameter_->thread_num_);
if (INT_MUL_OVERFLOW(task_id, unit)) {
MS_LOG(ERROR) << "int mul overflow.";
@ -111,7 +111,7 @@ int L2NormCPUKernel::DivSqrtSum(int task_id) {
return ThreadDivSqrtSum(input_ptr_, output_ptr_, l2_norm_param_, sqrt_sum_, begin, end);
}
int L2NormCPUKernel::CalcL2NormTrailingAxis(int task_id) {
int L2NormCPUKernel::CalcL2NormTrailingAxis(int task_id) const {
auto input = in_tensors_.at(0);
if (input->shape().back() == 0) {
MS_LOG(ERROR) << "input->shape().back() is 0";
@ -128,8 +128,8 @@ int L2NormCPUKernel::CalcL2NormTrailingAxis(int task_id) {
return ThreadTrailingAxis(input_ptr_, output_ptr_, l2_norm_param_, begin, end);
}
int SquareSumRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<L2NormCPUKernel *>(cdata);
int SquareSumRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const L2NormCPUKernel *>(cdata);
auto ret = kernel->CalcSquareSum(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm SquareSumRun error task_id[" << task_id << "] error_code[" << ret << "]";
@ -138,9 +138,9 @@ int SquareSumRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
return RET_OK;
}
int L2NormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
int L2NormRun(const void *cdata, int task_id, float, float) {
CHECK_NULL_RETURN(cdata);
auto kernel = reinterpret_cast<L2NormCPUKernel *>(cdata);
auto kernel = reinterpret_cast<const L2NormCPUKernel *>(cdata);
auto ret = kernel->DivSqrtSum(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm L2NormRun error task_id[" << task_id << "] error_code[" << ret << "]";
@ -149,9 +149,9 @@ int L2NormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
return RET_OK;
}
int L2NormTrailingAxisRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
int L2NormTrailingAxisRun(const void *cdata, int task_id, float, float) {
CHECK_NULL_RETURN(cdata);
auto kernel = reinterpret_cast<L2NormCPUKernel *>(cdata);
auto kernel = reinterpret_cast<const L2NormCPUKernel *>(cdata);
auto ret = kernel->CalcL2NormTrailingAxis(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "L2Norm TrailingAxisRun error task_id[" << task_id << "] error_code[" << ret << "]";

View File

@ -36,9 +36,9 @@ class L2NormCPUKernel : public InnerKernel {
}
~L2NormCPUKernel() { FreeTmpBuffer(); }
int CalcSquareSum(int task_id);
int DivSqrtSum(int task_id);
int CalcL2NormTrailingAxis(int task_id);
int CalcSquareSum(int task_id) const;
int DivSqrtSum(int task_id) const;
int CalcL2NormTrailingAxis(int task_id) const;
int Prepare() override;
int ReSize() override;

View File

@ -65,7 +65,7 @@ int LayerNormCPUKernel::ReSize() {
return RET_OK;
}
int LayerNormCPUKernel::DoLayerNorm(int thread_id) {
int LayerNormCPUKernel::DoLayerNorm(int thread_id) const {
auto ret = LayerNorm(src_data_, gamma_data_, beta_data_, dst_data_, mean_data_, var_data_, param_, thread_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "DoLayerNorm error error_code[" << ret << "]";
@ -74,8 +74,8 @@ int LayerNormCPUKernel::DoLayerNorm(int thread_id) {
return RET_OK;
}
int LayerNormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<LayerNormCPUKernel *>(cdata);
int LayerNormRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const LayerNormCPUKernel *>(cdata);
CHECK_NULL_RETURN(kernel);
auto ret = kernel->DoLayerNorm(task_id);
if (ret != RET_OK) {

View File

@ -35,7 +35,7 @@ class LayerNormCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoLayerNorm(int thread_id);
int DoLayerNorm(int thread_id) const;
private:
LayerNormParameter *param_ = nullptr;

View File

@ -35,7 +35,7 @@ int LocalResponseNormCPUKernel::Prepare() {
int LocalResponseNormCPUKernel::ReSize() { return RET_OK; }
int LocalResponseNormCPUKernel::DoLocalResponseNorm(int task_id) {
int LocalResponseNormCPUKernel::DoLocalResponseNorm(int task_id) const {
auto input_tensor = in_tensors_.front();
auto out_tensor = out_tensors_.front();
auto input_ptr = reinterpret_cast<float *>(input_tensor->MutableData());
@ -67,8 +67,8 @@ int LocalResponseNormCPUKernel::DoLocalResponseNorm(int task_id) {
return RET_OK;
}
int LocalResponseNormRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto lrn = reinterpret_cast<LocalResponseNormCPUKernel *>(cdata);
int LocalResponseNormRun(const void *cdata, int task_id, float, float) {
auto lrn = reinterpret_cast<const LocalResponseNormCPUKernel *>(cdata);
auto error_code = lrn->DoLocalResponseNorm(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "LocalResponseNormRun error task_id[" << task_id << "] error_code[" << error_code << "]";

View File

@ -32,7 +32,7 @@ class LocalResponseNormCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoLocalResponseNorm(int task_id);
int DoLocalResponseNorm(int task_id) const;
private:
int thread_count_;

View File

@ -79,7 +79,7 @@ int LogSoftmaxCPUKernel::ReSize() {
return RET_OK;
}
int LogSoftmaxCPUKernel::DoLogSoftmaxLastAxis(int task_id) {
int LogSoftmaxCPUKernel::DoLogSoftmaxLastAxis(int task_id) const {
MS_CHECK_FALSE(op_parameter_->thread_num_ == 0, RET_ERROR);
int unit = UP_DIV(out_plane_size_, op_parameter_->thread_num_);
int begin = task_id * unit;
@ -94,8 +94,8 @@ int LogSoftmaxCPUKernel::DoLogSoftmaxLastAxis(int task_id) {
return RET_OK;
}
int LogSoftmaxLastAxisRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<LogSoftmaxCPUKernel *>(cdata);
int LogSoftmaxLastAxisRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const LogSoftmaxCPUKernel *>(cdata);
CHECK_NULL_RETURN(kernel);
auto ret = kernel->DoLogSoftmaxLastAxis(task_id);
if (ret != RET_OK) {

View File

@ -32,7 +32,7 @@ class LogSoftmaxCPUKernel : public SoftmaxBaseCPUKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int DoLogSoftmaxLastAxis(int task_id);
int DoLogSoftmaxLastAxis(int task_id) const;
private:
float *tmp_data_ = nullptr;

View File

@ -322,7 +322,7 @@ int LstmCPUKernel::MallocRunBuffer() {
return RET_OK;
}
void LstmCPUKernel::InputWeightMatMul(int task_id) {
void LstmCPUKernel::InputWeightMatMul(int task_id) const {
int current_start_oc = task_id * input_thread_stride_ * col_tile_;
int current_rest_oc = 0;
current_rest_oc = lstm_param_->hidden_size_ - current_start_oc;
@ -339,8 +339,8 @@ void LstmCPUKernel::InputWeightMatMul(int task_id) {
cur_oc, lstm_param_->hidden_size_, OutType_Nhwc);
}
int LstmInputMulWeightRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto kernel = reinterpret_cast<LstmCPUKernel *>(cdata);
int LstmInputMulWeightRun(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const LstmCPUKernel *>(cdata);
CHECK_NULL_RETURN(kernel);
kernel->InputWeightMatMul(task_id);
return RET_OK;

View File

@ -36,7 +36,7 @@ class LstmCPUKernel : public InnerKernel {
int ReSize() override;
int Run() override;
void InputWeightMatMul(int task_id);
void InputWeightMatMul(int task_id) const;
private:
void FreeTmpBuffer();
@ -50,9 +50,9 @@ class LstmCPUKernel : public InnerKernel {
const float *state_bias, float *hidden_state, float *cell_state, bool is_backward);
int InnerExecute(float *output, const float *input, float *hidden_state, float *cell_state);
void RecordStates(const float *cell_state, int step);
const float *weight_loop_;
const float *bias_loop_;
float *gate_loop_;
const float *weight_loop_ = nullptr;
const float *bias_loop_ = nullptr;
float *gate_loop_ = nullptr;
int input_thread_count_ = 0;
int input_thread_stride_ = 0;
@ -64,7 +64,7 @@ class LstmCPUKernel : public InnerKernel {
const int weight_h_index = 2;
const int bias_index = 3;
float *buffer_[7];
float *buffer_[7] = {nullptr};
const int gate_num = 4;
const int packed_input_index = 0;
const int input_gate_index = 1;

View File

@ -21,9 +21,9 @@
using mindspore::lite::RET_NULL_PTR;
namespace mindspore::kernel {
int MatmulBaseFloatRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
int MatmulBaseFloatRun(const void *cdata, int task_id, float, float) {
CHECK_NULL_RETURN(cdata);
auto op = reinterpret_cast<MatmulFp32BaseCPUKernel *>(cdata);
auto op = reinterpret_cast<const MatmulFp32BaseCPUKernel *>(cdata);
auto error_code = op->FloatRun(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]";
@ -126,32 +126,44 @@ int MatmulFp32BaseCPUKernel::CalBroadCastBiasDataElements() {
}
int MatmulFp32BaseCPUKernel::InitBiasData() {
if (in_tensors_.size() == 3) {
auto bias_tensor = in_tensors_[2];
size_t max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), col_tile_);
// malloc addr need to aligned to 32 bytes
if (in_tensors_.size() != FOURTH_INPUT) {
return RET_OK;
}
auto bias_tensor = in_tensors_[THIRD_INPUT];
if (bias_tensor == nullptr) {
MS_LOG(ERROR) << "bias_tensor invalid";
return RET_ERROR;
}
if (bias_tensor->ElementsNum() == 1) {
// broadcast bias data
size_t max_bias_data = CalBroadCastBiasDataElements();
bias_ptr_ = reinterpret_cast<float *>(malloc(max_bias_data * static_cast<int>(sizeof(float))));
if (bias_ptr_ == nullptr) {
MS_LOG(ERROR) << "malloc bias_ptr_ failed";
return RET_ERROR;
}
// whether to broadcast bias data
if (bias_tensor->ElementsNum() == 1) {
max_bias_data = CalBroadCastBiasDataElements();
float broadcast_data = (reinterpret_cast<float *>(bias_tensor->data()))[0];
// broadcast bias data
for (size_t i = 0; i < max_bias_data; ++i) {
bias_ptr_[i] = broadcast_data;
}
} else {
memset(bias_ptr_, 0, max_bias_data * static_cast<int>(sizeof(float)));
memcpy(bias_ptr_, bias_tensor->data(), bias_tensor->ElementsNum() * static_cast<int>(sizeof(float)));
float broadcast_data = (reinterpret_cast<float *>(bias_tensor->data()))[0];
// broadcast bias data
for (size_t i = 0; i < max_bias_data; ++i) {
bias_ptr_[i] = broadcast_data;
}
return RET_OK;
}
size_t max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), col_tile_);
// malloc addr need to aligned to 32 bytes
bias_ptr_ = reinterpret_cast<float *>(malloc(max_bias_data * static_cast<int>(sizeof(float))));
if (bias_ptr_ == nullptr) {
MS_LOG(ERROR) << "malloc bias_ptr_ failed";
return RET_ERROR;
}
memset(bias_ptr_, 0, max_bias_data * static_cast<int>(sizeof(float)));
memcpy(bias_ptr_, bias_tensor->data(), bias_tensor->ElementsNum() * static_cast<int>(sizeof(float)));
return RET_OK;
}
int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) {
int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) const {
CHECK_NULL_RETURN(src_ptr);
#ifdef ENABLE_ARM64
if (vec_matmul_) {
@ -175,7 +187,7 @@ int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) {
return RET_OK;
}
int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) {
int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) const {
CHECK_NULL_RETURN(src_ptr);
for (int i = 0; i < params_->batch; i++) {
const float *src = src_ptr + i * params_->deep_ * params_->col_;

View File

@ -47,8 +47,8 @@ class MatmulFp32BaseCPUKernel : public InnerKernel {
protected:
int InitBufferA();
int InitBufferB();
int InitMatrixA(const float *src_ptr);
int InitMatrixB(const float *src_ptr);
int InitMatrixA(const float *src_ptr) const;
int InitMatrixB(const float *src_ptr) const;
void FreeBiasBuf();
int InitBiasData();
void InitParameter();

View File

@ -110,7 +110,7 @@ void ExpandDims(std::vector<int> *shape, size_t size) {
}
int NonMaxSuppressionCPUKernel::Run_Selecte(bool simple_out, int box_num, int batch_num, int class_num,
float *scores_data, float *box_data) {
const float *scores_data, const float *box_data) {
std::vector<NMSBox> selected_box_per_class;
selected_box_per_class.reserve(std::min(static_cast<int32_t>(box_num), max_output_per_class_));
std::vector<NMSIndex> selected_index;
@ -119,8 +119,8 @@ int NonMaxSuppressionCPUKernel::Run_Selecte(bool simple_out, int box_num, int ba
int batch_offset = i * class_num * box_num;
for (auto j = 0; j < class_num; ++j) {
// per batch per class filter
float *per_class_scores = scores_data + batch_offset + j * box_num;
float *box = box_data + i * box_num * kBoxPointNum;
const float *per_class_scores = scores_data + batch_offset + j * box_num;
const float *box = box_data + i * box_num * kBoxPointNum;
std::vector<NMSBox> above_score_candidates;
above_score_candidates.reserve(box_num);
for (auto k = 0; k < box_num; ++k) {

View File

@ -41,7 +41,8 @@ class NonMaxSuppressionCPUKernel : public InnerKernel {
private:
int GetParams();
int Run_Selecte(bool simple_out, int box_num, int batch_num, int class_num, float *scores_data, float *box_data);
int Run_Selecte(bool simple_out, int box_num, int batch_num, int class_num, const float *scores_data,
const float *box_data);
private:
int center_point_box_ = 0;

View File

@ -206,8 +206,8 @@ int PadCPUKernel::ExtendPaddings(int *paddings, int length, const int *ori_paddi
return RET_OK;
}
int PadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto padKernel = reinterpret_cast<PadCPUKernel *>(cdata);
int PadImpl(const void *cdata, int task_id, float, float) {
auto padKernel = reinterpret_cast<const PadCPUKernel *>(cdata);
int error_code = padKernel->RunImpl(task_id);
if (error_code != NNACL_OK) {
MS_LOG(ERROR) << "Pad Run error task_id[" << task_id << "] error_code[" << error_code << "]";
@ -216,7 +216,7 @@ int PadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
return RET_OK;
}
int PadCPUKernel::RunImpl(int task_id) {
int PadCPUKernel::RunImpl(int task_id) const {
auto input = in_tensors_.at(0);
auto output = out_tensors_.at(0);
auto input_data = reinterpret_cast<float *>(input->data());
@ -228,8 +228,8 @@ int PadCPUKernel::RunImpl(int task_id) {
return RET_OK;
}
int MirrorPadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto padKernel = reinterpret_cast<PadCPUKernel *>(cdata);
int MirrorPadImpl(const void *cdata, int task_id, float, float) {
auto padKernel = reinterpret_cast<const PadCPUKernel *>(cdata);
int error_code = padKernel->RunMirrorPadImpl(task_id);
if (error_code != NNACL_OK) {
MS_LOG(ERROR) << "Pad Run error task_id[" << task_id << "] error_code[" << error_code << "]";
@ -238,7 +238,27 @@ int MirrorPadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
return RET_OK;
}
int PadCPUKernel::RunMirrorPadImpl(int task_id) {
void PadCPUKernel::RunMirrorPadImplFast(const MirrorPadBlock &block, const float *input_data,
float *output_data) const {
for (int a = 0; a < block.size_[FIRST_INPUT]; a++) {
int out_a_index = block.out_offset_ + a * block.out_stride_[FIRST_INPUT];
for (int b = 0; b < block.size_[SECOND_INPUT]; b++) {
int out_b_index = out_a_index + b * block.out_stride_[SECOND_INPUT];
for (int c = 0; c < block.size_[THIRD_INPUT]; ++c) {
int out_c_index = out_b_index + c * block.out_stride_[THIRD_INPUT];
for (int d = 0; d < block.size_[FOURTH_INPUT]; ++d) {
int out_d_index = out_c_index + d * block.out_stride_[FOURTH_INPUT];
for (int e = 0; e < block.size_[FIFTH_INPUT]; ++e) {
int output_index = out_d_index + e * block.out_stride_[FIFTH_INPUT];
MirrorPad(input_data, output_data, in_, pad_param_, output_index, output_index + block.size_[SIXTH_INPUT]);
}
}
}
}
}
}
int PadCPUKernel::RunMirrorPadImpl(int task_id) const {
auto input = in_tensors_.at(0);
auto output = out_tensors_.at(0);
auto input_data = reinterpret_cast<float *>(input->data());
@ -253,23 +273,7 @@ int PadCPUKernel::RunMirrorPadImpl(int task_id) {
/* calculate region part */
for (size_t i = task_id; i < mirror_pad_block_.size(); i += static_cast<size_t>(op_parameter_->thread_num_)) {
auto block = mirror_pad_block_[i];
for (int a = 0; a < block.size_[0]; a++) {
int out_a_index = block.out_offset_ + a * block.out_stride_[0];
for (int b = 0; b < block.size_[1]; b++) {
int out_b_index = out_a_index + b * block.out_stride_[1];
for (int c = 0; c < block.size_[2]; ++c) {
int out_c_index = out_b_index + c * block.out_stride_[2];
for (int d = 0; d < block.size_[3]; ++d) {
int out_d_index = out_c_index + d * block.out_stride_[3];
for (int e = 0; e < block.size_[4]; ++e) {
int output_index = out_d_index + e * block.out_stride_[4];
MirrorPad(input_data, output_data, in_, pad_param_, output_index, output_index + block.size_[5]);
}
}
}
}
}
RunMirrorPadImplFast(block, input_data, output_data);
}
return RET_OK;
}

View File

@ -41,8 +41,8 @@ class PadCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
virtual int RunImpl(int task_id);
virtual int RunMirrorPadImpl(int task_id);
virtual int RunImpl(int task_id) const;
virtual int RunMirrorPadImpl(int task_id) const;
private:
int CheckPaddings(const int *paddings, int length, const int *input_shape, int mode);
@ -50,6 +50,7 @@ class PadCPUKernel : public InnerKernel {
int ExtendShape(int *shape, int length, const int *ori_shape, int rank) const;
int ExtendPaddings(int *paddings, int length, const int *ori_paddings, int ori_length) const;
void InitMirrorPadBlock();
void RunMirrorPadImplFast(const MirrorPadBlock &block, const float *input_data, float *output_data) const;
protected:
int HandleMirrorPad();
@ -60,8 +61,8 @@ class PadCPUKernel : public InnerKernel {
std::vector<MirrorPadBlock> mirror_pad_block_;
};
int PadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale);
int MirrorPadImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale);
int PadImpl(const void *cdata, int task_id, float, float);
int MirrorPadImpl(const void *cdata, int task_id, float, float);
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_PAD_H_

View File

@ -50,7 +50,7 @@ int PoolingCPUKernel::ReSize() {
return RET_OK;
}
int PoolingCPUKernel::RunImpl(int task_id) {
int PoolingCPUKernel::RunImpl(int task_id) const {
auto input_ptr = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->MutableData());
CHECK_NULL_RETURN(input_ptr);
auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData());
@ -76,8 +76,8 @@ int PoolingCPUKernel::RunImpl(int task_id) {
return RET_OK;
}
int PoolingImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto pooling = reinterpret_cast<PoolingCPUKernel *>(cdata);
int PoolingImpl(const void *cdata, int task_id, float, float) {
auto pooling = reinterpret_cast<const PoolingCPUKernel *>(cdata);
auto error_code = pooling->RunImpl(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Pooling Run error task_id[" << task_id << "] error_code[" << error_code << "]";

View File

@ -32,7 +32,7 @@ class PoolingCPUKernel : public PoolingBaseCPUKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int RunImpl(int task_id);
int RunImpl(int task_id) const;
private:
};

View File

@ -33,9 +33,8 @@ int PowerCPUKernel::Prepare() {
int PowerCPUKernel::ReSize() { return RET_OK; }
int PowerImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
CHECK_NULL_RETURN(cdata);
auto kernel = reinterpret_cast<PowerCPUKernel *>(cdata);
int PowerImpl(const void *cdata, int task_id, float, float) {
auto kernel = reinterpret_cast<const PowerCPUKernel *>(cdata);
CHECK_NULL_RETURN(kernel);
auto ret = kernel->RunImpl(task_id);
if (ret != RET_OK) {
@ -54,7 +53,7 @@ int PowerCPUKernel::Run() {
return RET_OK;
}
int PowerCPUKernel::RunImpl(int task_id) {
int PowerCPUKernel::RunImpl(int task_id) const {
auto x_addr = reinterpret_cast<float *>(in_tensors_.at(0)->MutableData());
CHECK_NULL_RETURN(x_addr);
auto output_addr = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData());

View File

@ -36,7 +36,7 @@ class PowerCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
int RunImpl(int task_id);
int RunImpl(int task_id) const;
private:
int thread_count_;

View File

@ -27,8 +27,8 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_PReLUFusion;
namespace mindspore::kernel {
static int PReluRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto PRelu = reinterpret_cast<PReluCPUKernel *>(cdata);
static int PReluRun(const void *cdata, int task_id, float, float) {
auto PRelu = reinterpret_cast<const PReluCPUKernel *>(cdata);
auto ret = PRelu->DoExcute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "PReluRun error task_id[" << task_id << "] error_code[" << ret << "]";
@ -55,7 +55,7 @@ int PReluCPUKernel::Prepare() {
return ReSize();
}
int PReluCPUKernel::DoExcute(int task_id) {
int PReluCPUKernel::DoExcute(int task_id) const {
int thread_num = param_->op_parameter_.thread_num_;
if (thread_num == 0) {
MS_LOG(ERROR) << "thread_num is 0!";

View File

@ -34,7 +34,7 @@ class PReluCPUKernel : public InnerKernel {
int Prepare() override;
int ReSize() override;
int Run() override;
virtual int DoExcute(int task_id);
virtual int DoExcute(int task_id) const;
protected:
PReluParameter *param_;

View File

@ -84,7 +84,7 @@ void Nc4hw4PassReplace(std::vector<kernel::LiteKernel *> *kernels, std::vector<T
return;
}
bool Nc4hw4PassMatch(std::vector<kernel::LiteKernel *> *kernels, size_t index) {
bool Nc4hw4PassMatch(const std::vector<kernel::LiteKernel *> *kernels, size_t index) {
kernel::LiteKernel *start_kernel = kernels->at(index);
if (IsContain(Nc4hw4FormatOutOpList, start_kernel->type()) == false) {
return false;
@ -179,7 +179,7 @@ void Nc4hw4PassAct(std::vector<kernel::LiteKernel *> *kernels, std::vector<Tenso
return;
}
void ConvNormC4PassActReplace(kernel::LiteKernel *conv_op, kernel::LiteKernel *in_op) {
void ConvNormC4PassActReplace(const kernel::LiteKernel *conv_op, const kernel::LiteKernel *in_op) {
conv_op->out_tensors().front()->set_format(NC4HW4);
in_op->in_tensors().front()->set_format(NC4HW4);
}