diff --git a/mindspore/lite/nnacl/fp32/conv.c b/mindspore/lite/nnacl/fp32/conv.c
index 04e3e9115f..9db14f8b33 100644
--- a/mindspore/lite/nnacl/fp32/conv.c
+++ b/mindspore/lite/nnacl/fp32/conv.c
@@ -221,7 +221,6 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int unit_size = kernel_plane * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * TILE_NUM * unit_size;
   bool relu = conv_param->act_type_ == ActType_Relu;
   bool relu6 = conv_param->act_type_ == ActType_Relu6;
 
@@ -232,13 +231,14 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
   size_t output_offset = out_channel * sizeof(float);
 
   for (int b = 0; b < in_batch; b++) {
-    int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
+    int in_batch_offset = b * in_channel * in_h * in_w;
     int out_batch_offset = b * out_channel * out_h * out_w;
-    int gemm_in_batch_offset = b * packed_input_size;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
       int start_index = thread_id * TILE_NUM;
       int real_cal_num = (output_count - start_index) < TILE_NUM ? (output_count - start_index) : TILE_NUM;
-      float *gemm_input = packed_input + thread_id * unit_size * TILE_NUM + gemm_in_batch_offset;
+      float *gemm_input = packed_input + task_id * unit_size * TILE_NUM;
+      size_t packed_input_size = unit_size * TILE_NUM * sizeof(float);
+      memset(gemm_input, 0, packed_input_size);
       Im2ColPackUnitFp32(input_data + in_batch_offset, conv_param, gemm_input, real_cal_num, start_index);
 
       int out_offset = thread_id * TILE_NUM * out_channel + out_batch_offset;
@@ -291,7 +291,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
   // step 1 : filter transform (pre-processed offline)
   // step 2 : input transform (online)
   for (int b = 0; b < in_batch; b++) {
-    int in_batch_offset = b * ic4 * C4NUM * conv_param->input_h_ * conv_param->input_w_;
+    int in_batch_offset = b * in_channel * conv_param->input_h_ * conv_param->input_w_;
     int out_batch_offset = b * out_channel * conv_param->output_w_ * conv_param->output_h_;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_num) {
       int out_tile_index = thread_id * tile_num;
@@ -322,144 +322,9 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
   }
 }
 
-void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel,
-                          int output_unit) {
-  int out_h_block_num = UP_DIV(height, output_unit);
-  int out_w_block_num = UP_DIV(width, output_unit);
-  int c4 = UP_DIV(channel, C4NUM);
-  int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
-  for (int b = 0; b < batch; b++) {
-    int src_batch_offset = b * c4 * c4_block;
-    int dst_batch_offset = b * height * width * channel;
-    for (int h = 0; h < height; h++) {
-      int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit);
-      int dst_h_offset = dst_batch_offset + h * width * channel;
-      for (int w = 0; w < width; w++) {
-        int src_w_offset = src_h_offset + w * C4NUM;
-        int dst_w_offset = dst_h_offset + w * channel;
-        for (int c = 0; c < c4 - 1; c++) {
-          int src_c4_offset = src_w_offset + c * c4_block;
-          int dst_c4_offset = dst_w_offset + c * C4NUM;
-#ifdef ENABLE_NEON
-          vst1q_f32(dst + dst_c4_offset, vld1q_f32(src + src_c4_offset));
-#else
-          for (int i = 0; i < C4NUM; ++i) {
-            dst[dst_c4_offset + i] = src[src_c4_offset + i];
-          }
-#endif
-        }
-        int c_res = channel - (c4 - 1) * C4NUM;
-        int src_c_res_offset = (c4 - 1) * c4_block;
-        int dst_c_res_offset = (c4 - 1) * C4NUM;
-        for (int c = 0; c < c_res; c++) {
-          int src_c4_res_offset = src_w_offset + src_c_res_offset + c;
-          int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c;
-          dst[dst_c4_res_offset] = src[src_c4_res_offset];
-        }
-      }
-    }
-  }
-}
-
-void UnPackWinogradReluOutput(const float *src, float *dst, int batch, int height, int width, int channel,
-                              int output_unit) {
-  int out_h_block_num = UP_DIV(height, output_unit);
-  int out_w_block_num = UP_DIV(width, output_unit);
-  int c4 = UP_DIV(channel, C4NUM);
-  int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
-  for (int b = 0; b < batch; b++) {
-    int src_batch_offset = b * c4 * c4_block;
-    int dst_batch_offset = b * height * width * channel;
-    for (int h = 0; h < height; h++) {
-      int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit);
-      int dst_h_offset = dst_batch_offset + h * width * channel;
-      for (int w = 0; w < width; w++) {
-        int src_w_offset = src_h_offset + w * C4NUM;
-        int dst_w_offset = dst_h_offset + w * channel;
-        for (int c = 0; c < c4 - 1; c++) {
-          int src_c4_offset = src_w_offset + c * c4_block;
-          int dst_c4_offset = dst_w_offset + c * C4NUM;
-#ifdef ENABLE_NEON
-          float32x4_t input_ptr = vld1q_f32(src + src_c4_offset);
-          float32x4_t zero = vdupq_n_f32(0);
-          input_ptr = vmaxq_f32(zero, input_ptr);
-          vst1q_f32(dst + dst_c4_offset, input_ptr);
-#else
-          for (int i = 0; i < C4NUM; ++i) {
-            float input_data = src[src_c4_offset + i];
-            input_data = input_data < 0 ? 0 : input_data;
-            dst[dst_c4_offset + i] = input_data;
-          }
-#endif
-        }
-        int c_res = channel - (c4 - 1) * C4NUM;
-        int src_c_res_offset = (c4 - 1) * c4_block;
-        int dst_c_res_offset = (c4 - 1) * C4NUM;
-        for (int c = 0; c < c_res; c++) {
-          int src_c4_res_offset = src_w_offset + src_c_res_offset + c;
-          int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c;
-          float input_data = src[src_c4_res_offset];
-          input_data = input_data < 0 ? 0 : input_data;
-          dst[dst_c4_res_offset] = input_data;
-        }
-      }
-    }
-  }
-}
-
-void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int height, int width, int channel,
-                               int output_unit) {
-  int out_h_block_num = UP_DIV(height, output_unit);
-  int out_w_block_num = UP_DIV(width, output_unit);
-  int c4 = UP_DIV(channel, C4NUM);
-  int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
-  for (int b = 0; b < batch; b++) {
-    int src_batch_offset = b * c4 * c4_block;
-    int dst_batch_offset = b * height * width * channel;
-    for (int h = 0; h < height; h++) {
-      int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit);
-      int dst_h_offset = dst_batch_offset + h * width * channel;
-      for (int w = 0; w < width; w++) {
-        int src_w_offset = src_h_offset + w * C4NUM;
-        int dst_w_offset = dst_h_offset + w * channel;
-        for (int c = 0; c < c4 - 1; c++) {
-          int src_c4_offset = src_w_offset + c * c4_block;
-          int dst_c4_offset = dst_w_offset + c * C4NUM;
-#ifdef ENABLE_NEON
-          float32x4_t input_ptr = vld1q_f32(src + src_c4_offset);
-          float32x4_t zero = vdupq_n_f32(0);
-          float32x4_t six = vdupq_n_f32(6);
-          input_ptr = vmaxq_f32(zero, input_ptr);
-          input_ptr = vminq_f32(six, input_ptr);
-          vst1q_f32(dst + dst_c4_offset, input_ptr);
-#else
-          for (int i = 0; i < C4NUM; ++i) {
-            float input_data = src[src_c4_offset + i];
-            input_data = input_data < 0 ? 0 : input_data;
-            input_data = input_data > 6 ? 6 : input_data;
-            dst[dst_c4_offset + i] = input_data;
-          }
-#endif
-        }
-        int c_res = channel - (c4 - 1) * C4NUM;
-        int src_c_res_offset = (c4 - 1) * c4_block;
-        int dst_c_res_offset = (c4 - 1) * C4NUM;
-        for (int c = 0; c < c_res; c++) {
-          int src_c4_res_offset = src_w_offset + src_c_res_offset + c;
-          int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c;
-          float input_data = src[src_c4_res_offset];
-          input_data = input_data < 0 ? 0 : input_data;
-          input_data = input_data > 6 ? 6 : input_data;
-          dst[dst_c4_res_offset] = input_data;
-        }
-      }
-    }
-  }
-}
-
 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, TmpBufferAddress *buffer_list,
-                 int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) {
+                 int task_id, ConvParameter *conv_param) {
   int thread_count = conv_param->thread_num_;
   int ic4 = UP_DIV(conv_param->input_channel_, C4NUM);
   int output_channel = conv_param->output_channel_;
@@ -488,7 +353,7 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
 
   int input_batch = conv_param->input_batch_;
   for (int batch = 0; batch < input_batch; batch++) {
-    int in_batch_offset = batch * ic4 * C4NUM * conv_param->input_h_ * conv_param->input_w_;
+    int in_batch_offset = batch * conv_param->input_channel_ * conv_param->input_h_ * conv_param->input_w_;
     int nc4hw4_buffer_offset = batch * oc4 * C4NUM * conv_param->output_h_ * conv_param->output_w_;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
diff --git a/mindspore/lite/nnacl/fp32/conv.h b/mindspore/lite/nnacl/fp32/conv.h
index 9b280ed820..5aeba9fb1a 100644
--- a/mindspore/lite/nnacl/fp32/conv.h
+++ b/mindspore/lite/nnacl/fp32/conv.h
@@ -67,7 +67,7 @@ void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int heig
 
 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, TmpBufferAddress *buffer_list,
-                 int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func);
+                 int task_id, ConvParameter *conv_param);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/lite/nnacl/int8/conv_int8.c b/mindspore/lite/nnacl/int8/conv_int8.c
index 4d2e9aadd5..d0d1c23928 100644
--- a/mindspore/lite/nnacl/int8/conv_int8.c
+++ b/mindspore/lite/nnacl/int8/conv_int8.c
@@ -273,7 +273,6 @@ void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, c
   int kernel_plane = kernel_h * kernel_w;
   int plane_block = UP_DIV(kernel_plane, C4NUM);
   int unit_size = plane_block * C4NUM * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * tile_n * unit_size;
   int input_sum_offset;
   if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
     input_sum_offset = tile_n * oc4 * C4NUM;
@@ -284,12 +283,11 @@ void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, c
   for (int b = 0; b < in_batch; b++) {
     int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
     int out_batch_offset = b * out_channel * out_h * out_w;
-    int gemm_in_batch_offset = b * packed_input_size;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
       int start_index = thread_id * tile_n;
       int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
       int32_t *tmp_input_sum = input_sum + task_id * input_sum_offset;
-      int8_t *gemm_input = packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset;
+      int8_t *gemm_input = packed_input + task_id * unit_size * tile_n;
       // clear tmp buffer before compute
       memset(gemm_input, (int8_t)input_zp, unit_size * tile_n);
       int out_offset = thread_id * tile_n * out_channel + out_batch_offset;
@@ -336,7 +334,6 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int unit_size = kernel_plane * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * tile_n * unit_size;
   int input_sum_offset;
   if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
     input_sum_offset = tile_n * oc4 * C4NUM;
@@ -347,12 +344,11 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight
   for (int b = 0; b < in_batch; b++) {
     int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
     int out_batch_offset = b * out_channel * out_h * out_w;
-    int gemm_in_batch_offset = b * packed_input_size;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
       int start_index = thread_id * tile_n;
       int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
       int32_t *tmp_input_sum = input_sum + task_id * input_sum_offset;
-      int8_t *gemm_input = packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset;
+      int8_t *gemm_input = packed_input + task_id * unit_size * tile_n;
       // clear tmp buffer before compute
       memset(gemm_input, (int8_t)input_zp, unit_size * tile_n);
       int out_offset = thread_id * tile_n * out_channel + out_batch_offset;
diff --git a/mindspore/lite/nnacl/pack.c b/mindspore/lite/nnacl/pack.c
index 83cb2dee3e..5c3aa18499 100644
--- a/mindspore/lite/nnacl/pack.c
+++ b/mindspore/lite/nnacl/pack.c
@@ -297,24 +297,24 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa
   int in_h = conv_param->input_h_;
   int in_w = conv_param->input_w_;
   int out_w = conv_param->output_w_;
+  int ic4_minus = in_channel / C4NUM;
   int ic4 = UP_DIV(in_channel, C4NUM);
-  memset(packed_input, 0, kernel_h * kernel_w * ic4 * C4NUM * TILE_NUM * sizeof(float));
 
   for (int i = 0; i < real_cal_num; i++) {
     int block_start = block_index + i;
     int input_h = block_start / out_w * stride_h - pad_h;
     int input_w = block_start % out_w * stride_w - pad_w;
-    int input_stride = input_h * in_w * ic4 * C4NUM + input_w * ic4 * C4NUM;
+    int input_stride = (input_h * in_w + input_w) * in_channel;
     int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h));
     int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h));
     int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w));
     int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w));
     for (int j = kh_s; j < kh_e; j++) {
-      int input_y_stride = j * dilation_h * in_w * ic4 * C4NUM + input_stride;
+      int input_y_stride = j * dilation_h * in_w * in_channel + input_stride;
       for (int n = kw_s; n < kw_e; n++) {
-        int input_x_stride = input_y_stride + n * dilation_w * ic4 * C4NUM;
+        int input_x_stride = input_y_stride + n * dilation_w * in_channel;
         int input_plane_offset = (j * kernel_w + n) * C8NUM * C4NUM * ic4 + i * C4NUM;
-        for (int m = 0; m < ic4; m++) {
+        for (int m = 0; m < ic4_minus; m++) {
           int channel_block_stride = input_x_stride + m * C4NUM;
           int channel_block_offset = input_plane_offset + m * C8NUM * C4NUM;
 #ifdef ENABLE_NEON
@@ -325,9 +325,15 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa
           }
 #endif
         }  // channel_block loop
-      }    // kernel_w loop
-    }      // kernel_h loop
-  }        // tile num loop
+        int ic_res = conv_param->input_channel_ - ic4_minus * C4NUM;
+        for (int l = 0; l < ic_res; ++l) {
+          int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
+          int channel_block_offset = input_plane_offset + ic4_minus * C8NUM * C4NUM + l;
+          packed_input[channel_block_offset] = input_data[channel_block_stride];
+        }
+      }  // kernel_w loop
+    }    // kernel_h loop
+  }      // tile num loop
 }
 
 void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index,
@@ -346,6 +352,7 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
   int in_channel = conv_param->input_channel_;
   int in_h = conv_param->input_h_;
   int in_w = conv_param->input_w_;
+  int ic4_minus = in_channel / C4NUM;
   int ic4 = UP_DIV(in_channel, C4NUM);
   int oc4 = UP_DIV(conv_param->output_channel_, C4NUM);
   int out_w = conv_param->output_w_;
@@ -362,19 +369,19 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
         input_accumulator += ic4 * C4NUM * conv_param->conv_quant_arg_.input_quant_args_[0].zp_ * kernel_w;
         continue;
       }
-      int input_y_stride = input_y * in_w * ic4 * C4NUM;
+      int input_y_stride = input_y * in_w * in_channel;
       for (int n = 0; n < kernel_w; n++) {
         int input_x = input_w + n * dilation_w;
         if (input_x < 0 || input_x >= in_w) {
           input_accumulator += ic4 * C4NUM * conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
           continue;
         }
-        int input_x_stride = input_y_stride + input_x * ic4 * C4NUM;
+        int input_x_stride = input_y_stride + input_x * in_channel;
         int plane_c4_block = (j * kernel_w + n) / C4NUM;
         int plane_c4_res = (j * kernel_w + n) % C4NUM;
         int input_plane_offset =
           plane_c4_block * tile_num * C4NUM * C4NUM * ic4 + plane_c4_res * C4NUM + input_cal_num_offset;
-        for (int m = 0; m < ic4; m++) {
+        for (int m = 0; m < ic4_minus; m++) {
           int channel_block_stride = input_x_stride + m * C4NUM;
           int channel_block_offset = input_plane_offset + m * tile_num * C4NUM * C4NUM;
           (packed_input + channel_block_offset)[0] = (input_data + channel_block_stride)[0];
@@ -386,8 +393,15 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
           input_accumulator += (packed_input + channel_block_offset)[2];
           input_accumulator += (packed_input + channel_block_offset)[3];
         }  // channel_block loop
-      }    // kernel_w loop
-    }      // kernel_h loop
+        int ic_res = conv_param->input_channel_ - ic4_minus * C4NUM;
+        for (int l = 0; l < ic_res; ++l) {
+          int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
+          int channel_block_offset = input_plane_offset + ic4_minus * tile_num * C4NUM + l;
+          packed_input[channel_block_offset] = input_data[channel_block_stride];
+          input_accumulator += (packed_input + channel_block_offset)[0];
+        }
+      }  // kernel_w loop
+    }    // kernel_h loop
     if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC)) {
       continue;
     } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
@@ -419,6 +433,7 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r
   int in_channel = conv_param->input_channel_;
   int in_h = conv_param->input_h_;
   int in_w = conv_param->input_w_;
+  int ic4_minus = in_channel / C4NUM;
   int ic4 = UP_DIV(in_channel, C4NUM);
   int oc4 = UP_DIV(conv_param->output_channel_, C4NUM);
   int out_w = conv_param->output_w_;
@@ -428,26 +443,29 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r
     int block_start = block_index + i;
     int input_h = block_start / out_w * stride_h - pad_h;
     int input_w = block_start % out_w * stride_w - pad_w;
-    for (int j = 0; j < kernel_h; j++) {
-      int input_y = input_h + j * dilation_h;
-      if (input_y < 0 || input_y >= in_h) {
-        continue;
-      }
-      int input_y_stride = input_y * in_w * ic4 * C4NUM;
-      for (int n = 0; n < kernel_w; n++) {
-        int input_x = input_w + n * dilation_w;
-        if (input_x < 0 || input_x >= in_w) {
-          continue;
-        }
-        int input_x_stride = input_y_stride + input_x * ic4 * C4NUM;
+    int input_stride = input_h * in_w * in_channel + input_w * in_channel;
+    int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h));
+    int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h));
+    int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w));
+    int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w));
+    for (int j = kh_s; j < kh_e; j++) {
+      int input_y_stride = j * dilation_h * in_w * in_channel + input_stride;
+      for (int n = kw_s; n < kw_e; n++) {
+        int input_x_stride = input_y_stride + n * dilation_w * in_channel;
         int input_plane_offset = (j * kernel_w + n) * tile_num * C4NUM * ic4 + i * C4NUM;
-        for (int m = 0; m < ic4; m++) {
+        for (int m = 0; m < ic4_minus; m++) {
           int channel_block_stride = input_x_stride + m * C4NUM;
           int channel_block_offset = input_plane_offset + m * tile_num * C4NUM;
           memcpy(packed_input + channel_block_offset, input_data + channel_block_stride, 4);
         }  // channel_block loop
-      }    // kernel_w loop
-    }      // kernel_h loop
+        int ic_res = conv_param->input_channel_ - ic4_minus * C4NUM;
+        for (int l = 0; l < ic_res; ++l) {
+          int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
+          int channel_block_offset = input_plane_offset + ic4_minus * tile_num * C4NUM + l;
+          packed_input[channel_block_offset] = input_data[channel_block_stride];
+        }
+      }  // kernel_w loop
+    }    // kernel_h loop
     int32_t input_accumulator = 0;
     for (int j = 0; j < block_size; j++) {
       int block_offset = j * tile_num * ic4 * C4NUM + i * C4NUM;
diff --git a/mindspore/lite/nnacl/winograd_transform.c b/mindspore/lite/nnacl/winograd_transform.c
index f59d422205..c8a7eb9851 100644
--- a/mindspore/lite/nnacl/winograd_transform.c
+++ b/mindspore/lite/nnacl/winograd_transform.c
@@ -41,30 +41,48 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
     int interval_x_e = src_x_e < input_w ? input_unit : (input_w - src_x_s);
     int interval_y_e = src_y_e < input_h ? input_unit : (input_h - src_y_s);
 
-    int src_plane_offset = ic4 * C4NUM * (src_y_s * input_w + src_x_s);
+    int src_plane_offset = in_channel * (src_y_s * input_w + src_x_s);
     int dst_plane_offset = c * C4NUM * ic4;
     for (int ic = 0; ic < ic4; ic++) {
       // clear tmp buffer
       memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));
-      // get real input block with padding
+      int real_c = in_channel - ic * C4NUM;
+      real_c = real_c > C4NUM ? C4NUM : real_c;
       int src_ic4_offset = src_plane_offset + ic * C4NUM;
-      for (int interval = interval_y_s; interval < interval_y_e; interval++) {
-        int src_y_offset = src_ic4_offset + (interval * input_w + interval_x_s) * ic4 * C4NUM;
-        int dst_y_offset = interval * input_unit * C4NUM + interval_x_s * C4NUM;
-        for (int j = 0; j < (interval_x_e - interval_x_s); j++) {
-          int src_x_offset = src_y_offset + j * ic4 * C4NUM;
-          int dst_x_offset = dst_y_offset + j * C4NUM;
-          float *src_addr = (float *)(input_data) + src_x_offset;
-          float *dst_addr = tmp_data + dst_x_offset;
+      // get real input block with padding
+      if (real_c == C4NUM) {
+        for (int interval = interval_y_s; interval < interval_y_e; interval++) {
+          int src_y_offset = src_ic4_offset + (interval * input_w + interval_x_s) * in_channel;
+          int dst_y_offset = interval * input_unit * C4NUM + interval_x_s * C4NUM;
+          for (int j = 0; j < (interval_x_e - interval_x_s); j++) {
+            int src_x_offset = src_y_offset + j * in_channel;
+            int dst_x_offset = dst_y_offset + j * C4NUM;
+            float *src_addr = (float *)(input_data) + src_x_offset;
+            float *dst_addr = tmp_data + dst_x_offset;
 #ifdef ENABLE_NEON
-          vst1q_f32(dst_addr, vld1q_f32(src_addr));
+            vst1q_f32(dst_addr, vld1q_f32(src_addr));
 #else
-          for (int k = 0; k < C4NUM; k++) {
-            dst_addr[k] = src_addr[k];
-          }
+            for (int k = 0; k < C4NUM; k++) {
+              dst_addr[k] = src_addr[k];
+            }
 #endif
-        }
+          }  // interval x loop
+        }    // interval y loop
+      } else {
+        for (int interval = interval_y_s; interval < interval_y_e; interval++) {
+          int src_y_offset = src_ic4_offset + (interval * input_w + interval_x_s) * in_channel;
+          int dst_y_offset = interval * input_unit * C4NUM + interval_x_s * C4NUM;
+          for (int j = 0; j < (interval_x_e - interval_x_s); j++) {
+            int src_x_offset = src_y_offset + j * in_channel;
+            int dst_x_offset = dst_y_offset + j * C4NUM;
+            float *src_addr = (float *)(input_data) + src_x_offset;
+            float *dst_addr = tmp_data + dst_x_offset;
+            for (int k = 0; k < real_c; k++) {
+              dst_addr[k] = src_addr[k];
+            }
+          }  // interval x loop
+        }    // interval y loop
       }
 
       // input transform
 #ifdef ENABLE_ARM32
@@ -314,29 +332,47 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
     int real_y_start = origin_y > 0 ? 0 : -origin_y;
     int real_y_end = (origin_y + input_unit) < input_height ? input_unit : (input_height - origin_y);
 
-    int src_plane_offset = ic4 * C4NUM * (origin_y * input_width + origin_x);
+    int src_plane_offset = input_channel * (origin_y * input_width + origin_x);
     int dst_plane_offset = cal_id * C4NUM * ic4;
     for (int ic = 0; ic < ic4; ic++) {
      // clear tmp buffer
      memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));

+      int real_c = input_channel - ic * C4NUM;
+      real_c = real_c > C4NUM ? C4NUM : real_c;
       // get real input block with padding
       int src_ic4_offset = src_plane_offset + ic * C4NUM;
-      for (int interval = real_y_start; interval < real_y_end; interval++) {
-        int src_y_offset = src_ic4_offset + (interval * input_width + real_x_start) * ic4 * C4NUM;
-        int dst_y_offset = interval * input_unit * C4NUM + real_x_start * C4NUM;
-        for (int j = 0; j < (real_x_end - real_x_start); j++) {
-          int src_x_offset = src_y_offset + j * ic4 * C4NUM;
-          int dst_x_offset = dst_y_offset + j * C4NUM;
-          float *src_addr = (float *)(input_data) + src_x_offset;
-          float *dst_addr = tmp_data + dst_x_offset;
+      if (real_c == C4NUM) {
+        for (int interval = real_y_start; interval < real_y_end; interval++) {
+          int src_y_offset = src_ic4_offset + (interval * input_width + real_x_start) * input_channel;
+          int dst_y_offset = interval * input_unit * C4NUM + real_x_start * C4NUM;
+          for (int j = 0; j < (real_x_end - real_x_start); j++) {
+            int src_x_offset = src_y_offset + j * input_channel;
+            int dst_x_offset = dst_y_offset + j * C4NUM;
+            float *src_addr = (float *)(input_data) + src_x_offset;
+            float *dst_addr = tmp_data + dst_x_offset;
 #ifdef ENABLE_NEON
-          vst1q_f32(dst_addr, vld1q_f32(src_addr));
+            vst1q_f32(dst_addr, vld1q_f32(src_addr));
 #else
-          for (int k = 0; k < C4NUM; k++) {
-            (dst_addr + k)[0] = (src_addr + k)[0];
-          }
+            for (int k = 0; k < C4NUM; k++) {
+              dst_addr[k] = src_addr[k];
+            }
 #endif
+          }
+        }
+      } else {
+        for (int interval = real_y_start; interval < real_y_end; interval++) {
+          int src_y_offset = src_ic4_offset + (interval * input_width + real_x_start) * input_channel;
+          int dst_y_offset = interval * input_unit * C4NUM + real_x_start * C4NUM;
+          for (int j = 0; j < (real_x_end - real_x_start); j++) {
+            int src_x_offset = src_y_offset + j * input_channel;
+            int dst_x_offset = dst_y_offset + j * C4NUM;
+            float *src_addr = (float *)(input_data) + src_x_offset;
+            float *dst_addr = tmp_data + dst_x_offset;
+            for (int k = 0; k < real_c; k++) {
+              dst_addr[k] = src_addr[k];
+            }
+          }
         }
       }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
index 60b3ad10b9..1769f1a3df 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
@@ -84,21 +84,8 @@ int ConvolutionCPUKernel::InitTmpBuffer() {
   MS_ASSERT(ctx_->allocator != nullptr);
   int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
 
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  MS_ASSERT(nullptr != ctx_->allocator);
-  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-
-  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
-  int output_tile_count = UP_DIV(output_count, TILE_NUM);
-  int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * TILE_NUM * unit_size;
-  packed_input_ =
-    reinterpret_cast<float *>(ctx_->allocator->Malloc(conv_param_->input_batch_ * packed_input_size * sizeof(float)));
+  int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * ic4 * C4NUM * TILE_NUM * thread_count_;
+  packed_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(unit_size * sizeof(float)));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed input failed.";
     return RET_ERROR;
@@ -158,9 +145,11 @@ int ConvolutionCPUKernel::RunImpl(int task_id) {
     MS_LOG(ERROR) << "gemm_func is nullptr.";
     return RET_ERROR;
   }
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  auto ori_input_data = reinterpret_cast<float *>(input_tensor->MutableData());
   auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData());
-  ConvFp32(reinterpret_cast<float *>(nhwc4_input_), packed_input_, packed_weight_,
-           reinterpret_cast<float *>(bias_data_), tmp_output_block_, output_addr, task_id, conv_param_, gemm_func_);
+  ConvFp32(ori_input_data, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), tmp_output_block_,
+           output_addr, task_id, conv_param_, gemm_func_);
   return RET_OK;
 }
 
@@ -187,11 +176,6 @@ int ConvolutionCPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = input_tensor->MutableData();
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
-
   int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionImpl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv error error_code[" << error_code << "]";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h
index 7237cb6ce3..97188bd4b6 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h
@@ -51,10 +51,6 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
       ctx_->allocator->Free(tmp_output_block_);
       tmp_output_block_ = nullptr;
     }
-    if (nhwc4_input_ != nullptr) {
-      ctx_->allocator->Free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (packed_input_ != nullptr) {
       ctx_->allocator->Free(packed_input_);
       packed_input_ = nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
index 2c57806f32..f63dca1e81 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
@@ -100,13 +100,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
 #else
   const int tile_num = 12;
 #endif
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
 
   size_t tile_buffer_size = thread_count_ * tile_num * C16NUM * ic4 * C4NUM * sizeof(float);
   tile_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tile_buffer_size));
@@ -152,16 +145,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
   return RET_OK;
 }
 
-void Convolution3x3CPUKernel::ConfigInputOutput() {
-  auto output_tensor = out_tensors_.at(kOutputIndex);
-  output_tensor->SetFormat(schema::Format::Format_NHWC);
-  // #ifdef ENABLE_ARM32
-  //   gemm_func_ = IndirectGemmFp32_8x4;
-  // #else
-  gemm_func_ = IndirectGemmFp32_8x8;
-  // #endif
-}
-
 int Convolution3x3CPUKernel::Init() {
   auto ret = InitWeightBias();
   if (ret != RET_OK) {
@@ -171,7 +154,6 @@ int Convolution3x3CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
-  ConfigInputOutput();
   return ReSize();
 }
 
@@ -191,12 +173,10 @@ int Convolution3x3CPUKernel::ReSize() {
 }
 
 int Convolution3x3CPUKernel::RunImpl(int task_id) {
-  if (gemm_func_ == nullptr) {
-    MS_LOG(ERROR) << "gemm_func is nullptr.";
-    return RET_ERROR;
-  }
-  Conv3x3Fp32(reinterpret_cast<float *>(nhwc4_input_), transformed_filter_addr_, reinterpret_cast<float *>(bias_data_),
-              tmp_buffer_address_list_, task_id, conv_param_, gemm_func_);
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  auto ori_input_data = reinterpret_cast<float *>(input_tensor->MutableData());
+  Conv3x3Fp32(ori_input_data, transformed_filter_addr_, reinterpret_cast<float *>(bias_data_), tmp_buffer_address_list_,
+              task_id, conv_param_);
   return RET_OK;
 }
 
@@ -245,10 +225,6 @@ int Convolution3x3CPUKernel::Run() {
     MS_LOG(ERROR) << "Init tmp buffer failed.ret: " << ret;
     return RET_ERROR;
   }
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = input_tensor->MutableData();
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
 
   int error_code = ParallelLaunch(this->context_->thread_pool_, Convolution3x3Impl, this, thread_count_);
   if (error_code != RET_OK) {
@@ -260,6 +236,7 @@ int Convolution3x3CPUKernel::Run() {
   ret = PostProcess();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Post process failed.";
+    FreeTmpBuffer();
     return ret;
   }
   FreeTmpBuffer();
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
index b269481acc..30bfad66bf 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
@@ -40,15 +40,10 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
   int RunImpl(int task_id);
   int InitWeightBias();
   int InitTmpBuffer();
-  void ConfigInputOutput();
   int PostProcess();
 
  private:
   void FreeTmpBuffer() {
-    if (nhwc4_input_ != nullptr) {
-      ctx_->allocator->Free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (tile_buffer_ != nullptr) {
       ctx_->allocator->Free(tile_buffer_);
       tile_buffer_ = nullptr;
@@ -78,7 +73,6 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
   float *col_buffer_ = nullptr;
   float *nc4hw4_out_ = nullptr;
   TmpBufferAddress tmp_buffer_address_list_[5];
-  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
 void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num);
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
index af21fc75e6..e7fb418f47 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
@@ -184,14 +184,6 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
 #endif
   MS_ASSERT(ctx_->allocator != nullptr);
 
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_MEMORY_FAILED;
-  }
-
   size_t tile_buffer_size = thread_count_ * tile_num * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
   trans_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tile_buffer_size));
   if (trans_input_ == nullptr) {
@@ -298,9 +290,11 @@ int ConvolutionWinogradCPUKernel::ReSize() {
 }
 
 int ConvolutionWinogradCPUKernel::RunImpl(int task_id) {
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  auto ori_input_data = reinterpret_cast<float *>(input_tensor->MutableData());
   auto output_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
-  ConvWinogardFp32(reinterpret_cast<float *>(nhwc4_input_), trans_weight_, reinterpret_cast<float *>(bias_data_),
-                   output_data, tmp_buffer_address_list_, task_id, conv_param_, in_func_, out_func_);
+  ConvWinogardFp32(ori_input_data, trans_weight_, reinterpret_cast<float *>(bias_data_), output_data,
+                   tmp_buffer_address_list_, task_id, conv_param_, in_func_, out_func_);
   return RET_OK;
 }
 
@@ -314,30 +308,6 @@ int ConvolutionWinogradImpl(void *cdata, int task_id) {
   return RET_OK;
 }
 
-int ConvolutionWinogradCPUKernel::PostProcess() {
-  auto out_tensor = out_tensors_.front();
-  auto out_data = reinterpret_cast<float *>(out_tensor->MutableData());
-  auto act_type = conv_param_->act_type_;
-  switch (act_type) {
-    case ActType_No:
-      UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                           conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-      break;
-    case ActType_Relu:
-      UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                               conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-      break;
-    case ActType_Relu6:
-      UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                                conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-      break;
-    default:
-      MS_LOG(ERROR) << "Unsupport activation type.";
-      return RET_ERROR;
-  }
-  return RET_OK;
-}
-
 int ConvolutionWinogradCPUKernel::Run() {
   auto prepare_ret = Prepare();
   if (prepare_ret != RET_OK) {
@@ -351,11 +321,6 @@ int ConvolutionWinogradCPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = input_tensor->MutableData();
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
-
   int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionWinogradImpl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
index f1e97291c3..319aaf1c4e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
@@ -45,15 +45,10 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   int InitWeightBias();
   int InitTmpBuffer();
   int ConfigInputOutput();
-  int PostProcess();
   int WinogradFilterTransform(const float *weight_data, float *matrix_g, float *matrix_gt, int oc_block);
 
  private:
   void FreeTmpBuffer() {
-    if (nhwc4_input_ != nullptr) {
-      ctx_->allocator->Free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (trans_input_ != nullptr) {
       ctx_->allocator->Free(trans_input_);
       trans_input_ = nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
index 76833240ef..167b556f4e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
@@ -137,25 +137,15 @@ int ConvolutionInt8CPUKernel::InitTmpBuffer() {
   MS_ASSERT(ctx_->allocator != nullptr);
   int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
 
-  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
-  int output_tile_count = UP_DIV(output_count, tile_num_);
   int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
   int plane_c4 = UP_DIV(kernel_plane, C4NUM);
   int unit_size = plane_c4 * C4NUM * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * tile_num_ * unit_size;
-  packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(conv_param_->input_batch_ * packed_input_size));
+  packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(unit_size * thread_count_ * tile_num_));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed_input_ failed.";
     return RET_ERROR;
   }
 
-  size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_;
-  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-
   size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t);
   tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size));
   if (tmp_dst_ == nullptr) {
@@ -322,15 +312,15 @@ int ConvolutionInt8CPUKernel::ReSize() {
 }
 
 int ConvolutionInt8CPUKernel::RunImpl(int task_id) {
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  auto ori_input_data = reinterpret_cast<int8_t *>(input_tensor->MutableData());
   auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->MutableData());
   if (support_optimize_) {
-    ConvInt8Opt(reinterpret_cast<int8_t *>(nhwc4_input_), packed_input_, packed_weight_,
-                reinterpret_cast<int32_t *>(bias_data_), tmp_dst_, tmp_out_, output_addr, input_sum_, task_id,
-                conv_param_, gemm_func_);
+    ConvInt8Opt(ori_input_data, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), tmp_dst_,
+                tmp_out_, output_addr, input_sum_, task_id, conv_param_, gemm_func_);
   } else {
-    ConvInt8(reinterpret_cast<int8_t *>(nhwc4_input_), packed_input_, packed_weight_,
-             reinterpret_cast<int32_t *>(bias_data_), tmp_dst_, tmp_out_, output_addr, input_sum_, task_id,
-             conv_param_);
+    ConvInt8(ori_input_data, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), tmp_dst_, tmp_out_,
+             output_addr, input_sum_, task_id, conv_param_);
   }
   return RET_OK;
 }
@@ -358,11 +348,6 @@ int ConvolutionInt8CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = input_tensor->MutableData();
-  PackNHWCToNHWC4Int8(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
-
   int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionInt8Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv int8 error error_code[" << error_code << "]";
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
index 76ee710178..d836da459a 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
@@ -55,10 +55,6 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel {
 
  private:
   void FreeTmpBuffer() {
-    if (nhwc4_input_ != nullptr) {
-      ctx_->allocator->Free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (packed_input_ != nullptr) {
       ctx_->allocator->Free(packed_input_);
       packed_input_ = nullptr;
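
Review note (appended commentary, not part of the patch): the core change above is that the kernels now read the raw NHWC input directly and pack im2col data into a small per-thread scratch block instead of a batch-sized buffer. Each worker packs into `packed_input + task_id * unit_size * TILE_NUM` and clears that block before every tile, so the allocation shrinks from `input_batch * output_tile_count * TILE_NUM * unit_size` to `thread_count * TILE_NUM * unit_size`, and the `nhwc4_input_` buffer plus the `PackNHWCToNHWC4Fp32`/`PackNHWCToNHWC4Int8` pre-pass become unnecessary. Below is a minimal standalone C sketch of just that indexing scheme; `PackTile`, the toy sizes, and the serial task loop are illustrative stand-ins (the real code packs via `Im2ColPackUnitFp32` under `ParallelLaunch`, and the int8 path memsets to the input zero point rather than 0):

```c
/* Sketch of the per-thread packed-buffer reuse introduced by this patch.
 * Assumptions: TILE_NUM and all sizes are toy values; PackTile() is a
 * hypothetical stand-in for Im2ColPackUnitFp32(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TILE_NUM 8

/* stand-in for Im2ColPackUnitFp32: fill one tile's packed block */
static void PackTile(float *gemm_input, int unit_size, int tile_index) {
  for (int i = 0; i < unit_size * TILE_NUM; ++i) {
    gemm_input[i] = (float)tile_index; /* dummy payload */
  }
}

static void RunTask(float *packed_input, int task_id, int thread_count,
                    int output_tile_count, int unit_size) {
  /* task_id (the worker index, bounded by thread_count) selects the slot,
   * so the same memory is reused for every tile this worker processes */
  float *gemm_input = packed_input + task_id * unit_size * TILE_NUM;
  for (int tile = task_id; tile < output_tile_count; tile += thread_count) {
    /* clear before packing, mirroring the memset added in ConvFp32 */
    memset(gemm_input, 0, (size_t)unit_size * TILE_NUM * sizeof(float));
    PackTile(gemm_input, unit_size, tile);
    /* ... GEMM against packed weights would consume gemm_input here ... */
  }
}

int main(void) {
  int thread_count = 4, output_tile_count = 100;
  int unit_size = 3 * 3 * 4; /* kernel_h * kernel_w * ic4 * C4NUM, toy value */
  /* allocation is now thread_count * TILE_NUM * unit_size floats,
   * independent of batch size and output size */
  float *packed_input = malloc((size_t)thread_count * TILE_NUM * unit_size * sizeof(float));
  if (packed_input == NULL) {
    return 1;
  }
  for (int task_id = 0; task_id < thread_count; ++task_id) { /* serial stand-in for ParallelLaunch */
    RunTask(packed_input, task_id, thread_count, output_tile_count, unit_size);
  }
  printf("packed buffer: %zu floats instead of %zu\n",
         (size_t)thread_count * TILE_NUM * unit_size,
         (size_t)output_tile_count * TILE_NUM * unit_size);
  free(packed_input);
  return 0;
}
```

The key point is that the slot is keyed on `task_id` rather than the tile counter: the tile counter (called `thread_id` in the kernels above) grows with the output size, while `task_id` is bounded by `thread_count`, which is what makes the fixed-size buffer safe to reuse across tiles and batches.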