optimize_transpose_fp16

sunsuodong 2021-05-26 14:17:21 +08:00
parent 3cc2b6513c
commit 69a11ce6e7
18 changed files with 188 additions and 311 deletions

View File

@@ -196,34 +196,34 @@
} \
}
#define TRANSPOSE_DIMS(TYPE, NAME) \
void TransposeDims##NAME(const TYPE *in_data, TYPE *out_data, const int *output_shape, int *size, int *position, \
TransposeParameter *transpose_param, int task_id, int thread_num) { \
int *perm = transpose_param->perm_; \
int *strides = transpose_param->strides_; \
int *out_strides = transpose_param->out_strides_; \
int num_axes = transpose_param->num_axes_; \
size_t data_size = (*size) * output_shape[0]; \
size_t offset_size = UP_DIV(data_size, thread_num); \
size_t task_offset = offset_size * task_id; \
int count = data_size - task_offset; \
if (count <= 0) { \
return; \
} \
count = MSMIN(offset_size, count); \
for (size_t idx = task_offset; idx < task_offset + count; ++idx) { \
int pos = idx; \
int output_idx = 0; \
int input_idx = 0; \
for (int i = 0; i < num_axes; ++i) { \
*(position + i) = pos / *(size + i); \
int out_stride = i < num_axes - 1 ? out_strides[i] : 1; \
output_idx += (*(position + i) * out_stride); \
input_idx += (*(position + i) * strides[perm[i]]); \
pos -= *(position + i) * (*(size + i)); \
} \
out_data[output_idx] = in_data[input_idx]; \
} \
#define TRANSPOSE_DIMS(TYPE, NAME) \
void TransposeDims##NAME(const TYPE *in_data, TYPE *out_data, const int *output_shape, \
TransposeParameter *transpose_param, int task_id, int thread_num) { \
int *perm = transpose_param->perm_; \
int *strides = transpose_param->strides_; \
int *out_strides = transpose_param->out_strides_; \
int num_axes = transpose_param->num_axes_; \
size_t data_size = (*out_strides) * output_shape[0]; \
size_t offset_size = UP_DIV(data_size, thread_num); \
size_t task_offset = offset_size * task_id; \
int count = data_size - task_offset; \
if (count <= 0) { \
return; \
} \
count = MSMIN(offset_size, count); \
for (size_t idx = task_offset; idx < task_offset + count; ++idx) { \
int pos = idx; \
int output_idx = 0; \
int input_idx = 0; \
for (int i = 0; i < num_axes; ++i) { \
int position = pos / *(out_strides + i); \
int out_stride = i < num_axes - 1 ? out_strides[i] : 1; \
output_idx += (position * out_stride); \
input_idx += (position * strides[perm[i]]); \
pos -= position * (*(out_strides + i)); \
} \
out_data[output_idx] = in_data[input_idx]; \
} \
}
#define DOTRANSPOSE(TYPE, NAME) \

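The new TRANSPOSE_DIMS body drops the separate size and position scratch arrays: because out_strides[i] is already the product of the trailing output dimensions, each thread can decompose its flat output index on the fly. A minimal standalone sketch of that decomposition (toy 3-D shape and permutation chosen purely for illustration; not the library code):

#include <cstdio>

int main() {
  const int perm[3] = {2, 0, 1};      // output axis i reads input axis perm[i]
  const int in_shape[3] = {2, 3, 4};
  const int out_shape[3] = {4, 2, 3}; // out_shape[i] == in_shape[perm[i]]
  int strides[3], out_strides[3];
  strides[2] = out_strides[2] = 1;
  for (int i = 1; i >= 0; --i) {
    strides[i] = in_shape[i + 1] * strides[i + 1];
    out_strides[i] = out_shape[i + 1] * out_strides[i + 1];
  }
  float in[24], out[24];
  for (int i = 0; i < 24; ++i) in[i] = static_cast<float>(i);
  const int total = out_strides[0] * out_shape[0];  // product of all dims: 24
  for (int idx = 0; idx < total; ++idx) {
    int pos = idx, input_idx = 0;
    for (int i = 0; i < 3; ++i) {
      int position = pos / out_strides[i];  // coordinate along output axis i
      input_idx += position * strides[perm[i]];
      pos -= position * out_strides[i];
    }
    out[idx] = in[input_idx];  // output_idx recomposes to idx itself
  }
  printf("out[0..2] = %g %g %g\n", out[0], out[1], out[2]);  // 0 4 8
  return 0;
}

Since the coordinates are derived from idx through the same out_strides used to recompose output_idx, output_idx always equals idx, which is why no per-thread position buffer is needed.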
View File

@@ -40,21 +40,21 @@ int DoTransposeInt64(const int64_t *in_data, int64_t *out_data, const int *outpu
TransposeParameter *transpose_param);
int DoTransposeBool(const bool *in_data, bool *out_data, const int *output_shape, TransposeParameter *transpose_param);
void TransposeDimsUInt8(const uint8_t *in_data, uint8_t *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsUInt8(const uint8_t *in_data, uint8_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsUInt16(const uint16_t *in_data, uint16_t *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsUInt16(const uint16_t *in_data, uint16_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsUInt32(const uint32_t *in_data, uint32_t *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsUInt32(const uint32_t *in_data, uint32_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsUInt64(const uint64_t *in_data, uint64_t *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsUInt64(const uint64_t *in_data, uint64_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsInt16(const int16_t *in_data, int16_t *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsInt16(const int16_t *in_data, int16_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsInt32(const int32_t *in_data, int32_t *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsInt32(const int32_t *in_data, int32_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsInt64(const int64_t *in_data, int64_t *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsInt64(const int64_t *in_data, int64_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsBool(const bool *in_data, bool *out_data, const int *output_shape, int *size, int *position,
void TransposeDimsBool(const bool *in_data, bool *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
#ifdef __cplusplus

View File

@@ -159,7 +159,8 @@ void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int
}
}
void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int channel) {
void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int channel, int task_id,
int thread_count) {
#ifdef ENABLE_ARM64
// Transpose16x8 in arm64
const int hw_tile = C16NUM;
@@ -167,13 +168,27 @@ void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int
// Transpose8x8 in others
const int hw_tile = C8NUM;
#endif
int hw_align = plane / hw_tile * hw_tile;
int hw_align = plane / hw_tile;
int task_start = 0;
int task_end = plane;
if (thread_count > 0) {
int offset_hw = UP_DIV(hw_align, thread_count) * hw_tile;
task_start = offset_hw * task_id;
int count = plane - task_start;
if (count <= 0) {
return;
}
task_end = (task_id + 1) == thread_count ? plane : MSMIN(plane, task_start + offset_hw);
hw_align = task_start + ((task_end - task_start) >= offset_hw ? offset_hw : 0);
} else {
hw_align *= hw_tile;
}
int c8 = channel / C8NUM * C8NUM;
int batch = plane * channel;
for (int n = 0; n < batches; n++) {
const float16_t *src_batch = (const float16_t *)src + n * batch;
float16_t *dst_batch = (float16_t *)dst + n * batch;
int hw = 0;
int hw = task_start;
for (; hw < hw_align; hw += hw_tile) {
int c = 0;
for (; c < c8; c += C8NUM) {
@@ -203,7 +218,7 @@ void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int
}
}
}
for (; hw < plane; hw++) {
for (; hw < task_end; hw++) {
const float16_t *src_ptr = src_batch + hw * channel;
float16_t *dst_ptr = dst_batch + hw;
for (size_t i = 0; i < channel; i++) {
@@ -213,8 +228,8 @@ void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int
}
}
void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel) {
return PackNHWCToNCHWFp16(src, dst, batch, channel, plane);
void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count) {
return PackNHWCToNCHWFp16(src, dst, batch, channel, plane, task_id, thread_count);
}
void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel) {

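PackNHWCToNCHWFp16 now takes task_id/thread_count and splits plane into hw_tile-aligned chunks, so each thread runs the vectorized 16x8 (or 8x8) transpose on whole tiles and only the thread owning the tail falls through to the scalar loop. A standalone sketch of the partition arithmetic (hypothetical sizes; UP_DIV/MSMIN assumed to be the usual round-up-divide and min macros):

#include <cstdio>
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

int main() {
  const int plane = 37, hw_tile = 16, thread_count = 3;  // toy sizes
  const int tiles = plane / hw_tile;                     // full tiles: 2
  for (int task_id = 0; task_id < thread_count; ++task_id) {
    int offset_hw = UP_DIV(tiles, thread_count) * hw_tile;  // rows per thread
    int task_start = offset_hw * task_id;
    if (plane - task_start <= 0) continue;  // thread beyond the data: no work
    int task_end = (task_id + 1) == thread_count ? plane : MSMIN(plane, task_start + offset_hw);
    // hw_align: where the tiled fast path stops and the per-row tail begins
    int hw_align = task_start + ((task_end - task_start) >= offset_hw ? offset_hw : 0);
    printf("task %d: rows [%d, %d), tiled up to %d\n", task_id, task_start, task_end, hw_align);
  }
  return 0;
}

With these numbers the three tasks cover [0,16), [16,32) and [32,37); the last range is smaller than a full tile, so it is handled entirely by the scalar for (; hw < task_end; hw++) loop. Passing thread_count 0 keeps the old single-threaded behavior, which is why the depthwise-conv weight packing later in this commit passes 0, 0.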
View File

@@ -37,9 +37,9 @@ void PackNHWCToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int
void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int channel);
void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel);
void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count);
void PackNHWCToNCHWFp16(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNCHWFp16(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count);
void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel);

View File

@@ -173,38 +173,44 @@ void Fp16TransposeDim6(const float16_t *in_data, float16_t *out_data, int *strid
}
}
void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides,
const int *perm, const int *output_shape, int dims, int *size, int *position) {
*(size + dims - 1) = 1;
for (int i = dims - 1; i > 0; --i) {
*(size + i - 1) = *(size + i) * output_shape[i];
void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape,
TransposeParameter *param, int task_id, int thread_num) {
int *perm = param->perm_;
int *strides = param->strides_;
int *out_strides = param->out_strides_;
int num_axes = param->num_axes_;
size_t data_size = (*out_strides) * output_shape[0];
size_t offset_size = UP_DIV(data_size, thread_num);
size_t task_offset = offset_size * task_id;
int count = data_size - task_offset;
if (count <= 0) {
return;
}
for (size_t idx = 0; idx < (*size) * output_shape[0]; ++idx) {
count = MSMIN(offset_size, count);
for (size_t idx = task_offset; idx < task_offset + count; ++idx) {
int pos = idx;
int output_idx = 0;
int input_idx = 0;
for (int i = 0; i < dims; ++i) {
*(position + i) = pos / *(size + i);
int out_stride = i < dims - 1 ? out_strides[i] : 1;
output_idx += (*(position + i) * out_stride);
input_idx += (*(position + i) * strides[perm[i]]);
pos -= *(position + i) * (*(size + i));
for (int i = 0; i < num_axes; ++i) {
int position = pos / *(out_strides + i);
int out_stride = i < num_axes - 1 ? out_strides[i] : 1;
output_idx += (position * out_stride);
input_idx += (position * strides[perm[i]]);
pos -= position * (*(out_strides + i));
}
out_data[output_idx] = in_data[input_idx];
}
}
int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int *size, int *position) {
int DoTransposeFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape, TransposeParameter *param) {
if (in_data == NULL || out_data == NULL) {
return NNACL_ERR;
}
int *perm = transpose_param->perm_;
int *strides = transpose_param->strides_;
int *out_strides = transpose_param->out_strides_;
int data_size = transpose_param->data_size_;
int num_axes = transpose_param->num_axes_;
int *perm = param->perm_;
int *strides = param->strides_;
int *out_strides = param->out_strides_;
int data_size = param->data_size_;
int num_axes = param->num_axes_;
// check if transpose is needed
bool needTranspose = false;
@@ -235,7 +241,7 @@ int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *ou
} else if (num_axes == 6) {
Fp16TransposeDim6(in_data, out_data, strides, out_strides, perm, output_shape);
} else {
TransposeDimsFp16(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position);
return NNACL_ERR;
}
return NNACL_OK;
}

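In the reworked TransposeDimsFp16, data_size = out_strides[0] * output_shape[0] is the total element count, since out_strides[0] is the product of all trailing output dims, and UP_DIV rounds the per-thread share up so the slices cover every element. A worked sketch of the slicing with hypothetical numbers (UP_DIV/MSMIN assumed to be the usual macros):

#include <cstdio>
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

int main() {
  // output_shape = {2, 3, 4, 5}  =>  out_strides = {60, 20, 5, 1}
  const int data_size = 60 * 2;  // 120 = 2*3*4*5 elements
  const int thread_num = 7;      // deliberately not a divisor of 120
  for (int task_id = 0; task_id < thread_num; ++task_id) {
    int offset_size = UP_DIV(data_size, thread_num);  // 18 per slice
    int task_offset = offset_size * task_id;
    int count = data_size - task_offset;
    if (count <= 0) continue;           // over-provisioned threads do nothing
    count = MSMIN(offset_size, count);  // the last busy slice is the short tail
    printf("task %d: [%d, %d)\n", task_id, task_offset, task_offset + count);
  }
  return 0;
}

Note also that DoTransposeFp16 now returns NNACL_ERR for more than six axes; the >6D case is expected to go through the threaded TransposeDimsFp16 instead, as the kernel changes below show.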
View File

@@ -24,8 +24,9 @@
#ifdef __cplusplus
extern "C" {
#endif
int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int *size, int *position);
void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape,
TransposeParameter *param, int task_id, int thread_num);
int DoTransposeFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape, TransposeParameter *param);
#ifdef __cplusplus
}
#endif

View File

@@ -171,13 +171,13 @@ void TransposeDim6Fp32(const float *in_data, float *out_data, const int *strides
}
}
void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, const int *size, int *position,
void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num) {
int *perm = transpose_param->perm_;
int *strides = transpose_param->strides_;
int *out_strides = transpose_param->out_strides_;
int num_axes = transpose_param->num_axes_;
size_t data_size = (*size) * output_shape[0];
size_t data_size = (*out_strides) * output_shape[0];
size_t offset_size = UP_DIV(data_size, thread_num);
size_t task_offset = offset_size * task_id;
int count = data_size - task_offset;
@@ -190,11 +190,11 @@ void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_
int output_idx = 0;
int input_idx = 0;
for (int i = 0; i < num_axes; ++i) {
*(position + i) = pos / *(size + i);
int position = pos / *(out_strides + i);
int out_stride = i < num_axes - 1 ? out_strides[i] : 1;
output_idx += (*(position + i) * out_stride);
input_idx += (*(position + i) * strides[perm[i]]);
pos -= *(position + i) * (*(size + i));
output_idx += (position * out_stride);
input_idx += (position * strides[perm[i]]);
pos -= position * (*(out_strides + i));
}
out_data[output_idx] = in_data[input_idx];
}

View File

@@ -26,7 +26,7 @@ extern "C" {
#endif
int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *param);
void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, const int *size, int *position,
void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
#ifdef __cplusplus
}

View File

@@ -220,13 +220,13 @@ int DoTransposeInt8(const int8_t *in_data, int8_t *out_data, const int *output_s
return NNACL_OK;
}
void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape, const int *size, int *position,
void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num) {
int *perm = transpose_param->perm_;
int *strides = transpose_param->strides_;
int *out_strides = transpose_param->out_strides_;
int num_axes = transpose_param->num_axes_;
size_t data_size = (*size) * output_shape[0];
size_t data_size = (*out_strides) * output_shape[0];
size_t offset_size = UP_DIV(data_size, thread_num);
size_t task_offset = offset_size * task_id;
int count = data_size - task_offset;
@@ -239,11 +239,11 @@ void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *outpu
int output_idx = 0;
int input_idx = 0;
for (int i = 0; i < num_axes; ++i) {
*(position + i) = pos / *(size + i);
int position = pos / *(out_strides + i);
int out_stride = i < num_axes - 1 ? out_strides[i] : 1;
output_idx += (*(position + i) * out_stride);
input_idx += (*(position + i) * strides[perm[i]]);
pos -= *(position + i) * (*(size + i));
output_idx += (position * out_stride);
input_idx += (position * strides[perm[i]]);
pos -= position * (*(out_strides + i));
}
out_data[output_idx] = in_data[input_idx];
}

View File

@@ -27,7 +27,7 @@ extern "C" {
int DoTransposeInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape,
TransposeParameter *transpose_param);
void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape, const int *size, int *position,
void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int task_id, int thread_num);
#ifdef __cplusplus
}

View File

@@ -127,18 +127,9 @@ void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, con
auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
const float block_size = 128.0;
size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
int dims = SizeToInt(axes_.size());
int *size = new int[dims];
size[dims - 1] = 1;
for (int i = dims - 1; i > 0; i--) {
size[i - 1] = size[i] * output_shape_[i];
}
int **position = new int *[thread_num];
for (size_t i = 0; i < thread_num; ++i) {
position[i] = new int[dims];
}
std::vector<common::Task> tasks;
std::function<void(const T *, T *, const int *, int *, int *, TransposeParameter *, int, int)> TransposeDims;
std::function<void(const T *, T *, const int *, TransposeParameter *, int, int)> TransposeDims;
if constexpr (std::is_same_v<T, int8_t>) {
TransposeDims = &TransposeDimsInt8;
} else if constexpr (std::is_same_v<T, int16_t>) {
@@ -162,15 +153,12 @@
}
for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) {
auto task = [&, task_id, thread_num]() {
TransposeDims(input_addr, output_addr, output_shape, size, position[task_id], &transpose_param_, task_id,
SizeToInt(thread_num));
TransposeDims(input_addr, output_addr, output_shape, &transpose_param_, task_id, SizeToInt(thread_num));
return common::SUCCESS;
};
tasks.emplace_back(task);
}
common::ThreadPool::GetInstance().SyncRun(tasks);
delete[] size;
delete[] position;
}
} // namespace kernel
} // namespace mindspore

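With the size/position parameters gone from the C entry points, ParallelRun no longer allocates per-thread scratch (the deleted new[]/delete[] pairs above). The remaining type dispatch binds the right C function at compile time via if constexpr; a reduced sketch of that pattern (two toy types only, hypothetical stubs standing in for the nnacl functions):

#include <cstdint>
#include <cstdio>
#include <functional>
#include <type_traits>

struct TransposeParameter;  // opaque in this sketch

// Stubs with the new six-argument shape: data in/out, output shape,
// parameter block, task_id, thread_num.
void TransposeDimsInt8(const int8_t *, int8_t *, const int *, TransposeParameter *, int, int) {}
void TransposeDimsInt32(const int32_t *, int32_t *, const int *, TransposeParameter *, int, int) {}

template <typename T>
void ParallelRunSketch(const T *in, T *out, const int *shape, TransposeParameter *param, int thread_num) {
  std::function<void(const T *, T *, const int *, TransposeParameter *, int, int)> TransposeDims;
  if constexpr (std::is_same_v<T, int8_t>) {
    TransposeDims = &TransposeDimsInt8;
  } else if constexpr (std::is_same_v<T, int32_t>) {
    TransposeDims = &TransposeDimsInt32;
  }
  for (int task_id = 0; task_id < thread_num; ++task_id) {
    TransposeDims(in, out, shape, param, task_id, thread_num);  // one task per thread
  }
}

int main() {
  int32_t in[4] = {1, 2, 3, 4}, out[4] = {0};
  int shape[2] = {2, 2};
  ParallelRunSketch<int32_t>(in, out, shape, nullptr, 2);
  printf("dispatched\n");
  return 0;
}

Here the plain loop stands in for the ThreadPool tasks; the real code wraps each call in a common::Task and hands the batch to SyncRun.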
View File

@@ -44,7 +44,7 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
return RET_ERROR;
}
PackNCHWToNHWCFp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
weight_tensor->Batch());
weight_tensor->Batch(), 0, 0);
bias_data_ = reinterpret_cast<float16_t *>(malloc(channel * sizeof(float16_t)));
if (bias_data_ == nullptr) {

View File

@@ -13,9 +13,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16/transpose_fp16.h"
#include <vector>
#include "nnacl/fp16/pack_fp16.h"
#include "nnacl/fp16/transpose_fp16.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
@@ -30,63 +30,19 @@ using mindspore::lite::RET_OP_EXECUTE_FAILURE;
using mindspore::schema::PrimitiveType_Transpose;
namespace mindspore::kernel {
int TransposeFp16CPUKernel::Init() {
if (!InferShapeDone()) {
return RET_OK;
}
return TransposeCPUKernel::ReSize();
void TransposeFp16CPUKernel::GetNchwToNhwcFunc() { NHNCTransposeFunc_ = PackNCHWToNHWCFp16; }
void TransposeFp16CPUKernel::GetNhwcToNchwFunc() { NHNCTransposeFunc_ = PackNHWCToNCHWFp16; }
int TransposeFp16CPUKernel::TransposeDim2to6() {
return DoTransposeFp16(static_cast<const float16_t *>(in_data_), static_cast<float16_t *>(out_data_), out_shape_,
param_);
}
int TransposeFp16CPUKernel::Run() {
MS_ASSERT(in_tensors_.size() == 1 || in_tensors_.size() == 2);
TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
param->data_size_ = in_tensors_[0]->Size();
MS_ASSERT(out_tensors_.size() == 1);
auto &in_tensor = in_tensors_.front();
auto &out_tensor = out_tensors_.front();
if (in_tensor == nullptr || out_tensor == nullptr) {
MS_LOG(ERROR) << "null pointer referencing.";
return RET_ERROR;
}
in_data_fp16_ = reinterpret_cast<float16_t *>(in_tensor->MutableData());
out_data_fp16_ = reinterpret_cast<float16_t *>(out_tensor->MutableData());
MS_ASSERT(in_data_fp16_);
MS_ASSERT(out_data_fp16_);
if (in_tensor->shape().size() != static_cast<size_t>(param->num_axes_)) {
memcpy(out_data_fp16_, in_data_fp16_, in_tensor->ElementsNum() * sizeof(float16_t));
return RET_OK;
}
int dims = out_tensor->shape().size();
if (dims > DIMENSION_6D) {
dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
if (dim_size_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
return RET_ERROR;
}
position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
if (position_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
context_->allocator->Free(dim_size_);
dim_size_ = nullptr;
return RET_ERROR;
}
}
MS_ASSERT(out_shape_);
auto ret = Fp16DoTranspose(in_data_fp16_, out_data_fp16_, out_shape_, param, dim_size_, position_);
if (dims > DIMENSION_6D) {
context_->allocator->Free(dim_size_);
context_->allocator->Free(position_);
dim_size_ = nullptr;
position_ = nullptr;
}
if (ret != RET_OK) {
MS_LOG(ERROR) << "Transpose run failed";
return RET_ERROR;
}
return ret;
int TransposeFp16CPUKernel::TransposeDimGreaterThan6(int task_id) {
TransposeDimsFp16(static_cast<const float16_t *>(in_data_), static_cast<float16_t *>(out_data_), out_shape_, param_,
task_id, op_parameter_->thread_num_);
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Transpose, LiteKernelCreator<TransposeFp16CPUKernel>)

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP16_TRANSPOSE_FP16_H_
#define MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP16_TRANSPOSE_FP16_H_
@@ -32,12 +31,11 @@ class TransposeFp16CPUKernel : public TransposeCPUKernel {
: TransposeCPUKernel(param, inputs, outputs, ctx) {}
~TransposeFp16CPUKernel() = default;
int Init() override;
int Run() override;
private:
float16_t *in_data_fp16_ = nullptr;
float16_t *out_data_fp16_ = nullptr;
void GetNchwToNhwcFunc() override;
void GetNhwcToNchwFunc() override;
int TransposeDim2to6() override;
int TransposeDimGreaterThan6(int task_id) override;
};
} // namespace mindspore::kernel

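The fp16 kernel shrinks to four overrides because the commit turns the fp32 TransposeCPUKernel into a template method: the base class owns Run()'s control flow and the subclass swaps only the data-type-specific pieces. A simplified sketch of the shape of that refactor (toy classes, not the real kernel hierarchy):

#include <cstdio>

struct TransposeBase {
  virtual ~TransposeBase() = default;
  virtual int TransposeDim2to6() { printf("fp32 2D-6D path\n"); return 0; }
  virtual int TransposeDimGreaterThan6(int task_id) { printf("fp32 >6D slice %d\n", task_id); return 0; }
  int Run(int dims, int thread_num) {
    if (dims <= 6) return TransposeDim2to6();
    // stands in for ParallelLaunch: each task gets its task_id
    for (int t = 0; t < thread_num; ++t) TransposeDimGreaterThan6(t);
    return 0;
  }
};

struct TransposeFp16 : TransposeBase {
  int TransposeDim2to6() override { printf("fp16 2D-6D path\n"); return 0; }
  int TransposeDimGreaterThan6(int task_id) override { printf("fp16 >6D slice %d\n", task_id); return 0; }
};

int main() {
  TransposeFp16 kernel;
  kernel.Run(4, 2);  // base control flow, fp16 2D-6D implementation
  kernel.Run(7, 2);  // base control flow, threaded fp16 N-D path
  return 0;
}

This is what lets the old fp16 Run(), with all its manual buffer management, be deleted outright.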
View File

@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp32/transpose_fp32.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
@@ -36,16 +35,15 @@ int TransposeCPUKernel::Init() {
}
int TransposeCPUKernel::ReSize() {
TransposeParameter *param = reinterpret_cast<TransposeParameter *>(op_parameter_);
if (in_tensors_.size() == 2) {
param->num_axes_ = in_tensors_.at(1)->ElementsNum();
param_->num_axes_ = in_tensors_.at(1)->ElementsNum();
}
int trans3d[3] = {0, 2, 1};
int *perm_data = nullptr;
auto input_tensor = in_tensors_.at(kInputIndex);
if (input_tensor->shape().size() != static_cast<size_t>(param->num_axes_)) {
if (input_tensor->shape().size() == 3 && param->num_axes_ == 4) {
param->num_axes_ = 3;
if (input_tensor->shape().size() != static_cast<size_t>(param_->num_axes_)) {
if (input_tensor->shape().size() == 3 && param_->num_axes_ == 4) {
param_->num_axes_ = 3;
perm_data = trans3d;
} else {
return RET_OK;
@@ -55,21 +53,19 @@ int TransposeCPUKernel::ReSize() {
auto perm_tensor = in_tensors_.at(1);
perm_data = reinterpret_cast<int *>(perm_tensor->data_c());
}
// set perm data
MS_ASSERT(perm_data != nullptr);
for (int i = 0; i < param->num_axes_; ++i) {
param->perm_[i] = perm_data[i];
for (int i = 0; i < param_->num_axes_; ++i) {
param_->perm_[i] = perm_data[i];
}
auto &inTensor = in_tensors_.front();
auto &outTensor = out_tensors_.front();
auto in_shape = inTensor->shape();
auto out_shape = outTensor->shape();
param->strides_[param->num_axes_ - 1] = 1;
param->out_strides_[param->num_axes_ - 1] = 1;
param->data_size_ = inTensor->Size();
for (int i = param->num_axes_ - 2; i >= 0; i--) {
param->strides_[i] = in_shape.at(i + 1) * param->strides_[i + 1];
param->out_strides_[i] = out_shape.at(i + 1) * param->out_strides_[i + 1];
param_->strides_[param_->num_axes_ - 1] = 1;
param_->out_strides_[param_->num_axes_ - 1] = 1;
param_->data_size_ = inTensor->Size();
for (int i = param_->num_axes_ - 2; i >= 0; i--) {
param_->strides_[i] = in_shape.at(i + 1) * param_->strides_[i + 1];
param_->out_strides_[i] = out_shape.at(i + 1) * param_->out_strides_[i + 1];
}
if (this->out_shape_ != nullptr) {
@@ -92,35 +88,49 @@ TransposeCPUKernel::~TransposeCPUKernel() {
}
}
void TransposeCPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor,
TransposeParameter *param) {
void TransposeCPUKernel::GetNchwToNhwcFunc() { NHNCTransposeFunc_ = PackNCHWToNHWCFp32; }
void TransposeCPUKernel::GetNhwcToNchwFunc() { NHNCTransposeFunc_ = PackNHWCToNCHWFp32; }
int TransposeCPUKernel::TransposeDim2to6() {
return DoTransposeFp32(static_cast<const float *>(in_data_), static_cast<float *>(out_data_), out_shape_, param_);
}
int TransposeCPUKernel::TransposeDimGreaterThan6(int task_id) {
TransposeDimsFp32(static_cast<const float *>(in_data_), static_cast<float *>(out_data_), out_shape_, param_, task_id,
op_parameter_->thread_num_);
return RET_OK;
}
void TransposeCPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor) {
if (in_tensor->shape().size() != 4) {
return;
}
auto out_shape = out_tensor->shape();
if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 &&
param->perm_[3] == 1) {
if (param_->perm_[0] == 0 && param_->perm_[1] == 2 && param_->perm_[2] == 3 && param_->perm_[3] == 1) {
nhnc_param_[0] = out_shape[0];
nhnc_param_[1] = out_shape[1] * out_shape[2];
nhnc_param_[2] = out_shape[3];
if (in_tensor->data_type() == kNumberTypeFloat32) {
NHNCTransposeFunc_ = PackNCHWToNHWCFp32;
GetNchwToNhwcFunc();
}
}
if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 &&
param->perm_[3] == 2) {
if (param_->perm_[0] == 0 && param_->perm_[1] == 3 && param_->perm_[2] == 1 && param_->perm_[3] == 2) {
nhnc_param_[0] = out_shape[0];
nhnc_param_[1] = out_shape[2] * out_shape[3];
nhnc_param_[2] = out_shape[1];
if (in_tensor->data_type() == kNumberTypeFloat32) {
NHNCTransposeFunc_ = PackNHWCToNCHWFp32;
GetNhwcToNchwFunc();
}
}
}
int TransposeCPUKernel::RunImpl(int task_id) {
if (NHNCTransposeFunc_ != nullptr) {
NHNCTransposeFunc_(in_data_, out_data_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2], task_id, thread_count_);
NHNCTransposeFunc_(in_data_, out_data_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2], task_id,
op_parameter_->thread_num_);
} else {
TransposeDimsFp32(in_data_, out_data_, out_shape_, dim_size_, position_ + dims_ * task_id, param_, task_id,
thread_count_);
return TransposeDimGreaterThan6(task_id);
}
return RET_OK;
}
@@ -143,63 +153,26 @@ int TransposeCPUKernel::Run() {
MS_LOG(ERROR) << "null pointer dreferencing.";
return RET_ERROR;
}
in_data_ = reinterpret_cast<float *>(in_tensor->MutableData());
out_data_ = reinterpret_cast<float *>(out_tensor->MutableData());
in_data_ = in_tensor->data_c();
out_data_ = out_tensor->data_c();
MS_ASSERT(in_data_);
MS_ASSERT(out_data_);
param_ = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
if (in_tensor->shape().size() != static_cast<size_t>(param_->num_axes_)) {
memcpy(out_data_, in_data_, in_tensor->ElementsNum() * sizeof(float));
memcpy(out_data_, in_data_, in_tensor->Size());
return RET_OK;
}
thread_count_ = op_parameter_->thread_num_;
GetNHNCTransposeFunc(in_tensor, out_tensor, param_);
GetNHNCTransposeFunc(in_tensor, out_tensor);
if (NHNCTransposeFunc_ != nullptr) {
auto ret = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(TransposeImpl, this, thread_count_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "NHNCTransposeFunc_ is error!";
}
return ret;
return static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(TransposeImpl, this, op_parameter_->thread_num_);
}
MS_ASSERT(out_shape_);
dims_ = out_tensor->shape().size();
if (dims_ > DIMENSION_6D) {
dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims_ * sizeof(int)));
if (dim_size_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
return RET_NULL_PTR;
}
*(dim_size_ + dims_ - 1) = 1;
for (int i = dims_ - 1; i > 0; --i) {
*(dim_size_ + i - 1) = *(dim_size_ + i) * out_shape_[i];
}
position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims_ * sizeof(int) * thread_count_));
if (position_ == nullptr) {
context_->allocator->Free(dim_size_);
MS_LOG(ERROR) << "Malloc data failed";
return RET_NULL_PTR;
}
}
int ret;
if (dims_ > DIMENSION_6D) {
ret = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(TransposeImpl, this, thread_count_);
if (out_tensor->shape().size() <= DIMENSION_6D) {
return TransposeDim2to6();
} else {
ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param_);
return static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(TransposeImpl, this, op_parameter_->thread_num_);
}
if (dims_ > DIMENSION_6D) {
context_->allocator->Free(dim_size_);
context_->allocator->Free(position_);
dim_size_ = nullptr;
position_ = nullptr;
}
if (ret != RET_OK) {
MS_LOG(ERROR) << "Transpose run failed";
}
return ret;
}
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>)

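Both branches of the new Run() funnel through ParallelLaunch with TransposeImpl as the trampoline, so every path (NHWC/NCHW fast pack and the >6D generic transpose) is sliced by task_id and no shared scratch is needed. A minimal sketch of that callback shape (std::thread stands in for the lite thread pool, which is an assumption of this sketch):

#include <cstdio>
#include <thread>
#include <vector>

struct Kernel {
  int RunImpl(int task_id) {
    printf("slice %d\n", task_id);  // each worker transposes its own slice
    return 0;
  }
};

// C-style trampoline with the (void *cdata, int task_id) shape that the
// ParallelLaunch callbacks in this commit use (compare TransposeInt8Run below).
int TransposeImpl(void *cdata, int task_id) {
  return static_cast<Kernel *>(cdata)->RunImpl(task_id);
}

int main() {
  Kernel kernel;
  const int thread_num = 3;
  std::vector<std::thread> workers;
  for (int t = 0; t < thread_num; ++t) {
    workers.emplace_back([&kernel, t] { TransposeImpl(&kernel, t); });
  }
  for (auto &w : workers) w.join();
  return 0;
}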
View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_H_
#define MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_H_
@@ -33,7 +32,9 @@ class TransposeCPUKernel : public InnerKernel {
public:
explicit TransposeCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(param, inputs, outputs, ctx) {}
: InnerKernel(param, inputs, outputs, ctx) {
param_ = reinterpret_cast<TransposeParameter *>(param);
}
~TransposeCPUKernel() override;
int Init() override;
@@ -42,17 +43,18 @@ class TransposeCPUKernel : public InnerKernel {
int RunImpl(int task_id);
protected:
void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param);
float *in_data_ = nullptr;
float *out_data_ = nullptr;
virtual void GetNchwToNhwcFunc();
virtual void GetNhwcToNchwFunc();
virtual int TransposeDim2to6();
virtual int TransposeDimGreaterThan6(int task_id);
void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor);
void *in_data_ = nullptr;
void *out_data_ = nullptr;
int *out_shape_ = nullptr;
int *dim_size_ = nullptr;
int *position_ = nullptr;
TransposeParameter *param_ = nullptr;
TransposeFunc NHNCTransposeFunc_ = nullptr;
int thread_count_ = 0;
int nhnc_param_[3] = {0};
int dims_ = 0;
};
} // namespace mindspore::kernel

View File

@@ -41,47 +41,6 @@ int TransposeInt8Run(void *cdata, int task_id) {
return RET_OK;
}
void TransposeInt8CPUKernel::FreeTmpBuf() {
if (!extra_dims_) {
return;
}
if (dim_size_ != nullptr) {
context_->allocator->Free(dim_size_);
dim_size_ = nullptr;
}
if (position_ != nullptr) {
context_->allocator->Free(position_);
position_ = nullptr;
}
return;
}
int TransposeInt8CPUKernel::MallocTmpBuf() {
if (!extra_dims_) {
return RET_OK;
}
int dims = out_tensors_.at(0)->shape().size();
dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
if (dim_size_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
return RET_ERROR;
}
*(dim_size_ + dims - 1) = 1;
for (int i = dims - 1; i > 0; --i) {
*(dim_size_ + i - 1) = *(dim_size_ + i) * out_shape_[i];
}
position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int) * op_parameter_->thread_num_));
if (position_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
context_->allocator->Free(dim_size_);
dim_size_ = nullptr;
return RET_ERROR;
}
return RET_OK;
}
int TransposeInt8CPUKernel::ReSize() {
auto in_tensor = in_tensors_.front();
auto out_tensor = out_tensors_.front();
@@ -105,20 +64,16 @@ int TransposeInt8CPUKernel::ReSize() {
transpose_param_->strides_[i] = in_shape.at(i + 1) * transpose_param_->strides_[i + 1];
transpose_param_->out_strides_[i] = out_shape.at(i + 1) * transpose_param_->out_strides_[i + 1];
}
extra_dims_ = out_shape.size() > DIMENSION_6D;
return RET_OK;
}
int TransposeInt8CPUKernel::DoTranspose(int task_id) {
int dims = out_tensors_.at(0)->shape().size();
MS_ASSERT(in_ptr_);
MS_ASSERT(out_ptr_);
MS_ASSERT(in_shape_);
MS_ASSERT(out_shape_);
MS_ASSERT(transpose_param_);
TransposeDimsInt8(in_ptr_, out_ptr_, out_shape_, dim_size_, position_ + dims * task_id, transpose_param_, task_id,
op_parameter_->thread_num_);
TransposeDimsInt8(in_ptr_, out_ptr_, out_shape_, transpose_param_, task_id, op_parameter_->thread_num_);
return RET_OK;
}
@@ -158,22 +113,12 @@ int TransposeInt8CPUKernel::Run() {
memcpy(in_shape_, in_dims.data(), in_dims.size() * sizeof(int));
memcpy(out_shape_, out_dims.data(), out_dims.size() * sizeof(int));
int ret = MallocTmpBuf();
if (ret != RET_OK) {
MS_LOG(ERROR) << "MallocTmpBuf error_code[" << ret << "]";
}
if (extra_dims_) {
ret = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(TransposeInt8Run, this, op_parameter_->thread_num_);
if (out_tensor->shape().size() > DIMENSION_6D) {
return static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(TransposeInt8Run, this, op_parameter_->thread_num_);
} else {
ret = DoTransposeInt8(in_ptr_, out_ptr_, out_shape_, transpose_param_);
return DoTransposeInt8(in_ptr_, out_ptr_, out_shape_, transpose_param_);
}
if (ret != RET_OK) {
MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]";
}
FreeTmpBuf();
return ret;
}
REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Transpose, LiteKernelCreator<TransposeInt8CPUKernel>)

View File

@@ -43,19 +43,12 @@ class TransposeInt8CPUKernel : public InnerKernel {
public:
int DoTranspose(int task_id);
private:
int MallocTmpBuf();
void FreeTmpBuf();
private:
void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param);
TransposeParameter *transpose_param_;
TransposeFunc NHNCTransposeFunc_ = nullptr;
int8_t *in_ptr_ = nullptr;
int8_t *out_ptr_ = nullptr;
int *dim_size_ = nullptr;
int *position_ = nullptr;
bool extra_dims_ = false;
int in_shape_[20] = {0};
int out_shape_[20] = {0};
int nhnc_param_[3] = {0};