forked from mindspore-Ecosystem/mindspore
run mobilenet_v2 success
parent a3d4dded12
commit d45b5b5126
src/lite_kernel.h
@@ -95,6 +95,8 @@ class LiteKernel {
   virtual int Init() { return mindspore::lite::RET_ERROR; }

+  OpParameter *op_parameter() { return op_parameter_; }
+
   std::string name() const { return this->name_; }

   virtual int Train() {
src/lite_session.cc
@@ -479,12 +479,6 @@ int LiteSession::Init(const Context *context) {
     is_running_.store(false);
     return ret;
   }
-  ret = InitNPURuntime();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init NPU runtime failed.";
-    is_running_.store(false);
-    return ret;
-  }
   executor_ = new (std::nothrow) Executor();
   if (nullptr == executor_) {
     MS_LOG(ERROR) << "New Executor failed";
@@ -661,18 +655,6 @@ int LiteSession::Resize(const std::vector<mindspore::tensor::MSTensor *> &inputs
   return RET_OK;
 }

-int LiteSession::InitNPURuntime() {
-#if SUPPORT_NPU
-  if (this->context_->IsNpuEnabled()) {
-    if (mindspore::lite::NPUManager::GetInstance()->InitClient() != RET_OK) {
-      MS_LOG(ERROR) << "NPU client init error.";
-      return RET_ERROR;
-    }
-  }
-#endif
-  return RET_OK;
-}
-
 int LiteSession::InitGPURuntime() {
 #if SUPPORT_GPU
   if (this->context_->IsGpuEnabled()) {
src/lite_session.h
@@ -103,8 +103,6 @@ class LiteSession : public session::LiteSession {
  private:
   void ResetInputsShape(const std::vector<std::vector<int>> &dims);

-  int InitNPURuntime();
-
   int InitGPURuntime();

  protected:
src/runtime/agent/npu/npu_executor.cc
@@ -17,10 +17,9 @@
 #include "src/runtime/agent/npu/npu_executor.h"
 #include "include/errorcode.h"
 #include "src/runtime/agent/npu/npu_manager.h"
-#include "nnacl/pack.h"
 namespace mindspore::lite {
 int NPUExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels) {
-  this->client_ = mindspore::lite::NPUManager::GetInstance()->GetClient();
+  this->client_ = mindspore::lite::NPUManager::GetInstance()->GetClient(model_name_);
   if (this->client_ == nullptr) {
     MS_LOG(ERROR) << "client is nullptr.";
     return RET_ERROR;
@@ -33,9 +32,8 @@ int NPUExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels) {
 }

 int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
-                     const std::vector<kernel::LiteKernel *> &kernels, const std::vector<bool> &inputs_nhwc2nchw,
-                     const std::vector<bool> &outputs_nchw2nhwc, Allocator *allocator, const KernelCallBack &before,
-                     const KernelCallBack &after) {
+                     const std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator,
+                     const KernelCallBack &before, const KernelCallBack &after) {
   hiai::AiContext context;
   for (int i = 0; i < npu_input_tensors_.size(); ++i) {
     void *data = in_tensors[i]->data_c();
@@ -43,12 +41,7 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<
       MS_LOG(ERROR) << model_name_ << " inputs data is nullptr";
       return RET_ERROR;
     }
-    if (inputs_nhwc2nchw[i]) {
-      PackNHWCToNCHWFp32(data, npu_input_tensors_[i]->GetBuffer(), in_tensors[i]->Batch(),
-                         in_tensors[i]->Width() * in_tensors[i]->Height(), in_tensors[i]->Channel());
-    } else {
-      memcpy(npu_input_tensors_[i]->GetBuffer(), data, in_tensors[i]->Size());
-    }
+    memcpy(npu_input_tensors_[i]->GetBuffer(), data, in_tensors[i]->Size());
   }
   context.AddPara("model_name", model_name_);
   if (this->client_ == nullptr) {
@@ -68,12 +61,7 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<
       MS_LOG(ERROR) << "Malloc buffer failed.";
       return RET_ERROR;
     }
-    if (outputs_nchw2nhwc[i]) {
-      PackNCHWToNHWCFp32(npu_output_tensors_[i]->GetBuffer(), data, out_tensors[i]->Batch(),
-                         out_tensors[i]->Width() * out_tensors[i]->Height(), out_tensors[i]->Channel());
-    } else {
-      memcpy(data, npu_output_tensors_[i]->GetBuffer(), npu_output_tensors_[i]->GetSize());
-    }
+    memcpy(data, npu_output_tensors_[i]->GetBuffer(), npu_output_tensors_[i]->GetSize());
     out_tensors[i]->ResetRefCount();
   }
   return RET_OK;
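The branches deleted above delegated layout conversion to nnacl's PackNHWCToNCHWFp32 / PackNCHWToNHWCFp32; after this commit the executor copies buffers verbatim and the transform/fusion passes added below arrange layouts instead. For reference, a minimal standalone sketch of what such a pack does, assuming plane = H * W as the old call sites computed it:

#include <cassert>

// NHWC element (p, c) of one batch maps to NCHW element (c, p), p in [0, H*W).
void PackNHWCToNCHW(const float *src, float *dst, int batch, int plane, int channel) {
  for (int b = 0; b < batch; ++b) {
    const float *src_b = src + b * plane * channel;
    float *dst_b = dst + b * plane * channel;
    for (int p = 0; p < plane; ++p) {
      for (int c = 0; c < channel; ++c) {
        dst_b[c * plane + p] = src_b[p * channel + c];  // gather one channel plane
      }
    }
  }
}

int main() {
  const float nhwc[6] = {0, 1, 2, 3, 4, 5};  // batch 1, plane 2, channel 3
  float nchw[6] = {0};
  PackNHWCToNCHW(nhwc, nchw, 1, 2, 3);
  assert(nchw[1] == 3 && nchw[2] == 1 && nchw[5] == 5);  // yields {0, 3, 1, 4, 2, 5}
  return 0;
}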
src/runtime/agent/npu/npu_executor.h
@@ -32,8 +32,7 @@ class NPUExecutor : public Executor {
   int Prepare(const std::vector<kernel::LiteKernel *> &kernels) override;

   int Run(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
-          const std::vector<kernel::LiteKernel *> &kernels, const std::vector<bool> &inputs_nhwc2nchw,
-          const std::vector<bool> &outputs_nchw2nhwc, Allocator *allocator = nullptr,
+          const std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator = nullptr,
           const KernelCallBack &before = nullptr, const KernelCallBack &after = nullptr);

  private:
src/runtime/agent/npu/npu_fusion_pass.cc (new file)
@@ -0,0 +1,224 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "src/runtime/agent/npu/npu_fusion_pass.h"
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/concat_parameter.h"

namespace mindspore::lite {
bool CheckFusion(kernel::LiteKernel *kernel) {
  auto pre_flag =
    std::all_of(kernel->in_kernels().begin(), kernel->in_kernels().end(), [](const kernel::LiteKernel *kernel) {
      return kernel->Type() == schema::PrimitiveType_Nchw2Nhwc && kernel->out_kernels().size() == 1;
    });
  if (!pre_flag) {
    return false;
  }
  auto post_flag =
    std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(), [](const kernel::LiteKernel *kernel) {
      return kernel->Type() == schema::PrimitiveType_Nhwc2Nchw && kernel->in_kernels().size() == 1;
    });
  return post_flag;
}

void NPUFusionPass::UpdatePreKernels(kernel::LiteKernel *cur_kernel) {
  for (auto in_kernel : cur_kernel->in_kernels()) {
    auto pre_kernel = in_kernel->in_kernels()[0];

    auto pre_out_kernels = pre_kernel->out_kernels();
    for (size_t i = 0; i < pre_out_kernels.size(); i++) {
      if (pre_out_kernels[i] == in_kernel) {
        pre_out_kernels[i] = cur_kernel;
        break;
      }
    }
    pre_kernel->set_out_kernels(pre_out_kernels);

    auto cur_in_kernels = cur_kernel->in_kernels();
    for (size_t i = 0; i < cur_in_kernels.size(); i++) {
      if (cur_in_kernels[i] == in_kernel) {
        cur_in_kernels[i] = pre_kernel;
        break;
      }
    }
    cur_kernel->set_in_kernels(cur_in_kernels);
    kernels->erase(find(kernels->begin(), kernels->end(), in_kernel));
  }
}

void NPUFusionPass::UpdatePostKernels(kernel::LiteKernel *cur_kernel) {
  for (auto out_kernel : cur_kernel->out_kernels()) {
    auto post_kernel = out_kernel->out_kernels()[0];

    auto post_in_kernels = post_kernel->in_kernels();
    for (size_t i = 0; i < post_in_kernels.size(); i++) {
      if (post_in_kernels[i] == out_kernel) {
        post_in_kernels[i] = cur_kernel;
        break;
      }
    }
    post_kernel->set_in_kernels(post_in_kernels);

    auto cur_out_kernels = cur_kernel->out_kernels();
    for (size_t i = 0; i < cur_out_kernels.size(); i++) {
      if (cur_out_kernels[i] == out_kernel) {
        cur_out_kernels[i] = post_kernel;
        break;
      }
    }
    cur_kernel->set_out_kernels(cur_out_kernels);
    kernels->erase(find(kernels->begin(), kernels->end(), out_kernel));
  }
}

void UpdatePreTensors(kernel::LiteKernel *cur_kernel) {
  auto tensors_vec = cur_kernel->in_tensors();
  for (auto in_kernel : cur_kernel->in_kernels()) {
    lite::Tensor *cur_tensor = nullptr;
    auto in_tensor = in_kernel->in_tensors()[0];
    auto out_tensor = in_kernel->out_tensors()[0];
    auto pre_kernel = in_kernel->in_kernels()[0];
    for (size_t i = 0; i < pre_kernel->out_tensors().size(); i++) {
      if (pre_kernel->out_tensors()[i] == in_tensor) {
        cur_tensor = pre_kernel->out_tensors()[i];
      }
    }
    for (size_t i = 0; i < tensors_vec.size(); i++) {
      if (tensors_vec[i] == out_tensor) {
        tensors_vec[i] = cur_tensor;
      }
    }
  }
  cur_kernel->set_in_tensors(tensors_vec);
}

void UpdatePostTensors(kernel::LiteKernel *cur_kernel) {
  auto tensors_vec = cur_kernel->out_tensors();
  for (auto out_kernel : cur_kernel->out_kernels()) {
    auto in_tensor = out_kernel->in_tensors()[0];
    auto out_tensor = out_kernel->out_tensors()[0];
    auto post_kernel = out_kernel->out_kernels()[0];
    lite::Tensor *cur_tensor = nullptr;
    for (size_t i = 0; i < post_kernel->in_tensors().size(); i++) {
      if (post_kernel->in_tensors()[i] == out_tensor) {
        cur_tensor = post_kernel->in_tensors()[i];
      }
    }
    for (size_t i = 0; i < tensors_vec.size(); i++) {
      if (tensors_vec[i] == in_tensor) {
        tensors_vec[i] = cur_tensor;
      }
    }
  }
  cur_kernel->set_out_tensors(tensors_vec);
}

int TransFormAxis(int axis) {
  switch (axis) {
    case 0:
      return 0;
    case 1:
      return 2;
    case 2:
      return 3;
    case 3:
    case -1:
      return 1;
    default:
      return -2;
  }
}

int NPUFusionPass::AddFusion(kernel::LiteKernel *kernel) {
  if (!CheckFusion(kernel)) {
    return RET_OK;
  }
  UpdatePreTensors(kernel);
  UpdatePostTensors(kernel);
  UpdatePreKernels(kernel);
  UpdatePostKernels(kernel);
  return RET_OK;
}

int NPUFusionPass::ConcatFusion(kernel::LiteKernel *kernel) {
  if (!CheckFusion(kernel)) {
    return RET_OK;
  }
  UpdatePreTensors(kernel);
  UpdatePostTensors(kernel);
  UpdatePreKernels(kernel);
  UpdatePostKernels(kernel);
  auto concat_param = reinterpret_cast<ConcatParameter *>(kernel->op_parameter());
  concat_param->axis_ = TransFormAxis(concat_param->axis_);
  return RET_OK;
}

int NPUFusionPass::FormatFusion(kernel::LiteKernel *kernel) {
  if (kernel->out_kernels().empty()) {
    return RET_OK;
  }
  if (!std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(), [](const kernel::LiteKernel *kernel) {
        return kernel->Type() == schema::PrimitiveType_Nhwc2Nchw;
      })) {
    return RET_OK;
  }
  auto pre_kernel = kernel->in_kernels()[0];

  auto pre_out_kernels = pre_kernel->out_kernels();
  for (size_t i = 0; i < pre_out_kernels.size(); i++) {
    if (pre_out_kernels[i] == kernel) {
      pre_out_kernels.erase(pre_out_kernels.begin() + i);
      break;
    }
  }
  for (const auto &nc2nh : kernel->out_kernels()) {
    for (const auto &post_kernel : nc2nh->out_kernels()) {
      auto post_in_kernels = post_kernel->in_kernels();
      for (size_t i = 0; i < post_in_kernels.size(); i++) {
        if (post_in_kernels[i] == nc2nh) {
          post_in_kernels[i] = pre_kernel;
          break;
        }
      }
      post_kernel->set_in_kernels(post_in_kernels);
      pre_out_kernels.push_back(post_kernel);
    }
    kernels->erase(find(kernels->begin(), kernels->end(), nc2nh));
  }
  pre_kernel->set_out_kernels(pre_out_kernels);
  kernels->erase(find(kernels->begin(), kernels->end(), kernel));
  return RET_OK;
}

int NPUFusionPass::Fusion() {
  for (auto kernel : *kernels) {
    switch (kernel->Type()) {
      case schema::PrimitiveType_Concat:
        ConcatFusion(kernel);
        continue;
      case schema::PrimitiveType_Add:
        AddFusion(kernel);
        continue;
      case schema::PrimitiveType_Nchw2Nhwc:
        FormatFusion(kernel);
        continue;
      default:
        continue;
    }
  }
  return RET_OK;
}
}  // namespace mindspore::lite
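The pass removes Nchw2Nhwc/Nhwc2Nchw pairs that sandwich an Add or Concat, letting the op consume NCHW data directly; Concat additionally gets its axis remapped by TransFormAxis. A standalone restatement of that mapping with a few checks (demo code, not part of the commit):

#include <cassert>

// NHWC axis -> NCHW axis: N(0)->0, H(1)->2, W(2)->3, C(3 or -1)->1;
// anything else -> -2 as an "unsupported" sentinel.
int TransFormAxisDemo(int axis) {
  switch (axis) {
    case 0: return 0;
    case 1: return 2;
    case 2: return 3;
    case 3:
    case -1: return 1;
    default: return -2;
  }
}

int main() {
  assert(TransFormAxisDemo(3) == 1);   // channel concat: NHWC axis 3 -> NCHW axis 1
  assert(TransFormAxisDemo(-1) == 1);  // -1 also means "last axis", i.e. channels
  assert(TransFormAxisDemo(1) == 2);   // height moves from axis 1 to axis 2
  assert(TransFormAxisDemo(4) == -2);  // out-of-range axes are rejected
  return 0;
}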
src/runtime/agent/npu/npu_fusion_pass.h (new file)
@@ -0,0 +1,40 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_FUSION_PASS_H_
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_FUSION_PASS_H_
#include <vector>
#include "src/lite_kernel.h"
#include "src/ops/primitive_c.h"
namespace mindspore::lite {
class NPUFusionPass {
 public:
  explicit NPUFusionPass(std::vector<kernel::LiteKernel *> *dst_kernels) { kernels = dst_kernels; }
  ~NPUFusionPass() = default;
  int Fusion();

 protected:
  int ConcatFusion(kernel::LiteKernel *kernel);
  int AddFusion(kernel::LiteKernel *kernel);
  int FormatFusion(kernel::LiteKernel *kernel);
  void UpdatePreKernels(kernel::LiteKernel *kernel);
  void UpdatePostKernels(kernel::LiteKernel *kernel);

 private:
  std::vector<kernel::LiteKernel *> *kernels;
};
}  // namespace mindspore::lite
#endif  // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_FUSION_PASS_H_
src/runtime/agent/npu/npu_manager.cc
@@ -15,21 +15,59 @@
  */

 #include "src/runtime/agent/npu/npu_manager.h"
 #include <sys/system_properties.h>
 #include <sys/fcntl.h>
 #include <unistd.h>
 #include "include/hiai_ir_build.h"
 #include "include/HiAiModelManagerService.h"
 #include "include/errorcode.h"
 #include "include/graph/op/all_ops.h"
 #include "src/common/file_utils.h"

 namespace mindspore::lite {

-bool NPUManager::IsSupportNPU() {
-  if (!is_npu_check_executor) {
-    CheckSupportNPU();
-  }
-  if (is_support_npu) {
+#define MAX_MODEL_NUM 20
+int NPUManager::CompareVersion(const string &version1, const string &version2) {
+  std::istringstream iss1(version1);
+  std::istringstream iss2(version2);
+  string string1;
+  string string2;
+  while (!iss1.eof() || !iss2.eof()) {
+    getline(iss1, string1, '.');
+    getline(iss2, string2, '.');
+    if (stoi(string1) > stoi(string2)) return 1;
+    if (stoi(string1) < stoi(string2)) return -1;
+    string1 = string2 = "0";
+  }
+  return 0;
+}
+
+bool NPUManager::CheckEMUIVersion() {
+  char emui[128] = {0x00};
+  __system_property_get("ro.build.version.emui", emui);
+  std::string emui_str = emui;
+  int pos = emui_str.find('_');
+  if (pos != std::string::npos) {
+    auto version = emui_str.substr(pos + 1);
+    int ret = CompareVersion(version, "11.0.0");
+    if (ret < 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool NPUManager::CheckDDKVersion() {
+  auto client = std::make_shared<hiai::AiModelMngerClient>();
+  if (client->GetVersion() != nullptr) {
+    std::string version = client->GetVersion();
+    int ret = CompareVersion(version, "100.330.010.011");
+    if (ret < 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool NPUManager::IsSupportNPU() {
+  if (IsKirinChip() && CheckEMUIVersion() && CheckDDKVersion()) {
+    MS_LOG(INFO) << "The current device support NPU.";
+    return true;
+  } else {
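CompareVersion compares dotted version strings segment by segment as integers, treating missing segments as 0; CheckEMUIVersion feeds it the substring after '_' in the EMUI property (a value shaped like "EmotionUI_11.0.0" is assumed here for illustration). An equivalent standalone sketch, assuming numeric segments:

#include <sstream>
#include <string>

// Returns 1 if v1 > v2, -1 if v1 < v2, 0 if equal; the shorter version is
// padded with "0" segments, so "11.0" == "11.0.0".
int CompareVersionDemo(const std::string &v1, const std::string &v2) {
  std::istringstream iss1(v1), iss2(v2);
  std::string s1, s2;
  while (!iss1.eof() || !iss2.eof()) {
    s1 = "0";
    s2 = "0";
    std::getline(iss1, s1, '.');  // leaves "0" in place once the stream is exhausted
    std::getline(iss2, s2, '.');
    if (std::stoi(s1) > std::stoi(s2)) return 1;
    if (std::stoi(s1) < std::stoi(s2)) return -1;
  }
  return 0;  // e.g. CompareVersionDemo("100.330.010.011", "100.330.10.11") == 0
}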
@@ -38,36 +76,6 @@ bool NPUManager::IsSupportNPU() {
   }
 }

-std::string NPUManager::GetExecutorPath() {
-  std::string executor_path;
-  char cmdline[1024] = {0};
-  int fd = open("/proc/self/cmdline", O_RDONLY);
-  if (fd >= 0) {
-    char ch;
-    int i = 0;
-    while (read(fd, &ch, sizeof(ch)) > 0 && !isspace(ch)) {
-      if (':' == ch) {
-        break;
-      }
-      cmdline[i] = ch;
-      i++;
-    }
-    close(fd);
-  }
-  executor_path = std::string(cmdline);
-  if (executor_path.empty()) {
-    executor_path = "./";
-  }
-  // android
-  if (executor_path.substr(0, 11) == "/data/data/") {
-    executor_path = executor_path + '/';
-  } else {
-    // Linux
-    executor_path = executor_path.substr(0, executor_path.rfind('/')) + "/";
-  }
-  return executor_path;
-}
-
 bool NPUManager::IsKirinChip() {
   std::ifstream cpu_info("/proc/cpuinfo");
   if (!(cpu_info.good() && cpu_info.is_open())) {
@@ -96,86 +104,6 @@ bool NPUManager::IsKirinChip() {
   return false;
 }

-bool WriteToOMFile(domi::ModelBufferData om_model_buff, const std::string &om_file_path) {
-  FILE *fp;
-  fp = fopen(om_file_path.c_str(), "wb");
-  if (fp == nullptr) {
-    MS_LOG(ERROR) << om_file_path.c_str() << " open failed.";
-    return false;
-  }
-
-  auto write_size = (uint32_t)fwrite(om_model_buff.data, 1, om_model_buff.length, fp);
-  if (write_size != om_model_buff.length) {
-    fclose(fp);
-    MS_LOG(ERROR) << "Write om file failed.";
-    return false;
-  }
-  fclose(fp);
-  return true;
-}
-
-bool NPUManager::CheckOmBuildIr(const std::string &path) {
-  // build test om model
-  std::shared_ptr<hiai::op::Add> add_op(new (std::nothrow) hiai::op::Add("add"));
-  if (add_op == nullptr) {
-    MS_LOG(ERROR) << "new add_op failed.";
-    return false;
-  }
-  ge::TensorDesc desc(ge::Shape({1}), ge::FORMAT_NCHW, ge::DT_FLOAT);
-  std::shared_ptr<hiai::op::Data> data = std::make_shared<hiai::op::Data>("data");
-  data->update_input_desc_x(desc);
-  add_op->set_input_x1(*data);
-  add_op->set_input_x2(*data);
-  domi::HiaiIrBuild ir_build;
-  ge::Graph ir_graph("graph");
-  std::vector<ge::Operator> inputs{*data, *data};
-  std::vector<ge::Operator> outputs{*add_op};
-  ir_graph.SetInputs(inputs).SetOutputs(outputs);
-  ge::Model om_model("test_model", "test_version");
-  om_model.SetGraph(ir_graph);
-
-  domi::ModelBufferData om_model_buff;
-  if (!ir_build.CreateModelBuff(om_model, om_model_buff)) {
-    MS_LOG(ERROR) << "Create model buffer failed.";
-    return false;
-  }
-  if (!ir_build.BuildIRModel(om_model, om_model_buff)) {
-    MS_LOG(ERROR) << "Build IR model failed.";
-    return false;
-  }
-
-  // save test om model
-  remove(path.c_str());
-  bool ret = WriteToOMFile(om_model_buff, path);
-  ir_build.ReleaseModelBuff(om_model_buff);
-  return ret;
-}
-
-void NPUManager::CheckSupportNPU() {
-  is_npu_check_executor = true;
-  std::string path_string = GetExecutorPath();
-
-  std::string test_model_path = path_string + "/mindspore_lite_test_npu.om";
-  std::ifstream ifs(test_model_path);
-  if (ifs.good() && ifs.is_open()) {
-    ifs.close();
-    is_support_npu = true;
-    return;
-  }
-  if (!IsKirinChip()) {
-    MS_LOG(ERROR) << "The current device chip NOT SUPPORT NPU";
-    is_support_npu = false;
-    return;
-  }
-
-  if (!CheckOmBuildIr(test_model_path)) {
-    MS_LOG(ERROR) << "Build OM IR error.";
-    is_support_npu = false;
-    return;
-  }
-  is_support_npu = true;
-}
-
 int NPUManager::AddModel(void *model_buf, uint32_t size, const std::string &model_name, int frequency) {
   hiai::MemBuffer *buffer = mc_builder_->InputMemBufferCreate(model_buf, size);
   if (buffer == nullptr) {
@@ -188,33 +116,42 @@ int NPUManager::AddModel(void *model_buf, uint32_t size, const std::string &mode
   model_desc_.push_back(desc);
   mc_builder_->MemBufferDestroy(buffer);

+  model_map_.insert({model_name, index_});
   index_++;
   return RET_OK;
 }

-int NPUManager::InitClient() {
-  this->client_ = std::make_shared<hiai::AiModelMngerClient>();
-  if (this->client_ == nullptr) {
-    return RET_ERROR;
-  }
-  int ret = this->client_->Init(nullptr);
-  if (ret != hiai::AI_SUCCESS) {
-    return RET_ERROR;
-  }
-  mc_builder_ = std::make_shared<hiai::AiModelBuilder>(this->client_);
-  return RET_OK;
-}
-
 int NPUManager::LoadOMModel() {
-  int ret = this->client_->Load(model_desc_);
-  if (ret != hiai::AI_SUCCESS) {
-    MS_LOG(ERROR) << "Client load model failed." << ret;
-    return RET_ERROR;
+  for (int i = 0; i < index_ / MAX_MODEL_NUM + 1; i++) {
+    auto client = std::make_shared<hiai::AiModelMngerClient>();
+    if (client == nullptr) {
+      MS_LOG(ERROR) << "NPU client is nullptr.";
+      return RET_ERROR;
+    }
+    int ret = client->Init(nullptr);
+    if (ret != hiai::AI_SUCCESS) {
+      MS_LOG(ERROR) << "NPU client init failed. code is " << ret;
+      return RET_ERROR;
+    }
+    mc_builder_ = std::make_shared<hiai::AiModelBuilder>(client);
+
+    vector<std::shared_ptr<hiai::AiModelDescription>> desc(model_desc_.begin() + i * MAX_MODEL_NUM,
+                                                           ((i + 1) * MAX_MODEL_NUM > index_)
+                                                             ? model_desc_.begin() + index_
+                                                             : model_desc_.begin() + (i + 1) * MAX_MODEL_NUM);
+    ret = client->Load(desc);
+    if (ret != hiai::AI_SUCCESS) {
+      MS_LOG(ERROR) << "Client load model failed." << ret;
+      return RET_ERROR;
+    }
+    clients_.push_back(client);
   }
   return RET_OK;
 }

-std::shared_ptr<hiai::AiModelMngerClient> NPUManager::GetClient() { return client_; }
+std::shared_ptr<hiai::AiModelMngerClient> NPUManager::GetClient(const std::string &model_name) {
+  return clients_[model_map_[model_name] / MAX_MODEL_NUM];
+}

-int NPUManager::index() { return index_; }
+int NPUManager::index() const { return index_; }
 }  // namespace mindspore::lite
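LoadOMModel shards the accumulated model descriptors across HiAI clients, at most MAX_MODEL_NUM (20) per client, and GetClient maps a model back to its client by integer division of the index recorded in AddModel. The slicing arithmetic as a standalone sketch (the model count here is illustrative):

#include <algorithm>
#include <iostream>

int main() {
  const int kMaxModelNum = 20;  // mirrors MAX_MODEL_NUM above
  int index = 45;               // hypothetical number of registered models
  for (int i = 0; i < index / kMaxModelNum + 1; ++i) {
    int begin = i * kMaxModelNum;
    int end = std::min((i + 1) * kMaxModelNum, index);
    // Model j in [begin, end) is later found via clients_[j / kMaxModelNum].
    std::cout << "client " << i << " loads models [" << begin << ", " << end << ")\n";
  }
  // Prints: client 0 -> [0, 20), client 1 -> [20, 40), client 2 -> [40, 45)
  return 0;
}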
src/runtime/agent/npu/npu_manager.h
@@ -14,15 +14,21 @@
  * limitations under the License.
  */

-#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_UTILS_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_UTILS_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_MANAGER_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_MANAGER_H_
 #include <string>
 #include <memory>
 #include <vector>
+#include <unordered_map>
+#include <set>
 #include "schema/model_generated.h"
 #include "include/HiAiModelManagerService.h"

 namespace mindspore::lite {

+static std::set<mindspore::schema::PrimitiveType> npu_trans_nodes = {
+  schema::PrimitiveType_Conv2D,          schema::PrimitiveType_DeConv2D,
+  schema::PrimitiveType_DepthwiseConv2D, schema::PrimitiveType_DeDepthwiseConv2D,
+  schema::PrimitiveType_Resize,          schema::PrimitiveType_Pooling};
 class NPUManager {
  public:
   static NPUManager *GetInstance() {
@@ -32,8 +38,6 @@ class NPUManager {

   bool IsSupportNPU();

-  int InitClient();
-
   // provide to subgraph to add model.
   int AddModel(void *model_buf, uint32_t size, const std::string &model_name, int frequency);
@@ -41,18 +45,18 @@ class NPUManager {
   int LoadOMModel();

   // provide to executor.
-  std::shared_ptr<hiai::AiModelMngerClient> GetClient();
+  std::shared_ptr<hiai::AiModelMngerClient> GetClient(const std::string &model_name);

-  int index();
+  int index() const;

  private:
-  void CheckSupportNPU();
-
   bool IsKirinChip();

-  bool CheckOmBuildIr(const std::string &path);
+  bool CheckEMUIVersion();

-  std::string GetExecutorPath();
+  bool CheckDDKVersion();
+
+  int CompareVersion(const std::string &version1, const std::string &version2);

  private:
   int index_ = 0;
@@ -61,12 +65,14 @@ class NPUManager {

   bool is_support_npu = false;

-  std::shared_ptr<hiai::AiModelMngerClient> client_ = nullptr;
+  std::vector<std::shared_ptr<hiai::AiModelMngerClient>> clients_;

   std::vector<std::shared_ptr<hiai::AiModelDescription>> model_desc_;

   std::shared_ptr<hiai::AiModelBuilder> mc_builder_ = nullptr;

+  std::unordered_map<std::string, int> model_map_;
 };

 }  // namespace mindspore::lite

-#endif  // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_UTILS_H_
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_MANAGER_H_
src/runtime/agent/npu/npu_pass_utils.cc (new file)
@@ -0,0 +1,102 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/kernel_registry.h"
#include "src/ops/nhwc2nchw.h"
#include "src/ops/nchw2nhwc.h"
#include "src/runtime/agent/npu/npu_pass_utils.h"
namespace mindspore::lite {
using kernel::KERNEL_ARCH::kCPU;
using kernel::KERNEL_ARCH::kNPU;
PrimitiveC *NPUPassUtils::CreateNchw2NhwcPrimitive() {
  flatbuffers::FlatBufferBuilder fbb(1024);
  auto val_offset = schema::CreateNchw2Nhwc(fbb);
  auto prim_offset = schema::CreatePrimitive(fbb, schema::PrimitiveType_Nchw2Nhwc, val_offset.o);
  fbb.Finish(prim_offset);
  auto buf = fbb.GetBufferPointer();
  if (buf == nullptr) {
    MS_LOG(ERROR) << "GetBufferPointer return nullptr";
    fbb.Clear();
    return nullptr;
  }
  auto primitive_buf = reinterpret_cast<char *>(malloc(fbb.GetSize()));
  if (primitive_buf == nullptr) {
    MS_LOG(ERROR) << "Malloc primitive_buf_ failed.";
    fbb.Clear();
    return nullptr;
  }
  memcpy(primitive_buf, buf, fbb.GetSize());
  auto *primitive = PrimitiveC::NewPrimitiveC<Nchw2Nhwc>(flatbuffers::GetRoot<schema::Primitive>(primitive_buf));
  free(primitive_buf);
  fbb.Clear();
  return primitive;
}

PrimitiveC *NPUPassUtils::CreateNhwc2NchwPrimitive() {
  flatbuffers::FlatBufferBuilder fbb(1024);
  auto val_offset = schema::CreateNhwc2Nchw(fbb);
  auto prim_offset = schema::CreatePrimitive(fbb, schema::PrimitiveType_Nhwc2Nchw, val_offset.o);
  fbb.Finish(prim_offset);
  auto buf = fbb.GetBufferPointer();
  if (buf == nullptr) {
    MS_LOG(ERROR) << "GetBufferPointer return nullptr";
    fbb.Clear();
    return nullptr;
  }
  auto primitive_buf = reinterpret_cast<char *>(malloc(fbb.GetSize()));
  if (primitive_buf == nullptr) {
    MS_LOG(ERROR) << "Malloc primitive_buf_ failed.";
    fbb.Clear();
    return nullptr;
  }
  memcpy(primitive_buf, buf, fbb.GetSize());
  auto *primitive = PrimitiveC::NewPrimitiveC<Nhwc2Nchw>(flatbuffers::GetRoot<schema::Primitive>(primitive_buf));
  free(primitive_buf);
  fbb.Clear();
  return primitive;
}

kernel::LiteKernel *NPUPassUtils::CreateNchw2NhwcKernel(const std::vector<Tensor *> &in_tensors,
                                                        const std::vector<Tensor *> &out_tensors,
                                                        const InnerContext *ctx, const std::string &name) {
  kernel::KernelKey key{kCPU, kNumberTypeFloat32, schema::PrimitiveType_Nchw2Nhwc};
  auto nchw2nhwc_primitive = CreateNchw2NhwcPrimitive();
  auto *nchw2nhwc_kernel =
    KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, nchw2nhwc_primitive, ctx, key);
  nchw2nhwc_kernel->set_name(name);
  return nchw2nhwc_kernel;
}

kernel::LiteKernel *NPUPassUtils::CreateNhwc2NchwKernel(const std::vector<Tensor *> &in_tensors,
                                                        const std::vector<Tensor *> &out_tensors,
                                                        const InnerContext *ctx, const std::string &name) {
  kernel::KernelKey key{kCPU, kNumberTypeFloat32, schema::PrimitiveType_Nhwc2Nchw};
  auto nhwc2nchw_primitive = CreateNhwc2NchwPrimitive();
  auto *nhwc2nchw_kernel =
    KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, nhwc2nchw_primitive, ctx, key);
  nhwc2nchw_kernel->set_name(name);
  return nhwc2nchw_kernel;
}

void NPUPassUtils::UpdateKernel(kernel::LiteKernel *kernel, const std::vector<kernel::LiteKernel *> &in_kernels,
                                const std::vector<kernel::LiteKernel *> &out_kernels,
                                const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors) {
  kernel->set_in_tensors(in_tensors);
  kernel->set_out_tensors(out_tensors);
  kernel->set_in_kernels(in_kernels);
  kernel->set_out_kernels(out_kernels);
}
}  // namespace mindspore::lite
src/runtime/agent/npu/npu_pass_utils.h (new file)
@@ -0,0 +1,44 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_PASS_UTILS_H_
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_PASS_UTILS_H_
#include <vector>
#include <string>
#include "src/ops/primitive_c.h"
#include "src/lite_kernel.h"
namespace mindspore::lite {
class NPUPassUtils {
 public:
  static kernel::LiteKernel *CreateNchw2NhwcKernel(const std::vector<Tensor *> &in_tensors,
                                                   const std::vector<Tensor *> &out_tensors, const InnerContext *ctx,
                                                   const std::string &name);

  static kernel::LiteKernel *CreateNhwc2NchwKernel(const std::vector<Tensor *> &in_tensors,
                                                   const std::vector<Tensor *> &out_tensors, const InnerContext *ctx,
                                                   const std::string &name);

  static void UpdateKernel(kernel::LiteKernel *kernel, const std::vector<kernel::LiteKernel *> &in_kernels,
                           const std::vector<kernel::LiteKernel *> &out_kernels,
                           const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors);

 private:
  static PrimitiveC *CreateNchw2NhwcPrimitive();

  static PrimitiveC *CreateNhwc2NchwPrimitive();
};
}  // namespace mindspore::lite
#endif  // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_PASS_UTILS_H_
src/runtime/agent/npu/npu_transform_pass.cc (new file)
@@ -0,0 +1,201 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "src/runtime/agent/npu/npu_transform_pass.h"
#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/agent/npu/npu_manager.h"
#include "src/runtime/agent/npu/npu_pass_utils.h"
namespace mindspore::lite {
using kernel::KERNEL_ARCH::kCPU;
using kernel::KERNEL_ARCH::kNPU;
int NPUTransformPass::UpdateNH2NCTransNodePreKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel,
                                                    kernel::LiteKernel *after_kernel) {
  std::vector<kernel::LiteKernel *> out_kernels;

  for (auto out_kernel : kernel->out_kernels()) {
    if (out_kernel == after_kernel) {
      out_kernels.push_back(trans_kernel);
    } else {
      out_kernels.push_back(out_kernel);
    }
  }
  NPUPassUtils::UpdateKernel(kernel, kernel->in_kernels(), out_kernels, kernel->in_tensors(), kernel->out_tensors());
  return RET_OK;
}

int NPUTransformPass::UpdateNH2NCTransNodeAfterKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel,
                                                      kernel::LiteKernel *before_kernel) {
  std::vector<lite::Tensor *> cur_kernel_in_tensors = {trans_kernel->out_tensors()[0]};
  for (int i = 1; i < kernel->in_tensors().size(); i++) {
    cur_kernel_in_tensors.push_back(kernel->in_tensors()[i]);
  }
  std::vector<kernel::LiteKernel *> cur_in_kernels = {trans_kernel};
  for (int i = 0; i < kernel->in_kernels().size(); i++) {
    auto in_kernel = kernel->in_kernels()[i];
    if (in_kernel != kernel) {
      cur_in_kernels.push_back(in_kernel);
    }
  }
  NPUPassUtils::UpdateKernel(kernel, cur_in_kernels, kernel->out_kernels(), cur_kernel_in_tensors,
                             kernel->out_tensors());
  return RET_OK;
}

int NPUTransformPass::InsertPreNode(const InnerContext *context, std::vector<kernel::LiteKernel *>::iterator it,
                                    std::vector<kernel::LiteKernel *> *all_kernels,
                                    std::vector<Tensor *> *all_tensors) {
  auto kernel = *it;
  bool is_input_kernel = kernel->in_kernels().empty();
  if (is_input_kernel || kernel->in_kernels()[0]->desc().arch != kNPU ||
      npu_trans_nodes.find(kernel->in_kernels()[0]->Type()) == npu_trans_nodes.end()) {
    kernel::LiteKernel *before_kernel = nullptr;
    if (!is_input_kernel) {
      before_kernel = kernel->in_kernels()[0];
    }
    // Create pre transform kernel out tensors.
    std::vector<int> shapes{kernel->in_tensors()[0]->shape()[0], kernel->in_tensors()[0]->shape()[3],
                            kernel->in_tensors()[0]->shape()[1], kernel->in_tensors()[0]->shape()[2]};
    auto tensor = new Tensor(kernel->in_tensors()[0]->data_type(), shapes, schema::Format_NCHW, Tensor::VAR);
    std::vector<Tensor *> pre_trans_out_tensors = {tensor};
    all_tensors->push_back(pre_trans_out_tensors[0]);
    // Replace the output tensor of the previous node
    auto name = kernel->name() + "_pre_trans" + "_Nhwc2Nchw_" + std::to_string(total++);
    auto *pre_trans_kernel =
      NPUPassUtils::CreateNhwc2NchwKernel({kernel->in_tensors()[0]}, pre_trans_out_tensors, context, name);
    // Insert Nhwc2Nchw into the front of the current queue
    all_kernels->push_back(pre_trans_kernel);
    // Replace the output kernel of the previous node
    std::vector<kernel::LiteKernel *> pre_trans_in_kernel;
    if (is_input_kernel) {
      pre_trans_in_kernel = {};
    } else {
      pre_trans_in_kernel = {before_kernel};
    }
    NPUPassUtils::UpdateKernel(pre_trans_kernel, pre_trans_in_kernel, {kernel}, {kernel->in_tensors()[0]},
                               pre_trans_out_tensors);

    if (before_kernel != nullptr) {
      UpdateNH2NCTransNodePreKernel(before_kernel, pre_trans_kernel, kernel);
    }
    UpdateNH2NCTransNodeAfterKernel(kernel, pre_trans_kernel, before_kernel);
  }
  return RET_OK;
}

int NPUTransformPass::InsertPostNode(const InnerContext *context, std::vector<kernel::LiteKernel *>::iterator it,
                                     std::vector<kernel::LiteKernel *> *all_kernels,
                                     std::vector<Tensor *> *all_tensors) {
  auto kernel = *it;
  // Single output multiple references
  for (int i = 0; i < kernel->out_kernels().size(); i++) {
    auto next_kernel = kernel->out_kernels().at(i);
    if (next_kernel->desc().arch == kNPU && npu_trans_nodes.find(next_kernel->Type()) != npu_trans_nodes.end()) {
      continue;
    }
    // Change format the output of the current kernel nhwc->nchw
    auto shapes = {kernel->out_tensors()[0]->shape()[0], kernel->out_tensors()[0]->shape()[1],
                   kernel->out_tensors()[0]->shape()[2], kernel->out_tensors()[0]->shape()[3]};
    auto tensor = new Tensor(kernel->out_tensors()[0]->data_type(), shapes, schema::Format_NHWC, Tensor::VAR);
    std::vector<Tensor *> post_trans_out_tensors = {tensor};
    all_tensors->push_back(post_trans_out_tensors[0]);
    // Use the output tensor of the current node as the input tensor of the post-conversion operator
    auto name = kernel->name() + "_post_trans" + "_Nchw2Nhwc" + std::to_string(total++);
    auto *post_trans_kernel =
      NPUPassUtils::CreateNchw2NhwcKernel(kernel->out_tensors(), post_trans_out_tensors, context, name);
    // Replace the input tensor of the next node
    NPUPassUtils::UpdateKernel(post_trans_kernel, {kernel}, {next_kernel}, kernel->out_tensors(),
                               post_trans_out_tensors);
    // Directly insert in the back, will not affect the topological sort
    all_kernels->push_back(post_trans_kernel);
    UpdateNC2NHTransNodePreKernel(kernel, post_trans_kernel, next_kernel);
    UpdateNC2NHTransNodeAfterKernel(kernel, post_trans_kernel, next_kernel);
  }
  return RET_OK;
}

int NPUTransformPass::UpdateNC2NHTransNodePreKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel,
                                                    kernel::LiteKernel *next_kernel) {
  std::vector<kernel::LiteKernel *> cur_out_kernels;
  for (auto out_kernel : kernel->out_kernels()) {
    if (out_kernel == next_kernel) {
      cur_out_kernels.push_back(trans_kernel);
    } else {
      cur_out_kernels.push_back(out_kernel);
    }
  }
  auto kernel_out_tensor = kernel->out_tensors()[0];
  // Change format the output of the current kernel nhwc->nchw
  std::vector<int> kernel_out_new_shapes = {kernel_out_tensor->shape()[0], kernel_out_tensor->shape()[3],
                                            kernel_out_tensor->shape()[1], kernel_out_tensor->shape()[2]};
  kernel_out_tensor->set_format(schema::Format_NCHW);
  kernel_out_tensor->set_shape(kernel_out_new_shapes);
  NPUPassUtils::UpdateKernel(kernel, kernel->in_kernels(), cur_out_kernels, kernel->in_tensors(), {kernel_out_tensor});
  return RET_OK;
}

int NPUTransformPass::UpdateNC2NHTransNodeAfterKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel,
                                                      kernel::LiteKernel *next_kernel) {
  std::vector<Tensor *> next_in_tensors;
  for (auto next_in_tensor : next_kernel->in_tensors()) {
    if (next_in_tensor != kernel->out_tensors()[0]) {
      next_in_tensors.push_back(next_in_tensor);
    } else {
      next_in_tensors.push_back(trans_kernel->out_tensors()[0]);
    }
  }
  next_kernel->set_in_tensors(next_in_tensors);
  std::vector<kernel::LiteKernel *> next_in_kernels;
  for (auto in_kernel : next_kernel->in_kernels()) {
    if (in_kernel == kernel) {
      next_in_kernels.push_back(trans_kernel);
    } else {
      next_in_kernels.push_back(in_kernel);
    }
  }
  NPUPassUtils::UpdateKernel(next_kernel, next_in_kernels, next_kernel->out_kernels(), next_in_tensors,
                             next_kernel->out_tensors());

  return RET_OK;
}

int NPUTransformPass::FormatTransformPass(const InnerContext *context, std::vector<kernel::LiteKernel *> *all_kernels,
                                          std::vector<Tensor *> *all_tensors) {
  if (context->IsNpuEnabled()) {
    std::vector<kernel::LiteKernel *> new_kernels;

    for (auto it = all_kernels->begin(); it != all_kernels->end(); it++) {
      auto kernel = *it;
      if (kernel->desc().arch != kNPU) {
        new_kernels.push_back(kernel);
        continue;
      }
      if (npu_trans_nodes.find(kernel->Type()) != npu_trans_nodes.end()) {
        InsertPreNode(context, it, &new_kernels, all_tensors);
        new_kernels.push_back(kernel);
        InsertPostNode(context, it, &new_kernels, all_tensors);
      } else {
        new_kernels.push_back(kernel);
      }
    }
    all_kernels->clear();
    for (int i = 0; i < new_kernels.size(); i++) {
      all_kernels->push_back(new_kernels[i]);
    }
  }
  return RET_OK;
}
}  // namespace mindspore::lite
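InsertPreNode derives the NCHW tensor shape for the inserted Nhwc2Nchw kernel by permuting the NHWC shape, while InsertPostNode keeps NHWC order for the Nchw2Nhwc output. The permutation as a standalone check:

#include <cassert>
#include <vector>

// {n, h, w, c} -> {n, c, h, w}, exactly as the `shapes` vector above is built.
std::vector<int> Nhwc2NchwShape(const std::vector<int> &s) {
  return {s[0], s[3], s[1], s[2]};
}

int main() {
  // A 1x224x224x3 NHWC input becomes 1x3x224x224 in NCHW.
  assert((Nhwc2NchwShape({1, 224, 224, 3}) == std::vector<int>{1, 3, 224, 224}));
  return 0;
}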
src/runtime/agent/npu/npu_transform_pass.h (new file)
@@ -0,0 +1,51 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_TRANSFORM_PASS_H_
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_TRANSFORM_PASS_H_
#include <vector>
#include "src/lite_kernel.h"
#include "src/ops/primitive_c.h"
namespace mindspore::lite {
class NPUTransformPass {
 public:
  int FormatTransformPass(const InnerContext *context, std::vector<kernel::LiteKernel *> *all_kernels,
                          std::vector<Tensor *> *all_tensors);

 private:
  int UpdateNH2NCTransNodePreKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel,
                                    kernel::LiteKernel *after_kernel);

  int UpdateNH2NCTransNodeAfterKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel,
                                      kernel::LiteKernel *before_kernel);

  int UpdateNC2NHTransNodePreKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel,
                                    kernel::LiteKernel *after_kernel);

  int UpdateNC2NHTransNodeAfterKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel,
                                      kernel::LiteKernel *next_kernel);

  int InsertPreNode(const InnerContext *context, std::vector<kernel::LiteKernel *>::iterator it,
                    std::vector<kernel::LiteKernel *> *all_kernels, std::vector<Tensor *> *all_tensors);

  int InsertPostNode(const InnerContext *context, std::vector<kernel::LiteKernel *>::iterator it,
                     std::vector<kernel::LiteKernel *> *all_kernels, std::vector<Tensor *> *all_tensors);

 private:
  int total = 0;
};
}  // namespace mindspore::lite
#endif  // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_TRANSFORM_PASS_H_
src/runtime/agent/npu/subgraph_npu_kernel.cc
@@ -24,7 +24,6 @@
 #include "include/graph/model.h"
 #include "include/hiai_ir_build.h"
 #include "include/HiAiModelManagerType.h"
 #include "include/context.h"
 #include "include/version.h"
 #include "src/common/utils.h"
 #include "src/runtime/agent/npu/npu_converter_utils.h"
@@ -34,10 +33,6 @@ namespace mindspore::kernel {
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;

-std::set<schema::PrimitiveType> trans_nodes = {schema::PrimitiveType_Conv2D, schema::PrimitiveType_DeConv2D,
-                                               schema::PrimitiveType_DepthwiseConv2D,
-                                               schema::PrimitiveType_DeDepthwiseConv2D, schema::PrimitiveType_Resize};
-
 domi::ModelBufferData *SubGraphNpuKernel::BuildIRModel() {
   ge::Graph graph("NPUGraph");
@@ -75,8 +70,7 @@ domi::ModelBufferData *SubGraphNpuKernel::BuildIRModel() {
 }

 int SubGraphNpuKernel::Run() {
-  return reinterpret_cast<lite::NPUExecutor *>(this->executor_)
-    ->Run(in_tensors_, out_tensors_, nodes_, inputs_nhwc2nchw_, outputs_nchw2nhwc_);
+  return reinterpret_cast<lite::NPUExecutor *>(this->executor_)->Run(in_tensors_, out_tensors_, nodes_);
 }

 int SubGraphNpuKernel::BuildNPUInputOp() {
@@ -88,21 +82,7 @@ int SubGraphNpuKernel::BuildNPUInputOp() {
     if (IsSubGraphInputTensor(in_tensor)) {
       auto tensor_name = node->name() + "_" + std::to_string(count++);
       hiai::op::Data *data;
-      if (trans_nodes.find(node->Type()) != trans_nodes.end()) {
-        auto shape = in_tensor->shape();
-        data = new (std::nothrow) hiai::op::Data(tensor_name);
-        if (data == nullptr) {
-          MS_LOG(ERROR) << "New data failed.";
-          return RET_ERROR;
-        }
-        ge::TensorDesc tensor_desc(lite::ConverterToNPUShape({shape[0], shape[3], shape[1], shape[2]}),
-                                   ge::FORMAT_NCHW, lite::ConverterToNPUDataType(in_tensor->data_type()));
-        data->update_input_desc_x(tensor_desc);
-        inputs_nhwc2nchw_.push_back(true);
-      } else {
-        data = mindspore::lite::ConverterToNPUData(in_tensor, tensor_name);
-        inputs_nhwc2nchw_.push_back(false);
-      }
+      data = mindspore::lite::ConverterToNPUData(in_tensor, tensor_name);
       subgraph_input_op_.push_back(*data);
       node_input_op.push_back(data);
       continue;
@@ -132,7 +112,7 @@ int SubGraphNpuKernel::BuildNPUInputOp() {

     // weight tensor
     if (is_weight_tensor) {
-      if (trans_nodes.find(node->Type()) == trans_nodes.end()) {
+      if (lite::npu_trans_nodes.find(node->Type()) == lite::npu_trans_nodes.end()) {
         auto name = node->name() + "_" + std::to_string(count++);
         auto weight_const = new (std::nothrow) hiai::op::Const(node->name() + "_" + std::to_string(count++));
         if (weight_const == nullptr) {
@@ -162,11 +142,6 @@ std::vector<ge::Operator> SubGraphNpuKernel::GetNPUNodes(const vector<kernel::Li
   ops.reserve(nodes.size());
   for (int i = 0; i < nodes.size(); i++) {
     ops.push_back(*reinterpret_cast<NPUKernel *>(nodes[i])->GetNPUOp());
-    if (trans_nodes.find(schema::PrimitiveType(nodes[i]->GetPrimitive()->Type())) != trans_nodes.end()) {
-      outputs_nchw2nhwc_.push_back(true);
-    } else {
-      outputs_nchw2nhwc_.push_back(false);
-    }
   }
   return ops;
 }
src/runtime/agent/npu/subgraph_npu_kernel.h
@@ -69,10 +69,6 @@ class SubGraphNpuKernel : public SubGraphKernel {
   std::string GetOMModelName();

  private:
-  std::vector<bool> inputs_nhwc2nchw_;
-
-  std::vector<bool> outputs_nchw2nhwc_;
-
   domi::ModelBufferData *model_buffer_data_;

   std::vector<ge::Operator> subgraph_input_op_;
src/runtime/kernel/npu/convolution_base_npu.cc
@@ -16,6 +16,7 @@

 #include "src/runtime/kernel/npu/convolution_base_npu.h"
 #include "src/runtime/agent/npu/npu_converter_utils.h"
+#include "nnacl/pack.h"

 namespace mindspore::kernel {
 ConvolutionBaseNPUKernel::~ConvolutionBaseNPUKernel() {
@@ -39,14 +40,27 @@ int ConvolutionBaseNPUKernel::InitWeightBiasConst(const std::vector<lite::Tensor
     MS_LOG(ERROR) << "New weight const failed.";
     return RET_ERROR;
   }
-  auto weight_shape = inputs[1]->shape();
-  inputs[1]->set_shape({weight_shape[0], weight_shape[3], weight_shape[1], weight_shape[2]});
-  inputs[1]->set_format(schema::Format_NCHW);
-  auto weight_tensor = mindspore::lite::ConverterToNPUTensor(inputs[1]);
-  weight_->set_attr_value(weight_tensor);
-  inputs[1]->set_shape(weight_shape);
-  inputs[1]->set_format(schema::Format_NHWC);
+  auto w_shape = inputs[1]->shape();
+  auto nhwc_data = inputs[1]->data_c();
+  auto nchw_data = reinterpret_cast<float *>(malloc(inputs[1]->ElementsNum() * sizeof(float)));
+  if (nchw_data == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  PackNHWCToNCHWFp32(nhwc_data, nchw_data, w_shape[0], w_shape[1] * w_shape[2], w_shape[3]);
+
+  std::shared_ptr<ge::Tensor> weight_tensor = std::shared_ptr<ge::Tensor>(new (std::nothrow) ge::Tensor());
+  if (weight_tensor == nullptr) {
+    MS_LOG(ERROR) << "new weight_tensor failed.";
+    return RET_ERROR;
+  }
+  ge::TensorDesc tensor_desc(lite::ConverterToNPUShape({w_shape[0], w_shape[3], w_shape[1], w_shape[2]}),
+                             ge::FORMAT_NCHW, lite::ConverterToNPUDataType(inputs[1]->data_type()));
+  weight_tensor->SetTensorDesc(tensor_desc);
+  weight_tensor->SetData(reinterpret_cast<const uint8_t *>(nchw_data), inputs[1]->Size());
+
+  weight_->set_attr_value(weight_tensor);
+  free(nchw_data);

   if (inputs.size() >= 3) {
     bias_ = new (std::nothrow) hiai::op::Const(name_ + "_b");
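The weight is now repacked on the host: the NHWC-laid-out weight {O, H, W, I} goes through PackNHWCToNCHWFp32 with batch = O, plane = H * W and channel = I, producing {O, I, H, W} for the NCHW const. A tiny standalone check of that index mapping, using the same loop nest as the earlier pack sketch:

#include <cassert>
#include <vector>

int main() {
  const int o = 1, h = 2, w = 2, c = 3;  // tiny OHWI weight
  std::vector<float> src(o * h * w * c), dst(src.size());
  for (size_t i = 0; i < src.size(); ++i) src[i] = static_cast<float>(i);
  // batch = o, plane = h * w, channel = c
  for (int b = 0; b < o; ++b)
    for (int p = 0; p < h * w; ++p)
      for (int k = 0; k < c; ++k)
        dst[(b * c + k) * h * w + p] = src[(b * h * w + p) * c + k];
  // OHWI element (0,1,0,2), value 8, lands at OIHW offset (0*3+2)*4 + 2 = 10.
  assert(dst[10] == 8.0f);
  return 0;
}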
src/runtime/kernel/npu/convolution_base_npu.h
@@ -17,17 +17,18 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_CONVOLUTION_BASE_NPU_H_

 #include <vector>
+#include <memory>
 #include "include/graph/op/all_ops.h"
 #include "src/runtime/kernel/npu/transpose_base_npu.h"
 #include "nnacl/conv_parameter.h"

 namespace mindspore::kernel {
-class ConvolutionBaseNPUKernel : public TransposeBaseNPUKernel {
+class ConvolutionBaseNPUKernel : public NPUKernel {
  public:
   ConvolutionBaseNPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                            const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                            const mindspore::lite::PrimitiveC *primitive)
-      : TransposeBaseNPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+      : NPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~ConvolutionBaseNPUKernel() override;

  protected:
src/runtime/kernel/npu/convolution_depthwise_npu.cc
@@ -25,7 +25,7 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 namespace mindspore::kernel {
 int ConvolutionDepthwiseNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs,
                                              const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter) {
-  return RET_ERROR;
+  return RET_OK;
 }

 int ConvolutionDepthwiseNPUKernel::SetConvDwParam() {
@@ -49,19 +49,13 @@ int ConvolutionDepthwiseNPUKernel::SetConvDwParam() {
 int ConvolutionDepthwiseNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs,
                                                 const std::vector<lite::Tensor *> &outputs,
                                                 const std::vector<ge::Operator *> &npu_inputs) {
-  auto ret = SetPreTranspose(npu_inputs[0]);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "New pre transpose npu operator (NHWC -> NCHW) for op " << name_ << " failed.";
-    return RET_ERROR;
-  }
-
   // set conv attr param
   conv_dw_ = new (std::nothrow) hiai::op::ConvolutionDepthwise(name_ + "_conv_depthwise");
   if (conv_dw_ == nullptr) {
     MS_LOG(ERROR) << "New convolution depthwise operator for op " << name_ << " failed.";
     return RET_ERROR;
   }
-  ret = SetConvDwParam();
+  auto ret = SetConvDwParam();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Set npu op parameter for convolution depthwise op " << name_ << " failed.";
     return RET_ERROR;
@@ -76,7 +70,7 @@ int ConvolutionDepthwiseNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *
   if (inputs.size() == 3) {
     conv_dw_->set_input_bias(*bias_);
   }
-  conv_dw_->set_input_x(*pre_trans_);
+  conv_dw_->set_input_x(*npu_inputs[0]);

   if (conv_param_->act_type_ != ActType_No) {
     ret = SetActivation(conv_dw_, conv_param_->act_type_);
@@ -85,20 +79,16 @@ int ConvolutionDepthwiseNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *
       return RET_ERROR;
     }
   }
-
-  if (conv_param_->act_type_ == ActType_No) {
-    ret = SetPostTranspose(conv_dw_);
-  } else {
-    ret = SetPostTranspose(act_);
-  }
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "New post transpose npu operator (NCHW -> NHWC) for op " << name_ << " failed.";
-    return RET_ERROR;
-  }
   return RET_OK;
 }

-ge::Operator *mindspore::kernel::ConvolutionDepthwiseNPUKernel::GetNPUOp() { return post_trans_; }
+ge::Operator *mindspore::kernel::ConvolutionDepthwiseNPUKernel::GetNPUOp() {
+  if (conv_param_->act_type_ == ActType_No) {
+    return conv_dw_;
+  } else {
+    return act_;
+  }
+}

 ConvolutionDepthwiseNPUKernel::~ConvolutionDepthwiseNPUKernel() {
   if (conv_dw_ != nullptr) {
|
@ -24,7 +24,7 @@ using mindspore::schema::PrimitiveType_Conv2D;
|
|||
namespace mindspore::kernel {
|
||||
int ConvolutionNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter) {
|
||||
return RET_ERROR;
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ConvolutionNPUKernel::SetConvParam() {
|
||||
|
@@ -49,19 +49,13 @@ int ConvolutionNPUKernel::SetConvParam() {
 int ConvolutionNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs,
                                        const std::vector<lite::Tensor *> &outputs,
                                        const std::vector<ge::Operator *> &npu_inputs) {
-  auto ret = SetPreTranspose(npu_inputs[0]);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "New pre transpose npu operator (NHWC -> NCHW) for op " << name_ << " failed.";
-    return RET_ERROR;
-  }
-
   // set conv attr param
   conv_ = new (std::nothrow) hiai::op::Convolution(name_ + "_conv");
   if (conv_ == nullptr) {
     MS_LOG(ERROR) << "New convolution operator for convolution op " << name_ << " failed.";
     return RET_ERROR;
   }
-  ret = SetConvParam();
+  auto ret = SetConvParam();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Set npu op parameter for convolution op " << name_ << " failed.";
     return RET_ERROR;
@@ -76,7 +70,7 @@ int ConvolutionNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs
   if (inputs.size() == 3) {
     conv_->set_input_bias(*bias_);
   }
-  conv_->set_input_x(*pre_trans_);
+  conv_->set_input_x(*npu_inputs[0]);

   if (conv_param_->act_type_ != ActType_No) {
     ret = SetActivation(conv_, conv_param_->act_type_);
@@ -85,20 +79,16 @@ int ConvolutionNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs
       return RET_ERROR;
     }
   }
-
-  if (conv_param_->act_type_ == ActType_No) {
-    ret = SetPostTranspose(conv_);
-  } else {
-    ret = SetPostTranspose(act_);
-  }
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "New post transpose npu operator (NCHW -> NHWC) for op " << name_ << " failed.";
-    return RET_ERROR;
-  }
   return RET_OK;
 }

-ge::Operator *mindspore::kernel::ConvolutionNPUKernel::GetNPUOp() { return post_trans_; }
+ge::Operator *mindspore::kernel::ConvolutionNPUKernel::GetNPUOp() {
+  if (conv_param_->act_type_ == ActType_No) {
+    return conv_;
+  } else {
+    return act_;
+  }
+}

 ConvolutionNPUKernel::~ConvolutionNPUKernel() {
   if (conv_ != nullptr) {
src/runtime/kernel/npu/pooling_npu.cc
@@ -62,23 +62,17 @@ int PoolingNPUKernel::SetPoolingParam() {
 int PoolingNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs,
                                    const std::vector<lite::Tensor *> &outputs,
                                    const std::vector<ge::Operator *> &npu_inputs) {
-  auto ret = SetPreTranspose(npu_inputs[0]);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "New pre transpose npu operator (NHWC -> NCHW) for op " << name_ << " failed.";
-    return RET_ERROR;
-  }
-
   pooling_ = new (std::nothrow) hiai::op::PoolingD(name_ + "_pooling");
   if (pooling_ == nullptr) {
     MS_LOG(ERROR) << "New pooling npu operator for op " << name_ << " failed.";
     return RET_ERROR;
   }
-  ret = SetPoolingParam();
+  auto ret = SetPoolingParam();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Set npu op parameter for convolution op " << name_ << " failed.";
     return RET_ERROR;
   }
-  pooling_->set_input_x(*pre_trans_);
+  pooling_->set_input_x(*npu_inputs[0]);

   if (pooling_param_->act_type_ != ActType_No) {
     ret = SetActivation(pooling_, pooling_param_->act_type_);
@@ -87,20 +81,16 @@ int PoolingNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs,
       return RET_ERROR;
     }
   }
-
-  if (pooling_param_->act_type_ == ActType_No) {
-    ret = SetPostTranspose(pooling_);
-  } else {
-    ret = SetPostTranspose(act_);
-  }
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "New post transpose npu operator (NCHW -> NHWC) for op " << name_ << " failed.";
-    return RET_ERROR;
-  }
   return RET_OK;
 }

-ge::Operator *mindspore::kernel::PoolingNPUKernel::GetNPUOp() { return post_trans_; }
+ge::Operator *mindspore::kernel::PoolingNPUKernel::GetNPUOp() {
+  if (pooling_param_->act_type_ == ActType_No) {
+    return pooling_;
+  } else {
+    return act_;
+  }
+}

 PoolingNPUKernel::~PoolingNPUKernel() {
   if (pooling_ != nullptr) {
src/runtime/kernel/npu/resize_npu.cc
@@ -36,12 +36,6 @@ int ResizeNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, const

 int ResizeNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
                                   const std::vector<ge::Operator *> &npu_inputs) {
-  auto ret = SetPreTranspose(npu_inputs[0]);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "New pre transpose npu operator (NHWC -> NCHW) for op " << name_ << " failed.";
-    return RET_ERROR;
-  }
-
   ge::TensorDesc sizeTensorDesc(ge::Shape({2}), ge::FORMAT_NCHW, ge::DT_INT32);
   ge::TensorPtr sizeTensor = std::make_shared<hiai::Tensor>(sizeTensorDesc);
   vector<int32_t> dataValue = {static_cast<int32_t>(new_height_), static_cast<int32_t>(new_width_)};
@@ -55,7 +49,7 @@ int ResizeNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, con
       return RET_ERROR;
     }
     op->set_attr_align_corners(align_corners_);
-    op->set_input_x(*pre_trans_);
+    op->set_input_x(*npu_inputs[0]);
     op->set_input_size(*out_size);
     op->set_attr_half_pixel_centers(preserve_aspect_ratio_);
     op_ = op;
@@ -66,21 +60,14 @@ int ResizeNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, con
       return RET_ERROR;
     }
     op->set_attr_align_corners(align_corners_);
-    op->set_input_x(*pre_trans_);
+    op->set_input_x(*npu_inputs[0]);
     op->set_input_size(*out_size);
     op_ = op;
   }

-  ret = SetPostTranspose(op_);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "New post transpose npu operator (NCHW -> NHWC) for op " << name_ << " failed.";
-    return RET_ERROR;
-  }
-
   return RET_OK;
 }

-ge::Operator *mindspore::kernel::ResizeNPUKernel::GetNPUOp() { return this->post_trans_; }
+ge::Operator *mindspore::kernel::ResizeNPUKernel::GetNPUOp() { return this->op_; }

 ResizeNPUKernel::~ResizeNPUKernel() {
   if (op_ != nullptr) {
src/runtime/kernel/npu/resize_npu.h
@@ -24,12 +24,12 @@
 #include "include/graph/op/all_ops.h"
 #include "src/runtime/kernel/npu/transpose_base_npu.h"
 namespace mindspore::kernel {
-class ResizeNPUKernel : public TransposeBaseNPUKernel {
+class ResizeNPUKernel : public NPUKernel {
  public:
   ResizeNPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                   const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                   const mindspore::lite::PrimitiveC *primitive)
-      : TransposeBaseNPUKernel(parameter, inputs, outputs, ctx, primitive) {
+      : NPUKernel(parameter, inputs, outputs, ctx, primitive) {
     auto resize_parameter = reinterpret_cast<ResizeParameter *>(parameter);
     method_ = resize_parameter->method_;
     new_height_ = resize_parameter->new_height_;
src/scheduler.cc
@@ -33,6 +33,8 @@
 #if SUPPORT_NPU
 #include "src/runtime/agent/npu/subgraph_npu_kernel.h"
 #include "src/runtime/agent/npu/npu_manager.h"
+#include "src/runtime/agent/npu/npu_transform_pass.h"
+#include "src/runtime/agent/npu/npu_fusion_pass.h"
 #endif
 namespace mindspore::lite {
 using kernel::KERNEL_ARCH::kCPU;
@@ -63,6 +65,11 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
     return ret;
   }
   FindAllInoutKernels(*dst_kernels);
+  ret = RunPass(dst_kernels);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Schedule run pass failed.";
+    return ret;
+  }
   ret = ConstructSubGraphs(dst_kernels);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConstructSubGraphs failed.";
@@ -514,4 +521,25 @@ void Scheduler::FindAllInoutKernels(const std::vector<kernel::LiteKernel *> &ker
     kernel->FindInoutKernels(kernels);
   }
 }
+
+int Scheduler::RunPass(std::vector<kernel::LiteKernel *> *dst_kernels) {
+  int ret = RET_OK;
+#if SUPPORT_NPU
+  auto transform_pass = new NPUTransformPass;
+  ret = transform_pass->FormatTransformPass(context_, dst_kernels, &src_tensors_);
+  delete transform_pass;
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Run npu format transform pass failed.";
+    return ret;
+  }
+  auto fusion_pass = new NPUFusionPass(dst_kernels);
+  ret = fusion_pass->Fusion();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Run npu fussion transform pass failed.";
+    return ret;
+  }
+  delete fusion_pass;
+#endif
+  return ret;
+}
 }  // namespace mindspore::lite
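RunPass chains the two NPU graph passes and stops at the first failure. The generic shape of that pipeline, as a standalone sketch with stand-in types (not lite APIs):

#include <functional>
#include <iostream>
#include <vector>

struct Kernel {};  // stand-in for kernel::LiteKernel

// Each pass edits the kernel list in place; the pipeline aborts on the first
// non-zero return, mirroring RunPass above.
int RunPipeline(std::vector<Kernel *> *kernels,
                const std::vector<std::function<int(std::vector<Kernel *> *)>> &passes) {
  for (const auto &pass : passes) {
    int ret = pass(kernels);
    if (ret != 0) {
      std::cerr << "pass failed: " << ret << '\n';
      return ret;
    }
  }
  return 0;
}

int main() {
  std::vector<Kernel *> kernels;
  auto transform = [](std::vector<Kernel *> *) { return 0; };  // stand-in for NPUTransformPass
  auto fusion = [](std::vector<Kernel *> *) { return 0; };     // stand-in for NPUFusionPass
  return RunPipeline(&kernels, {transform, fusion});
}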
src/scheduler.h
@@ -77,6 +77,8 @@ class Scheduler {

   static kernel::SubGraphType GetKernelSubGraphType(const kernel::LiteKernel *kernel);

+  int RunPass(std::vector<kernel::LiteKernel *> *dst_kernels);
+
  protected:
   const InnerContext *context_ = nullptr;
   Model *src_model_ = nullptr;