From 04e261ddc48c8b067e775ebde5c9514df034b206 Mon Sep 17 00:00:00 2001
From: ling <lingqiaomin@huawei.com>
Date: Wed, 17 Mar 2021 09:48:53 +0800
Subject: [PATCH] sub graph split

---
 mindspore/lite/CMakeLists.txt                 |   4 +
 mindspore/lite/micro/cmake/file_list.cmake    |   1 +
 mindspore/lite/src/CMakeLists.txt             |   1 +
 mindspore/lite/src/lite_kernel.cc             |  20 +-
 mindspore/lite/src/lite_kernel.h              |  16 +-
 mindspore/lite/src/lite_mindrt.cc             |  11 +-
 mindspore/lite/src/lite_mindrt.h              |   2 +
 mindspore/lite/src/lite_model.cc              |   6 +
 mindspore/lite/src/lite_model.h               |   1 +
 mindspore/lite/src/lite_session.cc            |  17 +-
 .../src/runtime/agent/npu/npu_executor.cc     |   5 +-
 .../runtime/agent/npu/subgraph_npu_kernel.h   |   1 +
 mindspore/lite/src/runtime/allocator.h        |   2 +-
 .../runtime/kernel/opencl/opencl_subgraph.h   |   1 +
 mindspore/lite/src/scheduler.cc               |  54 ++--
 mindspore/lite/src/scheduler.h                |   3 +-
 mindspore/lite/src/sub_graph_kernel.h         |   1 +
 mindspore/lite/src/sub_graph_split.cc         | 269 ++++++++++++++++++
 mindspore/lite/src/sub_graph_split.h          |  78 +++++
 mindspore/lite/src/tensor.cc                  |   4 +-
 mindspore/lite/src/tensor.h                   |   3 +-
 mindspore/lite/test/CMakeLists.txt            |   1 +
 mindspore/lite/tools/converter/CMakeLists.txt |   1 +
 23 files changed, 452 insertions(+), 50 deletions(-)
 create mode 100644 mindspore/lite/src/sub_graph_split.cc
 create mode 100644 mindspore/lite/src/sub_graph_split.h

diff --git a/mindspore/lite/CMakeLists.txt b/mindspore/lite/CMakeLists.txt
index 5cba5c89b02..07be73fc500 100644
--- a/mindspore/lite/CMakeLists.txt
+++ b/mindspore/lite/CMakeLists.txt
@@ -28,6 +28,7 @@ option(ENABLE_VERBOSE "" off)
 option(ENABLE_SSE "if x86_64 support SSE instruction set" off)
 option(ENABLE_AVX "if x86_64 support SSE instruction set" off)
 option(ENABLE_MINDRT "if support mindrt" on)
+option(SUBGRAPH_SPLIT "if support sub graph split" off)
 
 set(DIR_PREFIX mindspore-lite)
 set(MS_VERSION ${MS_VERSION_MAJOR}.${MS_VERSION_MINOR}.${MS_VERSION_REVISION})
@@ -57,6 +58,9 @@ else()
     set(PROCESS_UNIT cpu)
 endif()
 
+if(SUBGRAPH_SPLIT)
+    add_compile_definitions(SUBGRAPH_SPLIT)
+endif()
 
 if(SUPPORT_NPU)
     set(DDK_PATH "$ENV{HWHIAI_DDK}/ddk/ai_ddk_lib")
diff --git a/mindspore/lite/micro/cmake/file_list.cmake b/mindspore/lite/micro/cmake/file_list.cmake
index 7b00e993cd8..04be13d35e0 100644
--- a/mindspore/lite/micro/cmake/file_list.cmake
+++ b/mindspore/lite/micro/cmake/file_list.cmake
@@ -132,6 +132,7 @@ set(LITE_SRC
         ${LITE_DIR}/src/common/tensor_util.cc
         ${LITE_DIR}/src/runtime/infer_manager.cc
         ${LITE_DIR}/src/lite_model.cc
+        ${LITE_DIR}/src/sub_graph_split.cc
         ${LITE_DIR}/src/tensorlist.cc
         ${LITE_DIR}/src/tensor.cc
         ${LITE_DIR}/src/dequant.cc
diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt
index 0a2d0f01436..b79370f9b53 100644
--- a/mindspore/lite/src/CMakeLists.txt
+++ b/mindspore/lite/src/CMakeLists.txt
@@ -59,6 +59,7 @@ set(LITE_SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/kernel_registry.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/lite_kernel.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/sub_graph_kernel.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/sub_graph_split.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/scheduler.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/lite_session.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/errorcode.cc
diff --git a/mindspore/lite/src/lite_kernel.cc b/mindspore/lite/src/lite_kernel.cc
index 25f1ac14d78..3143174b3de 100644
--- a/mindspore/lite/src/lite_kernel.cc
+++ b/mindspore/lite/src/lite_kernel.cc
@@ -43,6 +43,16 @@ void LiteKernel::FreeWorkspace() {
   free(workspace_);
   workspace_ = nullptr;
 }
+
+int LiteKernel::DecOutTensorRefCount() {
+  for (auto *tensor : this->out_tensors_) {
+    tensor->set_ref_count(tensor->ref_count() - 1);
+    if (0 >= tensor->ref_count()) {
+      tensor->FreeData();
+    }
+  }
+  return 0;
+}
 #endif
 bool LiteKernel::IsReady(const std::vector<lite::Tensor *> &scope_tensors) {
   return std::all_of(this->in_tensors().begin(), this->in_tensors().end(), [&](lite::Tensor *in_tensor) {
@@ -66,16 +76,6 @@ void LiteKernel::InitOutTensorInitRefCount() {
   }
 }
 
-int LiteKernel::DecOutTensorRefCount() {
-  for (auto *tensor : this->out_tensors_) {
-    tensor->set_ref_count(tensor->ref_count() - 1);
-    if (0 >= tensor->ref_count()) {
-      tensor->FreeData();
-    }
-  }
-  return 0;
-}
-
 int LiteKernel::FreeInWorkTensor() const {
   for (auto &in_tensor : this->in_tensors_) {
     MS_ASSERT(in_tensor != nullptr);
diff --git a/mindspore/lite/src/lite_kernel.h b/mindspore/lite/src/lite_kernel.h
index 0c27f44a9a4..4106c8f0ee7 100644
--- a/mindspore/lite/src/lite_kernel.h
+++ b/mindspore/lite/src/lite_kernel.h
@@ -35,7 +35,16 @@ static constexpr int kPerTensor = 1;
 static constexpr size_t kPerBatch = 3;
 
 namespace mindspore::kernel {
-enum KERNEL_ARCH { kCPU, kGPU, kAPU, kNPU, kKernelArch_MIN = kCPU, kKernelArch_MAX = kNPU };
+enum KERNEL_ARCH {
+  kCPU,
+  kGPU,
+  kAPU,
+  kNPU,
+  kALL, /* Support GPU NPU CPU */
+  kKernelArch_MIN = kCPU,
+  kKernelArch_MAX = kALL
+};
+
 struct KernelKey {
   KERNEL_ARCH arch;
   TypeId data_type;
@@ -161,8 +170,6 @@ class LiteKernel {
 
   virtual void InitOutTensorInitRefCount();
 
-  int DecOutTensorRefCount();
-
   virtual int FreeInWorkTensor() const;
 
   KernelKey desc() const { return desc_; }
@@ -171,6 +178,8 @@ class LiteKernel {
 
   SubGraphType subgraph_type() const { return this->subgraph_type_; }
 
+  const lite::InnerContext *context() const { return this->context_; }
+
   virtual std::string ToString() const;
 
 #ifdef SUPPORT_TRAIN
@@ -179,6 +188,7 @@ class LiteKernel {
   static void AllocWorkspace(size_t size);
   static void FreeWorkspace();
   void *workspace() { return workspace_; }
+  int DecOutTensorRefCount();
 #endif
 
  protected:
diff --git a/mindspore/lite/src/lite_mindrt.cc b/mindspore/lite/src/lite_mindrt.cc
index 337a5a1f932..3ef293d7ffa 100644
--- a/mindspore/lite/src/lite_mindrt.cc
+++ b/mindspore/lite/src/lite_mindrt.cc
@@ -32,7 +32,7 @@ int LiteOpActor::CompileArrow() {
         }
       }
       if (to_input_index == -1) {
-        break;
+        continue;
       }
       auto id = out->name() + this->GetAID().Url();
       auto arrow = std::make_shared<OpArrow>(i, id, to_input_index);
@@ -41,12 +41,19 @@ int LiteOpActor::CompileArrow() {
         return RET_ERROR;
       }
       output_op_arrows_.emplace_back(std::move(arrow));
-      break;
     }
   }
   return RET_OK;
 }
 
+void LiteOpActor::AsyncOutput(OpContext<Tensor> *context) {
+  for (auto op_arrow : output_op_arrows_) {
+    auto data = context->outputData_->at(op_arrow->from_output_index_);
+    Async(op_arrow->to_op_id_, &mindspore::OpActor<Tensor>::RunOpData, data, context);
+  }
+  return;
+}
+
 void LiteOpActor::SetOutputData(OpContext<Tensor> *context) {
   auto size = context->outputData_->size();
   MS_ASSERT(size == context->results_->size());
diff --git a/mindspore/lite/src/lite_mindrt.h b/mindspore/lite/src/lite_mindrt.h
index 6f14d3fd8e9..fcf5f12666d 100644
--- a/mindspore/lite/src/lite_mindrt.h
+++ b/mindspore/lite/src/lite_mindrt.h
@@ -50,6 +50,7 @@ class LiteOpActor : public OpActor<lite::Tensor> {
       return;
     }
     input_op_datas_.erase(op_uuid);
+    AsyncOutput(context);
     SetOutputData(context);
   }
   void Init() {
@@ -83,6 +84,7 @@ class LiteOpActor : public OpActor<lite::Tensor> {
 
  private:
   void SetOutputData(OpContext<Tensor> *context);
+  void AsyncOutput(OpContext<Tensor> *context);
 
   kernel::LiteKernel *kernel_;
 };
diff --git a/mindspore/lite/src/lite_model.cc b/mindspore/lite/src/lite_model.cc
index ea4195af8fe..133ade34c43 100644
--- a/mindspore/lite/src/lite_model.cc
+++ b/mindspore/lite/src/lite_model.cc
@@ -104,6 +104,12 @@ void LiteModel::Free() {
     tensor_buf = nullptr;
   }
   attr_tensor_bufs_.resize(0);
+
+  for (auto &node_buf : node_bufs_) {
+    free(node_buf);
+    node_buf = nullptr;
+  }
+  node_bufs_.resize(0);
 }
 
 void LiteModel::Destroy() {
diff --git a/mindspore/lite/src/lite_model.h b/mindspore/lite/src/lite_model.h
index d7a6447ba71..11bf025c0f0 100644
--- a/mindspore/lite/src/lite_model.h
+++ b/mindspore/lite/src/lite_model.h
@@ -192,6 +192,7 @@ class LiteModel : public Model {
 
  public:
   size_t buf_size_ = 0;
+  std::vector<char *> node_bufs_;
 
  protected:
   std::vector<char *> attr_tensor_bufs_;
diff --git a/mindspore/lite/src/lite_session.cc b/mindspore/lite/src/lite_session.cc
index 987d1a1385b..7001a3f3312 100644
--- a/mindspore/lite/src/lite_session.cc
+++ b/mindspore/lite/src/lite_session.cc
@@ -399,8 +399,15 @@ int LiteSession::CompileGraph(Model *model) {
 #endif
   InitGraphInOutTensors(model);
 
+  ret = PrepareKernels(model);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare kernels failed: " << ret;
+    is_running_.store(false);
+    return ret;
+  }
+
 #ifdef ENABLE_MINDRT
-  if (context_->IsCpuEnabled() && !context_->IsGpuEnabled() && !context_->IsNpuEnabled() && kernels_.size() == 1) {
+  if (kernels_.size() == 1) {
     executor_ = new (std::nothrow) MindrtExecutor();
   } else {
     executor_ = new (std::nothrow) Executor();
@@ -420,16 +427,10 @@ int LiteSession::CompileGraph(Model *model) {
     is_running_.store(false);
     return ret;
   }
-  ret = PrepareKernels(model);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Prepare kernels failed: " << ret;
-    is_running_.store(false);
-    return ret;
-  }
 
   is_running_.store(false);
   return RET_OK;
-}
+}  // namespace lite
 
 int LiteSession::PrepareKernels(Model *model) {
   std::vector<kernel::LiteKernel *> all_kernels;
diff --git a/mindspore/lite/src/runtime/agent/npu/npu_executor.cc b/mindspore/lite/src/runtime/agent/npu/npu_executor.cc
index 38ff6b44897..99e47c58875 100644
--- a/mindspore/lite/src/runtime/agent/npu/npu_executor.cc
+++ b/mindspore/lite/src/runtime/agent/npu/npu_executor.cc
@@ -102,10 +102,7 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<
 
         memcpy(npu_input_tensors_[i]->GetBuffer(), data, in_tensors[index]->Size());
         inputs_visited[index] = true;
-        in_tensors[index]->set_ref_count(in_tensors[index]->ref_count() - 1);
-        if (in_tensors[index]->ref_count() <= 0) {
-          in_tensors[index]->FreeData();
-        }
+        in_tensors[index]->DecRefCount();
         break;
       }
     }
diff --git a/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.h b/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.h
index 490afbbe945..65b44abb4e6 100644
--- a/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.h
+++ b/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.h
@@ -38,6 +38,7 @@ class SubGraphNpuKernel : public SubGraphKernel {
                     const lite::InnerContext *ctx = nullptr, lite::NPUManager *npu_manager = nullptr)
       : SubGraphKernel(inputs, outputs, inKernels, outKernels, nodes, ctx), npu_manager_(npu_manager) {
     subgraph_type_ = kNpuSubGraph;
+    desc_.arch = kernel::KERNEL_ARCH::kNPU;
   }
 
   ~SubGraphNpuKernel() override;
diff --git a/mindspore/lite/src/runtime/allocator.h b/mindspore/lite/src/runtime/allocator.h
index dbd4568a99d..95f7c6c8d45 100644
--- a/mindspore/lite/src/runtime/allocator.h
+++ b/mindspore/lite/src/runtime/allocator.h
@@ -70,7 +70,7 @@ class DefaultAllocator : public Allocator {
   std::multimap<size_t, MemBuf *> freeList_;
   // 6 is empirical value
   int shiftFactor_ = 6;
-  bool lockFlag_ = false;
+  bool lockFlag_ = true;
 };
 
 constexpr int64_t MAX_MALLOC_SIZE = static_cast<size_t>(2000) * 1024 * 1024;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
index 01933cefad0..38ba38d3641 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
@@ -34,6 +34,7 @@ class OpenCLSubGraph : public SubGraphKernel {
       : SubGraphKernel(inputs, outputs, inKernels, outKernels, nodes, ctx) {
     ocl_runtime_ = ocl_runtime_wrap_.GetInstance();
     subgraph_type_ = kGpuSubGraph;
+    desc_.arch = kernel::KERNEL_ARCH::kGPU;
     this->name_ = "GpuSubGraph";
     nodes_set_.insert(nodes.begin(), nodes.end());
     all_kernels_infer_done_ = std::all_of(nodes_.begin(), nodes_.end(), [](const kernel::LiteKernel *kernel) {
diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc
index ea5aa210315..73136349ca9 100644
--- a/mindspore/lite/src/scheduler.cc
+++ b/mindspore/lite/src/scheduler.cc
@@ -30,6 +30,7 @@
 #include "src/common/version_manager.h"
 #include "src/common/prim_util.h"
 #include "src/runtime/infer_manager.h"
+#include "src/sub_graph_split.h"
 #include "src/dequant.h"
 #include "nnacl/matmul_parameter.h"
 #if GPU_OPENCL
@@ -71,6 +72,12 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
   }
 
   this->graph_output_node_indexes_ = GetGraphOutputNodes(src_model_);
+
+#ifdef SUBGRAPH_SPLIT
+  auto search_sub_graph = SearchSubGraph(src_model_, this->graph_output_node_indexes_);
+  search_sub_graph.SubGraphSplitByOutput();
+#endif
+
   bool infer_shape_interrupt = false;
   auto ret = InferSubGraphShape(kMainSubGraphIndex, &infer_shape_interrupt);
   if (ret != RET_OK) {
@@ -89,7 +96,11 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
     MS_LOG(ERROR) << "Schedule run pass failed.";
     return ret;
   }
-  ret = ConstructSubGraphs(dst_kernels);
+
+  auto src_kernel = *dst_kernels;
+  dst_kernels->clear();
+  std::map<const kernel::LiteKernel *, bool> is_kernel_finish;
+  ret = ConstructSubGraphs(src_kernel, dst_kernels, &is_kernel_finish);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConstructSubGraphs failed.";
     return ret;
@@ -465,6 +476,14 @@ kernel::LiteKernel *Scheduler::SchedulePartialToKernel(const lite::Model::Node *
     MS_LOG(ERROR) << "Schedule partial failed, name: " << src_node->name_;
     return nullptr;
   }
+
+  FindAllInoutKernels(sub_kernels);
+  ret = RunPass(&sub_kernels);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "SchedulePartialToKernel run pass failed.";
+    return nullptr;
+  }
+
   auto cur_sub_graph_type = mindspore::lite::Scheduler::GetKernelSubGraphType(sub_kernels.front());
   auto subgraph = CreateSubGraphKernel(sub_kernels, &in_tensors, &out_tensors, cur_sub_graph_type);
   subgraph->set_name("subgraph_" + src_node->name_);
@@ -594,35 +613,33 @@ std::vector<kernel::LiteKernel *> Scheduler::FindAllSubGraphKernels(
   return sub_kernels;
 }
 
-int Scheduler::ConstructSubGraphs(std::vector<kernel::LiteKernel *> *kernels) {
-  auto old_kernels = *kernels;
-  kernels->clear();
-  std::map<const kernel::LiteKernel *, bool> is_kernel_finish;
-  for (auto kernel : old_kernels) {
-    is_kernel_finish[kernel] = false;
+int Scheduler::ConstructSubGraphs(std::vector<kernel::LiteKernel *> src_kernel,
+                                  std::vector<kernel::LiteKernel *> *dst_kernel,
+                                  std::map<const kernel::LiteKernel *, bool> *is_kernel_finish) {
+  for (auto kernel : src_kernel) {
+    (*is_kernel_finish)[kernel] = false;
   }
-
   while (true) {
-    auto head_kernel_iter = std::find_if(old_kernels.begin(), old_kernels.end(), [&](const kernel::LiteKernel *kernel) {
+    auto head_kernel_iter = std::find_if(src_kernel.begin(), src_kernel.end(), [&](const kernel::LiteKernel *kernel) {
       auto kernel_inputs = kernel->in_kernels();
-      if (is_kernel_finish[kernel]) {
+      if ((*is_kernel_finish)[kernel]) {
         return false;
       }
       // when merge is removed, this if is removed automatically
       if (kernel->Type() == schema::PrimitiveType_Merge) {
-        return MergeOpIsReady(kernel, is_kernel_finish);
+        return MergeOpIsReady(kernel, (*is_kernel_finish));
       } else {
         return std::all_of(kernel_inputs.begin(), kernel_inputs.end(),
-                           [&](kernel::LiteKernel *kernel) { return is_kernel_finish[kernel]; });
+                           [&](kernel::LiteKernel *kernel) { return (*is_kernel_finish)[kernel]; });
       }
     });
-    if (head_kernel_iter == old_kernels.end()) {
+    if (head_kernel_iter == src_kernel.end()) {
       break;
     }
     auto head_kernel = *head_kernel_iter;
     if (head_kernel->subgraph_type() != kernel::kNotSubGraph) {
-      is_kernel_finish[head_kernel] = true;
-      kernels->emplace_back(head_kernel);
+      (*is_kernel_finish)[head_kernel] = true;
+      dst_kernel->push_back(head_kernel);
       continue;
     }
     if (head_kernel->desc().arch == mindspore::kernel::kAPU) {
@@ -630,15 +647,15 @@ int Scheduler::ConstructSubGraphs(std::vector<kernel::LiteKernel *> *kernels) {
       return RET_NOT_SUPPORT;
     }
     auto cur_sub_graph_type = mindspore::lite::Scheduler::GetKernelSubGraphType(head_kernel);
-    auto sub_kernels = FindAllSubGraphKernels(head_kernel, &is_kernel_finish);
+    auto sub_kernels = FindAllSubGraphKernels(head_kernel, is_kernel_finish);
     auto subgraph = CreateSubGraphKernel(sub_kernels, nullptr, nullptr, cur_sub_graph_type);
     if (subgraph == nullptr) {
       MS_LOG(ERROR) << "Create SubGraphKernel failed";
       return RET_ERROR;
     }
-    kernels->emplace_back(subgraph);
+    dst_kernel->emplace_back(subgraph);
   }
-  for (auto *subgraph : *kernels) {
+  for (auto *subgraph : *dst_kernel) {
     auto ret = subgraph->Init();
     if (ret != RET_OK) {
       MS_LOG(ERROR) << "Init SubGraph failed: " << ret;
@@ -832,6 +849,7 @@ int Scheduler::RunPass(std::vector<kernel::LiteKernel *> *dst_kernels) {
   npu_pass_manager_->AddPass(fusion_pass);
 
   ret = npu_pass_manager_->Run();
+  npu_pass_manager_->Clear();
 #endif
   return ret;
 }
diff --git a/mindspore/lite/src/scheduler.h b/mindspore/lite/src/scheduler.h
index f4fe4695201..589fc61dd85 100644
--- a/mindspore/lite/src/scheduler.h
+++ b/mindspore/lite/src/scheduler.h
@@ -74,7 +74,8 @@ class Scheduler {
   static void FindAllInoutKernels(const std::vector<kernel::LiteKernel *> &kernels);
 
   // vector<LiteKernel/SubGraphKernel> --> vector<SubGraphKernel>
-  int ConstructSubGraphs(std::vector<kernel::LiteKernel *> *kernels);
+  int ConstructSubGraphs(std::vector<kernel::LiteKernel *> src_kernel, std::vector<kernel::LiteKernel *> *dst_kernel,
+                         std::map<const kernel::LiteKernel *, bool> *sinked_kernel_map);
 
   // create subgraph_kernel from a vector of kernel
   kernel::SubGraphKernel *CreateSubGraphKernel(const std::vector<kernel::LiteKernel *> &kernels,
diff --git a/mindspore/lite/src/sub_graph_kernel.h b/mindspore/lite/src/sub_graph_kernel.h
index d4764fd93bd..0ae5a2d1118 100644
--- a/mindspore/lite/src/sub_graph_kernel.h
+++ b/mindspore/lite/src/sub_graph_kernel.h
@@ -128,6 +128,7 @@ class CpuSubGraph : public SubGraphKernel {
               std::vector<LiteKernel *> nodes, const lite::InnerContext *ctx)
       : SubGraphKernel(inputs, outputs, std::move(in_kernels), std::move(out_kernels), std::move(nodes), ctx) {
     subgraph_type_ = kCpuFP32SubGraph;
+    desc_.arch = kernel::KERNEL_ARCH::kCPU;
   }
 
   ~CpuSubGraph() override { delete this->executor_; }
diff --git a/mindspore/lite/src/sub_graph_split.cc b/mindspore/lite/src/sub_graph_split.cc
new file mode 100644
index 00000000000..763861fc2e4
--- /dev/null
+++ b/mindspore/lite/src/sub_graph_split.cc
@@ -0,0 +1,269 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/sub_graph_split.h"
+#include <vector>
+#include <utility>
+#include "src/tensor.h"
+#include "schema/inner/ops_generated.h"
+#include "schema/inner/model_generated.h"
+
+namespace mindspore::lite {
+#ifdef SUBGRAPH_SPLIT
+const schema::Primitive *SearchSubGraph::CreatePartialPrimitive(int64_t subgraph_index) {
+  flatbuffers::FlatBufferBuilder fbb(1024);
+  auto val_offset = schema::CreatePartialFusion(fbb, subgraph_index);
+  auto prim_offset = schema::CreatePrimitive(fbb, schema::PrimitiveType_PartialFusion, val_offset.o);
+  fbb.Finish(prim_offset);
+  auto tmp_buf = fbb.GetBufferPointer();
+  auto prim_buf = reinterpret_cast<char *>(malloc(fbb.GetSize()));
+  memcpy(prim_buf, tmp_buf, fbb.GetSize());
+
+  auto primitive = flatbuffers::GetRoot<schema::Primitive>(prim_buf);
+  fbb.Clear();
+
+  model_->node_bufs_.push_back(prim_buf);
+  return std::move(primitive);
+}
+
+void SearchSubGraph::ConvertSubGraphToModel() {
+  Model::SubGraph *main_graphs = model_->sub_graphs_.front();
+
+  for (Subgraph &subgraph : sub_graphs_) {
+    if (subgraph.nodes_.empty()) {
+      continue;
+    }
+    mindspore::kernel::KERNEL_ARCH device = subgraph.device_;
+
+    int new_sub_index = model_->sub_graphs_.size();
+    int partial_index = model_->all_nodes_.size();
+
+    Model::SubGraph *new_sub_graph = new (std::nothrow) Model::SubGraph();
+    if (new_sub_graph == nullptr) {
+      MS_LOG(ERROR) << "New sub graph failed!";
+      return;
+    }
+    new_sub_graph->name_ = "Subgraph-split-" + std::to_string(new_sub_index);
+
+    Model::Node *new_partial_node = new (std::nothrow) Model::Node();
+    if (new_partial_node == nullptr) {
+      MS_LOG(ERROR) << "New partial node failed!";
+      return;
+    }
+    new_partial_node->name_ = "Partial-subgraph-split-" + std::to_string(new_sub_index);
+    new_partial_node->node_type_ = mindspore::lite::NodeType_ValueNode;
+    new_partial_node->primitive_ = CreatePartialPrimitive(new_sub_index);
+
+    while (!subgraph.nodes_.empty()) {
+      uint32_t node_index = subgraph.nodes_.front();
+      new_sub_graph->node_indices_.push_back(node_index);
+      VectorErase(&main_graphs->node_indices_, node_index);
+      VectorErase(&subgraph.nodes_, node_index);
+      model_->all_nodes_[node_index]->device_type_ = device;
+    }
+
+    for (uint32_t head_index : subgraph.heads_) {
+      Model::Node *head_node = model_->all_nodes_[head_index];
+      std::vector<uint32_t> inputs = head_node->input_indices_;
+      for (auto input : inputs) {
+        if (tensors_[input].type_ == CONST) {
+          continue;
+        }
+        if (std::find(new_sub_graph->input_indices_.begin(), new_sub_graph->input_indices_.end(), input) !=
+            new_sub_graph->input_indices_.end()) {
+          continue;
+        }
+        new_sub_graph->input_indices_.insert(new_sub_graph->input_indices_.end(), input);
+        new_partial_node->input_indices_.insert(new_partial_node->input_indices_.end(), input);
+      }
+    }
+
+    for (uint32_t end_index : subgraph.ends_) {
+      Model::Node *end_node = model_->all_nodes_[end_index];
+      std::vector<uint32_t> outputs = end_node->output_indices_;
+      new_sub_graph->output_indices_.insert(new_sub_graph->output_indices_.end(), outputs.begin(), outputs.end());
+      new_partial_node->output_indices_.insert(new_partial_node->output_indices_.end(), outputs.begin(), outputs.end());
+    }
+
+    main_graphs->node_indices_.push_back(partial_index);
+    model_->all_nodes_.push_back(std::move(new_partial_node));
+    model_->sub_graphs_.push_back(std::move(new_sub_graph));
+  }
+  return;
+}
+
+bool SearchSubGraph::IsNodeSubGraphHead(uint32_t node_index, const std::vector<uint32_t> &ready_nodes) {
+  std::vector<uint32_t> output_indexes = node_list_[node_index]->output_indices_;
+  std::vector<uint32_t> output_nodes;
+  for (uint32_t out_t : output_indexes) {
+    std::vector<uint32_t> cur_nodes = tensors_[out_t].in_nodes_;
+    output_nodes.insert(output_nodes.end(), cur_nodes.begin(), cur_nodes.end());
+  }
+  for (uint32_t out_n : output_nodes) {
+    if (find(ready_nodes.begin(), ready_nodes.end(), out_n) == ready_nodes.end()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void SearchSubGraph::InsertNode(uint32_t index, Subgraph *subgraph) {
+  if (subgraph->search_terminate_) {
+    return;
+  }
+
+  Model::Node *node = node_list_[index];
+  if (node == nullptr) {
+    return;
+  }
+
+  std::vector<uint32_t> input = node->input_indices_;
+  /* remove const node */
+  for (int i = input.size() - 1; i >= 0; i--) {
+    if (tensors_[input[i]].type_ == CONST) {
+      input.erase(input.begin() + i);
+    }
+  }
+
+  /* all node_input is graph_input */
+  for (size_t i = 0; i < input.size(); i++) {
+    if (tensors_[input[i]].type_ != INPUT) {
+      break;
+    }
+    subgraph->heads_.clear();
+    subgraph->ends_.clear();
+    subgraph->nodes_.clear();
+    subgraph->search_terminate_ = true;
+    return;
+  }
+
+  /* split in graph */
+  if (IsNodeSubGraphHead(index, subgraph->nodes_)) {
+    if (subgraph->nodes_.empty()) {
+      subgraph->search_terminate_ = true;
+      return;
+    }
+    subgraph->heads_.push_back(subgraph->nodes_.front());
+    return;
+  }
+
+  if (find(output_nodes_.begin(), output_nodes_.end(), index) != output_nodes_.end()) {
+    subgraph->ends_.push_back(index);
+  }
+
+  /* node insert in current subgraph */
+  subgraph->nodes_.insert(subgraph->nodes_.begin(), index);
+  node_list_[index] = nullptr;
+
+  /* search for next node */
+  for (uint32_t in : input) {
+    auto next_nodes = tensors_[in].out_nodes_;
+    for (uint32_t next_node : next_nodes) {
+      InsertNode(next_node, subgraph);
+    }
+  }
+  return;
+}
+
+void SearchSubGraph::InitSearchSubGraph() {
+  for (uint32_t out : output_nodes_) {
+    Subgraph subgraph;
+
+    InsertNode(out, &subgraph);
+
+    sub_graphs_.push_back(std::move(subgraph));
+  }
+  return;
+}
+
+void SearchSubGraph::InitSearchTensor() {
+  tensors_.resize(model_->all_tensors_.size());
+
+  /* Set Tensor Type */
+  for (size_t i = 0; i < tensors_.size(); i++) {
+    tensors_[i].type_ = NORMAL;
+    mindspore::schema::Tensor *src_tensor = model_->all_tensors_[i];
+    auto category = TensorCategory(src_tensor);
+    if (category == mindspore::lite::Tensor::Category::CONST_TENSOR ||
+        category == mindspore::lite::Tensor::Category::CONST_SCALAR) {
+      tensors_[i].type_ = CONST;
+    }
+  }
+  std::vector<uint32_t> graph_input = model_->sub_graphs_[0]->input_indices_;
+  for (auto in : graph_input) {
+    tensors_[in].type_ = INPUT;
+  }
+
+  /* Set Tensor In and out Node */
+  for (size_t index = 0; index < model_->all_nodes_.size(); index++) {
+    Model::Node *node = model_->all_nodes_[index];
+    std::vector<uint32_t> input = node->input_indices_;
+    for (uint32_t in : input) {
+      tensors_[in].in_nodes_.push_back(index);
+    }
+    std::vector<uint32_t> output = node->output_indices_;
+    for (uint32_t out : output) {
+      tensors_[out].out_nodes_.push_back(index);
+    }
+  }
+  return;
+}
+
+void SearchSubGraph::InitSubgraphDevice() {
+  sub_graphs_[0].device_ = kernel::KERNEL_ARCH::kCPU;
+  sub_graphs_[1].device_ = kernel::KERNEL_ARCH::kALL;
+}
+
+void SearchSubGraph::InitMainGraphDevice() {
+  kernel::KERNEL_ARCH main_device = kernel::KERNEL_ARCH::kALL;
+  Model::SubGraph *main_graph = model_->sub_graphs_.front();
+  for (uint32_t node_index : main_graph->node_indices_) {
+    Model::Node *node = model_->all_nodes_[node_index];
+    node->device_type_ = main_device;
+  }
+}
+
+void SearchSubGraph::SubgraphFusion() {
+  Subgraph new_npu_sub;
+  Subgraph &npu_sub1 = sub_graphs_[1];
+  Subgraph &npu_sub2 = sub_graphs_[2];
+  new_npu_sub.nodes_.insert(new_npu_sub.nodes_.end(), npu_sub1.nodes_.begin(), npu_sub1.nodes_.end());
+  new_npu_sub.nodes_.insert(new_npu_sub.nodes_.end(), npu_sub2.nodes_.begin(), npu_sub2.nodes_.end());
+  new_npu_sub.heads_.insert(new_npu_sub.heads_.end(), npu_sub1.heads_.begin(), npu_sub1.heads_.end());
+  new_npu_sub.heads_.insert(new_npu_sub.heads_.end(), npu_sub2.heads_.begin(), npu_sub2.heads_.end());
+  new_npu_sub.ends_.insert(new_npu_sub.ends_.end(), npu_sub1.ends_.begin(), npu_sub1.ends_.end());
+  new_npu_sub.ends_.insert(new_npu_sub.ends_.end(), npu_sub2.ends_.begin(), npu_sub2.ends_.end());
+  sub_graphs_.erase(sub_graphs_.begin() + 2);
+  sub_graphs_.erase(sub_graphs_.begin() + 1);
+  sub_graphs_.insert(sub_graphs_.end(), std::move(new_npu_sub));
+  return;
+}
+
+void SearchSubGraph::SubGraphSplitByOutput() {
+  InitSearchTensor();
+
+  InitSearchSubGraph();
+
+  SubgraphFusion();
+
+  InitSubgraphDevice();
+
+  ConvertSubGraphToModel();
+
+  InitMainGraphDevice();
+}
+#endif
+}  // namespace mindspore::lite
diff --git a/mindspore/lite/src/sub_graph_split.h b/mindspore/lite/src/sub_graph_split.h
new file mode 100644
index 00000000000..c0fc42a1fad
--- /dev/null
+++ b/mindspore/lite/src/sub_graph_split.h
@@ -0,0 +1,78 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_SUB_GRAPH_SPLIT_H_
+#define MINDSPORE_LITE_SRC_SUB_GRAPH_SPLIT_H_
+
+#include <stack>
+#include <vector>
+#include "include/model.h"
+#include "src/lite_kernel.h"
+#include "src/lite_model.h"
+
+namespace mindspore::lite {
+#ifdef SUBGRAPH_SPLIT
+class SearchSubGraph {
+  enum TensorType { NORMAL, CONST, INPUT };
+
+  struct Tensor {
+    std::vector<uint32_t> in_nodes_; /* used current tensor as input */
+    std::vector<uint32_t> out_nodes_;
+    TensorType type_;
+  };
+
+  struct Subgraph {
+    std::vector<uint32_t> nodes_;
+    std::vector<uint32_t> heads_;
+    std::vector<uint32_t> ends_;
+    bool search_terminate_ = false;
+    mindspore::kernel::KERNEL_ARCH device_;
+  };
+
+ public:
+  SearchSubGraph(Model *model, std::vector<size_t> output_nodes) {
+    output_nodes_.insert(output_nodes_.end(), output_nodes.begin(), output_nodes.end());
+    node_list_ = model->all_nodes_;
+    model_ = reinterpret_cast<LiteModel *>(model);
+  }
+  ~SearchSubGraph() = default;
+
+ public:
+  void SubGraphSplitByOutput();
+
+ private:
+  void InitSearchTensor();
+  void InitSearchSubGraph();
+  void ConvertSubGraphToModel();
+  void InsertNode(uint32_t index, Subgraph *subgraph);
+  bool IsNodeSubGraphHead(uint32_t node_index, const std::vector<uint32_t> &ready_nodes);
+  const schema::Primitive *CreatePartialPrimitive(int64_t subgraph_index);
+  void InitSubgraphDevice();
+  void SubgraphFusion();
+  void InitMainGraphDevice();
+
+ private:
+  LiteModel *model_ = nullptr;
+  std::vector<Tensor> tensors_;
+  std::vector<Subgraph> sub_graphs_;
+  std::vector<size_t> output_nodes_;
+  std::vector<Model::Node *> node_list_;
+};
+
+#endif
+}  // namespace mindspore::lite
+
+#endif  // MINDSPORE_LITE_SRC_SUB_GRAPH_SPLIT_H_
diff --git a/mindspore/lite/src/tensor.cc b/mindspore/lite/src/tensor.cc
index f15b402c118..c9f7ce85cca 100644
--- a/mindspore/lite/src/tensor.cc
+++ b/mindspore/lite/src/tensor.cc
@@ -352,8 +352,8 @@ void Tensor::DecRefCount() {
   if (this->IsConst() || this->IsGraphInput()) {
     return;
   }
-  this->ref_count_--;
-  if (this->ref_count_ <= 0) {
+  bool free_data = --ref_count_ <= 0;
+  if (free_data) {
     FreeData();
     this->ref_count_ = 0;
   }
diff --git a/mindspore/lite/src/tensor.h b/mindspore/lite/src/tensor.h
index 4929aa69d2b..1fb28d66611 100644
--- a/mindspore/lite/src/tensor.h
+++ b/mindspore/lite/src/tensor.h
@@ -22,6 +22,7 @@
 #include <string>
 #include <numeric>
 #include <functional>
+#include <atomic>
 #include "include/ms_tensor.h"
 #include "src/runtime/allocator.h"
 
@@ -205,7 +206,7 @@ class Tensor : public mindspore::tensor::MSTensor {
   std::vector<int> shape_;
   schema::Format format_;
   Category category_;
-  size_t ref_count_ = 0;
+  std::atomic_int ref_count_ = 0;
   size_t init_ref_count_ = 0;
   std::vector<QuantArg> quant_params_;
   std::vector<float> quant_clusters_;
diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt
index 66634ae1835..d384fdc6678 100644
--- a/mindspore/lite/test/CMakeLists.txt
+++ b/mindspore/lite/test/CMakeLists.txt
@@ -144,6 +144,7 @@ set(TEST_LITE_SRC
         ${LITE_DIR}/src/dequant.cc
         ${LITE_DIR}/src/huffman_decode.cc
         ${LITE_DIR}/src/sub_graph_kernel.cc
+        ${LITE_DIR}/src/sub_graph_split.cc
         ${LITE_DIR}/src/lite_model.cc
         ${LITE_DIR}/src/scheduler.cc
         ${LITE_DIR}/src/common/graph_util.cc
diff --git a/mindspore/lite/tools/converter/CMakeLists.txt b/mindspore/lite/tools/converter/CMakeLists.txt
index 7a9b12fbc0f..91d48d96a6b 100644
--- a/mindspore/lite/tools/converter/CMakeLists.txt
+++ b/mindspore/lite/tools/converter/CMakeLists.txt
@@ -109,6 +109,7 @@ set(LITE_SRC
         ${SRC_DIR}/lite_kernel.cc
         ${SRC_DIR}/scheduler.cc
         ${SRC_DIR}/sub_graph_kernel.cc
+        ${SRC_DIR}/sub_graph_split.cc
         ${SRC_DIR}/lite_session.cc
         ${SRC_DIR}/executor.cc
         ${SRC_DIR}/lite_model.cc