!23301 optimize allocator for lite inference

Merge pull request !23301 from ling/sr
This commit is contained in:
i-robot 2021-09-24 09:46:51 +00:00 committed by Gitee
commit 8d7483186f
15 changed files with 575 additions and 102 deletions

View File

@ -92,6 +92,7 @@ set(LITE_SRC
${CMAKE_CURRENT_SOURCE_DIR}/common/prim_util.cc
${CMAKE_CURRENT_SOURCE_DIR}/common/tensor_util.cc
${CMAKE_CURRENT_SOURCE_DIR}/runtime/inner_allocator.cc
${CMAKE_CURRENT_SOURCE_DIR}/runtime/optimize_allocator.cc
${CMAKE_CURRENT_SOURCE_DIR}/runtime/infer_manager.cc
${CMAKE_CURRENT_SOURCE_DIR}/tensor.cc
${CMAKE_CURRENT_SOURCE_DIR}/ms_tensor.cc

View File

@ -141,7 +141,7 @@ int LiteOpActor::IsolateInputData(std::vector<std::shared_ptr<LiteOpActor>> *act
for (LiteQuantParam quant : old_tensor->quant_params()) {
new_tensor->AddQuantParam(quant);
}
isolate_input_map_.insert(std::make_pair(new_tensor, old_tensor));
isolate_input_map_->insert(std::make_pair(new_tensor, old_tensor));
ReplaceNodeInTensor(kernel_, old_tensor, new_tensor);
/* set subgraph input for copy data */
kernel_->set_in_tensor(new_tensor, i);
@ -149,7 +149,10 @@ int LiteOpActor::IsolateInputData(std::vector<std::shared_ptr<LiteOpActor>> *act
return RET_OK;
}
int LiteOpActor::LiteActorInit(std::vector<std::shared_ptr<LiteOpActor>> *actors) {
int LiteOpActor::LiteActorInit(std::vector<std::shared_ptr<LiteOpActor>> *actors,
std::unordered_map<Tensor *, Tensor *> *input_map) {
isolate_input_map_ = input_map;
/* Init output arrow */
auto ret = CompileArrow();
if (ret != RET_OK) {
@ -175,7 +178,7 @@ int LiteOpActor::LiteActorInit(std::vector<std::shared_ptr<LiteOpActor>> *actors
int LiteOpActor::ResizeGraphInput(const std::vector<mindspore::tensor::MSTensor *> &inputs,
const std::vector<std::vector<int>> &dims) {
for (auto map : isolate_input_map_) {
for (auto map : *isolate_input_map_) {
auto isolate_tensor = map.first;
auto src_tensor = map.second;
for (size_t i = 0; i < inputs.size(); i++) {

View File

@ -51,11 +51,6 @@ class LiteOpActor : public OpActor<lite::Tensor> {
#endif
}
~LiteOpActor() override {
for (auto map : isolate_input_map_) {
auto isolate_input_tensor = map.first;
isolate_input_tensor->set_data(nullptr);
delete isolate_input_tensor;
}
delete call_node_;
delete partial_node_;
}
@ -69,7 +64,8 @@ class LiteOpActor : public OpActor<lite::Tensor> {
}
return ret;
}
int LiteActorInit(std::vector<std::shared_ptr<LiteOpActor>> *actors);
int LiteActorInit(std::vector<std::shared_ptr<LiteOpActor>> *actors,
std::unordered_map<Tensor *, Tensor *> *input_map);
int ResizeGraphInput(const std::vector<mindspore::tensor::MSTensor *> &inputs,
const std::vector<std::vector<int>> &dims);
@ -93,7 +89,7 @@ class LiteOpActor : public OpActor<lite::Tensor> {
std::unordered_map<kernel::LiteKernel *, AID> subgraph_to_actor_{};
std::vector<OpDataPtr<Tensor>> outputs_data_{};
std::vector<Tensor *> inputs_data_{};
std::unordered_map<Tensor *, Tensor *> isolate_input_map_{}; /* <calculate-tensor, src-input-tensor> */
std::unordered_map<Tensor *, Tensor *> *isolate_input_map_ = nullptr; /* real obj in session */
private:
void ReplaceNodeInTensor(kernel::LiteKernel *kernel, Tensor *old_tensor, Tensor *new_tensor);

View File

@ -31,6 +31,7 @@
#include "src/kernel_registry.h"
#include "src/lite_model.h"
#include "src/weight_decoder.h"
#include "src/runtime/optimize_allocator.h"
#ifdef ENABLE_MINDRT
#include "src/mindrt_executor.h"
#endif
@ -430,7 +431,7 @@ int LiteSession::IsolateOutputTensor() {
}
src_tensor->set_ref_count(1);
graph_output_map_.insert(std::make_pair(new_tensor, src_tensor));
isolate_graph_output_map_.insert(std::make_pair(new_tensor, src_tensor));
/* set new tensor for calculate */
for (auto subgraph : kernels_) {
@ -471,6 +472,8 @@ int LiteSession::IsolateOutputTensor() {
}
void LiteSession::FreePackOpWeight(const std::vector<kernel::LiteKernel *> &kernels) {
// For reducing runtime RAM
// free pack-op weight because pack-op will not access the origin weight at runtime
for (auto *kernel : kernels) {
MS_ASSERT(kernel != nullptr);
if (kernel->subgraph_type() == kernel::kNotSubGraph) {
@ -493,29 +496,14 @@ void LiteSession::FreePackOpWeight(const std::vector<kernel::LiteKernel *> &kern
}
int LiteSession::CompileGraph(Model *model) {
bool expected = false;
if (!is_running_.compare_exchange_strong(expected, true)) {
MS_LOG(ERROR) << "Not support multi-threading";
return RET_ERROR;
}
// model.MetaGraph ==> kernels
if (model == nullptr) {
MS_LOG(ERROR) << "The input model is nullptr.";
auto ret = PreCheck(model);
if (ret != RET_OK) {
MS_LOG(ERROR) << "schedule check failed: " << ret;
is_running_.store(false);
return RET_PARAM_INVALID;
}
if (model->buf == nullptr) {
MS_LOG(ERROR) << "The input model buf is nullptr.";
is_running_.store(false);
return RET_PARAM_INVALID;
}
if (!reinterpret_cast<LiteModel *>(model)->ModelVerify()) {
MS_LOG(ERROR) << "wrong model input, please check";
is_running_.store(false);
return RET_ERROR;
return ret;
}
auto ret = ConvertTensors(model);
ret = ConvertTensors(model);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvertTensors failed: " << ret;
is_running_.store(false);
@ -523,14 +511,10 @@ int LiteSession::CompileGraph(Model *model) {
}
InitGraphInputTensors(model);
InitGraphOutputTensors(model);
#ifndef ENABLE_FP16
if (context_->GetCpuInfo().enable_float16_) {
MS_LOG(WARNING) << unsupport_fp16_log;
}
#endif
// scheduler kernels
Scheduler scheduler(context_, ms_context_, model, &tensors_, inputs_, outputs_, is_train_session_, execution_plan_,
delegate_, delegate_device_type_);
Scheduler scheduler(context_, ms_context_, model, &tensors_, inputs_, outputs_, is_train_session_, &is_infershape_,
&is_control_flow_, execution_plan_, delegate_, delegate_device_type_);
scheduler.SetupSchedulerCb(std::move(sched_cb_));
ret = scheduler.Schedule(&kernels_);
if (ret != RET_OK) {
@ -552,33 +536,22 @@ int LiteSession::CompileGraph(Model *model) {
return RET_OK;
}
#ifdef ENABLE_MINDRT
ret = IsolateOutputTensor();
ret = InitExecutor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Isolate output tensor failed.";
is_running_.store(false);
return ret;
}
executor_ = new (std::nothrow) MindrtExecutor(&graph_output_map_);
#else
executor_ = new (std::nothrow) Executor();
#endif
if (executor_ == nullptr) {
MS_LOG(ERROR) << "New Executor failed";
is_running_.store(false);
return RET_ERROR;
}
ret = executor_->Prepare(this->kernels_, this->inputs_, this->outputs_, context_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare executor failed: " << ret;
MS_LOG(ERROR) << "InitExecutor failed: " << ret;
is_running_.store(false);
return ret;
}
// For reducing runtime RAM, free packop weight because packop will pack weight and will not access to origin weight
FreePackOpWeight(kernels_);
ret = OptimizeRuntimeAllocator();
if (ret != RET_OK) {
MS_LOG(ERROR) << "OptimizeRuntimeAllocator failed.";
is_running_.store(false);
return ret;
}
is_running_.store(false);
return RET_OK;
}
@ -824,19 +797,25 @@ LiteSession::~LiteSession() {
tensor = nullptr;
}
for (auto item : graph_output_map_) {
for (auto item : isolate_graph_output_map_) {
auto isolate_output_tensor = item.first;
isolate_output_tensor->set_data(nullptr);
delete isolate_output_tensor;
isolate_output_tensor = nullptr;
}
for (auto map : isolate_input_map_) {
auto isolate_input_tensor = map.first;
isolate_input_tensor->set_data(nullptr);
delete isolate_input_tensor;
}
// Tensor * in input_map and output_map are freed in tensors_
input_map_.clear();
output_node_map_.clear();
output_tensor_map_.clear();
input_vec_.clear();
graph_output_map_.clear();
isolate_graph_output_map_.clear();
delete this->executor_;
this->executor_ = nullptr;
@ -986,6 +965,157 @@ int LiteSession::Resize(const std::vector<mindspore::tensor::MSTensor *> &inputs
return RET_OK;
}
int LiteSession::PreCheck(Model *model) {
bool expected = false;
if (!is_running_.compare_exchange_strong(expected, true)) {
MS_LOG(ERROR) << "Not support multi-threading";
return RET_ERROR;
}
if (model == nullptr) {
MS_LOG(ERROR) << "The input model is nullptr.";
return RET_PARAM_INVALID;
}
if (model->buf == nullptr) {
MS_LOG(ERROR) << "The input model buf is nullptr.";
return RET_PARAM_INVALID;
}
if (!reinterpret_cast<LiteModel *>(model)->ModelVerify()) {
MS_LOG(ERROR) << "wrong model input, please check";
return RET_ERROR;
}
#ifndef ENABLE_FP16
if (context_->GetCpuInfo().enable_float16_) {
MS_LOG(WARNING) << unsupport_fp16_log;
}
#endif
return RET_OK;
}
int LiteSession::InitExecutor() {
int ret = RET_OK;
#ifdef ENABLE_MINDRT
ret = IsolateOutputTensor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Isolate output tensor failed.";
return ret;
}
executor_ = new (std::nothrow) MindrtExecutor(&isolate_graph_output_map_, &isolate_input_map_);
#else
executor_ = new (std::nothrow) Executor();
#endif
if (executor_ == nullptr) {
MS_LOG(ERROR) << "New Executor failed";
return RET_ERROR;
}
ret = executor_->Prepare(kernels_, inputs_, outputs_, context_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare executor failed: " << ret;
return ret;
}
return RET_OK;
}
int LiteSession::OptimizeRuntimeAllocator() {
if (is_infershape_ != RET_OK) {
MS_LOG(ERROR) << "Not support opt allocator in runtime-infershape.";
return RET_OK;
}
if (is_control_flow_ == true) {
MS_LOG(ERROR) << "Not support opt allocator in control flow model.";
return RET_OK;
}
AllocatorPtr default_allocator = context_->allocator;
OptAllocatorPtr optimize_allocator = std::make_shared<OptimizeAllocator>();
std::unordered_map<lite::Tensor *, int> ref_count;
for (auto subgraph : kernels_) {
if (subgraph->desc().arch != kernel::KERNEL_ARCH::kCPU) {
continue;
}
for (auto in_tensor : subgraph->in_tensors()) {
auto iter = isolate_input_map_.find(in_tensor);
if (isolate_input_map_.end() == iter) break;
auto src_t = iter->second;
if (src_t->data_type() == in_tensor->data_type()) {
in_tensor->set_allocator(src_t->allocator());
ref_count[src_t] += in_tensor->init_ref_count();
continue;
}
if (src_t->allocator() == default_allocator) {
src_t->set_allocator(optimize_allocator);
ref_count[src_t] = src_t->init_ref_count();
optimize_allocator->MallocTensorData(src_t);
}
if (ref_count[in_tensor]-- <= 0) {
optimize_allocator->FreeTensorData(in_tensor);
}
}
auto kernel_list = reinterpret_cast<kernel::SubGraphKernel *>(subgraph)->nodes();
for (auto kernel : kernel_list) {
/* malloc for output */
for (auto tensor : kernel->out_tensors()) {
if (tensor->IsGraphOutput() == true) {
continue;
}
if (tensor->allocator() != default_allocator) {
continue;
}
tensor->set_allocator(optimize_allocator);
ref_count[tensor] = tensor->init_ref_count();
optimize_allocator->MallocTensorData(tensor);
}
/* free input after run */
for (auto tensor : kernel->in_tensors()) {
if (tensor->allocator() != optimize_allocator) {
continue;
}
if (ref_count[tensor]-- <= 0) {
optimize_allocator->FreeTensorData(tensor);
}
}
}
}
auto ret = OptAllocatorSetData(optimize_allocator);
if (ret != RET_OK) {
MS_LOG(ERROR) << "using optimize allocator failed.";
return ret;
}
return RET_OK;
}
int LiteSession::OptAllocatorSetData(OptAllocatorPtr opt_allocator) {
void *data = opt_allocator->MallocOptData();
if (data == nullptr) {
MS_LOG(ERROR) << "malloc optimize data failed.";
return RET_ERROR;
}
int8_t *int8_data = reinterpret_cast<int8_t *>(data);
auto offset_map = opt_allocator->GetOffsetMap();
for (auto tensor : tensors_) {
if (tensor->allocator() != opt_allocator) {
continue;
}
auto offset_iter = offset_map.find(tensor);
if (offset_iter == offset_map.end()) {
return RET_ERROR;
}
tensor->set_data(int8_data + offset_iter->second);
}
return RET_OK;
}
int LiteSession::InitGPURuntime() {
if (context_->IsCpuEnabled()) {
CpuBindMode cpu_bind_mode = context_->GetCpuDeviceInfo()->cpu_bind_mode_;

View File

@ -28,6 +28,7 @@
#include "include/lite_session.h"
#include "include/model.h"
#include "src/inner_context.h"
#include "src/runtime/optimize_allocator.h"
#include "schema/model_generated.h"
#include "src/executor.h"
#include "src/tensor.h"
@ -125,12 +126,19 @@ class LiteSession : public session::LiteSession {
static void FreePackOpWeight(const std::vector<kernel::LiteKernel *> &kernels);
private:
int PreCheck(Model *model);
int InitExecutor();
void ResetInputsShape(const std::vector<std::vector<int>> &dims);
int InitGPURuntime();
bool IsIsolatedSubGraph(kernel::LiteKernel *kernel);
int OptimizeRuntimeAllocator();
int OptAllocatorSetData(OptAllocatorPtr opt_allocator);
protected:
InnerContext *context_ = nullptr;
mindspore::Context *ms_context_ = nullptr;
@ -150,7 +158,11 @@ class LiteSession : public session::LiteSession {
std::vector<std::string> output_tensor_names_;
// graph output tensor name -- output tensor
std::unordered_map<std::string, mindspore::tensor::MSTensor *> output_tensor_map_;
std::unordered_map<Tensor *, Tensor *> graph_output_map_; /* <calculate-tensor, graph-output-tensor> */
// graph isolate tensors
std::unordered_map<Tensor *, Tensor *> isolate_graph_output_map_; /* <calculate-tensor, graph-output-tensor> */
std::unordered_map<Tensor *, Tensor *> isolate_input_map_; /* <calculate-tensor, src-input-tensor> */
Executor *executor_ = nullptr;
Model *model_ = nullptr;
std::atomic<bool> is_running_ = {false};
@ -159,6 +171,8 @@ class LiteSession : public session::LiteSession {
#if GPU_OPENCL
opencl::OpenCLRuntimeInnerWrapper *opencl_runtime_wrapper_{nullptr};
#endif
int is_infershape_{RET_ERROR};
bool is_control_flow_ = false;
std::unique_ptr<SchedulerCb> sched_cb_;
std::shared_ptr<Delegate> delegate_ = nullptr;
int delegate_device_type_ = -1; // -1: not specified; 0: CPU; 1: GPU; 2: NPU

View File

@ -55,13 +55,13 @@ int MindrtExecutor::PrepareOutputData(const std::vector<kernel::LiteKernel *> &k
continue;
}
auto current_output_map =
std::find_if(output_tensor_map_->begin(), output_tensor_map_->end(), [&](const auto output_map_tensor) {
std::find_if(isolate_output_map_->begin(), isolate_output_map_->end(), [&](const auto output_map_tensor) {
if (graph_output_tensor == output_map_tensor.second) {
return true;
}
return false;
});
MS_ASSERT(current_output_map != output_tensor_map_->end());
MS_ASSERT(current_output_map != isolate_output_map_->end());
Tensor *subgraph_output_tensor = current_output_map->first;
for (size_t j = 0; j < kernels.size(); ++j) {
@ -120,7 +120,7 @@ int MindrtExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels, co
}
for (auto actor : op_actors_) {
ret = actor->LiteActorInit(&op_actors_);
ret = actor->LiteActorInit(&op_actors_, isolate_input_map_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "LiteActorInit failed, actor aid: " << actor->GetAID();
return ret;
@ -131,7 +131,7 @@ int MindrtExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels, co
}
void MindrtExecutor::TransferGraphOutput() {
for (auto tensor_map : *output_tensor_map_) {
for (auto tensor_map : *isolate_output_map_) {
auto dst_tensor = tensor_map.second;
auto src_tensor = tensor_map.first;
dst_tensor->set_shape(src_tensor->shape());
@ -151,7 +151,7 @@ void MindrtExecutor::TransferGraphOutput() {
}
void MindrtExecutor::FreeOutputTensor() {
for (auto tensor_map : *output_tensor_map_) {
for (auto tensor_map : *isolate_output_map_) {
auto src_tensor = tensor_map.first;
auto dst_tensor = tensor_map.second;
if (dst_tensor->allocator() != nullptr) {

View File

@ -29,7 +29,9 @@
namespace mindspore::lite {
class MindrtExecutor : public Executor {
public:
explicit MindrtExecutor(std::unordered_map<Tensor *, Tensor *> *output_map) : output_tensor_map_(output_map) {}
explicit MindrtExecutor(std::unordered_map<Tensor *, Tensor *> *output_map,
std::unordered_map<Tensor *, Tensor *> *input_map)
: isolate_output_map_(output_map), isolate_input_map_(input_map) {}
virtual ~MindrtExecutor() { MindrtTerminate(op_actors_); }
int Prepare(const std::vector<kernel::LiteKernel *> &kernels, const std::vector<Tensor *> &inputs,
@ -52,7 +54,8 @@ class MindrtExecutor : public Executor {
std::vector<std::shared_ptr<LiteOpActor>> op_actors_;
std::vector<OpDataPtr<Tensor>> input_data_;
std::vector<OpDataPtr<Tensor>> output_data_;
std::unordered_map<Tensor *, Tensor *> *output_tensor_map_;
std::unordered_map<Tensor *, Tensor *> *isolate_output_map_;
std::unordered_map<Tensor *, Tensor *> *isolate_input_map_;
};
} // namespace mindspore::lite

View File

@ -0,0 +1,102 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/optimize_allocator.h"
namespace mindspore {
OptimizeAllocator::OptimizeAllocator(size_t aligned_size) {
aligned_size_ = aligned_size;
return;
}
OptimizeAllocator::~OptimizeAllocator() {
if (data_ != nullptr) {
free(data_);
data_ = nullptr;
}
}
void *OptimizeAllocator::MallocOptData() {
if (data_ == nullptr) {
data_ = malloc(total_size_);
}
return data_;
}
size_t OptimizeAllocator::FindMinFree(size_t size) {
size_t min_size = total_size_;
size_t min_addr = total_size_;
for (auto const &itr : free_list_) {
if (itr.second >= size && min_size > itr.second) {
min_size = itr.second;
min_addr = itr.first;
}
}
return min_addr;
}
void OptimizeAllocator::FreeTensorData(lite::Tensor *tensor) {
size_t offset = offset_map_[tensor];
free_list_[offset] = used_list_[offset];
used_list_.erase(offset);
size_t length = free_list_[offset];
size_t post_offset = offset + length;
auto post_iter = free_list_.find(post_offset);
if (post_iter != free_list_.end()) {
size_t post_length = post_iter->second;
free_list_[offset] = length + post_length;
free_list_.erase(post_offset);
}
auto pre_iter = free_list_.lower_bound(offset);
if (pre_iter != free_list_.begin()) {
pre_iter--;
size_t pre_offset = pre_iter->first;
if ((pre_offset + free_list_[pre_offset]) == offset) {
free_list_[pre_offset] = free_list_[pre_offset] + free_list_[offset];
free_list_.erase(offset);
}
}
}
void OptimizeAllocator::MallocTensorData(lite::Tensor *tensor) {
size_t size = tensor->Size();
size_t offset = FindMinFree(size);
if (offset >= total_size_) {
if (free_list_.empty()) {
offset = total_size_;
} else {
offset = free_list_.rbegin()->first;
if (offset + free_list_[offset] < total_size_) {
offset = total_size_;
} else {
free_list_.erase(offset);
}
}
if (offset + size > total_size_) {
total_size_ = offset + size;
}
} else {
if (free_list_[offset] > size) {
free_list_[offset + size] = free_list_[offset] - size;
}
free_list_.erase(offset);
}
used_list_[offset] = size;
offset_map_[tensor] = offset;
}
} // namespace mindspore
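The planning strategy above boils down to: give every tensor an offset inside one logical arena, reuse the smallest free block that fits (best fit), grow the arena only when nothing fits, and merge freed blocks with their neighbours so larger tensors can land there later. Below is a minimal, self-contained C++ sketch of that idea, not part of the patch: it uses plain integer ids instead of lite::Tensor, the OffsetPlanner / MallocTensor / FreeTensor / OffsetOf names are invented for illustration, and it omits the predecessor merge and the alignment handling of the real OptimizeAllocator.

#include <cstdint>
#include <cstdio>
#include <map>
#include <unordered_map>

class OffsetPlanner {
 public:
  // Reserve `size` bytes for tensor `id`, reusing the smallest fitting free block.
  void MallocTensor(int id, size_t size) {
    size_t offset = FindMinFree(size);
    if (offset >= total_size_) {
      offset = total_size_;  // nothing fits: place at the tail
      total_size_ += size;   // and grow the arena
    } else {
      if (free_list_[offset] > size) {
        free_list_[offset + size] = free_list_[offset] - size;  // split the block
      }
      free_list_.erase(offset);
    }
    used_list_[offset] = size;
    offset_map_[id] = offset;
  }

  // Return tensor `id`'s block to the free list, merging with an adjacent successor.
  void FreeTensor(int id) {
    size_t offset = offset_map_[id];
    free_list_[offset] = used_list_[offset];
    used_list_.erase(offset);
    auto post = free_list_.find(offset + free_list_[offset]);
    if (post != free_list_.end()) {
      free_list_[offset] += post->second;
      free_list_.erase(post);
    }
  }

  size_t OffsetOf(int id) const { return offset_map_.at(id); }
  size_t TotalSize() const { return total_size_; }

 private:
  // Smallest free block that can hold `size`; total_size_ doubles as a "not found" sentinel.
  size_t FindMinFree(size_t size) const {
    size_t best_offset = total_size_;
    size_t best_size = SIZE_MAX;
    for (auto const &block : free_list_) {
      if (block.second >= size && block.second < best_size) {
        best_size = block.second;
        best_offset = block.first;
      }
    }
    return best_offset;
  }

  size_t total_size_ = 0;
  std::unordered_map<int, size_t> offset_map_; /* tensor id -> offset */
  std::map<size_t, size_t> free_list_;         /* offset -> size */
  std::map<size_t, size_t> used_list_;         /* offset -> size */
};

int main() {
  OffsetPlanner planner;
  planner.MallocTensor(0, 64);  // first intermediate tensor -> offset 0
  planner.MallocTensor(1, 64);  // second tensor while the first is still alive -> offset 64
  planner.FreeTensor(0);        // first tensor consumed for the last time
  planner.MallocTensor(2, 64);  // third tensor reuses the freed slot at offset 0
  std::printf("tensor 2 offset = %zu, arena = %zu bytes\n", planner.OffsetOf(2), planner.TotalSize());
  return 0;
}

With this call sequence the planner reports offset 0 for tensor 2 and a 128-byte arena, so three 64-byte tensors share 128 bytes; the session then performs one real allocation of that size (MallocOptData) and binds each tensor to base + offset, which is what OptAllocatorSetData does in lite_session.cc.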

View File

@ -0,0 +1,61 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_OPTIMIZE_ALLOCATOR_H_
#define MINDSPORE_LITE_SRC_RUNTIME_OPTIMIZE_ALLOCATOR_H_
#include <memory>
#include <map>
#include <unordered_map>
#include "include/api/allocator.h"
#include "include/errorcode.h"
#include "src/tensor.h"
namespace mindspore {
class OptimizeAllocator : public Allocator {
public:
explicit OptimizeAllocator(size_t aligned_size = 32);
~OptimizeAllocator() override;
public:
void *Malloc(size_t size) override { return nullptr; }
void Free(void *ptr) override { return; }
int RefCount(void *ptr) override { return lite::RET_OK; }
int SetRefCount(void *ptr, int ref_count) override { return lite::RET_OK; }
int IncRefCount(void *ptr, int ref_count) override { return lite::RET_OK; }
int DecRefCount(void *ptr, int ref_count) override { return lite::RET_OK; }
public:
void MallocTensorData(lite::Tensor *tensor);
void FreeTensorData(lite::Tensor *tensor);
void *MallocOptData();
const std::unordered_map<lite::Tensor *, size_t> &GetOffsetMap() const { return offset_map_; }
private:
size_t FindMinFree(size_t size);
private:
void *data_ = nullptr;
size_t aligned_size_ = 32;
size_t total_size_ = 0;
std::unordered_map<lite::Tensor *, size_t> offset_map_;
std::map<size_t, size_t> free_list_; /* offset, size */
std::map<size_t, size_t> used_list_; /* offset, size */
};
using OptAllocatorPtr = std::shared_ptr<OptimizeAllocator>;
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_OPTIMIZE_ALLOCATOR_H_

View File

@ -241,15 +241,17 @@ int Scheduler::InitKernels(std::vector<kernel::LiteKernel *> dst_kernels) {
}
int Scheduler::SchedulePreProcess() {
schema_version_ = reinterpret_cast<LiteModel *>(src_model_)->GetSchemaVersion();
this->graph_output_node_indexes_ = GetGraphOutputNodes(src_model_);
int infershape_ret = InferSubGraphShape(kMainSubGraphIndex);
if (infershape_ret != RET_OK && infershape_ret != RET_INFER_INVALID) {
*is_infershape_ = InferSubGraphShape(kMainSubGraphIndex);
if (*is_infershape_ != RET_OK && *is_infershape_ != RET_INFER_INVALID) {
MS_LOG(ERROR) << "op infer shape failed.";
return infershape_ret;
return *is_infershape_;
}
if (context_->enable_parallel_ && infershape_ret != RET_INFER_INVALID) {
if (context_->enable_parallel_ && *is_infershape_ != RET_INFER_INVALID) {
#ifndef AUTO_PARALLEL_CLIP
auto search_sub_graph =
SearchSubGraph(context_, src_model_, src_tensors_, &op_parameters_, &graph_output_node_indexes_);
@ -275,6 +277,21 @@ int Scheduler::CheckCpuValid(std::vector<kernel::LiteKernel *> *dst_kernels) {
return RET_OK;
}
int Scheduler::ConstructSubGraphs(std::vector<kernel::LiteKernel *> *dst_kernels) {
#ifndef CONTROLFLOW_TENSORLIST_CLIP
if (IsControlFlowParttern(*dst_kernels)) {
*is_control_flow_ = true;
return ConstructControlFlowMainGraph(dst_kernels);
}
#endif
*is_control_flow_ = false;
auto src_kernel = *dst_kernels;
dst_kernels->clear();
std::map<const kernel::LiteKernel *, bool> is_kernel_finish;
return ConstructNormalSubGraphs(src_kernel, dst_kernels, &is_kernel_finish);
}
int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
int check_input_ret = CheckInputParam(dst_kernels);
if (check_input_ret != RET_OK) {
@ -282,8 +299,6 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
return check_input_ret;
}
schema_version_ = reinterpret_cast<LiteModel *>(src_model_)->GetSchemaVersion();
int ret = SchedulePreProcess();
if (ret != RET_OK) {
return ret;
@ -307,7 +322,6 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
MS_LOG(ERROR) << "Repalce delegate kernels failed.";
return ret;
}
context_->thread_pool()->SetSpinCountMinValue();
#endif
ret = CheckCpuValid(dst_kernels);
@ -322,26 +336,11 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
RuntimePass(context_, dst_kernels, src_tensors_);
#endif
#ifndef CONTROLFLOW_TENSORLIST_CLIP
if (IsControlFlowParttern(*dst_kernels)) {
ret = ConstructControlFlowMainGraph(dst_kernels);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConstructControlFlowMainGraph failed.";
return ret;
}
} else {
#endif
auto src_kernel = *dst_kernels;
dst_kernels->clear();
std::map<const kernel::LiteKernel *, bool> is_kernel_finish;
ret = ConstructSubGraphs(src_kernel, dst_kernels, &is_kernel_finish);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConstructSubGraphs failed.";
return ret;
}
#ifndef CONTROLFLOW_TENSORLIST_CLIP
ret = ConstructSubGraphs(dst_kernels);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConstructSubGraphs failed.";
return ret;
}
#endif
ret = InitKernels(*dst_kernels);
if (ret != RET_OK) {
@ -457,6 +456,9 @@ int Scheduler::InitDelegateKernels(std::vector<kernel::LiteKernel *> *dst_kernel
return RET_OK;
}
/* set delegate spin count */
context_->thread_pool()->SetSpinCountMinValue();
/* external delegate */
if (delegate_device_type_ == -1) {
auto ret = ReplaceDelegateKernels(dst_kernels);
@ -1521,9 +1523,9 @@ kernel::LiteKernel *FindAllSubGraphKernels(const std::vector<kernel::LiteKernel
}
} // namespace
int Scheduler::ConstructSubGraphs(std::vector<kernel::LiteKernel *> src_kernel,
std::vector<kernel::LiteKernel *> *dst_kernel,
std::map<const kernel::LiteKernel *, bool> *is_kernel_finish) {
int Scheduler::ConstructNormalSubGraphs(std::vector<kernel::LiteKernel *> src_kernel,
std::vector<kernel::LiteKernel *> *dst_kernel,
std::map<const kernel::LiteKernel *, bool> *is_kernel_finish) {
if (src_kernel.empty()) {
return RET_OK;
}

View File

@ -41,9 +41,9 @@ class Scheduler {
public:
Scheduler(const InnerContext *ctx, const mindspore::Context *ms_ctx, Model *src_model,
std::vector<Tensor *> *src_tensors, const std::vector<Tensor *> &input_tensors,
const std::vector<Tensor *> &output_tensors, bool is_train_session,
std::map<std::string, TypeId> *executions, std::shared_ptr<Delegate> delegate = nullptr,
int delegate_device_type = -1)
const std::vector<Tensor *> &output_tensors, bool is_train_session, int *is_infershape,
bool *is_control_flow, std::map<std::string, TypeId> *executions,
std::shared_ptr<Delegate> delegate = nullptr, int delegate_device_type = -1)
: context_(ctx),
ms_context_(ms_ctx),
src_model_(src_model),
@ -51,6 +51,8 @@ class Scheduler {
inputs_(input_tensors),
outputs_(output_tensors),
is_train_session_(is_train_session),
is_control_flow_(is_control_flow),
is_infershape_(is_infershape),
delegate_(delegate),
delegate_device_type_(delegate_device_type),
execution_plan_(executions) {}
@ -102,8 +104,12 @@ class Scheduler {
// find in_kernels_ and out_kernels of kernel, sub_graph and nodes_ in sub_graph
static void FindAllInoutKernels(const std::vector<kernel::LiteKernel *> &kernels);
// vector<LiteKernel/SubGraphKernel> --> vector<SubGraphKernel>
int ConstructSubGraphs(std::vector<kernel::LiteKernel *> src_kernel, std::vector<kernel::LiteKernel *> *dst_kernel,
std::map<const kernel::LiteKernel *, bool> *sinked_kernel_map);
int ConstructNormalSubGraphs(std::vector<kernel::LiteKernel *> src_kernel,
std::vector<kernel::LiteKernel *> *dst_kernel,
std::map<const kernel::LiteKernel *, bool> *sinked_kernel_map);
int ConstructSubGraphs(std::vector<kernel::LiteKernel *> *dst_kernel);
// create subgraph_kernel from a vector of kernel
std::vector<kernel::LiteKernel *> ScheduleMainSubGraphToKernels();
kernel::LiteKernel *SchedulePartialToSubGraphKernel(const int &subgraph_index);
@ -147,6 +153,8 @@ class Scheduler {
std::vector<size_t> graph_output_node_indexes_;
std::map<int, OpParameter *> op_parameters_;
bool is_train_session_ = false;
bool *is_control_flow_ = nullptr;
int *is_infershape_ = nullptr;
std::unique_ptr<SchedulerCb> sched_cb_;
std::map<kernel::Kernel *, const schema::Primitive *> primitives_;
std::shared_ptr<Delegate> delegate_ = nullptr;

View File

@ -22,6 +22,7 @@ file(GLOB_RECURSE TEST_UT_SRC
${TEST_DIR}/ut/src/registry/registry_custom_op_test.cc
${TEST_DIR}/ut/src/runtime/runtime_pass_tests.cc
${TEST_DIR}/st/multiple_device_test.cc
${TEST_DIR}/st/optimize_allocator_test.cc
${TEST_DIR}/st/mindrt_parallel_runtime_test.cc
${TEST_DIR}/st/mix_data_type_test.cc
${TEST_DIR}/ut/nnacl/infer/*.cc

View File

@ -95,5 +95,8 @@ echo 'run custom delegate st test'
echo 'runtime pass'
./lite-test --gtest_filter="RuntimePass.*"
echo 'Optimize Allocator'
./lite-test --gtest_filter="OptAllocator.*"
echo 'Runtime config file test'
./lite-test --gtest_filter="MixDataTypeTest.Config1"

View File

@ -0,0 +1,148 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either address or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "common/common_test.h"
#include "schema/inner/model_generated.h"
#include "src/lite_session.h"
#include "src/sub_graph_kernel.h"
#include "ir/dtype/type_id.h"
#include "include/version.h"
#include "include/model.h"
namespace mindspore {
class OptAllocator : public mindspore::CommonTest {
public:
OptAllocator() = default;
};
void CreateModel1(mindspore::schema::MetaGraphT *meta_graph) {
meta_graph->name = "graph";
meta_graph->version = mindspore::lite::Version();
/* cos
* / \
* sin |
* \ /
* add
* |
* */
auto cos = std::make_unique<mindspore::schema::CNodeT>();
cos->inputIndex = {0};
cos->outputIndex = {1};
cos->primitive = std::make_unique<mindspore::schema::PrimitiveT>();
cos->primitive->value.type = mindspore::schema::PrimitiveType_Cos;
auto cos_primitive = new mindspore::schema::CosT;
cos->primitive->value.value = cos_primitive;
cos->name = "cos";
auto sin = std::make_unique<mindspore::schema::CNodeT>();
sin->inputIndex = {1};
sin->outputIndex = {2};
sin->primitive = std::make_unique<mindspore::schema::PrimitiveT>();
sin->primitive->value.type = mindspore::schema::PrimitiveType_Sin;
auto sin_primitive = new mindspore::schema::SinT;
sin->primitive->value.value = sin_primitive;
sin->name = "sin";
auto add = std::make_unique<mindspore::schema::CNodeT>();
add->inputIndex = {1, 2};
add->outputIndex = {3};
add->primitive = std::make_unique<mindspore::schema::PrimitiveT>();
add->primitive->value.type = mindspore::schema::PrimitiveType_AddFusion;
auto add_primitive = new mindspore::schema::AddFusionT;
add->primitive->value.value = add_primitive;
add->name = "add";
/* tensors */
auto tensor0 = std::make_unique<mindspore::schema::TensorT>();
tensor0->nodeType = mindspore::lite::NodeType_ValueNode;
tensor0->format = mindspore::schema::Format_NHWC;
tensor0->dataType = mindspore::TypeId::kNumberTypeFloat32;
tensor0->dims = {4};
tensor0->offset = -1;
tensor0->name = "input";
auto tensor1 = std::make_unique<mindspore::schema::TensorT>();
tensor1->nodeType = mindspore::lite::NodeType_ValueNode;
tensor1->format = mindspore::schema::Format_NHWC;
tensor1->dataType = mindspore::TypeId::kNumberTypeFloat32;
tensor1->dims = {4};
tensor1->offset = -1;
tensor1->name = "cos";
auto tensor2 = std::make_unique<mindspore::schema::TensorT>();
tensor2->nodeType = mindspore::lite::NodeType_ValueNode;
tensor2->format = mindspore::schema::Format_NHWC;
tensor2->dataType = mindspore::TypeId::kNumberTypeFloat32;
tensor2->dims = {4};
tensor2->offset = -1;
tensor2->name = "sin";
auto tensor3 = std::make_unique<mindspore::schema::TensorT>();
tensor3->nodeType = mindspore::lite::NodeType_ValueNode;
tensor3->format = mindspore::schema::Format_NHWC;
tensor3->dataType = mindspore::TypeId::kNumberTypeFloat32;
tensor3->dims = {4};
tensor3->offset = -1;
tensor3->name = "add";
meta_graph->nodes.emplace_back(std::move(cos));
meta_graph->nodes.emplace_back(std::move(sin));
meta_graph->nodes.emplace_back(std::move(add));
meta_graph->allTensors.emplace_back(std::move(tensor0));
meta_graph->allTensors.emplace_back(std::move(tensor1));
meta_graph->allTensors.emplace_back(std::move(tensor2));
meta_graph->allTensors.emplace_back(std::move(tensor3));
meta_graph->inputIndex = {0};
meta_graph->outputIndex = {3};
}
TEST_F(OptAllocator, OptAllocator1) {
auto meta_graph = std::make_shared<mindspore::schema::MetaGraphT>();
CreateModel1(meta_graph.get());
flatbuffers::FlatBufferBuilder builder(1024);
auto offset = mindspore::schema::MetaGraph::Pack(builder, meta_graph.get());
builder.Finish(offset);
mindspore::schema::FinishMetaGraphBuffer(builder, offset);
size_t size = builder.GetSize();
const char *content = reinterpret_cast<char *>(builder.GetBufferPointer());
auto context = std::make_shared<mindspore::lite::Context>();
auto *lite_session = mindspore::session::LiteSession::CreateSession(content, size, context.get());
ASSERT_NE(lite_session, nullptr);
auto input = lite_session->GetInputs().front();
std::vector<float> in_data = {1.0, 2.0, 3.0, 4.0};
memcpy(input->MutableData(), in_data.data(), input->Size());
auto ret = lite_session->RunGraph();
ASSERT_EQ(mindspore::lite::RET_OK, ret);
/* check output */
void *out_data = lite_session->GetOutputs().begin()->second->MutableData();
float *fp32_data = reinterpret_cast<float *>(out_data);
ASSERT_LE(fabs(fp32_data[0] - (1.054698)), 0.01);
ASSERT_LE(fabs(fp32_data[1] - (-0.820386)), 0.01);
ASSERT_LE(fabs(fp32_data[2] - (-1.826014)), 0.01);
ASSERT_LE(fabs(fp32_data[3] - (-1.261727)), 0.01);
delete lite_session;
}
} // namespace mindspore

View File

@ -159,6 +159,7 @@ set(LITE_SRC
${SRC_DIR}/common/prim_util.cc
${SRC_DIR}/common/tensor_util.cc
${SRC_DIR}/runtime/inner_allocator.cc
${SRC_DIR}/runtime/optimize_allocator.cc
${SRC_DIR}/runtime/infer_manager.cc
${SRC_DIR}/runtime/runtime_pass.cc
${SRC_DIR}/inner_context.cc