!34113 [MS][LITE] Add Hi35xx keep-original-output support

Merge pull request !34113 from gongdaguo1/add_hi3516_origin_output
i-robot 2022-05-10 06:13:47 +00:00 committed by Gitee
commit 16f9b62536
17 changed files with 781 additions and 196 deletions

View File

@@ -161,6 +161,14 @@ class MS_API Model {
Status Predict(const std::vector<MSTensor> &inputs, std::vector<MSTensor> *outputs,
const MSKernelCallBack &before = nullptr, const MSKernelCallBack &after = nullptr);
/// \brief Inference model with pre-filled inputs; results are written to the model's output tensors.
///
/// \param[in] before CallBack before predict.
/// \param[in] after CallBack after predict.
///
/// \return Status.
Status Predict(const MSKernelCallBack &before = nullptr, const MSKernelCallBack &after = nullptr);
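A minimal usage sketch of the new overload (hypothetical caller code; assumes the model is already built and its input buffers were written in place via GetInputs()):

  auto inputs = model.GetInputs();
  // ... fill inputs[i].MutableData() directly ...
  auto status = model.Predict();  // before/after callbacks default to nullptr
  if (status != mindspore::kSuccess) {
    // handle the error
  }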
/// \brief Train model by step.
///
/// \param[in] before CallBack before predict.

View File

@@ -0,0 +1,47 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef NNIE_SRC_CUSTOM_ALLOCATOR_H_
#define NNIE_SRC_CUSTOM_ALLOCATOR_H_
#include <memory>
#include <string>
#include <vector>
#include <mutex>
#include <map>
#include <unordered_map>
#include <unordered_set>
#include <atomic>
#include "include/api/allocator.h"
#include "include/hi_type.h"
namespace mindspore {
namespace nnie {
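// A do-nothing allocator used to tag tensors whose buffers are MMZ memory
// owned by NNIEManager: Malloc/Free are no-ops and the ref-count hooks are
// stubs, so the runtime neither allocates nor frees these buffers itself.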
class CustomAllocator : public Allocator {
public:
CustomAllocator() {}
~CustomAllocator() override {}
void *Malloc(size_t size) override { return nullptr; }
void Free(void *ptr) override {}
int RefCount(void *ptr) override { return 1; }
int SetRefCount(void *ptr, int ref_count) override { return ref_count; }
int DecRefCount(void *ptr, int ref_count) override { return 1; }
int IncRefCount(void *ptr, int ref_count) override { return 1; }
};
} // namespace nnie
} // namespace mindspore
#endif // NNIE_SRC_CUSTOM_ALLOCATOR_H_

View File

@@ -20,9 +20,6 @@
#include "schema/model_generated.h"
#include "include/registry/register_kernel.h"
#include "include/errorcode.h"
#include "src/nnie_manager.h"
#include "src/nnie_print.h"
#include "src/nnie_cfg_parser.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
@@ -30,29 +27,29 @@ using mindspore::schema::PrimitiveType_Custom;
namespace mindspore {
namespace nnie {
bool CustomCPUKernel::load_model_ = false;
static std::shared_ptr<Allocator> kCustomAllocator = std::make_shared<nnie::CustomAllocator>();
int CustomCPUKernel::run_seg_ = 0;
bool CustomCPUKernel::roi_used_ = false;
int CustomCPUKernel::Prepare() {
if (!load_model_) {
Flags flags;
if (flags.Init(*this) != RET_OK) {
if (manager_ == nullptr) {
LOGE("manager_ is nullptr.");
return RET_ERROR;
}
if (!manager_->GetLoadModel()) {
if (manager_->GetFlags()->Init(*this) != RET_OK) {
LOGE("Nnie config init fail.");
return RET_ERROR;
}
if (nnie::NNIEManager::GetInstance()->CfgInit(flags.max_roi_num_, flags.time_step_, flags.core_ids_) != RET_OK) {
if (manager_->CfgInit(*manager_->GetFlags(), manager_->GetMaxSegId()) != RET_OK) {
LOGE("Nnie init cfg fail.");
return RET_ERROR;
}
if (nnie::NNIEManager::GetInstance()->Init(reinterpret_cast<char *>(inputs_[inputs_.size() - 1].MutableData()),
static_cast<int>(inputs_[inputs_.size() - 1].ElementNum()), inputs_)) {
if (manager_->Init(reinterpret_cast<char *>(inputs_[inputs_.size() - 1].MutableData()),
static_cast<int>(inputs_[inputs_.size() - 1].ElementNum()), inputs_)) {
LOGI("Load WK Model Fail.");
return RET_OK;
}
load_model_ = true;
manager_->SetLoadModel(true);
}
outputs_shapes_.resize(outputs_.size());
for (size_t i = 0; i < outputs_.size(); i++) {
@@ -62,38 +59,51 @@ int CustomCPUKernel::Prepare() {
}
int CustomCPUKernel::ReSize() {
if (load_model_) {
nnie::NNIEManager::GetInstance()->Release();
load_model_ = false;
if (manager_->GetLoadModel() && seg_id() == 0) {
manager_->Release(true);
manager_->SetLoadModel(false);
}
return Prepare();
}
int CustomCPUKernel::Execute() {
if (!load_model_) {
if (!manager_->GetLoadModel()) {
LOGE("WK Model is not load.");
return RET_ERROR;
}
run_seg_ = seg_id_;
Flags *flags = manager_->GetFlags();
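// Keep-original-output mode: bind the user tensors' own memory to the
// first segment's inputs and the last segment's outputs, so NNIE reads and
// writes them in place instead of copying through aligned staging buffers.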
if (flags->keep_origin_output_) {
if (seg_id_ == 0) {
if (manager_->LoadInputs(&inputs_, kCustomAllocator) != RET_OK) {
LOGE("Unable to find the physical address corresponding to the input tensor.");
return RET_ERROR;
}
}
if (seg_id_ == manager_->GetMaxSegId()) {
if (manager_->LoadOutputs(&outputs_, kCustomAllocator) != RET_OK) {
LOGE("Unable to find the physical address corresponding to the output tensor.");
return RET_ERROR;
}
}
}
if (nnie::NNIEManager::GetInstance()->FillData(&inputs_, run_seg_) != RET_OK) {
if (manager_->FillData(&inputs_, seg_id_) != RET_OK) {
LOGE("Fail Fill Data.");
return RET_ERROR;
}
if (nnie::NNIEManager::GetInstance()->Run(&outputs_, run_seg_, outputs_shapes_) != RET_OK) {
if (manager_->Run(&outputs_, seg_id_, outputs_shapes_) != RET_OK) {
LOGE("Fail WK Run.");
return RET_ERROR;
}
run_seg_++;
return RET_OK;
}
CustomCPUKernel::~CustomCPUKernel() {
if (load_model_) {
nnie::NNIEManager::GetInstance()->Release();
load_model_ = false;
if (manager_->GetLoadModel()) {
manager_->Release(false);
manager_->SetLoadModel(false);
}
}
@@ -159,7 +169,13 @@ std::shared_ptr<mindspore::kernel::Kernel> CustomCreateKernel(const std::vector<
forward_bbox = true;
}
}
auto kernel = std::make_shared<CustomCPUKernel>(ndims, forward_bbox, inputs, outputs, primitive, ctx);
auto model_buf = static_cast<const void *>(inputs[inputs.size() - 1].Data().get());
auto manager = nnie::NNIEManager::GetInstance(model_buf);
if (manager == nullptr) {
LOGE("malloc NNIEManager failed.");
return nullptr;
}
auto kernel = std::make_shared<CustomCPUKernel>(manager, ndims, forward_bbox, inputs, outputs, primitive, ctx);
if (kernel == nullptr) {
LOGE("new custom kernel is nullptr");
return nullptr;

View File

@@ -19,10 +19,16 @@
#include <vector>
#include <string>
#include <memory>
#include "include/schema/model_generated.h"
#include "include/context.h"
#include "include/api/kernel.h"
#include "src/custom_infer.h"
#include "include/hi_type.h"
#include "src/nnie_cfg_parser.h"
#include "src/nnie_manager.h"
#include "src/nnie_print.h"
#include "src/custom_allocator.h"
using mindspore::kernel::Kernel;
using mindspore::tensor::MSTensor;
@@ -30,12 +36,14 @@ namespace mindspore {
namespace nnie {
class CustomCPUKernel : public Kernel {
public:
CustomCPUKernel(int seg_id, bool forward_bbox, const std::vector<MSTensor> &inputs,
CustomCPUKernel(nnie::NNIEManager *manager, int seg_id, bool forward_bbox, const std::vector<MSTensor> &inputs,
const std::vector<MSTensor> &outputs, const mindspore::schema::Primitive *primitive,
const mindspore::Context *ctx)
: Kernel(inputs, outputs, primitive, ctx), seg_id_(seg_id), forward_bbox_(forward_bbox) {
if (forward_bbox) {
roi_used_ = true;
: Kernel(inputs, outputs, primitive, ctx), manager_(manager), seg_id_(seg_id), forward_bbox_(forward_bbox) {
if (manager_ == nullptr) {
LOGE("manager_ is nullptr.");
} else {
manager_->SetMaxSegId(seg_id);
}
}
@@ -54,9 +62,7 @@ class CustomCPUKernel : public Kernel {
void set_forward_bbox(bool flag) { forward_bbox_ = flag; }
private:
static bool load_model_;
static int run_seg_;
static bool roi_used_;
nnie::NNIEManager *manager_ = nullptr;
int seg_id_ = 0;
bool forward_bbox_ = false;
std::vector<std::vector<int64_t>> outputs_shapes_;

View File

@@ -34,6 +34,7 @@ namespace {
constexpr auto kTimeStep = "TimeStep";
constexpr auto kMazRoiNum = "MaxROINum";
constexpr auto kCoreIds = "CoreIds";
constexpr auto kKeepOrigin = "KeepOriginalOutput";
constexpr auto DELIM = ",";
constexpr int MAX_CORE_ID = 7;
} // namespace
@@ -46,25 +47,49 @@ void PrintInvalidChar(const std::string &key, const std::string &dat) {
LOGE(message.c_str());
}
int Flags::Init(const kernel::Kernel &kernel) {
auto nnie_arg = kernel.GetConfig("nnie");
if (nnie_arg.find(kTimeStep) != nnie_arg.end()) {
if (IsValidUnsignedNum(nnie_arg.at(kTimeStep)) == true) {
this->time_step_ = stoi(nnie_arg.at(kTimeStep));
int Flags::ParserInt(const std::map<std::string, std::string> &nnie_arg, const std::string key, int *val) {
auto iter = nnie_arg.find(key);
if (iter != nnie_arg.end()) {
auto str = iter->second;
if (IsValidUnsignedNum(str) == true) {
*val = stoi(str);
} else {
PrintInvalidChar(kTimeStep, nnie_arg.at(kTimeStep));
PrintInvalidChar(key, str);
return RET_ERROR;
}
}
return RET_OK;
}
if (nnie_arg.find(kMazRoiNum) != nnie_arg.end()) {
if (IsValidUnsignedNum(nnie_arg.at(kMazRoiNum)) == true) {
this->max_roi_num_ = stoi(nnie_arg.at(kMazRoiNum));
int Flags::ParserBool(const std::map<std::string, std::string> &nnie_arg, const std::string key, bool *val) {
auto iter = nnie_arg.find(key);
if (iter != nnie_arg.end()) {
auto str = iter->second;
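// Note: substring match, so any value containing "on" (checked first) or
// "off" is accepted, not just the exact strings.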
if (str.find("on") != std::string::npos) {
*val = true;
} else if (str.find("off") != std::string::npos) {
*val = false;
} else {
PrintInvalidChar(kMazRoiNum, nnie_arg.at(kMazRoiNum));
PrintInvalidChar(key, str);
return RET_ERROR;
}
}
return RET_OK;
}
int Flags::Init(const kernel::Kernel &kernel) {
auto nnie_arg = kernel.GetConfig("nnie");
if (ParserInt(nnie_arg, kTimeStep, &this->time_step_) != RET_OK) {
return RET_ERROR;
}
if (ParserInt(nnie_arg, kMazRoiNum, &this->max_roi_num_) != RET_OK) {
return RET_ERROR;
}
if (ParserBool(nnie_arg, kKeepOrigin, &this->keep_origin_output_) != RET_OK) {
return RET_ERROR;
}
if (nnie_arg.find(kCoreIds) != nnie_arg.end()) {
auto ids = nnie_arg.at(kCoreIds);
@@ -85,6 +110,7 @@ int Flags::Init(const kernel::Kernel &kernel) {
return RET_ERROR;
}
}
return RET_OK;
}
} // namespace nnie
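For reference, a sketch of the nnie section in a benchmark config file with the new switch enabled (key names come from the constants above; the other values shown are the documented defaults, and the file itself is hypothetical):

  [nnie]
  TimeStep=1
  MaxROINum=300
  CoreIds=0
  KeepOriginalOutput=on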

View File

@@ -16,10 +16,17 @@
#ifndef MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_NNIE_CFG_PARSER_H_
#define MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_NNIE_CFG_PARSER_H_
#include <vector>
#include <map>
#include <string>
#include "include/api/kernel.h"
#include "include/hi_type.h"
namespace mindspore {
namespace nnie {
typedef struct {
HI_U64 phy_;
HI_U32 size_;
} MEM_ITEM;
/**
* Flags is a config container.
* Member objects:
@@ -39,6 +46,11 @@ class Flags {
int time_step_{1};
int max_roi_num_{300};
std::vector<int> core_ids_{0};
bool keep_origin_output_{false};
private:
int ParserInt(const std::map<std::string, std::string> &nnie_arg, const std::string key, int *val);
int ParserBool(const std::map<std::string, std::string> &nnie_arg, const std::string key, bool *val);
};
} // namespace nnie
} // namespace mindspore

View File

@@ -26,6 +26,7 @@ using mindspore::lite::RET_OK;
namespace mindspore {
namespace nnie {
constexpr int kSleepUs = 100;
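// Row-count factor for the YUV blob sizes computed in GetBlobSize below:
// YVU420SP blobs hold channel * height / kCompressionWidth rows, and
// YVU422SP blobs hold height * kCompressionWidth rows.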
constexpr int kCompressionWidth = 2;
static void NnieParamRelease(NnieParam *nnie_param) {
if (nnie_param == nullptr) {
return;
@@ -141,7 +142,8 @@ static void FillForwardInfo(NnieCfg *nnie_cfg, NnieParam *nnie_param) {
}
static void GetBlobMemSize(SVP_NNIE_NODE_S nnie_node[], HI_U32 node_num, HI_U32 total_step, SVP_BLOB_S blob[],
HI_U32 align32, HI_U32 *total_size, HI_U32 blob_size[], bool *mem_alloc = nullptr) {
HI_U32 align32, HI_U32 *total_size, HI_U32 blob_size[], bool malloc_allow,
bool *mem_alloc = nullptr) {
HI_U32 i = 0;
HI_U32 size;
HI_U32 stride;
@@ -173,7 +175,9 @@ static void GetBlobMemSize(SVP_NNIE_NODE_S nnie_node[], HI_U32 node_num, HI_U32
blob_size[i] = 0;
}
}
*total_size += blob_size[i];
if (malloc_allow) {
*total_size += blob_size[i];
}
blob[i].u32Stride = stride;
}
}
@@ -208,18 +212,71 @@ static int GetTaskAndBlobBufSize(NnieCfg *nnie_cfg, NnieParam *nnie_param, HI_U3
j);
}
}
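// When pass_align16_io_ is set, seg-0 input blobs and last-seg output
// blobs will reuse user tensor memory, so their sizes are excluded from
// the internal buffer allocation.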
bool malloc_allow = (!nnie_cfg->pass_align16_io_) || i != 0;
GetBlobMemSize(&(nnie_param->model_->astSeg[i].astSrcNode[0]), nnie_param->model_->astSeg[i].u16SrcNum, total_step,
&(nnie_param->seg_data_[i].src_[0]), NNIE_ALIGN_16, total_size, &(blob_size[i].src_size_[0]),
&(nnie_param->mem_cfg_.seg_[i].src_node_[0]));
malloc_allow, &(nnie_param->mem_cfg_.seg_[i].src_node_[0]));
malloc_allow = (!nnie_cfg->pass_align16_io_) || (i + 1) != nnie_param->model_->u32NetSegNum;
GetBlobMemSize(&(nnie_param->model_->astSeg[i].astDstNode[0]), nnie_param->model_->astSeg[i].u16DstNum, total_step,
&(nnie_param->seg_data_[i].dst_[0]), NNIE_ALIGN_16, total_size, &(blob_size[i].dst_size_[0]));
&(nnie_param->seg_data_[i].dst_[0]), NNIE_ALIGN_16, total_size, &(blob_size[i].dst_size_[0]),
malloc_allow);
}
return RET_OK;
}
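// Assigns physical/virtual addresses to every blob. With pass_align16_io
// (the KeepOriginalOutput path) enabled, the first segment's input blobs
// and the last segment's output blobs are left unbound (addresses zeroed);
// NNIEManager::LoadInputs/LoadOutputs attach user tensor memory to them later.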
static int NnieSetBlobAddr(HI_U64 *phy_addr, HI_U8 **vir_addr, NnieParam *nnie_param, NnieBlobSize *blob_size,
bool pass_align16_io) {
HI_U32 i, j;
for (i = 0; i < nnie_param->model_->u32NetSegNum; i++) {
if ((!pass_align16_io) || i != 0) {
for (j = 0; j < nnie_param->model_->astSeg[i].u16SrcNum; j++) {
if (j != 0) {
*phy_addr += blob_size[i].src_size_[j - 1];
*vir_addr += blob_size[i].src_size_[j - 1];
}
if (nnie_param->mem_cfg_.seg_[i].src_node_[j]) {
if (!ConnectNnieInnerNode(nnie_param->model_->astSeg[i].astSrcNode[j].szName, nnie_param,
&(nnie_param->seg_data_[i].src_[j]))) {
LOGE("ConnectNnieInnerNode failed! ");
return RET_ERROR;
}
} else {
nnie_param->seg_data_[i].src_[j].u64PhyAddr = *phy_addr;
nnie_param->seg_data_[i].src_[j].u64VirAddr = (HI_U64)(HI_UL)*vir_addr;
}
}
*phy_addr += blob_size[i].src_size_[j - 1];
*vir_addr += blob_size[i].src_size_[j - 1];
} else {
for (j = 0; j < nnie_param->model_->astSeg[i].u16SrcNum; j++) {
nnie_param->seg_data_[i].src_[j].u64PhyAddr = 0;
nnie_param->seg_data_[i].src_[j].u64VirAddr = 0;
}
}
if ((!pass_align16_io) || (i + 1) != nnie_param->model_->u32NetSegNum) {
for (j = 0; j < nnie_param->model_->astSeg[i].u16DstNum; j++) {
if (j != 0) {
*phy_addr += blob_size[i].dst_size_[j - 1];
*vir_addr += blob_size[i].dst_size_[j - 1];
}
nnie_param->seg_data_[i].dst_[j].u64PhyAddr = *phy_addr;
nnie_param->seg_data_[i].dst_[j].u64VirAddr = (HI_U64)(HI_UL)*vir_addr;
}
*phy_addr += blob_size[i].dst_size_[j - 1];
*vir_addr += blob_size[i].dst_size_[j - 1];
} else {
for (j = 0; j < nnie_param->model_->astSeg[i].u16DstNum; j++) {  // dst nodes, not src
nnie_param->seg_data_[i].dst_[j].u64PhyAddr = 0;
nnie_param->seg_data_[i].dst_[j].u64VirAddr = 0;
}
}
}
return RET_OK;
}
static int NnieParamInit(NnieCfg *nnie_cfg, NnieParam *nnie_param) {
HI_U32 i, j;
HI_U32 i;
HI_U32 total_size = 0, total_task_buf_size = 0, tmp_buf_size_ = 0;
HI_S32 ret = HI_SUCCESS;
HI_U32 off_set = 0;
@@ -288,36 +345,9 @@ static int NnieParamInit(NnieCfg *nnie_cfg, NnieParam *nnie_param) {
phy_addr = phy_addr + total_task_buf_size + tmp_buf_size_;
vir_addr = vir_addr + total_task_buf_size + tmp_buf_size_;
for (i = 0; i < nnie_param->model_->u32NetSegNum; i++) {
for (j = 0; j < nnie_param->model_->astSeg[i].u16SrcNum; j++) {
if (j != 0) {
phy_addr += blob_size[i].src_size_[j - 1];
vir_addr += blob_size[i].src_size_[j - 1];
}
if (nnie_param->mem_cfg_.seg_[i].src_node_[j]) {
if (!ConnectNnieInnerNode(nnie_param->model_->astSeg[i].astSrcNode[j].szName, nnie_param,
&(nnie_param->seg_data_[i].src_[j]))) {
LOGE("ConnectNnieInnerNode failed! ");
return RET_ERROR;
}
} else {
nnie_param->seg_data_[i].src_[j].u64PhyAddr = phy_addr;
nnie_param->seg_data_[i].src_[j].u64VirAddr = (HI_U64)(HI_UL)vir_addr;
}
}
phy_addr += blob_size[i].src_size_[j - 1];
vir_addr += blob_size[i].src_size_[j - 1];
for (j = 0; j < nnie_param->model_->astSeg[i].u16DstNum; j++) {
if (j != 0) {
phy_addr += blob_size[i].dst_size_[j - 1];
vir_addr += blob_size[i].dst_size_[j - 1];
}
nnie_param->seg_data_[i].dst_[j].u64PhyAddr = phy_addr;
nnie_param->seg_data_[i].dst_[j].u64VirAddr = (HI_U64)(HI_UL)vir_addr;
}
phy_addr += blob_size[i].dst_size_[j - 1];
vir_addr += blob_size[i].dst_size_[j - 1];
if (NnieSetBlobAddr(&phy_addr, &vir_addr, nnie_param, blob_size, nnie_cfg->pass_align16_io_) != RET_OK) {
LOGE("SetBlobAddr failed!");
return RET_ERROR;
}
if (has_roi) {
nnie_param->rpn_bbox_.u64PhyAddr = phy_addr;
@@ -536,70 +566,108 @@ int FillByFloat(HI_U32 input_size, HI_U32 num, HI_U32 width, HI_U32 stride, HI_F
return RET_OK;
}
static int NnieFillSrcData(NnieCfg *nnie_cfg, NnieParam *nnie_param, NnieDataIndex *input_data_idx, int64_t *shape,
int size) {
HI_U32 i, j, n, ret;
HI_U32 height, width, channel, stride, dim;
HI_U8 *input_addr_u8 = nullptr;
HI_S32 *input_addr_s32 = nullptr;
HI_U32 *step_addr_u32 = nullptr;
HI_FLOAT *float_src_data = nullptr;
HI_U8 *u8_src_data = nullptr;
static int NnieFillSrcDataSeq(NnieCfg *nnie_cfg, SVP_SRC_BLOB_S *blob, HI_U32 input_size) {
HI_U32 *step_addr_u32 = NNIE_CONVERT_64BIT_ADDR(HI_U32, blob->unShape.stSeq.u64VirAddrStep);
HI_U32 dim = blob->unShape.stSeq.u32Dim;
HI_U32 stride = blob->u32Stride;
HI_U32 i, j, n;
HI_U32 total_step_num = 0;
HI_U32 input_size = 1;
SVP_SRC_BLOB_S *blob = &nnie_param->seg_data_[input_data_idx->seg_idx_].src_[input_data_idx->node_idx_];
for (n = 0; n < (HI_U32)size; n++) {
input_size *= shape[n];
}
input_addr_u8 = NNIE_CONVERT_64BIT_ADDR(HI_U8, blob->u64VirAddr);
input_addr_s32 = NNIE_CONVERT_64BIT_ADDR(HI_S32, blob->u64VirAddr);
float_src_data = reinterpret_cast<float *>(nnie_cfg->data_ptr_);
u8_src_data = reinterpret_cast<unsigned char *>(nnie_cfg->data_ptr_);
if (SVP_BLOB_TYPE_SEQ_S32 == blob->enType) {
step_addr_u32 = NNIE_CONVERT_64BIT_ADDR(HI_U32, blob->unShape.stSeq.u64VirAddrStep);
dim = blob->unShape.stSeq.u32Dim;
stride = blob->u32Stride;
HI_U8 *input_addr_u8 = NNIE_CONVERT_64BIT_ADDR(HI_U8, blob->u64VirAddr);
HI_S32 *input_addr_s32 = NNIE_CONVERT_64BIT_ADDR(HI_S32, blob->u64VirAddr);
HI_FLOAT *float_src_data = reinterpret_cast<float *>(nnie_cfg->data_ptr_);
for (n = 0; n < blob->u32Num; n++) {
for (n = 0; n < blob->u32Num; n++) {
total_step_num += *(step_addr_u32 + n);
}
if (input_size != total_step_num * dim) {
LOGE("input size error:%d <-> %d.", input_size, total_step_num * dim);
return RET_ERROR;
}
for (n = 0; n < blob->u32Num; n++) {
for (i = 0; i < *(step_addr_u32 + n); i++) {
for (j = 0; j < dim; j++) {
input_addr_s32[j] = (float_src_data[j] * NNIE_QUANT_BASE);
}
input_addr_u8 += stride;
input_addr_s32 = reinterpret_cast<HI_S32 *>(input_addr_u8);
float_src_data += dim;
}
}
NnieMemFlushCache(blob->u64PhyAddr, NNIE_CONVERT_64BIT_ADDR(HI_VOID, blob->u64VirAddr), total_step_num * stride);
return RET_OK;
}
HI_U32 GetBlobSize(const SVP_SRC_BLOB_S &blob) {
if (SVP_BLOB_TYPE_SEQ_S32 == blob.enType) {
HI_U32 stride = blob.u32Stride;
HI_U32 total_step_num = 0;
HI_U32 *step_addr_u32 = NNIE_CONVERT_64BIT_ADDR(HI_U32, blob.unShape.stSeq.u64VirAddrStep);
size_t n;
for (n = 0; n < blob.u32Num; n++) {
total_step_num += *(step_addr_u32 + n);
}
return total_step_num * stride;
}
if (input_size != total_step_num * dim) {
LOGE("input size error:%d <-> %d.", input_size, total_step_num * dim);
return RET_ERROR;
}
for (n = 0; n < blob->u32Num; n++) {
for (i = 0; i < *(step_addr_u32 + n); i++) {
for (j = 0; j < dim; j++) {
input_addr_s32[j] = (float_src_data[j] * NNIE_QUANT_BASE);
}
input_addr_u8 += stride;
input_addr_s32 = reinterpret_cast<HI_S32 *>(input_addr_u8);
float_src_data += dim;
}
}
NnieMemFlushCache(blob->u64PhyAddr, NNIE_CONVERT_64BIT_ADDR(HI_VOID, blob->u64VirAddr), total_step_num * stride);
HI_U32 stride = blob.u32Stride;
HI_U32 height = blob.unShape.stWhc.u32Height;
HI_U32 channel = blob.unShape.stWhc.u32Chn;
if (SVP_BLOB_TYPE_YVU420SP == blob.enType) {
return blob.u32Num * static_cast<HI_U32>(channel * height / kCompressionWidth) * stride;
} else if (SVP_BLOB_TYPE_YVU422SP == blob.enType) {
return blob.u32Num * height * kCompressionWidth * stride;
} else {
height = blob->unShape.stWhc.u32Height;
width = blob->unShape.stWhc.u32Width;
channel = blob->unShape.stWhc.u32Chn;
stride = blob->u32Stride;
if (SVP_BLOB_TYPE_YVU420SP == blob->enType) {
ret = FillByUnsignedChar(input_size, blob->u32Num * static_cast<HI_U32>(channel * height / 2), width, stride,
u8_src_data, input_addr_u8);
} else if (SVP_BLOB_TYPE_YVU422SP == blob->enType) {
ret = FillByUnsignedChar(input_size, blob->u32Num * height * 2, width, stride, u8_src_data, input_addr_u8);
} else {
if (SVP_BLOB_TYPE_U8 == blob->enType) {
ret =
FillByUnsignedChar(input_size, blob->u32Num * channel * height, width, stride, u8_src_data, input_addr_u8);
return blob.u32Num * channel * height * stride;
}
}
static int NnieFillSrcData(NnieCfg *nnie_cfg, NnieParam *nnie_param, NnieDataIndex *input_data_idx, int64_t *shape,
int size) {
HI_U32 i, ret;
HI_U32 input_size = 1;
SVP_SRC_BLOB_S *blob = &nnie_param->seg_data_[input_data_idx->seg_idx_].src_[input_data_idx->node_idx_];
for (i = 0; i < (HI_U32)size; i++) {
input_size *= shape[i];
}
if (SVP_BLOB_TYPE_SEQ_S32 == blob->enType) {
return NnieFillSrcDataSeq(nnie_cfg, blob, input_size);
} else {
HI_U8 *input_addr_u8 = NNIE_CONVERT_64BIT_ADDR(HI_U8, blob->u64VirAddr);
HI_S32 *input_addr_s32 = NNIE_CONVERT_64BIT_ADDR(HI_S32, blob->u64VirAddr);
HI_FLOAT *float_src_data = reinterpret_cast<float *>(nnie_cfg->data_ptr_);
HI_U8 *u8_src_data = reinterpret_cast<unsigned char *>(nnie_cfg->data_ptr_);
HI_U32 height = blob->unShape.stWhc.u32Height;
HI_U32 width = blob->unShape.stWhc.u32Width;
HI_U32 channel = blob->unShape.stWhc.u32Chn;
HI_U32 stride = blob->u32Stride;
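// Fast path for keep-original-output: the tensor already points at the
// blob's memory, so at most an in-place float -> S32 quantization is
// needed; otherwise fall through to the copying fill paths below.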
if (input_addr_u8 == u8_src_data) {
if (blob->enType == SVP_BLOB_TYPE_S32) {
for (i = 0; i < input_size; i++) {
input_addr_s32[i] = float_src_data[i] * NNIE_QUANT_BASE;
}
} else {
ret = FillByFloat(input_size, blob->u32Num * channel * height, width, stride, float_src_data, input_addr_s32,
input_addr_u8);
LOGI("\ninput no memcpy");
}
} else {
if (SVP_BLOB_TYPE_YVU420SP == blob->enType) {
ret = FillByUnsignedChar(input_size, blob->u32Num * static_cast<HI_U32>(channel * height / 2), width, stride,
u8_src_data, input_addr_u8);
} else if (SVP_BLOB_TYPE_YVU422SP == blob->enType) {
ret = FillByUnsignedChar(input_size, blob->u32Num * height * 2, width, stride, u8_src_data, input_addr_u8);
} else {
if (SVP_BLOB_TYPE_U8 == blob->enType) {
ret =
FillByUnsignedChar(input_size, blob->u32Num * channel * height, width, stride, u8_src_data, input_addr_u8);
} else {
ret = FillByFloat(input_size, blob->u32Num * channel * height, width, stride, float_src_data, input_addr_s32,
input_addr_u8);
}
}
if (ret != RET_OK) {
return ret;
}
}
if (ret != RET_OK) {
return ret;
}
NnieMemFlushCache(blob->u64PhyAddr, NNIE_CONVERT_64BIT_ADDR(HI_VOID, blob->u64VirAddr),
blob->u32Num * channel * height * stride);
@@ -608,42 +676,32 @@ static int NnieFillSrcData(NnieCfg *nnie_cfg, NnieParam *nnie_param, NnieDataInd
return RET_OK;
}
static int NnieGetDstData(NnieCfg *nnie_cfg, NnieParam *nnie_param, NnieDataIndex *input_data_idx, int64_t *shape,
int size) {
static int NnieGetDstDataSEQ(SVP_SRC_BLOB_S *blob, HI_U32 input_num, NnieDataIndex *input_data_idx,
HI_FLOAT *float_dst_data) {
HI_U32 i, j, n;
HI_U32 height, width, channel, stride, dim;
HI_U8 *output_addr_u8 = nullptr;
HI_S32 *output_addr_s32 = nullptr;
HI_U32 *step_addr_u32 = nullptr;
HI_FLOAT *float_dst_data = nullptr;
HI_U32 dim = blob->unShape.stSeq.u32Dim;
HI_U32 stride = blob->u32Stride;
HI_U32 *step_addr_u32 = NNIE_CONVERT_64BIT_ADDR(HI_U32, blob->unShape.stSeq.u64VirAddrStep);
HI_U32 total_step_num = 0;
HI_U32 input_num = 1;
SVP_SRC_BLOB_S *blob = &nnie_param->seg_data_[input_data_idx->seg_idx_ - 1].dst_[input_data_idx->node_idx_];
for (n = 0; n < (HI_U32)size; n++) {
input_num *= shape[n];
}
HI_U8 *output_addr_u8 = NNIE_CONVERT_64BIT_ADDR(HI_U8, blob->u64VirAddr);
HI_S32 *output_addr_s32 = NNIE_CONVERT_64BIT_ADDR(HI_S32, blob->u64VirAddr);
if (SVP_BLOB_TYPE_U8 <= blob->enType && SVP_BLOB_TYPE_YVU422SP >= blob->enType) {
LOGE("Nnie output type error");
for (n = 0; n < blob->u32Num; n++) {
total_step_num += *(step_addr_u32 + n);
}
if (input_num != total_step_num * dim) {
LOGE("input shape");
return RET_ERROR;
}
output_addr_u8 = NNIE_CONVERT_64BIT_ADDR(HI_U8, blob->u64VirAddr);
output_addr_s32 = NNIE_CONVERT_64BIT_ADDR(HI_S32, blob->u64VirAddr);
float_dst_data = reinterpret_cast<float *>(nnie_cfg->data_ptr_);
if (SVP_BLOB_TYPE_SEQ_S32 == blob->enType) {
dim = blob->unShape.stSeq.u32Dim;
stride = blob->u32Stride;
step_addr_u32 = NNIE_CONVERT_64BIT_ADDR(HI_U32, blob->unShape.stSeq.u64VirAddrStep);
if (input_data_idx->seg_idx_ == input_data_idx->max_seg_id_) {
for (n = 0; n < blob->u32Num; n++) {
total_step_num += *(step_addr_u32 + n);
}
if (input_num != total_step_num * dim) {
LOGE("input shape");
return RET_ERROR;
for (i = 0; i < *(step_addr_u32 + n); i++) {
memcpy(float_dst_data, output_addr_u8, dim * sizeof(float));
float_dst_data += dim;
output_addr_u8 += stride;
}
}
} else {
for (n = 0; n < blob->u32Num; n++) {
for (i = 0; i < *(step_addr_u32 + n); i++) {
for (j = 0; j < dim; j++) {
@@ -654,23 +712,67 @@ static int NnieGetDstData(NnieCfg *nnie_cfg, NnieParam *nnie_param, NnieDataInde
float_dst_data += dim;
}
}
} else {
height = blob->unShape.stWhc.u32Height;
width = blob->unShape.stWhc.u32Width;
channel = blob->unShape.stWhc.u32Chn;
stride = blob->u32Stride;
if (input_num != height * channel * width * blob->u32Num) {
LOGE("output shape diff:%d<->%d.", input_num, height * channel * width * blob->u32Num);
}
return RET_OK;
}
static int NnieGetDstData(NnieCfg *nnie_cfg, NnieParam *nnie_param, NnieDataIndex *input_data_idx, int64_t *shape,
int size) {
SVP_SRC_BLOB_S *blob = &nnie_param->seg_data_[input_data_idx->seg_idx_ - 1].dst_[input_data_idx->node_idx_];
HI_U32 input_num = 1;
for (HI_U32 i = 0; i < (HI_U32)size; i++) {
input_num *= shape[i];
}
if (SVP_BLOB_TYPE_U8 <= blob->enType && SVP_BLOB_TYPE_YVU422SP >= blob->enType) {
LOGE("Nnie output type error");
return RET_ERROR;
}
HI_FLOAT *float_dst_data = reinterpret_cast<float *>(nnie_cfg->data_ptr_);
if (SVP_BLOB_TYPE_SEQ_S32 == blob->enType) {
if (NnieGetDstDataSEQ(blob, input_num, input_data_idx, float_dst_data) != RET_OK) {
LOGE("NnieGetDstDataSEQ error.");
return RET_ERROR;
}
for (n = 0; n < blob->u32Num; n++) {
for (i = 0; i < channel * height; i++) {
for (j = 0; j < width; j++) {
float_dst_data[j] = (HI_FLOAT)output_addr_s32[j] / NNIE_QUANT_BASE;
} else {
HI_U8 *output_addr_u8 = NNIE_CONVERT_64BIT_ADDR(HI_U8, blob->u64VirAddr);
HI_S32 *output_addr_s32 = NNIE_CONVERT_64BIT_ADDR(HI_S32, blob->u64VirAddr);
if (float_dst_data == reinterpret_cast<float *>(output_addr_s32)) {
if (input_data_idx->seg_idx_ != input_data_idx->max_seg_id_) {
for (HI_U32 i = 0; i < input_num; i++) {
float_dst_data[i] = (HI_FLOAT)output_addr_s32[i] / NNIE_QUANT_BASE;
}
} else {
LOGI("\noutput no memcpy");
}
} else {
HI_U32 height = blob->unShape.stWhc.u32Height;
HI_U32 width = blob->unShape.stWhc.u32Width;
HI_U32 channel = blob->unShape.stWhc.u32Chn;
HI_U32 stride = blob->u32Stride;
if (input_num != height * channel * width * blob->u32Num) {
LOGE("output shape diff:%d<->%d.", input_num, height * channel * width * blob->u32Num);
return RET_ERROR;
}
if (input_data_idx->seg_idx_ == input_data_idx->max_seg_id_) {
if (nnie_cfg->pass_align16_io_) {
memcpy(float_dst_data, output_addr_u8, blob->u32Num * channel * height * stride);
} else {
for (HI_U32 i = 0; i < (blob->u32Num * channel * height); i++) {
memcpy(float_dst_data, output_addr_u8, width * sizeof(float));
float_dst_data += width;
output_addr_u8 += stride;
}
}
} else {
for (HI_U32 n = 0; n < blob->u32Num; n++) {
for (HI_U32 i = 0; i < channel * height; i++) {
for (HI_U32 j = 0; j < width; j++) {
float_dst_data[j] = (HI_FLOAT)output_addr_s32[j] / NNIE_QUANT_BASE;
}
output_addr_u8 += stride;
output_addr_s32 = reinterpret_cast<HI_S32 *>(output_addr_u8);
float_dst_data += width;
}
}
output_addr_u8 += stride;
output_addr_s32 = reinterpret_cast<HI_S32 *>(output_addr_u8);
float_dst_data += width;
}
}
}

View File

@@ -19,12 +19,14 @@
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include "include/api/types.h"
#include "include/mpi_vb.h"
#include "include/hi_comm_svp.h"
#include "include/hi_nnie.h"
#include "include/mpi_nnie.h"
#include "include/ir/dtype/type_id.h"
#include "src/nnie_cfg_parser.h"
namespace mindspore {
namespace nnie {
@@ -70,9 +72,11 @@ typedef struct {
SVP_NNIE_FORWARD_CTRL_S forward_ctrl_[SVP_NNIE_MAX_NET_SEG_NUM];
SVP_NNIE_FORWARD_WITHBBOX_CTRL_S forward_with_bbox_ctrl_[SVP_NNIE_MAX_NET_SEG_NUM];
NNIEMemCfg mem_cfg_;
bool get_mem_strong;
} NnieParam;
typedef struct {
bool pass_align16_io_;
HI_VOID *data_ptr_;
HI_U32 max_input_num_;
HI_U32 max_roi_num_;
@@ -85,6 +89,7 @@ typedef struct {
typedef struct {
HI_U32 seg_idx_;
HI_U32 node_idx_;
HI_U32 max_seg_id_;
} NnieDataIndex;
typedef struct {
@@ -110,6 +115,8 @@ int NnieCommRun(NnieRunCfg *nnie_run_cfg, bool run_box);
int NnieCommFillData(NnieRunCfg *nnie_run_cfg, void *data, mindspore::DataType dtype, int64_t *shape, int size, int id);
int NnieCommGetOutputData(NnieRunCfg *nnie_run_cfg, float *data, int64_t *shape, int size, int tensor_index);
HI_U32 GetBlobSize(const SVP_SRC_BLOB_S &blob);
} // namespace nnie
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_COMMON_H_

View File

@@ -14,6 +14,9 @@
* limitations under the License.
*/
#include <cstring>
#include <string>
#include <map>
#include <memory>
#include "src/nnie_manager.h"
#include "src/nnie_common.h"
#include "src/nnie_print.h"
@@ -24,26 +27,29 @@ using mindspore::lite::RET_OK;
namespace mindspore {
namespace nnie {
constexpr int kUINT16_MAX = 65535;
constexpr int kNumInput2 = 2;
int NNIEManager::CfgInit(int max_roi_num, int step, const std::vector<int> &core_id) {
int NNIEManager::CfgInit(const Flags &flags, int max_seg_id) {
memset(&nnie_cfg_, 0, sizeof(NnieRunCfg));
nnie_cfg_.cfg_.max_roi_num_ = max_roi_num;
nnie_cfg_.cfg_.step_ = step;
if (core_id.size() == 1) {
nnie_cfg_.cfg_.pass_align16_io_ = flags.keep_origin_output_;
nnie_cfg_.param_.get_mem_strong = false;
nnie_cfg_.run_idx_.max_seg_id_ = flags.keep_origin_output_ ? max_seg_id + 1 : kUINT16_MAX;
nnie_cfg_.cfg_.max_roi_num_ = flags.max_roi_num_;
nnie_cfg_.cfg_.step_ = flags.time_step_;
if (flags.core_ids_.size() == 1) {
for (size_t i = 0; i < SVP_NNIE_MAX_NET_SEG_NUM; i++) {
if (core_id[0] < SVP_NNIE_ID_BUTT) {
nnie_cfg_.cfg_.nnie_core_id_[i] = (SVP_NNIE_ID_E)core_id[0];
if (flags.core_ids_[0] < SVP_NNIE_ID_BUTT) {
nnie_cfg_.cfg_.nnie_core_id_[i] = (SVP_NNIE_ID_E)flags.core_ids_[0];
} else {
LOGE("nnie core num toobig.\n");
return RET_ERROR;
}
}
}
for (size_t i = 0; i < SVP_NNIE_MAX_NET_SEG_NUM && i < core_id.size(); i++) {
if (core_id[i] < SVP_NNIE_ID_BUTT) {
nnie_cfg_.cfg_.nnie_core_id_[i] = (SVP_NNIE_ID_E)core_id[i];
for (size_t i = 0; i < SVP_NNIE_MAX_NET_SEG_NUM && i < flags.core_ids_.size(); i++) {
if (flags.core_ids_[i] < SVP_NNIE_ID_BUTT) {
nnie_cfg_.cfg_.nnie_core_id_[i] = (SVP_NNIE_ID_E)flags.core_ids_[i];
} else {
LOGE("nnie core num toobig.\n");
return RET_ERROR;
@@ -51,6 +57,108 @@ int NNIEManager::CfgInit(int max_roi_num, int step, const std::vector<int> &core
}
return RET_OK;
}
int NNIEManager::MallocBlobData(SVP_SRC_BLOB_S *blob, mindspore::MSTensor *tensor, HI_U32 blob_size) {
auto ret = NnieMemMallocCached(tensor->Name().c_str(), nullptr, reinterpret_cast<HI_U64 *>(&blob->u64PhyAddr),
reinterpret_cast<void **>(&blob->u64VirAddr), blob_size);
if (HI_SUCCESS != ret) {
LOGE("Error,MallocBlobData failed!");
return RET_ERROR;
}
blobs_.push_back(blob);
tensors_.push_back(tensor);
return RET_OK;
}
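// Binds a blob to tensor memory. If the tensor has no data yet, an MMZ
// buffer is allocated once, cached in blobs_/tensors_, and handed to the
// tensor through the custom allocator. If the tensor already has data, its
// physical address is resolved so NNIE can use the buffer in place, with a
// fresh MMZ allocation as the fallback.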
int NNIEManager::SetBlobAddr(SVP_SRC_BLOB_S *blob, HI_U64 virt, mindspore::MSTensor *tensor,
std::shared_ptr<Allocator> allocator) {
HI_U32 blob_size = GetBlobSize(*blob);
if (virt == 0) {
auto iter = std::find(blobs_.begin(), blobs_.end(), blob);
if (iter == blobs_.end()) {
if (MallocBlobData(blob, tensor, blob_size) != RET_OK) {
LOGE("Failed to malloc.");
return RET_ERROR;
}
}
tensor->SetAllocator(allocator);
tensor->SetData(reinterpret_cast<void *>(blob->u64VirAddr));
LOGI("\nSet %s allocator!", tensor->Name().c_str());
} else {
auto ret = NnieGetVirMemInfo(virt, &blob->u64PhyAddr);
if (ret == HI_SUCCESS) {
blob->u64VirAddr = virt;
LOGI("Get physical address %llu.", blob->u64PhyAddr);
} else {
auto iter = std::find(blobs_.begin(), blobs_.end(), blob);
if (iter == blobs_.end()) {
if (MallocBlobData(blob, tensor, blob_size) != RET_OK) {
LOGE("Error, tensor data pointer is not MMZ memory, failed to malloc.");
return RET_ERROR;
}
}
}
}
return RET_OK;
}
int NNIEManager::LoadInputs(std::vector<mindspore::MSTensor> *inputs, std::shared_ptr<Allocator> allocator) {
size_t input_size = inputs->size();
if ((input_size < kNumInput2) || (input_size - 1) != nnie_cfg_.param_.model_->astSeg[0].u16SrcNum) {
LOGE("Input Size Err!");
return RET_ERROR;
}
for (size_t i = 0; i < nnie_cfg_.param_.model_->astSeg[0].u16SrcNum; i++) {
size_t j = GetFillIndex(*inputs, input_size - 1, nnie_cfg_.param_.model_->astSeg[0].astSrcNode[i].szName);
if (j == (input_size - 1)) {
j = i;
LOGI("input tensor name(%s) can't match wk node name(%s).", (*inputs)[j].Name().c_str(),
nnie_cfg_.param_.model_->astSeg[0].astSrcNode[i].szName);
}
HI_U64 virt = (HI_U64)(HI_UL)((*inputs)[j].Data().get());
auto blob = &nnie_cfg_.param_.seg_data_[0].src_[i];
if (SetBlobAddr(blob, virt, &(*inputs)[j], allocator) != RET_OK) {
return RET_ERROR;
}
}
return RET_OK;
}
int NNIEManager::LoadOutputs(std::vector<mindspore::MSTensor> *outputs, std::shared_ptr<Allocator> allocator) {
int output_size = outputs->size();
HI_U32 seg_id = nnie_cfg_.model_.model_.u32NetSegNum - 1;
if (output_size != nnie_cfg_.param_.model_->astSeg[seg_id].u16DstNum) {
LOGE("seg%d: %d output tensors are required, but there are %d outputs.", nnie_cfg_.run_idx_.seg_idx_ - 1,
nnie_cfg_.param_.model_->astSeg[nnie_cfg_.run_idx_.seg_idx_ - 1].u16DstNum, output_size);
return RET_ERROR;
}
if (nnie_cfg_.param_.model_->astSeg[seg_id].enNetType == SVP_NNIE_NET_TYPE_ROI) {
LOGE("Unsupported use PassAlign16InOutput!");
return RET_ERROR;
}
for (int i = 0; i < nnie_cfg_.param_.model_->astSeg[seg_id].u16DstNum; i++) {
int j = GetFillIndex(*outputs, output_size, nnie_cfg_.param_.model_->astSeg[seg_id].astDstNode[i].szName);
if (j == output_size) {
j = i;
LOGI("output tensor name(%s) can't match wk node name(%s).", (*outputs)[j].Name().c_str(),
nnie_cfg_.param_.model_->astSeg[seg_id].astDstNode[i].szName);
}
SVP_SRC_BLOB_S *blob = &nnie_cfg_.param_.seg_data_[seg_id].dst_[i];
if (SVP_BLOB_TYPE_U8 <= blob->enType && SVP_BLOB_TYPE_YVU422SP >= blob->enType) {
LOGE("Nnie output type error");
return RET_ERROR;
}
HI_U64 virt = (HI_U64)(HI_UL)((*outputs)[j].Data().get());
if (SetBlobAddr(blob, virt, &(*outputs)[j], allocator) != RET_OK) {
return RET_ERROR;
}
}
return RET_OK;
}
void NNIEManager::SetInputNum(int max_input_num) { nnie_cfg_.cfg_.max_input_num_ = max_input_num; }
int NNIEManager::Init(char *model_buf, int size, const std::vector<mindspore::MSTensor> &inputs) {
@@ -80,17 +188,33 @@ int NNIEManager::Run(std::vector<mindspore::MSTensor> *outputs, unsigned int seg
return RET_OK;
}
void NNIEManager::Release() { NnieCommDelete(&nnie_cfg_.param_, &nnie_cfg_.model_); }
void NNIEManager::Release(bool resize_flag) {
for (auto &blob : blobs_) {
NNIE_MEM_FREE(blob->u64PhyAddr, blob->u64VirAddr);
blob->u64VirAddr = 0;
blob->u64PhyAddr = 0;
}
blobs_.clear();
if (resize_flag) {
for (auto &tensor : tensors_) {
tensor->SetData(nullptr);
tensor->SetAllocator(nullptr);
}
}
tensors_.clear();
NnieCommDelete(&nnie_cfg_.param_, &nnie_cfg_.model_);
}
int NNIEManager::GetOutputData(std::vector<mindspore::MSTensor> *outputs,
const std::vector<std::vector<int64_t>> &outputs_shape, bool run_box) {
int i, j, output_size = outputs->size();
int output_size = outputs->size();
if (output_size != nnie_cfg_.param_.model_->astSeg[nnie_cfg_.run_idx_.seg_idx_ - 1].u16DstNum) {
LOGE("seg%d: %d output tensors are required, but there are %d outputs.", nnie_cfg_.run_idx_.seg_idx_ - 1,
nnie_cfg_.param_.model_->astSeg[nnie_cfg_.run_idx_.seg_idx_ - 1].u16DstNum, output_size);
return RET_ERROR;
}
int i, j;
if (run_box) {
for (i = 0; i < output_size; i++) {
auto input_data_type = (*outputs)[i].DataType();
@@ -164,6 +288,132 @@ int NNIEManager::FillRoiPooling(mindspore::MSTensor *input) {
return RET_OK;
}
int NNIEManager::SetAllocatorTensor(mindspore::MSTensor *tensor, SVP_SRC_BLOB_S *blob,
std::shared_ptr<Allocator> allocator) {
int step;
auto data_type = tensor->DataType();
if (data_type == DataType::kNumberTypeFloat32) {
step = sizeof(float);
} else if ((data_type == DataType::kNumberTypeUInt8) || (data_type == DataType::kNumberTypeInt8)) {
step = sizeof(unsigned char);
} else {
LOGE("Unsupported DataType!");
return RET_ERROR;
}
LOGI("\ninput %s :%d * %d = %d <-> %d", tensor->Name().c_str(), step, blob->unShape.stWhc.u32Width,
step * blob->unShape.stWhc.u32Width, blob->u32Stride);
if (blob->unShape.stWhc.u32Width * step == blob->u32Stride) {
if (((tensor->Data() == nullptr) || tensor->allocator() == allocator) && (blob->u64VirAddr != 0)) {
tensor->SetAllocator(allocator);
tensor->SetData(reinterpret_cast<void *>(blob->u64VirAddr));
LOGI("\nSet input %s allocator!", tensor->Name().c_str());
}
}
return RET_OK;
}
int NNIEManager::SetAllocatorInputs(std::vector<mindspore::MSTensor> *inputs, bool run_box,
std::shared_ptr<Allocator> allocator, unsigned int seg_id) {
size_t i, j, input_size = inputs->size();
if (seg_id >= nnie_cfg_.param_.model_->u32NetSegNum) {
LOGE("seg num err!");
return RET_ERROR;
}
if (!run_box) {
if ((input_size < kNumInput2) || (input_size - 1) != nnie_cfg_.param_.model_->astSeg[seg_id].u16SrcNum) {
LOGE("Input Size Err!");
return RET_ERROR;
}
}
for (i = 0; i < nnie_cfg_.param_.model_->astSeg[seg_id].u16SrcNum; i++) {
if (nnie_cfg_.param_.mem_cfg_.seg_[seg_id].src_node_[i]) {
continue;
}
j = GetFillIndex(*inputs, input_size - 1, nnie_cfg_.param_.model_->astSeg[seg_id].astSrcNode[i].szName);
if (j == (input_size - 1)) {
if (run_box && (*inputs)[i].Name() == "proposal") {
continue;
} else {
j = i;
LOGI("input tensor name(%s) can't match wk node name(%s).", (*inputs)[i].Name().c_str(),
nnie_cfg_.param_.model_->astSeg[seg_id].astSrcNode[i].szName);
}
}
SVP_SRC_BLOB_S *blob = &nnie_cfg_.param_.seg_data_[seg_id].src_[i];
SVP_BLOB_TYPE_E src_type = blob->enType;
if (src_type != SVP_BLOB_TYPE_SEQ_S32) {
SetAllocatorTensor(&(*inputs)[j], blob, allocator);
}
}
return RET_OK;
}
int NNIEManager::SetAllocatorOutputs(std::vector<mindspore::MSTensor> *outputs, bool run_box,
std::shared_ptr<Allocator> allocator, unsigned int seg_id) {
size_t i, j;
size_t output_size = outputs->size();
if (output_size != nnie_cfg_.param_.model_->astSeg[seg_id].u16DstNum) {
LOGE("seg%d: %d output tensors are required.", seg_id, nnie_cfg_.param_.model_->astSeg[seg_id].u16DstNum);
return RET_ERROR;
}
for (i = 0; i < nnie_cfg_.param_.model_->astSeg[seg_id].u16DstNum; i++) {
if (nnie_cfg_.param_.mem_cfg_.seg_[seg_id].dst_node_[i]) {
continue;
}
j = GetFillIndex(*outputs, output_size, nnie_cfg_.param_.model_->astSeg[seg_id].astDstNode[i].szName);
if (j == output_size) {
j = i;
LOGI("output tensor name(%s) can't match wk node name(%s).", (*outputs)[j].Name().c_str(),
nnie_cfg_.param_.model_->astSeg[seg_id].astDstNode[i].szName);
}
auto output_data_type = (*outputs)[j].DataType();
if (output_data_type == DataType::kNumberTypeFloat32) {
SVP_SRC_BLOB_S *blob = &nnie_cfg_.param_.seg_data_[seg_id].dst_[i];
if (SVP_BLOB_TYPE_U8 <= blob->enType && SVP_BLOB_TYPE_YVU422SP >= blob->enType) {
LOGE("Nnie output type error");
return RET_ERROR;
} else if (SVP_BLOB_TYPE_SEQ_S32 != blob->enType) {
if ((blob->unShape.stWhc.u32Width * sizeof(float) == blob->u32Stride)) {
if ((((*outputs)[j].Data() == nullptr) || (*outputs)[j].allocator() == allocator) &&
(blob->u64VirAddr != 0)) {
(*outputs)[j].SetAllocator(allocator);
(*outputs)[j].SetData(reinterpret_cast<void *>(blob->u64VirAddr));
LOGI("\nSet output %s allocator!", (*outputs)[j].Name().c_str());
}
}
}
} else {
LOGE("Unsupported DataType!");
return RET_ERROR;
}
}
return RET_OK;
}
int NNIEManager::SetAllocator(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
std::shared_ptr<Allocator> allocator, unsigned int seg_id) {
bool run_box = false;
if (nnie_cfg_.param_.model_->astSeg[seg_id].enNetType == SVP_NNIE_NET_TYPE_ROI) {
run_box = true;
}
if (SetAllocatorInputs(inputs, run_box, allocator, seg_id) != RET_OK) {
LOGE("SetAllocatorInputs failed!");
return RET_ERROR;
}
if (SetAllocatorOutputs(outputs, run_box, allocator, seg_id) != RET_OK) {
LOGE("SetAllocatorOutputs failed!");
return RET_ERROR;
}
return RET_OK;
}
int NNIEManager::FillData(std::vector<mindspore::MSTensor> *inputs, unsigned int seg_id) {
bool run_box = false;
size_t i, j;

View File

@@ -17,17 +17,33 @@
#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_MANAGER_H_
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_MANAGER_H_
#include <vector>
#include <string>
#include <map>
#include <memory>
#include "include/errorcode.h"
#include "include/api/types.h"
#include "include/api/allocator.h"
#include "src/nnie_common.h"
#include "src/nnie_cfg_parser.h"
namespace mindspore {
namespace nnie {
class NNIEManager {
public:
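// Managers are cached per model buffer, so each model gets its own
// NNIEManager instance; cached entries live for the lifetime of the process.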
static NNIEManager *GetInstance() {
static NNIEManager manager;
return &manager;
static NNIEManager *GetInstance(const void *model_buf) {
static std::map<const void *, NNIEManager *> managers_;
auto iter = managers_.find(model_buf);
if (iter != managers_.end()) {
return iter->second;
} else {
auto manager = new (std::nothrow) NNIEManager();
if (manager == nullptr) {
return manager;
} else {
managers_[model_buf] = manager;
return manager;
}
}
}
NNIEManager() {}
@@ -36,26 +52,65 @@ class NNIEManager {
int Init(char *model_buf, int size, const std::vector<mindspore::MSTensor> &inputs);
int CfgInit(int max_roi_num, int step, const std::vector<int> &core_id);
int CfgInit(const Flags &flags, int max_seg_id);
void SetInputNum(int max_input_num);
int SetAllocatorInputs(std::vector<mindspore::MSTensor> *inputs, bool run_box, std::shared_ptr<Allocator> allocator,
unsigned int seg_id);
int SetAllocatorOutputs(std::vector<mindspore::MSTensor> *outputs, bool run_box, std::shared_ptr<Allocator> allocator,
unsigned int seg_id);
int SetAllocator(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
std::shared_ptr<Allocator> allocator, unsigned int seg_id);
int FillData(std::vector<mindspore::MSTensor> *inputs, unsigned int seg_id);
int Run(std::vector<mindspore::MSTensor> *outputs, unsigned int seg_id,
const std::vector<std::vector<int64_t>> &outputs_shape);
void Release();
void Release(bool resize_flag);
int LoadInputs(std::vector<mindspore::MSTensor> *inputs, std::shared_ptr<Allocator> allocator);
int LoadOutputs(std::vector<mindspore::MSTensor> *outputs, std::shared_ptr<Allocator> allocator);
int SetBlobAddr(SVP_SRC_BLOB_S *blob, HI_U64 virt, mindspore::MSTensor *tensor, std::shared_ptr<Allocator> allocator);
void SetMaxSegId(int max_id) {
if (max_id > max_seg_id_) {
max_seg_id_ = max_id;
}
}
inline int GetMaxSegId() { return max_seg_id_; }
inline Flags *GetFlags() { return &flags_; }
inline bool GetLoadModel() { return load_model_; }
void SetLoadModel(bool flag) { load_model_ = flag; }
private:
int SetAllocatorTensor(mindspore::MSTensor *tensor, SVP_SRC_BLOB_S *blob, std::shared_ptr<Allocator> allocator);
int GetOutputData(std::vector<mindspore::MSTensor> *outputs, const std::vector<std::vector<int64_t>> &outputs_shape,
bool run_box = false);
int MallocBlobData(SVP_SRC_BLOB_S *blob, mindspore::MSTensor *tensor, HI_U32 blob_size);
int FillRoiPooling(mindspore::MSTensor *input);
char *wk_model_ = nullptr;
int model_size_ = 0;
NnieRunCfg nnie_cfg_;
int max_seg_id_ = 0;
Flags flags_;
bool load_model_ = false;
std::vector<SVP_SRC_BLOB_S *> blobs_;
std::vector<mindspore::MSTensor *> tensors_;
};
} // namespace nnie
} // namespace mindspore

View File

@@ -16,6 +16,7 @@
#include "src/nnie_memory.h"
#include "include/hi_common.h"
#include "include/mpi_sys.h"
#include "src/nnie_common.h"
namespace mindspore {
namespace nnie {
@@ -31,5 +32,14 @@ HI_S32 NnieMemMallocCached(const HI_CHAR *mmb, HI_CHAR *zone, HI_U64 *pu_phy_add
HI_S32 NnieMemFlushCache(HI_U64 phy_addr, HI_VOID *pv_vir_addr, HI_U32 size) {
return HI_MPI_SYS_MmzFlushCache(phy_addr, pv_vir_addr, size);
}
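// Resolves the physical address backing a virtual address. This succeeds
// only for MMZ-managed memory, which makes it a cheap test for whether a
// user buffer can be handed to NNIE directly.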
HI_S32 NnieGetVirMemInfo(HI_U64 pv_vir_addr, HI_U64 *phy_addr) {
SYS_VIRMEM_INFO_S mem_info;
HI_S32 ret = HI_MPI_SYS_GetVirMemInfo(NNIE_CONVERT_64BIT_ADDR(HI_VOID, pv_vir_addr), &mem_info);
if (ret == HI_SUCCESS) {
*phy_addr = mem_info.u64PhyAddr;
}
return ret;
}
} // namespace nnie
} // namespace mindspore

View File

@@ -43,6 +43,8 @@ HI_S32 NnieMemMalloc(const HI_CHAR *mmb, HI_CHAR *zone, HI_U64 *pu_phy_addr, HI_
HI_S32 NnieMemMallocCached(const HI_CHAR *mmb, HI_CHAR *zone, HI_U64 *pu_phy_addr, HI_VOID **ppv_vir_addr, HI_U32 size);
HI_S32 NnieMemFlushCache(HI_U64 phy_addr, HI_VOID *pv_vir_addr, HI_U32 size);
HI_S32 NnieGetVirMemInfo(HI_U64 pv_vir_addr, HI_U64 *phy_addr);
} // namespace nnie
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_MEMORY_H_

View File

@@ -34,15 +34,14 @@ struct Context::Data {
#ifdef PARALLEL_INFERENCE
int32_t thread_num = 8;
bool enable_parallel_ = false;
int affinity_mode_ = 1;
int32_t inter_op_parallel_num_ = 4;
#else
int32_t thread_num = 2;
bool enable_parallel_ = false;
int affinity_mode_ = 0;
int32_t inter_op_parallel_num_ = 1;
#endif
bool enable_parallel_ = false;
std::vector<int32_t> affinity_core_list_;
std::shared_ptr<Delegate> delegate = nullptr;
bool float_mode = false;

View File

@@ -245,6 +245,14 @@ Status Model::Predict(const std::vector<MSTensor> &inputs, std::vector<MSTensor>
return impl_->Predict(inputs, outputs, before, after);
}
Status Model::Predict(const MSKernelCallBack &before, const MSKernelCallBack &after) {
if (impl_ == nullptr) {
MS_LOG(ERROR) << "Model implement is null.";
return kLiteNullptr;
}
return impl_->Predict(before, after);
}
Status Model::PredictWithPreprocess(const std::vector<std::vector<MSTensor>> &inputs, std::vector<MSTensor> *outputs,
const MSKernelCallBack &before, const MSKernelCallBack &after) {
MS_LOG(ERROR) << "Unsupported Feature.";

View File

@@ -341,6 +341,32 @@ Status ModelImpl::Predict(const std::vector<MSTensor> &inputs, std::vector<MSTen
return kSuccess;
}
Status ModelImpl::Predict(const MSKernelCallBack &before, const MSKernelCallBack &after) {
if (session_ == nullptr) {
MS_LOG(ERROR) << "Run graph failed.";
return kLiteError;
}
auto input_tensors = session_->GetInputs();
if (input_tensors.empty()) {
MS_LOG(ERROR) << "Failed to get input tensor.";
return kLiteError;
}
for (auto &input : input_tensors) {
if (input->data() == nullptr) {
MS_LOG(ERROR) << "Tensor " << input->tensor_name() << " has no data.";
return kLiteInputTensorError;
}
}
auto ret = RunGraph(before, after);
if (ret != kSuccess) {
MS_LOG(ERROR) << "Run graph failed : " << ret;
return ret;
}
MS_LOG(DEBUG) << "Run graph success.";
return kSuccess;
}
std::vector<MSTensor> ModelImpl::GetInputs() {
std::vector<MSTensor> empty;
if (session_ == nullptr) {

View File

@@ -72,6 +72,8 @@ class ModelImpl {
Status Predict(const std::vector<MSTensor> &inputs, std::vector<MSTensor> *outputs, const MSKernelCallBack &before,
const MSKernelCallBack &after);
Status Predict(const MSKernelCallBack &before, const MSKernelCallBack &after);
lite::LiteSession *CreateLiteSession(lite::InnerContext *context);
Status LoadConfig(const std::string &config_path);

View File

@@ -51,6 +51,14 @@ function Run_Hi3516() {
else
run_result='hi3516: '${model_name}' failed'; echo ${run_result} >> ${run_benchmark_result_file}; return 1
fi
echo './benchmark --modelFile='${basepath}'/'${model_name}'.ms --inputShapes='${input_shapes}' --warmUpLoopCount=0 --loopCount=2 --configFile='${NNIE_CONFIG_FILE} >> "${run_hi3516_log_file}"
./benchmark --modelFile=${basepath}/${model_name}.ms --inputShapes=${input_shapes} --warmUpLoopCount=0 --loopCount=2 --configFile=${NNIE_CONFIG_FILE} >> "${run_hi3516_log_file}"
if [ $? = 0 ]; then
run_result='hi3516: '${model_name}' pass'; echo ${run_result} >> ${run_benchmark_result_file}
else
run_result='hi3516: '${model_name}' failed'; echo ${run_result} >> ${run_benchmark_result_file}; return 1
fi
done < ${models_nnie_config}
}
@@ -97,6 +105,7 @@ else
echo "Run benchmark failed"
MS_PRINT_TESTCASE_END_MSG
cat ${run_benchmark_result_file}
cat ${run_hi3516_log_file}
MS_PRINT_TESTCASE_END_MSG
rm -rf ${basepath}/*.ms
rm -rf ${basepath}/libmslite_nnie.so