!20558 [MS][LITE] optimize the npu subgraph split procedure
Merge pull request !20558 from XianglongZeng/myms_new_2
commit 4d60b57095
@@ -70,38 +70,35 @@ void NPUGraph::set_output(mindspore::MSTensor out_tensor, int index) {
 int NPUGraph::Init() {
   all_kernels_.clear();
   std::map<const NPUOp *, bool> is_visited;
+  std::map<const NPUOp *, bool> is_searched;
+  std::queue<NPUOp *> candidate_in_ops;
+  std::queue<NPUOp *> valid_in_ops;
+  // Initialization
   for (auto op : npu_ops_) {
     is_visited[op] = false;
+    is_searched[op] = false;
+    if (op->in_ops().empty()) {
+      candidate_in_ops.push(op);
+    }
   }
-  while (npu_ops_.size() > 0) {
-    auto head_op_iter = std::find_if(npu_ops_.begin(), npu_ops_.end(), [&](const NPUOp *op) {
-      if (is_visited[op]) {
-        return false;
-      }
-      return true;
-    });
-    if (head_op_iter == npu_ops_.end()) {
-      break;
-    }
-    auto head_op = *head_op_iter;
-    if (head_op->type() != schema::PrimitiveType_Transpose) {
-      // If npu_kernel does not equal nullptr, this kernel can be supported by delegate
-      auto npu_ops = FindSubgraphOps(head_op, &is_visited);
-      auto subgraph_kernel = CreateNPUSubgraphKernel(npu_ops);
-      if (subgraph_kernel == nullptr) {
-        MS_LOG(DEBUG) << "Create NPU subgraph kernel failed.";
-        return RET_ERROR;
-      }
-      all_kernels_.push_back(subgraph_kernel);
-    } else {
-      auto transpose_kernel = CreateNPUTransposeKernel(head_op);
-      if (transpose_kernel == nullptr) {
-        MS_LOG(DEBUG) << "New NPU transpose kernel failed.";
-        return RET_ERROR;
-      }
-      all_kernels_.push_back(transpose_kernel);
-      is_visited[head_op] = true;
-    }
+  while (!candidate_in_ops.empty()) {
+    // 1. Find out all input ops except transpose, and handle transpose ops independently.
+    auto ret = FindValidSubgraphInOps(&valid_in_ops, &candidate_in_ops, &is_visited);
+    if (ret != RET_OK) {
+      MS_LOG(DEBUG) << "Fail to find valid input ops or handle transpose ops.";
+      return RET_ERROR;
+    }
+    if (valid_in_ops.empty()) {
+      MS_LOG(INFO) << "Can not find input ops except transpose.";
+      break;
+    }
+    // 2. Find out all ready ops based on valid input ops, but these ops maybe not belong to the same subgraph.
+    auto ready_ops = FindReadySubgraphOps(valid_in_ops, &candidate_in_ops, &is_visited);
+    // 3. Create subgraph(s). Input ops with connection will be built into a same subgraph.
+    ret = CreateSubgraphFromReadyOps(&valid_in_ops, ready_ops, &is_searched);
+    if (ret != RET_OK) {
+      MS_LOG(DEBUG) << "Fail to create subgraph(s) from ready ops.";
+      return RET_ERROR;
+    }
   }
   return RET_OK;
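Note (illustrative, not part of the diff): the rewritten Init() drives the split in rounds, as the numbered comments above describe — seed a candidate queue with ops that have no inputs, collect the ops whose inputs are all visited as the "ready" set of the round, and defer the rest to a later round. A minimal standalone sketch of that readiness traversal, using a hypothetical ToyOp type instead of NPUOp and omitting the transpose handling, might look like this:

// Illustrative sketch only; ToyOp and the graph below are hypothetical, not MindSpore code.
#include <iostream>
#include <map>
#include <queue>
#include <vector>

struct ToyOp {
  int id = 0;
  std::vector<ToyOp *> in_ops;
  std::vector<ToyOp *> out_ops;
};

int main() {
  // Toy graph: 0 -> 1 -> 2, plus an independent input op 3.
  std::vector<ToyOp> ops(4);
  for (int i = 0; i < 4; ++i) ops[i].id = i;
  ops[0].out_ops = {&ops[1]};
  ops[1].in_ops = {&ops[0]};
  ops[1].out_ops = {&ops[2]};
  ops[2].in_ops = {&ops[1]};

  std::map<const ToyOp *, bool> is_visited;
  std::queue<ToyOp *> candidate_in_ops;  // counterpart of candidate_in_ops in Init()
  for (auto &op : ops) {
    if (op.in_ops.empty()) candidate_in_ops.push(&op);  // graph inputs seed the first round
  }
  while (!candidate_in_ops.empty()) {
    std::vector<ToyOp *> ready_ops;  // counterpart of ready_ops for this round
    std::queue<ToyOp *> frontier;
    frontier.swap(candidate_in_ops);
    while (!frontier.empty()) {
      auto *cur = frontier.front();
      frontier.pop();
      if (is_visited[cur]) continue;
      is_visited[cur] = true;
      ready_ops.push_back(cur);
      for (auto *out : cur->out_ops) {
        bool input_ready = true;
        for (auto *in : out->in_ops) input_ready = input_ready && is_visited[in];
        // ready successors join this round; the rest wait for a later round
        (input_ready ? frontier : candidate_in_ops).push(out);
      }
    }
    std::cout << "built one round with " << ready_ops.size() << " ready ops\n";
  }
  return 0;
}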
@@ -141,32 +138,128 @@ int NPUGraph::FindPreNextOps() {
   return RET_OK;
 }
 
-std::vector<NPUOp *> NPUGraph::FindSubgraphOps(NPUOp *head_op, std::map<const NPUOp *, bool> *is_visited) {
+int NPUGraph::FindValidSubgraphInOps(std::queue<NPUOp *> *valid_in_ops, std::queue<NPUOp *> *candidate_in_ops,
+                                     std::map<const NPUOp *, bool> *is_visited) {
+  while (!candidate_in_ops->empty()) {
+    auto cur_op = candidate_in_ops->front();
+    candidate_in_ops->pop();
+    if ((*is_visited)[cur_op]) {
+      continue;
+    }
+    if (cur_op->type() == schema::PrimitiveType_Transpose) {
+      auto transpose_kernel = CreateNPUTransposeKernel(cur_op);
+      if (transpose_kernel == nullptr) {
+        MS_LOG(DEBUG) << "New NPU transpose kernel failed.";
+        return RET_ERROR;
+      }
+      all_kernels_.push_back(transpose_kernel);
+      (*is_visited)[cur_op] = true;
+      for (auto out_op : cur_op->out_ops()) {
+        if (out_op->type() == schema::PrimitiveType_Transpose) {
+          candidate_in_ops->push(out_op);
+        } else {
+          auto input_ready = std::all_of(out_op->in_ops().begin(), out_op->in_ops().end(),
+                                         [&](NPUOp *in_op) { return (*is_visited)[in_op] == true; });
+          if (input_ready) {
+            valid_in_ops->push(out_op);
+          }
+        }
+      }
+    } else {
+      valid_in_ops->push(cur_op);
+    }
+  }
+  return RET_OK;
+}
+
+std::vector<NPUOp *> NPUGraph::FindReadySubgraphOps(std::queue<NPUOp *> op_queue,
+                                                    std::queue<NPUOp *> *next_candidate_ops,
+                                                    std::map<const NPUOp *, bool> *is_visited) {
   std::vector<NPUOp *> subgraph_ops;
-  subgraph_ops.push_back(head_op);
-  (*is_visited)[head_op] = true;
-  std::queue<NPUOp *> op_queue;
-  op_queue.emplace(head_op);
   while (!op_queue.empty()) {
     auto cur_op = op_queue.front();
     op_queue.pop();
+    if ((*is_visited)[cur_op]) {
+      continue;
+    }
+    subgraph_ops.push_back(cur_op);
+    (*is_visited)[cur_op] = true;
     auto out_ops = cur_op->out_ops();
     for (auto out_op : out_ops) {
-      if ((*is_visited)[out_op] == true) {
+      if ((*is_visited)[out_op]) {
        continue;
       }
       auto input_ready = std::all_of(out_op->in_ops().begin(), out_op->in_ops().end(),
                                      [&](NPUOp *in_op) { return (*is_visited)[in_op] == true; });
       if (input_ready && out_op->type() != schema::PrimitiveType_Transpose) {
-        subgraph_ops.push_back(out_op);
-        (*is_visited)[out_op] = true;
         op_queue.push(out_op);
+      } else {
+        next_candidate_ops->push(out_op);
       }
     }
   }
   return subgraph_ops;
 }
 
+void FindConnectedOps(NPUOp *head_op, std::vector<NPUOp *> ready_ops, std::vector<NPUOp *> *connected_ops,
+                      std::map<const NPUOp *, bool> *is_searched) {
+  std::queue<NPUOp *> bfs_ops;
+  bfs_ops.push(head_op);
+  while (!bfs_ops.empty()) {
+    auto cur_op = bfs_ops.front();
+    bfs_ops.pop();
+    if ((*is_searched)[cur_op]) {
+      continue;
+    }
+    for (auto in_op : cur_op->in_ops()) {
+      if (std::find(ready_ops.begin(), ready_ops.end(), in_op) == ready_ops.end() || (*is_searched)[in_op]) {
+        continue;
+      }
+      bfs_ops.push(in_op);
+    }
+    for (auto out_op : cur_op->out_ops()) {
+      if (std::find(ready_ops.begin(), ready_ops.end(), out_op) == ready_ops.end() || (*is_searched)[out_op]) {
+        continue;
+      }
+      bfs_ops.push(out_op);
+    }
+    (*is_searched)[cur_op] = true;
+    connected_ops->push_back(cur_op);
+  }
+  return;
+}
+
+int NPUGraph::CreateSubgraphFromReadyOps(std::queue<NPUOp *> *valid_in_ops, std::vector<NPUOp *> ready_ops,
+                                         std::map<const NPUOp *, bool> *is_searched) {
+  while (!valid_in_ops->empty()) {
+    std::vector<NPUOp *> connected_ops;
+    auto op = valid_in_ops->front();
+    valid_in_ops->pop();
+    if ((*is_searched)[op]) {
+      continue;
+    }
+    if (valid_in_ops->empty()) {
+      // use BFS to find out connected input ops
+      FindConnectedOps(op, ready_ops, &connected_ops, is_searched);
+    } else {
+      // if current input op is the only input op, there is no need to confirm the connectivity
+      for (auto ready_op : ready_ops) {
+        if (!(*is_searched)[ready_op]) {
+          connected_ops.push_back(ready_op);
+          (*is_searched)[ready_op] = true;
+        }
+      }
+    }
+    auto subgraph_kernel = CreateNPUSubgraphKernel(connected_ops);
+    if (subgraph_kernel == nullptr) {
+      MS_LOG(DEBUG) << "Create NPU subgraph kernel failed.";
+      return RET_ERROR;
+    }
+    all_kernels_.push_back(subgraph_kernel);
+  }
+  return RET_OK;
+}
+
 kernel::Kernel *NPUGraph::CreateNPUSubgraphKernel(std::vector<NPUOp *> npu_ops) {
   auto subgraph = new (std::nothrow) NPUSubGraph(npu_ops, npu_manager_);
   if (subgraph == nullptr) {
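Note (illustrative, not part of the diff): per the comment in Init(), CreateSubgraphFromReadyOps together with FindConnectedOps is meant to build input ops with connection into the same subgraph, i.e. to partition the ready ops of one round into connected components. A minimal standalone sketch of that grouping idea, again with a hypothetical ToyOp type and a plain per-seed BFS rather than the exact branching of the committed code, could be:

// Illustrative sketch only; ToyOp and GroupConnectedOps are hypothetical names, not MindSpore code.
#include <algorithm>
#include <iostream>
#include <map>
#include <queue>
#include <vector>

struct ToyOp {
  int id = 0;
  std::vector<ToyOp *> in_ops;
  std::vector<ToyOp *> out_ops;
};

// Group ready ops into connected components; each component would become one subgraph kernel.
std::vector<std::vector<ToyOp *>> GroupConnectedOps(const std::vector<ToyOp *> &ready_ops) {
  std::map<const ToyOp *, bool> is_searched;
  std::vector<std::vector<ToyOp *>> subgraphs;
  for (auto *seed : ready_ops) {
    if (is_searched[seed]) continue;
    std::vector<ToyOp *> connected_ops;
    std::queue<ToyOp *> bfs_ops;
    bfs_ops.push(seed);
    while (!bfs_ops.empty()) {
      auto *cur = bfs_ops.front();
      bfs_ops.pop();
      if (is_searched[cur]) continue;
      is_searched[cur] = true;
      connected_ops.push_back(cur);
      auto follow = [&](ToyOp *next) {
        // only follow edges that stay inside the ready set of this round
        if (std::find(ready_ops.begin(), ready_ops.end(), next) != ready_ops.end() && !is_searched[next]) {
          bfs_ops.push(next);
        }
      };
      for (auto *in : cur->in_ops) follow(in);
      for (auto *out : cur->out_ops) follow(out);
    }
    subgraphs.push_back(connected_ops);
  }
  return subgraphs;
}

int main() {
  // Two disconnected chains, 0 -> 1 and 2 -> 3: the grouping should yield two subgraphs.
  std::vector<ToyOp> ops(4);
  for (int i = 0; i < 4; ++i) ops[i].id = i;
  ops[0].out_ops = {&ops[1]};
  ops[1].in_ops = {&ops[0]};
  ops[2].out_ops = {&ops[3]};
  ops[3].in_ops = {&ops[2]};
  std::vector<ToyOp *> ready_ops = {&ops[0], &ops[1], &ops[2], &ops[3]};
  auto subgraphs = GroupConnectedOps(ready_ops);
  std::cout << "number of subgraphs: " << subgraphs.size() << std::endl;  // prints 2
  return 0;
}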
@@ -18,6 +18,7 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_GRAPH_H_
 
 #include <vector>
+#include <queue>
 #include <map>
 #include <utility>
 #include "include/api/kernel.h"
@@ -59,7 +60,14 @@ class NPUGraph : public kernel::Kernel {
 
   std::vector<NPUOp *> FindNextOps(NPUOp *cur_op);
 
-  std::vector<NPUOp *> FindSubgraphOps(NPUOp *head_op, std::map<const NPUOp *, bool> *is_visited);
+  int FindValidSubgraphInOps(std::queue<NPUOp *> *valid_in_ops, std::queue<NPUOp *> *candidate_in_ops,
+                             std::map<const NPUOp *, bool> *is_visited);
+
+  std::vector<NPUOp *> FindReadySubgraphOps(std::queue<NPUOp *> op_queue, std::queue<NPUOp *> *next_candidate_ops,
+                                            std::map<const NPUOp *, bool> *is_visited);
+
+  int CreateSubgraphFromReadyOps(std::queue<NPUOp *> *valid_in_ops, std::vector<NPUOp *> ready_ops,
+                                 std::map<const NPUOp *, bool> *is_searched);
 
   kernel::Kernel *CreateNPUSubgraphKernel(std::vector<NPUOp *> ops);
 
@@ -160,7 +160,7 @@ int NPUSubGraph::BuildNPUInputOp() {
     for (int i = 0; i < op->inputs().size(); ++i) {
       auto in_tensor = op->inputs()[i];
       if (IsSubGraphInputTensor(in_tensor)) {
-        auto tensor_name = op->name() + "_" + std::to_string(count++);
+        auto tensor_name = "Input_" + std::to_string(count++) + '_' + op->name();
         hiai::op::Data *data;
         data = ConverterToNPUData(in_tensor, tensor_name);
         subgraph_input_ops_.push_back(*data);
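Note (illustrative, hypothetical op name and count value): the only change in this hunk is the naming scheme for subgraph input tensors, which now leads with an Input_<index> prefix instead of appending the index after the op name:

// Illustrative sketch of the old vs. new tensor naming; "Conv2D_1" and count are made up for the example.
#include <iostream>
#include <string>

int main() {
  std::string op_name = "Conv2D_1";
  int count = 0;
  std::string old_name = op_name + "_" + std::to_string(count);              // "Conv2D_1_0"
  std::string new_name = "Input_" + std::to_string(count) + '_' + op_name;   // "Input_0_Conv2D_1"
  std::cout << old_name << " -> " << new_name << std::endl;
  return 0;
}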
@@ -125,3 +125,4 @@ ml_Heatmap_depth_240180;2
 ml_Heatmap_depth_180240;2
 ml_video_edit_person_divison_video;2
 ml_video_edit_hair_dyeing_segmodel_v2
+ml_video_edit_hairline_segmentation;3
@@ -134,3 +134,4 @@ hdc_ocr_recog_horizontal 0.5
 ml_Heatmap_depth_240180;2 10
 ml_Heatmap_depth_180240;2 7
 ml_video_edit_hair_dyeing_segmodel_v2 1
+ml_video_edit_hairline_segmentation;3 1.5
@@ -86,3 +86,4 @@ ml_video_edit_art_generate_20210513.onnx 0.5
 ml_video_edit_art_transfer_20210513.onnx;3 0.5
 ml_video_edit_hair_dyeing_segmodel_v2 0.5
 ml_video_edit_makeup_mobilenetv203.onnx 2
+ml_video_edit_hairline_segmentation;3 0.5