!20558 [MS][LITE] optimize the npu subgraph split procedure
Merge pull request !20558 from XianglongZeng/myms_new_2
commit 4d60b57095
@@ -70,38 +70,35 @@ void NPUGraph::set_output(mindspore::MSTensor out_tensor, int index) {
 int NPUGraph::Init() {
   all_kernels_.clear();
   std::map<const NPUOp *, bool> is_visited;
+  std::map<const NPUOp *, bool> is_searched;
+  std::queue<NPUOp *> candidate_in_ops;
+  std::queue<NPUOp *> valid_in_ops;
+  // Initialization
   for (auto op : npu_ops_) {
     is_visited[op] = false;
+    is_searched[op] = false;
+    if (op->in_ops().empty()) {
+      candidate_in_ops.push(op);
+    }
   }
-  while (npu_ops_.size() > 0) {
-    auto head_op_iter = std::find_if(npu_ops_.begin(), npu_ops_.end(), [&](const NPUOp *op) {
-      if (is_visited[op]) {
-        return false;
-      }
-      return true;
-    });
-    if (head_op_iter == npu_ops_.end()) {
-      break;
-    }
-    auto head_op = *head_op_iter;
-    if (head_op->type() != schema::PrimitiveType_Transpose) {
-      // If npu_kernel does not equal nullptr, this kernel can be supported by delegate
-      auto npu_ops = FindSubgraphOps(head_op, &is_visited);
-      auto subgraph_kernel = CreateNPUSubgraphKernel(npu_ops);
-      if (subgraph_kernel == nullptr) {
-        MS_LOG(DEBUG) << "Create NPU subgraph kernel failed.";
-        return RET_ERROR;
-      }
-      all_kernels_.push_back(subgraph_kernel);
-    } else {
-      auto transpose_kernel = CreateNPUTransposeKernel(head_op);
-      if (transpose_kernel == nullptr) {
-        MS_LOG(DEBUG) << "New NPU transpose kernel failed.";
-        return RET_ERROR;
-      }
-      all_kernels_.push_back(transpose_kernel);
-      is_visited[head_op] = true;
-    }
+  while (!candidate_in_ops.empty()) {
+    // 1. Find out all input ops except transpose, and handle transpose ops independently.
+    auto ret = FindValidSubgraphInOps(&valid_in_ops, &candidate_in_ops, &is_visited);
+    if (ret != RET_OK) {
+      MS_LOG(DEBUG) << "Fail to find valid input ops or handle transpose ops.";
+      return RET_ERROR;
+    }
+    if (valid_in_ops.empty()) {
+      MS_LOG(INFO) << "Can not find input ops except transpose.";
+      break;
+    }
+    // 2. Find out all ready ops based on valid input ops, but these ops maybe not belong to the same subgraph.
+    auto ready_ops = FindReadySubgraphOps(valid_in_ops, &candidate_in_ops, &is_visited);
+    // 3. Create subgraph(s). Input ops with connection will be built into a same subgraph.
+    ret = CreateSubgraphFromReadyOps(&valid_in_ops, ready_ops, &is_searched);
+    if (ret != RET_OK) {
+      MS_LOG(DEBUG) << "Fail to create subgraph(s) from ready ops.";
+      return RET_ERROR;
+    }
   }
   return RET_OK;
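Note (illustrative, not part of the diff): the rewritten Init() drives the split in rounds, as the numbered comments above describe — seed a candidate queue with ops that have no inputs, collect the ops whose inputs are all visited as the "ready" set of the round, and defer the rest to a later round. A minimal standalone sketch of that readiness traversal, using a hypothetical ToyOp type instead of NPUOp and omitting the transpose handling, might look like this:

// Illustrative sketch only; ToyOp and the graph below are hypothetical, not MindSpore code.
#include <iostream>
#include <map>
#include <queue>
#include <vector>

struct ToyOp {
  int id = 0;
  std::vector<ToyOp *> in_ops;
  std::vector<ToyOp *> out_ops;
};

int main() {
  // Toy graph: 0 -> 1 -> 2, plus an independent input op 3.
  std::vector<ToyOp> ops(4);
  for (int i = 0; i < 4; ++i) ops[i].id = i;
  ops[0].out_ops = {&ops[1]};
  ops[1].in_ops = {&ops[0]};
  ops[1].out_ops = {&ops[2]};
  ops[2].in_ops = {&ops[1]};

  std::map<const ToyOp *, bool> is_visited;
  std::queue<ToyOp *> candidate_in_ops;  // counterpart of candidate_in_ops in Init()
  for (auto &op : ops) {
    if (op.in_ops.empty()) candidate_in_ops.push(&op);  // graph inputs seed the first round
  }
  while (!candidate_in_ops.empty()) {
    std::vector<ToyOp *> ready_ops;  // counterpart of ready_ops for this round
    std::queue<ToyOp *> frontier;
    frontier.swap(candidate_in_ops);
    while (!frontier.empty()) {
      auto *cur = frontier.front();
      frontier.pop();
      if (is_visited[cur]) continue;
      is_visited[cur] = true;
      ready_ops.push_back(cur);
      for (auto *out : cur->out_ops) {
        bool input_ready = true;
        for (auto *in : out->in_ops) input_ready = input_ready && is_visited[in];
        // ready successors join this round; the rest wait for a later round
        (input_ready ? frontier : candidate_in_ops).push(out);
      }
    }
    std::cout << "built one round with " << ready_ops.size() << " ready ops\n";
  }
  return 0;
}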
@@ -141,32 +138,128 @@ int NPUGraph::FindPreNextOps() {
   return RET_OK;
 }
 
-std::vector<NPUOp *> NPUGraph::FindSubgraphOps(NPUOp *head_op, std::map<const NPUOp *, bool> *is_visited) {
+int NPUGraph::FindValidSubgraphInOps(std::queue<NPUOp *> *valid_in_ops, std::queue<NPUOp *> *candidate_in_ops,
+                                     std::map<const NPUOp *, bool> *is_visited) {
+  while (!candidate_in_ops->empty()) {
+    auto cur_op = candidate_in_ops->front();
+    candidate_in_ops->pop();
+    if ((*is_visited)[cur_op]) {
+      continue;
+    }
+    if (cur_op->type() == schema::PrimitiveType_Transpose) {
+      auto transpose_kernel = CreateNPUTransposeKernel(cur_op);
+      if (transpose_kernel == nullptr) {
+        MS_LOG(DEBUG) << "New NPU transpose kernel failed.";
+        return RET_ERROR;
+      }
+      all_kernels_.push_back(transpose_kernel);
+      (*is_visited)[cur_op] = true;
+      for (auto out_op : cur_op->out_ops()) {
+        if (out_op->type() == schema::PrimitiveType_Transpose) {
+          candidate_in_ops->push(out_op);
+        } else {
+          auto input_ready = std::all_of(out_op->in_ops().begin(), out_op->in_ops().end(),
+                                         [&](NPUOp *in_op) { return (*is_visited)[in_op] == true; });
+          if (input_ready) {
+            valid_in_ops->push(out_op);
+          }
+        }
+      }
+    } else {
+      valid_in_ops->push(cur_op);
+    }
+  }
+  return RET_OK;
+}
+
+std::vector<NPUOp *> NPUGraph::FindReadySubgraphOps(std::queue<NPUOp *> op_queue,
+                                                    std::queue<NPUOp *> *next_candidate_ops,
+                                                    std::map<const NPUOp *, bool> *is_visited) {
   std::vector<NPUOp *> subgraph_ops;
-  subgraph_ops.push_back(head_op);
-  (*is_visited)[head_op] = true;
-  std::queue<NPUOp *> op_queue;
-  op_queue.emplace(head_op);
   while (!op_queue.empty()) {
     auto cur_op = op_queue.front();
     op_queue.pop();
+    if ((*is_visited)[cur_op]) {
+      continue;
+    }
+    subgraph_ops.push_back(cur_op);
+    (*is_visited)[cur_op] = true;
     auto out_ops = cur_op->out_ops();
     for (auto out_op : out_ops) {
-      if ((*is_visited)[out_op] == true) {
+      if ((*is_visited)[out_op]) {
        continue;
       }
       auto input_ready = std::all_of(out_op->in_ops().begin(), out_op->in_ops().end(),
                                      [&](NPUOp *in_op) { return (*is_visited)[in_op] == true; });
       if (input_ready && out_op->type() != schema::PrimitiveType_Transpose) {
-        subgraph_ops.push_back(out_op);
-        (*is_visited)[out_op] = true;
         op_queue.push(out_op);
+      } else {
+        next_candidate_ops->push(out_op);
       }
     }
   }
   return subgraph_ops;
 }
 
+void FindConnectedOps(NPUOp *head_op, std::vector<NPUOp *> ready_ops, std::vector<NPUOp *> *connected_ops,
+                      std::map<const NPUOp *, bool> *is_searched) {
+  std::queue<NPUOp *> bfs_ops;
+  bfs_ops.push(head_op);
+  while (!bfs_ops.empty()) {
+    auto cur_op = bfs_ops.front();
+    bfs_ops.pop();
+    if ((*is_searched)[cur_op]) {
+      continue;
+    }
+    for (auto in_op : cur_op->in_ops()) {
+      if (std::find(ready_ops.begin(), ready_ops.end(), in_op) == ready_ops.end() || (*is_searched)[in_op]) {
+        continue;
+      }
+      bfs_ops.push(in_op);
+    }
+    for (auto out_op : cur_op->out_ops()) {
+      if (std::find(ready_ops.begin(), ready_ops.end(), out_op) == ready_ops.end() || (*is_searched)[out_op]) {
+        continue;
+      }
+      bfs_ops.push(out_op);
+    }
+    (*is_searched)[cur_op] = true;
+    connected_ops->push_back(cur_op);
+  }
+  return;
+}
+
+int NPUGraph::CreateSubgraphFromReadyOps(std::queue<NPUOp *> *valid_in_ops, std::vector<NPUOp *> ready_ops,
+                                         std::map<const NPUOp *, bool> *is_searched) {
+  while (!valid_in_ops->empty()) {
+    std::vector<NPUOp *> connected_ops;
+    auto op = valid_in_ops->front();
+    valid_in_ops->pop();
+    if ((*is_searched)[op]) {
+      continue;
+    }
+    if (valid_in_ops->empty()) {
+      // use BFS to find out connected input ops
+      FindConnectedOps(op, ready_ops, &connected_ops, is_searched);
+    } else {
+      // if current input op is the only input op, there is no need to confirm the connectivity
+      for (auto ready_op : ready_ops) {
+        if (!(*is_searched)[ready_op]) {
+          connected_ops.push_back(ready_op);
+          (*is_searched)[ready_op] = true;
+        }
+      }
+    }
+    auto subgraph_kernel = CreateNPUSubgraphKernel(connected_ops);
+    if (subgraph_kernel == nullptr) {
+      MS_LOG(DEBUG) << "Create NPU subgraph kernel failed.";
+      return RET_ERROR;
+    }
+    all_kernels_.push_back(subgraph_kernel);
+  }
+  return RET_OK;
+}
+
 kernel::Kernel *NPUGraph::CreateNPUSubgraphKernel(std::vector<NPUOp *> npu_ops) {
   auto subgraph = new (std::nothrow) NPUSubGraph(npu_ops, npu_manager_);
   if (subgraph == nullptr) {
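Note (illustrative, not part of the diff): per the comment in Init(), CreateSubgraphFromReadyOps together with FindConnectedOps is meant to build input ops with connection into the same subgraph, i.e. to partition the ready ops of one round into connected components. A minimal standalone sketch of that grouping idea, again with a hypothetical ToyOp type and a plain per-seed BFS rather than the exact branching of the committed code, could be:

// Illustrative sketch only; ToyOp and GroupConnectedOps are hypothetical names, not MindSpore code.
#include <algorithm>
#include <iostream>
#include <map>
#include <queue>
#include <vector>

struct ToyOp {
  int id = 0;
  std::vector<ToyOp *> in_ops;
  std::vector<ToyOp *> out_ops;
};

// Group ready ops into connected components; each component would become one subgraph kernel.
std::vector<std::vector<ToyOp *>> GroupConnectedOps(const std::vector<ToyOp *> &ready_ops) {
  std::map<const ToyOp *, bool> is_searched;
  std::vector<std::vector<ToyOp *>> subgraphs;
  for (auto *seed : ready_ops) {
    if (is_searched[seed]) continue;
    std::vector<ToyOp *> connected_ops;
    std::queue<ToyOp *> bfs_ops;
    bfs_ops.push(seed);
    while (!bfs_ops.empty()) {
      auto *cur = bfs_ops.front();
      bfs_ops.pop();
      if (is_searched[cur]) continue;
      is_searched[cur] = true;
      connected_ops.push_back(cur);
      auto follow = [&](ToyOp *next) {
        // only follow edges that stay inside the ready set of this round
        if (std::find(ready_ops.begin(), ready_ops.end(), next) != ready_ops.end() && !is_searched[next]) {
          bfs_ops.push(next);
        }
      };
      for (auto *in : cur->in_ops) follow(in);
      for (auto *out : cur->out_ops) follow(out);
    }
    subgraphs.push_back(connected_ops);
  }
  return subgraphs;
}

int main() {
  // Two disconnected chains, 0 -> 1 and 2 -> 3: the grouping should yield two subgraphs.
  std::vector<ToyOp> ops(4);
  for (int i = 0; i < 4; ++i) ops[i].id = i;
  ops[0].out_ops = {&ops[1]};
  ops[1].in_ops = {&ops[0]};
  ops[2].out_ops = {&ops[3]};
  ops[3].in_ops = {&ops[2]};
  std::vector<ToyOp *> ready_ops = {&ops[0], &ops[1], &ops[2], &ops[3]};
  auto subgraphs = GroupConnectedOps(ready_ops);
  std::cout << "number of subgraphs: " << subgraphs.size() << std::endl;  // prints 2
  return 0;
}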
@@ -18,6 +18,7 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_GRAPH_H_
 
 #include <vector>
+#include <queue>
 #include <map>
 #include <utility>
 #include "include/api/kernel.h"
@@ -59,7 +60,14 @@ class NPUGraph : public kernel::Kernel {
 
   std::vector<NPUOp *> FindNextOps(NPUOp *cur_op);
 
-  std::vector<NPUOp *> FindSubgraphOps(NPUOp *head_op, std::map<const NPUOp *, bool> *is_visited);
+  int FindValidSubgraphInOps(std::queue<NPUOp *> *valid_in_ops, std::queue<NPUOp *> *candidate_in_ops,
+                             std::map<const NPUOp *, bool> *is_visited);
+
+  std::vector<NPUOp *> FindReadySubgraphOps(std::queue<NPUOp *> op_queue, std::queue<NPUOp *> *next_candidate_ops,
+                                            std::map<const NPUOp *, bool> *is_visited);
+
+  int CreateSubgraphFromReadyOps(std::queue<NPUOp *> *valid_in_ops, std::vector<NPUOp *> ready_ops,
+                                 std::map<const NPUOp *, bool> *is_searched);
 
   kernel::Kernel *CreateNPUSubgraphKernel(std::vector<NPUOp *> ops);
 
@@ -160,7 +160,7 @@ int NPUSubGraph::BuildNPUInputOp() {
     for (int i = 0; i < op->inputs().size(); ++i) {
       auto in_tensor = op->inputs()[i];
       if (IsSubGraphInputTensor(in_tensor)) {
-        auto tensor_name = op->name() + "_" + std::to_string(count++);
+        auto tensor_name = "Input_" + std::to_string(count++) + '_' + op->name();
         hiai::op::Data *data;
         data = ConverterToNPUData(in_tensor, tensor_name);
         subgraph_input_ops_.push_back(*data);
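Note (illustrative, hypothetical op name and count value): the only change in this hunk is the naming scheme for subgraph input tensors, which now leads with an Input_<index> prefix instead of appending the index after the op name:

// Illustrative sketch of the old vs. new tensor naming; "Conv2D_1" and count are made up for the example.
#include <iostream>
#include <string>

int main() {
  std::string op_name = "Conv2D_1";
  int count = 0;
  std::string old_name = op_name + "_" + std::to_string(count);              // "Conv2D_1_0"
  std::string new_name = "Input_" + std::to_string(count) + '_' + op_name;   // "Input_0_Conv2D_1"
  std::cout << old_name << " -> " << new_name << std::endl;
  return 0;
}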
@@ -125,3 +125,4 @@ ml_Heatmap_depth_240180;2
 ml_Heatmap_depth_180240;2
 ml_video_edit_person_divison_video;2
 ml_video_edit_hair_dyeing_segmodel_v2
+ml_video_edit_hairline_segmentation;3
@@ -134,3 +134,4 @@ hdc_ocr_recog_horizontal 0.5
 ml_Heatmap_depth_240180;2 10
 ml_Heatmap_depth_180240;2 7
 ml_video_edit_hair_dyeing_segmodel_v2 1
+ml_video_edit_hairline_segmentation;3 1.5
@@ -86,3 +86,4 @@ ml_video_edit_art_generate_20210513.onnx 0.5
 ml_video_edit_art_transfer_20210513.onnx;3 0.5
 ml_video_edit_hair_dyeing_segmodel_v2 0.5
 ml_video_edit_makeup_mobilenetv203.onnx 2
+ml_video_edit_hairline_segmentation;3 0.5