!18616 fix bug multi conv split

Merge pull request !18616 from zoloft/pub_master2
This commit is contained in:
i-robot 2021-06-23 06:16:04 +00:00 committed by Gitee
commit cb92de87d6
7 changed files with 171 additions and 59 deletions

View File

@ -50,7 +50,7 @@ void InnerContext::SetContextDevice(const Context *context) {
for (auto &device_ctx : context->device_list_) {
// npu/gpu server would use one core so we don't bind core to avoid competition.
// If user does not set npu/gpu device, we still bind core.
if (device_ctx.device_type_ == DT_CPU && (isUserSetNPU || isUserSetGPU)) {
if (device_ctx.device_type_ == DT_CPU && (isUserSetNPU || (isUserSetGPU && !enable_parallel_))) {
auto cpu_ctx = device_ctx;
cpu_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = NO_BIND;
this->device_list_.push_back(cpu_ctx);

View File

@ -862,14 +862,14 @@ int LiteSession::Resize(const std::vector<mindspore::tensor::MSTensor *> &inputs
}
int LiteSession::InitGPURuntime() {
CpuBindMode cpu_bind_mode = this->context_->device_list_.front().device_info_.cpu_device_info_.cpu_bind_mode_;
ActorThreadPool *thread_pool = this->context_->thread_pool();
if (thread_pool == nullptr) {
MS_LOG(ERROR) << "thread pool is nullptr";
is_running_.store(false);
return RET_NULL_PTR;
}
// Setting the binding core will affect the opencl drive scheduling.
thread_pool->SetProcessAffinity(static_cast<BindMode>(NO_BIND));
thread_pool->SetProcessAffinity(static_cast<BindMode>(cpu_bind_mode));
#if GPU_OPENCL
if (this->context_->IsGpuEnabled()) {
opencl_runtime_wrapper_ = new (std::nothrow) opencl::OpenCLRuntimeWrapper();
@ -911,6 +911,8 @@ int LiteSession::InitGPURuntime() {
}
}
#endif
// Setting the binding core will affect the opencl drive scheduling.
thread_pool->SetProcessAffinity(static_cast<BindMode>(NO_BIND));
return RET_OK;
}
} // namespace lite

View File

@ -66,8 +66,8 @@ int SplitWithOverlapBaseCPUKernel::Init() { return RET_OK; }
int SplitWithOverlapBaseCPUKernel::ReSize() { return RET_OK; }
int SplitWithOverlapBaseCPUKernel::Split(int task_id) {
DoSplitWithOverlap(input_ptr_, output_ptr_.data(), param_->num_split_, split_dim_size_, element_bytes_,
outer_total_dim_, inner_stride_, start_indices_.data(), end_indices_.data());
DoSplitWithOverlapParallel(input_ptr_, output_ptr_.data(), task_id, split_dim_size_, element_bytes_, outer_total_dim_,
inner_stride_, start_indices_.data(), end_indices_.data());
return RET_OK;
}
@ -117,7 +117,7 @@ int SplitWithOverlapBaseCPUKernel::Run() {
inner_stride_ *= input_shape[i];
}
auto ret = ParallelLaunch(this->context_, SplitWithOverlapRun, this, context_->thread_num_);
auto ret = ParallelLaunch(this->context_, SplitWithOverlapRun, this, param_->num_split_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ParallelLaunch for SplitWIthOverlapRun run fail. errorcode:[" << ret << "]";
return RET_ERROR;

View File

@ -26,12 +26,12 @@
#include "src/ops/populate/populate_register.h"
#include "nnacl/fp32/winograd_utils.h"
#include "nnacl/pooling_parameter.h"
#include "include/model.h"
#if defined(ENABLE_ARM) || (defined(ENABLE_SSE) && !defined(ENABLE_AVX))
#include "nnacl/fp32/conv_depthwise_fp32.h"
#endif
namespace mindspore::lite {
size_t CommConvMul(std::vector<int> weight_shape, std::vector<int> output_shape) {
size_t cost = output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3] * weight_shape[1] *
weight_shape[2] * weight_shape[3];
@ -54,6 +54,69 @@ size_t WinogradConvDwMul() {
return 0;
}
bool IsOfflineParallelNode(const void *node_primitive, int node_device_type) {
if (node_primitive == nullptr) {
return false;
}
return (GetPrimitiveType(node_primitive) == schema::PrimitiveType_Conv2DFusion) &&
(node_device_type != kDefaultDeviceType);
}
void SearchSubGraph::UpdateOfflineParallelFlag() {
if (model_ == nullptr) {
offline_parallel_enable_ = false;
return;
}
// visited whole models to find any conv && depthwise conv have been set to device type
offline_parallel_enable_ =
std::any_of(this->model_->all_nodes_.begin(), this->model_->all_nodes_.end(),
[&](lite::Model::Node *node) { return IsOfflineParallelNode(node->primitive_, node->device_type_); });
}
bool SearchSubGraph::CheckIsParallelSubGraph(const std::vector<Subgraph> &subgraphs) {
if (subgraphs.size() != kDefalutSubGraphSize) {
return false;
}
for (const auto &sub_graph : subgraphs) {
auto heads = sub_graph.heads_;
auto ends = sub_graph.ends_;
if (heads.size() != kDefaultInputs || ends.size() != kDefaultInputs) {
return false;
}
auto head_node = model_->all_nodes_.at(heads.front());
auto end_node = model_->all_nodes_.at(ends.front());
if (!IsOfflineParallelNode(head_node->primitive_, head_node->device_type_) ||
!IsOfflineParallelNode(end_node->primitive_, end_node->device_type_)) {
return false;
}
// 1. check head_node's input is SplitOverlap node
for (const auto &input : head_node->input_indices_) {
if (tensors_.at(input).type_ == CONST) {
continue;
}
auto input_node_index = tensors_.at(input).out_nodes_.front();
if (GetPrimitiveType(model_->all_nodes_.at(input_node_index)->primitive_) !=
schema::PrimitiveType_SplitWithOverlap) {
return false;
}
}
// 2. check end_node's output is concat node
for (const auto &output : end_node->output_indices_) {
if (tensors_.at(output).type_ == CONST) {
continue;
}
auto output_node_index = tensors_.at(output).in_nodes_.front();
if (GetPrimitiveType(model_->all_nodes_.at(output_node_index)->primitive_) != schema::PrimitiveType_Concat) {
return false;
}
}
}
return true;
}
void SearchSubGraph::dfs(int i, int n, int current_sum, int except_value, int *min_value, std::vector<bool> *tmp_group,
std::vector<bool> *cor_group, std::vector<Subgraph> *sub_graphs) {
if (i == n) {
@ -146,7 +209,7 @@ const schema::Primitive *SearchSubGraph::CreatePartialPrimitive(int64_t subgraph
}
void SearchSubGraph::ConvertSubGraphToModel(std::vector<Subgraph> *sub_graphs) {
if (sub_graphs->size() != 2) {
if (sub_graphs->size() != kDefalutSubGraphSize) {
return;
}
Model::SubGraph *main_graphs = model_->sub_graphs_.front();
@ -166,8 +229,7 @@ void SearchSubGraph::ConvertSubGraphToModel(std::vector<Subgraph> *sub_graphs) {
MS_LOG(ERROR) << "New sub graph failed!";
return;
}
new_sub_graph->name_ = "Subgraph-split-" + std::to_string(new_sub_index);
new_sub_graph->name_ = "subgraph-split-" + std::to_string(new_sub_index);
Model::Node *new_partial_node = new (std::nothrow) Model::Node();
if (new_partial_node == nullptr) {
MS_LOG(ERROR) << "New partial node failed!";
@ -175,6 +237,16 @@ void SearchSubGraph::ConvertSubGraphToModel(std::vector<Subgraph> *sub_graphs) {
return;
}
new_partial_node->name_ = "Partial-subgraph-split-" + std::to_string(new_sub_index);
if (device_type == DT_CPU) {
new_partial_node->name_ = "cpu_" + new_partial_node->name_;
} else if (device_type == DT_GPU) {
new_partial_node->name_ = "gpu_" + new_partial_node->name_;
} else if (device_type == DT_NPU) {
new_partial_node->name_ = "npu_" + new_partial_node->name_;
} else {
new_partial_node->name_ = "unknow_" + new_partial_node->name_;
}
new_partial_node->node_type_ = mindspore::lite::NodeType_ValueNode;
new_partial_node->primitive_ = CreatePartialPrimitive(new_sub_index);
@ -221,7 +293,7 @@ void SearchSubGraph::ConvertSubGraphToModel(std::vector<Subgraph> *sub_graphs) {
}
bool SearchSubGraph::IsNodeSubGraphHead(uint32_t node_index, const std::vector<uint32_t> &ready_nodes) {
std::vector<uint32_t> output_indexes = node_list_.at(node_index)->output_indices_;
std::vector<uint32_t> output_indexes = model_->all_nodes_.at(node_index)->output_indices_;
std::vector<uint32_t> output_nodes;
for (uint32_t out_t : output_indexes) {
std::vector<uint32_t> cur_nodes = tensors_[out_t].in_nodes_;
@ -703,7 +775,7 @@ void SearchSubGraph::SubGraphSplitByMiddle() {
InitSubgraphRuntimeInfo(&subgraphs);
SubgraphFusion(&subgraphs);
MS_ASSERT(subgraphs.size() == 2);
MS_ASSERT(subgraphs.size() == kDefalutSubGraphSize);
if (std::any_of(subgraphs.begin(), subgraphs.end(), [&](Subgraph &sub) { return sub.nodes_.empty(); })) {
continue;
}
@ -724,6 +796,60 @@ void SearchSubGraph::SubGraphSplitByMiddle() {
}
}
void SearchSubGraph::SubGraphSplitByOffLineParallel() {
sub_graphs_.clear();
node_list_ = model_->all_nodes_;
std::vector<uint32_t> multy_in_nodes;
SearchMultyInNodes(&multy_in_nodes);
for (uint32_t node_index : multy_in_nodes) {
Model::Node *node = node_list_[node_index];
if (GetPrimitiveType(node->primitive_) != schema::PrimitiveType_Concat) {
continue;
}
std::vector<Subgraph> node_subs;
for (uint32_t input_tensor_index : node->input_indices_) {
Tensor *tensor = &tensors_[input_tensor_index];
if (tensor->type_ == CONST) continue;
std::vector<uint32_t> input_nodes = tensor->out_nodes_;
Subgraph sub;
sub.ends_.push_back(input_nodes[0]);
InsertNodeByMid(input_nodes[0], &sub);
node_subs.push_back(sub);
}
node_sub_map_.insert(std::make_pair(node_index, node_subs));
}
for (auto map : node_sub_map_) {
std::vector<Subgraph> &subgraphs = map.second;
if (std::any_of(subgraphs.begin(), subgraphs.end(), [&](Subgraph &sub) { return sub.nodes_.empty(); })) {
continue;
}
if (!CheckIsParallelSubGraph(subgraphs)) {
continue;
}
// init graph device type
for (auto &subgraph : subgraphs) {
uint32_t head_node_index = subgraph.heads_.front();
subgraph.device_ = static_cast<lite::DeviceType>(model_->all_nodes_.at(head_node_index)->device_type_);
if (subgraph.device_ == DT_GPU) {
subgraph.thread_ = major_thread_;
subgraph.tid_ = 0;
} else {
subgraph.thread_ = minor_thread_;
subgraph.tid_ = 1;
}
}
ConvertSubGraphToModel(&subgraphs);
}
InitMainGraphDevice(DT_CPU);
}
SearchSubGraph::SearchSubGraph(const InnerContext *context, Model *model, std::vector<lite::Tensor *> *src_tensors,
const std::map<int, OpParameter *> *op_parameters, std::vector<size_t> *output_nodes)
: output_nodes_(output_nodes), context_(context), src_tensors_(src_tensors), op_parameters_(op_parameters) {
@ -756,29 +882,20 @@ void SearchSubGraph::InsertParallelNode(uint32_t index, Subgraph *subgraph) {
return;
}
if (subgraph->search_terminate_) {
return;
if (!subgraph->nodes_.empty()) {
sub_graphs_.push_back(std::move(*subgraph));
}
Subgraph new_graph;
subgraph = &new_graph;
}
Model::Node *node = node_list_[index];
// has been searched
if (node == nullptr) {
return;
}
// just deal with parallel target node
std::vector<uint32_t> input = node->input_indices_;
/* remove const node */
for (int i = static_cast<int>(input.size()) - 1; i >= 0; i--) {
if (tensors_[input[i]].type_ == CONST) {
VectorErase(&input, input[i]);
}
}
// search to graph to graph input , terminate it.
if (std::any_of(input.begin(), input.end(), [&](int input_index) { return tensors_[input_index].type_ == INPUT; })) {
subgraph->search_terminate_ = true;
return;
}
// if current node is no parallel target node, just judge terminate or continue
if (GetPrimitiveType(node->primitive_) == schema::PrimitiveType_Conv2DFusion &&
node->device_type_ != kDefaultDeviceType) {
// if current node is parallel target node
if (IsOfflineParallelNode(node->primitive_, node->device_type_)) {
// first searched
if (subgraph->nodes_.empty()) {
subgraph->device_ = static_cast<DeviceType>(node->device_type_);
@ -788,28 +905,28 @@ void SearchSubGraph::InsertParallelNode(uint32_t index, Subgraph *subgraph) {
return;
}
}
if (IsNodeSubGraphHead(index, subgraph->nodes_)) {
if (subgraph->nodes_.empty()) {
subgraph->search_terminate_ = true;
return;
}
subgraph->heads_.push_back(subgraph->nodes_.front());
return;
}
// for offline parallel target subgraph only has one end
if (subgraph->ends_.empty()) {
subgraph->ends_.push_back(index);
}
subgraph->nodes_.insert(subgraph->nodes_.begin(), index);
node_list_[index] = nullptr;
} else {
if (!subgraph->nodes_.empty()) {
return;
subgraph->search_terminate_ = true;
}
// just deal with parallel target node
std::vector<uint32_t> input = node->input_indices_;
/* remove const node */
for (int i = static_cast<int>(input.size()) - 1; i >= 0; i--) {
if (tensors_[input[i]].type_ == CONST) {
VectorErase(&input, input[i]);
}
}
// search to graph to graph input , terminate it.
if (std::any_of(input.begin(), input.end(), [&](int input_index) { return tensors_[input_index].type_ == INPUT; })) {
subgraph->search_terminate_ = true;
return;
}
// search for next nodes
for (uint32_t next : input) {
auto next_nodes = tensors_[next].out_nodes_;
@ -828,15 +945,8 @@ void SearchSubGraph::InitSearchParallelSubGraph() {
}
}
void SearchSubGraph::SubGraphSplitByOffLineParallel() {
MS_LOG(DEBUG) << "start to split offline parallel subgraph";
InitSearchParallelSubGraph();
ConvertSubGraphToModel(&sub_graphs_);
InitMainGraphDevice();
MS_LOG(DEBUG) << "end to split offline parallel subgraph";
}
void SearchSubGraph::SubGraphSplit() {
UpdateOfflineParallelFlag();
if (offline_parallel_enable_) {
SubGraphSplitByOffLineParallel();
} else {

View File

@ -31,6 +31,8 @@
namespace mindspore::lite {
constexpr int kDefaultDeviceType = -1;
constexpr int kDefalutSubGraphSize = 2;
constexpr int kDefaultInputs = 1;
class SearchSubGraph {
enum TensorType { NORMAL, CONST, INPUT };
@ -122,6 +124,8 @@ class SearchSubGraph {
CostModel CalculateConv2DFusion(Model::Node *node);
void dfs(int i, int n, int current_sum, int except_value, int *min_value, std::vector<bool> *tmp_group,
std::vector<bool> *cor_group, std::vector<Subgraph> *sub_graphs);
void UpdateOfflineParallelFlag();
bool CheckIsParallelSubGraph(const std::vector<Subgraph> &subgraphs);
private:
std::vector<size_t> *output_nodes_ = nullptr;

View File

@ -291,8 +291,6 @@ AnfNodePtr CreateOutputsOfConcat(const FuncGraphPtr &func_graph, const AnfNodePt
concate_cnode->set_scope(conv_cnode->scope());
std::vector<AnfNodePtr> outputs;
GetMultipleOutputsOfAnfNode(func_graph, concate_cnode, 1, &outputs);
// only support split_overlap node implementation, to split sub_graph for runtime
concate_cnode->AddAttr(mindspore::ops::kDeviceType, MakeValue(static_cast<int>(lite::DT_CPU)));
return concate_cnode;
}
@ -337,8 +335,6 @@ void CreateOutputsOfSplitWithOverlap(const FuncGraphPtr &func_graph, const AnfNo
ptr_list.push_back(value_node);
}
split_cnode->set_abstract(std::make_shared<abstract::AbstractTuple>(ptr_list));
// only support split_overlap node implementation
split_cnode->AddAttr(mindspore::ops::kDeviceType, MakeValue(static_cast<int>(lite::DT_CPU)));
}
} // namespace opt

View File

@ -89,12 +89,12 @@ bool MultiConvSplit::CheckSplitValid() {
return false;
}
int64_t split_axis_value_0 = UP_DIV(split_info_.ori_split_axis_value * visited_block, total_block_count);
if (split_axis_value_0 >= split_info_.ori_split_axis_value) {
if (split_axis_value_0 > split_info_.ori_split_axis_value) {
return false;
}
int64_t split_axis_value_1 = split_info_.ori_split_axis_value - split_axis_value_0;
split_axis_value_1 += (split_info_.extend_top.back() + split_info_.extend_bottom.back());
return split_axis_value_1 < split_info_.ori_split_axis_value;
return split_axis_value_1 <= split_info_.ori_split_axis_value;
}
int MultiConvSplit::GetMultiConvNodes(const AnfNodePtr &conv_node) {