!19491 [bugfix] Heterogeneous scenario in an ms_function under PyNative mode causes a core dump

Merge pull request !19491 from zyli2020/mindrt_debug
i-robot 2021-07-06 18:20:29 +00:00 committed by Gitee
commit 7dd2ae02ee
7 changed files with 45 additions and 10 deletions
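
This fix separates the execution mode the user configured from the mode the backend actually uses: ms_execution_mode_ records the value read from MsContext, while real_execution_mode_ is what compilation and execution branch on. When a segment of an ms_function compiled under PyNative mode contains operators for more than one device target, the backend temporarily switches the context to kGraphMode, compiles the segment as a whole graph, and then restores the user's mode; Transform and RunGraph later test real_execution_mode_ instead of re-reading the context. Below is a minimal, self-contained sketch of that save/override/restore pattern; Context, Backend, and CompileSegment are hypothetical stand-ins, not MindSpore's real MsContext or MindRTBackend API.

#include <iostream>

enum ExecutionMode { kGraphMode, kPynativeMode };

// Stand-in for MsContext: holds the globally visible execution mode.
struct Context {
  ExecutionMode mode{kPynativeMode};
};

// Stand-in for MindRTBackend; only the two member names mirror the diff.
struct Backend {
  ExecutionMode ms_execution_mode_{kGraphMode};    // mode the user asked for
  ExecutionMode real_execution_mode_{kGraphMode};  // mode actually used to compile and run

  void CompileSegment(Context *ctx, bool contain_multi_target) {
    // Record the user-visible mode once (mirrors CompileGraphs in the diff).
    ms_execution_mode_ = ctx->mode;
    real_execution_mode_ = ms_execution_mode_;

    // A multi-target segment inside an ms_function under PyNative mode is
    // temporarily compiled in graph mode (mirrors CompileGraph in the diff).
    if (contain_multi_target && ms_execution_mode_ == kPynativeMode) {
      real_execution_mode_ = kGraphMode;
      ctx->mode = kGraphMode;
    }

    std::cout << "compiling with mode " << real_execution_mode_ << std::endl;

    // Restore the user's mode so Python-side behaviour is unchanged.
    if (ms_execution_mode_ != real_execution_mode_) {
      ctx->mode = ms_execution_mode_;
    }
  }
};

int main() {
  Context ctx;
  Backend backend;
  backend.CompileSegment(&ctx, /*contain_multi_target=*/true);
  // ctx.mode is kPynativeMode again here, while real_execution_mode_ stays
  // kGraphMode so the scheduler builds an actor DAG instead of running op by op.
  return 0;
}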


@@ -1192,6 +1192,7 @@ void SessionBasic::GetParameterIndex(const KernelGraph *graph, const std::vector
<< ", input size: " << inputs.size();
}
const auto &input = inputs[index];
MS_EXCEPTION_IF_NULL(input);
// Check shape of input and parameter
const auto &input_shape = input->shape();
const auto &param_shape = AnfAlgo::GetOutputInferShape(param, 0);


@@ -168,6 +168,10 @@ void CreateKernelOutputDeviceAddress(const DeviceContext *device_context, const
MS_EXCEPTION_IF_NULL(graph);
const std::vector<CNodePtr> &kernels = graph->execution_order();
for (const auto &kernel : kernels) {
MS_EXCEPTION_IF_NULL(kernel);
if (AnfAlgo::IsControlOpExecInBackend(kernel)) {
continue;
}
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
MS_EXCEPTION_IF_NULL(kernel_mod);
auto output_sizes = kernel_mod->GetOutputSizeList();
@@ -190,6 +194,10 @@ void CreateKernelWorkspaceDeviceAddress(const DeviceContext *device_context, con
MS_EXCEPTION_IF_NULL(graph);
const std::vector<CNodePtr> &kernels = graph->execution_order();
for (const auto &kernel : kernels) {
MS_EXCEPTION_IF_NULL(kernel);
if (AnfAlgo::IsControlOpExecInBackend(kernel)) {
continue;
}
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
MS_EXCEPTION_IF_NULL(kernel_mod);
auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
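
Both allocation loops above gain the same guard: kernels that are control operators executed in the backend are skipped before their kernel mod is fetched and sized. A small self-contained sketch of that guard pattern follows; Kernel, IsControlOpExecInBackend, and the allocation body are stand-ins, and only the control flow mirrors the diff.

#include <iostream>
#include <string>
#include <vector>

struct Kernel {
  std::string name;
  bool is_backend_control_op;
};

// Stand-in predicate for AnfAlgo::IsControlOpExecInBackend.
bool IsControlOpExecInBackend(const Kernel &kernel) { return kernel.is_backend_control_op; }

void CreateOutputDeviceAddresses(const std::vector<Kernel> &kernels) {
  for (const auto &kernel : kernels) {
    // The PR inserts this guard before GetKernelMod, so backend-executed
    // control ops are never asked for output or workspace sizes.
    if (IsControlOpExecInBackend(kernel)) {
      continue;
    }
    // ... fetch the kernel mod and allocate output/workspace device addresses ...
    std::cout << "allocate device address for " << kernel.name << std::endl;
  }
}

int main() {
  std::vector<Kernel> kernels{{"MatMul", false}, {"Switch", true}};
  CreateOutputDeviceAddresses(kernels);  // prints only "MatMul"
  return 0;
}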


@@ -494,6 +494,11 @@ void GraphScheduler::Initialize() {
}
ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info) {
// Local maps and vectors clear.
graph_output_to_actor_.clear();
front_node_to_actor_.clear();
copy_actors_.clear();
MS_LOG(INFO) << "Graph(" << graph_compiler_info.name_ << ") transforms actor begin.";
if (graph_compiler_info.graphs_.size() == 0) {
MS_LOG(EXCEPTION) << "The number of graphs is zero.";


@@ -349,6 +349,11 @@ const ActorInfo &MindRTBackend::CompileGraphs(const FuncGraphPtr &func_graph) {
// Register a summary callback function, which is called in the final stages of summary.
graph_compiler_->RegisterSummaryCallBackFunc(callbacks::SummarySaveCallback);
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
ms_execution_mode_ = context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE);
real_execution_mode_ = ms_execution_mode_;
// Compile root graph.
graph_id_to_device_context_.clear();
control_nodes_.clear();
@@ -365,10 +370,7 @@ const ActorInfo &MindRTBackend::CompileGraphs(const FuncGraphPtr &func_graph) {
// Construct the graph compiler info.
auto graph_compiler_info = ConstructGraphCompilerInfo(root_graph_);
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
const bool graph_mode = context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode;
if (graph_mode) {
if (real_execution_mode_ == kGraphMode) {
// Transform graph to actor DAG, and schedule the actor DAG.
const auto &actor_set = runtime::GraphScheduler::GetInstance().Transform(*graph_compiler_info);
runtime::GraphScheduler::GetInstance().Schedule(actor_set);
@@ -383,9 +385,12 @@ void MindRTBackend::CompileGraph(const FuncGraphPtr &func_graph) {
MS_EXCEPTION_IF_NULL(graph_partition_);
MS_EXCEPTION_IF_NULL(graph_compiler_);
bool contain_multi_target;
// Split graph to segments.
const auto &segments = graph_partition_->Partition(func_graph);
const auto &segments = graph_partition_->Partition(func_graph, &contain_multi_target);
MS_LOG(INFO) << "Compile graph: " << func_graph->ToString() << ", Split segments size:" << segments.size();
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
// Foreach the segments to compile graph.
for (const auto &segment : segments) {
@@ -409,8 +414,19 @@ void MindRTBackend::CompileGraph(const FuncGraphPtr &func_graph) {
AnfNodePtrList outputs;
std::tie(fg, inputs, outputs) = TransformSegmentToAnfGraph(segment->nodes_);
// There will be more than one kernel graph in heterogeneous scenario in a ms function of PyNative Mode.
if (contain_multi_target && ms_execution_mode_ == kPynativeMode) {
real_execution_mode_ = kGraphMode;
context_ptr->set_param<int>(MS_CTX_EXECUTION_MODE, kGraphMode);
}
// Compile graph.
auto graph_id = graph_compiler_->CompileGraph(segment->nodes_, outputs, device_context);
if (ms_execution_mode_ != real_execution_mode_) {
context_ptr->set_param<int>(MS_CTX_EXECUTION_MODE, ms_execution_mode_);
}
graph_id_to_device_context_[graph_id] = device_context;
} else {
// Compile the cut node.
@@ -726,9 +742,8 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args,
// Run in the pynative mode.
MS_EXCEPTION_IF_NULL(outputs);
auto ms_context = MsContext::GetInstance();
const bool pynative_mode = (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode);
if (pynative_mode) {
// There will be more than one kernel graph in heterogeneous scenario in a ms function of PyNative Mode.
if (real_execution_mode_ == kPynativeMode) {
RunGraphBySingleOp(graph_compiler_info.graphs_, input_tensors, outputs);
return;
}


@@ -163,6 +163,8 @@ class MindRTBackend : public Backend {
std::shared_ptr<GraphCompiler> graph_compiler_;
std::string device_name_;
uint32_t device_id_;
int ms_execution_mode_{kGraphMode};
int real_execution_mode_{kGraphMode};
};
} // namespace compile
} // namespace mindspore


@@ -588,11 +588,15 @@ bool GraphPartition::IsCut(const AnfNodePtr &node) {
return false;
}
std::vector<GraphSegmentPtr> GraphPartition::Partition(const FuncGraphPtr &graph) {
std::vector<GraphSegmentPtr> GraphPartition::Partition(const FuncGraphPtr &graph, bool *multi_target) {
MS_EXCEPTION_IF_NULL(graph);
auto nodes = TopoSort(graph->get_return());
MS_LOG(DEBUG) << "Split all nodes size:" << nodes.size();
bool contain_multi_target = ContainMultiTarget(nodes);
if (multi_target != nullptr) {
*multi_target = contain_multi_target;
}
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
std::string default_target = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);


@@ -34,7 +34,7 @@ class GraphPartition {
public:
explicit GraphPartition(const std::vector<PrimitivePtr> &cut_list, const std::string &backend_name);
~GraphPartition() = default;
std::vector<GraphSegmentPtr> Partition(const FuncGraphPtr &func_graph);
std::vector<GraphSegmentPtr> Partition(const FuncGraphPtr &func_graph, bool *multi_target = nullptr);
private:
bool IsCut(const AnfNodePtr &node);
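
The new multi_target argument is a defaulted out-pointer, so every existing caller of Partition keeps compiling unchanged while MindRTBackend::CompileGraph can pass an address to learn whether the graph spans more than one device target. Here is a small self-contained sketch of that pattern; PartitionNodes and the integer "nodes" are hypothetical stand-ins for the real segment types.

#include <iostream>
#include <vector>

std::vector<int> PartitionNodes(const std::vector<int> &nodes, bool *multi_target = nullptr) {
  bool contain_multi_target = nodes.size() > 1;  // placeholder for ContainMultiTarget(nodes)
  if (multi_target != nullptr) {
    *multi_target = contain_multi_target;        // reported only when the caller asked for it
  }
  return nodes;  // the real function returns graph segments
}

int main() {
  std::vector<int> nodes{1, 2, 3};

  // Existing call sites keep working without changes.
  auto segments = PartitionNodes(nodes);

  // The new call site passes an address, mirroring MindRTBackend::CompileGraph.
  bool contain_multi_target = false;
  segments = PartitionNodes(nodes, &contain_multi_target);
  std::cout << std::boolalpha << contain_multi_target << std::endl;
  return 0;
}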