From 3202fc0df9bafcd9065460b780b6bae30d2a3bde Mon Sep 17 00:00:00 2001 From: kingfo Date: Fri, 3 Apr 2020 12:02:41 +0800 Subject: [PATCH] refactor callback for ge backend --- mindspore/ccsrc/CMakeLists.txt | 18 ++- mindspore/ccsrc/pipeline/pipeline.cc | 1 - mindspore/ccsrc/transform/graph_runner.cc | 3 + mindspore/ccsrc/utils/callbacks.cc | 153 ------------------ mindspore/ccsrc/utils/callbacks.h | 8 - mindspore/ccsrc/utils/callbacks_ge.cc | 182 ++++++++++++++++++++++ mindspore/ccsrc/utils/callbacks_ge.h | 38 +++++ mindspore/ccsrc/vm/backend.cc | 3 + tests/ut/cpp/utils/callback_test.cc | 3 + 9 files changed, 246 insertions(+), 163 deletions(-) create mode 100644 mindspore/ccsrc/utils/callbacks_ge.cc create mode 100644 mindspore/ccsrc/utils/callbacks_ge.h diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index fdefdce6a2a..befe86f3c0d 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -87,7 +87,22 @@ ms_build_flatbuffers("${FLATBUFFER_IN}" "${FLATBUFFER_IN}" GENERATED_OUTPUT_DIR file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "ir/*.cc" "ir/dtype/*.cc" - "utils/*.cc" + "utils/context/ms_context.cc" + "utils/symbolic.cc" + "utils/tensorprint_utils.cc" + "utils/convert_utils.cc" + "utils/graph_utils.cc" + "utils/misc.cc" + "utils/callbacks.cc" + "utils/profile.cc" + "utils/base_ref.cc" + "utils/summary/event_writer.cc" + "utils/log_adapter.cc" + "utils/comm_manager.cc" + "utils/any.cc" + "utils/config_manager.cc" + "utils/system/file_system.cc" + "utils/system/crc32c.cc" "common/*.cc" "parallel/*.cc" "pipeline/pipeline.cc" @@ -173,6 +188,7 @@ if(ENABLE_GE) file(GLOB_RECURSE GE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "transform/*.cc" "pynative/pynative_execute_ge.cc" + "utils/callbacks_ge.cc" "pipeline/pipeline_ge.cc" ) list(APPEND MINDSPORE_SRC_LIST ${GE_SRC_LIST}) diff --git a/mindspore/ccsrc/pipeline/pipeline.cc b/mindspore/ccsrc/pipeline/pipeline.cc index 0d7790fb360..003d4c15e99 100644 --- a/mindspore/ccsrc/pipeline/pipeline.cc +++ b/mindspore/ccsrc/pipeline/pipeline.cc @@ -616,7 +616,6 @@ py::object ExecutorPy::Run(const py::tuple& args, const py::object& phase) { return ExecDFGraph(info_, args, phase_s); } #else - MS_LOG(WARNING) << "In ut test " << size << phase_s; if (backend == "ge") { std::shared_ptr ret_val = std::make_shared(); if (info_.count(phase_s) != 0 && info_[phase_s]->func_graph != nullptr) { diff --git a/mindspore/ccsrc/transform/graph_runner.cc b/mindspore/ccsrc/transform/graph_runner.cc index e77b1bcd736..f1f270cdb6d 100644 --- a/mindspore/ccsrc/transform/graph_runner.cc +++ b/mindspore/ccsrc/transform/graph_runner.cc @@ -24,6 +24,9 @@ #include "utils/callbacks.h" #include "utils/utils.h" #include "./common.h" +#ifdef ENABLE_GE +#include "utils/callbacks_ge.h" +#endif #ifdef NO_GE_CLIENT namespace ge { diff --git a/mindspore/ccsrc/utils/callbacks.cc b/mindspore/ccsrc/utils/callbacks.cc index cdee0be82db..03c6322afe4 100644 --- a/mindspore/ccsrc/utils/callbacks.cc +++ b/mindspore/ccsrc/utils/callbacks.cc @@ -20,10 +20,6 @@ #include #include #include "pybind11/pybind11.h" -#ifdef ENABLE_GE -#include "transform/df_graph_manager.h" -#include "transform/util.h" -#endif #include "pipeline/parse/data_converter.h" #include "pipeline/parse/python_adapter.h" #include "utils/visible.h" @@ -38,155 +34,6 @@ const char kSummary[] = "Summary"; const char kCheckPoint[] = "Save"; const int ONE_SHAPE = 1; -#ifdef ENABLE_GE -using mindspore::transform::Status; -using mindspore::transform::TransformUtil; - -bool GetParameterShape(const FuncGraphPtr& graph, const std::string& param_name, - const std::shared_ptr>& shape) { - if (graph == nullptr) { - MS_LOG(ERROR) << "Graph is null, can not get graph parameter"; - return false; - } - - auto parameter_nodes = graph->parameters(); - for (auto& node : parameter_nodes) { - ParameterPtr param_node = std::static_pointer_cast(node); - if (param_node == nullptr) { - MS_LOG(ERROR) << "Parameter node is null, can not get graph parameter"; - return false; - } - if (param_node->name() == param_name) { - py::object parameter = param_node->default_param(); - ValuePtr value = parse::data_converter::PyDataToValue(parameter); - TensorPtr tensor = std::dynamic_pointer_cast(value); - if (tensor == nullptr) { - shape->push_back(ONE_SHAPE); - } else { - *shape = tensor->shape(); - } - return true; - } - } - MS_LOG(ERROR) << "Can not find parameter of name:" << param_name; - return false; -} - -static TensorPtr GetMeTensorTransformed(uint32_t graph_id, const std::string& parameter_name, - const std::shared_ptr& ge_tensor_ptr) { - FuncGraphPtr anf_graph = transform::DfGraphManager::GetInstance().GetAnfGraph(graph_id); - if (anf_graph == nullptr) { - MS_LOG(ERROR) << "Get anf graph failed during callback"; - return nullptr; - } - - std::shared_ptr> parameter_shape_ptr = std::make_shared>(); - if (!GetParameterShape(anf_graph, parameter_name, parameter_shape_ptr)) { - MS_LOG(ERROR) << "Can not get parameter shape during callback"; - return nullptr; - } - - return TransformUtil::ConvertGeTensor(ge_tensor_ptr, *parameter_shape_ptr); -} - -uint32_t CheckpointSaveCallback(uint32_t graph_id, const std::map& params_list) { - // Acquire GIL before calling Python code - py::gil_scoped_acquire acquire; - - MS_LOG(DEBUG) << "Start the checkpoint save callback function in checkpoint save process."; - py::list parameter_list = py::list(); - for (auto& item : params_list) { - std::string name = item.first; - std::shared_ptr ge_tensor_ptr = std::make_shared(item.second); - TensorPtr tensor_ptr = GetMeTensorTransformed(graph_id, name, ge_tensor_ptr); - if (tensor_ptr == nullptr) { - MS_LOG(EXCEPTION) << "Transform ge tensor to me tensor failed"; - } - py::dict param_dict; - param_dict["name"] = name; - param_dict["data"] = tensor_ptr; - parameter_list.append(param_dict); - } - py::bool_ ret = - parse::python_adapter::CallPyFn(PYTHON_MOD_CALLBACK_MODULE, PYTHON_FUN_PROCESS_CHECKPOINT, parameter_list); - auto bool_ret = py::cast(ret); - - uint32_t status = Status::SUCCESS; - if (!bool_ret) { - status = Status::FAILED; - MS_LOG(ERROR) << "python checkpoint return false during callback"; - } - return status; -} - -static TensorPtr GetMeTensorForSummary(const std::string& name, const std::shared_ptr& ge_tensor_ptr) { - // confirm the type by name - // Format: xxx[:Scalar] xxx[:Image] xxx[:Tensor] - if (name.empty()) { - MS_LOG(EXCEPTION) << "The summary name is empty."; - } - auto bpos = name.rfind("[:"); - if (bpos >= name.size()) { - MS_LOG(EXCEPTION) << "The summary name(" << name << ") is invalid."; - } - auto tname = name.substr(bpos); - if (tname == "[:Scalar]") { - MS_LOG(DEBUG) << "The summary(" << name << ") is Scalar"; - // process the scalar type summary - // Because the ge tensor is dim = 4, so set the (1,1,1,1)-->(1,) - // We do the (1,) shape is scalar - auto shape = std::vector({ONE_SHAPE}); - return TransformUtil::ConvertGeTensor(ge_tensor_ptr, shape); - } - if (tname == "[:Tensor]") { - MS_LOG(DEBUG) << "The summary(" << name << ") is Tensor"; - // process the tensor summary - // Now we can't get the real shape, so we keep same shape with GE - return TransformUtil::ConvertGeTensor(ge_tensor_ptr); - } - if (tname == "[:Image]") { - MS_LOG(DEBUG) << "The summary(" << name << ") is Image"; - // process the Image summary - // Image dim = 4, is same with ge, so we keep same shape with GE - return TransformUtil::ConvertGeTensor(ge_tensor_ptr); - } - - MS_LOG(EXCEPTION) << "The summary name(" << name << ") is invalid."; -} - -// Cache the summary callback data -// Output Format: [{"name": tag_name, "data": tensor}, {"name": tag_name, "data": tensor},...] -uint32_t MS_EXPORT SummarySaveCallback(uint32_t graph_id, const std::map& params_list) { - // Acquire GIL before calling Python code - py::gil_scoped_acquire acquire; - - MS_LOG(DEBUG) << "Start the summary save callback function for graph " << graph_id << "."; - py::list summary_list = py::list(); - MS_LOG(DEBUG) << "Param list size = " << params_list.size(); - for (auto& item : params_list) { - std::string tag_name = item.first; - std::shared_ptr ge_tensor_ptr = std::make_shared(item.second); - TensorPtr tensor_ptr = GetMeTensorForSummary(tag_name, ge_tensor_ptr); - if (tensor_ptr == nullptr) { - MS_LOG(EXCEPTION) << "ConvertGeTensor return tensor is null"; - } - py::dict summary_value_dict; - summary_value_dict["name"] = tag_name; - summary_value_dict["data"] = tensor_ptr; - summary_list.append(summary_value_dict); - } - - py::bool_ ret = parse::python_adapter::CallPyFn(PYTHON_MOD_CALLBACK_MODULE, PYTHON_FUN_PROCESS_SUMMARY, summary_list); - auto bool_ret = py::cast(ret); - if (!bool_ret) { - MS_LOG(ERROR) << "Python checkpoint return false during callback"; - return Status::FAILED; - } - MS_LOG(DEBUG) << "End the summary save callback function."; - return Status::SUCCESS; -} -#endif - // Cache the summary callback data from ME session // Remove the GE module on new architecture // Output Format: [{"name": tag_name, "data": tensor}, {"name": tag_name, "data": tensor},...] diff --git a/mindspore/ccsrc/utils/callbacks.h b/mindspore/ccsrc/utils/callbacks.h index 778b0a9ba2c..a1e4e75d5b6 100644 --- a/mindspore/ccsrc/utils/callbacks.h +++ b/mindspore/ccsrc/utils/callbacks.h @@ -21,10 +21,6 @@ #include #include #include "ir/meta_tensor.h" -#ifdef ENABLE_GE -#include "transform/types.h" -#include "transform/util.h" -#endif namespace mindspore { namespace callbacks { @@ -45,10 +41,6 @@ const int kCallbackFalied = 1; bool GetParameterShape(const FuncGraphPtr& anf_graph, const std::string& param_name, const std::shared_ptr>& shape); -#ifdef ENABLE_GE -uint32_t CheckpointSaveCallback(uint32_t, const std::map&); -uint32_t SummarySaveCallback(uint32_t, const std::map&); -#endif uint32_t SummarySaveCallback(uint32_t, const std::map&); } // namespace callbacks diff --git a/mindspore/ccsrc/utils/callbacks_ge.cc b/mindspore/ccsrc/utils/callbacks_ge.cc new file mode 100644 index 00000000000..50fd2f0b11d --- /dev/null +++ b/mindspore/ccsrc/utils/callbacks_ge.cc @@ -0,0 +1,182 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utils/callbacks_ge.h" +#include "pybind11/pybind11.h" +#include "transform/df_graph_manager.h" +#include "transform/util.h" +#include "pipeline/parse/data_converter.h" +#include "pipeline/parse/python_adapter.h" +#include "utils/visible.h" + +namespace mindspore { +namespace callbacks { + +const char PYTHON_MOD_CALLBACK_MODULE[] = "mindspore.train.callback"; +const char PYTHON_FUN_PROCESS_CHECKPOINT[] = "_checkpoint_cb_for_save_op"; +const char PYTHON_FUN_PROCESS_SUMMARY[] = "_summary_cb_for_save_op"; +const char kSummary[] = "Summary"; +const char kCheckPoint[] = "Save"; +const int ONE_SHAPE = 1; + +using mindspore::transform::Status; +using mindspore::transform::TransformUtil; + +bool GetParameterShape(const FuncGraphPtr& graph, const std::string& param_name, + const std::shared_ptr>& shape) { + if (graph == nullptr) { + MS_LOG(ERROR) << "Graph is null, can not get graph parameter"; + return false; + } + + auto parameter_nodes = graph->parameters(); + for (auto& node : parameter_nodes) { + ParameterPtr param_node = std::static_pointer_cast(node); + if (param_node == nullptr) { + MS_LOG(ERROR) << "Parameter node is null, can not get graph parameter"; + return false; + } + if (param_node->name() == param_name) { + py::object parameter = param_node->default_param(); + ValuePtr value = parse::data_converter::PyDataToValue(parameter); + TensorPtr tensor = std::dynamic_pointer_cast(value); + if (tensor == nullptr) { + shape->push_back(ONE_SHAPE); + } else { + *shape = tensor->shape(); + } + return true; + } + } + MS_LOG(ERROR) << "Can not find parameter of name:" << param_name; + return false; +} + +static TensorPtr GetMeTensorTransformed(uint32_t graph_id, const std::string& parameter_name, + const std::shared_ptr& ge_tensor_ptr) { + FuncGraphPtr anf_graph = transform::DfGraphManager::GetInstance().GetAnfGraph(graph_id); + if (anf_graph == nullptr) { + MS_LOG(ERROR) << "Get anf graph failed during callback"; + return nullptr; + } + + std::shared_ptr> parameter_shape_ptr = std::make_shared>(); + if (!GetParameterShape(anf_graph, parameter_name, parameter_shape_ptr)) { + MS_LOG(ERROR) << "Can not get parameter shape during callback"; + return nullptr; + } + + return TransformUtil::ConvertGeTensor(ge_tensor_ptr, *parameter_shape_ptr); +} + +uint32_t CheckpointSaveCallback(uint32_t graph_id, const std::map& params_list) { + // Acquire GIL before calling Python code + py::gil_scoped_acquire acquire; + + MS_LOG(DEBUG) << "Start the checkpoint save callback function in checkpoint save process."; + py::list parameter_list = py::list(); + for (auto& item : params_list) { + std::string name = item.first; + std::shared_ptr ge_tensor_ptr = std::make_shared(item.second); + TensorPtr tensor_ptr = GetMeTensorTransformed(graph_id, name, ge_tensor_ptr); + if (tensor_ptr == nullptr) { + MS_LOG(EXCEPTION) << "Transform ge tensor to me tensor failed"; + } + py::dict param_dict; + param_dict["name"] = name; + param_dict["data"] = tensor_ptr; + parameter_list.append(param_dict); + } + py::bool_ ret = + parse::python_adapter::CallPyFn(PYTHON_MOD_CALLBACK_MODULE, PYTHON_FUN_PROCESS_CHECKPOINT, parameter_list); + auto bool_ret = py::cast(ret); + + uint32_t status = Status::SUCCESS; + if (!bool_ret) { + status = Status::FAILED; + MS_LOG(ERROR) << "Python checkpoint return false during callback"; + } + return status; +} + +static TensorPtr GetMeTensorForSummary(const std::string& name, const std::shared_ptr& ge_tensor_ptr) { + // confirm the type by name + // Format: xxx[:Scalar] xxx[:Image] xxx[:Tensor] + if (name.empty()) { + MS_LOG(EXCEPTION) << "The summary name is empty."; + } + auto bpos = name.rfind("[:"); + if (bpos >= name.size()) { + MS_LOG(EXCEPTION) << "The summary name(" << name << ") is invalid."; + } + auto tname = name.substr(bpos); + if (tname == "[:Scalar]") { + MS_LOG(DEBUG) << "The summary(" << name << ") is Scalar"; + // process the scalar type summary + // Because the ge tensor is dim = 4, so set the (1,1,1,1)-->(1,) + // We do the (1,) shape is scalar + auto shape = std::vector({ONE_SHAPE}); + return TransformUtil::ConvertGeTensor(ge_tensor_ptr, shape); + } + if (tname == "[:Tensor]") { + MS_LOG(DEBUG) << "The summary(" << name << ") is Tensor"; + // process the tensor summary + // Now we can't get the real shape, so we keep same shape with GE + return TransformUtil::ConvertGeTensor(ge_tensor_ptr); + } + if (tname == "[:Image]") { + MS_LOG(DEBUG) << "The summary(" << name << ") is Image"; + // process the Image summary + // Image dim = 4, is same with ge, so we keep same shape with GE + return TransformUtil::ConvertGeTensor(ge_tensor_ptr); + } + + MS_LOG(EXCEPTION) << "The summary name(" << name << ") is invalid."; +} + +// Cache the summary callback data +// Output Format: [{"name": tag_name, "data": tensor}, {"name": tag_name, "data": tensor},...] +uint32_t MS_EXPORT SummarySaveCallback(uint32_t graph_id, const std::map& params_list) { + // Acquire GIL before calling Python code + py::gil_scoped_acquire acquire; + + MS_LOG(DEBUG) << "Start the summary save callback function for graph " << graph_id << "."; + py::list summary_list = py::list(); + MS_LOG(DEBUG) << "Param list size = " << params_list.size(); + for (auto& item : params_list) { + std::string tag_name = item.first; + std::shared_ptr ge_tensor_ptr = std::make_shared(item.second); + TensorPtr tensor_ptr = GetMeTensorForSummary(tag_name, ge_tensor_ptr); + if (tensor_ptr == nullptr) { + MS_LOG(EXCEPTION) << "ConvertGeTensor return tensor is null"; + } + py::dict summary_value_dict; + summary_value_dict["name"] = tag_name; + summary_value_dict["data"] = tensor_ptr; + summary_list.append(summary_value_dict); + } + + py::bool_ ret = parse::python_adapter::CallPyFn(PYTHON_MOD_CALLBACK_MODULE, PYTHON_FUN_PROCESS_SUMMARY, summary_list); + auto bool_ret = py::cast(ret); + if (!bool_ret) { + MS_LOG(ERROR) << "Python checkpoint return false during callback"; + return Status::FAILED; + } + MS_LOG(DEBUG) << "End the summary save callback function."; + return Status::SUCCESS; +} +} // namespace callbacks +} // namespace mindspore diff --git a/mindspore/ccsrc/utils/callbacks_ge.h b/mindspore/ccsrc/utils/callbacks_ge.h new file mode 100644 index 00000000000..750ec746665 --- /dev/null +++ b/mindspore/ccsrc/utils/callbacks_ge.h @@ -0,0 +1,38 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_UTILS_CALLBACKS_GE_H_ +#define MINDSPORE_CCSRC_UTILS_CALLBACKS_GE_H_ + +#include +#include +#include +#include +#include "transform/types.h" +#include "transform/util.h" +#include "ir/meta_tensor.h" + +namespace mindspore { +namespace callbacks { + +using mindspore::tensor::TensorPtr; + +uint32_t CheckpointSaveCallback(uint32_t, const std::map&); +uint32_t SummarySaveCallback(uint32_t, const std::map&); + +} // namespace callbacks +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_UTILS_CALLBACKS_GE_H_ diff --git a/mindspore/ccsrc/vm/backend.cc b/mindspore/ccsrc/vm/backend.cc index 28609abfa9f..9355cca99cb 100644 --- a/mindspore/ccsrc/vm/backend.cc +++ b/mindspore/ccsrc/vm/backend.cc @@ -24,6 +24,9 @@ #include "utils/graph_utils.h" #include "session/session_factory.h" #include "common/utils.h" +#ifdef ENABLE_GE +#include "utils/callbacks_ge.h" +#endif namespace mindspore { namespace compile { diff --git a/tests/ut/cpp/utils/callback_test.cc b/tests/ut/cpp/utils/callback_test.cc index 758e99ff59e..c63f68f000a 100644 --- a/tests/ut/cpp/utils/callback_test.cc +++ b/tests/ut/cpp/utils/callback_test.cc @@ -22,6 +22,9 @@ #include "pipeline/parse/python_adapter.h" #include "transform/df_graph_manager.h" #include "debug/draw.h" +#ifdef ENABLE_GE +#include "utils/callbacks_ge.h" +#endif namespace mindspore { namespace python_adapter = mindspore::parse::python_adapter;