From 5a90c140055af147139093c37e17b86fd312f513 Mon Sep 17 00:00:00 2001
From: caifubi
Date: Mon, 30 Aug 2021 09:45:16 +0800
Subject: [PATCH] PyNative RunOp Performance Optimize

1. Add OpRuntimeInfo to AnfNode.
2. Cache OpRuntimeInfo in Build.
3. Get format/dtype/tensorsize from OpRuntimeInfo when malloc device memory.
---
 .../backend/session/anf_runtime_algorithm.cc | 21 +++++++++
 .../backend/session/anf_runtime_algorithm.h  | 22 +++++++++
 .../ccsrc/backend/session/ascend_session.cc  | 46 +++++++++++++++++++
 .../ccsrc/backend/session/ascend_session.h   |  1 +
 .../ccsrc/runtime/device/kernel_runtime.cc   | 21 +++++----
 5 files changed, 102 insertions(+), 9 deletions(-)

diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
index 17d6c02e226..7616c60de55 100644
--- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
+++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
@@ -2394,5 +2394,26 @@ void AnfRuntimeAlgorithm::CacheAddrForAtomicClean(const AnfNodePtr &node, kernel
   }
   kernel_mod->set_inputs_addr(kernel_inputs);
 }
+
+std::string OpRuntimeInfo::output_format(size_t index) const {
+  if (index >= output_format_.size()) {
+    MS_LOG(EXCEPTION) << "Invalid index:" << index << " total output_format:" << output_format_.size();
+  }
+  return output_format_[index];
+}
+
+TypeId OpRuntimeInfo::output_type(size_t index) const {
+  if (index >= output_type_.size()) {
+    MS_LOG(EXCEPTION) << "Invalid index:" << index << " total output_type:" << output_type_.size();
+  }
+  return output_type_[index];
+}
+
+size_t OpRuntimeInfo::output_tensor_size(size_t index) const {
+  if (index >= output_tensor_size_.size()) {
+    MS_LOG(EXCEPTION) << "Invalid index:" << index << " total output_tensor_size:" << output_tensor_size_.size();
+  }
+  return output_tensor_size_[index];
+}
 }  // namespace session
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
index 618c7fc3fa6..d59873f1a80 100644
--- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
+++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
@@ -58,6 +58,28 @@ struct KernelWithIndexCmp {
   }
 };
 
+class OpRuntimeInfo {
+ public:
+  OpRuntimeInfo(std::vector<std::string> output_format, std::vector<TypeId> output_type,
+                std::vector<size_t> output_tensor_size)
+      : output_format_(std::move(output_format)),
+        output_type_(std::move(output_type)),
+        output_tensor_size_(std::move(output_tensor_size)) {}
+  ~OpRuntimeInfo() = default;
+
+  // Key for user data.
+  constexpr static char key[] = "OpRuntimeInfo";
+
+  std::string output_format(size_t index) const;
+  TypeId output_type(size_t index) const;
+  size_t output_tensor_size(size_t index) const;
+
+ private:
+  std::vector<std::string> output_format_;
+  std::vector<TypeId> output_type_;
+  std::vector<size_t> output_tensor_size_;
+};
+
 class AnfRuntimeAlgorithm {
  public:
   static AnfNodePtr MakeMonadValueNode(const KernelGraphPtr &kg);
diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc
index 695d29074d3..996cf1028db 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@@ -50,6 +50,7 @@
 #include "runtime/device/ascend/ascend_stream_assign.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #include "utils/ms_utils.h"
+#include "utils/utils.h"
 #include "utils/context/graph_kernel_flags.h"
 #include "backend/optimizer/common/helper.h"
 #include "runtime/device/kernel_runtime_manager.h"
@@ -856,9 +857,54 @@ KernelGraphPtr AscendSession::PreBuildOp(const OpRunInfo &op_run_info,
   opt::RunOpAscendBackendIRFusionOptimization(graph);
   SelectKernel(*graph);
   RunOpHardwareOptimize(graph);
+  CacheCNodeOutputInfo(*graph);
   return graph;
 }
 
+void AscendSession::CacheCNodeOutputInfo(const KernelGraph &graph) const {
+  auto &nodes = graph.execution_order();
+  for (auto const &node : nodes) {
+    std::vector<std::string> formats;
+    std::vector<TypeId> types;
+    std::vector<size_t> tensor_sizes;
+    auto output_num = AnfAlgo::GetOutputTensorNum(node);
+    for (size_t i = 0; i < output_num; ++i) {
+      std::string output_format = AnfAlgo::GetOutputFormat(node, i);
+      auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i);
+      auto tensor_size = AnfAlgo::GetOutputTensorMemSize(node, i);
+      formats.emplace_back(output_format);
+      types.emplace_back(output_type);
+      tensor_sizes.emplace_back(tensor_size);
+    }
+    MS_EXCEPTION_IF_NULL(node);
+    node->set_user_data(std::make_shared<OpRuntimeInfo>(formats, types, tensor_sizes));
+  }
+
+  auto &inputs = graph.inputs();
+  for (const auto &input : inputs) {
+    MS_EXCEPTION_IF_NULL(input);
+    if (!input->isa<Parameter>()) {
+      continue;
+    }
+    std::vector<std::string> formats;
+    std::vector<TypeId> types;
+    std::vector<size_t> tensor_sizes;
+    auto output_size = AnfAlgo::GetOutputTensorNum(input);
+    for (size_t index = 0; index < output_size; index++) {
+      auto format = AnfAlgo::GetOutputFormat(input, index);
+      auto type_id = AnfAlgo::GetOutputDeviceDataType(input, index);
+      if (type_id == kTypeUnknown) {
+        type_id = AnfAlgo::GetOutputInferDataType(input, index);
+      }
+      auto tensor_size = AnfAlgo::GetOutputTensorMemSize(input, index);
+      formats.emplace_back(format);
+      types.emplace_back(type_id);
+      tensor_sizes.emplace_back(tensor_size);
+    }
+    input->set_user_data(std::make_shared<OpRuntimeInfo>(formats, types, tensor_sizes));
+  }
+}
+
 void AscendSession::GetOpInputStubTensors(const CNodePtr &cnode, const std::map<AnfNodePtr, size_t> &parameter_index,
                                           const std::vector<tensor::TensorPtr> &graph_inputs,
                                           const std::map<KernelWithIndex, OutputTensorInfo> &node_output_info,
diff --git a/mindspore/ccsrc/backend/session/ascend_session.h b/mindspore/ccsrc/backend/session/ascend_session.h
index 0a32275df01..9762eda55cb 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.h
+++ b/mindspore/ccsrc/backend/session/ascend_session.h
@@ -131,6 +131,7 @@ class AscendSession : public SessionBasic {
   void LoadGraphsToDbg(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
   void AssignStaticMemory(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
   void UpdateRefOutputMap(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
+  void CacheCNodeOutputInfo(const KernelGraph &graph) const;
   KernelGraphPtr PreBuildOp(const OpRunInfo &op_run_info, const std::vector<tensor::TensorPtr> &input_tensors,
                             const std::vector<int64_t> &tensors_mask);
   void GetOpInputStubTensors(const CNodePtr &cnode, const std::map<AnfNodePtr, size_t> &parameter_index,
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
index 964e5ecf64b..2eff2471469 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
@@ -99,9 +99,12 @@ void KernelRuntime::RunOpMallocPre(const session::KernelGraph &graph,
   for (const auto &node : nodes) {
     auto output_num = AnfAlgo::GetOutputTensorNum(node);
     for (size_t i = 0; i < output_num; ++i) {
-      std::string output_format = AnfAlgo::GetOutputFormat(node, i);
-      auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i);
-      auto tensor_size = AnfAlgo::GetOutputTensorMemSize(node, i);
+      MS_EXCEPTION_IF_NULL(node);
+      auto runtime_info = node->user_data<session::OpRuntimeInfo>();
+      MS_EXCEPTION_IF_NULL(runtime_info);
+      auto const &output_format = runtime_info->output_format(i);
+      auto output_type = runtime_info->output_type(i);
+      auto tensor_size = runtime_info->output_tensor_size(i);
       // Create DeviceAddress without ptr.
       // Get real device ptr after KernelBuild finish.
       auto device_address = CreateDeviceAddress(nullptr, tensor_size, output_format, output_type);
@@ -130,13 +133,13 @@
         AnfAlgo::SetOutputAddr(output_address, index, item.get());
         continue;
       }
-      TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(item, index);
-      if (output_type_id == kTypeUnknown) {
-        output_type_id = AnfAlgo::GetOutputInferDataType(item, index);
-      }
-      auto tensor_size = AnfAlgo::GetOutputTensorMemSize(item, index);
+      auto op_runtime_info = item->user_data<session::OpRuntimeInfo>();
+
+      TypeId output_type_id = op_runtime_info->output_type(index);
+      auto output_tensor_size = op_runtime_info->output_tensor_size(index);
+      auto output_format = op_runtime_info->output_format(index);
       auto device_address =
-        CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id, {item, index});
+        CreateDeviceAddress(nullptr, output_tensor_size, output_format, output_type_id, {item, index});
       AnfAlgo::SetOutputAddr(device_address, index, item.get());
       current_tensor->set_device_address(device_address);
       current_tensor->set_sync_status(kNeedSyncHostToDevice);
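
The patch boils down to a build-time/run-time split: the output format, device dtype, and tensor size of each node are queried once while the op graph is built, cached on the node as user data, and then read back from that cache every time device memory is allocated for a launch. Below is a minimal, self-contained C++ sketch of that pattern; the names in it (RuntimeInfo, Node, CacheOutputInfo, QueryFormat, and so on) are simplified stand-ins for illustration, not the real AnfNode/KernelGraph/AnfAlgo APIs.

#include <cstddef>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Stand-in for the cached per-node output info (mirrors OpRuntimeInfo in the patch).
struct RuntimeInfo {
  std::vector<std::string> formats;
  std::vector<int> types;
  std::vector<size_t> sizes;
};

// Stand-in for a graph node that can carry user data.
struct Node {
  std::string name;
  std::shared_ptr<RuntimeInfo> user_data;  // written once in "build", read in "run"
};

// Pretend these are the relatively expensive per-output queries.
std::string QueryFormat(const Node &, size_t) { return "NCHW"; }
int QueryType(const Node &, size_t) { return 43; /* e.g. a float32 type id */ }
size_t QuerySize(const Node &, size_t) { return 4 * 16 * 16; }

// Build step: run the queries once and attach the result to the node.
void CacheOutputInfo(Node *node, size_t output_num) {
  auto info = std::make_shared<RuntimeInfo>();
  for (size_t i = 0; i < output_num; ++i) {
    info->formats.push_back(QueryFormat(*node, i));
    info->types.push_back(QueryType(*node, i));
    info->sizes.push_back(QuerySize(*node, i));
  }
  node->user_data = info;
}

// Run step: read the cached values instead of re-querying per launch.
void MallocPre(const Node &node) {
  const auto &info = node.user_data;
  for (size_t i = 0; i < info->sizes.size(); ++i) {
    std::cout << node.name << " output " << i << ": format " << info->formats[i]
              << ", type " << info->types[i] << ", size " << info->sizes[i] << '\n';
  }
}

int main() {
  Node add{"Add", nullptr};
  CacheOutputInfo(&add, 1);  // done once when the single-op graph is built
  MallocPre(add);            // done on every launch; no repeated queries
  return 0;
}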