!22867 PyNative RunOp Performance Optimize

Merge pull request !22867 from caifubi/master-pynative-ascend-performance
i-robot 2021-09-06 01:24:02 +00:00 committed by Gitee
commit 682d0c7d4f
5 changed files with 102 additions and 9 deletions
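
In short, the change caches each node's output format, output device dtype, and output tensor size as OpRuntimeInfo user data while the single-op graph is built (PreBuildOp), so the per-launch RunOpMallocPre path can read the cached values instead of re-querying AnfAlgo on every run. A minimal standalone sketch of that build-once, read-many pattern follows; the types and Query* helpers are hypothetical stand-ins, not the MindSpore classes touched by this diff.

// build_once_read_many.cpp -- illustrative only; Node and the Query* helpers are
// hypothetical stand-ins for AnfNode and the AnfAlgo::GetOutput* queries in the diff below.
#include <cstddef>
#include <memory>
#include <string>
#include <vector>

struct OpRuntimeInfo {
  std::vector<std::string> output_format;
  std::vector<int> output_type;  // stand-in for TypeId
  std::vector<size_t> output_tensor_size;
};

struct Node {
  std::shared_ptr<OpRuntimeInfo> runtime_info;  // filled once at build time
};

// Expensive queries, executed only while building the single-op graph.
std::string QueryOutputFormat(const Node &, size_t) { return "DefaultFormat"; }
int QueryOutputType(const Node &, size_t) { return 0; }
size_t QueryOutputSize(const Node &, size_t) { return 1024; }

// Build time: run the queries once and cache the results on the node.
void CacheOutputInfo(Node *node, size_t output_num) {
  auto info = std::make_shared<OpRuntimeInfo>();
  for (size_t i = 0; i < output_num; ++i) {
    info->output_format.push_back(QueryOutputFormat(*node, i));
    info->output_type.push_back(QueryOutputType(*node, i));
    info->output_tensor_size.push_back(QueryOutputSize(*node, i));
  }
  node->runtime_info = std::move(info);
}

// Hot path: every op launch reads the cached values, no re-query.
size_t CachedOutputSize(const Node &node, size_t index) {
  return node.runtime_info->output_tensor_size.at(index);
}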

View File

@@ -2394,5 +2394,26 @@ void AnfRuntimeAlgorithm::CacheAddrForAtomicClean(const AnfNodePtr &node, kernel
}
kernel_mod->set_inputs_addr(kernel_inputs);
}

std::string OpRuntimeInfo::output_format(size_t index) const {
if (index >= output_format_.size()) {
MS_LOG(EXCEPTION) << "Invalid index:" << index << " total output_format:" << output_format_.size();
}
return output_format_[index];
}

TypeId OpRuntimeInfo::output_type(size_t index) const {
if (index >= output_type_.size()) {
MS_LOG(EXCEPTION) << "Invalid index:" << index << " total output_type:" << output_type_.size();
}
return output_type_[index];
}

size_t OpRuntimeInfo::output_tensor_size(size_t index) const {
if (index >= output_tensor_size_.size()) {
MS_LOG(EXCEPTION) << "Invalid index:" << index << " total output_tensor_size:" << output_tensor_size_.size();
}
return output_tensor_size_[index];
}
} // namespace session
} // namespace mindspore

View File

@@ -58,6 +58,28 @@ struct KernelWithIndexCmp {
}
};

class OpRuntimeInfo {
public:
OpRuntimeInfo(std::vector<std::string> output_format, std::vector<TypeId> output_type,
std::vector<size_t> output_tensor_size)
: output_format_(std::move(output_format)),
output_type_(std::move(output_type)),
output_tensor_size_(std::move(output_tensor_size)) {}
~OpRuntimeInfo() = default;
// Key for user data.
constexpr static char key[] = "OpRuntimeInfo";
std::string output_format(size_t index) const;
TypeId output_type(size_t index) const;
size_t output_tensor_size(size_t index) const;

private:
std::vector<std::string> output_format_;
std::vector<TypeId> output_type_;
std::vector<size_t> output_tensor_size_;
};

class AnfRuntimeAlgorithm {
public:
static AnfNodePtr MakeMonadValueNode(const KernelGraphPtr &kg);
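
The constexpr static key member is what lets a node carry the cached info as typed user data: the set_user_data<OpRuntimeInfo> and user_data<OpRuntimeInfo> calls used later in this diff presumably look the object up by T::key. A minimal sketch of such a string-keyed store, written under that assumption (UserDataHolder is hypothetical, not the real AnfNode API):

// user_data_sketch.cpp -- hypothetical simplification of a per-node user-data store.
#include <map>
#include <memory>
#include <string>

class UserDataHolder {
 public:
  // T must expose `constexpr static char key[]`, as OpRuntimeInfo does above.
  template <typename T>
  void set_user_data(const std::shared_ptr<T> &value) {
    data_[T::key] = value;
  }

  template <typename T>
  std::shared_ptr<T> user_data() const {
    auto it = data_.find(T::key);
    return it == data_.end() ? nullptr : std::static_pointer_cast<T>(it->second);
  }

 private:
  std::map<std::string, std::shared_ptr<void>> data_;
};

// Usage mirrors the diff: holder.set_user_data<OpRuntimeInfo>(info); auto cached = holder.user_data<OpRuntimeInfo>();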

View File

@@ -50,6 +50,7 @@
#include "runtime/device/ascend/ascend_stream_assign.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "utils/ms_utils.h"
#include "utils/utils.h"
#include "utils/context/graph_kernel_flags.h"
#include "backend/optimizer/common/helper.h"
#include "runtime/device/kernel_runtime_manager.h"
@@ -866,9 +867,54 @@ KernelGraphPtr AscendSession::PreBuildOp(const OpRunInfo &op_run_info,
opt::RunOpAscendBackendIRFusionOptimization(graph);
SelectKernel(*graph);
RunOpHardwareOptimize(graph);
CacheCNodeOutputInfo(*graph);
return graph;
}

void AscendSession::CacheCNodeOutputInfo(const KernelGraph &graph) const {
auto &nodes = graph.execution_order();
for (auto const &node : nodes) {
MS_EXCEPTION_IF_NULL(node);
std::vector<std::string> formats;
std::vector<TypeId> types;
std::vector<size_t> tensor_sizes;
auto output_num = AnfAlgo::GetOutputTensorNum(node);
for (size_t i = 0; i < output_num; ++i) {
std::string output_format = AnfAlgo::GetOutputFormat(node, i);
auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i);
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(node, i);
formats.emplace_back(output_format);
types.emplace_back(output_type);
tensor_sizes.emplace_back(tensor_size);
}
node->set_user_data<OpRuntimeInfo>(std::make_shared<OpRuntimeInfo>(formats, types, tensor_sizes));
}
auto &inputs = graph.inputs();
for (const auto &input : inputs) {
MS_EXCEPTION_IF_NULL(input);
if (!input->isa<Parameter>()) {
continue;
}
std::vector<std::string> formats;
std::vector<TypeId> types;
std::vector<size_t> tensor_sizes;
auto output_size = AnfAlgo::GetOutputTensorNum(input);
for (size_t index = 0; index < output_size; index++) {
auto format = AnfAlgo::GetOutputFormat(input, index);
auto type_id = AnfAlgo::GetOutputDeviceDataType(input, index);
if (type_id == kTypeUnknown) {
type_id = AnfAlgo::GetOutputInferDataType(input, index);
}
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(input, index);
formats.emplace_back(format);
types.emplace_back(type_id);
tensor_sizes.emplace_back(tensor_size);
}
input->set_user_data<OpRuntimeInfo>(std::make_shared<OpRuntimeInfo>(formats, types, tensor_sizes));
}
}

void AscendSession::GetOpInputStubTensors(const CNodePtr &cnode, const std::map<AnfNodePtr, size_t> &parameter_index,
const std::vector<tensor::TensorPtr> &graph_inputs,
const std::map<KernelWithIndex, OutputTensorInfo> &node_output_info,
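
CacheCNodeOutputInfo above fills the cache in two passes: one over the execution-order kernels and one over the graph's Parameter inputs, where a device dtype may not have been selected yet, hence the fallback from GetOutputDeviceDataType to GetOutputInferDataType. The same fallback in isolation, with stand-in types (illustrative only, ChooseOutputType is hypothetical):

// dtype_fallback.cpp -- stand-in TypeId/kTypeUnknown; the real code uses
// AnfAlgo::GetOutputDeviceDataType and AnfAlgo::GetOutputInferDataType.
#include <cstdint>

using TypeId = int32_t;
constexpr TypeId kTypeUnknown = 0;

// Prefer the kernel-selected device dtype; fall back to the inferred dtype when none is set.
TypeId ChooseOutputType(TypeId device_type, TypeId inferred_type) {
  return device_type == kTypeUnknown ? inferred_type : device_type;
}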

View File

@@ -131,6 +131,7 @@ class AscendSession : public SessionBasic {
void LoadGraphsToDbg(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
void AssignStaticMemory(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
void UpdateRefOutputMap(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
void CacheCNodeOutputInfo(const KernelGraph &graph) const;
KernelGraphPtr PreBuildOp(const OpRunInfo &op_run_info, const std::vector<tensor::TensorPtr> &input_tensors,
const std::vector<int64_t> &tensors_mask);
void GetOpInputStubTensors(const CNodePtr &cnode, const std::map<AnfNodePtr, size_t> &parameter_index,

View File

@@ -99,9 +99,12 @@ void KernelRuntime::RunOpMallocPre(const session::KernelGraph &graph,
for (const auto &node : nodes) {
auto output_num = AnfAlgo::GetOutputTensorNum(node);
for (size_t i = 0; i < output_num; ++i) {
std::string output_format = AnfAlgo::GetOutputFormat(node, i);
auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i);
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(node, i);
MS_EXCEPTION_IF_NULL(node);
auto runtime_info = node->user_data<session::OpRuntimeInfo>();
MS_EXCEPTION_IF_NULL(runtime_info);
auto const &output_format = runtime_info->output_format(i);
auto output_type = runtime_info->output_type(i);
auto tensor_size = runtime_info->output_tensor_size(i);
// Create DeviceAddress without ptr.
// Get real device ptr after KernelBuild finish.
auto device_address = CreateDeviceAddress(nullptr, tensor_size, output_format, output_type);
@@ -130,13 +133,13 @@ void KernelRuntime::RunOpMallocPre(const session::KernelGraph &graph,
AnfAlgo::SetOutputAddr(output_address, index, item.get());
continue;
}
TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(item, index);
if (output_type_id == kTypeUnknown) {
output_type_id = AnfAlgo::GetOutputInferDataType(item, index);
}
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(item, index);
auto op_runtime_info = item->user_data<session::OpRuntimeInfo>();
MS_EXCEPTION_IF_NULL(op_runtime_info);
TypeId output_type_id = op_runtime_info->output_type(index);
auto output_tensor_size = op_runtime_info->output_tensor_size(index);
auto output_format = op_runtime_info->output_format(index);
auto device_address =
CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id, {item, index});
CreateDeviceAddress(nullptr, output_tensor_size, output_format, output_type_id, {item, index});
AnfAlgo::SetOutputAddr(device_address, index, item.get());
current_tensor->set_device_address(device_address);
current_tensor->set_sync_status(kNeedSyncHostToDevice);
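
On the consumer side, RunOpMallocPre now only dereferences the cached OpRuntimeInfo, and the MS_EXCEPTION_IF_NULL guard makes the ordering explicit: CacheCNodeOutputInfo must have run during PreBuildOp before any malloc-pre pass. A standalone sketch of that consumer step with simplified stand-in types (the real code creates a MindSpore DeviceAddress with a null pointer and binds actual device memory after kernel build):

// malloc_pre_sketch.cpp -- illustrative consumer of the cached info; not MindSpore code.
#include <cstddef>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

struct OpRuntimeInfo {
  std::vector<std::string> output_format;
  std::vector<int> output_type;
  std::vector<size_t> output_tensor_size;
};

struct DeviceAddress {
  void *ptr;  // left null here; bound to real device memory after kernel build
  size_t size;
  std::string format;
  int type;
};

std::shared_ptr<DeviceAddress> MakePlaceholderAddress(const std::shared_ptr<OpRuntimeInfo> &info, size_t index) {
  if (info == nullptr) {
    // Mirrors MS_EXCEPTION_IF_NULL: the cache must be populated at build time.
    throw std::runtime_error("OpRuntimeInfo was not cached for this node");
  }
  return std::make_shared<DeviceAddress>(DeviceAddress{nullptr, info->output_tensor_size.at(index),
                                                       info->output_format.at(index), info->output_type.at(index)});
}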