!27977 The Transpose of the DynamicGRUV2 operator does not need to be executed
Merge pull request !27977 from caifubi/master-test-dynamic-gruv2
Commit ebae4ac5ec
@@ -59,6 +59,7 @@ CNodePtr Insert(const FuncGraphPtr &func_graph, const CNodePtr &cnode) {
      NewTransOpNode(func_graph, AnfAlgo::GetInputNode(transdata_node->cast<CNodePtr>(), 0), cnode, kernel_select,
                     false, prim::kPrimTranspose->name(), std::vector<int64_t>{2, 3, 1, 0});
    MS_EXCEPTION_IF_NULL(new_transpose_node);
    // This Transpose operator only changes the shape; it is not expected to change the data arrangement.
    AnfAlgo::SetNodeAttr("nop_op", MakeValue(true), new_transpose_node);
    RefreshKernelBuildInfo(input_format, kOpFormat_HWCN, new_transpose_node);
    // trans hwcn to output_format
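For context on the permutation above: a Transpose with perm {2, 3, 1, 0} maps output dimension i to input dimension perm[i], which is the shape change toward HWCN that the hunk refers to. A minimal, self-contained sketch of that mapping; PermuteShape is a hypothetical helper and the NCHW example shape is an assumption for illustration, not taken from the commit:

// Sketch only: what perm {2, 3, 1, 0} does to a shape. Only the shape is expected to
// change; the commit marks the node "nop_op" and later replaces its launch with a memcpy.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> PermuteShape(const std::vector<int64_t> &shape, const std::vector<int64_t> &perm) {
  std::vector<int64_t> out(perm.size());
  for (size_t i = 0; i < perm.size(); ++i) {
    out[i] = shape[static_cast<size_t>(perm[i])];  // output dim i comes from input dim perm[i]
  }
  return out;
}

int main() {
  std::vector<int64_t> nchw = {16, 32, 3, 3};  // assumed NCHW example shape
  for (int64_t d : PermuteShape(nchw, {2, 3, 1, 0})) {
    std::cout << d << " ";  // prints "3 3 32 16", i.e. HWCN
  }
  std::cout << std::endl;
  return 0;
}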
@@ -18,6 +18,7 @@
#include <algorithm>
#include <set>
#include <unordered_map>
#include "acl/acl_rt.h"
#include "backend/optimizer/ascend/ascend_backend_optimization.h"
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
#include "utils/context/graph_kernel_flags.h"
@@ -633,8 +634,9 @@ void AscendDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) con
void AscendDeviceContext::PreprocessBeforeRunSingleOpGraph(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  const auto &nodes = graph->execution_order();
  // Remove placeholder

  for (const auto &node : nodes) {
    // Remove placeholder
    auto op_name = AnfAlgo::GetCNodeName(node);
    static const std::set<std::string> place_holder_nodes = {kDynamicRNNOpName, kDynamicGRUV2OpName};
    auto iter = place_holder_nodes.find(op_name);
@@ -652,6 +654,11 @@ void AscendDeviceContext::PreprocessBeforeRunSingleOpGraph(const KernelGraphPtr
      }
      node->set_inputs(new_inputs);
    }

    // Save the nop_op nodes that need a memcpy.
    if (op_name == prim::kPrimTranspose->name() && AnfAlgo::HasNodeAttr("nop_op", node)) {
      nop_op_to_memcpy_.insert(node);
    }
  }

  device::ascend::InsertAtomicCleanOps(nodes, &node_atomics_persistent_cache_);
@@ -677,7 +684,7 @@ std::shared_ptr<Bucket> AscendDeviceContext::CreateBucket(uint32_t bucket_id, ui
  return bucket;
}

bool AscendDeviceContext::SyncRuning() const {
bool AscendDeviceContext::PySyncRuning() const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  if ((ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) &&
@@ -687,6 +694,23 @@ bool AscendDeviceContext::SyncRuning() const {
  return true;
}

bool AscendDeviceContext::MemoryCopyAsync(const CNodePtr &node, const vector<AddressPtr> &inputs,
                                          const vector<AddressPtr> &outputs) const {
  if (inputs.size() != 1 || outputs.size() != 1) {
    MS_LOG(ERROR) << "Kernel " << node->fullname_with_scope() << " input output size should be 1 but"
                  << " input size is:" << inputs.size() << " output size is:" << outputs.size();
    return false;
  }

  aclError status = aclrtMemcpyAsync(outputs[0]->addr, outputs[0]->size, inputs[0]->addr, inputs[0]->size,
                                     ACL_MEMCPY_DEVICE_TO_DEVICE, compute_stream_);
  if (status != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "MemCpyAsync op aclrtMemcpyAsync failed, ret:" << status;
    return false;
  }
  return PySyncRuning();
}

bool AscendDeviceContext::LaunchKernel(const CNodePtr &kernel, const vector<AddressPtr> &inputs,
                                       const vector<AddressPtr> &workspace, const vector<AddressPtr> &outputs,
                                       bool is_dynamic_shape) const {
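MemoryCopyAsync only enqueues the copy on compute_stream_; whether the caller then waits is delegated to PySyncRuning(). A standalone sketch of the same ACL runtime calls, assuming a valid stream and two device buffers of `size` bytes; the helper name is illustrative, and it synchronizes the stream explicitly instead of reusing the device context's sync policy:

// Standalone sketch, not part of the commit: a device-to-device copy with the ACL runtime.
#include <cstddef>
#include "acl/acl_rt.h"

bool CopyDeviceToDevice(void *dst, const void *src, size_t size, aclrtStream stream) {
  // Enqueue an asynchronous device-to-device copy on the given stream.
  aclError ret = aclrtMemcpyAsync(dst, size, src, size, ACL_MEMCPY_DEVICE_TO_DEVICE, stream);
  if (ret != ACL_ERROR_NONE) {
    return false;
  }
  // Block until the copy finishes; the commit instead defers synchronization to PySyncRuning().
  return aclrtSynchronizeStream(stream) == ACL_ERROR_NONE;
}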
@@ -712,7 +736,7 @@ bool AscendDeviceContext::LaunchKernel(const CNodePtr &kernel, const vector<Addr
    dynamic_kernel->UpdateArgs();
    dynamic_kernel->Execute();
    dynamic_kernel->PostExecute();
    return SyncRuning();
    return PySyncRuning();
  }

  std::vector<AddressPtr> real_inputs;
@@ -732,13 +756,16 @@ bool AscendDeviceContext::LaunchKernel(const CNodePtr &kernel, const vector<Addr
  }

  std::lock_guard<std::mutex> locker(launch_mutex_);
  if (nop_op_to_memcpy_.find(kernel) != nop_op_to_memcpy_.end()) {
    return MemoryCopyAsync(kernel, real_inputs, outputs);
  }
  auto ret = kernel_mod->Launch(real_inputs, workspace, outputs, compute_stream_);
  if (!ret) {
    MS_LOG(ERROR) << "Launch kernel failed, kernel full name: " << kernel->fullname_with_scope();
    return false;
  }

  return SyncRuning();
  return PySyncRuning();
}

void AscendDeviceContext::BindDeviceToCurrentThread() const {
@@ -136,7 +136,8 @@ class AscendDeviceContext : public DeviceContext {
  void LoadModel(const NotNull<KernelGraphPtr> &root_graph) const;
  void UpdateExecOrder(const KernelGraphPtr &graph) const;
  static bool IsGraphMode();
  bool SyncRuning() const;
  bool PySyncRuning() const;
  bool MemoryCopyAsync(const CNodePtr &node, const vector<AddressPtr> &inputs, const vector<AddressPtr> &outputs) const;

  void ReportErrorMessage() const;
  void ReportWarningMessage() const;
@@ -161,6 +162,7 @@ class AscendDeviceContext : public DeviceContext {
  // Persistent cache for single op execution.
  // node_atomics_ will be cleaned up in CompileGraph.
  mutable std::map<CNodePtr, std::vector<CNodePtr>> node_atomics_persistent_cache_;
  mutable std::set<CNodePtr> nop_op_to_memcpy_;
  // Some NOP nodes are hidden from the execution order and have no output device address. This function creates
  // output device addresses for these nodes; each output device address is the same as the corresponding input
  // device address.
  void AssignOutputNopNodeDeviceAddress(const KernelGraphPtr &graph) const;
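The new nop_op_to_memcpy_ member is mutable because the methods that fill and read it (PreprocessBeforeRunSingleOpGraph, LaunchKernel) are const member functions, matching the existing node_atomics_persistent_cache_. A minimal standard-C++ sketch of that pattern; the class and names below are illustrative, not from the commit:

// Sketch only: why a cache touched from const member functions must be mutable.
#include <set>

class LaunchCache {
 public:
  // Both methods are const, mirroring the device context's API; the insert is
  // still allowed because the member below is declared mutable.
  void Remember(int node_id) const { nop_nodes_.insert(node_id); }
  bool IsNop(int node_id) const { return nop_nodes_.count(node_id) != 0; }

 private:
  mutable std::set<int> nop_nodes_;
};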