PyNative Ascend Task Sink

parent 855e06c7ca
commit 2d6fbcd9b3

@@ -550,6 +550,20 @@ bool AnfRuntimeAlgorithm::WorkspaceAddrExist(const AnfNodePtr &node, size_t outp
 const DeviceAddress *AnfRuntimeAlgorithm::GetPrevNodeOutputAddr(const AnfNodePtr &anf_node, size_t input_idx,
                                                                 bool skip_nop_node) {
   KernelWithIndex kernel_with_index = common::AnfAlgo::GetPrevNodeOutput(anf_node, input_idx);
   MS_EXCEPTION_IF_NULL(kernel_with_index.first);
+  if (kernel_with_index.first->isa<ValueNode>()) {
+    auto value_node = kernel_with_index.first->cast<ValueNodePtr>();
+    MS_EXCEPTION_IF_NULL(value_node);
+    auto value = value_node->value();
+    MS_EXCEPTION_IF_NULL(value);
+    if (value->isa<tensor::Tensor>()) {
+      auto tensor = value->cast<tensor::TensorPtr>();
+      MS_EXCEPTION_IF_NULL(tensor);
+      if (tensor->is_forward_output()) {
+        return dynamic_cast<const DeviceAddress *>(tensor->device_address().get());
+      }
+    }
+  }
   return AnfRuntimeAlgorithm::GetOutputAddr(kernel_with_index.first, kernel_with_index.second, skip_nop_node);
 }

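The new branch above prefers the device address a forward-output tensor already owns over the address recorded for the producing kernel. A stripped-down, self-contained sketch of that lookup order follows; the Fake* types and ResolveInputAddr are stand-ins for illustration, not the real AnfAlgo API:

#include <iostream>
#include <memory>

struct FakeAddress {};
struct FakeTensor {
  bool is_forward_output{false};
  std::shared_ptr<FakeAddress> device_address;
};

// Prefer the address the forward step already produced for the tensor baked
// into a ValueNode; otherwise fall back to the producing kernel's output
// address, mirroring the branch added in the hunk above.
const FakeAddress *ResolveInputAddr(const FakeTensor *tensor, const FakeAddress *kernel_output_addr) {
  if (tensor != nullptr && tensor->is_forward_output) {
    return tensor->device_address.get();
  }
  return kernel_output_addr;
}

int main() {
  FakeAddress kernel_out;
  FakeTensor t;
  t.is_forward_output = true;
  t.device_address = std::make_shared<FakeAddress>();
  std::cout << (ResolveInputAddr(&t, &kernel_out) == t.device_address.get()) << std::endl;  // prints 1
  return 0;
}
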
@@ -454,13 +454,6 @@ std::vector<std::vector<tensor::TensorPtr>> GetRunGraphInputs(const GraphCompile
   return input_tensor_lists;
 }
 
-bool IsAutoParallel() {
-  auto parallel_context = parallel::ParallelContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(parallel_context);
-  auto parallel_mode = parallel_context->parallel_mode();
-  return parallel_mode == parallel::kSemiAutoParallel || parallel_mode == parallel::kAutoParallel;
-}
 }  // namespace
 
 VectorRef MsBackend::MsRunGraph(const GraphId &g, const VectorRef &args, const std::string &target) {

@@ -605,6 +598,7 @@ const ActorInfo &MindRTBackend::CompileGraphs(const FuncGraphPtr &func_graph) {
   MS_EXCEPTION_IF_NULL(context_ptr);
   ms_execution_mode_ = context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE);
   real_execution_mode_ = ms_execution_mode_;
+  func_graph->set_flag(kFlagPyNativeRunInGraph, real_execution_mode_ == kPynativeMode);
 
   // Compile root graph.
   graph_id_to_device_context_.clear();

@@ -615,9 +609,9 @@ const ActorInfo &MindRTBackend::CompileGraphs(const FuncGraphPtr &func_graph) {
     device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_});
   MS_EXCEPTION_IF_NULL(device_context);
   bool all_support = device_context->PartitionGraph(func_graph);
-  auto run_mode = device_context->GetRunMode(func_graph);
   if (all_support) {
-    if (run_mode == device::RunMode::kGraphMode) {
+    auto run_mode = device_context->GetRunMode(func_graph);
+    if (run_mode == device::RunMode::kGraphMode && pynative::GraphAdapter::PyNativeEnableTaskSink(func_graph)) {
       auto graph_id = graph_compiler_->CompileWholeGraphForGraphRunMode(func_graph, device_context);
       graph_id_to_device_context_[graph_id] = device_context;
     } else {

@@ -1004,8 +998,9 @@ void MindRTBackend::RunGraphByActors(const ActorInfo &actor_info, const GraphCom
   graph->set_flag(kFlagPyNativeRunInGraph, true);
   graph->set_flag(kFlagIsPynativeBpropGraph, root_graph_->has_flag(kFlagIsPynativeBpropGraph));
 
-  // The size of control_nodes is at least 1 since there is return node in the graph.
-  if (control_nodes_.size() == 1 && graphs.size() == 1) {
+  // KernelByKernel: The size of control_nodes is at least 1 since there is return node in the graph.
+  // GraphMode: No control nodes.
+  if (control_nodes_.size() <= 1 && graphs.size() == 1) {
     MS_LOG(INFO) << "Replace parameter format";
     // The input tensors of heterogeneous graphs or control flow graphs are null.
     // Need to get tensor after ParseControlNodes.

@@ -1121,7 +1116,7 @@ void MindRTBackend::RunGraphBySingleOp(const GraphCompilerInfo &graph_compiler_i
 
       // Save grad node to Bucket
       if (graph->has_flag(kFlagIsPynativeBpropGraph) && (!common::AnfAlgo::IsControlOpExecInBackend(kernel)) &&
-          !kernel->is_parallel() && IsAutoParallel()) {
+          !kernel->is_parallel() && pynative::GraphAdapter::IsAutoParallel()) {
        graph_compiler_->AddGradAddrToBucket(graph->graph_id(), graph_output_info.graph_output_tensors);
      }
    }

@@ -951,13 +951,6 @@ void SetRunMode(const FuncGraphPtr &func_graph, compile::Backend *backend_ptr) {
     context_ptr->set_param<bool>(MS_CTX_ENABLE_LOOP_SINK, enable_loop_sink);
     backend_ptr->set_is_multi_graph_sink(is_multi_graph_sink);
   };
   auto mode = context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE);
-  // PYNATIVE: no need set any context.
-  if (mode == kPynativeMode) {
-    MS_LOG(INFO) << "Run graph mode with pynative.";
-    set_ctx(false, false, false);
-    return;
-  }
-
   const auto &all_nodes = TopoSort(func_graph->return_node(), SuccDeeperSimple, AlwaysInclude);
   // GPU/CPU no need set any context.

@@ -802,7 +802,7 @@ FuncGraphPtr GradExecutor::GetBpropGraph(const prim::GradOperationPtr &grad, con
   if (top_cell()->dynamic_shape()) {
     bprop_graph->set_flag(FUNC_GRAPH_FLAG_DYNAMIC_SHAPE, true);
   }
-  if (top_cell()->is_dynamic_structure()) {
+  if (top_cell()->is_real_dynamic_structure()) {
     bprop_graph->set_flag(kFlagIsDynamicStructure, true);
   }
   // Do opt for final bprop graph

@@ -918,7 +918,7 @@ void GradExecutor::CheckNeedCompileGraph() {
     EraseTopCellFromTopCellList(pre_top_cell);
     pre_top_cell->Clear();
     already_run_top_cell_[already_top_cell_id] = new_top_cell;
-    top_cell()->set_dynamic_structure(true);
+    top_cell()->set_is_real_dynamic_structure(true);
   } else {
     MS_LOG(DEBUG) << "The op info has not been changed, no need to compile graph again";
     pre_top_cell->set_input_args_id(new_top_cell->input_args_id());

@@ -96,6 +96,10 @@ class TopCellInfo {
   size_t grad_order() const { return grad_order_; }
   bool is_dynamic_structure() const { return is_dynamic_structure_; }
   void set_dynamic_structure(bool is_dynamic_structure) { is_dynamic_structure_ = is_dynamic_structure; }
+  bool is_real_dynamic_structure() const { return is_real_dynamic_structure_; }
+  void set_is_real_dynamic_structure(bool is_real_dynamic_structure) {
+    is_real_dynamic_structure_ = is_real_dynamic_structure;
+  }
   bool dynamic_shape() const { return dynamic_shape_; }
   void set_dynamic_shape(bool dynamic_shape) { dynamic_shape_ = dynamic_shape; }
   bool hook_changed() const { return hook_changed_; }

@@ -184,6 +188,8 @@ class TopCellInfo {
  private:
   bool is_topest_{false};
   bool is_dynamic_structure_{false};
+  // Set this flag to true when all_op_info of top_cell is changed.
+  bool is_real_dynamic_structure_{false};
   bool dynamic_shape_{false};
   bool vm_compiled_{false};
   bool hook_changed_{false};

@@ -258,6 +258,8 @@ void AscendKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id) {
   } else {
     MS_LOG(DEBUG) << "GraphId:" << graph_id << " not found";
   }
+
+  rt_model_zero_copy_.Release(graph_id);
 }
 
 void *AscendKernelRuntime::GetModelStream(uint32_t graph_id) const {

@@ -575,6 +577,11 @@ bool AscendKernelRuntime::LoadTask(const session::KernelGraph &graph) {
     MS_LOG(EXCEPTION) << "Distribute Task Failed, \nerror msg: " << e.what();
   }
 
+  if (!rt_model_zero_copy_.GenerateZeroCopyTasks(graph)) {
+    MS_LOG(ERROR) << "Generate ZeroCopyTask failed, graph id " << graph.graph_id();
+    return false;
+  }
+
 #ifndef ENABLE_SECURITY
   if (ProfilingManager::GetInstance().IsProfilingInitialized()) {
     auto task_ids = ModelRunner::Instance().GetTaskIdList(model_iter->first);

@@ -1065,6 +1072,11 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph &graph) {
     return false;
   }
 
+  if (!rt_model_zero_copy_.UpdateTaskArgs(graph, compute_stream())) {
+    MS_LOG(ERROR) << "Update rtModel task args failed, graph id " << graph.graph_id();
+    return false;
+  }
+
   try {
     ModelRunner::Instance().RunModel(graph.graph_id());
   } catch (const std::exception &) {

@@ -27,6 +27,7 @@
 #include "runtime/device/kernel_runtime.h"
 #include "runtime/context.h"
 #include "plugin/device/ascend/hal/device/ge_runtime/davinci_model.h"
+#include "plugin/device/ascend/hal/device/tasksink/rtmodel_zero_copy.h"
 #include "runtime/device/kernel_runtime_manager.h"
 #include "backend/common/session/session_basic.h"
 #ifndef ENABLE_SECURITY

@@ -114,6 +115,7 @@ class AscendKernelRuntime : public KernelRuntime {
   static bool DeleteDumpDir(const std::string &path);
   static int DeleteDumpFile(std::string path);
   static std::string GetRealPath(const std::string &path);
+  void CreateDefaultStream(uint32_t device_id);
 
   rtContext_t rt_context_{nullptr};
   bool initialized_{false};

@@ -128,7 +130,7 @@ class AscendKernelRuntime : public KernelRuntime {
   std::map<uint32_t, std::shared_ptr<std::map<uint32_t, void *>>> device_stream_id_map_;
   std::map<uint32_t, void *> stream_id_map_;
   std::set<uint32_t> initialized_device_set_{};
-  void CreateDefaultStream(uint32_t device_id);
+  tasksink::RtModelZeroCopy rt_model_zero_copy_;
 };
 
 MS_REG_KERNEL_RUNTIME(kAscendDevice, AscendKernelRuntime);

@@ -481,7 +481,7 @@ void AscendStreamAssign::InsertEventForNonTaskSink(const NotNull<KernelGraphPtr>
 }
 
 void AscendStreamAssign::AssignStream(const NotNull<KernelGraphPtr> &graph_ptr) {
-  if (graph_ptr->has_flag(kFlagPyNativeRunInGraph)) {
+  if (graph_ptr->has_flag(kFlagPyNativeRunInGraph) && !graph_ptr->is_graph_run_mode()) {
     AssignStreamForPynativeGraph(graph_ptr.get());
     return;
   }

@@ -60,6 +60,15 @@ const std::vector<uint32_t> &ModelRunner::GetTaskIdList(uint32_t model_id) const
   return model_iter->second->GetTaskIdList();
 }
 
+const std::vector<std::shared_ptr<Task>> &ModelRunner::GetTaskList(uint32_t model_id) const {
+  auto model_iter = runtime_models_.find(model_id);
+  if (model_iter == runtime_models_.end()) {
+    MS_LOG(EXCEPTION) << "Model id " << model_id << " not found.";
+  }
+  MS_EXCEPTION_IF_NULL(model_iter->second);
+  return model_iter->second->GetTaskList();
+}
+
 const std::vector<uint32_t> &ModelRunner::GetStreamIdList(uint32_t model_id) const {
   decltype(runtime_models_)::const_iterator model_iter = runtime_models_.find(model_id);
   if (model_iter == runtime_models_.cend()) {

@@ -23,6 +23,7 @@
 #include <tuple>
 #include <string>
 #include "plugin/device/ascend/hal/device/ge_runtime/davinci_model.h"
+#include "plugin/device/ascend/hal/device/ge_runtime/task/task.h"
 
 namespace mindspore::ge::model_runner {
 class RuntimeModel;

@@ -38,6 +39,8 @@ class ModelRunner {
 
   void LoadModelComplete(uint32_t model_id);
 
+  const std::vector<std::shared_ptr<Task>> &GetTaskList(uint32_t model_id) const;
+
   const std::vector<uint32_t> &GetTaskIdList(uint32_t model_id) const;
 
   const std::vector<uint32_t> &GetStreamIdList(uint32_t model_id) const;

@@ -293,5 +293,7 @@ void RuntimeModel::RtEventDestory() noexcept {
 
 const std::vector<uint32_t> &RuntimeModel::GetTaskIdList() const { return task_id_list_; }
 
+const std::vector<std::shared_ptr<Task>> &RuntimeModel::GetTaskList() const { return task_list_; }
+
 const std::vector<uint32_t> &RuntimeModel::GetStreamIdList() const { return stream_id_list_; }
 }  // namespace mindspore::ge::model_runner

@@ -36,6 +36,7 @@ class RuntimeModel {
   void Load(uint32_t device_id, uint64_t session_id, const std::shared_ptr<DavinciModel> &davinci_model);
   void DistributeTask();
   void LoadComplete();
+  const std::vector<std::shared_ptr<Task>> &GetTaskList() const;
   const std::vector<uint32_t> &GetTaskIdList() const;
   const std::vector<uint32_t> &GetStreamIdList() const;
   const std::map<std::string, std::shared_ptr<RuntimeInfo>> &GetRuntimeInfoMap() const { return runtime_info_map_; }

@@ -30,7 +30,8 @@ AicpuTask::AicpuTask(const ModelContext &model_context, const std::shared_ptr<Ai
       stream_(nullptr),
       args_(nullptr),
       ext_info_(nullptr),
-      input_output_addr_(nullptr) {
+      input_output_addr_(nullptr),
+      args_size_(0) {
   MS_EXCEPTION_IF_NULL(task_info_);
 
   auto stream_list = model_context.stream_list();

@@ -60,20 +61,20 @@ void AicpuTask::Distribute() {
   (void)io_addrs.insert(io_addrs.cend(), task_info_->output_data_addrs().cbegin(),
                         task_info_->output_data_addrs().cend());
   auto io_addrs_num = static_cast<uint32_t>(io_addrs.size());
-  auto io_addrs_size = static_cast<uint32_t>(io_addrs_num * sizeof(void *));
+  io_addrs_size_ = static_cast<uint32_t>(io_addrs_num * sizeof(void *));
   constexpr uint32_t io_addr_offset = sizeof(aicpu::AicpuParamHead);
-  uint32_t node_def_len_offset = io_addr_offset + io_addrs_size;
+  uint32_t node_def_len_offset = io_addr_offset + io_addrs_size_;
   uint32_t node_def_addr_offset = node_def_len_offset + sizeof(uint32_t);
-  uint32_t args_size = sizeof(aicpu::AicpuParamHead) + io_addrs_size +
-                       static_cast<uint32_t>(task_info_->node_def().size()) + sizeof(uint32_t);
+  args_size_ = sizeof(aicpu::AicpuParamHead) + io_addrs_size_ + static_cast<uint32_t>(task_info_->node_def().size()) +
+               sizeof(uint32_t);
 
   // Malloc device memory for args
-  rtError_t rt_ret = rtMalloc(&args_, args_size, RT_MEMORY_HBM);
+  rtError_t rt_ret = rtMalloc(&args_, args_size_, RT_MEMORY_HBM);
   if (rt_ret != RT_ERROR_NONE) {
     MS_LOG(EXCEPTION) << "Call rt api rtMalloc failed, ret: " << rt_ret;
   }
 
-  SetAicpuParamHead(args_size, io_addrs_num);
+  SetAicpuParamHead(args_size_, io_addrs_num);
   SetInputOutputAddrs(io_addrs, io_addr_offset);
   SetNodeDef(node_def_len_offset, node_def_addr_offset);

@@ -82,12 +83,12 @@ void AicpuTask::Distribute() {
   auto dump_flag = task_info_->dump_flag() ? RT_KERNEL_DUMPFLAG : RT_KERNEL_DEFAULT;
   auto cpu_flag = task_info_->cust_aicpu() ? RT_KERNEL_CUSTOM_AICPU : dump_flag;
 
-  MS_LOG(INFO) << "Distribute AicpuTask start, args_size = " << args_size << ", io_addrs_num =" << io_addrs_num
+  MS_LOG(INFO) << "Distribute AicpuTask start, args_size = " << args_size_ << ", io_addrs_num =" << io_addrs_num
                << ", so_name = " << task_info_->so_name() << ", kernel_name = " << task_info_->kernel_name()
                << ", dump_flag = " << dump_flag;
   rtArgsEx_t argsInfo = {};
   argsInfo.args = args_;
-  argsInfo.argsSize = args_size;
+  argsInfo.argsSize = args_size_;
   rt_ret = rtCpuKernelLaunchWithFlag(static_cast<const void *>(task_info_->so_name().data()),
                                      static_cast<const void *>(task_info_->kernel_name().data()), 1, &argsInfo, nullptr,
                                      stream_, cpu_flag);

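For reference, a minimal standalone sketch of the flat args layout the offsets in AicpuTask::Distribute assume. The head size, address count, and node_def size below are made-up stand-ins, not values from this commit:

#include <cstdint>
#include <cstdio>

int main() {
  // Stand-in sizes; the real values come from sizeof(aicpu::AicpuParamHead)
  // and the serialized node_def of the kernel.
  const uint32_t param_head_size = 40;
  const uint32_t io_addrs_num = 3;  // e.g. 2 inputs + 1 output
  const uint32_t io_addrs_size = io_addrs_num * static_cast<uint32_t>(sizeof(void *));
  const uint32_t node_def_size = 128;

  // Same offset arithmetic as above:
  // [ AicpuParamHead | io addrs | node_def length (uint32) | node_def bytes ]
  const uint32_t io_addr_offset = param_head_size;
  const uint32_t node_def_len_offset = io_addr_offset + io_addrs_size;
  const uint32_t node_def_addr_offset = node_def_len_offset + static_cast<uint32_t>(sizeof(uint32_t));
  const uint32_t args_size =
    param_head_size + io_addrs_size + static_cast<uint32_t>(sizeof(uint32_t)) + node_def_size;

  std::printf("io@%u node_def_len@%u node_def@%u args_size=%u\n", io_addr_offset, node_def_len_offset,
              node_def_addr_offset, args_size);
  return 0;
}
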
@@ -33,6 +33,8 @@ class AicpuTask : public TaskRepeater<AicpuTaskInfo> {
 
   void *Args() const override { return input_output_addr_; }
 
+  size_t ArgsSize() const override { return io_addrs_size_; }
+
   std::string task_name() const override { return task_info_->op_name(); }
 
  private:

@@ -46,6 +48,8 @@ class AicpuTask : public TaskRepeater<AicpuTaskInfo> {
   void *args_;
   void *ext_info_;
   void *input_output_addr_;
+  size_t io_addrs_size_;
+  size_t args_size_;
 };
 }  // namespace mindspore::ge::model_runner
 #endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_GE_RUNTIME_AICPU_TASK_H_

@@ -33,6 +33,8 @@ class Task {
 
   virtual void *Args() const { return nullptr; }
 
+  virtual size_t ArgsSize() const { return 0; }
+
   virtual std::string task_name() const { return ""; }
 
   void set_model_handle(rtModel_t model_handle) { model_handle_ = model_handle; }

@@ -27,7 +27,8 @@ TbeTask::TbeTask(const ModelContext &model_context, const std::shared_ptr<TbeTas
       task_info_(task_info),
       stream_(nullptr),
       stub_func_(nullptr),
-      args_(nullptr) {
+      args_(nullptr),
+      args_size_(0) {
   MS_EXCEPTION_IF_NULL(task_info);
 
   auto stream_list = model_context.stream_list();

@@ -74,14 +75,14 @@ void TbeTask::Distribute() {
                              task_info_->output_data_addrs().cend());
   tensor_device_addrs.insert(tensor_device_addrs.cend(), task_info_->workspace_addrs().cbegin(),
                              task_info_->workspace_addrs().cend());
-  auto args_size = static_cast<uint32_t>(tensor_device_addrs.size() * sizeof(void *));
+  args_size_ = static_cast<uint32_t>(tensor_device_addrs.size() * sizeof(void *));
 
-  rt_ret = rtMalloc(&args_, args_size, RT_MEMORY_HBM);
+  rt_ret = rtMalloc(&args_, args_size_, RT_MEMORY_HBM);
   if (rt_ret != RT_ERROR_NONE) {
-    MS_LOG(EXCEPTION) << "Call rt api rtMalloc failed, ret: " << rt_ret << " mem size " << args_size;
+    MS_LOG(EXCEPTION) << "Call rt api rtMalloc failed, ret: " << rt_ret << " mem size " << args_size_;
   }
 
-  rt_ret = aclrtMemcpy(args_, args_size, static_cast<void *>(tensor_device_addrs.data()), args_size,
+  rt_ret = aclrtMemcpy(args_, args_size_, static_cast<void *>(tensor_device_addrs.data()), args_size_,
                        ACL_MEMCPY_HOST_TO_DEVICE);
   if (rt_ret != RT_ERROR_NONE) {
     MS_LOG(EXCEPTION) << "Call rt api rtMemcpy failed, ret: " << rt_ret;

@@ -91,10 +92,10 @@ void TbeTask::Distribute() {
   auto dump_flag = task_info_->dump_flag() ? RT_KERNEL_DUMPFLAG : RT_KERNEL_DEFAULT;
   rtArgsEx_t args_info = {};
   args_info.args = args_;
-  args_info.argsSize = args_size;
+  args_info.argsSize = args_size_;
   rt_ret = rtKernelLaunchWithFlag(stub_func_, task_info_->block_dim(), &args_info, nullptr, stream_, dump_flag);
   if (rt_ret != RT_ERROR_NONE) {
-    MS_LOG(EXCEPTION) << "Call rt api rtKernelLaunch failed, ret: " << rt_ret << " mem size " << args_size;
+    MS_LOG(EXCEPTION) << "Call rt api rtKernelLaunch failed, ret: " << rt_ret << " mem size " << args_size_;
   }
   MS_LOG(INFO) << "[DataDump] task name: " << task_info_->op_name() << " dump_flag: " << dump_flag;
 }

@@ -32,6 +32,8 @@ class TbeTask : public TaskRepeater<TbeTaskInfo> {
 
   void *Args() const override { return args_; }
 
+  size_t ArgsSize() const override { return args_size_; }
+
   std::string task_name() const override { return task_info_->op_name(); }
 
  private:

@@ -39,6 +41,7 @@ class TbeTask : public TaskRepeater<TbeTaskInfo> {
   void *stream_;
   void *stub_func_;
   void *args_;
+  size_t args_size_;
 };
 }  // namespace mindspore::ge::model_runner
 #endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_GE_RUNTIME_TASK_TBE_TASK_H_

@@ -0,0 +1,283 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plugin/device/ascend/hal/device/tasksink/rtmodel_zero_copy.h"

#include <vector>
#include <map>
#include <algorithm>
#include "runtime/rt.h"
#include "external/acl/acl_rt.h"
#include "ir/tensor.h"
#include "include/common/utils/anfalgo.h"
#include "runtime/device/kernel_info.h"
#include "backend/common/session/anf_runtime_algorithm.h"
#include "plugin/device/ascend/hal/device/ge_runtime/model_runner.h"
#include "plugin/device/ascend/hal/device/tasksink/task_generator.h"

namespace mindspore {
namespace device {
namespace ascend {
namespace tasksink {
using TaskPtr = std::shared_ptr<ge::model_runner::Task>;
namespace {
bool IsForwardOutputValueNode(const AnfNodePtr &input) {
  MS_EXCEPTION_IF_NULL(input);
  if (input->isa<ValueNode>()) {
    auto value_node = input->cast<ValueNodePtr>();
    MS_EXCEPTION_IF_NULL(value_node);
    auto value = value_node->value();
    MS_EXCEPTION_IF_NULL(value);
    if (value->isa<tensor::Tensor>()) {
      auto tensor = value->cast<tensor::TensorPtr>();
      MS_EXCEPTION_IF_NULL(tensor);
      if (tensor->is_forward_output()) {
        return true;
      }
    }
  }
  return false;
}

bool CheckTaskValid(const CNodePtr &node, const std::vector<void *> &args_datas) {
  MS_EXCEPTION_IF_NULL(node);
  bool task_valid = true;
  // Check input/output/workspace
  auto input_addrs = TaskGenerator::GetTaskInput(node);
  auto output_addrs = TaskGenerator::GetTaskOutput(node);
  auto workspace_addrs = TaskGenerator::GetTaskWorkspace(node);

  std::vector<AddressPtr> node_addresses;
  std::move(input_addrs.begin(), input_addrs.end(), std::back_inserter(node_addresses));
  std::move(output_addrs.begin(), output_addrs.end(), std::back_inserter(node_addresses));
  std::move(workspace_addrs.begin(), workspace_addrs.end(), std::back_inserter(node_addresses));

  if (node_addresses.size() != args_datas.size()) {
    MS_LOG(ERROR) << "Check failed, Node " << node->UniqueName() << " total addr size " << node_addresses.size()
                  << " is not equal to " << args_datas.size();
    return false;
  }

  for (size_t i = 0; i < node_addresses.size(); ++i) {
    auto node_address = node_addresses[i];
    MS_EXCEPTION_IF_NULL(node_address);
    if (node_address->addr != args_datas[i]) {
      MS_LOG(ERROR) << "Node " << node->UniqueName() << " addr " << node_address->addr << " not equal to addr of task "
                    << args_datas[i];
      task_valid = false;
    }
  }

  return task_valid;
}

bool NeedSkipZeroCopy(const CNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  if (common::AnfAlgo::IsNonTaskOp(node)) {
    MS_LOG(INFO) << "Skip generate ZeroCopyTask for NonTaskOp " << node->fullname_with_scope();
    return true;
  }
  auto kernel_type = AnfAlgo::GetKernelType(node);
  if (kernel_type != KernelType::TBE_KERNEL && kernel_type != KernelType::AICPU_KERNEL) {
    MS_LOG(INFO) << "Skip generate ZeroCopyTask for " << node->fullname_with_scope();
    return true;
  }
  return false;
}
}  // namespace

void *ParameterZeroCopyTask::GetAddressPtr() {
  auto node = anf_node_.lock();
  MS_EXCEPTION_IF_NULL(node);
  if (!node->isa<Parameter>()) {
    MS_LOG(EXCEPTION) << "Not a parameter node " << node->DebugString();
  }
  auto kernel_info = dynamic_cast<KernelInfo *>(node->kernel_info());
  MS_EXCEPTION_IF_NULL(kernel_info);
  auto parameter_address = kernel_info->GetOutputAddr(0);
  MS_EXCEPTION_IF_NULL(parameter_address);
  return parameter_address->GetMutablePtr();
}

void *ValueNodeZeroCopyTask::GetAddressPtr() {
  auto node = anf_node_.lock();
  MS_EXCEPTION_IF_NULL(node);
  if (!node->isa<ValueNode>()) {
    MS_LOG(EXCEPTION) << "Not a ValueNode " << node->DebugString();
  }

  auto value_node = node->cast<ValueNodePtr>();
  MS_EXCEPTION_IF_NULL(value_node);
  auto value = value_node->value();
  auto tensor = value->cast<tensor::TensorPtr>();
  MS_EXCEPTION_IF_NULL(tensor);
  auto value_node_address = tensor->device_address();
  MS_EXCEPTION_IF_NULL(value_node_address);
  return value_node_address->GetMutablePtr();
}

bool ZeroCopyTask::UpdateArgs(void *stream) {
  device_ptr_ = GetAddressPtr();
  if (device_ptr_ == nullptr) {
    MS_LOG(ERROR) << "Device address ptr is null, task " << task_name_;
    return false;
  }

  if (device_ptr_ == previous_ptr_) {
    MS_LOG(DEBUG) << "No need to update task of " << task_name_;
    return true;
  }

  auto ret = aclrtMemcpyAsync(static_cast<uint8_t *>(args_base_) + args_offset_, sizeof(void *), &device_ptr_,
                              sizeof(void *), ACL_MEMCPY_HOST_TO_DEVICE, stream);
  if (ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Update task " << task_name_ << " aclrtMemcpy failed, ret " << ret;
    return false;
  }

  previous_ptr_ = device_ptr_;
  MS_LOG(INFO) << "Update task " << task_name_ << " args_offset " << args_offset_ << " device_ptr " << device_ptr_
               << " success";
  return true;
}

bool RtModelZeroCopy::GenerateZeroCopyTasks(const session::KernelGraph &graph) {
  if (!graph.has_flag(kFlagPyNativeRunInGraph)) {
    MS_LOG(INFO) << "RtModelZeroCopy is not enabled";
    return true;
  }

  std::vector<ZeroCopyTaskPtr> zero_copy_tasks;
  auto task_lists = ge::model_runner::ModelRunner::Instance().GetTaskList(graph.graph_id());
  std::map<std::string, TaskPtr> op_name_to_task;
  std::transform(task_lists.begin(), task_lists.end(), std::inserter(op_name_to_task, op_name_to_task.end()),
                 [](const TaskPtr &task) { return std::make_pair(task->task_name(), task); });

  auto nodes = graph.execution_order();
  for (const auto &node : nodes) {
    if (NeedSkipZeroCopy(node)) {
      continue;
    }

    MS_EXCEPTION_IF_NULL(node);
    auto op_name = node->UniqueName();
    auto iter = op_name_to_task.find(op_name);
    if (iter == op_name_to_task.end()) {
      MS_LOG(EXCEPTION) << "Cannot found task of op " << op_name;
    }

    auto task = iter->second;
    MS_EXCEPTION_IF_NULL(task);
    auto input_num = common::AnfAlgo::GetInputTensorNum(node);
    for (size_t i = 0; i < input_num; ++i) {
      auto input_index_in_graph = AnfAlgo::GetInputIndexInGraph(node, i);
      auto input = common::AnfAlgo::GetPrevNodeOutput(node, input_index_in_graph, true).first;
      MS_EXCEPTION_IF_NULL(input);
      if (input->isa<Parameter>()) {
        zero_copy_tasks.emplace_back(std::make_shared<tasksink::ParameterZeroCopyTask>(
          input, task->Args(), i * sizeof(void *), task->task_name()));
        MS_LOG(INFO) << "Generate ZeroCopyTask for Node " << node->fullname_with_scope() << " Parameter "
                     << input->DebugString();
      } else if (IsForwardOutputValueNode(input)) {
        zero_copy_tasks.emplace_back(std::make_shared<tasksink::ValueNodeZeroCopyTask>(
          input, task->Args(), i * sizeof(void *), task->task_name()));
        MS_LOG(INFO) << "Generate ZeroCopyTask for Node " << node->fullname_with_scope() << " ValueNode "
                     << input->DebugString();
      }
    }
  }

  auto iter = graph_zero_copy_tasks_.try_emplace(graph.graph_id(), zero_copy_tasks);
  if (!iter.second) {
    MS_LOG(ERROR) << "Generate ZeroCopyTask failed, Duplicate graph id " << graph.graph_id();
    return false;
  }
  return true;
}

bool RtModelZeroCopy::UpdateTaskArgs(const session::KernelGraph &graph, void *stream) const {
  if (!graph.has_flag(kFlagPyNativeRunInGraph)) {
    MS_LOG(INFO) << "RtModelZeroCopy is not enabled, no need to update task args.";
    return true;
  }

  auto iter = graph_zero_copy_tasks_.find(graph.graph_id());
  if (iter == graph_zero_copy_tasks_.end()) {
    MS_LOG(ERROR) << "No zero copy tasks found. graph id " << graph.graph_id();
    return false;
  }

  auto zero_copy_tasks = iter->second;
  if (std::any_of(zero_copy_tasks.begin(), zero_copy_tasks.end(),
                  [stream](const ZeroCopyTaskPtr &task) { return !task->UpdateArgs(stream); })) {
    MS_LOG(ERROR) << "Update task args failed";
    return false;
  }

  MS_LOG(INFO) << "Check rtMode valid " << ((rtStreamSynchronize(stream) == RT_ERROR_NONE) && CheckRtModelValid(graph));
  return true;
}

bool RtModelZeroCopy::CheckRtModelValid(const session::KernelGraph &graph) {
  auto graph_id = graph.graph_id();
  auto tasks = ge::model_runner::ModelRunner::Instance().GetTaskList(graph_id);
  std::map<std::string, TaskPtr> op_name_to_task;
  std::transform(tasks.begin(), tasks.end(), std::inserter(op_name_to_task, op_name_to_task.end()),
                 [](const TaskPtr &task) { return std::make_pair(task->task_name(), task); });

  auto nodes = graph.execution_order();
  bool task_valid = true;
  for (const auto &node : nodes) {
    if (NeedSkipZeroCopy(node)) {
      continue;
    }

    MS_EXCEPTION_IF_NULL(node);
    auto unique_name = node->UniqueName();
    auto iter = op_name_to_task.find(unique_name);
    if (iter == op_name_to_task.end()) {
      MS_LOG(ERROR) << "Cannot found task of op " << unique_name;
      task_valid = false;
      continue;
    }

    auto task = iter->second;
    MS_EXCEPTION_IF_NULL(task);
    auto task_args = task->Args();
    auto task_size = task->ArgsSize();
    if (task_size == 0) {
      // For example InitDataSet (AiCpu kernel).
      MS_LOG(INFO) << "task name " << task->task_name() << " task_size is 0";
      continue;
    }
    std::vector<void *> args_datas(task_size / sizeof(void *), nullptr);
    if (aclrtMemcpy(args_datas.data(), task_size, task_args, task_size, ACL_MEMCPY_DEVICE_TO_HOST) != ACL_ERROR_NONE) {
      MS_LOG(ERROR) << "aclrtMemcpy failed, task " << task->task_name() << " task_size " << task_size;
      return false;
    }

    if (!CheckTaskValid(node, args_datas)) {
      task_valid = false;
    }
  }
  return task_valid;
}

void RtModelZeroCopy::Release(uint32_t graph_id) { (void)graph_zero_copy_tasks_.erase(graph_id); }
}  // namespace tasksink
}  // namespace ascend
}  // namespace device
}  // namespace mindspore

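A host-only analogue of ZeroCopyTask::UpdateArgs above, to make the single-slot patch concrete. Plain memcpy stands in for aclrtMemcpyAsync and host arrays stand in for device memory; everything here is illustrative, not MindSpore code:

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // "Task args": the pointer table the launched kernel dereferences.
  std::vector<void *> task_args(4, nullptr);
  void *args_base = task_args.data();

  int step0_input = 10;
  int step1_input = 20;

  // Step 0: slot 1 holds the address of this step's input tensor.
  void *device_ptr = &step0_input;
  const size_t args_offset = 1 * sizeof(void *);
  std::memcpy(static_cast<uint8_t *>(args_base) + args_offset, &device_ptr, sizeof(void *));

  // Step 1: the input lives at a different address now; patch only the
  // pointer-sized slot instead of copying the whole tensor into the old address.
  device_ptr = &step1_input;
  std::memcpy(static_cast<uint8_t *>(args_base) + args_offset, &device_ptr, sizeof(void *));

  std::cout << *static_cast<int *>(task_args[1]) << std::endl;  // prints 20
  return 0;
}
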
@@ -0,0 +1,102 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_TASKSINK_RTMODEL_ZERO_COPY_H_
#define MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_TASKSINK_RTMODEL_ZERO_COPY_H_

#include <memory>
#include <utility>
#include <string>
#include <vector>
#include "ir/anf.h"
#include "runtime/device/device_address.h"
#include "backend/common/session/kernel_graph.h"

namespace mindspore {
namespace device {
namespace ascend {
namespace tasksink {
class ZeroCopyTask {
 public:
  ZeroCopyTask(AnfNodeWeakPtr anf_node, void *args_base, size_t args_offset, std::string task_name)
      : anf_node_(std::move(anf_node)),
        args_base_(args_base),
        args_offset_(args_offset),
        task_name_(std::move(task_name)) {}
  ~ZeroCopyTask() = default;

  // Update the address in task args
  bool UpdateArgs(void *stream);

  virtual void *GetAddressPtr() = 0;

 protected:
  // Parameter or ValueNode
  AnfNodeWeakPtr anf_node_;

 private:
  void *args_base_;
  size_t args_offset_;
  std::string task_name_;
  void *device_ptr_{nullptr};
  void *previous_ptr_{nullptr};
};
using ZeroCopyTaskPtr = std::shared_ptr<ZeroCopyTask>;

class ParameterZeroCopyTask : public ZeroCopyTask {
 public:
  ParameterZeroCopyTask(const AnfNodeWeakPtr &anf_node, void *args_base, size_t args_offset,
                        const std::string &task_name)
      : ZeroCopyTask(anf_node, args_base, args_offset, task_name) {}
  void *GetAddressPtr() override;
};

class ValueNodeZeroCopyTask : public ZeroCopyTask {
 public:
  ValueNodeZeroCopyTask(const AnfNodeWeakPtr &anf_node, void *args_base, size_t args_offset,
                        const std::string &task_name)
      : ZeroCopyTask(anf_node, args_base, args_offset, task_name) {}
  void *GetAddressPtr() override;
};

// Update the device address in task without copying data.
// Usually we assume that the address in the task is constant.
// If the address of graph input changed, we need to copy data of graph input tensor to the address of the task.
// In fact, when the operator is executed, the real address is obtained from the task args.
// Task args is a secondary pointer, which stores the input address or output address of the operator.
// If we can update the input address in task args, we can avoid data copying.
class RtModelZeroCopy {
 public:
  RtModelZeroCopy() = default;
  ~RtModelZeroCopy() = default;

  // Generate ZeroCopyTasks after the tasks of rtModel is Distributed. (Need to get task args address)
  bool GenerateZeroCopyTasks(const session::KernelGraph &graph);
  // Copy device address ptr to task args if the ptr changed.
  bool UpdateTaskArgs(const session::KernelGraph &graph, void *stream) const;
  // Check rtModel after update task args. The process of checking is consistent with the process of generating tasks.
  static bool CheckRtModelValid(const session::KernelGraph &graph);
  // Release resource after the graph is destructed.
  void Release(uint32_t graph_id);

 private:
  mindspore::HashMap<uint32_t, std::vector<ZeroCopyTaskPtr>> graph_zero_copy_tasks_;
};
}  // namespace tasksink
}  // namespace ascend
}  // namespace device
}  // namespace mindspore
#endif  // MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_DEVICE_TASKSINK_RTMODEL_ZERO_COPY_H_

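The comment block above describes task args as a table of secondary pointers; CheckRtModelValid in the .cc reads that table back from the device and compares it slot by slot against the addresses the kernel graph expects. A self-contained sketch of the same comparison — the function name and the readback step are stand-ins, since the real code gets its host copy via aclrtMemcpy DEVICE_TO_HOST:

#include <cstddef>
#include <iostream>
#include <vector>

// Compare expected kernel addresses with a host copy of the task args table.
bool CheckArgsMatch(const std::vector<void *> &expected, const std::vector<void *> &readback) {
  if (expected.size() != readback.size()) {
    std::cout << "size mismatch: " << expected.size() << " vs " << readback.size() << std::endl;
    return false;
  }
  bool valid = true;
  for (size_t i = 0; i < expected.size(); ++i) {
    if (expected[i] != readback[i]) {
      std::cout << "slot " << i << " mismatch" << std::endl;
      valid = false;  // keep scanning so every bad slot is reported
    }
  }
  return valid;
}

int main() {
  int a = 0, b = 0;
  std::vector<void *> expected{&a, &b};
  std::vector<void *> readback{&a, &b};  // stand-in for the device readback
  std::cout << std::boolalpha << CheckArgsMatch(expected, readback) << std::endl;  // true
  return 0;
}
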
@@ -162,13 +162,82 @@ void TaskGenerator::LaunchAddrCleanKernel(const CNodePtr &anf_node_ptr, AddressP
   }
 }
 
+AddressPtrList TaskGenerator::GetTaskInput(const CNodePtr &node) {
+  MS_EXCEPTION_IF_NULL(node);
+  AddressPtrList kernel_inputs;
+  auto op_name = common::AnfAlgo::GetCNodeName(node);
+  if (op_name == kAtomicAddrCleanOpName) {
+    LaunchAddrCleanKernel(node, &kernel_inputs);
+    return kernel_inputs;
+  }
+
+  size_t input_num = common::AnfAlgo::GetInputTensorNum(node);
+  for (size_t i = 0; i < input_num; ++i) {
+    if (common::AnfAlgo::IsNoneInput(node, i)) {
+      continue;
+    }
+    auto input_index_in_graph = AnfAlgo::GetInputIndexInGraph(node, i);
+    auto device_address = AnfAlgo::GetPrevNodeOutputAddr(node, input_index_in_graph);
+    AddressPtr input = std::make_shared<Address>();
+    MS_EXCEPTION_IF_NULL(input);
+    input->addr = device_address->ptr_;
+    input->size = device_address->size_;
+
+    auto prenode_with_index = common::AnfAlgo::GetPrevNodeOutput(node, input_index_in_graph);
+    MS_EXCEPTION_IF_NULL(prenode_with_index.first);
+    if (AnfUtils::IsRealCNodeKernel(prenode_with_index.first)) {
+      if (common::AnfAlgo::IsNonTaskOp(prenode_with_index.first->cast<CNodePtr>())) {
+        // use memory offset to implement NonTask Type Split op
+        // when op A -> split(NonTask) -> op B, op B's input addr is split's input0's addr + offset
+        // offset is split's output index * split's output size
+        auto split_input0_device_address = AnfAlgo::GetPrevNodeOutputAddr(prenode_with_index.first, 0);
+        MS_EXCEPTION_IF_NULL(split_input0_device_address);
+        input->addr =
+          static_cast<uint8_t *>(split_input0_device_address->ptr_) + (prenode_with_index.second * input->size);
+        MS_LOG(INFO) << "Change " << node->fullname_with_scope() << "'s input " << i << " address to "
+                     << split_input0_device_address->ptr_ << " + " << prenode_with_index.second * input->size;
+      }
+    }
+    kernel_inputs.push_back(input);
+  }
+  return kernel_inputs;
+}
+
+AddressPtrList TaskGenerator::GetTaskOutput(const CNodePtr &node) {
+  AddressPtrList kernel_outputs;
+  // No kernel output if output of the cnode is monad, such as LabelSwitch.
+  if (!HasAbstractMonad(node)) {
+    size_t output_num = common::AnfAlgo::GetOutputTensorNum(node);
+    for (size_t i = 0; i < output_num; ++i) {
+      auto it = AnfAlgo::GetOutputAddr(node, i, false);
+      AddressPtr output = std::make_shared<Address>();
+      output->addr = it->ptr_;
+      output->size = it->size_;
+      kernel_outputs.push_back(output);
+    }
+  }
+  return kernel_outputs;
+}
+
+AddressPtrList TaskGenerator::GetTaskWorkspace(const CNodePtr &node) {
+  AddressPtrList kernel_workspaces;
+  auto kernel_mod = AnfAlgo::GetKernelMod(node);
+  MS_EXCEPTION_IF_NULL(kernel_mod);
+  for (size_t i = 0; i < kernel_mod->GetWorkspaceSizeList().size(); ++i) {
+    auto device_address = AnfAlgo::GetWorkspaceAddr(node, i);
+    kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
+    MS_EXCEPTION_IF_NULL(workspace);
+    workspace->addr = device_address->ptr_;
+    workspace->size = device_address->size_;
+    kernel_workspaces.push_back(workspace);
+  }
+  return kernel_workspaces;
+}
+
 bool TaskGenerator::LaunchKernel(const CNodePtr &anf_node_ptr, uint32_t stream_id,
                                  std::vector<TaskInfoPtr> *task_info_list) {
   MS_EXCEPTION_IF_NULL(task_info_list);
   MS_EXCEPTION_IF_NULL(anf_node_ptr);
+  AddressPtrList kernel_inputs;
+  AddressPtrList kernel_workspaces;
+  AddressPtrList kernel_outputs;
   auto kernel_mod = AnfAlgo::GetKernelMod(anf_node_ptr);
   MS_EXCEPTION_IF_NULL(kernel_mod);
   kernel_mod->set_unique_name(anf_node_ptr->UniqueName());

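A worked example of the NonTask Split offset rule in GetTaskInput above, with made-up sizes (the 96-byte buffer, index 2, and 32-byte outputs are illustrative values only):

#include <cstdint>
#include <iostream>

int main() {
  // op A -> Split(NonTask, 3 equal outputs) -> op B reading Split output 2.
  // B's input addr = Split's input0 addr + output_index * output_size.
  uint8_t a_output[96];              // stand-in for A's 96-byte device buffer
  const size_t output_index = 2;     // prenode_with_index.second
  const size_t output_size = 32;     // input->size of B's input
  uint8_t *b_input_addr = a_output + output_index * output_size;
  std::cout << "offset = " << output_index * output_size
            << ", addr = " << static_cast<void *>(b_input_addr) << std::endl;
  return 0;
}
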
@@ -185,59 +254,16 @@ bool TaskGenerator::LaunchKernel(const CNodePtr &anf_node_ptr, uint32_t stream_i
     return true;
   }
 
-  if (op_name != kAtomicAddrCleanOpName) {
-    size_t input_num = common::AnfAlgo::GetInputTensorNum(anf_node_ptr);
-    for (size_t i = 0; i < input_num; ++i) {
-      if (common::AnfAlgo::IsNoneInput(anf_node_ptr, i)) {
-        continue;
-      }
-      auto input_index_in_graph = AnfAlgo::GetInputIndexInGraph(anf_node_ptr, i);
-      auto device_address = AnfAlgo::GetPrevNodeOutputAddr(anf_node_ptr, input_index_in_graph);
-      AddressPtr input = std::make_shared<Address>();
-      MS_EXCEPTION_IF_NULL(input);
-      input->addr = device_address->ptr_;
-      input->size = device_address->size_;
-      AddressPtrList kernel_inputs;
-      AddressPtrList kernel_workspaces;
-      AddressPtrList kernel_outputs;
-
-      auto prenode_with_index = common::AnfAlgo::GetPrevNodeOutput(anf_node_ptr, input_index_in_graph);
-      MS_EXCEPTION_IF_NULL(prenode_with_index.first);
-      if (AnfUtils::IsRealCNodeKernel(prenode_with_index.first)) {
-        if (common::AnfAlgo::IsNonTaskOp(prenode_with_index.first->cast<CNodePtr>())) {
-          // use memory offset to implement NonTask Type Split op
-          // when op A -> split(NonTask) -> op B, op B's input addr is split's input0's addr + offset
-          // offset is split's output index * split's output size
-          auto split_input0_device_address = AnfAlgo::GetPrevNodeOutputAddr(prenode_with_index.first, 0);
-          MS_EXCEPTION_IF_NULL(split_input0_device_address);
-          input->addr =
-            static_cast<uint8_t *>(split_input0_device_address->ptr_) + (prenode_with_index.second * input->size);
-          MS_LOG(INFO) << "Change " << anf_node_ptr->fullname_with_scope() << "'s input " << i << " address to "
-                       << split_input0_device_address->ptr_ << " + " << prenode_with_index.second * input->size;
-        }
-      }
-      kernel_inputs.push_back(input);
-    }
-
-    // No kernel output if output of the cnode is monad, such as LabelSwitch.
-    if (!HasAbstractMonad(anf_node_ptr)) {
-      size_t output_num = common::AnfAlgo::GetOutputTensorNum(anf_node_ptr);
-      for (size_t i = 0; i < output_num; ++i) {
-        auto it = AnfAlgo::GetOutputAddr(anf_node_ptr, i, false);
-        AddressPtr output = std::make_shared<Address>();
-        output->addr = it->ptr_;
-        output->size = it->size_;
-        kernel_outputs.push_back(output);
-      }
-    }
-
-    for (size_t i = 0; i < kernel_mod->GetWorkspaceSizeList().size(); ++i) {
-      auto device_address = AnfAlgo::GetWorkspaceAddr(anf_node_ptr, i);
-      kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
-      MS_EXCEPTION_IF_NULL(workspace);
-      workspace->addr = device_address->ptr_;
-      workspace->size = device_address->size_;
-      kernel_workspaces.push_back(workspace);
-    }
-  } else {
+  if (op_name == kAtomicAddrCleanOpName) {
     LaunchAddrCleanKernel(anf_node_ptr, &kernel_inputs);
+  } else {
+    kernel_inputs = GetTaskInput(anf_node_ptr);
+    kernel_workspaces = GetTaskWorkspace(anf_node_ptr);
+    kernel_outputs = GetTaskOutput(anf_node_ptr);
   }
 
   auto ascend_kernel_mod = dynamic_cast<kernel::AscendKernelMod *>(kernel_mod);

@@ -251,6 +277,7 @@ bool TaskGenerator::LaunchKernel(const CNodePtr &anf_node_ptr, uint32_t stream_i
     MS_LOG(ERROR) << "Empty task_info_ptrs.";
     return false;
   }
+  MS_LOG(INFO) << "Node " << anf_node_ptr->fullname_with_scope() << " get task " << task_info_ptrs.front()->op_name();
   debug_info->op_name_ = anf_node_ptr->fullname_with_scope();
   debug_info->task_num_ = task_info_ptrs.size();
   debug_info->stream_id_ = task_info_ptrs[0]->stream_id();

@@ -63,6 +63,10 @@ class TaskGenerator {
   std::vector<TaskDebugInfoPtr> GetTaskDebugInfo() const { return task_debug_info_list_; }
   static void DumpTaskInfo(const string &real_filename, const std::vector<TaskDebugInfoPtr> &task_debug_info_list);
 
+  static AddressPtrList GetTaskInput(const CNodePtr &node);
+  static AddressPtrList GetTaskOutput(const CNodePtr &node);
+  static AddressPtrList GetTaskWorkspace(const CNodePtr &node);
+
  private:
   std::vector<TaskDebugInfoPtr> task_debug_info_list_;
   static void LaunchAddrCleanKernel(const CNodePtr &anf_node_ptr, AddressPtrList *kernel_inputs);

@@ -97,7 +97,7 @@ bool AscendDeviceContext::PartitionGraph(const FuncGraphPtr &func_graph) const {
 RunMode AscendDeviceContext::GetRunMode(const FuncGraphPtr &func_graph) const {
   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
-  if (ms_context->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && IsGraphMode() && !IsDynamicShapeGraph(func_graph)) {
+  if (ms_context->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && !IsDynamicShapeGraph(func_graph)) {
     return RunMode::kGraphMode;
   } else {
     return RunMode::kKernelMode;

@@ -33,6 +33,7 @@ namespace device {
 namespace ascend {
+using KernelGraph = mindspore::session::KernelGraph;
 
 namespace {
 CNodePtr GetNextLabelSet(const std::vector<CNodePtr> &kernel_nodes, uint32_t index) {
   size_t node_sizes = kernel_nodes.size();
   if (index >= node_sizes - 1) {

@@ -162,6 +163,25 @@ void InitMemReuseExecOrder(KernelGraph *kernel_graph) {
   UnfoldRecursiveExecOrder(kernel_graph);
 }
 
+void EnableGraphInputZeroCopy(const KernelGraphPtr &graph) {
+  MS_EXCEPTION_IF_NULL(graph);
+  // Zero copy is only enabled for PyNative.
+  if (!graph->has_flag(kFlagPyNativeRunInGraph) || !graph->is_graph_run_mode()) {
+    return;
+  }
+  const auto &input_nodes = graph->input_nodes();
+  for (const auto &input : input_nodes) {
+    MS_EXCEPTION_IF_NULL(input);
+    if (AnfAlgo::OutputAddrExist(input, 0)) {
+      auto input_address = AnfAlgo::GetMutableOutputAddr(input, 0);
+      MS_EXCEPTION_IF_NULL(input_address);
+      input_address->set_is_ptr_persisted(false);
+      MS_LOG(INFO) << "Enable zero copy for input " << input->DebugString();
+    }
+  }
+}
 }  // namespace
 
 void AscendGraphExecutor::Initialize() {
   res_manager_ = dynamic_cast<AscendDeviceResManager *>(device_context_->device_res_manager_.get());
   MS_EXCEPTION_IF_NULL(res_manager_);

@@ -217,6 +237,7 @@ void AscendGraphExecutor::PreprocessBeforeRun(const KernelGraphPtr &graph) const
   AllocateGraphMemory(NOT_NULL(graph));
   LoadModel(NOT_NULL(graph));
   AssignOutputNopNodeDeviceAddress(graph, device_context_);
+  EnableGraphInputZeroCopy(graph);
 }
 
 void AscendGraphExecutor::UpdateExecOrder(const KernelGraphPtr &graph) const {

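Why clearing is_ptr_persisted enables zero copy for these graph inputs — a hypothetical policy sketch, not MindSpore's actual DeviceAddress implementation:

#include <iostream>

struct FakeDeviceAddress {
  void *ptr{nullptr};
  bool is_ptr_persisted{false};
};

// With a persisted pointer the runtime keeps the old device pointer and must
// copy fresh input data into it every step; without it, the address itself can
// be retargeted and the ZeroCopyTask patches the task args slot instead.
void BindStepInput(FakeDeviceAddress *addr, void *new_step_ptr) {
  if (addr->is_ptr_persisted) {
    std::cout << "copy tensor data into fixed ptr " << addr->ptr << std::endl;
  } else {
    addr->ptr = new_step_ptr;
    std::cout << "retarget address to " << addr->ptr << std::endl;
  }
}

int main() {
  int step_input = 0;
  FakeDeviceAddress addr;
  BindStepInput(&addr, &step_input);  // retargets, no data copy
  return 0;
}
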
@@ -220,12 +220,17 @@ const std::vector<size_t> &HcclKernel::GetOutputSizeList() const {
 const std::vector<size_t> &HcclKernel::GetWorkspaceSizeList() const {
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
-  bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
-  auto mode = context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE);
-  if (!mutable_workspace_size_list_.empty() || hccl_data_type_list_.empty() || (!is_task_sink && mode == kGraphMode) ||
-      mode == kPynativeMode) {
+  auto hccl_node = anf_node_.lock();
+  MS_EXCEPTION_IF_NULL(hccl_node);
+  auto func_graph = hccl_node->func_graph();
+  auto kernel_graph = func_graph->cast<KernelGraphPtr>();
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  auto graph_run_mode = kernel_graph->is_graph_run_mode();
+  // Not task sink mode.
+  if (!mutable_workspace_size_list_.empty() || hccl_data_type_list_.empty() || !graph_run_mode) {
     return mutable_workspace_size_list_;
   }
+  // Task sink mode.
   mutable_workspace_size_list_.emplace_back(
     hccl::HcclAdapter::GetInstance().CalcWorkspaceSize(anf_node_.lock(), hccl_data_type_list_[0]));
   return mutable_workspace_size_list_;

@@ -149,7 +149,8 @@ bool EliminateGraphOutputTransdata::Run(const FuncGraphPtr &func_graph) {
   std::vector<PrimitivePtr> return_type = {prim::kPrimMakeTuple};
   auto item_with_index = common::AnfAlgo::VisitKernelWithReturnType(func_graph->output(), 0, false, return_type);
   if (!common::AnfAlgo::CheckPrimitiveType(item_with_index.first, prim::kPrimMakeTuple)) {
-    MS_LOG(EXCEPTION) << "Graph output is not a MakeTuple";
+    MS_LOG(INFO) << "Graph output is not a MakeTuple";
+    return true;
   }
 
   auto transdata_ref_count = GetTransdataRefCount(func_graph);

@@ -695,8 +695,12 @@ GraphId GraphCompiler::CompileWholeGraphForGraphRunMode(const FuncGraphPtr &func
   opt::BackendCommonOptimization(root_graph);
   root_graph->SetInputNodes();
 
-  auto graph_id = CompileGraphImpl(root_graph, device_context);
-
+  GraphId graph_id = 0;
+  if (!func_graph->has_flag(kFlagPyNativeRunInGraph)) {
+    graph_id = CompileGraphImpl(root_graph, device_context);
+  } else {
+    graph_id = root_graph->graph_id();
+  }
   // Set summary nodes for all graphs.
   session_->SetSummaryNodesForAllGraphs(root_graph.get(), all_graphs);
 

@@ -704,13 +708,22 @@
   // for ascend mindRT.
   session_->DumpGraphs(all_graphs);
 
-  // Cache the backend graph output nodes to front nodes with output index.
-  auto output = func_graph->output();
-  MS_EXCEPTION_IF_NULL(output);
-  auto backend_node = root_graph->output();
-  MS_EXCEPTION_IF_NULL(backend_node);
-  root_graph->CacheGraphOutputToFrontNodeWithIndex({backend_node}, {output});
-  AnfAlgo::UpdateGraphValidRefPair(root_graph);
+  if (!func_graph->has_flag(kFlagPyNativeRunInGraph)) {
+    // Cache the backend graph output nodes to front nodes with output index.
+    auto output = func_graph->output();
+    MS_EXCEPTION_IF_NULL(output);
+    auto backend_node = root_graph->output();
+    MS_EXCEPTION_IF_NULL(backend_node);
+    root_graph->CacheGraphOutputToFrontNodeWithIndex({backend_node}, {output});
+    AnfAlgo::UpdateGraphValidRefPair(root_graph);
+  } else {
+    for (auto &node : root_graph->execution_order()) {
+      if (common::AnfAlgo::IsControlOpExecInBackend(node)) {
+        root_graph->set_flag(kFlagsIsCutGraph, true);
+      }
+    }
+    root_graph->set_front_outputs({func_graph->output()});
+  }
 
   MS_LOG(INFO) << "Status record: end compile graph. graph id: " << graph_id;
   return graph_id;

@@ -22,6 +22,7 @@
 #include "ir/tensor.h"
 #include "include/common/utils/convert_utils.h"
 #include "include/common/utils/anfalgo.h"
+#include "include/common/utils/parallel_context.h"
 #include "backend/common/session/anf_runtime_algorithm.h"
 #include "runtime/graph_scheduler/device_tensor_store.h"
 #include "runtime/device/ms_device_shape_transfer.h"

@@ -330,4 +331,27 @@ void GraphAdapter::ReplaceGraphParameterProperties(const KernelGraphPtr &graph,
     }
   }
 }
+
+bool GraphAdapter::IsAutoParallel() {
+  auto parallel_context = parallel::ParallelContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(parallel_context);
+  auto parallel_mode = parallel_context->parallel_mode();
+  return parallel_mode == parallel::kSemiAutoParallel || parallel_mode == parallel::kAutoParallel;
+}
+
+bool GraphAdapter::PyNativeEnableTaskSink(const FuncGraphPtr &func_graph) {
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  bool pynative_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode;
+  if (!pynative_mode) {
+    return true;
+  }
+
+  std::vector<AnfNodePtr> node_list = TopoSort(func_graph->get_return());
+  auto is_cut_graph = std::any_of(node_list.begin(), node_list.end(), [](const AnfNodePtr &node) {
+    return common::AnfAlgo::IsControlOpExecInBackend(node);
+  });
+
+  return !IsAutoParallel() && !is_cut_graph && !func_graph->has_flag(kFlagIsDynamicStructure);
+}
 }  // namespace mindspore::pynative

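The conditions PyNativeEnableTaskSink combines, condensed into a single predicate. This is an illustration of the gate only; the function name and boolean parameters are stand-ins, not code from this commit:

// Task sink for a PyNative bprop graph is possible only when no condition
// forces kernel-by-kernel execution.
bool PyNativeTaskSinkAllowed(bool is_pynative_mode, bool is_auto_parallel, bool has_backend_control_op,
                             bool is_dynamic_structure) {
  if (!is_pynative_mode) {
    return true;  // graph mode makes this decision elsewhere
  }
  return !is_auto_parallel && !has_backend_control_op && !is_dynamic_structure;
}
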
@@ -33,6 +33,8 @@ class GraphAdapter {
   static void RemoveUnusedValueNodes(const KernelGraphPtr &graph);
   static void HandleHeterogeneousTensors(const std::vector<std::vector<tensor::TensorPtr>> &tensors,
                                          const std::vector<device::DeviceContext *> &device_contexts);
+  static bool PyNativeEnableTaskSink(const FuncGraphPtr &func_graph);
+  static bool IsAutoParallel();
 };
 }  // namespace mindspore::pynative
 #endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_PYNATIVE_GRAPH_ADAPTER_H_

@@ -160,6 +160,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
     "../../../mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.cc"
     "../../../mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_pool.cc"
     "../../../mindspore/ccsrc/plugin/device/ascend/hal/device/lic_manager.cc"
+    "../../../mindspore/ccsrc/plugin/device/ascend/hal/device/tasksink/rtmodel_zero_copy.cc"
     "../../../mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_device_context.cc"
     "../../../mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_device_res_manager.cc"
     "../../../mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc"

@@ -27,6 +27,10 @@ bool TaskGenerator::GenTasks(const std::vector<CNodePtr> &anf_node_list, std::ve
                              uint32_t graph_id) {
   return true;
 }
+
+AddressPtrList TaskGenerator::GetTaskInput(const CNodePtr &node) { return {}; }
+AddressPtrList TaskGenerator::GetTaskOutput(const CNodePtr &node) { return {}; }
+AddressPtrList TaskGenerator::GetTaskWorkspace(const CNodePtr &node) { return {}; }
 }  // namespace tasksink
 #ifndef ENABLE_SECURITY
 void DataDumper::LoadDumpInfo() {}