Async Data Dump

2020-07-06 21:57:32 +08:00 · 2020-07-06 21:57:32 +08:00 · c577952c9a
parent da9452ee5e
commit c577952c9a
44 changed files with 1201 additions and 135 deletions
--- a/build.sh
+++ b/build.sh
@ -24,7 +24,7 @@ usage()
 {
  echo "Usage:"
  echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
-  echo "              [-a on|off] [-Q on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
+  echo "              [-a on|off] [-Q on|off] [-S on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
  echo "              [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E] [-l on|off]"
  echo ""
  echo "Options:"
@ -48,6 +48,7 @@ usage()
  echo "    -P Enable dump anf graph to file in ProtoBuffer format, default on"
  echo "    -Q Enable dump memory, default off"
  echo "    -D Enable dumping of function graph ir, default on"
+  echo "    -S Enable async data dump, default off"
  echo "    -z Compile dataset & mindrecord, default on"
  echo "    -M Enable MPI and NCCL for GPU training, gpu default on"
  echo "    -V Specify the minimum required cuda version, default CUDA 10.1"
@ -88,6 +89,7 @@ checkopts()
  ENABLE_TIMELINE="off"
  ENABLE_DUMP2PROTO="on"
  ENABLE_DUMPE2E="off"
+  ENABLE_DATA_DUMP="off"
  ENABLE_DUMP_IR="on"
  COMPILE_MINDDATA="on"
  ENABLE_MPI="off"
@ -102,7 +104,7 @@ checkopts()
  ENABLE_PYTHON="on"

  # Process the options
-  while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:sB:E' opt
+  while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:S:D:zM:V:K:sB:E' opt
  do
    OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
    case "${opt}" in
@ -218,6 +220,11 @@ checkopts()
        ENABLE_DUMPE2E="$OPTARG"
        echo "enable dump end to end"
        ;;
+      S)
+        check_on_off $OPTARG S
+        ENABLE_DATA_DUMP="$OPTARG"
+        echo "enable data dump"
+        ;;
      D)
        check_on_off $OPTARG D
        ENABLE_DUMP_IR="$OPTARG"
@ -321,6 +328,9 @@ build_mindspore()
    if [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then
        CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_E2E=ON"
    fi
+    if [[ "X$ENABLE_DATA_DUMP" = "Xon" ]]; then
+        CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DATA_DUMP=ON"
+    fi
    CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_IR=${ENABLE_DUMP_IR}"
    CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_PYTHON=${ENABLE_PYTHON}"
    if [[ "X$ENABLE_MPI" = "Xon" ]]; then
--- a/cmake/options.cmake
+++ b/cmake/options.cmake
@ -116,6 +116,10 @@ if(ENABLE_DUMP_E2E)
    add_compile_definitions(ENABLE_DUMP_E2E)
 endif()

+# Expose the ENABLE_DATA_DUMP build option to the sources as a compile definition of the same name.
+if(ENABLE_DATA_DUMP)
+    add_compile_definitions(ENABLE_DATA_DUMP)
+endif()
+
 if(ENABLE_DEBUGGER)
    add_compile_definitions(ENABLE_DEBUGGER)
 endif()
--- a/config/data_dump.json
+++ b/config/data_dump.json
@ -0,0 +1,15 @@
+{
+  "DumpSettings": {
+    "net_name": "ResNet50",
+    "mode": 1,
+    "iteration": 0,
+    "kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
+  },
+
+  "DumpSettingsSpec": {
+    "net_name": "net name, e.g. ResNet50",
+    "mode": "0: dump all kernels, 1: dump only the kernels in the kernels list",
+    "iteration": "the iteration to dump",
+    "kernels": "full scope names of the ops to dump"
+  }
+}
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 1c2672868fda8b1d012c99e5aca73725ac869ba9
+Subproject commit 18cf690152add623ffbddfbbb4674d1b34484ca7
--- a/mindspore/ccsrc/CMakeLists.txt
+++ b/mindspore/ccsrc/CMakeLists.txt
@ -109,8 +109,12 @@ if (ENABLE_D)
    file(GLOB_RECURSE PROTO_INNER RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "predict/proto/*.proto")
    ms_protobuf_generate(PREDICT_PROTOSRCS PREDICT_PROTOHDRS ${PROTO_INNER})

+    # Generate the dump protos into their own source/header variables so the header list of an
+    # earlier ms_protobuf_generate call is not overwritten (mirrors PREDICT_PROTOSRCS/PREDICT_PROTOHDRS).
+    file(GLOB_RECURSE PROTO_DUMP RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "device/ascend/dump/proto/*.proto")
+    ms_protobuf_generate(DUMP_PROTOSRCS DUMP_PROTOHDRS ${PROTO_DUMP})
+
    list(APPEND MINDSPORE_PROTO_LIST ${PROTOSRCS})
    list(APPEND MINDSPORE_PROTO_LIST ${PREDICT_PROTOSRCS})
+    list(APPEND MINDSPORE_PROTO_LIST ${DUMP_PROTOSRCS})

    add_compile_definitions(ENABLE_D)
 endif ()
--- a/mindspore/ccsrc/debug/CMakeLists.txt
+++ b/mindspore/ccsrc/debug/CMakeLists.txt
@ -19,6 +19,15 @@ if (ENABLE_DEBUGGER)
        )
 endif (ENABLE_DEBUGGER)

+# common.cc holds the path helpers that e2e_dump.cc calls as well, so it has to be built whenever
+# either feature is enabled; otherwise an ENABLE_DUMP_E2E build without ENABLE_D fails to link.
+if (ENABLE_D OR ENABLE_DUMP_E2E)
+    list(APPEND _DEBUG_SRC_LIST
+        "${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
+        )
+endif()
+
+# The async data dump config parser is only built for Ascend (ENABLE_D) builds.
+if (ENABLE_D AND ENABLE_DATA_DUMP)
+    list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/data_dump_parser.cc")
+endif()
+
 if (ENABLE_DUMP_E2E)
    list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/e2e_dump.cc")
 endif (ENABLE_DUMP_E2E)
--- a/mindspore/ccsrc/debug/common.cc
+++ b/mindspore/ccsrc/debug/common.cc
@ -0,0 +1,125 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "debug/common.h"
+
+#include <memory>
+#include <optional>
+#include "utils/system/env.h"
+#include "utils/system/file_system.h"
+#include "utils/log_adapter.h"
+#include "utils/context/ms_context.h"
+
+namespace mindspore {
+/**
+ * Resolve `input_path` to a real path, creating its parent directories when they are missing.
+ *
+ * The path is split at its last '/' (or, when it has no '/', at its last '\\'). When a separator is
+ * found, the directory part is created if needed and resolved with realpath(); the trailing part
+ * (separator included) is appended unchanged, so the final component does not have to exist yet.
+ * Without a separator the whole input is handed to realpath().
+ *
+ * @param input_path File path to resolve.
+ * @return The resolved path, or std::nullopt when the path is too long (>= PATH_MAX), the directory
+ *         part cannot be created, or the directory part cannot be resolved.
+ */
+std::optional<std::string> Common::GetRealPath(const std::string &input_path) {
+  auto path_split_pos = input_path.find_last_of('/');
+  if (path_split_pos == std::string::npos) {
+    path_split_pos = input_path.find_last_of('\\');
+  }
+  char real_path[PATH_MAX] = {0};
+
+  if (path_split_pos == std::string::npos) {
+    // Bare file name without any directory part.
+    if (input_path.length() >= PATH_MAX) {
+      MS_LOG(ERROR) << "Prefix path is too longer!";
+      return std::nullopt;
+    }
+    if (nullptr == realpath(input_path.c_str(), real_path)) {
+      MS_LOG(ERROR) << "File " << input_path << " does not exit, it will be created.";
+      // The buffer content is unspecified after a failed realpath(), so hand back the caller's own
+      // name instead of whatever happens to be in real_path.
+      return input_path;
+    }
+    return std::string(real_path);
+  }
+
+  if (path_split_pos == 0) {
+    // The only separator is the leading one (e.g. "/file"): the directory part is empty, so there
+    // is nothing to create or resolve.
+    return input_path;
+  }
+
+  const std::string prefix_path = input_path.substr(0, path_split_pos);
+  if (prefix_path.length() >= PATH_MAX) {
+    MS_LOG(ERROR) << "Prefix path is too longer!";
+    return std::nullopt;
+  }
+  if (!CreateNotExistDirs(prefix_path)) {
+    MS_LOG(ERROR) << "CreateNotExistDirs Failed!";
+    return std::nullopt;
+  }
+  if (nullptr == realpath(prefix_path.c_str(), real_path)) {
+    MS_LOG(ERROR) << "dir " << prefix_path << " does not exit.";
+    return std::nullopt;
+  }
+  // Re-attach the separator and the final component to the resolved directory.
+  return std::string(real_path) + input_path.substr(path_split_pos);
+}
+
+// Make sure `path` exists as a directory, creating every missing ancestor on the way.
+// Each prefix that ends just before a '/' or '\\' (other than a separator at index 0) is checked
+// in order, followed by the full path. Returns false when the path is longer than PATH_MAX or a
+// directory cannot be created.
+bool Common::CreateNotExistDirs(const std::string &path) {
+  std::shared_ptr<system::FileSystem> fs = system::Env::GetFileSystem();
+  MS_EXCEPTION_IF_NULL(fs);
+  if (path.length() > PATH_MAX) {
+    MS_LOG(ERROR) << "Path lens is max than " << PATH_MAX;
+    return false;
+  }
+
+  size_t sep_pos = 0;
+  while (true) {
+    // Search from index 1 onwards so a leading separator never yields an empty prefix.
+    sep_pos = path.find_first_of("/\\", sep_pos + 1);
+    const bool whole_path = (sep_pos == std::string::npos);
+    const std::string dir = whole_path ? path : path.substr(0, sep_pos);
+    if (!fs->FileExist(dir)) {
+      MS_LOG(INFO) << "Dir " << dir << " does not exit, creating...";
+      if (!fs->CreateDir(dir)) {
+        MS_LOG(ERROR) << "Create " << dir << " dir error";
+        return false;
+      }
+    }
+    if (whole_path) {
+      break;
+    }
+  }
+  return true;
+}
+
+// Read the environment variable named `env` and return its value when it names an existing file.
+// An empty variable name is reported through MS_LOG(EXCEPTION); an unset variable or a missing
+// file is logged as an error and yields std::nullopt.
+std::optional<std::string> Common::GetConfigFile(const std::string &env) {
+  if (env.empty()) {
+    MS_LOG(EXCEPTION) << "Invalid env";
+  }
+  const char *env_value = std::getenv(env.c_str());
+  if (env_value == nullptr) {
+    MS_LOG(ERROR) << "Please export env:" << env;
+    return std::nullopt;
+  }
+  MS_LOG(INFO) << "Async Dump Getenv env:" << env << "=" << env_value;
+
+  std::shared_ptr<system::FileSystem> file_system = system::Env::GetFileSystem();
+  MS_EXCEPTION_IF_NULL(file_system);
+  std::string config_file(env_value);
+  if (file_system->FileExist(config_file)) {
+    return config_file;
+  }
+  MS_LOG(ERROR) << config_file << " not exist.";
+  return std::nullopt;
+}
+}  // namespace mindspore
--- a/mindspore/ccsrc/debug/common.h
+++ b/mindspore/ccsrc/debug/common.h
@ -0,0 +1,36 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_DEBUG_COMMON_H_
+#define MINDSPORE_MINDSPORE_CCSRC_DEBUG_COMMON_H_
+
+#include <string>
+#include <optional>
+#include "utils/contract.h"
+
+namespace mindspore {
+// Path and config-file helpers used by the dump code (e2e_dump.cc, data_dump_parser.cc).
+// Every member function is static; the class carries no state.
+class Common {
+ public:
+  Common() = default;
+  ~Common() = default;
+  // Resolves input_path to a real path, creating missing parent directories on the way.
+  // Returns std::nullopt when the path is too long or the directory part cannot be created/resolved.
+  static std::optional<std::string> GetRealPath(const std::string &input_path);
+  // Returns the value of the environment variable `env` when it names an existing file,
+  // std::nullopt when the variable is unset or the file does not exist.
+  static std::optional<std::string> GetConfigFile(const std::string &env);
+
+ private:
+  // Creates `path` and any missing ancestor directory; false when the path is too long or a
+  // directory cannot be created.
+  static bool CreateNotExistDirs(const std::string &path);
+};
+}  // namespace mindspore
+#endif  // MINDSPORE_MINDSPORE_CCSRC_DEBUG_COMMON_H_
--- a/mindspore/ccsrc/debug/data_dump_parser.cc
+++ b/mindspore/ccsrc/debug/data_dump_parser.cc
@ -0,0 +1,152 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "debug/data_dump_parser.h"
+
+#include <fstream>
+#include "utils/context/ms_context.h"
+#include "debug/common.h"
+
+constexpr auto kDataDumpConfigPtah = "DATA_DUMP_CONFIG_PATH";
+constexpr auto kEnableDataDump = "ENABLE_DATA_DUMP";
+constexpr auto kDataDumpPath = "DATA_DUMP_PATH";
+namespace mindspore {
+// Return every parsed setting to its initial value so a new parse starts from a clean state.
+void DataDumpParser::ResetParam() {
+  kernel_set_.clear();
+  net_name_.clear();
+  dump_step_ = 0;
+  dump_mode_ = 0;
+  enable_ = false;
+}
+
+// Tell whether async data dump is switched on through the ENABLE_DATA_DUMP environment variable.
+// Returns false (with a warning) when the variable is unset or does not convert to 1 via atoi.
+// When it is set to 1 while the context runs in PyNative mode, the problem is reported through
+// MS_LOG(EXCEPTION).
+bool DataDumpParser::DumpEnabled() const {
+  const char *env_flag = std::getenv(kEnableDataDump);
+  if (env_flag == nullptr) {
+    MS_LOG(WARNING) << "[DataDump] enable dump is null. Please export ENABLE_DATA_DUMP";
+    return false;
+  }
+  if (std::atoi(env_flag) != 1) {
+    MS_LOG(WARNING) << "[DataDump] Please export ENABLE_DATA_DUMP=1";
+    return false;
+  }
+
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  const bool is_pynative = (ms_context->execution_mode() == kPynativeMode);
+  if (is_pynative) {
+    MS_LOG(EXCEPTION) << "[DataDump] PyNative mode not support data dump";
+  }
+  return true;
+}
+
+// Return the dump output path taken from the DATA_DUMP_PATH environment variable, or
+// std::nullopt (after logging an error) when the variable is not set.
+std::optional<std::string> DataDumpParser::GetDumpPath() const {
+  const char *env_path = std::getenv(kDataDumpPath);
+  if (env_path == nullptr) {
+    MS_LOG(ERROR) << "[DataDump] dump path is null. Please export DATA_DUMP_PATH";
+    return std::nullopt;
+  }
+  return std::string(env_path);
+}
+
+// Load the async dump settings from the JSON file named by the DATA_DUMP_CONFIG_PATH environment
+// variable. Does nothing when DumpEnabled() is false. Otherwise the previous settings are reset and
+// the "DumpSettings" object of the file is validated and stored. Every failure (missing file,
+// unreadable file, malformed JSON, missing or invalid settings) is reported through
+// MS_LOG(EXCEPTION). The whole parse runs under lock_.
+void DataDumpParser::ParseDumpConfig() {
+  std::lock_guard<std::mutex> guard(lock_);
+  MS_LOG(INFO) << "[DataDump] parse start";
+  if (!DumpEnabled()) {
+    MS_LOG(INFO) << "[DataDump] dump not enable";
+    return;
+  }
+
+  ResetParam();
+
+  auto dump_config_file = Common::GetConfigFile(kDataDumpConfigPtah);
+  if (!dump_config_file.has_value()) {
+    MS_LOG(EXCEPTION) << "[DataDump] Get config file failed";
+  }
+
+  std::ifstream json_file(dump_config_file.value());
+  if (!json_file.is_open()) {
+    MS_LOG(EXCEPTION) << "[DataDump] " << dump_config_file.value() << " open failed.";
+  }
+
+  nlohmann::json j;
+  try {
+    json_file >> j;
+  } catch (const nlohmann::json::parse_error &e) {
+    // Turn the parser's exception into the same kind of report as every other config failure,
+    // and say which file was malformed.
+    MS_LOG(EXCEPTION) << "[DataDump] " << dump_config_file.value() << " is not valid json: " << e.what();
+  }
+  if (j.find("DumpSettings") == j.end()) {
+    MS_LOG(EXCEPTION) << "[DataDump] DumpSettings is not exist.";
+  }
+
+  // Bind by reference: the settings are only read, so no deep copy of the JSON subtree is needed.
+  const nlohmann::json &dump_settings = j.at("DumpSettings");
+  MS_LOG(INFO) << "[DataDump] Async dump settings Json: " << dump_settings.dump();
+  if (!IsConfigExist(dump_settings)) {
+    MS_LOG(EXCEPTION) << "[DataDump] Async dump json invalid";
+  }
+
+  if (!ParseDumpSetting(dump_settings)) {
+    MS_LOG(EXCEPTION) << "[DataDump] Parse dump json failed";
+  }
+}
+
+// Decide whether the kernel with the given full scope name should be dumped: never when dumping
+// is disabled, always when dump_mode_ is 0, otherwise only when the name is in kernel_set_.
+bool DataDumpParser::NeedDump(const std::string &op_full_name) const {
+  if (!DumpEnabled()) {
+    return false;
+  }
+  const bool dump_all_kernels = (dump_mode_ == 0);
+  return dump_all_kernels || kernel_set_.count(op_full_name) != 0;
+}
+
+// Check that the settings object carries all four required keys ("mode", "net_name",
+// "iteration", "kernels"); logs an error and returns false as soon as one is missing.
+bool DataDumpParser::IsConfigExist(const nlohmann::json &dump_settings) const {
+  for (const char *required_key : {"mode", "net_name", "iteration", "kernels"}) {
+    if (dump_settings.find(required_key) == dump_settings.end()) {
+      MS_LOG(ERROR) << "[DataDump] DumpSettings keys are not exist.";
+      return false;
+    }
+  }
+  return true;
+}
+
+// Validate the value types of the dump settings and store them in this parser.
+// "mode" and "iteration" must be numbers, "net_name" a string and "kernels" an array of strings.
+// On a type error the parser is marked disabled and false is returned; on success enable_ is set
+// and the kernel names are added to kernel_set_.
+bool DataDumpParser::ParseDumpSetting(const nlohmann::json &dump_settings) {
+  const auto &mode = dump_settings.at("mode");
+  const auto &net_name = dump_settings.at("net_name");
+  const auto &iteration = dump_settings.at("iteration");
+  const auto &kernels = dump_settings.at("kernels");
+  if (!(mode.is_number() && net_name.is_string() && iteration.is_number() && kernels.is_array())) {
+    MS_LOG(ERROR) << "[DataDump] Element's type in Dump config json is invalid.";
+    enable_ = false;
+    return false;
+  }
+  // Check every kernel entry before touching any member, so a bad entry leaves nothing half-set.
+  for (const auto &kernel : kernels) {
+    if (!kernel.is_string()) {
+      MS_LOG(ERROR) << "[DataDump] Kernel name in Dump config json must be a string, but got: " << kernel.dump();
+      enable_ = false;
+      return false;
+    }
+  }
+
+  enable_ = true;
+  dump_mode_ = mode.get<uint32_t>();
+  net_name_ = net_name.get<std::string>();
+  dump_step_ = iteration.get<uint32_t>();
+  for (const auto &kernel : kernels) {
+    // Read the string value itself rather than its serialized form, so escaped characters in a
+    // name are preserved exactly.
+    const auto kernel_str = kernel.get<std::string>();
+    MS_LOG(INFO) << "[DataDump] Need dump kernel:" << kernel_str;
+    kernel_set_.insert(kernel_str);
+  }
+  return true;
+}
+}  // namespace mindspore
--- a/mindspore/ccsrc/debug/data_dump_parser.h
+++ b/mindspore/ccsrc/debug/data_dump_parser.h
@ -0,0 +1,61 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_PARSER_H_
+#define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_PARSER_H_
+
+#include <cstdint>
+#include <string>
+#include <set>
+#include <mutex>
+#include <optional>
+#include "nlohmann/json.hpp"
+#include "common/utils.h"
+
+namespace mindspore {
+// Process-wide holder of the async data dump configuration.
+// The single instance is obtained through GetInstance(); ParseDumpConfig() fills it from the JSON
+// config file and the accessors expose the parsed values.
+class DataDumpParser {
+ public:
+  // Returns the one shared instance (a function-local static).
+  static DataDumpParser &GetInstance() {
+    static DataDumpParser instance;
+    return instance;
+  }
+  // Parses the dump config file when dumping is enabled; failures are reported via MS_LOG(EXCEPTION).
+  void ParseDumpConfig();
+  // True when the kernel with this full scope name should be dumped.
+  bool NeedDump(const std::string &op_full_name) const;
+  // True when the ENABLE_DATA_DUMP environment variable switches dumping on.
+  bool DumpEnabled() const;
+  // Value of the DATA_DUMP_PATH environment variable, std::nullopt when unset.
+  std::optional<std::string> GetDumpPath() const;
+  // True once a config has been parsed successfully.
+  bool enable() const { return enable_; }
+  // "net_name" from the config.
+  const std::string &net_name() const { return net_name_; }
+  // "mode" from the config: 0 dumps all kernels, otherwise only those in kernel_set().
+  uint32_t dump_mode() const { return dump_mode_; }
+  // "iteration" from the config.
+  uint32_t dump_step() const { return dump_step_; }
+  // Full scope names listed under "kernels" in the config.
+  const std::set<std::string> &kernel_set() const { return kernel_set_; }
+
+ private:
+  DataDumpParser() = default;
+  // Not virtual: the class has no virtual functions and its private constructor prevents derivation.
+  ~DataDumpParser() = default;
+  DISABLE_COPY_AND_ASSIGN(DataDumpParser);
+
+  // Resets all parsed settings to their defaults.
+  void ResetParam();
+  // Checks that all required keys are present in the settings object.
+  bool IsConfigExist(const nlohmann::json &dump_settings) const;
+  // Validates the value types and stores the settings; false on a type error.
+  bool ParseDumpSetting(const nlohmann::json &dump_settings);
+
+  std::mutex lock_;  // held for the duration of ParseDumpConfig()
+  bool enable_{false};
+  std::string net_name_;
+  uint32_t dump_mode_{0};
+  uint32_t dump_step_{0};
+  std::set<std::string> kernel_set_;
+};
+}  // namespace mindspore
+#endif  // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_PARSER_H_
--- a/mindspore/ccsrc/debug/e2e_dump.cc
+++ b/mindspore/ccsrc/debug/e2e_dump.cc
@ -17,12 +17,14 @@
 #include <limits.h>
 #include <fstream>
 #include <string>
+#include <optional>
 #include <nlohmann/json.hpp>
 #include "utils/log_adapter.h"
 #include "utils/system/file_system.h"
 #include "utils/system/env.h"
 #include "utils/convert_utils.h"
 #include "utils/context/ms_context.h"
+#include "debug/common.h"

 using json = nlohmann::json;

@ -158,100 +160,19 @@ bool Dump::DumpToFile(const std::string &filename, const void *data, size_t len)
    return false;
  }

-  std::string realpath;
-  bool ret = GetRealPath(filename, &realpath);
-  if (!ret) {
+  auto realpath = Common::GetRealPath(filename);
+  if (!realpath.has_value()) {
    MS_LOG(ERROR) << "Get real path failed.";
    return false;
  }
  std::ofstream fd;
-  fd.open(realpath, std::ios::binary | std::ios::out);
+  fd.open(realpath.value(), std::ios::binary | std::ios::out);
  if (!fd.is_open()) {
-    MS_LOG(ERROR) << "Open file " << realpath << " fail.";
+    MS_LOG(ERROR) << "Open file " << realpath.value() << " fail.";
    return false;
  }
  (void)fd.write(reinterpret_cast<const char *>(data), SizeToLong(len));
  fd.close();
  return true;
 }
-
-bool Dump::GetRealPath(const std::string &inpath, std::string *outpath) {
-  MS_EXCEPTION_IF_NULL(outpath);
-  auto path_split_pos = inpath.find_last_of('/');
-  if (path_split_pos == std::string::npos) {
-    path_split_pos = inpath.find_last_of('\\');
-  }
-  // get real path
-  char real_path[PATH_MAX] = {0};
-  if (path_split_pos != std::string::npos) {
-    std::string prefix_path = inpath.substr(0, path_split_pos);
-    if (prefix_path.length() >= PATH_MAX) {
-      MS_LOG(ERROR) << "Prefix path is too longer!";
-      return false;
-    }
-    std::string last_path = inpath.substr(path_split_pos, inpath.length() - path_split_pos);
-    auto ret = CreateNotExistDirs(prefix_path);
-    if (ret == false) {
-      MS_LOG(ERROR) << "CreateNotExistDirs Failed!";
-      return false;
-    }
-
-    if (nullptr == realpath(prefix_path.c_str(), real_path)) {
-      MS_LOG(ERROR) << "dir " << prefix_path << " does not exit.";
-      return false;
-    }
-    *outpath = std::string(real_path) + last_path;
-  }
-
-  if (path_split_pos == std::string::npos) {
-    if (inpath.length() >= PATH_MAX) {
-      MS_LOG(ERROR) << "Prefix path is too longer!";
-      return false;
-    }
-    if (nullptr == realpath(inpath.c_str(), real_path)) {
-      MS_LOG(ERROR) << "File " << inpath << " does not exit, it will be created.";
-    }
-    *outpath = std::string(real_path);
-  }
-
-  return true;
-}
-
-bool Dump::CreateNotExistDirs(const std::string &path) {
-  std::shared_ptr<system::FileSystem> fs = system::Env::GetFileSystem();
-  MS_EXCEPTION_IF_NULL(fs);
-  char temp_path[PATH_MAX] = {0};
-  if (path.length() > PATH_MAX) {
-    MS_LOG(ERROR) << "Path lens is max than " << PATH_MAX;
-    return false;
-  }
-  for (uint32_t i = 0; i < path.length(); i++) {
-    temp_path[i] = path[i];
-    if (temp_path[i] == '\\' || temp_path[i] == '/') {
-      if (i != 0) {
-        char tmp_char = temp_path[i];
-        temp_path[i] = '\0';
-        std::string path_handle(temp_path);
-        if (!fs->FileExist(temp_path)) {
-          MS_LOG(INFO) << "Dir " << path_handle << " does not exit, creating...";
-          if (!fs->CreateDir(temp_path)) {
-            MS_LOG(ERROR) << "Create " << path_handle << " dir error";
-            return false;
-          }
-        }
-        temp_path[i] = tmp_char;
-      }
-    }
-  }
-
-  if (!fs->FileExist(path)) {
-    MS_LOG(INFO) << "Dir " << path << " does not exit, creating...";
-    if (!fs->CreateDir(path)) {
-      MS_LOG(ERROR) << "Create " << path << " dir error";
-      return false;
-    }
-  }
-
-  return true;
-}
 }  // namespace mindspore
--- a/mindspore/ccsrc/debug/e2e_dump.h
+++ b/mindspore/ccsrc/debug/e2e_dump.h
@ -59,10 +59,6 @@ class Dump {
  uint32_t cur_iter_;
  std::vector<std::string> dump_kernels_;

-  static bool GetRealPath(const std::string &inpath, std::string *outpath);
-
-  static bool CreateNotExistDirs(const std::string &path);
-
 private:
  bool ParseDumpConfig(const std::string &dump_config_file);
  bool IsConfigExist(const nlohmann::json &dumpSettings);
--- a/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc
@ -42,6 +42,7 @@
 #include "device/ascend/ascend_memory_manager.h"
 #include "debug/tensor_load.h"

+using ge::model_runner::ModelRunner;
 using mindspore::device::ascend::ProfilingManager;
 using mindspore::device::ascend::ProfilingUtils;
 using mindspore::device::ascend::tasksink::TaskGenerator;
@ -90,9 +91,16 @@ std::string GetRankId() {
 AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); }

 void AscendKernelRuntime::ClearGraphModelMap() {
+#ifdef ENABLE_DATA_DUMP
+  for (auto &iter : graph_data_dumper_) {
+    MS_LOG(INFO) << "[DataDump] Unload data dumper:" << iter.first;
+    iter.second->UnloadDumpInfo();
+  }
+  graph_data_dumper_.clear();
+#endif
  for (auto &iter : graph_model_map_) {
    MS_LOG(INFO) << "Ge UnloadModel " << iter.first;
-    auto ret = ge::model_runner::ModelRunner::Instance().UnloadModel(iter.first);
+    auto ret = ModelRunner::Instance().UnloadModel(iter.first);
    if (!ret) {
      MS_LOG(ERROR) << "UnloadModel failed";
    }
@ -107,7 +115,7 @@ void AscendKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id) {
    return;
  }
  MS_LOG(DEBUG) << "Ge UnloadModel " << iter->first;
-  auto ret = ge::model_runner::ModelRunner::Instance().UnloadModel(iter->first);
+  auto ret = ModelRunner::Instance().UnloadModel(iter->first);
  if (!ret) {
    MS_LOG(ERROR) << "UnloadModel failed";
  }
@ -159,6 +167,10 @@ bool AscendKernelRuntime::Init() {
  }
 #endif

+#ifdef ENABLE_DATA_DUMP
+  DataDumpParser::GetInstance().ParseDumpConfig();
+#endif
+
  // Start up profiling before rtSetDevice
  ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
  if (!ret) {
@ -440,7 +452,7 @@ bool AscendKernelRuntime::GenTask(const session::KernelGraph *graph) {
               << ", wait_active_stream_list size:" << wait_active_stream_list.size()
               << ", force_copy_stream_list size:" << force_copy_stream_list.size();
  std::vector<std::shared_ptr<ge::model_runner::OpInfo>> empty_list;
-  std::shared_ptr<ge::model_runner::DavinciModel> model = std::make_shared<ge::model_runner::DavinciModel>(
+  auto model = std::make_shared<ge::model_runner::DavinciModel>(
    task_info_list, empty_list, empty_list, empty_list, empty_list, wait_active_stream_list, force_copy_stream_list, 0,
    0, 0, 0, 0, 0, resource_manager.get_cur_stream_num(), label_assign_instance.GetLabelNum(NOT_NULL(graph)),
    resource_manager.get_cur_event_num(), 0);
@ -477,21 +489,45 @@ bool AscendKernelRuntime::LoadTask(const session::KernelGraph *graph) {

  std::shared_ptr<ge::ModelListener> listener;
  MS_LOG(INFO) << "LoadDavinciModel mode_id:" << model_iter->first;
-  bool status = ge::model_runner::ModelRunner::Instance().LoadDavinciModel(device_id_, 0, model_iter->first,
-                                                                           model_iter->second, listener);
+  bool status =
+    ModelRunner::Instance().LoadDavinciModel(device_id_, 0, model_iter->first, model_iter->second, listener);
  if (!status) {
    MS_LOG(EXCEPTION) << "Load Task Failed";
  }
  if (ProfilingManager::GetInstance().IsProfiling()) {
-    auto task_ids = ge::model_runner::ModelRunner::Instance().GetTaskIdList(model_iter->first);
-    auto stream_ids = ge::model_runner::ModelRunner::Instance().GetStreamIdList(model_iter->first);
+    auto task_ids = ModelRunner::Instance().GetTaskIdList(model_iter->first);
+    auto stream_ids = ModelRunner::Instance().GetStreamIdList(model_iter->first);
    ProfilingUtils::ReportProfilingData(task_ids, stream_ids, NOT_NULL(graph));
  }
+
+#ifdef ENABLE_DATA_DUMP
+  LaunchDataDump(NOT_NULL(graph));
+#endif
+  if (!ModelRunner::Instance().LoadModelComplete(model_iter->first)) {
+    MS_LOG(ERROR) << "Call ge runtime LoadModelComplete failed";
+    return false;
+  }
  return true;
 }

+#ifdef ENABLE_DATA_DUMP
+// Create a DataDumper for `graph` from the runtime info ModelRunner reports for its graph id,
+// load its dump info and keep it in graph_data_dumper_. Returns immediately when dumping is not
+// enabled; a graph id that already has a dumper only produces a warning.
+void AscendKernelRuntime::LaunchDataDump(NotNull<const session::KernelGraph *> graph) {
+  if (!DataDumpParser::GetInstance().DumpEnabled()) {
+    return;
+  }
+  auto runtime_info_map = ModelRunner::Instance().GetRuntimeInfoMap(graph->graph_id());
+  auto dumper = std::make_shared<DataDumper>(graph.get(), runtime_info_map);
+  MS_EXCEPTION_IF_NULL(dumper);
+  dumper->LoadDumpInfo();
+  const bool inserted = graph_data_dumper_.try_emplace(graph->graph_id(), dumper).second;
+  if (inserted) {
+    return;
+  }
+  MS_LOG(WARNING) << "[DataDump] Insert graphId:" << graph->graph_id() << " data dumper failed";
+}
+#endif
+
 void AscendKernelRuntime::DebugTaskIdName(GraphId graph_id) {
-  auto task_ids = ge::model_runner::ModelRunner::Instance().GetTaskIdList(graph_id);
+  auto task_ids = ModelRunner::Instance().GetTaskIdList(graph_id);
  auto graph_task_names = ProfilingUtils::graph_kernel_name();
  auto iter = graph_task_names.find(graph_id);
  if (iter != graph_task_names.end()) {
@ -524,7 +560,7 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) {
    return false;
  }

-  bool status = ge::model_runner::ModelRunner::Instance().RunModel(graph->graph_id(), input_tensors, output_tensors);
+  bool status = ModelRunner::Instance().RunModel(graph->graph_id(), input_tensors, output_tensors);
  if (!status) {
    MS_LOG(ERROR) << "Run task failed";
    DebugTaskIdName(graph->graph_id());
--- a/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.h
+++ b/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.h
@ -24,6 +24,10 @@
 #include "framework/ge_runtime/davinci_model.h"
 #include "device/kernel_runtime_manager.h"
 #include "session/session_basic.h"
+#ifdef ENABLE_DATA_DUMP
+#include "debug/data_dump_parser.h"
+#include "device/ascend/dump/data_dumper.h"
+#endif

 using ge::model_runner::TaskInfo;
 using std::unordered_map;
@ -66,6 +70,10 @@ class AscendKernelRuntime : public KernelRuntime {
  bool initialized_{false};
  unordered_map<GraphId, vector<std::shared_ptr<TaskInfo>>> task_map_;
  unordered_map<GraphId, std::shared_ptr<ge::model_runner::DavinciModel>> graph_model_map_;
+#ifdef ENABLE_DATA_DUMP
+  void LaunchDataDump(NotNull<const session::KernelGraph *> graph);
+  unordered_map<GraphId, std::shared_ptr<DataDumper>> graph_data_dumper_;
+#endif
 };

 MS_REG_KERNEL_RUNTIME(kAscendDevice, AscendKernelRuntime);
--- a/mindspore/ccsrc/device/ascend/dump/data_dumper.cc
+++ b/mindspore/ccsrc/device/ascend/dump/data_dumper.cc
@ -0,0 +1,282 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifdef ENABLE_DATA_DUMP
+#include "device/ascend/dump/data_dumper.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include "utility"
+#include "session/anf_runtime_algorithm.h"
+#include "runtime/mem.h"
+#include "runtime/kernel.h"
+#include "device/ascend/dump/ge_dump.h"
+#include "proto/op_mapping_info.pb.h"
+#include "utils/context/ms_context.h"
+#include "debug/data_dump_parser.h"
+
+// Values written to OpMappingInfo.flag: 1 marks a load descriptor, 0 an unload descriptor.
+constexpr uint32_t kAicpuLoadFlag = 1;
+constexpr uint32_t kAicpuUnloadFlag = 0;
+// Element positions inside a RuntimeInfo tuple, used with std::get.
+constexpr uint32_t kTupleTaskId = 0;
+constexpr uint32_t kTupleStreamId = 1;
+constexpr uint32_t kTupleArgs = 2;
+// Positions of the loop-control tensors inside KernelGraph::input_ctrl_tensors().
+constexpr uint32_t kCurrentStepTensorIndex = 0;
+constexpr uint32_t kCurrentEpochTensorIndex = 1;
+constexpr uint32_t kStepsPerEpochTensorIndex = 2;
+
+namespace mindspore {
+namespace device {
+namespace ascend {
+void DumpKernelOutput(const CNodePtr &kernel, void *args, NotNull<aicpu::dump::Task *> task);
+void DumpKernelInput(const CNodePtr &kernel, void *args, NotNull<aicpu::dump::Task *> task);
+void RtLoadDumpData(const aicpu::dump::OpMappingInfo &dump_info, void **ptr);
+
+// Frees the device buffers that were filled by LoadDumpInfo and UnloadDumpInfo, if any.
+DataDumper::~DataDumper() {
+  ReleaseDevMem(&dev_load_mem_);
+  ReleaseDevMem(&dev_unload_mem_);
+}
+
+// Walks the graph's execution order, builds one dump task per kernel selected for dumping,
+// uploads the resulting descriptor to the device and marks this dumper as loaded.
+// The names of the selected kernels are remembered in dump_kernel_names_ for the later unload.
+void DataDumper::LoadDumpInfo() {
+  MS_LOG(INFO) << "[DataDump] LoadDumpInfo start";
+  MS_EXCEPTION_IF_NULL(kernel_graph_);
+  aicpu::dump::OpMappingInfo dump_info;
+  SetOpMappingInfo(NOT_NULL(&dump_info));
+
+  auto execution_order = kernel_graph_->execution_order();
+  for (const auto &kernel : execution_order) {
+    MS_EXCEPTION_IF_NULL(kernel);
+    if (KernelNeedDump(kernel)) {
+      MS_LOG(INFO) << "[DataDump] LoadDumpInfo kernel:" << kernel->fullname_with_scope();
+      dump_kernel_names_.emplace_back(kernel->fullname_with_scope());
+
+      // Describe this kernel and append the description to the descriptor's task list.
+      aicpu::dump::Task task;
+      ConstructDumpTask(NOT_NULL(kernel), NOT_NULL(&task));
+      MS_EXCEPTION_IF_NULL(dump_info.mutable_task());
+      dump_info.mutable_task()->Add(std::move(task));
+    }
+  }
+
+  RtLoadDumpData(dump_info, &dev_load_mem_);
+  load_flag_ = true;
+  MS_LOG(INFO) << "[DataDump] LoadDumpInfo end";
+}
+
+// Fills the graph-level fields of `dump_info`: dump path, model name, dump step, model id and the load flag.
+// When the graph carries at least three control tensors whose device pointers are all set, their
+// addresses are recorded as well. Raises when the configured dump path has no value.
+void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_info) const {
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  MS_EXCEPTION_IF_NULL(kernel_graph_);
+  auto dump_path = DataDumpParser::GetInstance().GetDumpPath();
+  if (!dump_path.has_value()) {
+    MS_LOG(EXCEPTION) << "Dump path invalid";
+  }
+  auto device_id = context_ptr->device_id();
+  // The path handed to the device is "<configured path>_<device id>/".
+  dump_info->set_dump_path(dump_path.value() + "_" + std::to_string(device_id) + "/");
+  // Note: the value logged here is the configured path, without the device suffix applied above.
+  MS_LOG(INFO) << "[DataDump] dump_path:" << dump_path.value();
+
+  // Model name is "<net name>_<graph id>"; the graph id doubles as the model id.
+  dump_info->set_model_name(DataDumpParser::GetInstance().net_name() + "_" + std::to_string(kernel_graph_->graph_id()));
+  dump_info->set_dump_step(std::to_string(DataDumpParser::GetInstance().dump_step()));
+  dump_info->set_model_id(kernel_graph_->graph_id());
+  dump_info->set_flag(kAicpuLoadFlag);
+
+  // Without the three control tensors the address fields are left unset.
+  const auto &input_ctrl_tensors = kernel_graph_->input_ctrl_tensors();
+  if (input_ctrl_tensors == nullptr || input_ctrl_tensors->size() < 3) {
+    MS_LOG(INFO) << "[DataDump] Not data sink mode, input_ctrl_tensor";
+    return;
+  }
+  const auto &current_step_tensor = input_ctrl_tensors->at(kCurrentStepTensorIndex);
+  const auto &currnet_epoch_tensor = input_ctrl_tensors->at(kCurrentEpochTensorIndex);
+  const auto &steps_per_epoch_tensor = input_ctrl_tensors->at(kStepsPerEpochTensorIndex);
+
+  MS_EXCEPTION_IF_NULL(current_step_tensor);
+  MS_EXCEPTION_IF_NULL(currnet_epoch_tensor);
+  MS_EXCEPTION_IF_NULL(steps_per_epoch_tensor);
+  MS_EXCEPTION_IF_NULL(current_step_tensor->device_address());
+  MS_EXCEPTION_IF_NULL(currnet_epoch_tensor->device_address());
+  MS_EXCEPTION_IF_NULL(steps_per_epoch_tensor->device_address());
+
+  void *current_step = current_step_tensor->device_address()->ptr_;
+  void *current_epoch = currnet_epoch_tensor->device_address()->ptr_;
+  void *steps_per_epoch = steps_per_epoch_tensor->device_address()->ptr_;
+
+  if (current_epoch != nullptr && current_step != nullptr && steps_per_epoch != nullptr) {
+    // NOTE(review): step_id_addr receives the epoch tensor's address and loop_cond_addr the step
+    // tensor's address — confirm this mapping is what the device side expects.
+    dump_info->set_step_id_addr(reinterpret_cast<uint64_t>(current_epoch));
+    dump_info->set_loop_cond_addr(reinterpret_cast<uint64_t>(current_step));
+    dump_info->set_iterations_per_loop_addr(reinterpret_cast<uint64_t>(steps_per_epoch));
+  } else {
+    // A missing device pointer is only logged; the address fields stay unset.
+    MS_LOG(INFO) << "Invalid ctrl tensor device address";
+  }
+}
+
+// Returns true when `kernel` is a TBE, AICPU or AKG kernel whose full scoped name is listed in the
+// DataDumpParser kernel set; any other kernel type is never dumped. A null `kernel` is rejected by
+// MS_EXCEPTION_IF_NULL.
+bool DataDumper::KernelNeedDump(const CNodePtr &kernel) const {
+  // Validate the pointer before it is handed to GetKernelType or dereferenced below.
+  MS_EXCEPTION_IF_NULL(kernel);
+  // Query the kernel type once instead of once per comparison.
+  const auto kernel_type = AnfAlgo::GetKernelType(kernel);
+  if (kernel_type != TBE_KERNEL && kernel_type != AICPU_KERNEL && kernel_type != AKG_KERNEL) {
+    return false;
+  }
+  const auto &kernel_set = DataDumpParser::GetInstance().kernel_set();
+  return kernel_set.find(kernel->fullname_with_scope()) != kernel_set.end();
+}
+
+// Sends an unload descriptor for every kernel that LoadDumpInfo registered. Does nothing except
+// log a warning when no load has happened. Raises when a registered kernel name has no entry in
+// runtime_info_map_.
+void DataDumper::UnloadDumpInfo() {
+  if (!load_flag_) {
+    MS_LOG(WARNING) << "Load not success, no need to unload";
+    return;
+  }
+  MS_EXCEPTION_IF_NULL(kernel_graph_);
+  MS_LOG(INFO) << "[DataDump] UnloadDumpInfo start. graphId:" << kernel_graph_->graph_id();
+
+  // The unload descriptor only needs the model id, the unload flag and the task ids.
+  aicpu::dump::OpMappingInfo op_mapping_info;
+  op_mapping_info.set_model_id(kernel_graph_->graph_id());
+  op_mapping_info.set_flag(kAicpuUnloadFlag);
+
+  for (const auto &kernel_name : dump_kernel_names_) {
+    const auto iter = runtime_info_map_.find(kernel_name);
+    if (iter == runtime_info_map_.end()) {
+      MS_LOG(EXCEPTION) << "[DataDump] kernel name not found in runtime_info_map";
+    }
+    MS_EXCEPTION_IF_NULL(iter->second);
+
+    aicpu::dump::Task task;
+    task.set_task_id(std::get<kTupleTaskId>(*iter->second));
+    MS_EXCEPTION_IF_NULL(op_mapping_info.mutable_task());
+    op_mapping_info.mutable_task()->Add(std::move(task));
+  }
+
+  RtLoadDumpData(op_mapping_info, &dev_unload_mem_);
+}
+
+// Frees the device buffer that `*ptr` refers to and resets `*ptr` to nullptr.
+// A null `ptr` or an already empty `*ptr` is a no-op; an rtFree failure is logged, not raised.
+void DataDumper::ReleaseDevMem(void **ptr) const {
+  if (ptr == nullptr || *ptr == nullptr) {
+    return;
+  }
+  const rtError_t rt_error = rtFree(*ptr);
+  if (rt_error != RT_ERROR_NONE) {
+    MS_LOG(ERROR) << "[DataDump] Call rtFree failed, ret:" << rt_error;
+  }
+  // Clear the pointer even when rtFree reported an error, as before.
+  *ptr = nullptr;
+}
+
+// Fills `dump_task` for `kernel`: task id, stream id and op description come from the recorded
+// runtime info, followed by one entry per kernel output and per kernel input.
+// Raises when the kernel's full scoped name has no entry in runtime_info_map_.
+void DataDumper::ConstructDumpTask(NotNull<const CNodePtr &> kernel, NotNull<aicpu::dump::Task *> dump_task) const {
+  dump_task->set_end_graph(false);
+  const auto iter = runtime_info_map_.find(kernel->fullname_with_scope());
+  if (iter == runtime_info_map_.end()) {
+    MS_LOG(EXCEPTION) << "[DataDump] kernel name not found in runtime_info_map";
+  }
+  MS_EXCEPTION_IF_NULL(iter->second);
+
+  const auto &runtime_info = *iter->second;
+  const auto task_id = std::get<kTupleTaskId>(runtime_info);
+  const auto stream_id = std::get<kTupleStreamId>(runtime_info);
+  void *args = std::get<kTupleArgs>(runtime_info);
+  MS_LOG(INFO) << "[DataDump] Get runtime info task_id:" << task_id << " stream_id:" << stream_id;
+
+  dump_task->set_task_id(task_id);
+  dump_task->set_stream_id(stream_id);
+  MS_EXCEPTION_IF_NULL(dump_task->mutable_op());
+  auto *op_desc = dump_task->mutable_op();
+  op_desc->set_op_name(kernel->fullname_with_scope());
+  op_desc->set_op_type(AnfAlgo::GetCNodeName(kernel.get()));
+
+  // Outputs are described before inputs, matching the original call order.
+  DumpKernelOutput(kernel, args, dump_task);
+  DumpKernelInput(kernel, args, dump_task);
+}
+
+// Serializes `dump_info`, copies the bytes into freshly allocated device memory and passes that
+// buffer to rtDatadumpInfoLoad. On success `*ptr` holds the device buffer; this function never
+// frees it, so the caller is responsible for releasing it.
+// Raises on a null `ptr`, on serialization failure or an empty message, and when any runtime call fails.
+void RtLoadDumpData(const aicpu::dump::OpMappingInfo &dump_info, void **ptr) {
+  // Check the out-parameter before rtMalloc writes through it.
+  if (ptr == nullptr) {
+    MS_LOG(EXCEPTION) << "[DataDump] Device memory pointer is nullptr";
+  }
+
+  std::string proto_str;
+  size_t proto_size = dump_info.ByteSizeLong();
+  bool ret = dump_info.SerializeToString(&proto_str);
+  if (!ret || proto_size == 0) {
+    // Stream the size; a printf-style placeholder is not expanded by the stream logger.
+    MS_LOG(EXCEPTION) << "[DataDump] Protobuf SerializeToString failed, proto size " << proto_size << ".";
+  }
+
+  rtError_t rt_ret = rtMalloc(ptr, proto_size, RT_MEMORY_HBM);
+  if (rt_ret != RT_ERROR_NONE) {
+    MS_LOG(EXCEPTION) << "[DataDump] Call rtMalloc failed";
+  }
+  // Guard the buffer itself before it is used as the copy destination.
+  if (*ptr == nullptr) {
+    MS_LOG(EXCEPTION) << "[DataDump] rtMalloc failed, ptr is nullptr";
+  }
+
+  rt_ret = rtMemcpy(*ptr, proto_size, proto_str.c_str(), proto_size, RT_MEMCPY_HOST_TO_DEVICE);
+  if (rt_ret != RT_ERROR_NONE) {
+    MS_LOG(EXCEPTION) << "[DataDump] Call rtMemcpy failed";
+  }
+
+  MS_LOG(INFO) << "[DataDump] rtDatadumpInfoLoad start";
+  rt_ret = rtDatadumpInfoLoad(*ptr, proto_size);
+  if (rt_ret != RT_ERROR_NONE) {
+    MS_LOG(EXCEPTION) << "[DataDump] Call rtDatadumpInfoLoad failed";
+  }
+}
+
+// Appends one aicpu::dump::Output entry to `task` for every output tensor of `kernel`.
+// Each entry's address is `args` plus the offset of that output's pointer-sized slot; the output
+// slots are placed after one slot per input.
+void DumpKernelOutput(const CNodePtr &kernel, void *args, NotNull<aicpu::dump::Task *> task) {
+  MS_LOG(INFO) << "[DataDump] DumpKernelOutput start. Kernel:" << kernel->fullname_with_scope();
+  const auto input_size = AnfAlgo::GetInputTensorNum(kernel);
+  const auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
+  const auto args_base = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(args));
+  for (size_t i = 0; i < output_size; ++i) {
+    // Slot index of output i is (number of inputs + i).
+    const uint64_t slot_offset = sizeof(void *) * (input_size + i);
+    auto data_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
+    auto output_format = AnfAlgo::GetOutputFormat(kernel, i);
+    auto output_shape = AnfAlgo::GetOutputDeviceShape(kernel, i);
+
+    aicpu::dump::Output output;
+    output.set_data_type(GetGeDataType(data_type));
+    output.set_format(GetGeFormat(output_format, output_shape.size()));
+    MS_EXCEPTION_IF_NULL(output.mutable_shape());
+    auto *shape_desc = output.mutable_shape();
+    for (auto dim : output_shape) {
+      shape_desc->add_dim(dim);
+    }
+    // The original format is reported as the same value as the device format.
+    output.set_original_output_format(GetGeFormat(output_format, output_shape.size()));
+    output.set_address(args_base + slot_offset);
+    MS_EXCEPTION_IF_NULL(task->mutable_output());
+    task->mutable_output()->Add(std::move(output));
+  }
+}
+
+// Appends one aicpu::dump::Input entry to `task` for every input tensor of `kernel`.
+// Type, format and shape are taken from the producing node's output; when the device data type is
+// unknown the inferred data type is used instead. Each entry's address is `args` plus the offset
+// of that input's pointer-sized slot.
+void DumpKernelInput(const CNodePtr &kernel, void *args, NotNull<aicpu::dump::Task *> task) {
+  MS_LOG(INFO) << "[DataDump] DumpKernelInput start. Kernel:" << kernel->fullname_with_scope();
+  const auto input_size = AnfAlgo::GetInputTensorNum(kernel);
+  const auto args_base = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(args));
+  for (size_t i = 0; i < input_size; ++i) {
+    const uint64_t slot_offset = sizeof(void *) * i;
+    aicpu::dump::Input input;
+    const auto [input_node, input_index] = AnfAlgo::GetPrevNodeOutput(kernel, i);
+    std::string output_format = AnfAlgo::GetOutputFormat(input_node, input_index);
+    auto output_type = AnfAlgo::GetOutputDeviceDataType(input_node, input_index);
+    if (output_type == kTypeUnknown) {
+      MS_LOG(WARNING) << "[DataDump] It is not suggested to use a lonely weight parameter as the output of graph";
+      output_type = AnfAlgo::GetOutputInferDataType(input_node, input_index);
+    }
+    auto output_shape = AnfAlgo::GetOutputDeviceShape(input_node, input_index);
+
+    input.set_data_type(GetGeDataType(output_type));
+    input.set_format(GetGeFormat(output_format, output_shape.size()));
+    MS_EXCEPTION_IF_NULL(input.mutable_shape());
+    auto *shape_desc = input.mutable_shape();
+    for (auto dim : output_shape) {
+      shape_desc->add_dim(dim);
+    }
+    input.set_address(args_base + slot_offset);
+    MS_EXCEPTION_IF_NULL(task->mutable_input());
+    task->mutable_input()->Add(std::move(input));
+  }
+}
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore
+#endif
--- a/mindspore/ccsrc/device/ascend/dump/data_dumper.h
+++ b/mindspore/ccsrc/device/ascend/dump/data_dumper.h
@ -0,0 +1,69 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_DUMP_DATADUMP_H_
+#define MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_DUMP_DATADUMP_H_
+#ifdef ENABLE_DATA_DUMP
+#include <tuple>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "session/kernel_graph.h"
+
+namespace aicpu {
+namespace dump {
+class OpMappingInfo;
+class Task;
+}  // namespace dump
+}  // namespace aicpu
+namespace mindspore {
+namespace device {
+namespace ascend {
+// tuple(task_id, stream_id, args); the op name is the key of the map that holds these tuples
+using RuntimeInfo = std::tuple<uint32_t, uint32_t, void *>;
+// Builds and uploads the data-dump descriptors of one kernel graph and owns the device buffers
+// that hold them. The kernel graph is kept as a raw pointer and is not released by this class;
+// the runtime-info map is copied at construction.
+class DataDumper {
+ public:
+  // `runtime_info_map` maps a kernel's full scoped name to its (task_id, stream_id, args) tuple.
+  DataDumper(const session::KernelGraph *kernel_graph,
+             const std::map<std::string, std::shared_ptr<RuntimeInfo>> &runtime_info_map)
+      : load_flag_(false),
+        dev_load_mem_(nullptr),
+        dev_unload_mem_(nullptr),
+        kernel_graph_(kernel_graph),
+        runtime_info_map_(runtime_info_map) {}
+  ~DataDumper();
+  // The destructor frees dev_load_mem_ and dev_unload_mem_, so a copy would free the same device
+  // buffers a second time. Copying is therefore disabled.
+  DataDumper(const DataDumper &) = delete;
+  DataDumper &operator=(const DataDumper &) = delete;
+
+  // Uploads the dump descriptor for every kernel selected for dumping.
+  void LoadDumpInfo();
+
+  // Uploads the matching unload descriptor; a no-op (with a warning) when nothing was loaded.
+  void UnloadDumpInfo();
+
+ private:
+  void ReleaseDevMem(void **ptr) const;
+  bool KernelNeedDump(const CNodePtr &kernel) const;
+  void SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_info) const;
+  void ConstructDumpTask(NotNull<const CNodePtr &> kernel, NotNull<aicpu::dump::Task *> dump_task) const;
+
+  bool load_flag_;        // set once LoadDumpInfo has completed
+  void *dev_load_mem_;    // device buffer holding the load descriptor
+  void *dev_unload_mem_;  // device buffer holding the unload descriptor
+  std::vector<std::string> dump_kernel_names_;  // kernels registered by LoadDumpInfo
+  const session::KernelGraph *kernel_graph_;
+  std::map<std::string, std::shared_ptr<RuntimeInfo>> runtime_info_map_;
+};
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore
+#endif
+#endif  // MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_DUMP_DATADUMP_H_
--- a/mindspore/ccsrc/device/ascend/dump/ge_dump.h
+++ b/mindspore/ccsrc/device/ascend/dump/ge_dump.h
@ -0,0 +1,120 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_DUMP_GE_DUMP_H_
+#define MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_DUMP_GE_DUMP_H_
+
+#include <map>
+#include <string>
+#include "proto/ge_dtype.pb.h"
+#include "ir/dtype/type_id.h"
+#include "utils/utils.h"
+
+namespace mindspore {
+namespace device {
+namespace ascend {
+// Translates a MindSpore TypeId into the matching ge::proto::DataType.
+// Raises for any TypeId that is not listed in the table below.
+static ge::proto::DataType GetGeDataType(TypeId type_id) {
+  static const std::map<TypeId, ge::proto::DataType> data_type_map = {
+    {TypeId::kTypeUnknown, ge::proto::DT_UNDEFINED},     {TypeId::kNumberTypeFloat32, ge::proto::DT_FLOAT},
+    {TypeId::kNumberTypeFloat16, ge::proto::DT_FLOAT16}, {TypeId::kNumberTypeInt8, ge::proto::DT_INT8},
+    {TypeId::kNumberTypeUInt8, ge::proto::DT_UINT8},     {TypeId::kNumberTypeInt16, ge::proto::DT_INT16},
+    {TypeId::kNumberTypeUInt16, ge::proto::DT_UINT16},   {TypeId::kNumberTypeInt32, ge::proto::DT_INT32},
+    {TypeId::kNumberTypeInt64, ge::proto::DT_INT64},     {TypeId::kNumberTypeUInt32, ge::proto::DT_UINT32},
+    {TypeId::kNumberTypeUInt64, ge::proto::DT_UINT64},   {TypeId::kNumberTypeBool, ge::proto::DT_BOOL},
+    {TypeId::kNumberTypeFloat64, ge::proto::DT_DOUBLE},
+  };
+  MS_LOG(INFO) << "Vm origin type_id:" << type_id;
+  auto iter = data_type_map.find(type_id);
+  if (iter == data_type_map.end()) {
+    MS_LOG(EXCEPTION) << "Invalid data type:" << type_id;
+  }
+  return iter->second;
+}
+
+// Tensor layout codes returned by GetGeFormat and written into the dump descriptor's format fields.
+// NOTE(review): the numeric values presumably mirror GE's own format enum — confirm before
+// reordering or inserting entries.
+enum GeFormat {
+  kFormat_NCHW = 0,   // NCHW
+  kFormat_NHWC,       // NHWC
+  kFormat_ND,         // Nd Tensor
+  kFormat_NC1HWC0,    // NC1HWC0
+  kFormat_FRACTAL_Z,  // FRACTAL_Z
+  kFormat_NC1C0HWPAD,
+  kFormat_NHWC1C0,
+  kFormat_FSR_NCHW,
+  kFormat_FRACTAL_DECONV,
+  kFormat_C1HWNC0,
+  kFormat_FRACTAL_DECONV_TRANSPOSE,
+  kFormat_FRACTAL_DECONV_SP_STRIDE_TRANS,
+  kFormat_NC1HWC0_C04,    // NC1HWC0, C0 =4
+  kFormat_FRACTAL_Z_C04,  // FRACZ, C0 =4
+  kFormat_CHWN,
+  kFormat_FRACTAL_DECONV_SP_STRIDE8_TRANS,
+  kFormat_HWCN,
+  kFormat_NC1KHKWHWC0,  // KH,KW kernel h& kernel w maxpooling max output format
+  kFormat_BN_WEIGHT,
+  kFormat_FILTER_HWCK,  // filter input tensor format
+  kFormat_HASHTABLE_LOOKUP_LOOKUPS = 20,
+  kFormat_HASHTABLE_LOOKUP_KEYS,
+  kFormat_HASHTABLE_LOOKUP_VALUE,
+  kFormat_HASHTABLE_LOOKUP_OUTPUT,
+  kFormat_HASHTABLE_LOOKUP_HITS = 24,
+  kFormat_C1HWNCoC0,
+  kFormat_MD,
+  kFormat_NDHWC,
+  kFormat_FRACTAL_ZZ,
+  kFormat_FRACTAL_NZ,
+  kFormat_NCDHW,
+  kFormat_DHWCN,  // 3D filter input tensor format
+  kFormat_NDC1HWC0,
+  kFormat_FRACTAL_Z_3D,
+  kFormat_CN,
+  kFormat_NC,
+  kFormat_DHWNC,
+  kFormat_FRACTAL_Z_3D_TRANSPOSE,  // 3D filter(transpose) input tensor format
+  kFormat_RESERVED,
+  kFormat_ALL
+};
+
+// Translates a MindSpore op format string into a GeFormat code.
+// The default format is resolved from the rank: NCHW for a 4-dimensional shape, ND otherwise.
+// Every other format is looked up in the table below; an unlisted format raises.
+static GeFormat GetGeFormat(const std::string &format, size_t shape_size) {
+  static const std::map<std::string, GeFormat> format_map = {
+    // default format: nchw, fractal_nz?
+    // (this kOpFormat_DEFAULT entry is never consulted: the default format returns early below)
+    {kOpFormat_DEFAULT, kFormat_NCHW},
+    {kOpFormat_NC1KHKWHWC0, kFormat_NC1KHKWHWC0},
+    {kOpFormat_ND, kFormat_ND},
+    {kOpFormat_NCHW, kFormat_NCHW},
+    {kOpFormat_NHWC, kFormat_NHWC},
+    {kOpFormat_HWCN, kFormat_HWCN},
+    {kOpFormat_NC1HWC0, kFormat_NC1HWC0},
+    {kOpFormat_FRAC_Z, kFormat_FRACTAL_Z},
+    {kOpFormat_FRAC_NZ, kFormat_FRACTAL_NZ},
+    {kOpFormat_C1HWNCoC0, kFormat_C1HWNCoC0},
+    {kOpFormat_NC1HWC0_C04, kFormat_NC1HWC0_C04},
+    {kOpFormat_FRACTAL_Z_C04, kFormat_FRACTAL_Z_C04},
+    {kOpFormat_NDHWC, kFormat_NDHWC},
+  };
+  MS_LOG(INFO) << "GetGeFormat format:" << format << " shape_size:" << shape_size;
+  if (format == kOpFormat_DEFAULT) {
+    return shape_size == 4 ? kFormat_NCHW : kFormat_ND;
+  }
+  auto iter = format_map.find(format);
+  if (iter == format_map.end()) {
+    MS_LOG(EXCEPTION) << "Invalid format:" << format;
+  }
+  return iter->second;
+}
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore
+#endif  // MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_DUMP_GE_DUMP_H_
--- a/mindspore/ccsrc/device/ascend/dump/proto/ge_dtype.proto
+++ b/mindspore/ccsrc/device/ascend/dump/proto/ge_dtype.proto
@ -0,0 +1,49 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = "proto3";
+
+package ge.proto;
+
+// Element data types used in the dump descriptor's data_type fields.
+enum DataType
+{
+  DT_UNDEFINED = 0;  // Used to indicate a DataType field has not been set.
+  DT_FLOAT     = 1;  // float type
+  DT_FLOAT16   = 2;  // fp16 type
+  DT_INT8      = 3;  // int8 type
+  DT_UINT8     = 4;  // uint8 type
+  DT_INT16     = 5;  // int16 type
+  DT_UINT16    = 6;  // uint16 type
+  DT_INT32     = 7;  // int32 type
+  DT_INT64     = 8;  // int64 type
+  DT_UINT32    = 9;  // unsigned int32
+  DT_UINT64    = 10;  // unsigned int64
+  DT_BOOL      = 11;  // bool type
+  DT_DOUBLE    = 12; // double type
+  DT_STRING = 13;            // string type
+  DT_DUAL_SUB_INT8 = 14;    /**< dual output int8 type */
+  DT_DUAL_SUB_UINT8 = 15;    /**< dual output uint8 type */
+  DT_COMPLEX64 = 16;         // complex64 type
+  DT_COMPLEX128 = 17;        // complex128 type
+  DT_QINT8 = 18;             // qint8 type
+  DT_QINT16 = 19;            // qint16 type
+  DT_QINT32 = 20;            // qint32 type
+  DT_QUINT8 = 21;            // quint8 type
+  DT_QUINT16 = 22;           // quint16 type
+  DT_RESOURCE  = 23;         // resource type
+  DT_STRING_REF = 24;        // string_ref type
+  DT_DUAL      = 25;              /**< dual output type */
+}
--- a/mindspore/ccsrc/device/ascend/dump/proto/op_mapping_info.proto
+++ b/mindspore/ccsrc/device/ascend/dump/proto/op_mapping_info.proto
@ -0,0 +1,78 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = "proto3";
+package aicpu.dump;
+
+// Tensor shape as an ordered list of dimension sizes.
+message Shape {
+  repeated uint64 dim = 1;
+}
+
+// Description of one kernel output to dump.
+message Output {
+  int32 data_type = 1;  // ge.proto.DataType value
+  int32 format = 2;     // tensor layout code
+  Shape shape = 3;
+  // DataDumper sets this to the address of the output's slot inside the kernel args buffer.
+  uint64 address = 4;
+  string original_name = 5;
+  int32 original_output_index = 6;
+  int32 original_output_data_type = 7;
+  int32 original_output_format = 8;
+  uint64 size = 9;
+};
+
+// Description of one kernel input to dump.
+message Input {
+  int32 data_type = 1;  // ge.proto.DataType value
+  int32 format = 2;     // tensor layout code
+  Shape shape = 3;
+  // DataDumper sets this to the address of the input's slot inside the kernel args buffer.
+  uint64 address = 4;
+  uint64 size = 5;
+}
+
+// Identity of the operator a task belongs to.
+message Op {
+  string op_name = 1;  // DataDumper fills in the kernel's full scoped name
+  string op_type = 2;  // DataDumper fills in the CNode name
+};
+
+// One dumped task: the runtime ids of a kernel launch plus the tensors to capture for it.
+message Task {
+  uint32 task_id = 1;
+  uint32 stream_id = 2;
+  Op op = 3;
+  repeated Output output = 4;
+  bool end_graph = 5;  // DataDumper always sets this to false
+  repeated Input input = 6;
+};
+
+// Top-level dump descriptor sent to the device: where to dump, for which model, and which tasks.
+// The same message is used for loading (flag = 1) and unloading (flag = 0) dump information.
+message OpMappingInfo {
+  string dump_path = 1;
+  oneof model_name_param {
+    string model_name = 2;
+  }
+  oneof model_id_param {
+    uint32 model_id = 3;
+  }
+  // The three *_addr fields below carry device addresses of loop-control tensors.
+  oneof step_id {
+    uint64 step_id_addr = 4;
+  }
+  oneof iterations_per_loop {
+    uint64 iterations_per_loop_addr = 5;
+  }
+  oneof loop_cond {
+    uint64 loop_cond_addr = 6;
+  }
+  uint32 flag = 7; // 0x01 load, 0x00 unload
+  repeated Task task = 8;
+  string dump_step = 9;
+};
--- a/mindspore/ccsrc/device/ascend/tasksink/task_generator.cc
+++ b/mindspore/ccsrc/device/ascend/tasksink/task_generator.cc
@ -127,6 +127,7 @@ bool TaskGenerator::LaunchKernel(const CNodePtr &anf_node_ptr, uint32_t stream_i
  AddressPtrList kernel_outputs;
  auto kernel_mod = AnfAlgo::GetKernelMod(anf_node_ptr);
  MS_EXCEPTION_IF_NULL(kernel_mod);
+  kernel_mod->set_kernel_name(anf_node_ptr->fullname_with_scope());
  if (AnfAlgo::GetCNodeName(anf_node_ptr) != kAtomicAddrCleanOpName) {
    for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(anf_node_ptr); ++i) {
      auto real_input_index = AnfAlgo::GetRealInputIndex(anf_node_ptr, i);
--- a/mindspore/ccsrc/device/device_address.h
+++ b/mindspore/ccsrc/device/device_address.h
@ -34,6 +34,7 @@ class CPUKernelRuntime;
 namespace ascend {
 class AscendKernelRuntime;
 class AscendMemoryManager;
+class DataDumper;
 namespace tasksink {
 class TaskGenerator;
 }  // namespace tasksink
@ -90,6 +91,7 @@ class DeviceAddress {
  friend class mindspore::device::gpu::GPUMemoryManager;
  friend class mindspore::device::ascend::AscendKernelRuntime;
  friend class mindspore::device::ascend::AscendMemoryManager;
+  friend class mindspore::device::ascend::DataDumper;
 };

 using DeviceAddressPtr = std::shared_ptr<DeviceAddress>;
--- a/mindspore/ccsrc/device/kernel_adjust.cc
+++ b/mindspore/ccsrc/device/kernel_adjust.cc
@ -34,6 +34,7 @@
 #include "device/ascend/kernel_select_ascend.h"
 #include "runtime/base.h"
 #include "device/ascend/ascend_stream_assign.h"
+
 namespace mindspore {
 namespace device {
 using device::ascend::ProfilingUtils;
@ -117,6 +118,7 @@ void KernelAdjust::InsertSwitchLoop(const std::shared_ptr<session::KernelGraph>
  std::vector<AnfNodePtr> *mute_inputs = kernel_graph_ptr->MutableInputs();
  MS_EXCEPTION_IF_NULL(mute_inputs);
  mute_inputs->push_back(switch_loop_input[kLoopCountParamName]);
+  mute_inputs->push_back(switch_loop_input[kEpochParamName]);
  mute_inputs->push_back(switch_loop_input[kIterLoopParamName]);
  mute_inputs->push_back(switch_loop_input[kZeroParamName]);
  mute_inputs->push_back(switch_loop_input[kOneParamName]);
@ -316,6 +318,13 @@ void KernelAdjust::CreateSwitchOpParameters(const std::shared_ptr<session::Kerne
  one->set_abstract(paremeter_abstract_ptr);
  ParameterPtr one_new = kernel_graph_ptr->NewParameter(one);
  (*switch_loop_input)[kOneParamName] = one_new;
+
+  ParameterPtr epoch = std::make_shared<Parameter>(kernel_graph_ptr);
+  MS_EXCEPTION_IF_NULL(epoch);
+  epoch->set_name(kEpochParamName);
+  epoch->set_abstract(paremeter_abstract_ptr);
+  ParameterPtr epoch_new = kernel_graph_ptr->NewParameter(epoch);
+  (*switch_loop_input)[kEpochParamName] = epoch_new;
 }

 kernel::KernelBuildInfo::KernelBuildInfoBuilder KernelAdjust::CreateMngKernelBuilder(
@ -510,6 +519,14 @@ void KernelAdjust::LoadSwitchInputs(std::vector<tensor::TensorPtr> *inputs) {
  *val = 0;
  inputs->push_back(loop_count_tensor);

+  // Epoch in device
+  tensor::TensorPtr epoch_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
+  MS_EXCEPTION_IF_NULL(epoch_tensor);
+  val = static_cast<int32_t *>(epoch_tensor->data_c());
+  MS_EXCEPTION_IF_NULL(val);
+  *val = 0;
+  inputs->push_back(epoch_tensor);
+
  tensor::TensorPtr iter_loop_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  MS_EXCEPTION_IF_NULL(iter_loop_tensor);
  val = static_cast<int32_t *>(iter_loop_tensor->data_c());
@ -531,6 +548,7 @@ void KernelAdjust::LoadSwitchInputs(std::vector<tensor::TensorPtr> *inputs) {
  MS_EXCEPTION_IF_NULL(val);
  *val = 1;
  inputs->push_back(one_tensor);
+
  MS_LOG(INFO) << "---------------- LoadSwitchInputs End--";
 }

--- a/mindspore/ccsrc/device/kernel_adjust.h
+++ b/mindspore/ccsrc/device/kernel_adjust.h
@ -37,6 +37,7 @@ constexpr auto kLoopCountParamName = "loop_count";
 constexpr auto kIterLoopParamName = "iter_loop";
 constexpr auto kZeroParamName = "zero";
 constexpr auto kOneParamName = "one";
+constexpr auto kEpochParamName = "loop_epoch";
 constexpr auto kStreamNeedActivedFirst = "stream_need_active_first";
 constexpr uint32_t kSecondStreamSwitchLabel = 2;

--- a/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_mod.cc
+++ b/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_mod.cc
@ -26,6 +26,7 @@
 #include "kernel/aicpu/aicpu_kernel_build.h"
 #include "utils/convert_utils.h"
 #include "kernel/aicpu/aicpu_util.h"
+#include "utils/context/ms_context.h"

 using AicpuTaskInfoPtr = std::shared_ptr<ge::model_runner::AicpuTaskInfo>;

@ -144,8 +145,9 @@ std::vector<TaskInfoPtr> AicpuOpKernelMod::GenTask(const std::vector<AddressPtr>
  if (node_name_ == kTopK) {
    node_name_ = kTopKV2;
  }
+
  AicpuTaskInfoPtr task_info_ptr = make_shared<ge::model_runner::AicpuTaskInfo>(
-    stream_id, node_so_, node_name_, node_def_str_, input_data_addrs, output_data_addrs);
+    kernel_name_, stream_id, node_so_, node_name_, node_def_str_, input_data_addrs, output_data_addrs, NeedDump());

  MS_LOG(INFO) << "AicpuOpKernelMod GenTask end";
  return {task_info_ptr};
--- a/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_mod.cc
+++ b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_mod.cc
@ -26,6 +26,7 @@
 #include "runtime/rt.h"
 #include "utils/log_adapter.h"
 #include "utils/convert_utils.h"
+#include "utils/context/ms_context.h"

 namespace mindspore {
 namespace kernel {
@ -123,8 +124,8 @@ std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &in
  MS_LOG(DEBUG) << "The block_dim is:" << block_dim;

  TbeTaskInfoPtr task_info_ptr = make_shared<ge::model_runner::TbeTaskInfo>(
-    stream_id, stub_func, block_dim, args, args_size, sm_desc, binary, binary_size, meta_data, input_data_addrs,
-    output_data_addrs, workspace_addrs);
+    kernel_name_, stream_id, stub_func, block_dim, args, args_size, sm_desc, binary, binary_size, meta_data,
+    input_data_addrs, output_data_addrs, workspace_addrs, NeedDump());
  return {task_info_ptr};
 }
 }  // namespace kernel
--- a/mindspore/ccsrc/kernel/ascend_kernel_mod.h
+++ b/mindspore/ccsrc/kernel/ascend_kernel_mod.h
@ -21,6 +21,9 @@
 #include <memory>
 #include "framework/ge_runtime/task_info.h"
 #include "kernel/kernel.h"
+#ifdef ENABLE_DATA_DUMP
+#include "debug/data_dump_parser.h"
+#endif

 using TaskInfoPtr = std::shared_ptr<ge::model_runner::TaskInfo>;
 namespace mindspore {
@ -31,6 +34,13 @@ class AscendKernelMod : public KernelMod {
                                           const std::vector<AddressPtr> &, uint32_t) = 0;
  uint32_t block_dim() { return block_dim_; }
  uint32_t stream_id() { return stream_id_; }
+  // Reports whether this kernel is selected for data dump. Builds without ENABLE_DATA_DUMP always
+  // return false; otherwise the decision is delegated to DataDumpParser, keyed by kernel_name_.
+  virtual bool NeedDump() {
+#ifdef ENABLE_DATA_DUMP
+    return DataDumpParser::GetInstance().NeedDump(kernel_name_);
+#else
+    return false;
+#endif
+  }

 protected:
  uint32_t block_dim_{1};
--- a/mindspore/ccsrc/kernel/hccl/hccl_kernel.cc
+++ b/mindspore/ccsrc/kernel/hccl/hccl_kernel.cc
@ -18,6 +18,7 @@
 #include "device/ascend/tasksink/runtime_utils.h"
 #include "session/anf_runtime_algorithm.h"
 #include "utils/utils.h"
+#include "utils/context/ms_context.h"

 using HcclTaskInfoPtr = std::shared_ptr<ge::model_runner::HcclTaskInfo>;
 using ge::model_runner::HcclTaskInfo;
@ -146,10 +147,12 @@ std::vector<TaskInfoPtr> HcclKernel::GenTask(const std::vector<AddressPtr> &inpu
               << ", root_id=" << root_id_ << ", op_type=" << static_cast<int>(op_type_)
               << ", data_type=" << static_cast<int>(data_type);

+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
  HcclTaskInfoPtr task_info_ptr = std::make_shared<HcclTaskInfo>(
-    stream_id, hccl_type, input_data_addr, output_data_addr, workspace_address, workspace_num, 0, private_def, nullptr,
-    hccl_count_, root_id_, op_type_, data_type, group_, RuntimeUtils::HcomBindModel, RuntimeUtils::HcomUnbindModel,
-    RuntimeUtils::HcomDistribute);
+    kernel_name_, stream_id, hccl_type, input_data_addr, output_data_addr, workspace_address, workspace_num, 0,
+    private_def, nullptr, hccl_count_, root_id_, op_type_, data_type, group_, RuntimeUtils::HcomBindModel,
+    RuntimeUtils::HcomUnbindModel, RuntimeUtils::HcomDistribute, NeedDump());
  MS_EXCEPTION_IF_NULL(task_info_ptr);
  return {task_info_ptr};
 }
--- a/mindspore/ccsrc/kernel/kernel.h
+++ b/mindspore/ccsrc/kernel/kernel.h
@ -129,6 +129,10 @@ class KernelMod {
  virtual std::vector<size_t> GenParameters() { return {}; }

  virtual ~KernelMod() = default;
+  // Records the name stored in kernel_name_ for this kernel module.
+  void set_kernel_name(const std::string &kernel_name) { kernel_name_ = kernel_name; }
+
+ protected:
+  std::string kernel_name_;
 };
 using KernelModPtr = std::shared_ptr<KernelMod>;
 }  // namespace kernel
--- a/mindspore/ccsrc/kernel/rts/assign.cc
+++ b/mindspore/ccsrc/kernel/rts/assign.cc
@ -58,8 +58,9 @@ std::vector<TaskInfoPtr> AssignKernel::GenTask(const std::vector<AddressPtr> &in
  }
  stream_id_ = stream_id;

-  std::shared_ptr<MemcpyAsyncTaskInfo> task_info_ptr = std::make_shared<MemcpyAsyncTaskInfo>(
-    stream_id, inputs[0]->addr, inputs[0]->size, inputs[1]->addr, inputs[1]->size, RT_MEMCPY_DEVICE_TO_DEVICE);
+  std::shared_ptr<MemcpyAsyncTaskInfo> task_info_ptr =
+    std::make_shared<MemcpyAsyncTaskInfo>(kernel_name_, stream_id, inputs[0]->addr, inputs[0]->size, inputs[1]->addr,
+                                          inputs[1]->size, RT_MEMCPY_DEVICE_TO_DEVICE, false);
  MS_EXCEPTION_IF_NULL(task_info_ptr);
  return {task_info_ptr};
 }
--- a/mindspore/ccsrc/kernel/rts/label_goto.cc
+++ b/mindspore/ccsrc/kernel/rts/label_goto.cc
@ -55,7 +55,8 @@ std::vector<TaskInfoPtr> LabelGotoKernel::GenTask(const std::vector<AddressPtr>
                                                  const std::vector<AddressPtr> &, uint32_t stream_id) {
  MS_LOG(INFO) << "LabelGotoKernel GenTask label:" << label_ << ", stream id:" << stream_id;
  std::vector<TaskInfoPtr> task_info_list;
-  std::shared_ptr<LabelGotoTaskInfo> task_info_ptr = std::make_shared<LabelGotoTaskInfo>(stream_id, label_);
+  std::shared_ptr<LabelGotoTaskInfo> task_info_ptr =
+    std::make_shared<LabelGotoTaskInfo>(kernel_name_, stream_id, label_);
  MS_EXCEPTION_IF_NULL(task_info_ptr);
  task_info_list.emplace_back(task_info_ptr);
  return task_info_list;
--- a/mindspore/ccsrc/kernel/rts/label_set.cc
+++ b/mindspore/ccsrc/kernel/rts/label_set.cc
@ -55,7 +55,7 @@ std::vector<TaskInfoPtr> LabelSetKernel::GenTask(const std::vector<AddressPtr> &
                                                 const std::vector<AddressPtr> &, uint32_t stream_id) {
  MS_LOG(INFO) << "LabelSetKernel GenTask label:" << label_ << ", stream id:" << stream_id;
  std::vector<TaskInfoPtr> task_info_list;
-  std::shared_ptr<LabelSetTaskInfo> task_info_ptr = std::make_shared<LabelSetTaskInfo>(stream_id, label_);
+  std::shared_ptr<LabelSetTaskInfo> task_info_ptr = std::make_shared<LabelSetTaskInfo>(kernel_name_, stream_id, label_);
  MS_EXCEPTION_IF_NULL(task_info_ptr);
  task_info_list.emplace_back(task_info_ptr);
  return task_info_list;
--- a/mindspore/ccsrc/kernel/rts/label_switch.cc
+++ b/mindspore/ccsrc/kernel/rts/label_switch.cc
@ -67,7 +67,7 @@ std::vector<TaskInfoPtr> LabelSwitchKernel::GenTask(const std::vector<AddressPtr
  MS_LOG(INFO) << "LabelSwitchKernel GenTask label size:" << label_size_ << ", stream id:" << stream_id;
  std::vector<TaskInfoPtr> task_info_list;
  cond_ = inputs[0]->addr;
-  auto task_info_ptr = std::make_shared<LabelSwitchTaskInfo>(stream_id, label_size_, label_list_, cond_);
+  auto task_info_ptr = std::make_shared<LabelSwitchTaskInfo>(kernel_name_, stream_id, label_size_, label_list_, cond_);
  MS_EXCEPTION_IF_NULL(task_info_ptr);
  task_info_list.emplace_back(task_info_ptr);
  return task_info_list;
--- a/mindspore/ccsrc/kernel/rts/memcpy_async.cc
+++ b/mindspore/ccsrc/kernel/rts/memcpy_async.cc
@ -23,6 +23,7 @@
 #include "common/utils.h"
 #include "session/anf_runtime_algorithm.h"
 #include "common/trans.h"
+#include "utils/context/ms_context.h"

 using ge::model_runner::MemcpyAsyncTaskInfo;
 using MemcpyAsyncTaskInfoPtr = std::shared_ptr<MemcpyAsyncTaskInfo>;
@ -118,8 +119,9 @@ std::vector<TaskInfoPtr> MemCpyAsyncKernel::GenTask(const std::vector<AddressPtr
  }

  stream_id_ = stream_id;
-  std::shared_ptr<MemcpyAsyncTaskInfo> task_info_ptr = std::make_shared<MemcpyAsyncTaskInfo>(
-    stream_id, outputs[0]->addr, outputs[0]->size, inputs[0]->addr, inputs[0]->size, RT_MEMCPY_DEVICE_TO_DEVICE);
+  std::shared_ptr<MemcpyAsyncTaskInfo> task_info_ptr =
+    std::make_shared<MemcpyAsyncTaskInfo>(kernel_name_, stream_id, outputs[0]->addr, outputs[0]->size, inputs[0]->addr,
+                                          inputs[0]->size, RT_MEMCPY_DEVICE_TO_DEVICE, NeedDump());
  MS_EXCEPTION_IF_NULL(task_info_ptr);
  return {task_info_ptr};
 }
--- a/mindspore/ccsrc/kernel/rts/profiling_kernel_mod.cc
+++ b/mindspore/ccsrc/kernel/rts/profiling_kernel_mod.cc
@ -63,7 +63,7 @@ std::vector<TaskInfoPtr> ProfilingKernelMod::GenTask(const std::vector<AddressPt
               << ", outputs size:" << outputs.size();
  stream_id_ = stream_id;
  std::shared_ptr<ProfilerTraceTaskInfo> task_info_ptr =
-    std::make_shared<ProfilerTraceTaskInfo>(stream_id, log_id_, notify_, flags_);
+    std::make_shared<ProfilerTraceTaskInfo>(kernel_name_, stream_id, log_id_, notify_, flags_);
  return {task_info_ptr};
 }
 }  // namespace kernel
--- a/mindspore/ccsrc/kernel/rts/recv.cc
+++ b/mindspore/ccsrc/kernel/rts/recv.cc
@ -60,7 +60,7 @@ std::vector<TaskInfoPtr> RecvKernel::GenTask(const std::vector<AddressPtr> &, co
                                             const std::vector<AddressPtr> &, uint32_t stream_id) {
  MS_LOG(INFO) << "RecvKernel GenTask event_id_:" << event_id_ << ", stream_id_:" << stream_id;
  stream_id_ = stream_id;
-  EventWaitTaskInfoPtr task_info_ptr = std::make_shared<EventWaitTaskInfo>(stream_id, event_id_);
+  EventWaitTaskInfoPtr task_info_ptr = std::make_shared<EventWaitTaskInfo>(kernel_name_, stream_id, event_id_);
  MS_EXCEPTION_IF_NULL(task_info_ptr);
  return {task_info_ptr};
 }
--- a/mindspore/ccsrc/kernel/rts/send.cc
+++ b/mindspore/ccsrc/kernel/rts/send.cc
@ -57,7 +57,7 @@ std::vector<TaskInfoPtr> SendKernel::GenTask(const std::vector<AddressPtr> &, co
                                             const std::vector<AddressPtr> &, uint32_t stream_id) {
  MS_LOG(INFO) << "SendKernel GenTask event id:" << event_id_ << ", stream id:" << stream_id;
  stream_id_ = stream_id;
-  EventRecordTaskInfoPtr task_info_ptr = std::make_shared<EventRecordTaskInfo>(stream_id, event_id_);
+  EventRecordTaskInfoPtr task_info_ptr = std::make_shared<EventRecordTaskInfo>(kernel_name_, stream_id, event_id_);
  MS_EXCEPTION_IF_NULL(task_info_ptr);
  return {task_info_ptr};
 }
--- a/mindspore/ccsrc/kernel/rts/stream_active.cc
+++ b/mindspore/ccsrc/kernel/rts/stream_active.cc
@ -72,7 +72,8 @@ std::vector<TaskInfoPtr> StreamActiveKernel::GenTask(const std::vector<AddressPt
  stream_id_ = stream_id;
  std::vector<TaskInfoPtr> task_info_list;
  for (auto &index : active_streams_index_) {
-    std::shared_ptr<StreamActiveTaskInfo> task_info_ptr = std::make_shared<StreamActiveTaskInfo>(stream_id, index);
+    std::shared_ptr<StreamActiveTaskInfo> task_info_ptr =
+      std::make_shared<StreamActiveTaskInfo>(kernel_name_, stream_id, index);
    MS_EXCEPTION_IF_NULL(task_info_ptr);
    task_info_list.emplace_back(task_info_ptr);
    MS_LOG(INFO) << "StreamActiveKernel GenTask: streamId:" << stream_id << ", Active streamId:" << index;
--- a/mindspore/ccsrc/kernel/rts/stream_switch.cc
+++ b/mindspore/ccsrc/kernel/rts/stream_switch.cc
@ -91,8 +91,8 @@ std::vector<TaskInfoPtr> StreamSwitchKernel::GenTask(const std::vector<AddressPt
  auto ites_per_loop = inputs[1]->addr;
  MS_LOG(INFO) << "cond_:" << static_cast<int>(cond_) << ", true_stream_index_:" << true_stream_index_
               << ", stream_id:" << stream_id;
-  std::shared_ptr<StreamSwitchTaskInfo> task_info_ptr =
-    std::make_shared<StreamSwitchTaskInfo>(stream_id, true_stream_index_, loop_cnt, ites_per_loop, cond_, data_type_);
+  std::shared_ptr<StreamSwitchTaskInfo> task_info_ptr = std::make_shared<StreamSwitchTaskInfo>(
+    kernel_name_, stream_id, true_stream_index_, loop_cnt, ites_per_loop, cond_, data_type_);
  MS_EXCEPTION_IF_NULL(task_info_ptr);
  return {task_info_ptr};
 }
--- a/mindspore/ccsrc/kernel/tbe/tbe_kernel_mod.cc
+++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_mod.cc
@ -17,7 +17,7 @@
 #include "kernel/tbe/tbe_kernel_mod.h"
 #include <algorithm>
 #include "runtime/rt.h"
-#include "nlohmann/json.hpp"
+#include "utils/context/ms_context.h"
 #include "graphengine/inc/framework/ge_runtime/task_info.h"

 namespace mindspore {
@ -99,9 +99,9 @@ std::vector<TaskInfoPtr> TbeKernelMod::GenTask(const std::vector<AddressPtr> &in

  MS_LOG(INFO) << "block_dim is:" << block_dim_;

-  TbeTaskInfoPtr task_info_ptr =
-    make_shared<ge::model_runner::TbeTaskInfo>(stream_id, stub_func, block_dim_, args, 0, sm_desc, nullptr, 0,
-                                               meta_data, input_data_addrs, output_data_addrs, workspace_addrs);
+  TbeTaskInfoPtr task_info_ptr = make_shared<ge::model_runner::TbeTaskInfo>(
+    kernel_name_, stream_id, stub_func, block_dim_, args, 0, sm_desc, nullptr, 0, meta_data, input_data_addrs,
+    output_data_addrs, workspace_addrs, NeedDump());
  return {task_info_ptr};
 }

--- a/mindspore/ccsrc/session/kernel_graph.h
+++ b/mindspore/ccsrc/session/kernel_graph.h
@ -36,7 +36,7 @@ namespace session {
 using AnfWithOutIndex = std::pair<AnfNodePtr, size_t>;
 class KernelGraph : public FuncGraph {
 public:
-  KernelGraph() : graph_id_(0), start_label_(nullptr), end_goto_(nullptr), null_output_(false) {
+  KernelGraph() : graph_id_(0), start_label_(nullptr), end_goto_(nullptr), null_output_(false), current_epoch_(0) {
    inputs_ = std::make_shared<std::vector<AnfNodePtr>>();
    execution_order_ = {};
    executable_ = true;
@ -154,6 +154,8 @@ class KernelGraph : public FuncGraph {
  AnfNodePtr GetFrontNodeByInternalOutput(const AnfNodePtr &node) const;
  void AddFinalOutputKernel(const AnfNodePtr &node);
  bool IsFinalOutputKernel(const AnfNodePtr &node) const;
+  uint32_t current_epoch() const { return current_epoch_; }  // Epoch counter; the constructor starts it at 0.
+  void set_current_epoch(uint32_t epoch) { current_epoch_ = epoch; }  // LoadCtrlInputTensor stores current + 1.

 private:
  // remove value node form graph
@ -216,6 +218,7 @@ class KernelGraph : public FuncGraph {
  std::unordered_map<AnfNodePtr, AnfNodePtr> front_to_internal_outputs_map_;
  std::unordered_map<AnfNodePtr, AnfNodePtr> internal_outputs_to_front_map_;
  std::set<AnfNodePtr> final_output_kernels_;
+  uint32_t current_epoch_;
 };
 }  // namespace session
 using KernelGraphPtr = std::shared_ptr<session::KernelGraph>;
--- a/mindspore/ccsrc/session/session_basic.cc
+++ b/mindspore/ccsrc/session/session_basic.cc
@ -187,6 +187,18 @@ size_t LoadCtrlInputTensor(const std::shared_ptr<KernelGraph> &graph, std::vecto
  // set loop_count to zero
  MS_EXCEPTION_IF_NULL(inputs);
  inputs->push_back(tensor);
+
+  auto epoch_tensor = (*inputs_params)[1];
+  MS_EXCEPTION_IF_NULL(epoch_tensor);
+  auto *epoch_val = static_cast<int32_t *>(epoch_tensor->data_c());
+  MS_EXCEPTION_IF_NULL(epoch_val);
+  *epoch_val = graph->current_epoch();
+  epoch_tensor->set_dirty(true);
+  inputs->push_back(epoch_tensor);
+  MS_LOG(INFO) << "Load epoch_val:" << *epoch_val;
+
+  graph->set_current_epoch(graph->current_epoch() + 1);
+
  return inputs_params->size();
 }

@ -814,13 +826,13 @@ void SessionBasic::AddParameterToGraphInputs(const std::vector<AnfNodePtr> &para
 void SessionBasic::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
                                 const std::vector<tensor::TensorPtr> &inputs_const) const {
  std::vector<tensor::TensorPtr> inputs(inputs_const);
-  size_t input_ctrl_size = 1;
+  size_t input_ctrl_size = 2;
  MS_EXCEPTION_IF_NULL(kernel_graph);
  if (kernel_graph->input_ctrl_tensors()) {
    input_ctrl_size = LoadCtrlInputTensor(kernel_graph, &inputs);
  }
  auto input_nodes = kernel_graph->inputs();
-  if ((inputs.size() + input_ctrl_size) - 1 != input_nodes.size()) {
+  if ((inputs.size() + input_ctrl_size) - 2 != input_nodes.size()) {
    MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " is not equal graph inputs:" << input_nodes.size()
                      << ", input_ctrl_size:" << input_ctrl_size;
  }
--- a/tests/ut/cpp/stub/ge/ge_task_launch_stub.cc
+++ b/tests/ut/cpp/stub/ge/ge_task_launch_stub.cc
@ -32,6 +32,8 @@ bool ModelRunner::LoadDavinciModel(uint32_t device_id, uint64_t session_id, uint

 bool ModelRunner::UnloadModel(uint32_t model_id) { return true; }

+bool ModelRunner::LoadModelComplete(uint32_t model_id) { return true; }  // Test stub: always returns true.
+
 bool ModelRunner::RunModel(uint32_t model_id, const ge::InputData &input_data, ge::OutputData *output_data) {
  return true;
 }
@ -45,6 +47,11 @@ const std::vector<uint32_t> &ModelRunner::GetStreamIdList(uint32_t model_id) con
  static std::vector<uint32_t> stream_id_list;
  return stream_id_list;
 }
+
+const std::map<std::string, std::shared_ptr<RuntimeInfo>> &ModelRunner::GetRuntimeInfoMap(uint32_t model_id) const {
+  static std::map<std::string, std::shared_ptr<RuntimeInfo>> runtime_info_map;  // Test stub: never filled here.
+  return runtime_info_map;  // Function-local static, so the returned reference outlives the call.
+}
 }  // namespace model_runner
 }  // namespace ge

--- a/tests/ut/cpp/stub/tasksink/ascend_stream_assign_stub.cc
+++ b/tests/ut/cpp/stub/tasksink/ascend_stream_assign_stub.cc
@ -15,7 +15,6 @@
 */
 #include "device/ascend/ascend_stream_assign.h"
 #include "device/ascend/ascend_label_assign.h"
-#include "device/ascend/tasksink/task_generator.h"
 #include "device/kernel_adjust.h"

 namespace mindspore {
@ -31,13 +30,6 @@ void AscendStreamAssign::AssignStream(const NotNull<KernelGraphPtr> &graph_ptr)
 void AscendStreamAssign::GetWaitStreams(vector<uint32_t> *wait_active_stream_list) { return; }

 void AscendStreamAssign::GetHcomStreams(std::vector<uint32_t> *streams) { return; }
-
-namespace tasksink {
-bool TaskGenerator::GenTasks(const std::vector<CNodePtr> &anf_node_list, std::vector<TaskInfoPtr> *const task_info_list,
-                             uint32_t graph_id) {
-  return true;
-}
-}  // namespace tasksink
 }  // namespace ascend
 void KernelAdjust::InsertSwitchLoop(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) { return; }
 bool KernelAdjust::StepLoadCtrlInputs(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) { return true; }
--- a/tests/ut/cpp/stub/tasksink/task_sink_stub.cc
+++ b/tests/ut/cpp/stub/tasksink/task_sink_stub.cc
@ -0,0 +1,30 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/ascend/tasksink/task_generator.h"
+
+namespace mindspore {
+namespace device {
+namespace ascend {
+namespace tasksink {
+bool TaskGenerator::GenTasks(const std::vector<CNodePtr> &anf_node_list, std::vector<TaskInfoPtr> *const task_info_list,
+                             uint32_t graph_id) {
+  return true;  // Test stub: ignores all arguments, leaves task_info_list untouched, reports success.
+}
+}  // namespace tasksink
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore