!31482 Fix loading tensors into memory twice for Ascend kernel-by-kernel dump

Merge pull request !31482 from TinaMengtingZhang/kernel_dump
i-robot 2022-03-26 01:53:33 +00:00 committed by Gitee
commit d40dc4f997
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
18 changed files with 152 additions and 148 deletions

View File

@ -263,6 +263,7 @@ bool DumpJsonParser::DumpToFile(const std::string &filename, const void *data, s
return false;
}
const std::string file_path_str = file_path.value();
MS_LOG(INFO) << "Dump path is " << file_path_str;
ChangeFileMode(file_path_str, S_IWUSR);
std::ofstream fd(file_path_str, std::ios::out | std::ios::trunc | std::ios::binary);
if (!fd.is_open()) {

View File

@ -118,23 +118,23 @@ bool E2eDump::IsDeviceTargetGPU() {
return context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice;
}
bool E2eDump::IsMindRTKernelByKernel() {
return IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag();
}
/*
* Feature group: Dump.
* Target device group: GPU.
* Target device group: GPU, Ascend.
* Runtime category: Old runtime, MindRT.
* Description: This function is for dumping tensor in memory to disk in GPU machine.
* Description: This function dumps to disk a tensor that has already been loaded into tensor_loader, for GPU and Ascend machines.
*/
void E2eDump::DumpGPUMemToFile(const Debugger *debugger, const std::string &file_path, bool trans_flag,
const device::DeviceAddress &addr, const std::string &original_kernel_name, size_t slot,
const ShapeVector &int_shapes, const TypeId &host_type) {
void E2eDump::DumpMemFromTensorLoaderToFile(const Debugger *debugger, const std::string &file_path,
const std::string &original_kernel_name, size_t slot) {
#ifdef ENABLE_DEBUGGER
auto format = kOpFormat_DEFAULT;
MS_EXCEPTION_IF_NULL(debugger);
auto ret = debugger->DumpTensorToFile(file_path, trans_flag, format, addr.format(), original_kernel_name, slot,
int_shapes, host_type);
auto ret = debugger->DumpTensorToFile(file_path, original_kernel_name, slot);
if (!ret) {
MS_LOG(INFO) << "DumpTensorToFile Failed: flag:" << trans_flag << ", path:" << file_path
<< ", host_format:" << format;
MS_LOG(INFO) << "DumpTensorToFile Failed: path:" << file_path;
}
#endif
}
@ -184,6 +184,7 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s
continue;
}
auto addr = AnfAlgo::GetOutputAddr(node, j);
std::string node_name = GetKernelNodeName(node);
MS_EXCEPTION_IF_NULL(addr);
ShapeVector int_shapes;
GetDumpIntShape(node, j, NOT_NULL(&int_shapes), trans_flag);
@ -196,14 +197,13 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s
std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' +
std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output." +
std::to_string(j);
if (DumpJsonParser::GetInstance().IsStatisticDump() &&
(IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag())) {
if (DumpJsonParser::GetInstance().IsStatisticDump() && IsMindRTKernelByKernel()) {
TensorStatDump stat_dump(op_type, op_name, task_id, stream_id, timestamp, false, j, j);
(void)stat_dump.DumpTensorStatsToFile(GetKernelNodeName(node), dump_path, debugger);
(void)stat_dump.DumpTensorStatsToFile(node_name, dump_path, debugger);
}
if (DumpJsonParser::GetInstance().IsTensorDump()) {
if (IsDeviceTargetGPU()) {
DumpGPUMemToFile(debugger, file_path, trans_flag, *addr, GetKernelNodeName(node), j, int_shapes, type);
if (IsMindRTKernelByKernel()) {
DumpMemFromTensorLoaderToFile(debugger, file_path, node_name, j);
} else {
DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
}
@ -213,10 +213,8 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s
void E2eDump::DumpOutputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name) {
auto debugger = Debugger::GetInstance();
MS_EXCEPTION_IF_NULL(debugger);
if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) {
MS_LOG(INFO) << "DumpInputData is only for graph mode on Ascend";
if (IsMindRTKernelByKernel()) {
MS_LOG(INFO) << "DumpOutputData is only for graph mode on Ascend";
return;
}
MS_EXCEPTION_IF_NULL(node);
@ -256,8 +254,7 @@ void E2eDump::DumpInput(const session::KernelGraph *graph, const std::string &du
}
}
void E2eDump::DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger,
const KernelLaunchInfo *launch_info) {
void E2eDump::DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger) {
auto &dump_json_parser = DumpJsonParser::GetInstance();
if (!dump_json_parser.InputNeedDump()) {
return;
@ -269,25 +266,11 @@ void E2eDump::DumpInputSingleNode(const CNodePtr &node, const std::string &dump_
return;
}
DumpJsonParser::GetInstance().MatchKernel(kernel_name);
DumpInputImpl(node, trans_flag, dump_path, &kernel_name, debugger, launch_info);
}
std::shared_ptr<device::DeviceAddress> CreateAscendDeviceAddress(const KernelLaunchInfo *launch_info, size_t index,
TypeId type) {
MS_EXCEPTION_IF_NULL(launch_info);
auto addr_ptr = launch_info->inputs_[index];
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
auto device_context =
device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({kAscendDevice, device_id});
auto format = kOpFormat_DEFAULT;
MS_EXCEPTION_IF_NULL(addr_ptr);
return device_context->CreateDeviceAddress(addr_ptr->addr, addr_ptr->size, format, type, ShapeVector());
DumpInputImpl(node, trans_flag, dump_path, &kernel_name, debugger);
}
void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name, const Debugger *debugger, const KernelLaunchInfo *launch_info) {
std::string *kernel_name, const Debugger *debugger) {
MS_EXCEPTION_IF_NULL(node);
GetFileKernelName(NOT_NULL(kernel_name));
auto input_size = common::AnfAlgo::GetInputTensorNum(node);
@ -298,12 +281,12 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
if (!AnfAlgo::OutputAddrExist(input, index)) {
continue;
}
std::string tensor_name = GetKernelNodeName(node);
std::string node_name = GetKernelNodeName(node);
size_t slot = j;
if (IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
if (IsMindRTKernelByKernel()) {
auto input_kernel = node->input(j + 1);
std::string input_kernel_name = GetKernelNodeName(input_kernel);
tensor_name = input_kernel_name;
node_name = input_kernel_name;
slot = 0;
}
ShapeVector int_shapes;
@ -318,18 +301,13 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".input." + std::to_string(j);
auto addr = AnfAlgo::GetOutputAddr(input, index);
MS_EXCEPTION_IF_NULL(addr);
if (DumpJsonParser::GetInstance().IsStatisticDump() &&
(IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag())) {
if (DumpJsonParser::GetInstance().IsStatisticDump() && IsMindRTKernelByKernel()) {
TensorStatDump stat_dump(op_type, op_name, task_id, stream_id, timestamp, true, j, slot);
(void)stat_dump.DumpTensorStatsToFile(tensor_name, dump_path, debugger);
(void)stat_dump.DumpTensorStatsToFile(node_name, dump_path, debugger);
}
if (DumpJsonParser::GetInstance().IsTensorDump()) {
if (IsDeviceTargetGPU()) {
DumpGPUMemToFile(debugger, file_path, trans_flag, *addr, tensor_name, slot, int_shapes, type);
} else if (Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
// load address from launch_info when it's Ascend Kernel by kernel mode.
auto ascend_device_addr = CreateAscendDeviceAddress(launch_info, j, type);
DumpMemToFile(file_path, *ascend_device_addr, int_shapes, type, trans_flag);
if (IsMindRTKernelByKernel()) {
DumpMemFromTensorLoaderToFile(debugger, file_path, node_name, slot);
} else {
DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
}
@ -339,9 +317,7 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
void E2eDump::DumpInputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name) {
auto debugger = Debugger::GetInstance();
MS_EXCEPTION_IF_NULL(debugger);
if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) {
if (IsMindRTKernelByKernel()) {
MS_LOG(INFO) << "DumpInputData is only for graph mode on Ascend";
return;
}
@ -409,7 +385,7 @@ void E2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_
(void)stat_dump.DumpTensorStatsToFile(node_name, dump_path, debugger);
}
if (dump_json_parser.IsTensorDump()) {
DumpGPUMemToFile(debugger, file_path, trans_flag, *addr, node_name, 0, int_shapes, type);
DumpMemFromTensorLoaderToFile(debugger, file_path, node_name, 0);
}
} else {
DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
@ -452,7 +428,7 @@ void E2eDump::DumpSingleParameterNode(const AnfNodePtr &anf_node, const std::str
(void)stat_dump.DumpTensorStatsToFile(node_name, dump_path, debugger);
}
if (dump_json_parser.IsTensorDump()) {
DumpGPUMemToFile(debugger, file_path, trans_flag, *addr, node_name, 0, int_shapes, type);
DumpMemFromTensorLoaderToFile(debugger, file_path, node_name, 0);
}
} else {
DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
@ -662,13 +638,12 @@ void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons
* Runtime category: MindRT.
* Description: This function is for dumping a single node. It is used for MindRT in GPU and Ascend kernel-by-kernel mode.
*/
bool E2eDump::DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id, const Debugger *debugger,
const KernelLaunchInfo *launch_info) {
bool E2eDump::DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id, const Debugger *debugger) {
bool success = false;
auto &dump_json_parser = DumpJsonParser::GetInstance();
if (dump_json_parser.DumpEnabledForIter()) {
std::string dump_path = GenerateDumpPath(graph_id, rank_id);
DumpInputSingleNode(node, dump_path, debugger, launch_info);
DumpInputSingleNode(node, dump_path, debugger);
DumpOutputSingleNode(node, dump_path, debugger);
success = true;
}
@ -761,9 +736,10 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum
if (dump_tensor_vec.empty()) {
return;
}
// The maximum tensor size allowed for single-thread format conversion is 1 MB.
constexpr int kMaxTensorSize = 1048576;
if (offset <= kMaxTensorSize) {
// If the total tensor size is less than 1Mb, do it in single thread.
// If the total tensor size is less than 1MB, do it in single thread.
ConvertFormatForTensors(&dump_tensor_vec, 0, dump_tensor_vec.size() - 1);
} else {
// In the multi-thread process, we only use 1/4 of the total concurrent threads.
@ -775,7 +751,7 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum
std::vector<std::thread> threads;
threads.reserve(num_threads);
MS_LOG(INFO) << "Number of threads used for A+M dump: " << num_threads;
for (size_t t = 0; t < threads.capacity(); t++) {
for (size_t t = 0; t < num_threads; t++) {
uint32_t start_idx = t * task_size;
uint32_t end_idx = start_idx + task_size - 1;
if (t == num_threads - 1) {
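A minimal, self-contained sketch of the chunking scheme in the hunk above, with toy stand-ins (items, ProcessRange) in place of dump_tensor_vec and ConvertFormatForTensors: each thread takes task_size entries and the last thread absorbs the remainder.

#include <cstdint>
#include <thread>
#include <vector>

// Stand-in for ConvertFormatForTensors: process items[start..end] inclusive.
void ProcessRange(const std::vector<int> *items, uint32_t start, uint32_t end) {
  for (uint32_t i = start; i <= end; i++) {
    (void)(*items)[i];
  }
}

int main() {
  std::vector<int> items(10);      // stand-in for dump_tensor_vec
  const uint32_t num_threads = 4;  // stand-in for 1/4 of the hardware threads
  const uint32_t task_size = static_cast<uint32_t>(items.size()) / num_threads;
  std::vector<std::thread> threads;
  threads.reserve(num_threads);
  for (uint32_t t = 0; t < num_threads; t++) {
    uint32_t start_idx = t * task_size;
    uint32_t end_idx = start_idx + task_size - 1;
    if (t == num_threads - 1) {
      end_idx = static_cast<uint32_t>(items.size()) - 1;  // last thread absorbs the remainder
    }
    threads.emplace_back(ProcessRange, &items, start_idx, end_idx);
  }
  for (auto &th : threads) {
    th.join();
  }
  return 0;
}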

View File

@ -32,7 +32,6 @@
#endif
#include "include/backend/visible.h"
using mindspore::kernel::KernelLaunchInfo;
#ifndef ENABLE_DEBUGGER
class Debugger;
#endif
@ -71,12 +70,11 @@ class E2eDump {
static void DumpParametersData(uint32_t rank_id, const Debugger *debugger);
static bool DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id,
const Debugger *debugger = nullptr, const KernelLaunchInfo *launch_info = nullptr);
const Debugger *debugger = nullptr);
// Dump data when task error.
static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name, const Debugger *debugger,
const KernelLaunchInfo *launch_info = nullptr);
std::string *kernel_name, const Debugger *debugger);
static void DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name, const Debugger *debugger);
@ -93,6 +91,10 @@ class E2eDump {
char *data_ptr);
#endif
static bool IsDeviceTargetGPU();
static bool IsMindRTKernelByKernel();
private:
static void DumpOutput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger);
@ -100,15 +102,13 @@ class E2eDump {
static void DumpInput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger);
static void DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger,
const KernelLaunchInfo *launch_info = nullptr);
static void DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger);
static void DumpParameters(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger);
static void DumpGPUMemToFile(const Debugger *debugger, const std::string &file_path, bool trans_flag,
const device::DeviceAddress &addr, const std::string &original_kernel_name, size_t slot,
const ShapeVector &int_shapes, const TypeId &host_type);
static bool IsDeviceTargetGPU();
static void DumpMemFromTensorLoaderToFile(const Debugger *debugger, const std::string &file_path,
const std::string &original_kernel_name, size_t slot);
static void DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_index, const std::string &dump_path,
bool trans_flag, const Debugger *debugger);

View File

@ -33,11 +33,6 @@ constexpr auto kCsvFileName = "statistic.csv";
} // namespace
namespace mindspore {
const std::map<DbgDataType, std::string> kDbgDataTypeToStringMap = {
{DT_BOOL, "bool"}, {DT_INT8, "int8"}, {DT_INT16, "int16"}, {DT_INT32, "int32"},
{DT_INT64, "int64"}, {DT_UINT8, "uint8"}, {DT_UINT16, "uint16"}, {DT_UINT32, "uint32"},
{DT_UINT64, "uint64"}, {DT_FLOAT16, "float16"}, {DT_FLOAT32, "float32"}, {DT_FLOAT64, "float64"}};
bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
if (file_.is_open() && path == file_path_str_) {
return true;
@ -162,13 +157,10 @@ bool TensorStatDump::DumpTensorStatsToFile(const std::string &dump_path, const s
MS_LOG(INFO) << "Tensor data is empty, skipping current statistics";
return false;
}
std::string type;
auto iter_type = kDbgDataTypeToStringMap.find(data->GetType());
if (iter_type == kDbgDataTypeToStringMap.end()) {
std::string type = data->GetTypeString();
if (type.empty()) {
type = "unsupported(" + std::to_string(data->GetType()) + ")";
MS_LOG(INFO) << "Unsupported tensor data_type " << type << " for tensor " << data->GetName();
} else {
type = iter_type->second;
}
if (!OpenStatisticsFile(dump_path)) {
return false;

View File

@ -1836,11 +1836,8 @@ std::shared_ptr<TensorData> DebugServices::GetTensor(const std::string &tensor_n
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
#ifdef ONLINE_DBG_MODE
bool DebugServices::DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt,
const std::string &addr_format, const std::string &tensor_name, size_t slot,
const std::vector<int64_t> &host_shape, TypeId host_type) const {
return tensor_loader_->DumpTensorToFile(filepath, trans_flag, host_fmt, addr_format, tensor_name, slot, host_shape,
host_type);
bool DebugServices::DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const {
return tensor_loader_->DumpTensorToFile(filepath, tensor_name, slot);
}
#endif

View File

@ -461,9 +461,7 @@ class DebugServices {
void EmptyCurrentTensor();
#ifdef ONLINE_DBG_MODE
bool DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt,
const std::string &addr_format, const std::string &tensor_name, size_t slot,
const std::vector<int64_t> &host_shape, TypeId host_type) const;
bool DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const;
#endif
bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);

View File

@ -527,10 +527,10 @@ void Debugger::DumpConstantDataAscend(const KernelGraphPtr &graph) {
* Runtime category: MindRT.
* Description: Dumps a single node for given graph_id.
*/
void Debugger::DumpSingleNode(const CNodePtr &node, uint32_t graph_id, const KernelLaunchInfo *launch_info) {
void Debugger::DumpSingleNode(const CNodePtr &node, uint32_t graph_id) {
if (debugger_ && debugger_->DebuggerBackendEnabled()) {
uint32_t rank_id = GetRankID();
(void)E2eDump::DumpSingleNodeData(node, graph_id, rank_id, debugger_.get(), launch_info);
(void)E2eDump::DumpSingleNodeData(node, graph_id, rank_id, debugger_.get());
}
}
@ -1335,11 +1335,8 @@ void Debugger::SendWatchpoints(const std::list<WatchpointHit> &points) {
}
}
bool Debugger::DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt,
const std::string &addr_format, const std::string &tensor_name, size_t slot,
const std::vector<int64_t> &host_shape, TypeId host_type) const {
return debug_services_.get()->DumpTensorToFile(filepath, trans_flag, host_fmt, addr_format, tensor_name, slot,
host_shape, host_type);
bool Debugger::DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const {
return debug_services_.get()->DumpTensorToFile(filepath, tensor_name, slot);
}
bool Debugger::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
@ -1541,7 +1538,8 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
} else {
keep_prev = false;
}
bool ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev, root_graph_id, false);
bool ret =
addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev, root_graph_id, false, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
@ -1572,7 +1570,7 @@ void Debugger::LoadSingleParameterMindRT(const AnfNodePtr &node) {
}
// Keep_prev is True for parameters.
// force update for parameters.
bool ret = device_addr->LoadMemToHost(tensor_name, 0, format, int_shapes, type, 0, true, root_graph_id, true);
bool ret = device_addr->LoadMemToHost(tensor_name, 0, format, int_shapes, type, 0, true, root_graph_id, true, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
@ -1702,7 +1700,8 @@ void Debugger::LoadGraphOutputs() {
auto format = kOpFormat_DEFAULT;
string tensor_name = kernel_name + ':' + std::to_string(j);
ShapeVector int_shapes = trans::GetRuntimePaddingShape(node, j);
auto ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, root_graph_id, false);
auto ret =
addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, root_graph_id, false, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";

View File

@ -107,7 +107,7 @@ class BACKEND_EXPORT Debugger : public std::enable_shared_from_this<Debugger> {
void DumpConstantDataAscend(const KernelGraphPtr &graph);
void DumpSingleNode(const CNodePtr &node, uint32_t graph_id, const KernelLaunchInfo *launch_info = nullptr);
void DumpSingleNode(const CNodePtr &node, uint32_t graph_id);
void DumpInGraphCompiler(const KernelGraphPtr &kernel_graph);
@ -117,9 +117,7 @@ class BACKEND_EXPORT Debugger : public std::enable_shared_from_this<Debugger> {
void PostExecuteNode(const CNodePtr &kernel, bool last_kernel);
bool DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt,
const std::string &addr_format, const std::string &tensor_name, size_t slot,
const std::vector<int64_t> &host_shape, TypeId host_type) const;
bool DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const;
bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);

View File

@ -66,12 +66,12 @@ std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &
/*
* Feature group: Dump, Online debugger.
* Target device group: GPU.
* Target device group: GPU, Ascend.
* Runtime category: MindRT.
* Description: Get kernel inputs from launch_info and load the inputs from device to host.
*/
void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order, uint32_t root_graph_id,
const DeviceContext *device_context) {
const DeviceContext *device_context, const bool trans_flag) {
// get inputs
auto kernel_inputs = launch_info->inputs_;
auto input_size = common::AnfAlgo::GetInputTensorNum(cnode);
@ -79,33 +79,40 @@ void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint
auto input_kernel = cnode->input(j + 1);
std::string input_kernel_name = GetKernelNodeName(input_kernel);
auto addr = kernel_inputs[j];
auto type = common::AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
auto device_type = AnfAlgo::GetOutputDeviceDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
auto host_type = common::AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
auto type = trans_flag ? host_type : device_type;
// For example, this happens with the Depend op
if (type == kMetaTypeNone) {
continue;
}
auto format = kOpFormat_DEFAULT;
auto device_addr = device_context->CreateDeviceAddress(addr->addr, addr->size, format, type, ShapeVector());
auto host_format = kOpFormat_DEFAULT;
auto device_format =
E2eDump::IsDeviceTargetGPU() ? kOpFormat_DEFAULT : AnfAlgo::GetOutputFormat(input_kernel, PARAMETER_OUTPUT_INDEX);
auto device_addr =
device_context->CreateDeviceAddress(addr->addr, addr->size, device_format, device_type, ShapeVector());
string input_tensor_name = input_kernel_name + ':' + "0";
ShapeVector int_shapes = trans::GetRuntimePaddingShape(input_kernel, PARAMETER_OUTPUT_INDEX);
auto ret = device_addr->LoadMemToHost(input_tensor_name, UintToInt(exec_order), format, int_shapes, type, 0, true,
root_graph_id, false);
ShapeVector int_shapes;
GetDumpIntShape(input_kernel, PARAMETER_OUTPUT_INDEX, NOT_NULL(&int_shapes), trans_flag);
auto ret = device_addr->LoadMemToHost(input_tensor_name, UintToInt(exec_order), host_format, int_shapes, type, 0,
true, root_graph_id, false, trans_flag);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
<< ", tensor_name:" << input_tensor_name << ", host_format:" << host_format
<< ", device_format:" << device_format << ".";
}
}
}
/*
* Feature group: Dump, Online debugger.
* Target device group: GPU.
* Target device group: GPU, Ascend.
* Runtime category: MindRT.
* Description: Get kernel outputs from launch_info and load the outputs from device to host.
*/
void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order,
uint32_t root_graph_id, const DeviceContext *device_context) {
uint32_t root_graph_id, const DeviceContext *device_context, const bool trans_flag) {
// get outputs
auto kernel_outputs = launch_info->outputs_;
auto output_size = common::AnfAlgo::GetOutputTensorNum(cnode);
@ -115,21 +122,27 @@ void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uin
for (size_t j : real_outputs) {
auto addr = kernel_outputs[j];
auto type = common::AnfAlgo::GetOutputInferDataType(cnode, j);
auto device_type = AnfAlgo::GetOutputDeviceDataType(cnode, j);
auto host_type = common::AnfAlgo::GetOutputInferDataType(cnode, j);
auto type = trans_flag ? host_type : device_type;
// For example, this happens with the Depend op
if (type == kMetaTypeNone) {
continue;
}
auto format = kOpFormat_DEFAULT;
auto device_addr = device_context->CreateDeviceAddress(addr->addr, addr->size, format, type, ShapeVector());
auto host_format = kOpFormat_DEFAULT;
auto device_format = E2eDump::IsDeviceTargetGPU() ? kOpFormat_DEFAULT : AnfAlgo::GetOutputFormat(cnode, j);
auto device_addr =
device_context->CreateDeviceAddress(addr->addr, addr->size, device_format, device_type, ShapeVector());
string tensor_name = kernel_name + ':' + std::to_string(j);
ShapeVector int_shapes = trans::GetRuntimePaddingShape(cnode, j);
auto ret = device_addr->LoadMemToHost(tensor_name, UintToInt(exec_order), format, int_shapes, type, j, false,
root_graph_id, false);
ShapeVector int_shapes;
GetDumpIntShape(cnode, j, NOT_NULL(&int_shapes), trans_flag);
auto ret = device_addr->LoadMemToHost(tensor_name, UintToInt(exec_order), host_format, int_shapes, type, j, false,
root_graph_id, false, trans_flag);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
<< ", tensor_name:" << tensor_name << ", host_format:" << host_format
<< ", device_format:" << device_format << ".!";
}
}
}
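The trans_flag plumbing above boils down to two ternaries. A self-contained toy, with illustrative stand-in values in place of the real AnfAlgo lookups and dump config:

#include <iostream>
#include <string>

int main() {
  const bool trans_flag = false;                    // e.g. DumpJsonParser trans_flag
  const std::string device_format = "FRACTAL_NZ";   // stand-in for AnfAlgo::GetOutputFormat
  const std::string host_format = "DefaultFormat";  // stand-in for kOpFormat_DEFAULT
  const int device_type = 1;                        // stand-in for GetOutputDeviceDataType
  const int host_type = 2;                          // stand-in for GetOutputInferDataType

  // Mirrors `auto type = trans_flag ? host_type : device_type;` above.
  const int type = trans_flag ? host_type : device_type;
  // Mirrors `tensor_format = trans_flag ? host_fmt : format_;` in LoadMemToHost.
  const std::string format = trans_flag ? host_format : device_format;
  std::cout << "load with type=" << type << ", dump suffix=" << format << std::endl;
  return 0;
}

Keeping the device type and format when trans_flag is false is what lets the raw device bytes be dumped without a host-side layout conversion.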
@ -168,6 +181,13 @@ bool IsDeviceTargetGPU() {
return context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice;
}
bool GetTransFlag() {
if (Debugger::GetInstance()->debugger_enabled() || IsDeviceTargetGPU()) {
return true;
}
return DumpJsonParser::GetInstance().trans_flag();
}
/*
* Feature group: Dump, Online debugger.
* Target device group: Ascend, GPU.
@ -187,11 +207,12 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info,
auto kernel_graph = std::dynamic_pointer_cast<KernelGraph>(cnode->func_graph());
MS_EXCEPTION_IF_NULL(kernel_graph);
auto root_graph_id = kernel_graph->root_graph_id();
bool trans_flag = GetTransFlag();
if (debugger->debugger_enabled() || dump_json_parser.InputNeedDump()) {
LoadInputs(cnode, launch_info, exec_order, root_graph_id, device_context);
LoadInputs(cnode, launch_info, exec_order, root_graph_id, device_context, trans_flag);
}
if (debugger->debugger_enabled() || dump_json_parser.OutputNeedDump()) {
LoadOutputs(cnode, launch_info, exec_order, root_graph_id, device_context);
LoadOutputs(cnode, launch_info, exec_order, root_graph_id, device_context, trans_flag);
}
// Dump kernel
if (dump_enabled) {
@ -202,7 +223,7 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info,
debugger->DumpSingleNode(cnode, graph_id);
} else {
// for Ascend, nodes are dumped in the root_graph_id directory.
debugger->DumpSingleNode(cnode, root_graph_id, launch_info);
debugger->DumpSingleNode(cnode, root_graph_id);
}
// Clear Dumped data when online debugger is not enabled
if (!debugger->debugger_enabled()) {

View File

@ -33,10 +33,10 @@ namespace mindspore {
std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &output_size);
void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order, uint32_t root_graph_id,
const DeviceContext *device_context);
const DeviceContext *device_context, const bool trans_flag);
void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order,
uint32_t root_graph_id, const DeviceContext *device_context);
uint32_t root_graph_id, const DeviceContext *device_context, const bool trans_flag);
bool CheckReadData(const CNodePtr &cnode);

View File

@ -17,6 +17,7 @@
#define MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_
#include <algorithm>
#include <map>
#include <vector>
#include <string>
#include <iostream>
@ -203,6 +204,10 @@ class TensorData {
#ifdef ONLINE_DBG_MODE
void SetTensor(const mindspore::tensor::TensorPtr &out_tensor) { this->tensor_ptr_ = out_tensor; }
void SetFormat(const std::string &format) { this->format_ = format; }
std::string GetFormat() { return this->format_; }
#endif
void SetSlot(size_t slot) { this->slot_ = slot; }
@ -239,6 +244,19 @@ class TensorData {
DbgDataType GetType() const { return this->data_type_; }
std::string GetTypeString() const {
const std::map<DbgDataType, std::string> kDbgDataTypeToStringMap = {
{DT_BOOL, "bool"}, {DT_INT8, "int8"}, {DT_INT16, "int16"}, {DT_INT32, "int32"},
{DT_INT64, "int64"}, {DT_UINT8, "uint8"}, {DT_UINT16, "uint16"}, {DT_UINT32, "uint32"},
{DT_UINT64, "uint64"}, {DT_FLOAT16, "float16"}, {DT_FLOAT32, "float32"}, {DT_FLOAT64, "float64"}};
auto iter_type = kDbgDataTypeToStringMap.find(data_type_);
if (iter_type == kDbgDataTypeToStringMap.end()) {
return std::string();
} else {
return iter_type->second;
}
}
void SetType(unsigned int type) { ConvertMsToDbgType(type); }
void SetType(const std::string &type_name) { ConvertStringToDbgType(type_name); }
@ -438,6 +456,7 @@ class TensorData {
std::string time_stamp_;
#ifdef ONLINE_DBG_MODE
std::string format_{""};
mindspore::tensor::TensorPtr tensor_ptr_{nullptr};
#endif
};

View File

@ -244,29 +244,20 @@ class TensorLoader {
* Runtime category: Old runtime, MindRT.
* Description: Load tensor data from debugger backend cache (tensor_list_map_) and dump to file in npy format.
*/
bool DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt,
const std::string &addr_format, const std::string &tensor_name, size_t slot,
const std::vector<int64_t> &host_shape, TypeId host_type) {
bool DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) {
if (filepath.empty()) {
MS_LOG(ERROR) << "Dump file path is null!";
return false;
}
std::string path = "";
if (trans_flag) {
path = filepath + '.' + host_fmt;
} else {
path = filepath + '.' + addr_format;
}
MS_LOG(INFO) << "Dump path is " << path;
std::string tensor_loader_name = tensor_name + ":" + std::to_string(slot);
auto iter = tensor_list_map_.find(tensor_loader_name);
if (iter != tensor_list_map_.end()) {
std::shared_ptr<TensorData> node = iter->second;
size_t host_size = node->GetByteSize();
std::string path = filepath + '.' + node->GetFormat();
return DumpJsonParser::DumpToFile(path, node->GetDataPtr(), host_size, host_shape, host_type);
return DumpJsonParser::DumpToFile(path, node->GetDataPtr(), node->GetByteSize(), node->GetShape(),
StringToTypeId(node->GetTypeString()));
}
MS_LOG(INFO) << "Tensor name:" << tensor_name << " not found in tensor_list_map_";
return false;
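A self-contained sketch of the "<kernel name>:<slot>" keyed lookup the loader uses above; the toy map stands in for tensor_list_map_, which really holds std::shared_ptr<TensorData>, and the kernel name is hypothetical:

#include <cstddef>
#include <iostream>
#include <map>
#include <string>

int main() {
  // Toy stand-in for tensor_list_map_: key is "<kernel name>:<slot>".
  std::map<std::string, std::string> tensor_list_map = {
      {"Default/Conv2D-op1:0", "cached tensor bytes"}};  // hypothetical kernel name

  const std::string tensor_name = "Default/Conv2D-op1";
  const std::size_t slot = 0;
  const std::string key = tensor_name + ":" + std::to_string(slot);
  auto iter = tensor_list_map.find(key);
  if (iter != tensor_list_map.end()) {
    std::cout << "found " << key << std::endl;
  } else {
    std::cout << key << " not found in tensor_list_map" << std::endl;
  }
  return 0;
}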

View File

@ -647,9 +647,10 @@ bool AscendDeviceAddress::DumpMemToFile(const std::string &filepath, const std::
* Runtime category: Old runtime, MindRT.
* Description: Load tensor to host and create tensor_data object for the loaded tensor.
*/
bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev,
uint32_t root_graph_id, bool force_update) const {
bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order,
const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type,
size_t slot, bool keep_prev, uint32_t root_graph_id, bool force_update,
bool trans_flag) const {
bool ret = false;
auto debugger = Debugger::GetInstance();
MS_EXCEPTION_IF_NULL(debugger);
@ -671,9 +672,14 @@ bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int exec
mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(host_type, host_shape);
MS_EXCEPTION_IF_NULL(out_tensor);
size_t host_size = out_tensor->data().nbytes();
auto ret_sync = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c());
bool ret_sync = false;
if (trans_flag) {
ret_sync = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c());
} else {
ret_sync = SyncDeviceToHost(host_size, out_tensor->data_c());
}
if (!ret_sync) {
MS_LOG(ERROR) << "Copy device mem to host failed";
MS_LOG(ERROR) << "Convert format or Copy device mem to host failed";
return ret;
}
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
@ -683,7 +689,11 @@ bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int exec
tensor_data->SetType((unsigned int)host_type);
tensor_data->SetShape(out_tensor->shape());
tensor_data->SetRootGraphId(root_graph_id);
std::string tensor_format = trans_flag ? host_fmt : format_;
tensor_data->SetFormat(tensor_format);
ret = debugger->LoadNewTensor(tensor_data, keep_prev);
MS_LOG(INFO) << "Load tensor '" << tensor_name << "' into debugger tensor loader successfully: format("
<< tensor_format << ")";
return ret;
}
#endif
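A self-contained toy of the trans_flag dispatch above. The two SyncDeviceToHost overloads here are memcpy stand-ins; the real Ascend overloads convert the device layout to host layout or copy raw device bytes, respectively:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

using ShapeVector = std::vector<int64_t>;

struct ToyDeviceAddress {
  std::vector<char> device_bytes;
  // Stand-in for the shape/type-aware overload: the real Ascend version also
  // converts the device layout (e.g. FRACTAL_NZ) to host layout.
  bool SyncDeviceToHost(const ShapeVector &, std::size_t size, int, void *host_ptr) const {
    std::memcpy(host_ptr, device_bytes.data(), size);
    return true;
  }
  // Stand-in for the size-only overload: raw byte copy, device format kept.
  bool SyncDeviceToHost(std::size_t size, void *host_ptr) const {
    std::memcpy(host_ptr, device_bytes.data(), size);
    return true;
  }
};

bool CopyForDump(const ToyDeviceAddress &addr, const ShapeVector &shape,
                 std::size_t size, int type, void *host_ptr, bool trans_flag) {
  return trans_flag ? addr.SyncDeviceToHost(shape, size, type, host_ptr)
                    : addr.SyncDeviceToHost(size, host_ptr);
}

int main() {
  ToyDeviceAddress addr{std::vector<char>(16, 0)};
  std::vector<char> host(16);
  return CopyForDump(addr, ShapeVector{4, 4}, host.size(), 0, host.data(), false) ? 0 : 1;
}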

View File

@ -62,7 +62,7 @@ class AscendDeviceAddress : public DeviceAddress {
#ifdef ENABLE_DEBUGGER
bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev,
uint32_t root_graph_id, bool force_update) const override;
uint32_t root_graph_id, bool force_update, bool trans_flag) const override;
#endif
private:

View File

@ -185,7 +185,7 @@ GPUDeviceAddress::~GPUDeviceAddress() { ClearDeviceMemory(); }
#ifdef ENABLE_DEBUGGER
bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev,
uint32_t root_graph_id, bool force_update) const {
uint32_t root_graph_id, bool force_update, bool) const {
bool ret = false;
if (size_ == 0) {
return true;
@ -219,6 +219,7 @@ bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int executi
tensor_data->SetType((unsigned int)host_type);
tensor_data->SetShape(out_tensor->shape());
tensor_data->SetRootGraphId(root_graph_id);
tensor_data->SetFormat(host_fmt);
ret = Debugger::GetInstance()->LoadNewTensor(tensor_data, keep_prev);
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
return ret;

View File

@ -56,7 +56,7 @@ class GPUDeviceAddress : public DeviceAddress {
#ifdef ENABLE_DEBUGGER
bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev,
uint32_t root_graph_id, bool force_update) const override;
uint32_t root_graph_id, bool force_update, bool trans_flag) const override;
#endif
private:

View File

@ -183,7 +183,8 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
string input_tensor_name = input_kernel_name + ':' + "0";
ShapeVector int_shapes = trans::GetRuntimePaddingShape(input_kernel, PARAMETER_OUTPUT_INDEX);
auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, true, 0, false);
auto ret =
gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, true, 0, false, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
@ -210,7 +211,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
string tensor_name = kernel_name + ':' + std::to_string(j);
ShapeVector int_shapes = trans::GetRuntimePaddingShape(kernel, j);
auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, 0, false);
auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, 0, false, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";

View File

@ -141,7 +141,7 @@ class DeviceAddress : public mindspore::DeviceSync {
#ifdef ENABLE_DEBUGGER
virtual bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev,
uint32_t root_graph_id, bool force_update) const {
uint32_t root_graph_id, bool force_update, bool trans_flag) const {
return true;
}
#endif