Support CANN callback API for Ascend async dump

This commit is contained in:
TinaMengtingZhang 2021-11-12 17:57:36 -05:00
parent 3fc995a6ae
commit 07b653103e
16 changed files with 662 additions and 14 deletions

@ -1 +1 @@
Subproject commit fd9e9a96f97960ba46c21352b0df7719d3a0a3f7
Subproject commit 8f7df5fd1f7a70233e2aeaa6155dcd76b93e0b11

View File

@ -168,8 +168,11 @@ if(ENABLE_DEBUGGER)
ms_protobuf_generate(DEBUGGER_PROTO_SRCS DEBUGGER_PROTO_HDRS ${DEBUGGER_PROTO_LIST})
file(GLOB_RECURSE DEBUGGER_GRPC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "debug/debugger/debug_grpc.proto")
ms_grpc_generate(DEBUGGER_GRPC_SRCS DEBUGGER_GRPC_HDRS ${DEBUGGER_GRPC_LIST})
file(GLOB_RECURSE DUMP_DATA_PROTO_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "debug/debugger/dump_data.proto")
ms_protobuf_generate(DUMP_DATA_PROTO_SRCS DUMP_DATA_PROTO_HDRS ${DUMP_DATA_PROTO_LIST})
list(APPEND MINDSPORE_PROTO_LIST ${DEBUGGER_PROTO_SRCS})
list(APPEND MINDSPORE_PROTO_LIST ${DEBUGGER_GRPC_SRCS})
list(APPEND MINDSPORE_PROTO_LIST ${DUMP_DATA_PROTO_SRCS})
endif()
if(ENABLE_DUMP_PROTO)

View File

@ -42,6 +42,7 @@ constexpr auto kTransFlag = "trans_flag";
constexpr auto kStatisticDump = "statistic";
constexpr auto kTensorDump = "tensor";
constexpr auto kFullDump = "full";
constexpr auto kFileFormat = "file_format";
constexpr auto kDumpInputAndOutput = 0;
constexpr auto kDumpInputOnly = 1;
constexpr auto kDumpOutputOnly = 2;
@ -274,6 +275,8 @@ void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
ParseSupportDevice(*support_device);
if (!e2e_dump_enabled_) {
ParseOpDebugMode(*op_debug_mode);
ParseFileFormat(
*common_dump_settings); // Pass in the whole json string to parse because file_format field is optional.
}
}
@ -505,6 +508,23 @@ void DumpJsonParser::ParseOpDebugMode(const nlohmann::json &content) {
}
}
void DumpJsonParser::ParseFileFormat(const nlohmann::json &content) {
  // 'file_format' is an optional field: when absent, keep the legacy protobuf output.
  auto fmt_iter = content.find(kFileFormat);
  if (fmt_iter == content.end()) {
    file_format_ = JsonFileFormat::FORMAT_BIN;
    return;
  }
  CheckJsonStringType(*fmt_iter, kFileFormat);
  const std::string fmt_value = *fmt_iter;
  if (fmt_value == "bin") {
    file_format_ = JsonFileFormat::FORMAT_BIN;
  } else if (fmt_value == "npy") {
    file_format_ = JsonFileFormat::FORMAT_NPY;
  } else {
    // Any other value is a user configuration error.
    MS_LOG(EXCEPTION) << "Dump Json Parse Failed. 'file_format' should be either 'npy' or 'bin', but got: "
                      << fmt_value;
  }
}
void DumpJsonParser::JsonConfigToString() {
std::string cur_config;
cur_config.append("dump_mode:");

View File

@ -59,6 +59,7 @@ class DumpJsonParser {
bool trans_flag() const { return trans_flag_; }
uint32_t cur_dump_iter() const { return cur_dump_iter_; }
void UpdateDumpIter() { ++cur_dump_iter_; }
bool FileFormatIsNpy() const { return file_format_ == JsonFileFormat::FORMAT_NPY; }
bool GetIterDumpFlag() const;
bool InputNeedDump() const;
bool OutputNeedDump() const;
@ -70,6 +71,7 @@ class DumpJsonParser {
void SaveGraph(session::KernelGraph *graph) { (void)graphs_.emplace_back(graph); }
const std::vector<session::KernelGraph *> &graphs() const { return graphs_; }
enum JsonDumpMode { DUMP_ALL = 0, DUMP_KERNEL = 1, DUMP_KERNELS_WITH_FLAG = 2 };
enum JsonFileFormat { FORMAT_NPY = 0, FORMAT_BIN = 1 };
private:
DumpJsonParser() = default;
@ -89,6 +91,7 @@ class DumpJsonParser {
std::vector<std::string> cell_dump_kernels_;
std::set<uint32_t> support_devices_;
uint32_t op_debug_mode_{0};
JsonFileFormat file_format_;
bool trans_flag_{false};
uint32_t cur_dump_iter_{0};
bool already_parsed_{false};
@ -112,6 +115,7 @@ class DumpJsonParser {
void ParseSupportDevice(const nlohmann::json &content);
bool ParseEnable(const nlohmann::json &content);
void ParseOpDebugMode(const nlohmann::json &content);
void ParseFileFormat(const nlohmann::json &content);
void JudgeDumpEnabled();
void JsonConfigToString();

View File

@ -24,6 +24,8 @@
#include "debug/data_dump/dump_json_parser.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "utils/utils.h"
#include "debug/common.h"
namespace mindspore {
uint32_t ConvertPhysicalDeviceId(uint32_t device_id) {
@ -137,8 +139,7 @@ uint64_t GetTimeStamp() {
return timestamp;
}
std::string GetOpNameWithoutScope(const std::string &fullname_with_scope) {
const std::string separator("--");
std::string GetOpNameWithoutScope(const std::string &fullname_with_scope, const std::string &separator) {
std::size_t found = fullname_with_scope.rfind(separator);
std::string op_name;
if (found != std::string::npos) {
@ -146,4 +147,30 @@ std::string GetOpNameWithoutScope(const std::string &fullname_with_scope) {
}
return op_name;
}
// Writes dump_str into the file at file_name. The target path is created via
// Common::CreatePrefixPath, made writable for the duration of the write, and left
// read-only (S_IRUSR) afterwards so the dump cannot be modified accidentally.
// Logs an error and returns on empty input or path failure; throws (MS_LOG(EXCEPTION))
// when the file cannot be opened or the write fails.
void DumpToFile(const std::string &file_name, const std::string &dump_str) {
  if (dump_str.empty()) {
    MS_LOG(ERROR) << "Failed to dump empty tensor data.";
    return;
  }
  auto real_path = Common::CreatePrefixPath(file_name);
  if (!real_path.has_value()) {
    MS_LOG(ERROR) << "CreatePrefixPath failed.";
    return;
  }
  const std::string &real_path_str = real_path.value();
  // Temporarily grant write permission; restored to read-only below.
  ChangeFileMode(real_path_str, S_IWUSR);
  std::ofstream file(real_path_str, std::ofstream::out | std::ofstream::trunc);
  if (!file.is_open()) {
    // Fix: the original message concatenated the path and "failed" without a space.
    MS_LOG(EXCEPTION) << "Open file " << real_path_str << " failed: " << ErrnoToString(errno);
  }
  file << dump_str;
  if (file.bad()) {
    file.close();
    MS_LOG(EXCEPTION) << "Dump string to file " << real_path_str << " failed: " << ErrnoToString(errno);
  }
  file.close();
  ChangeFileMode(real_path_str, S_IRUSR);
}
} // namespace mindspore

View File

@ -39,7 +39,10 @@ void DumpMemToFile(const std::string &file_path, const device::DeviceAddress &ad
const TypeId &type, bool trans_flag = false);
// Get time stamp since epoch in microseconds
uint64_t GetTimeStamp();
std::string GetOpNameWithoutScope(const std::string &fullname_with_scope);
std::string GetOpNameWithoutScope(const std::string &fullname_with_scope, const std::string &separator = "--");
// dump target string into file
void DumpToFile(const std::string &file_name, const std::string &dump_str);
} // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_DUMP_UTILS_H_

View File

@ -19,6 +19,8 @@
#include <unistd.h>
#include <algorithm>
#include <cstring>
#include <map>
#include <set>
#include <utility>
#include <vector>
#include "debug/data_dump/dump_json_parser.h"
#include "common/trans.h"
@ -30,6 +32,7 @@
#include "utils/config_manager.h"
#include "utils/file_utils.h"
#include "debug/data_dump/tensor_stat_dump.h"
#include "abstract/utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h"
#include "debug/tensor_load.h"
@ -37,6 +40,60 @@
#endif
namespace mindspore {
#ifdef ENABLE_D
using ProtoFormat = debugger::dump::OutputFormat;
using ProtoDataType = debugger::dump::OutputDataType;
constexpr int kDhaAtomicAddInfoSize = 128;
constexpr int kL2AtomicAddInfoSize = 128;
constexpr int kAiCoreInfoSize = 256;
constexpr int kDhaAtomicAddStatusSize = 256;
constexpr int kL2AtomicAddStatusSize = 256;
constexpr int kUint64Size = sizeof(uint64_t);
const std::set<std::pair<std::string, std::string>> kSuppTransFormatPair = {
// {device format, host format}
{kOpFormat_FRAC_Z, kOpFormat_NCHW}, {kOpFormat_FRAC_NZ, kOpFormat_NCHW},
{kOpFormat_NC1HWC0, kOpFormat_NCHW}, {kOpFormat_C1HWNCoC0, kOpFormat_NCHW},
{kOpFormat_NC1HWC0_C04, kOpFormat_NCHW}, {kOpFormat_NDC1HWC0, kOpFormat_NCHW},
{kOpFormat_FRACTAL_Z_3D, kOpFormat_NCHW}};
const std::map<ProtoFormat, std::string> kFormatToStringMap = {
{ProtoFormat::FORMAT_NCHW, kOpFormat_NCHW},
{ProtoFormat::FORMAT_NHWC, kOpFormat_NHWC},
{ProtoFormat::FORMAT_ND, kOpFormat_ND},
{ProtoFormat::FORMAT_NC1HWC0, kOpFormat_NC1HWC0},
{ProtoFormat::FORMAT_FRACTAL_Z, kOpFormat_FRAC_Z},
{ProtoFormat::FORMAT_NC1HWC0_C04, kOpFormat_NC1HWC0_C04},
{ProtoFormat::FORMAT_FRACTAL_Z_C04, kOpFormat_FRACTAL_Z_C04},
{ProtoFormat::FORMAT_NC1KHKWHWC0, kOpFormat_NC1KHKWHWC0},
{ProtoFormat::FORMAT_HWCN, kOpFormat_HWCN},
{ProtoFormat::FORMAT_NDHWC, kOpFormat_NDHWC},
{ProtoFormat::FORMAT_NCDHW, kOpFormat_NCDHW},
{ProtoFormat::FORMAT_DHWCN, kOpFormat_DHWCN},
{ProtoFormat::FORMAT_DHWNC, kOpFormat_DHWNC},
{ProtoFormat::FORMAT_NDC1HWC0, kOpFormat_NDC1HWC0},
{ProtoFormat::FORMAT_FRACTAL_Z_3D, kOpFormat_FRACTAL_Z_3D},
{ProtoFormat::FORMAT_C1HWNCoC0, kOpFormat_C1HWNCoC0},
{ProtoFormat::FORMAT_FRACTAL_NZ, kOpFormat_FRAC_NZ},
{ProtoFormat::FORMAT_FRACTAL_ZN_LSTM, kOpFormat_FRACTAL_ZN_LSTM}};
const std::map<ProtoDataType, mindspore::TypeId> kDataTypetoMSTypeMap = {
{ProtoDataType::DT_UNDEFINED, mindspore::TypeId::kTypeUnknown},
{ProtoDataType::DT_FLOAT, mindspore::TypeId::kNumberTypeFloat32},
{ProtoDataType::DT_FLOAT16, mindspore::TypeId::kNumberTypeFloat16},
{ProtoDataType::DT_INT8, mindspore::TypeId::kNumberTypeInt8},
{ProtoDataType::DT_UINT8, mindspore::TypeId::kNumberTypeUInt8},
{ProtoDataType::DT_INT16, mindspore::TypeId::kNumberTypeInt16},
{ProtoDataType::DT_UINT16, mindspore::TypeId::kNumberTypeUInt16},
{ProtoDataType::DT_INT32, mindspore::TypeId::kNumberTypeInt32},
{ProtoDataType::DT_INT64, mindspore::TypeId::kNumberTypeInt64},
{ProtoDataType::DT_UINT32, mindspore::TypeId::kNumberTypeUInt32},
{ProtoDataType::DT_UINT64, mindspore::TypeId::kNumberTypeUInt64},
{ProtoDataType::DT_BOOL, mindspore::TypeId::kNumberTypeBool},
{ProtoDataType::DT_DOUBLE, mindspore::TypeId::kNumberTypeFloat64},
{ProtoDataType::DT_STRING, mindspore::TypeId::kObjectTypeString}};
#endif
bool E2eDump::IsDeviceTargetGPU() {
auto context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context);
@ -443,4 +500,178 @@ bool E2eDump::DumpDirExists(const std::string &dump_path) {
}
return false;
}
#ifdef ENABLE_D
// Dumps all input and output tensors of one kernel received from the ADX callback.
// Tensor payloads are laid out back to back in data_ptr: all inputs first, then all
// outputs, each occupying tensor.size() bytes; offset walks that layout.
// Fix: the original copied every proto tensor twice (into a std::vector, then again
// per element); iterate the repeated fields by const reference instead.
void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dump::DumpData &dump_data,
                               char *data_ptr) {
  uint64_t offset = 0;
  // dump input tensors
  const std::string in_path = dump_path + ".input.";
  for (int slot = 0; slot < dump_data.input_size(); slot++) {
    const auto &in_tensor = dump_data.input(slot);
    std::string in_slot_path = in_path + std::to_string(slot) + ".";
    auto succ = ConvertFormatForTensorAndDump(in_slot_path, in_tensor, data_ptr + offset);
    if (!succ) {
      MS_LOG(INFO) << "Failed to convert format for tensor " << in_slot_path;
    }
    offset += in_tensor.size();
  }
  // dump output tensors
  const std::string out_path = dump_path + ".output.";
  for (int slot = 0; slot < dump_data.output_size(); slot++) {
    const auto &out_tensor = dump_data.output(slot);
    std::string out_slot_path = out_path + std::to_string(slot) + ".";
    auto succ = ConvertFormatForTensorAndDump(out_slot_path, out_tensor, data_ptr + offset);
    if (!succ) {
      MS_LOG(INFO) << "Failed to convert format for tensor " << out_slot_path;
    }
    offset += out_tensor.size();
  }
}
// Converts one tensor (OpInput or OpOutput proto) from its device layout to the host
// layout (NCHW for 4-D tensors, ND otherwise) and writes it as an npy file at
// dump_path + <format name>. Falls back to dumping the raw device-format buffer when
// no conversion applies or the conversion fails. Returns true on a successful dump.
template <typename T>
bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr) {
  // Resolve the device format name.
  auto iter_fmt = kFormatToStringMap.find(tensor.format());
  if (iter_fmt == kFormatToStringMap.end()) {
    // Fix: log the raw enum value; the original dereferenced the end iterator here (UB).
    MS_LOG(INFO) << "Unsupported tensor format " << tensor.format() << " for tensor " << dump_path;
    return false;
  }
  std::string device_format = iter_fmt->second;
  // Resolve the MindSpore type id.
  auto iter_dtype = kDataTypetoMSTypeMap.find(tensor.data_type());
  if (iter_dtype == kDataTypetoMSTypeMap.end()) {
    // Fix: same end-iterator dereference as above; log the proto value instead.
    MS_LOG(INFO) << "Unsupported tensor type " << tensor.data_type() << " for tensor " << dump_path;
    return false;
  }
  auto src_type = iter_dtype->second;
  // Collect device shape and the original (host) shape from the proto.
  std::vector<size_t> device_shape;
  (void)std::copy(tensor.shape().dim().begin(), tensor.shape().dim().end(), std::back_inserter(device_shape));
  std::vector<size_t> host_shape;
  (void)std::copy(tensor.original_shape().dim().begin(), tensor.original_shape().dim().end(),
                  std::back_inserter(host_shape));
  ShapeVector shape_to;
  (void)std::transform(host_shape.begin(), host_shape.end(), std::back_inserter(shape_to), SizeToLong);
  size_t data_size = static_cast<size_t>(tensor.size());
  bool trans_success = false;
  auto trans_buf = std::vector<uint8_t>(data_size);
  // Convert format to host format: NCHW for 4-dimensional tensors, ND otherwise.
  const uint8_t kNumFourDim = 4;
  std::string host_format;
  if (host_shape.size() == kNumFourDim) {
    host_format = kOpFormat_NCHW;
  } else {
    host_format = kOpFormat_ND;
  }
  if (device_format != host_format) {
    auto iter = kSuppTransFormatPair.find(std::make_pair(device_format, host_format));
    if (iter == kSuppTransFormatPair.end()) {
      MS_LOG(INFO) << "Do not support convert from format " << device_format << " to " << host_format << " for tensor "
                   << dump_path;
    } else {
      const trans::FormatArgs format_args{data_ptr, data_size, host_format, device_format,
                                          host_shape, device_shape, src_type};
      // sub_format > 1 carries the group count for grouped fractal formats.
      auto group = tensor.sub_format() > 1 ? tensor.sub_format() : 1;
      trans_success = trans::TransFormatFromDeviceToHost(format_args, trans_buf.data(), group);
      if (!trans_success) {
        MS_LOG(ERROR) << "Trans format failed.";
      }
    }
  }
  // Dump tensor data into an npy file, converted when possible, raw otherwise.
  bool dump_success = false;
  if (trans_success) {
    dump_path += host_format;
    dump_success = DumpJsonParser::DumpToFile(dump_path, trans_buf.data(), data_size, shape_to, src_type);
  } else {
    dump_path += device_format;
    dump_success = DumpJsonParser::DumpToFile(dump_path, data_ptr, data_size, shape_to, src_type);
  }
  return dump_success;
}
// Reads one little-endian uint64 from an arbitrarily aligned buffer and converts it
// to host byte order.
uint64_t UnpackUint64Value(char *ptr) {
  uint64_t value = 0;
  // memcpy avoids the unaligned access / strict-aliasing issues of casting char* to uint64_t*.
  (void)std::memcpy(&value, ptr, sizeof(uint64_t));
#if defined(__APPLE__)
  // macOS targets here are little-endian; no conversion needed (matches original behavior).
  return value;
#else
  // Fix: the original called le16toh, which converts the argument to uint16_t and
  // therefore truncated the value to its low 16 bits; le64toh is the 64-bit variant.
  return le64toh(value);
#endif
}
// Renders value as a lowercase hexadecimal string with a "0x" prefix (no padding).
std::string IntToHexString(const uint64_t value) {
  std::ostringstream hex_stream;
  hex_stream << "0x" << std::hex << value;
  return hex_stream.str();
}
// Decodes one overflow-info record from the ADX op-debug buffer into json. The record
// is six consecutive little-endian uint64 fields, each kUint64Size bytes, in this
// order: model_id, stream_id, task_id, task_type, pc_start, para_base.
// NOTE(review): the field order/layout is assumed to match the Ascend op-debug record
// format - confirm against the CANN/ADX documentation if it ever changes.
nlohmann::json E2eDump::ParseOverflowInfo(char *data_ptr) {
  uint32_t index = 0;
  uint64_t model_id = UnpackUint64Value(data_ptr + index);
  index += kUint64Size;
  uint64_t stream_id = UnpackUint64Value(data_ptr + index);
  index += kUint64Size;
  uint64_t task_id = UnpackUint64Value(data_ptr + index);
  index += kUint64Size;
  uint64_t task_type = UnpackUint64Value(data_ptr + index);
  index += kUint64Size;
  uint64_t pc_start = UnpackUint64Value(data_ptr + index);
  index += kUint64Size;
  uint64_t para_base = UnpackUint64Value(data_ptr + index);

  nlohmann::json overflow_info;
  overflow_info["model_id"] = model_id;
  overflow_info["stream_id"] = stream_id;
  overflow_info["task_id"] = task_id;
  overflow_info["task_type"] = task_type;
  // Addresses are rendered as hex strings for readability in the dumped json.
  overflow_info["pc_start"] = IntToHexString(pc_start);
  overflow_info["para_base"] = IntToHexString(para_base);
  return overflow_info;
}
// Parses the op-debug (overflow detection) buffer received via the ADX callback and
// saves each output slot as <dump_path>.output.<slot>.json. Per slot the buffer is
// read as: DHA-atomic-add info, L2-atomic-add info, AICore info (fixed-size blocks),
// followed by the DHA/L2 status blocks and three AICore status words.
void E2eDump::DumpOpDebugToFile(const std::string &dump_path, const debugger::dump::DumpData &dump_data,
                                char *data_ptr) {
  std::string out_path = dump_path + ".output.";
  std::vector<debugger::dump::OpOutput> op_debug(dump_data.output().begin(), dump_data.output().end());
  for (uint32_t slot = 0; slot < op_debug.size(); slot++) {
    // NOTE(review): index restarts at 0 and data_ptr is not advanced between slots, so
    // every slot re-reads the same record; presumably op-debug dumps only ever carry a
    // single output slot - confirm whether multi-slot op-debug data can occur.
    uint32_t index = 0;
    // parse DHA Atomic Add info
    nlohmann::json dha_atomic_add_info = ParseOverflowInfo(data_ptr + index);
    index += kDhaAtomicAddInfoSize;
    // parse L2 Atomic Add info
    nlohmann::json l2_atomic_add_info = ParseOverflowInfo(data_ptr + index);
    index += kL2AtomicAddInfoSize;
    // parse AICore info
    nlohmann::json ai_core_info = ParseOverflowInfo(data_ptr + index);
    index += kAiCoreInfoSize;
    // parse DHA Atomic Add status
    dha_atomic_add_info["status"] = UnpackUint64Value(data_ptr + index);
    index += kDhaAtomicAddStatusSize;
    // parse L2 Atomic Add status
    l2_atomic_add_info["status"] = UnpackUint64Value(data_ptr + index);
    index += kL2AtomicAddStatusSize;
    // parse AICore status (three consecutive uint64 words)
    uint64_t kernel_code = UnpackUint64Value(data_ptr + index);
    index += kUint64Size;
    uint64_t block_idx = UnpackUint64Value(data_ptr + index);
    index += kUint64Size;
    uint64_t status = UnpackUint64Value(data_ptr + index);
    ai_core_info["kernel_code"] = IntToHexString(kernel_code);
    ai_core_info["block_idx"] = block_idx;
    ai_core_info["status"] = status;

    nlohmann::json opdebug_data;
    opdebug_data["DHA Atomic Add"] = dha_atomic_add_info;
    opdebug_data["L2 Atomic Add"] = l2_atomic_add_info;
    opdebug_data["AI Core"] = ai_core_info;
    // save json to file
    DumpToFile(out_path + std::to_string(slot) + ".json", opdebug_data.dump());
  }
}
#endif // ENABLE_D
} // namespace mindspore

View File

@ -26,6 +26,9 @@
#include "runtime/device/device_address.h"
#include "debug/data_dump/dump_json_parser.h"
#include "debug/data_dump/dump_utils.h"
#ifdef ENABLE_D
#include "proto/dump_data.pb.h"
#endif
#ifndef ENABLE_DEBUGGER
class Debugger;
@ -59,6 +62,13 @@ class E2eDump {
static bool DumpDirExists(const std::string &dump_path);
#ifdef ENABLE_D
static void DumpTensorToFile(const std::string &dump_path, const debugger::dump::DumpData &dump_data, char *data_ptr);
static void DumpOpDebugToFile(const std::string &dump_path, const debugger::dump::DumpData &dump_data,
char *data_ptr);
#endif
private:
static void DumpOutput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger);
@ -81,6 +91,13 @@ class E2eDump {
static void UpdateIterDumpSetup(const session::KernelGraph *graph, bool sink_mode);
#ifdef ENABLE_D
static nlohmann::json ParseOverflowInfo(char *data_ptr);
template <typename T>
static bool ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr);
#endif
inline static unsigned int starting_graph_id = INT32_MAX;
};
} // namespace mindspore

View File

@ -1535,4 +1535,16 @@ bool Debugger::TensorExistsInCurrent(const std::string &tensor_name) {
return debug_services_->TensorExistsInCurrent(tensor_name);
}
#ifdef ENABLE_D
// Returns the DumpDataBuilder accumulating dump chunks for node_name, creating one on
// first use. Fix: the original performed up to three map lookups (find, then
// operator[] twice); a single find with emplace-on-miss does the same work once.
std::shared_ptr<DumpDataBuilder> Debugger::LoadDumpDataBuilder(const std::string &node_name) {
  auto iter = dump_data_construct_map_.find(node_name);
  if (iter == dump_data_construct_map_.end()) {
    iter = dump_data_construct_map_.emplace(node_name, std::make_shared<DumpDataBuilder>()).first;
  }
  return iter->second;
}
// Releases the per-node builder once its dump data has been written to disk.
void Debugger::ClearDumpDataBuilder(const std::string &node_name) { dump_data_construct_map_.erase(node_name); }
#endif
} // namespace mindspore

View File

@ -26,6 +26,9 @@
#include "debug/debugger/grpc_client.h"
#include "debug/debug_services.h"
#include "common/trans.h"
#ifdef ENABLE_D
#include "debug/dump_data_builder.h"
#endif
using debugger::Chunk;
using debugger::DataType;
@ -170,6 +173,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// check if dump using debugger backend is enabled
bool CheckDebuggerDumpEnabled() const;
#ifdef ENABLE_D
std::shared_ptr<DumpDataBuilder> LoadDumpDataBuilder(const std::string &node_name);
void ClearDumpDataBuilder(const std::string &node_name);
#endif
private:
// private constructor for singleton
Debugger();
@ -289,6 +298,11 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// The vector of graph pointers that have been run in the current step.
std::vector<KernelGraphPtr> graph_ptr_step_vec_;
#ifdef ENABLE_D
// to construct kernel data for async dump, key is the dump path to the node
std::map<std::string, std::shared_ptr<DumpDataBuilder>> dump_data_construct_map_;
#endif
// singleton
static std::mutex instance_lock_;
static std::shared_ptr<Debugger> debugger_;

View File

@ -23,8 +23,12 @@
#include "debug/debugger/debugger.h"
#include "runtime/device/gpu/gpu_device_address.h"
#include "debug/data_dump/dump_json_parser.h"
#ifdef ENABLE_D
#include "debug/dump_data_builder.h"
#endif
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/kernel_compiler/kernel.h"
#include "debug/data_dump/e2e_dump.h"
using mindspore::kernel::AddressPtr;
using mindspore::kernel::KernelLaunchInfo;
@ -33,8 +37,6 @@ using KernelGraph = mindspore::session::KernelGraph;
using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
namespace mindspore {
static const size_t PARAMETER_OUTPUT_INDEX = 0;
std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &output_size) {
// define a vector containing real output number
std::vector<size_t> real_outputs;
@ -162,4 +164,52 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_
bool last_kernel = !AnfAlgo::IsInplaceNode(cnode, "skip");
debugger->PostExecuteNode(cnode, last_kernel);
}
#ifdef ENABLE_D
// Callback registered with the ADX data-dump server. ADX delivers each kernel's dump
// as a sequence of DumpChunk pieces; chunks are buffered per file name and, once the
// last chunk arrives, reassembled, parsed, and written to disk. The 'size' parameter
// is supplied by ADX but unused here.
// NOTE(review): the function returns 0 on both success and unrecoverable parse errors
// but 1 when buffering a chunk fails - confirm against the ADX callback contract which
// return codes (if any) request a retry.
int32_t DumpDataCallBack(const DumpChunk *dump_chunk, int32_t size) {
  MS_LOG(DEBUG) << "ADX DumpDataCallBack is called";
  string file_name = dump_chunk->fileName;
  uint32_t isLastChunk = dump_chunk->isLastChunk;

  // parse chunk header
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  // One builder per dump file accumulates chunk payloads until the file is complete.
  auto dump_data_build = debugger->LoadDumpDataBuilder(file_name);
  if (dump_data_build == nullptr) {
    MS_LOG(ERROR) << "Failed to load dump data builder for node " << file_name;
    return 0;
  }
  if (!dump_data_build->CopyDumpChunk(dump_chunk)) {
    return 1;
  }

  if (isLastChunk == 1) {
    // construct dump data object
    debugger::dump::DumpData dump_data;
    std::vector<char> data_buf;
    if (!dump_data_build->ConstructDumpData(&dump_data, &data_buf)) {
      MS_LOG(ERROR) << "Failed to parse data for node " << file_name;
      return 0;
    }

    // convert and save to files
    // Split "<dir>/<base name>" at the last '/'.
    // NOTE(review): assumes file_name always contains '/'; rfind would return npos
    // otherwise and the substr calls below would misbehave - confirm ADX always
    // reports a path with a directory component.
    auto separator = file_name.rfind("/");
    auto path_name = file_name.substr(0, separator);
    auto file_base_name = file_name.substr(separator + 1);
    if (file_base_name.rfind("Opdebug.Node_OpDebug.") == 0) {
      // save overflow data
      E2eDump::DumpOpDebugToFile(file_name, dump_data, data_buf.data());
    } else {
      // save tensor data
      auto op_type = file_base_name.substr(0, file_base_name.find("."));
      auto file_base_name_no_scope = GetOpNameWithoutScope(file_base_name, "_");
      E2eDump::DumpTensorToFile(path_name + "/" + op_type + "." + file_base_name_no_scope, dump_data, data_buf.data());
    }
    // The builder for this file is no longer needed once its data is on disk.
    debugger->ClearDumpDataBuilder(file_name);
  }
  return 0;
}
#endif
} // namespace mindspore

View File

@ -19,7 +19,11 @@
#include <string>
#include "debug/debugger/debugger.h"
#include "backend/kernel_compiler/kernel.h"
#ifdef ENABLE_D
#include "toolchain/adx_datadump_callback.h"
using Adx::DumpChunk;
#endif
using mindspore::kernel::KernelLaunchInfo;
namespace mindspore {
@ -36,4 +40,8 @@ bool CheckReadData(const CNodePtr &cnode);
void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_);
#ifdef ENABLE_D
// Callback function to dump ascend async mode
int32_t DumpDataCallBack(const DumpChunk *dump_chunk, int32_t size);
#endif
} // namespace mindspore

View File

@ -0,0 +1,146 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
syntax = "proto3";
package debugger.dump;
enum OutputDataType {
DT_UNDEFINED = 0;
DT_FLOAT = 1;
DT_FLOAT16 = 2;
DT_INT8 = 3;
DT_UINT8 = 4;
DT_INT16 = 5;
DT_UINT16 = 6;
DT_INT32 = 7;
DT_INT64 = 8;
DT_UINT32 = 9;
DT_UINT64 = 10;
DT_BOOL = 11;
DT_DOUBLE = 12;
DT_STRING = 13;
DT_DUAL_SUB_INT8 = 14;
DT_DUAL_SUB_UINT8 = 15;
DT_COMPLEX64 = 16;
DT_COMPLEX128 = 17;
DT_QINT8 = 18;
DT_QINT16 = 19;
DT_QINT32 = 20;
DT_QUINT8 = 21;
DT_QUINT16 = 22;
DT_RESOURCE = 23;
DT_STRING_REF = 24;
DT_DUAL = 25;
}
enum OutputFormat {
FORMAT_NCHW = 0;
FORMAT_NHWC = 1;
FORMAT_ND = 2;
FORMAT_NC1HWC0 = 3;
FORMAT_FRACTAL_Z = 4;
FORMAT_NC1C0HWPAD = 5;
FORMAT_NHWC1C0 = 6;
FORMAT_FSR_NCHW = 7;
FORMAT_FRACTAL_DECONV = 8;
FORMAT_C1HWNC0 = 9;
FORMAT_FRACTAL_DECONV_TRANSPOSE = 10;
FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS = 11;
FORMAT_NC1HWC0_C04 = 12;
FORMAT_FRACTAL_Z_C04 = 13;
FORMAT_CHWN = 14;
FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS = 15;
FORMAT_HWCN = 16;
FORMAT_NC1KHKWHWC0 = 17;
FORMAT_BN_WEIGHT = 18;
FORMAT_FILTER_HWCK = 19;
FORMAT_HASHTABLE_LOOKUP_LOOKUPS = 20;
FORMAT_HASHTABLE_LOOKUP_KEYS = 21;
FORMAT_HASHTABLE_LOOKUP_VALUE = 22;
FORMAT_HASHTABLE_LOOKUP_OUTPUT = 23;
FORMAT_HASHTABLE_LOOKUP_HITS = 24;
FORMAT_C1HWNCoC0 = 25;
FORMAT_MD = 26;
FORMAT_NDHWC = 27;
FORMAT_FRACTAL_ZZ = 28;
FORMAT_FRACTAL_NZ = 29;
FORMAT_NCDHW = 30;
FORMAT_DHWCN = 31; // 3D filter input tensor format
FORMAT_NDC1HWC0 = 32;
FORMAT_FRACTAL_Z_3D=33;
FORMAT_CN = 34;
FORMAT_NC = 35;
FORMAT_DHWNC = 36;
FORMAT_FRACTAL_Z_3D_TRANSPOSE = 37; // 3D filter(transpose) input tensor format
FORMAT_FRACTAL_ZN_LSTM = 38;
FORMAT_FRACTAL_Z_G = 39;
FORMAT_RESERVED = 40;
// Add new formats definition here
FORMAT_MAX = 0xff;
}
message OriginalOp {
string name = 1;
uint32 output_index = 2;
OutputDataType data_type = 3;
OutputFormat format = 4;
}
message Shape {
repeated uint64 dim = 1;
}
message OpOutput {
OutputDataType data_type = 1;
OutputFormat format = 2;
Shape shape = 3;
OriginalOp original_op = 4; // the original op corresponding to the output
bytes data = 5;
uint64 size = 6;
Shape original_shape = 7;
int32 sub_format = 8;
}
message OpInput {
OutputDataType data_type = 1;
OutputFormat format = 2;
Shape shape = 3;
bytes data = 4;
uint64 size = 5;
Shape original_shape = 6;
int32 sub_format = 7;
}
enum BufferType {
L1 = 0;
}
message OpBuffer {
BufferType buffer_type = 1;
bytes data = 2;
uint64 size = 3;
}
message DumpData {
string version = 1;
uint64 dump_time = 2;
repeated OpOutput output = 3;
repeated OpInput input = 4;
repeated OpBuffer buffer = 5;
string op_name = 6;
}

View File

@ -0,0 +1,86 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_DEBUG_DUMP_DATA_BUILDER_H_
#define MINDSPORE_CCSRC_DEBUG_DUMP_DATA_BUILDER_H_
#include <vector>
#include <string>
#include <iostream>
#include "utils/log_adapter.h"
#ifdef ENABLE_D
#include "proto/dump_data.pb.h"
#include "toolchain/adx_datadump_callback.h"
using Adx::DumpChunk;
#endif
// This class is for building dump data receiving from adx server. Tensor Data for each kernel will be divided in pieces
// and each piece would be wrapped into DumpChunk struct. This class provides function to merge dump chunks and
// construct dump data object.
// Accumulates DumpChunk pieces delivered by the adx server for one dump file and
// reassembles them into the protobuf header plus the raw tensor payload.
class DumpDataBuilder {
 public:
  DumpDataBuilder() {}
  ~DumpDataBuilder() = default;

#ifdef ENABLE_D
  // Buffers one chunk's payload. Returns false when the copy cannot be allocated.
  bool CopyDumpChunk(const DumpChunk *dump_chunk) {
    try {
      uint32_t buf_sz = dump_chunk->bufLen;
      std::string buffer_str(reinterpret_cast<const char *>(dump_chunk->dataBuf), buf_sz);
      chunk_list_.push_back(std::move(buffer_str));
      total_sz_ += buf_sz;
    } catch (std::bad_alloc &err) {
      MS_LOG(ERROR) << "Failed to allocate memory for " << dump_chunk->fileName << ", reason: " << err.what();
      return false;
    }
    return true;
  }

  // Merges all buffered chunks and splits the result into the DumpData proto and the
  // raw tensor bytes. Expected layout: [uint64 header_len][proto, header_len bytes][data].
  bool ConstructDumpData(debugger::dump::DumpData *dump_data_proto, std::vector<char> *data_ptr) {
    if (chunk_list_.empty()) {
      return false;
    }
    // Merge the chunk pieces into one contiguous buffer; reserve avoids reallocations.
    std::string dump_proto_str;
    dump_proto_str.reserve(total_sz_);
    for (const auto &item : chunk_list_) {  // const ref: the old by-value loop copied every chunk
      dump_proto_str += item;
    }
    chunk_list_.clear();
    const uint64_t header_len_offset = 8;
    // Fix: validate sizes before substr below; a truncated transfer used to raise
    // std::out_of_range instead of failing gracefully.
    if (dump_proto_str.size() < header_len_offset) {
      MS_LOG(ERROR) << "Dump data is too short to hold the header length field.";
      return false;
    }
    uint64_t header_len = *reinterpret_cast<const uint64_t *>(dump_proto_str.c_str());
    if (dump_proto_str.size() < header_len_offset + header_len) {
      MS_LOG(ERROR) << "Dump data is shorter than the recorded proto header length.";
      return false;
    }
    std::string header = dump_proto_str.substr(header_len_offset, header_len);
    if (!dump_data_proto->ParseFromString(header)) {
      MS_LOG(ERROR) << "Failed to parse dump proto file.";
      return false;
    }
    auto data_sz = dump_proto_str.size() - header_len_offset - header_len;
    data_ptr->resize(data_sz);
    auto ret = memcpy_s(data_ptr->data(), data_sz, dump_proto_str.c_str() + header_len_offset + header_len, data_sz);
    if (ret != 0) {
      MS_LOG(ERROR) << "Failed to get data from Adx";
      return false;
    }
    return true;
  }
#endif

 private:
  std::vector<std::string> chunk_list_;  // chunk payloads in arrival order
  uint64_t total_sz_{0};                 // total bytes buffered across all chunks
};
#endif // MINDSPORE_CCSRC_DEBUG_DUMP_DATA_BUILDER_H_

View File

@ -118,7 +118,10 @@ def generate_dump_json(dump_path, json_file_name, test_key):
elif test_key == "test_Ascend_async_multi_root_graph_dump":
data = async_dump_dict_3
data["common_dump_settings"]["path"] = dump_path
elif test_key == "test_async_dump_file_format":
data = async_dump_dict
data["common_dump_settings"]["path"] = dump_path
data["common_dump_settings"]["file_format"] = "npy"
else:
raise ValueError(
"Failed to generate dump json file. The test name value " + test_key + " is invalid.")

View File

@ -51,17 +51,12 @@ x = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
y = np.array([[7, 8, 9], [10, 11, 12]]).astype(np.float32)
@pytest.mark.level1
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_async_dump():
def run_async_dump(test_name):
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
dump_path = os.path.join(tmp_dir, 'async_dump')
dump_config_path = os.path.join(tmp_dir, 'async_dump.json')
generate_dump_json(dump_path, dump_config_path, 'test_async_dump')
generate_dump_json(dump_path, dump_config_path, test_name)
os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
if os.path.isdir(dump_path):
@ -76,6 +71,35 @@ def test_async_dump():
del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level1
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_async_dump():
    """
    Feature: async dump on Ascend
    Description: test async dump with default file_format value
    Expectation: dump data are generated as protobuf file format (suffix with timestamp)
    """
    # No 'file_format' key in the dump config: must keep the legacy protobuf output.
    run_async_dump("test_async_dump")
# Skipped until the Ascend run package carrying the matching ADX callback support is
# released (per the skip reason); remove the marker once the run package is updated.
@pytest.mark.skip(reason="wait for run package updates in Dec 01")
@pytest.mark.level1
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_async_dump_file_format():
    """
    Feature: async dump on Ascend in npy format
    Description: test async dump with file_format is configured as npy
    Expectation: dump data are generated as npy file format
    """
    run_async_dump("test_async_dump_file_format")
def run_e2e_dump():
if sys.platform != 'linux':
return