From 9e717bd7fa127ba825c551bf57a217ebe5ffa7b1 Mon Sep 17 00:00:00 2001 From: TinaMengtingZhang Date: Thu, 17 Mar 2022 16:18:32 -0400 Subject: [PATCH] fix load and convert tensor twice in ascend kernel by kernel dump --- .../ccsrc/debug/data_dump/dump_json_parser.cc | 1 + mindspore/ccsrc/debug/data_dump/e2e_dump.cc | 94 +++++++------------ mindspore/ccsrc/debug/data_dump/e2e_dump.h | 20 ++-- .../ccsrc/debug/data_dump/tensor_stat_dump.cc | 12 +-- mindspore/ccsrc/debug/debug_services.cc | 7 +- mindspore/ccsrc/debug/debug_services.h | 4 +- mindspore/ccsrc/debug/debugger/debugger.cc | 19 ++-- mindspore/ccsrc/debug/debugger/debugger.h | 6 +- .../ccsrc/debug/debugger/debugger_utils.cc | 63 ++++++++----- .../ccsrc/debug/debugger/debugger_utils.h | 4 +- mindspore/ccsrc/debug/tensor_data.h | 19 ++++ mindspore/ccsrc/debug/tensor_load.h | 17 +--- .../hal/device/ascend_device_address.cc | 20 +++- .../ascend/hal/device/ascend_device_address.h | 2 +- .../gpu/hal/device/gpu_device_address.cc | 3 +- .../gpu/hal/device/gpu_device_address.h | 2 +- .../gpu/hal/device/gpu_kernel_runtime.cc | 5 +- .../ccsrc/runtime/device/device_address.h | 2 +- 18 files changed, 152 insertions(+), 148 deletions(-) diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc index 6a6df51db55..ac6c689b657 100644 --- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc +++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc @@ -263,6 +263,7 @@ bool DumpJsonParser::DumpToFile(const std::string &filename, const void *data, s return false; } const std::string file_path_str = file_path.value(); + MS_LOG(INFO) << "Dump path is " << file_path_str; ChangeFileMode(file_path_str, S_IWUSR); std::ofstream fd(file_path_str, std::ios::out | std::ios::trunc | std::ios::binary); if (!fd.is_open()) { diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc index d1942b2f11f..24072f6b753 100644 --- 
a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc @@ -118,23 +118,23 @@ bool E2eDump::IsDeviceTargetGPU() { return context->get_param(MS_CTX_DEVICE_TARGET) == kGPUDevice; } +bool E2eDump::IsMindRTKernelByKernel() { + return IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag(); +} + /* * Feature group: Dump. - * Target device group: GPU. + * Target device group: GPU, Ascend. * Runtime category: Old runtime, MindRT. - * Description: This function is for dumping tensor in memory to disk in GPU machine. + * Description: This function is for dumping tensor loaded to tensor_loader in memory to disk in GPU and Ascend machine. */ -void E2eDump::DumpGPUMemToFile(const Debugger *debugger, const std::string &file_path, bool trans_flag, - const device::DeviceAddress &addr, const std::string &original_kernel_name, size_t slot, - const ShapeVector &int_shapes, const TypeId &host_type) { +void E2eDump::DumpMemFromTensorLoaderToFile(const Debugger *debugger, const std::string &file_path, + const std::string &original_kernel_name, size_t slot) { #ifdef ENABLE_DEBUGGER - auto format = kOpFormat_DEFAULT; MS_EXCEPTION_IF_NULL(debugger); - auto ret = debugger->DumpTensorToFile(file_path, trans_flag, format, addr.format(), original_kernel_name, slot, - int_shapes, host_type); + auto ret = debugger->DumpTensorToFile(file_path, original_kernel_name, slot); if (!ret) { - MS_LOG(INFO) << "DumpTensorToFile Failed: flag:" << trans_flag << ", path:" << file_path - << ", host_format:" << format; + MS_LOG(INFO) << "DumpTensorToFile Failed: path:" << file_path; } #endif } @@ -184,6 +184,7 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s continue; } auto addr = AnfAlgo::GetOutputAddr(node, j); + std::string node_name = GetKernelNodeName(node); MS_EXCEPTION_IF_NULL(addr); ShapeVector int_shapes; GetDumpIntShape(node, j, NOT_NULL(&int_shapes), trans_flag); @@ -196,14 +197,13 @@ void 
E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' + std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output." + std::to_string(j); - if (DumpJsonParser::GetInstance().IsStatisticDump() && - (IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag())) { + if (DumpJsonParser::GetInstance().IsStatisticDump() && IsMindRTKernelByKernel()) { TensorStatDump stat_dump(op_type, op_name, task_id, stream_id, timestamp, false, j, j); - (void)stat_dump.DumpTensorStatsToFile(GetKernelNodeName(node), dump_path, debugger); + (void)stat_dump.DumpTensorStatsToFile(node_name, dump_path, debugger); } if (DumpJsonParser::GetInstance().IsTensorDump()) { - if (IsDeviceTargetGPU()) { - DumpGPUMemToFile(debugger, file_path, trans_flag, *addr, GetKernelNodeName(node), j, int_shapes, type); + if (IsMindRTKernelByKernel()) { + DumpMemFromTensorLoaderToFile(debugger, file_path, node_name, j); } else { DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag); } @@ -213,10 +213,8 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s void E2eDump::DumpOutputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path, std::string *kernel_name) { - auto debugger = Debugger::GetInstance(); - MS_EXCEPTION_IF_NULL(debugger); - if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) { - MS_LOG(INFO) << "DumpInputData is only for graph mode on Ascend"; + if (IsMindRTKernelByKernel()) { + MS_LOG(INFO) << "DumpOutputData is only for graph mode on Ascend"; return; } MS_EXCEPTION_IF_NULL(node); @@ -256,8 +254,7 @@ void E2eDump::DumpInput(const session::KernelGraph *graph, const std::string &du } } -void E2eDump::DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger, - const KernelLaunchInfo *launch_info) { +void 
E2eDump::DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger) { auto &dump_json_parser = DumpJsonParser::GetInstance(); if (!dump_json_parser.InputNeedDump()) { return; @@ -269,25 +266,11 @@ void E2eDump::DumpInputSingleNode(const CNodePtr &node, const std::string &dump_ return; } DumpJsonParser::GetInstance().MatchKernel(kernel_name); - DumpInputImpl(node, trans_flag, dump_path, &kernel_name, debugger, launch_info); -} - -std::shared_ptr CreateAscendDeviceAddress(const KernelLaunchInfo *launch_info, size_t index, - TypeId type) { - MS_EXCEPTION_IF_NULL(launch_info); - auto addr_ptr = launch_info->inputs_[index]; - auto ms_context = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(ms_context); - auto device_id = ms_context->get_param(MS_CTX_DEVICE_ID); - auto device_context = - device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({kAscendDevice, device_id}); - auto format = kOpFormat_DEFAULT; - MS_EXCEPTION_IF_NULL(addr_ptr); - return device_context->CreateDeviceAddress(addr_ptr->addr, addr_ptr->size, format, type, ShapeVector()); + DumpInputImpl(node, trans_flag, dump_path, &kernel_name, debugger); } void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path, - std::string *kernel_name, const Debugger *debugger, const KernelLaunchInfo *launch_info) { + std::string *kernel_name, const Debugger *debugger) { MS_EXCEPTION_IF_NULL(node); GetFileKernelName(NOT_NULL(kernel_name)); auto input_size = common::AnfAlgo::GetInputTensorNum(node); @@ -298,12 +281,12 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st if (!AnfAlgo::OutputAddrExist(input, index)) { continue; } - std::string tensor_name = GetKernelNodeName(node); + std::string node_name = GetKernelNodeName(node); size_t slot = j; - if (IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag()) { + if (IsMindRTKernelByKernel()) { auto input_kernel = node->input(j + 1); 
std::string input_kernel_name = GetKernelNodeName(input_kernel); - tensor_name = input_kernel_name; + node_name = input_kernel_name; slot = 0; } ShapeVector int_shapes; @@ -318,18 +301,13 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".input." + std::to_string(j); auto addr = AnfAlgo::GetOutputAddr(input, index); MS_EXCEPTION_IF_NULL(addr); - if (DumpJsonParser::GetInstance().IsStatisticDump() && - (IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag())) { + if (DumpJsonParser::GetInstance().IsStatisticDump() && IsMindRTKernelByKernel()) { TensorStatDump stat_dump(op_type, op_name, task_id, stream_id, timestamp, true, j, slot); - (void)stat_dump.DumpTensorStatsToFile(tensor_name, dump_path, debugger); + (void)stat_dump.DumpTensorStatsToFile(node_name, dump_path, debugger); } if (DumpJsonParser::GetInstance().IsTensorDump()) { - if (IsDeviceTargetGPU()) { - DumpGPUMemToFile(debugger, file_path, trans_flag, *addr, tensor_name, slot, int_shapes, type); - } else if (Debugger::GetInstance()->GetAscendKernelByKernelFlag()) { - // load address from launch_info when it's Ascend Kernel by kernel mode. 
- auto ascend_device_addr = CreateAscendDeviceAddress(launch_info, j, type); - DumpMemToFile(file_path, *ascend_device_addr, int_shapes, type, trans_flag); + if (IsMindRTKernelByKernel()) { + DumpMemFromTensorLoaderToFile(debugger, file_path, node_name, slot); } else { DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag); } @@ -339,9 +317,7 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st void E2eDump::DumpInputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path, std::string *kernel_name) { - auto debugger = Debugger::GetInstance(); - MS_EXCEPTION_IF_NULL(debugger); - if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) { + if (IsMindRTKernelByKernel()) { MS_LOG(INFO) << "DumpInputData is only for graph mode on Ascend"; return; } @@ -409,7 +385,7 @@ void E2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_ (void)stat_dump.DumpTensorStatsToFile(node_name, dump_path, debugger); } if (dump_json_parser.IsTensorDump()) { - DumpGPUMemToFile(debugger, file_path, trans_flag, *addr, node_name, 0, int_shapes, type); + DumpMemFromTensorLoaderToFile(debugger, file_path, node_name, 0); } } else { DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag); @@ -452,7 +428,7 @@ void E2eDump::DumpSingleParameterNode(const AnfNodePtr &anf_node, const std::str (void)stat_dump.DumpTensorStatsToFile(node_name, dump_path, debugger); } if (dump_json_parser.IsTensorDump()) { - DumpGPUMemToFile(debugger, file_path, trans_flag, *addr, node_name, 0, int_shapes, type); + DumpMemFromTensorLoaderToFile(debugger, file_path, node_name, 0); } } else { DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag); @@ -662,13 +638,12 @@ void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons * Runtime category: MindRT. * Description: This function is for dumping a single node. It is used for mindrt in GPU and Ascend kernel-by-kernel. 
*/ -bool E2eDump::DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id, const Debugger *debugger, - const KernelLaunchInfo *launch_info) { +bool E2eDump::DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id, const Debugger *debugger) { bool success = false; auto &dump_json_parser = DumpJsonParser::GetInstance(); if (dump_json_parser.DumpEnabledForIter()) { std::string dump_path = GenerateDumpPath(graph_id, rank_id); - DumpInputSingleNode(node, dump_path, debugger, launch_info); + DumpInputSingleNode(node, dump_path, debugger); DumpOutputSingleNode(node, dump_path, debugger); success = true; } @@ -761,9 +736,10 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum if (dump_tensor_vec.empty()) { return; } + // The maximum tensor size allowed for single-threaded format conversion is 1 MB. constexpr int kMaxTensorSize = 1048576; if (offset <= kMaxTensorSize) { - // If the total tensor size is less than 1Mb, do it in single thread. + // If the total tensor size is less than 1MB, do it in single thread. ConvertFormatForTensors(&dump_tensor_vec, 0, dump_tensor_vec.size() - 1); } else { // In multi_thread process, we only use 1/4 of the total concurrent threads. 
@@ -775,7 +751,7 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum std::vector threads; threads.reserve(num_threads); MS_LOG(INFO) << "Number of threads used for A+M dump: " << num_threads; - for (size_t t = 0; t < threads.capacity(); t++) { + for (size_t t = 0; t < num_threads; t++) { uint32_t start_idx = t * task_size; uint32_t end_idx = start_idx + task_size - 1; if (t == num_threads - 1) { diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.h b/mindspore/ccsrc/debug/data_dump/e2e_dump.h index a16c4c0fc58..bbdd318cc34 100644 --- a/mindspore/ccsrc/debug/data_dump/e2e_dump.h +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.h @@ -32,7 +32,6 @@ #endif #include "include/backend/visible.h" -using mindspore::kernel::KernelLaunchInfo; #ifndef ENABLE_DEBUGGER class Debugger; #endif @@ -71,12 +70,11 @@ class E2eDump { static void DumpParametersData(uint32_t rank_id, const Debugger *debugger); static bool DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id, - const Debugger *debugger = nullptr, const KernelLaunchInfo *launch_info = nullptr); + const Debugger *debugger = nullptr); // Dump data when task error. 
static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path, - std::string *kernel_name, const Debugger *debugger, - const KernelLaunchInfo *launch_info = nullptr); + std::string *kernel_name, const Debugger *debugger); static void DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path, std::string *kernel_name, const Debugger *debugger); @@ -93,6 +91,10 @@ class E2eDump { char *data_ptr); #endif + static bool IsDeviceTargetGPU(); + + static bool IsMindRTKernelByKernel(); + private: static void DumpOutput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger); @@ -100,15 +102,13 @@ class E2eDump { static void DumpInput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger); - static void DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger, - const KernelLaunchInfo *launch_info = nullptr); + static void DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger); static void DumpParameters(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger); - static void DumpGPUMemToFile(const Debugger *debugger, const std::string &file_path, bool trans_flag, - const device::DeviceAddress &addr, const std::string &original_kernel_name, size_t slot, - const ShapeVector &int_shapes, const TypeId &host_type); - static bool IsDeviceTargetGPU(); + static void DumpMemFromTensorLoaderToFile(const Debugger *debugger, const std::string &file_path, + const std::string &original_kernel_name, size_t slot); + static void DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_index, const std::string &dump_path, bool trans_flag, const Debugger *debugger); diff --git a/mindspore/ccsrc/debug/data_dump/tensor_stat_dump.cc b/mindspore/ccsrc/debug/data_dump/tensor_stat_dump.cc index 380dcc184bf..fbfaaae1554 100644 --- 
a/mindspore/ccsrc/debug/data_dump/tensor_stat_dump.cc +++ b/mindspore/ccsrc/debug/data_dump/tensor_stat_dump.cc @@ -33,11 +33,6 @@ constexpr auto kCsvFileName = "statistic.csv"; } // namespace namespace mindspore { -const std::map kDbgDataTypeToStringMap = { - {DT_BOOL, "bool"}, {DT_INT8, "int8"}, {DT_INT16, "int16"}, {DT_INT32, "int32"}, - {DT_INT64, "int64"}, {DT_UINT8, "uint8"}, {DT_UINT16, "uint16"}, {DT_UINT32, "uint32"}, - {DT_UINT64, "uint64"}, {DT_FLOAT16, "float16"}, {DT_FLOAT32, "float32"}, {DT_FLOAT64, "float64"}}; - bool CsvWriter::OpenFile(const std::string &path, const std::string &header) { if (file_.is_open() && path == file_path_str_) { return true; @@ -162,13 +157,10 @@ bool TensorStatDump::DumpTensorStatsToFile(const std::string &dump_path, const s MS_LOG(INFO) << "Tensor data is empty, skipping current statistics"; return false; } - std::string type; - auto iter_type = kDbgDataTypeToStringMap.find(data->GetType()); - if (iter_type == kDbgDataTypeToStringMap.end()) { + std::string type = data->GetTypeString(); + if (type.empty()) { type = "unsupported(" + std::to_string(data->GetType()) + ")"; MS_LOG(INFO) << "Unsupported tensor data_type " << type << " for tensor " << data->GetName(); - } else { - type = iter_type->second; } if (!OpenStatisticsFile(dump_path)) { return false; diff --git a/mindspore/ccsrc/debug/debug_services.cc b/mindspore/ccsrc/debug/debug_services.cc index a59e3c0f981..fa33f22a13c 100644 --- a/mindspore/ccsrc/debug/debug_services.cc +++ b/mindspore/ccsrc/debug/debug_services.cc @@ -1836,11 +1836,8 @@ std::shared_ptr DebugServices::GetTensor(const std::string &tensor_n void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); } #ifdef ONLINE_DBG_MODE -bool DebugServices::DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt, - const std::string &addr_format, const std::string &tensor_name, size_t slot, - const std::vector &host_shape, TypeId host_type) const { - return 
tensor_loader_->DumpTensorToFile(filepath, trans_flag, host_fmt, addr_format, tensor_name, slot, host_shape, - host_type); +bool DebugServices::DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const { + return tensor_loader_->DumpTensorToFile(filepath, tensor_name, slot); } #endif diff --git a/mindspore/ccsrc/debug/debug_services.h b/mindspore/ccsrc/debug/debug_services.h index 27e30895c01..2b1edcae392 100644 --- a/mindspore/ccsrc/debug/debug_services.h +++ b/mindspore/ccsrc/debug/debug_services.h @@ -461,9 +461,7 @@ class DebugServices { void EmptyCurrentTensor(); #ifdef ONLINE_DBG_MODE - bool DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt, - const std::string &addr_format, const std::string &tensor_name, size_t slot, - const std::vector &host_shape, TypeId host_type) const; + bool DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const; #endif bool LoadNewTensor(const std::shared_ptr &tensor, bool keep_prev); diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 6dcd181a8c4..cd1cf96c836 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -527,10 +527,10 @@ void Debugger::DumpConstantDataAscend(const KernelGraphPtr &graph) { * Runtime category: MindRT. * Description: Dumps a single node for given graph_id. 
*/ -void Debugger::DumpSingleNode(const CNodePtr &node, uint32_t graph_id, const KernelLaunchInfo *launch_info) { +void Debugger::DumpSingleNode(const CNodePtr &node, uint32_t graph_id) { if (debugger_ && debugger_->DebuggerBackendEnabled()) { uint32_t rank_id = GetRankID(); - (void)E2eDump::DumpSingleNodeData(node, graph_id, rank_id, debugger_.get(), launch_info); + (void)E2eDump::DumpSingleNodeData(node, graph_id, rank_id, debugger_.get()); } } @@ -1335,11 +1335,8 @@ void Debugger::SendWatchpoints(const std::list &points) { } } -bool Debugger::DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt, - const std::string &addr_format, const std::string &tensor_name, size_t slot, - const std::vector &host_shape, TypeId host_type) const { - return debug_services_.get()->DumpTensorToFile(filepath, trans_flag, host_fmt, addr_format, tensor_name, slot, - host_shape, host_type); +bool Debugger::DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const { + return debug_services_.get()->DumpTensorToFile(filepath, tensor_name, slot); } bool Debugger::LoadNewTensor(const std::shared_ptr &tensor, bool keep_prev) { @@ -1541,7 +1538,8 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output } else { keep_prev = false; } - bool ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev, root_graph_id, false); + bool ret = + addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev, root_graph_id, false, true); if (!ret) { MS_LOG(ERROR) << "LoadMemToHost:" << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!"; @@ -1572,7 +1570,7 @@ void Debugger::LoadSingleParameterMindRT(const AnfNodePtr &node) { } // Keep_prev is True for parameters. // force update for parameters. 
- bool ret = device_addr->LoadMemToHost(tensor_name, 0, format, int_shapes, type, 0, true, root_graph_id, true); + bool ret = device_addr->LoadMemToHost(tensor_name, 0, format, int_shapes, type, 0, true, root_graph_id, true, true); if (!ret) { MS_LOG(ERROR) << "LoadMemToHost:" << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!"; @@ -1702,7 +1700,8 @@ void Debugger::LoadGraphOutputs() { auto format = kOpFormat_DEFAULT; string tensor_name = kernel_name + ':' + std::to_string(j); ShapeVector int_shapes = trans::GetRuntimePaddingShape(node, j); - auto ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, root_graph_id, false); + auto ret = + addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, root_graph_id, false, true); if (!ret) { MS_LOG(ERROR) << "LoadMemToHost:" << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!"; diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index 8f9e0265d8f..2458661751c 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -107,7 +107,7 @@ class BACKEND_EXPORT Debugger : public std::enable_shared_from_this { void DumpConstantDataAscend(const KernelGraphPtr &graph); - void DumpSingleNode(const CNodePtr &node, uint32_t graph_id, const KernelLaunchInfo *launch_info = nullptr); + void DumpSingleNode(const CNodePtr &node, uint32_t graph_id); void DumpInGraphCompiler(const KernelGraphPtr &kernel_graph); @@ -117,9 +117,7 @@ class BACKEND_EXPORT Debugger : public std::enable_shared_from_this { void PostExecuteNode(const CNodePtr &kernel, bool last_kernel); - bool DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt, - const std::string &addr_format, const std::string &tensor_name, size_t slot, - const std::vector &host_shape, TypeId host_type) const; + bool DumpTensorToFile(const std::string &filepath, const 
std::string &tensor_name, size_t slot) const; bool LoadNewTensor(const std::shared_ptr &tensor, bool keep_prev); diff --git a/mindspore/ccsrc/debug/debugger/debugger_utils.cc b/mindspore/ccsrc/debug/debugger/debugger_utils.cc index 4c57d1b58e8..e4a71c9286d 100644 --- a/mindspore/ccsrc/debug/debugger/debugger_utils.cc +++ b/mindspore/ccsrc/debug/debugger/debugger_utils.cc @@ -66,12 +66,12 @@ std::vector CheckRealOutput(const std::string &node_name, const size_t & /* * Feature group: Dump, Online debugger. - * Target device group: GPU. + * Target device group: GPU, Ascend. * Runtime category: MindRT. * Description: Get kernel inputs from launch_info and load the inputs from device to host. */ void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order, uint32_t root_graph_id, - const DeviceContext *device_context) { + const DeviceContext *device_context, const bool trans_flag) { // get inputs auto kernel_inputs = launch_info->inputs_; auto input_size = common::AnfAlgo::GetInputTensorNum(cnode); @@ -79,33 +79,40 @@ void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint auto input_kernel = cnode->input(j + 1); std::string input_kernel_name = GetKernelNodeName(input_kernel); auto addr = kernel_inputs[j]; - auto type = common::AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX); + auto device_type = AnfAlgo::GetOutputDeviceDataType(input_kernel, PARAMETER_OUTPUT_INDEX); + auto host_type = common::AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX); + auto type = trans_flag ? host_type : device_type; // For example, this happens with the Depend op if (type == kMetaTypeNone) { continue; } - auto format = kOpFormat_DEFAULT; - auto device_addr = device_context->CreateDeviceAddress(addr->addr, addr->size, format, type, ShapeVector()); + auto host_format = kOpFormat_DEFAULT; + auto device_format = + E2eDump::IsDeviceTargetGPU() ? 
kOpFormat_DEFAULT : AnfAlgo::GetOutputFormat(input_kernel, PARAMETER_OUTPUT_INDEX); + auto device_addr = + device_context->CreateDeviceAddress(addr->addr, addr->size, device_format, device_type, ShapeVector()); string input_tensor_name = input_kernel_name + ':' + "0"; - ShapeVector int_shapes = trans::GetRuntimePaddingShape(input_kernel, PARAMETER_OUTPUT_INDEX); - auto ret = device_addr->LoadMemToHost(input_tensor_name, UintToInt(exec_order), format, int_shapes, type, 0, true, - root_graph_id, false); + ShapeVector int_shapes; + GetDumpIntShape(input_kernel, PARAMETER_OUTPUT_INDEX, NOT_NULL(&int_shapes), trans_flag); + auto ret = device_addr->LoadMemToHost(input_tensor_name, UintToInt(exec_order), host_format, int_shapes, type, 0, + true, root_graph_id, false, trans_flag); if (!ret) { MS_LOG(ERROR) << "LoadMemToHost:" - << ", tensor_name:" << input_tensor_name << ", host_format:" << host_format + ", device_format:" << device_format << "."; } } } /* * Feature group: Dump, Online debugger. - * Target device group: GPU. + * Target device group: GPU, Ascend. * Runtime category: MindRT. - * Description: Get kernel outputs from launch_info and load the inputs from device to host. + * Description: Get kernel outputs from launch_info and load the outputs from device to host. 
*/ void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order, - uint32_t root_graph_id, const DeviceContext *device_context) { + uint32_t root_graph_id, const DeviceContext *device_context, const bool trans_flag) { // get outputs auto kernel_outputs = launch_info->outputs_; auto output_size = common::AnfAlgo::GetOutputTensorNum(cnode); @@ -115,21 +122,27 @@ void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uin for (size_t j : real_outputs) { auto addr = kernel_outputs[j]; - auto type = common::AnfAlgo::GetOutputInferDataType(cnode, j); + auto device_type = AnfAlgo::GetOutputDeviceDataType(cnode, j); + auto host_type = common::AnfAlgo::GetOutputInferDataType(cnode, j); + auto type = trans_flag ? host_type : device_type; // For example, this happens with the Depend op if (type == kMetaTypeNone) { continue; } - auto format = kOpFormat_DEFAULT; - auto device_addr = device_context->CreateDeviceAddress(addr->addr, addr->size, format, type, ShapeVector()); + auto host_format = kOpFormat_DEFAULT; + auto device_format = E2eDump::IsDeviceTargetGPU() ? 
kOpFormat_DEFAULT : AnfAlgo::GetOutputFormat(cnode, j); + auto device_addr = + device_context->CreateDeviceAddress(addr->addr, addr->size, device_format, device_type, ShapeVector()); string tensor_name = kernel_name + ':' + std::to_string(j); - ShapeVector int_shapes = trans::GetRuntimePaddingShape(cnode, j); - auto ret = device_addr->LoadMemToHost(tensor_name, UintToInt(exec_order), format, int_shapes, type, j, false, - root_graph_id, false); + ShapeVector int_shapes; + GetDumpIntShape(cnode, j, NOT_NULL(&int_shapes), trans_flag); + auto ret = device_addr->LoadMemToHost(tensor_name, UintToInt(exec_order), host_format, int_shapes, type, j, false, + root_graph_id, false, trans_flag); if (!ret) { MS_LOG(ERROR) << "LoadMemToHost:" - << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!"; + << ", tensor_name:" << tensor_name << ", host_format:" << host_format + << ", device_format:" << device_format << ".!"; } } } @@ -168,6 +181,13 @@ bool IsDeviceTargetGPU() { return context->get_param(MS_CTX_DEVICE_TARGET) == kGPUDevice; } +bool GetTransFlag() { + if (Debugger::GetInstance()->debugger_enabled() || IsDeviceTargetGPU()) { + return true; + } + return DumpJsonParser::GetInstance().trans_flag(); +} + /* * Feature group: Dump, Online debugger. * Target device group: Ascend, GPU. 
@@ -187,11 +207,12 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, auto kernel_graph = std::dynamic_pointer_cast(cnode->func_graph()); MS_EXCEPTION_IF_NULL(kernel_graph); auto root_graph_id = kernel_graph->root_graph_id(); + bool trans_flag = GetTransFlag(); if (debugger->debugger_enabled() || dump_json_parser.InputNeedDump()) { - LoadInputs(cnode, launch_info, exec_order, root_graph_id, device_context); + LoadInputs(cnode, launch_info, exec_order, root_graph_id, device_context, trans_flag); } if (debugger->debugger_enabled() || dump_json_parser.OutputNeedDump()) { - LoadOutputs(cnode, launch_info, exec_order, root_graph_id, device_context); + LoadOutputs(cnode, launch_info, exec_order, root_graph_id, device_context, trans_flag); } // Dump kernel if (dump_enabled) { @@ -202,7 +223,7 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, debugger->DumpSingleNode(cnode, graph_id); } else { // for Ascend, node are dumped in root_graph_id directory. 
- debugger->DumpSingleNode(cnode, root_graph_id, launch_info); + debugger->DumpSingleNode(cnode, root_graph_id); } // Clear Dumped data when online debugger is not enabled if (!debugger->debugger_enabled()) { diff --git a/mindspore/ccsrc/debug/debugger/debugger_utils.h b/mindspore/ccsrc/debug/debugger/debugger_utils.h index b22fecf0f9d..c0fe35dc90c 100644 --- a/mindspore/ccsrc/debug/debugger/debugger_utils.h +++ b/mindspore/ccsrc/debug/debugger/debugger_utils.h @@ -33,10 +33,10 @@ namespace mindspore { std::vector CheckRealOutput(const std::string &node_name, const size_t &output_size); void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order, uint32_t root_graph_id, - const DeviceContext *device_context); + const DeviceContext *device_context, const bool trans_flag); void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order, - uint32_t root_graph_id, const DeviceContext *device_context); + uint32_t root_graph_id, const DeviceContext *device_context, const bool trans_flag); bool CheckReadData(const CNodePtr &cnode); diff --git a/mindspore/ccsrc/debug/tensor_data.h b/mindspore/ccsrc/debug/tensor_data.h index 1f4b434b709..f1bee8d0da7 100644 --- a/mindspore/ccsrc/debug/tensor_data.h +++ b/mindspore/ccsrc/debug/tensor_data.h @@ -17,6 +17,7 @@ #define MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_ #include +#include #include #include #include @@ -203,6 +204,10 @@ class TensorData { #ifdef ONLINE_DBG_MODE void SetTensor(const mindspore::tensor::TensorPtr &out_tensor) { this->tensor_ptr_ = out_tensor; } + + void SetFormat(const std::string &format) { this->format_ = format; } + + std::string GetFormat() { return this->format_; } #endif void SetSlot(size_t slot) { this->slot_ = slot; } @@ -239,6 +244,19 @@ class TensorData { DbgDataType GetType() const { return this->data_type_; } + std::string GetTypeString() const { + const std::map kDbgDataTypeToStringMap = { + {DT_BOOL, "bool"}, {DT_INT8, "int8"}, 
{DT_INT16, "int16"}, {DT_INT32, "int32"}, + {DT_INT64, "int64"}, {DT_UINT8, "uint8"}, {DT_UINT16, "uint16"}, {DT_UINT32, "uint32"}, + {DT_UINT64, "uint64"}, {DT_FLOAT16, "float16"}, {DT_FLOAT32, "float32"}, {DT_FLOAT64, "float64"}}; + auto iter_type = kDbgDataTypeToStringMap.find(data_type_); + if (iter_type == kDbgDataTypeToStringMap.end()) { + return std::string(); + } else { + return iter_type->second; + } + } + void SetType(unsigned int type) { ConvertMsToDbgType(type); } void SetType(const std::string &type_name) { ConvertStringToDbgType(type_name); } @@ -438,6 +456,7 @@ class TensorData { std::string time_stamp_; #ifdef ONLINE_DBG_MODE + std::string format_{""}; mindspore::tensor::TensorPtr tensor_ptr_{nullptr}; #endif }; diff --git a/mindspore/ccsrc/debug/tensor_load.h b/mindspore/ccsrc/debug/tensor_load.h index e0e6513b85c..59dc3f08677 100644 --- a/mindspore/ccsrc/debug/tensor_load.h +++ b/mindspore/ccsrc/debug/tensor_load.h @@ -244,29 +244,20 @@ class TensorLoader { * Runtime category: Old runtime, MindRT. * Description: Load tensor data from debugger backend cache (tensor_list_map_) and dump to file in npy format. */ - bool DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt, - const std::string &addr_format, const std::string &tensor_name, size_t slot, - const std::vector &host_shape, TypeId host_type) { + bool DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) { if (filepath.empty()) { MS_LOG(ERROR) << "Dump file path is null!"; return false; } - std::string path = ""; - if (trans_flag) { - path = filepath + '.' + host_fmt; - } else { - path = filepath + '.' 
+ addr_format; - } - - MS_LOG(INFO) << "Dump path is " << path; std::string tensor_loader_name = tensor_name + ":" + std::to_string(slot); auto iter = tensor_list_map_.find(tensor_loader_name); if (iter != tensor_list_map_.end()) { std::shared_ptr node = iter->second; - size_t host_size = node->GetByteSize(); + std::string path = filepath + '.' + node->GetFormat(); - return DumpJsonParser::DumpToFile(path, node->GetDataPtr(), host_size, host_shape, host_type); + return DumpJsonParser::DumpToFile(path, node->GetDataPtr(), node->GetByteSize(), node->GetShape(), + StringToTypeId(node->GetTypeString())); } MS_LOG(INFO) << "Tensor name:" << tensor_name << " not found in tensor_list_map_"; return false; diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.cc index 40d05d42bc2..fa9e1766d8f 100644 --- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.cc +++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.cc @@ -647,9 +647,10 @@ bool AscendDeviceAddress::DumpMemToFile(const std::string &filepath, const std:: * Runtime category: Old runtime, MindRT. * Description: Load tensor to host and create tensor_data object for the loaded tensor. 
*/ -bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &, - const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev, - uint32_t root_graph_id, bool force_update) const { +bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, + const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type, + size_t slot, bool keep_prev, uint32_t root_graph_id, bool force_update, + bool trans_flag) const { bool ret = false; auto debugger = Debugger::GetInstance(); MS_EXCEPTION_IF_NULL(debugger); @@ -671,9 +672,14 @@ bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int exec mindspore::tensor::TensorPtr out_tensor = std::make_shared(host_type, host_shape); MS_EXCEPTION_IF_NULL(out_tensor); size_t host_size = out_tensor->data().nbytes(); - auto ret_sync = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c()); + bool ret_sync = false; + if (trans_flag) { + ret_sync = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c()); + } else { + ret_sync = SyncDeviceToHost(host_size, out_tensor->data_c()); + } if (!ret_sync) { - MS_LOG(ERROR) << "Copy device mem to host failed"; + MS_LOG(ERROR) << "Convert format or Copy device mem to host failed"; return ret; } MS_LOG(INFO) << "E2E tensor name is " << tensor_name; @@ -683,7 +689,11 @@ bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int exec tensor_data->SetType((unsigned int)host_type); tensor_data->SetShape(out_tensor->shape()); tensor_data->SetRootGraphId(root_graph_id); + std::string tensor_format = trans_flag ? 
host_fmt : format_; + tensor_data->SetFormat(tensor_format); ret = debugger->LoadNewTensor(tensor_data, keep_prev); + MS_LOG(INFO) << "Load tensor '" << tensor_name << "' into debugger tensor loader successfully: format(" + << tensor_format << ")"; return ret; } #endif diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.h b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.h index a25831317da..fc501ad054a 100644 --- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.h +++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.h @@ -62,7 +62,7 @@ class AscendDeviceAddress : public DeviceAddress { #ifdef ENABLE_DEBUGGER bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev, - uint32_t root_graph_id, bool force_update) const override; + uint32_t root_graph_id, bool force_update, bool trans_flag) const override; #endif private: diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_device_address.cc b/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_device_address.cc index 9830ec50dfa..b3888d9529c 100644 --- a/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_device_address.cc +++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_device_address.cc @@ -185,7 +185,7 @@ GPUDeviceAddress::~GPUDeviceAddress() { ClearDeviceMemory(); } #ifdef ENABLE_DEBUGGER bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev, - uint32_t root_graph_id, bool force_update) const { + uint32_t root_graph_id, bool force_update, bool) const { bool ret = false; if (size_ == 0) { return true; @@ -219,6 +219,7 @@ bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int executi tensor_data->SetType((unsigned int)host_type); 
tensor_data->SetShape(out_tensor->shape()); tensor_data->SetRootGraphId(root_graph_id); + tensor_data->SetFormat(host_fmt); ret = Debugger::GetInstance()->LoadNewTensor(tensor_data, keep_prev); MS_LOG(INFO) << "E2E tensor name is " << tensor_name; return ret; diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_device_address.h b/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_device_address.h index d99cdf43c74..27379f5fb5d 100644 --- a/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_device_address.h +++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_device_address.h @@ -56,7 +56,7 @@ class GPUDeviceAddress : public DeviceAddress { #ifdef ENABLE_DEBUGGER bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev, - uint32_t root_graph_id, bool force_update) const override; + uint32_t root_graph_id, bool force_update, bool trans_flag) const override; #endif private: diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_kernel_runtime.cc b/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_kernel_runtime.cc index cce6e499d25..12965b2fb60 100644 --- a/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_kernel_runtime.cc @@ -183,7 +183,8 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, auto gpu_addr = std::make_unique(addr->addr, addr->size, format, type); string input_tensor_name = input_kernel_name + ':' + "0"; ShapeVector int_shapes = trans::GetRuntimePaddingShape(input_kernel, PARAMETER_OUTPUT_INDEX); - auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, true, 0, false); + auto ret = + gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, true, 0, false, true); if (!ret) { MS_LOG(ERROR) << "LoadMemToHost:" << ", tensor_name:" << input_tensor_name << ", host_format:" << 
format << ".!"; @@ -210,7 +211,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, auto gpu_addr = std::make_unique(addr->addr, addr->size, format, type); string tensor_name = kernel_name + ':' + std::to_string(j); ShapeVector int_shapes = trans::GetRuntimePaddingShape(kernel, j); - auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, 0, false); + auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, 0, false, true); if (!ret) { MS_LOG(ERROR) << "LoadMemToHost:" << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!"; diff --git a/mindspore/ccsrc/runtime/device/device_address.h b/mindspore/ccsrc/runtime/device/device_address.h index 5d1b25efdcd..b1d765dac7a 100644 --- a/mindspore/ccsrc/runtime/device/device_address.h +++ b/mindspore/ccsrc/runtime/device/device_address.h @@ -141,7 +141,7 @@ class DeviceAddress : public mindspore::DeviceSync { #ifdef ENABLE_DEBUGGER virtual bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev, - uint32_t root_graph_id, bool force_update) const { + uint32_t root_graph_id, bool force_update, bool trans_flag) const { return true; } #endif