diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc index a32d802f933..8255f2606d9 100644 --- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc +++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc @@ -382,9 +382,9 @@ void DumpJsonParser::ParseSavedData(const nlohmann::json &content) { << saved_data_ << ". Please set saved_data to either statistic, tensor, or full"; } auto context = MsContext::GetInstance(); - if (IsStatisticDump() && context->get_param(MS_CTX_DEVICE_TARGET) != kGPUDevice) { - MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on GPU device, please set " - "saved_data to tensor or use a GPU device"; + if (IsStatisticDump() && context->get_param(MS_CTX_DEVICE_TARGET) == kCPUDevice) { + MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on GPU and Ascend, please " + "set saved_data to tensor or use a GPU or Ascend device"; } } diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc index 0b96e0ea2a9..7be96fc76db 100644 --- a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -511,7 +512,7 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum for (uint32_t slot = 0; slot < input_tensors.size(); slot++) { auto in_tensor = input_tensors[slot]; std::string in_slot_path = in_path + std::to_string(slot) + "."; - auto succ = ConvertFormatForTensorAndDump(in_slot_path, in_tensor, data_ptr + offset); + auto succ = ConvertFormatForTensorAndDump(in_slot_path, in_tensor, data_ptr + offset, "input", slot); if (!succ) { MS_LOG(INFO) << "Failed to convert format for tensor " << in_slot_path; } @@ -524,7 +525,7 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum for (uint32_t slot = 0; 
slot < output_tensors.size(); slot++) { auto out_tensor = output_tensors[slot]; std::string out_slot_path = out_path + std::to_string(slot) + "."; - auto succ = ConvertFormatForTensorAndDump(out_slot_path, out_tensor, data_ptr + offset); + auto succ = ConvertFormatForTensorAndDump(out_slot_path, out_tensor, data_ptr + offset, "output", slot); if (!succ) { MS_LOG(INFO) << "Failed to convert format for tensor " << out_slot_path; } @@ -533,7 +534,40 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum } template -bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr) { +bool DumpTensorStatsIfNeeded(const std::string &dump_path, const T &tensor, char *data_ptr, const std::string &io, + uint32_t slot, const ShapeVector &shape, TypeId type) { + if (!DumpJsonParser::GetInstance().IsStatisticDump()) { + return true; + } + size_t pos = dump_path.rfind("/"); + std::string file_name = dump_path.substr(pos + 1); + size_t first_dot = file_name.find("."); + size_t second_dot = file_name.find(".", first_dot + 1); + size_t third_dot = file_name.find(".", second_dot + 1); + size_t fourth_dot = file_name.find(".", third_dot + 1); + size_t fifth_dot = file_name.find(".", fourth_dot + 1); + std::string op_type = file_name.substr(0, first_dot); + std::string op_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1); + std::string task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1); + std::string stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1); + std::string timestamp = file_name.substr(fourth_dot + 1, fifth_dot - fourth_dot - 1); + TensorStatDump stat_dump(op_type, op_name, task_id, stream_id, timestamp, io, slot, slot); + std::shared_ptr data = std::make_shared(); + try { + data->ConvertMsToDbgType(type); + } catch (...) 
{ + MS_LOG(ERROR) << "Data type of operator " << file_name << " is not supported by statistic dump"; + return false; + } + data->SetByteSize((size_t)tensor.size()); + data->SetShape(shape); + data->SetDataPtr(data_ptr); + return stat_dump.DumpTensorStatsToFile(dump_path.substr(0, pos), data); +} + +template +bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr, + const std::string &io, uint32_t slot) { // get format auto iter_fmt = kFormatToStringMap.find(tensor.format()); if (iter_fmt == kFormatToStringMap.end()) { @@ -584,13 +618,21 @@ bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tens } } // dump tensor data into npy file - bool dump_success = false; + bool dump_success = true; if (trans_success) { - dump_path += host_format; - dump_success = DumpJsonParser::DumpToFile(dump_path, trans_buf.data(), data_size, shape_to, src_type); + dump_success = DumpTensorStatsIfNeeded(dump_path, tensor, reinterpret_cast(trans_buf.data()), io, slot, + shape_to, src_type); + if (DumpJsonParser::GetInstance().IsTensorDump()) { + dump_path += host_format; + dump_success = + DumpJsonParser::DumpToFile(dump_path, trans_buf.data(), data_size, shape_to, src_type) && dump_success; + } } else { - dump_path += device_format; - dump_success = DumpJsonParser::DumpToFile(dump_path, data_ptr, data_size, shape_to, src_type); + dump_success = DumpTensorStatsIfNeeded(dump_path, tensor, data_ptr, io, slot, shape_to, src_type); + if (DumpJsonParser::GetInstance().IsTensorDump()) { + dump_path += device_format; + dump_success = DumpJsonParser::DumpToFile(dump_path, data_ptr, data_size, shape_to, src_type) && dump_success; + } } return dump_success; } diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.h b/mindspore/ccsrc/debug/data_dump/e2e_dump.h index e3ba3a37a69..844b92cc51b 100644 --- a/mindspore/ccsrc/debug/data_dump/e2e_dump.h +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.h @@ -95,7 +95,8 @@ class E2eDump { static 
nlohmann::json ParseOverflowInfo(char *data_ptr); template - static bool ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr); + static bool ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr, + const std::string &io, uint32_t slot); #endif inline static unsigned int starting_graph_id = INT32_MAX; diff --git a/mindspore/ccsrc/debug/data_dump/tensor_stat_dump.cc b/mindspore/ccsrc/debug/data_dump/tensor_stat_dump.cc index a2a5faee9ef..f18cb1feecb 100644 --- a/mindspore/ccsrc/debug/data_dump/tensor_stat_dump.cc +++ b/mindspore/ccsrc/debug/data_dump/tensor_stat_dump.cc @@ -41,7 +41,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) { } auto file_path = Common::CreatePrefixPath(path); if (!file_path.has_value()) { - MS_LOG(WARNING) << "CreatePrefixPath failed."; + MS_LOG(WARNING) << "CreatePrefixPath failed, skipping current statistics"; return false; } // try to open file @@ -55,7 +55,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) { file_.open(file_path_value, std::ios::out | std::ios::app | std::ios::binary); } if (!file_.is_open()) { - MS_LOG(WARNING) << "Open file " << path << " failed." << ErrnoToString(errno); + MS_LOG(WARNING) << "Open file " << file_path_value << " failed." 
<< ErrnoToString(errno); return false; } if (first_time_opening) { @@ -63,7 +63,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) { file_.flush(); file_path_str_ = path; } - MS_LOG(INFO) << "Opened file: " << path; + MS_LOG(INFO) << "Opened file: " << file_path_value; return true; } @@ -93,9 +93,9 @@ TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op size_t tensor_loader_slot) : op_type_{op_type}, op_name_{op_name}, - task_id_{task_id}, - stream_id_{stream_id}, - timestamp_{timestamp}, + task_id_{std::to_string(task_id)}, + stream_id_{std::to_string(stream_id)}, + timestamp_{std::to_string(timestamp)}, slot_{slot}, tensor_loader_slot_{tensor_loader_slot} { if (input) { @@ -105,6 +105,22 @@ TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op } } +TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op_name, const std::string &task_id, + const std::string &stream_id, const std::string &timestamp, const std::string &io, + size_t slot, size_t tensor_loader_slot) + : op_type_{op_type}, + op_name_{op_name}, + task_id_{task_id}, + stream_id_{stream_id}, + timestamp_{timestamp}, + io_{io}, + slot_{slot}, + tensor_loader_slot_{tensor_loader_slot} { + if (io_ != kInput && io_ != kOutput) { + MS_LOG(EXCEPTION) << "Cannot instantiate TensorStatDump, io needs to be either " << kInput << " or " << kOutput; + } +} + bool TensorStatDump::OpenStatisticsFile(const std::string &dump_path) { std::string filename = dump_path + "/" + kCsvFileName; // try to open file @@ -125,16 +141,24 @@ bool TensorStatDump::OpenStatisticsFile(const std::string &dump_path) { bool TensorStatDump::DumpTensorStatsToFile(const std::string &original_kernel_name, const std::string &dump_path, const Debugger *debugger) { - if (!OpenStatisticsFile(dump_path)) { - return false; - } - // get tensor statistics using debugger + // get tensor data using debugger std::string tensor_loader_name = 
original_kernel_name + ":" + std::to_string(tensor_loader_slot_); std::shared_ptr data = debugger->GetTensor(tensor_loader_name); if (data == nullptr) { MS_LOG(WARNING) << "Failed to find " << tensor_loader_name << " in tensor loader, skipping current statistics"; return false; } + return DumpTensorStatsToFile(dump_path, data); +} + +bool TensorStatDump::DumpTensorStatsToFile(const std::string &dump_path, std::shared_ptr data) { + if (data == nullptr) { + MS_LOG(WARNING) << "Tensor data is empty, skipping current statistics"; + return false; + } + if (!OpenStatisticsFile(dump_path)) { + return false; + } const DebugServices::TensorStat &stat = DebugServices::GetTensorStatistics(data); // write tensor statistics to csv file std::ostringstream shape; diff --git a/mindspore/ccsrc/debug/data_dump/tensor_stat_dump.h b/mindspore/ccsrc/debug/data_dump/tensor_stat_dump.h index 4ad265e86fc..b42d23cf578 100644 --- a/mindspore/ccsrc/debug/data_dump/tensor_stat_dump.h +++ b/mindspore/ccsrc/debug/data_dump/tensor_stat_dump.h @@ -17,6 +17,7 @@ #ifndef MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_ #define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_ +#include #include #include @@ -24,6 +25,7 @@ namespace mindspore { class Debugger; +class TensorData; class CsvWriter { public: static CsvWriter &GetInstance() { @@ -31,13 +33,6 @@ class CsvWriter { return instance; } - private: - const std::string kSeparator = ","; - const std::string kEndLine = "\n"; - std::ofstream file_; - std::string file_path_str_ = ""; - - public: CsvWriter() = default; ~CsvWriter(); DISABLE_COPY_AND_ASSIGN(CsvWriter) @@ -45,28 +40,39 @@ class CsvWriter { void CloseFile(); template void WriteToCsv(const T &val, bool end_line = false); + + private: + const std::string kSeparator = ","; + const std::string kEndLine = "\n"; + std::ofstream file_; + std::string file_path_str_ = ""; }; class TensorStatDump { - static const char CSV_HEADER[]; - static const char CSV_FILE_NAME[]; - - 
const std::string &op_type_; - const std::string &op_name_; - uint32_t task_id_; - uint32_t stream_id_; - uint64_t timestamp_; - std::string io_; - size_t slot_; - size_t tensor_loader_slot_; - public: static bool OpenStatisticsFile(const std::string &dump_path); TensorStatDump(const std::string &op_type, const std::string &op_name, uint32_t task_id, uint32_t stream_id, uint64_t timestamp, bool input, size_t slot, size_t tensor_loader_slot_); + TensorStatDump(const std::string &op_type, const std::string &op_name, const std::string &task_id, + const std::string &stream_id, const std::string &timestamp, const std::string &io, size_t slot, + size_t tensor_loader_slot); + bool DumpTensorStatsToFile(const std::string &dump_path, std::shared_ptr data); bool DumpTensorStatsToFile(const std::string &original_kernel_name, const std::string &dump_path, const Debugger *debugger); + + private: + static const char CSV_HEADER[]; + static const char CSV_FILE_NAME[]; + + const std::string op_type_; + const std::string op_name_; + const std::string task_id_; + const std::string stream_id_; + const std::string timestamp_; + std::string io_; + size_t slot_; + size_t tensor_loader_slot_; }; } // namespace mindspore #endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_ diff --git a/tests/st/dump/dump_test_utils.py b/tests/st/dump/dump_test_utils.py index de317e24ce4..8d64b4b64f4 100644 --- a/tests/st/dump/dump_test_utils.py +++ b/tests/st/dump/dump_test_utils.py @@ -158,11 +158,15 @@ def generate_statistic_dump_json(dump_path, json_file_name, test_key, saved_data """ if test_key == "test_gpu_e2e_dump": data = e2e_dump_dict - data["common_dump_settings"]["path"] = dump_path - data["common_dump_settings"]["saved_data"] = saved_data + elif test_key == "test_async_dump": + data = async_dump_dict + data["common_dump_settings"]["input_output"] = 0 + data["common_dump_settings"]["file_format"] = "npy" else: raise ValueError( "Failed to generate statistic dump json file. 
The test name value " + test_key + " is invalid.") + data["common_dump_settings"]["path"] = dump_path + data["common_dump_settings"]["saved_data"] = saved_data with open(json_file_name, 'w') as f: json.dump(data, f) diff --git a/tests/st/dump/test_data_dump.py b/tests/st/dump/test_data_dump.py index 149afb56ba7..205ee114087 100644 --- a/tests/st/dump/test_data_dump.py +++ b/tests/st/dump/test_data_dump.py @@ -409,36 +409,37 @@ def check_statistic_dump(dump_file_path): real_path = os.path.realpath(output_path) with open(real_path) as f: reader = csv.DictReader(f) - input1 = next(reader) + stats = list(reader) + input1 = stats[0] assert input1['IO'] == 'input' assert input1['Min Value'] == '1' assert input1['Max Value'] == '6' - input2 = next(reader) + input2 = stats[1] assert input2['IO'] == 'input' assert input2['Min Value'] == '7' assert input2['Max Value'] == '12' - output = next(reader) + output = stats[2] assert output['IO'] == 'output' assert output['Min Value'] == '8' assert output['Max Value'] == '18' def check_data_dump(dump_file_path): - output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy" + output_name = "Add.Add-op*.output.0.*.npy" output_path = glob.glob(os.path.join(dump_file_path, output_name))[0] real_path = os.path.realpath(output_path) output = np.load(real_path) expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32) assert np.array_equal(output, expect) -def run_gpu_e2e_dump(saved_data): - """Run gpu e2e dump""" +def run_saved_data_dump_test(scenario, saved_data): + """Run e2e dump on scenario, testing statistic dump""" if sys.platform != 'linux': return pwd = os.getcwd() with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir: - dump_path = os.path.join(tmp_dir, 'gpu_e2e_dump') - dump_config_path = os.path.join(tmp_dir, 'gpu_e2e_dump.json') - generate_statistic_dump_json(dump_path, dump_config_path, 'test_gpu_e2e_dump', saved_data) + dump_path = os.path.join(tmp_dir, 'test_saved_data') + dump_config_path = os.path.join(tmp_dir, 
'test_saved_data.json') + generate_statistic_dump_json(dump_path, dump_config_path, scenario, saved_data) os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0') if os.path.isdir(dump_path): @@ -473,7 +474,7 @@ def test_gpu_e2e_statistic_dump(): Expectation: Statistics are stored in statistic.csv files """ context.set_context(mode=context.GRAPH_MODE, device_target="GPU") - run_gpu_e2e_dump('statistic') + run_saved_data_dump_test('test_gpu_e2e_dump', 'statistic') @pytest.mark.level0 @pytest.mark.platform_x86_gpu_training @@ -486,7 +487,7 @@ def test_gpu_e2e_tensor_dump(): Expectation: Tensor data are stored in npy files """ context.set_context(mode=context.GRAPH_MODE, device_target="GPU") - run_gpu_e2e_dump('tensor') + run_saved_data_dump_test('test_gpu_e2e_dump', 'tensor') @pytest.mark.level0 @pytest.mark.platform_x86_gpu_training @@ -499,4 +500,46 @@ def test_gpu_e2e_full_dump(): Expectation: Tensor are stored in npy files and their statistics stored in statistic.csv """ context.set_context(mode=context.GRAPH_MODE, device_target="GPU") - run_gpu_e2e_dump('full') + run_saved_data_dump_test('test_gpu_e2e_dump', 'full') + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +@security_off_wrap +def test_ascend_statistic_dump(): + """ + Feature: Ascend Statistics Dump + Description: Test Ascend statistics dump + Expectation: Statistics are stored in statistic.csv files + """ + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") + run_saved_data_dump_test('test_async_dump', 'statistic') + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +@security_off_wrap +def test_ascend_tensor_dump(): + """ + Feature: Ascend Tensor Dump + Description: Test Ascend tensor dump + Expectation: Tensors are stored in npy files + """ + 
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") + run_saved_data_dump_test('test_async_dump', 'tensor') + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +@security_off_wrap +def test_ascend_full_dump(): + """ + Feature: Ascend Full Dump + Description: Test Ascend full dump + Expectation: Tensors are stored in npy files and their statistics stored in statistic.csv + """ + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") + run_saved_data_dump_test('test_async_dump', 'full')