Fix incomplete watchpoint hits in the offline debugger for multi-rank data

This commit is contained in:
maning202007 2022-12-16 14:44:34 +08:00
parent 2928e3289f
commit 558bbe738a
1 changed file with 5 additions and 2 deletions

View File

@@ -1643,6 +1643,7 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
uint32_t rank_id = std::get<0>(rank_and_graph);
uint32_t root_graph_id = std::get<1>(rank_and_graph);
MS_LOG(INFO) << "Get tensor files for rank_id: " << rank_id << ", root_graph_id: " << root_graph_id;
std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
std::to_string(root_graph_id) + "/" + IterationString(iteration);
std::string real_dump_dir = RealPath(specific_dump_dir);
@@ -1672,7 +1673,8 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
if (is_sync_mode_) {
// search files in dir for the one that meets the filename prefix and read the file into memory
NPYFilePool npy_files = PreProcessDumpDirSync(real_dump_dir);
*processed_npy_files = ProcessNPYFilePool(npy_files);
auto processed_npy_files_in_rank = ProcessNPYFilePool(npy_files);
processed_npy_files->insert(processed_npy_files_in_rank.begin(), processed_npy_files_in_rank.end());
ProcessTensorDataSync(proto_to_dump, real_dump_dir, *processed_npy_files, iteration, rank_id, root_graph_id,
&tensor_list, error_on_no_value);
} else {
@@ -1680,7 +1682,8 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
// convert all files in proto_to_dump to npy and add to pool of async file names
NPYFilePool async_file_pool;
ConvertWatchPointNodes(std::get<1>(preprocess_async_result), proto_to_dump, real_dump_dir, &async_file_pool);
*processed_npy_files = ProcessNPYFilePool(async_file_pool);
auto processed_npy_files_in_rank = ProcessNPYFilePool(async_file_pool);
processed_npy_files->insert(processed_npy_files_in_rank.begin(), processed_npy_files_in_rank.end());
GetTensorDataInfoAsync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, *processed_npy_files,
&tensor_list);
}