forked from mindspore-Ecosystem/mindspore
!30573 Catch abnormals from sto functions in debug
Merge pull request !30573 from maning202007/master
This commit is contained in:
commit
330cacf905
|
@ -20,6 +20,7 @@ set(_OFFLINE_SRC_LIST
|
|||
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/offline_debug/dbg_services.cc"
|
||||
"${CMAKE_SOURCE_DIR}/mindspore/core/utils/log_adapter.cc"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/offline_debug/mi_pybind_register.cc"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/utils.cc"
|
||||
)
|
||||
|
||||
if(ENABLE_DUMP_IR)
|
||||
|
@ -43,6 +44,7 @@ if(ENABLE_DEBUGGER)
|
|||
"${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/debugger_utils.cc"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/data_dump/tensor_stat_dump.cc"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/utils.cc"
|
||||
)
|
||||
endif()
|
||||
|
||||
|
@ -52,6 +54,7 @@ if(NOT ENABLE_SECURITY)
|
|||
"${CMAKE_CURRENT_SOURCE_DIR}/data_dump/dump_json_parser.cc"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/data_dump/dump_utils.cc"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/data_dump/npy_header.cc"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/utils.cc"
|
||||
)
|
||||
if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
|
||||
list(APPEND _DEBUG_SRC_LIST
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include <fstream>
|
||||
#include "utils/log_adapter.h"
|
||||
#include "debug/common.h"
|
||||
#include "debug/utils.h"
|
||||
#include "utils/ms_context.h"
|
||||
#include "utils/convert_utils_base.h"
|
||||
#include "backend/common/session/anf_runtime_algorithm.h"
|
||||
|
@ -466,7 +467,13 @@ bool IsIterInRange(uint32_t iteration, const std::string &range) {
|
|||
std::size_t range_idx = range.find(dash);
|
||||
// no dash in range, compare the value directly
|
||||
if (range_idx == std::string::npos) {
|
||||
return iteration == std::stoul(range);
|
||||
size_t range_d = 0;
|
||||
if (!CheckStoul(&range_d, range)) {
|
||||
MS_LOG(INFO) << "Failed to convert the single step range: " << range
|
||||
<< " into an integer, so the iteration: " << iteration << " is regarded as not in dump range.";
|
||||
return false;
|
||||
}
|
||||
return iteration == range_d;
|
||||
}
|
||||
// make sure there is only one dash in range
|
||||
if (range.find(dash, range_idx + 1) != std::string::npos) {
|
||||
|
@ -477,8 +484,18 @@ bool IsIterInRange(uint32_t iteration, const std::string &range) {
|
|||
if (low_range_str.empty() || high_range_str.empty()) {
|
||||
return false;
|
||||
}
|
||||
uint32_t low_range = static_cast<uint32_t>(std::stoul(low_range_str));
|
||||
uint32_t high_range = static_cast<uint32_t>(std::stoul(high_range_str));
|
||||
size_t low_range = 0;
|
||||
if (!CheckStoul(&low_range, low_range_str)) {
|
||||
MS_LOG(INFO) << "Failed to convert the low_range_str: " << low_range_str
|
||||
<< " into an integer, so the iteration: " << iteration << " is regarded as not in dump range.";
|
||||
return false;
|
||||
}
|
||||
size_t high_range = 0;
|
||||
if (!CheckStoul(&high_range, high_range_str)) {
|
||||
MS_LOG(INFO) << "Failed to convert the high_range_str: " << high_range_str
|
||||
<< " into an integer, so the iteration: " << iteration << " is regarded as not in dump range.";
|
||||
return false;
|
||||
}
|
||||
return (low_range <= iteration) && (iteration <= high_range);
|
||||
}
|
||||
|
||||
|
|
|
@ -35,13 +35,13 @@
|
|||
#include "backend/common/session/anf_runtime_algorithm.h"
|
||||
#include "include/common/utils/anfalgo.h"
|
||||
#endif
|
||||
#include "debug/utils.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "debug/debugger/tensor_summary.h"
|
||||
#include "utils/file_utils.h"
|
||||
#include "climits"
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
|
||||
namespace mindspore {
|
||||
#endif
|
||||
|
||||
static constexpr const char *constant_prefix = "Default--data-";
|
||||
static constexpr const char *kNpyExt = ".npy";
|
||||
|
@ -91,8 +91,8 @@ DebugServices &DebugServices::operator=(const DebugServices &other) {
|
|||
* watchpoint_table.
|
||||
*/
|
||||
void DebugServices::AddWatchpoint(
|
||||
unsigned int id, int watch_condition, float parameter,
|
||||
const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> ¶meter_list,
|
||||
int id, int watch_condition, float parameter, const std::vector<std::tuple<std::string, bool>> &check_node_list,
|
||||
const std::vector<parameter_t> ¶meter_list,
|
||||
const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
|
||||
const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
|
||||
std::lock_guard<std::mutex> lg(lock_);
|
||||
|
@ -733,7 +733,13 @@ void DebugServices::SortWatchpointsInfo(
|
|||
std::vector<int32_t>().swap((*chunk_error_codes)[i]);
|
||||
std::vector<unsigned int>().swap((*chunk_device_id)[i]);
|
||||
std::vector<unsigned int>().swap((*chunk_root_graph_id)[i]);
|
||||
(*tensor_list_byte_size) += (*chunk_tensor_byte_size)[i];
|
||||
if ((*tensor_list_byte_size) > ULONG_LONG_MAX - (*chunk_tensor_byte_size)[i]) {
|
||||
MS_LOG(WARNING) << (*tensor_list_byte_size) << " + " << (*chunk_tensor_byte_size)[i]
|
||||
<< " would lead to integer overflow!";
|
||||
(*tensor_list_byte_size) = ULONG_LONG_MAX;
|
||||
} else {
|
||||
(*tensor_list_byte_size) += (*chunk_tensor_byte_size)[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -801,9 +807,20 @@ void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std:
|
|||
std::stringstream check_shape(shape_str);
|
||||
MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
|
||||
while (getline(check_shape, intermediate, ',')) {
|
||||
shape->push_back(std::stoi(intermediate));
|
||||
int64_t shape_d = 0;
|
||||
if (!CheckStoi(&shape_d, intermediate)) {
|
||||
MS_LOG(INFO) << "Failed to get the shape from file: " << file_name << ", error in convert the string "
|
||||
<< intermediate << " into an integer.";
|
||||
return;
|
||||
}
|
||||
shape->push_back(shape_d);
|
||||
}
|
||||
std::size_t word_size = 0;
|
||||
if (!CheckStoul(&word_size, std::string(1, (*tensor_type)[1]))) {
|
||||
MS_LOG(INFO) << "Failed to get the word_size from file: " << file_name << ", error in convert the string "
|
||||
<< (*tensor_type)[1] << " into an integer.";
|
||||
return;
|
||||
}
|
||||
std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
|
||||
std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
|
||||
std::size_t data_size = data_len * word_size;
|
||||
if (!data_size) {
|
||||
|
@ -881,25 +898,26 @@ void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &f
|
|||
std::string real_dump_iter_dir = RealPath(dump_key);
|
||||
DIR *d_handle = opendir(real_dump_iter_dir.c_str());
|
||||
if (d_handle == nullptr) {
|
||||
MS_LOG(INFO) << "Directory does not exist in ConvertToHostFormat.";
|
||||
MS_LOG(INFO) << "Directory " << real_dump_iter_dir << " does not exist in ConvertToHostFormat.";
|
||||
return;
|
||||
}
|
||||
struct dirent *dir = nullptr;
|
||||
while ((dir = readdir(d_handle)) != nullptr) {
|
||||
std::string name = real_dump_iter_dir + std::string("/") + std::string(dir->d_name);
|
||||
if (IsRegFile(name)) {
|
||||
std::string candidate = dir->d_name;
|
||||
for (const std::string &file_to_find : files_after_convert_in_dir) {
|
||||
std::string file_n = file_to_find;
|
||||
auto last_slash_pos = file_to_find.find_last_of("\\/");
|
||||
if (last_slash_pos != std::string::npos) {
|
||||
file_n = file_to_find.substr(last_slash_pos + 1);
|
||||
}
|
||||
if (candidate.find(file_n + ".") != std::string::npos && candidate.rfind(kNpyExt) != std::string::npos) {
|
||||
// we found a converted file for this op
|
||||
std::string found_file = dump_key + "/" + candidate;
|
||||
(void)result_list->insert(found_file);
|
||||
}
|
||||
if (!IsRegFile(name)) {
|
||||
continue;
|
||||
}
|
||||
std::string candidate = dir->d_name;
|
||||
for (const std::string &file_to_find : files_after_convert_in_dir) {
|
||||
std::string file_n = file_to_find;
|
||||
auto last_slash_pos = file_to_find.find_last_of("\\/");
|
||||
if (last_slash_pos != std::string::npos) {
|
||||
file_n = file_to_find.substr(last_slash_pos + 1);
|
||||
}
|
||||
if (candidate.find(file_n + ".") != std::string::npos && candidate.rfind(kNpyExt) != std::string::npos) {
|
||||
// we found a converted file for this op
|
||||
std::string found_file = dump_key + "/" + candidate;
|
||||
(void)result_list->insert(found_file);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1047,8 +1065,14 @@ void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::str
|
|||
|
||||
if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos &&
|
||||
found_out != std::string::npos) {
|
||||
slot_list.push_back(
|
||||
std::stoul(file_name_to_check.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
|
||||
std::string slot_str = file_name_to_check.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1);
|
||||
size_t slot = 0;
|
||||
if (!CheckStoul(&slot, slot_str)) {
|
||||
MS_LOG(INFO) << "Failed to get the slot_id from file_name: " << file_name << ", error in convert the string "
|
||||
<< slot_str << " into an integer.";
|
||||
continue;
|
||||
}
|
||||
slot_list.push_back(slot);
|
||||
}
|
||||
}
|
||||
for (auto slot : slot_list) {
|
||||
|
@ -1632,35 +1656,40 @@ void DebugServices::ProcessTensorDataSync(const std::vector<std::tuple<std::stri
|
|||
DIR *d = opendir(specific_dump_dir.c_str());
|
||||
if (d == nullptr) {
|
||||
MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ProcessTensorDataSync.";
|
||||
} else {
|
||||
struct dirent *dir = nullptr;
|
||||
while ((dir = readdir(d)) != nullptr) {
|
||||
std::string file_name = dir->d_name;
|
||||
std::string file_path = specific_dump_dir + std::string("/") + file_name;
|
||||
if (IsRegFile(file_path)) {
|
||||
for (auto &node : proto_to_dump) {
|
||||
std::string dump_name = std::get<1>(node);
|
||||
std::string stripped_file_name = GetStrippedFilename(file_name);
|
||||
if (stripped_file_name.empty() || stripped_file_name.length() <= dump_name.length()) {
|
||||
return;
|
||||
}
|
||||
struct dirent *dir = nullptr;
|
||||
while ((dir = readdir(d)) != nullptr) {
|
||||
std::string file_name = dir->d_name;
|
||||
std::string file_path = specific_dump_dir + std::string("/") + file_name;
|
||||
if (IsRegFile(file_path)) {
|
||||
for (auto &node : proto_to_dump) {
|
||||
std::string dump_name = std::get<1>(node);
|
||||
std::string stripped_file_name = GetStrippedFilename(file_name);
|
||||
if (stripped_file_name.empty() || stripped_file_name.length() <= dump_name.length()) {
|
||||
continue;
|
||||
}
|
||||
std::size_t found = stripped_file_name.rfind(dump_name + ".", 0);
|
||||
if (found == 0) {
|
||||
size_t slot = 0;
|
||||
if (!CheckStoul(&slot, stripped_file_name.substr(dump_name.length() + 1))) {
|
||||
MS_LOG(INFO) << "Failed to get the slot from file_name: " << file_name << ", error in convert the string "
|
||||
<< stripped_file_name.substr(dump_name.length() + 1) << " into an integer.";
|
||||
continue;
|
||||
}
|
||||
std::size_t found = stripped_file_name.rfind(dump_name + ".", 0);
|
||||
if (found == 0) {
|
||||
size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1));
|
||||
std::vector<int64_t> shape;
|
||||
std::string orig_name = std::get<0>(node);
|
||||
std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
|
||||
bool output_flag = (output_str == "output");
|
||||
std::vector<int64_t> shape;
|
||||
std::string orig_name = std::get<0>(node);
|
||||
std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
|
||||
bool output_flag = (output_str == "output");
|
||||
|
||||
AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape,
|
||||
nullptr, tensor_list);
|
||||
break;
|
||||
}
|
||||
AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape, nullptr,
|
||||
tensor_list);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
(void)closedir(d);
|
||||
}
|
||||
(void)closedir(d);
|
||||
}
|
||||
|
||||
std::string DebugServices::IterationString(unsigned int iteration) {
|
||||
|
@ -2019,8 +2048,16 @@ bool DebugServices::GetTaskIdStreamId(std::string file_name, std::string overflo
|
|||
|
||||
std::string task_id_str = file_name.substr(task_pos_start, task_pos_end - task_pos_start);
|
||||
std::string stream_id_str = file_name.substr(stream_pos_start, stream_pos_end - stream_pos_start);
|
||||
*task_id = std::stoull(task_id_str);
|
||||
*stream_id = std::stoull(stream_id_str);
|
||||
if (!CheckStoull(task_id, task_id_str)) {
|
||||
MS_LOG(INFO) << "Failed to get the task_id from file_name: " << file_name << ", error in convert the string "
|
||||
<< task_id_str << " into an integer.";
|
||||
return false;
|
||||
}
|
||||
if (!CheckStoull(stream_id, stream_id_str)) {
|
||||
MS_LOG(INFO) << "Failed to get the stream_id from file_name: " << file_name << ", error in convert the string "
|
||||
<< stream_id_str << " into an integer.";
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -2064,13 +2101,9 @@ bool DebugServices::GetAttrsFromFilename(const std::string &file_name, std::stri
|
|||
// get task id
|
||||
if (second_dot < third_dot) {
|
||||
std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
|
||||
try {
|
||||
*task_id = std::stoull(extracted_task_id);
|
||||
} catch (std::invalid_argument &e) {
|
||||
MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, invalid argument.";
|
||||
return false;
|
||||
} catch (std::out_of_range &e) {
|
||||
MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, out of range.";
|
||||
if (!CheckStoull(task_id, extracted_task_id)) {
|
||||
MS_LOG(INFO) << "Failed to get the task_id from file_name: " << file_name << ", error in convert the string "
|
||||
<< extracted_task_id << " into an integer.";
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
|
@ -2080,13 +2113,9 @@ bool DebugServices::GetAttrsFromFilename(const std::string &file_name, std::stri
|
|||
// get stream id
|
||||
if (third_dot < fourth_dot) {
|
||||
std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
|
||||
try {
|
||||
*stream_id = std::stoull(extracted_stream_id);
|
||||
} catch (std::invalid_argument &e) {
|
||||
MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, invalid argument.";
|
||||
return false;
|
||||
} catch (std::out_of_range &e) {
|
||||
MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, out of range.";
|
||||
if (!CheckStoull(stream_id, extracted_stream_id)) {
|
||||
MS_LOG(INFO) << "Failed to get the stream_id from file_name: " << file_name << ", error in convert the string "
|
||||
<< extracted_stream_id << " into an integer.";
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
|
@ -2169,6 +2198,4 @@ bool DebugServices::GetSyncMode() { return is_sync_mode_; }
|
|||
|
||||
void DebugServices::SetMemLimit(uint64_t max_mem_size) { tensor_loader_->SetMemTotal(max_mem_size); }
|
||||
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
} // namespace mindspore
|
||||
#endif
|
||||
|
|
|
@ -40,9 +40,7 @@
|
|||
#include "debug/tensor_load.h"
|
||||
#include "debug/tensor_data.h"
|
||||
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
namespace mindspore {
|
||||
#endif
|
||||
class DebugServices {
|
||||
public:
|
||||
DebugServices();
|
||||
|
@ -242,8 +240,8 @@ class DebugServices {
|
|||
static TensorStat GetTensorStatistics(const std::shared_ptr<TensorData> &tensor);
|
||||
|
||||
void AddWatchpoint(
|
||||
unsigned int id, int watch_condition, float parameter,
|
||||
const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> ¶meter_list,
|
||||
int id, int watch_condition, float parameter, const std::vector<std::tuple<std::string, bool>> &check_node_list,
|
||||
const std::vector<parameter_t> ¶meter_list,
|
||||
const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list = nullptr,
|
||||
const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list = nullptr);
|
||||
|
||||
|
@ -496,8 +494,6 @@ class DebugServices {
|
|||
|
||||
std::shared_ptr<TensorLoader> tensor_loader_;
|
||||
};
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
} // namespace mindspore
|
||||
#endif
|
||||
|
||||
#endif // MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
|
||||
|
|
|
@ -161,13 +161,13 @@ message Statistics {
|
|||
float max_value = 2;
|
||||
float min_value = 3;
|
||||
float avg_value = 4;
|
||||
int32 count = 5;
|
||||
int32 neg_zero_count = 6;
|
||||
int32 pos_zero_count = 7;
|
||||
int32 nan_count = 8;
|
||||
int32 neg_inf_count = 9;
|
||||
int32 pos_inf_count = 10;
|
||||
int32 zero_count = 11;
|
||||
uint64 count = 5;
|
||||
uint64 neg_zero_count = 6;
|
||||
uint64 pos_zero_count = 7;
|
||||
uint64 nan_count = 8;
|
||||
uint64 neg_inf_count = 9;
|
||||
uint64 pos_inf_count = 10;
|
||||
uint64 zero_count = 11;
|
||||
}
|
||||
|
||||
message TensorBase{
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include "debug/utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
DbgServices::DbgServices() { debug_services_ = std::make_shared<DebugServices>(); }
|
||||
|
@ -77,7 +78,7 @@ int32_t DbgServices::Initialize(const std::string net_name, const std::string du
|
|||
}
|
||||
|
||||
int32_t DbgServices::AddWatchpoint(
|
||||
unsigned int id, int watch_condition,
|
||||
int id, int watch_condition,
|
||||
std::map<std::string, std::map<std::string, std::variant<bool, std::vector<std::string>>>> check_nodes,
|
||||
std::vector<parameter_t> parameter_list) {
|
||||
MS_EXCEPTION_IF_NULL(debug_services_);
|
||||
|
@ -94,9 +95,14 @@ int32_t DbgServices::AddWatchpoint(
|
|||
|
||||
std::vector<std::string> rank_id_str = std::get<std::vector<std::string>>(attr_map["rank_id"]);
|
||||
std::vector<std::uint32_t> rank_id;
|
||||
(void)std::transform(
|
||||
rank_id_str.begin(), rank_id_str.end(), std::back_inserter(rank_id),
|
||||
[](const std::string &id_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(id_str)); });
|
||||
(void)std::transform(rank_id_str.begin(), rank_id_str.end(), std::back_inserter(rank_id),
|
||||
[](const std::string &id_str) -> std::uint32_t {
|
||||
size_t id_inter = 0;
|
||||
if (!CheckStoul(&id_inter, id_str)) {
|
||||
MS_LOG(EXCEPTION) << "Failed to extract rand_id!";
|
||||
}
|
||||
return static_cast<uint32_t>(id_inter);
|
||||
});
|
||||
MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint rank_id: ";
|
||||
for (auto const &i : rank_id) {
|
||||
MS_LOG(DEBUG) << i << " ";
|
||||
|
@ -104,9 +110,14 @@ int32_t DbgServices::AddWatchpoint(
|
|||
|
||||
std::vector<std::string> root_graph_id_str = std::get<std::vector<std::string>>(attr_map["root_graph_id"]);
|
||||
std::vector<std::uint32_t> root_graph_id;
|
||||
(void)std::transform(
|
||||
root_graph_id_str.begin(), root_graph_id_str.end(), std::back_inserter(root_graph_id),
|
||||
[](const std::string &graph_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(graph_str)); });
|
||||
(void)std::transform(root_graph_id_str.begin(), root_graph_id_str.end(), std::back_inserter(root_graph_id),
|
||||
[](const std::string &graph_str) -> std::uint32_t {
|
||||
size_t graph_inter = 0;
|
||||
if (!CheckStoul(&graph_inter, graph_str)) {
|
||||
MS_LOG(EXCEPTION) << "Failed to extract graph_id!";
|
||||
}
|
||||
return static_cast<uint32_t>(graph_inter);
|
||||
});
|
||||
MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint root_graph_id: ";
|
||||
for (auto const &j : root_graph_id) {
|
||||
MS_LOG(DEBUG) << j << " ";
|
||||
|
@ -139,7 +150,11 @@ int32_t DbgServices::AddWatchpoint(
|
|||
std::vector<std::uint32_t> rank_id;
|
||||
(void)std::transform(rank_id_str.begin(), rank_id_str.end(), std::back_inserter(rank_id),
|
||||
[](std::string &id_str) -> std::uint32_t {
|
||||
return static_cast<uint32_t>(std::stoul(id_str));
|
||||
size_t id_inter = 0;
|
||||
if (!CheckStoul(&id_inter, id_str)) {
|
||||
MS_LOG(EXCEPTION) << "Failed to extract rand_id!";
|
||||
}
|
||||
return static_cast<uint32_t>(id_inter);
|
||||
});
|
||||
return std::make_tuple(node.first, rank_id);
|
||||
});
|
||||
|
@ -150,9 +165,14 @@ int32_t DbgServices::AddWatchpoint(
|
|||
auto attr_map = node.second;
|
||||
std::vector<std::string> root_graph_id_str = std::get<std::vector<std::string>>(attr_map["root_graph_id"]);
|
||||
std::vector<std::uint32_t> root_graph_id;
|
||||
(void)std::transform(
|
||||
root_graph_id_str.begin(), root_graph_id_str.end(), std::back_inserter(root_graph_id),
|
||||
[](std::string &graph_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(graph_str)); });
|
||||
(void)std::transform(root_graph_id_str.begin(), root_graph_id_str.end(), std::back_inserter(root_graph_id),
|
||||
[](std::string &graph_str) -> std::uint32_t {
|
||||
size_t graph_inter = 0;
|
||||
if (!CheckStoul(&graph_inter, graph_str)) {
|
||||
MS_LOG(EXCEPTION) << "Failed to extract graph_id!";
|
||||
}
|
||||
return static_cast<uint32_t>(graph_inter);
|
||||
});
|
||||
return std::make_tuple(node.first, root_graph_id);
|
||||
});
|
||||
|
||||
|
@ -204,8 +224,12 @@ std::vector<watchpoint_hit_t> DbgServices::CheckWatchpoints(unsigned int iterati
|
|||
parameter_t api_parameter(p.name, p.disabled, p.value, p.hit, p.actual_value);
|
||||
api_parameter_vector.push_back(api_parameter);
|
||||
}
|
||||
watchpoint_hit_t hit(name[i], std::stoi(slot[i]), condition[i], watchpoint_id[i], api_parameter_vector,
|
||||
error_codes[i], rank_id[i], root_graph_id[i]);
|
||||
size_t slot_inter = 0;
|
||||
if (!CheckStoul(&slot_inter, slot[i])) {
|
||||
MS_LOG(EXCEPTION) << "Failed to extract slot_id!";
|
||||
}
|
||||
watchpoint_hit_t hit(name[i], static_cast<uint32_t>(slot_inter), condition[i], watchpoint_id[i],
|
||||
api_parameter_vector, error_codes[i], rank_id[i], root_graph_id[i]);
|
||||
|
||||
MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t name " << hit.name;
|
||||
MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t slot " << hit.slot;
|
||||
|
|
|
@ -196,7 +196,7 @@ class DbgServices {
|
|||
uint64_t max_mem_usage);
|
||||
|
||||
int32_t AddWatchpoint(
|
||||
unsigned int id, int watch_condition,
|
||||
int id, int watch_condition,
|
||||
std::map<std::string, std::map<std::string, std::variant<bool, std::vector<std::string>>>> check_nodes,
|
||||
std::vector<parameter_t> parameter_list);
|
||||
|
||||
|
|
|
@ -28,9 +28,7 @@
|
|||
#include "base/float16.h"
|
||||
#endif
|
||||
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
namespace mindspore {
|
||||
#endif
|
||||
using CONDITION_TYPE = DebugServices::CONDITION_TYPE;
|
||||
|
||||
RangeCountCalculator::RangeCountCalculator()
|
||||
|
@ -437,6 +435,4 @@ template class TensorSummary<float16>;
|
|||
template class TensorSummary<float>;
|
||||
template class TensorSummary<double>;
|
||||
template class TensorSummary<bool>;
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
} // namespace mindspore
|
||||
#endif
|
||||
|
|
|
@ -24,9 +24,7 @@
|
|||
#include "utils/hash_map.h"
|
||||
#include "debug/debug_services.h"
|
||||
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
namespace mindspore {
|
||||
#endif
|
||||
class RangeCountCalculator {
|
||||
public:
|
||||
RangeCountCalculator();
|
||||
|
@ -164,7 +162,5 @@ class TensorSummary : public ITensorSummary {
|
|||
void TensorStatisticsSingleThread();
|
||||
void InitCalculators(const std::vector<DebugServices::watchpoint_t> &);
|
||||
};
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
} // namespace mindspore
|
||||
#endif
|
||||
#endif // MINDSPORE_TENSOR_SUMMARY_H
|
||||
|
|
|
@ -25,10 +25,7 @@
|
|||
#include "ir/tensor.h"
|
||||
#endif
|
||||
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
namespace mindspore {
|
||||
#endif
|
||||
|
||||
namespace MsTypeId {
|
||||
typedef enum MsTypeId : unsigned int {
|
||||
kTypeUnknown = 0,
|
||||
|
@ -444,7 +441,5 @@ class TensorData {
|
|||
mindspore::tensor::TensorPtr tensor_ptr_{nullptr};
|
||||
#endif
|
||||
};
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
} // namespace mindspore
|
||||
#endif
|
||||
#endif // MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_
|
||||
|
|
|
@ -28,8 +28,8 @@
|
|||
#include "debug/tensor_data.h"
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
#include "debug/data_dump/dump_json_parser.h"
|
||||
namespace mindspore {
|
||||
#endif
|
||||
namespace mindspore {
|
||||
class TensorLoader {
|
||||
public:
|
||||
#ifndef __APPLE__
|
||||
|
@ -287,7 +287,5 @@ class TensorLoader {
|
|||
std::deque<std::string> cache_evict_queue_;
|
||||
std::condition_variable evict_cond;
|
||||
};
|
||||
#ifdef ONLINE_DBG_MODE
|
||||
} // namespace mindspore
|
||||
#endif
|
||||
#endif // MINDSPORE_CCSRC_DEBUG_TENSOR_LOAD_H_
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "debug/utils.h"
|
||||
#include "mindspore/core/utils/log_adapter.h"
|
||||
|
||||
namespace mindspore {
|
||||
bool CheckStoull(uint64_t *const output_digit, const std::string &input_str) {
|
||||
try {
|
||||
*output_digit = std::stoull(input_str);
|
||||
} catch (const std::out_of_range &oor) {
|
||||
MS_LOG(ERROR) << "Out of Range error: " << oor.what() << " when parse " << input_str;
|
||||
return false;
|
||||
} catch (const std::invalid_argument &ia) {
|
||||
MS_LOG(ERROR) << "Invalid argument: " << ia.what() << " when parse " << input_str;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool CheckStoul(size_t *const output_digit, const std::string &input_str) {
|
||||
try {
|
||||
*output_digit = std::stoul(input_str);
|
||||
} catch (const std::out_of_range &oor) {
|
||||
MS_LOG(ERROR) << "Out of Range error: " << oor.what() << " when parse " << input_str;
|
||||
return false;
|
||||
} catch (const std::invalid_argument &ia) {
|
||||
MS_LOG(ERROR) << "Invalid argument: " << ia.what() << " when parse " << input_str;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool CheckStoi(int64_t *const output_digit, const std::string &input_str) {
|
||||
try {
|
||||
*output_digit = std::stoi(input_str);
|
||||
} catch (const std::out_of_range &oor) {
|
||||
MS_LOG(ERROR) << "Out of Range error: " << oor.what() << " when parse " << input_str;
|
||||
return false;
|
||||
} catch (const std::invalid_argument &ia) {
|
||||
MS_LOG(ERROR) << "Invalid argument: " << ia.what() << " when parse " << input_str;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,29 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_UTILS_H
|
||||
#define MINDSPORE_UTILS_H
|
||||
|
||||
#include <string>
|
||||
|
||||
// Exception-free string-to-integer helpers used by the debugger code.
// Each helper writes the parsed value through output_digit (dereferenced without a null check)
// and reports failure through its return value.
namespace mindspore {
|
||||
// Converts input_str to an unsigned 64-bit value stored in *output_digit.
// Returns true on success. Returns false (after logging an error) when the conversion raises
// std::invalid_argument or std::out_of_range; *output_digit is not written in that case.
bool CheckStoull(uint64_t *const output_digit, const std::string &input_str);
|
||||
|
||||
// Converts input_str to an unsigned value stored in *output_digit as size_t.
// Returns true on success. Returns false (after logging an error) when the conversion raises
// std::invalid_argument or std::out_of_range; *output_digit is not written in that case.
bool CheckStoul(size_t *const output_digit, const std::string &input_str);
|
||||
|
||||
// Converts input_str to a signed value stored in *output_digit as int64_t.
// Returns true on success. Returns false (after logging an error) when the conversion raises
// std::invalid_argument or std::out_of_range; *output_digit is not written in that case.
bool CheckStoi(int64_t *const output_digit, const std::string &input_str);
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_UTILS_H
|
|
@ -112,6 +112,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
|
|||
# don't remove the 4 lines above
|
||||
"../../../mindspore/ccsrc/debug/data_dump/dump_json_parser.cc"
|
||||
"../../../mindspore/ccsrc/debug/common.cc"
|
||||
"../../../mindspore/ccsrc/debug/utils.cc"
|
||||
"../../../mindspore/ccsrc/plugin/device/ascend/hal/hccl_adapter/all_to_all_v_calc_param.cc"
|
||||
"../../../mindspore/ccsrc/runtime/device/kernel_runtime.cc"
|
||||
"../../../mindspore/ccsrc/runtime/device/memory_manager.cc"
|
||||
|
|
Loading…
Reference in New Issue