diff --git a/cmake/package.cmake b/cmake/package.cmake index fe2d7a829a1..cf99f6757db 100644 --- a/cmake/package.cmake +++ b/cmake/package.cmake @@ -63,6 +63,16 @@ install( COMPONENT mindspore ) +if(CMAKE_SYSTEM_NAME MATCHES "Windows") + message("offline debugger does not support windows system temporarily") +else() + install( + TARGETS _mindspore_offline_debug + DESTINATION ${INSTALL_BASE_DIR} + COMPONENT mindspore + ) +endif() + install( TARGETS mindspore_shared_lib DESTINATION ${INSTALL_LIB_DIR} @@ -317,6 +327,18 @@ if(EXISTS ${CMAKE_SOURCE_DIR}/mindspore/dataset) ) endif() +if(CMAKE_SYSTEM_NAME MATCHES "Windows") + message("offline debugger does not support windows system temporarily") +else() + if(EXISTS ${CMAKE_SOURCE_DIR}/mindspore/offline_debug) + install( + DIRECTORY ${CMAKE_SOURCE_DIR}/mindspore/offline_debug + DESTINATION ${INSTALL_PY_DIR} + COMPONENT mindspore + ) + endif() +endif() + ## Public header files install( DIRECTORY ${CMAKE_SOURCE_DIR}/include diff --git a/mindspore/ccsrc/debug/CMakeLists.txt b/mindspore/ccsrc/debug/CMakeLists.txt index 85ebcbdd8e0..38b5bf43a62 100644 --- a/mindspore/ccsrc/debug/CMakeLists.txt +++ b/mindspore/ccsrc/debug/CMakeLists.txt @@ -1,3 +1,6 @@ +include_directories(${CMAKE_SOURCE_DIR}/mindspore/ccsrc/debug/) +include_directories(${CMAKE_BINARY_DIR}) + set(_DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/anf_ir_dump.cc" "${CMAKE_CURRENT_SOURCE_DIR}/anf_ir_utils.cc" @@ -8,6 +11,14 @@ set(_DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/env_config_parser.cc" ) +set(_OFFLINE_SRC_LIST + "${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc" + "${CMAKE_CURRENT_SOURCE_DIR}/debugger/tensor_summary.cc" + "${CMAKE_CURRENT_SOURCE_DIR}/debugger/offline_debug/offline_logger.cc" + "${CMAKE_CURRENT_SOURCE_DIR}/debugger/offline_debug/dbg_services.cc" + "${CMAKE_CURRENT_SOURCE_DIR}/debugger/offline_debug/mi_pybind_register.cc" +) + if(ENABLE_DUMP_IR) file(GLOB_RECURSE _RDR_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "rdr/*.cc") if(NOT ENABLE_D) @@ -38,3 +49,13 @@ endif() set_property(SOURCE ${_DEBUG_SRC_LIST} ${_RDR_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEBUG) add_library(_mindspore_debug_obj OBJECT ${_DEBUG_SRC_LIST} ${_RDR_SRC_LIST}) +if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows") + add_compile_options(-Wall -DOFFLINE_DBG_MODE -fPIC -O2) + set_property(SOURCE ${_OFFLINE_SRC_LIST} PROPERTY COMPILE_DEFINITIONS + SUBMODULE_ID=mindspore::SubModuleId::SM_OFFLINE_DEBUG) + add_library(_mindspore_offline_debug SHARED ${_OFFLINE_SRC_LIST}) + set_target_properties(_mindspore_offline_debug PROPERTIES + PREFIX "${PYTHON_MODULE_PREFIX}" + SUFFIX "${PYTHON_MODULE_EXTENSION}" + ) +endif() diff --git a/mindspore/ccsrc/debug/debug_services.cc b/mindspore/ccsrc/debug/debug_services.cc index 0e4654e49a5..2b5c1a8588f 100644 --- a/mindspore/ccsrc/debug/debug_services.cc +++ b/mindspore/ccsrc/debug/debug_services.cc @@ -13,14 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
 */
+#include "debug/debug_services.h"
+#include <dirent.h>
+#include <fstream>
 #include <algorithm>
 #include <unordered_set>
+#include <sstream>
+#ifdef ONLINE_DBG_MODE
 #include "backend/session/anf_runtime_algorithm.h"
-#include "debug/debug_services.h"
+#endif
 #include "debug/debugger/tensor_summary.h"
-
+#ifdef ONLINE_DBG_MODE
 namespace mindspore {
-
+#endif
 DebugServices::DebugServices() {
   tensor_loader_ = new TensorLoader();
   uint32_t iter_num = -1;
@@ -42,9 +47,11 @@ DebugServices &DebugServices::operator=(const DebugServices &other) {
 
 DebugServices::~DebugServices() { delete tensor_loader_; }
 
-void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
-                                  const std::vector<std::tuple<std::string, bool>> &check_node_list,
-                                  const std::vector<parameter_t> &parameter_list) {
+void DebugServices::AddWatchpoint(
+  unsigned int id, unsigned int watch_condition, float parameter,
+  const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
+  const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
+  const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
   std::lock_guard<std::mutex> lg(lock_);
 
   watchpoint_t watchpoint_item;
@@ -52,6 +59,12 @@ void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition,
   watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
   watchpoint_item.condition.parameter = parameter;
   watchpoint_item.check_node_list = check_node_list;
+  if (check_node_device_list != nullptr) {
+    watchpoint_item.check_node_device_list = *check_node_device_list;
+  }
+  if (check_node_graph_list != nullptr) {
+    watchpoint_item.check_node_graph_list = *check_node_graph_list;
+  }
   watchpoint_item.parameter_list = parameter_list;
   watchpoint_table[id] = watchpoint_item;
 }
@@ -61,122 +74,170 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {
   watchpoint_table.erase(id);
 }
 
+std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor, void *previous_tensor_ptr,
+                                              uint32_t num_elements, int tensor_dtype) {
+  switch (tensor_dtype) {
+    case DbgDataType::DT_UINT8: {
+      return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
+    }
+    case DbgDataType::DT_INT8: {
+      return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
+    }
+    case DbgDataType::DT_UINT16: {
+      return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
+    }
+    case DbgDataType::DT_INT16: {
+      return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
+    }
+    case DbgDataType::DT_UINT32: {
+      return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
+    }
+    case DbgDataType::DT_INT32:
+    case DbgDataType::DT_BASE_INT: {
+      return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
+    }
+    case DbgDataType::DT_UINT64: {
+      return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
+    }
+    case DbgDataType::DT_INT64: {
+      return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
+    }
+    case DbgDataType::DT_FLOAT16: {
+      return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
+    }
+    case DbgDataType::DT_FLOAT32:
+    case DbgDataType::DT_BASE_FLOAT: {
+      return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
+    }
+    case DbgDataType::DT_FLOAT64: {
+      return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
+    }
+    case DbgDataType::DT_BOOL: {
+      return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
+    }
+    default:
+      MS_LOG(INFO) << "Unsupported tensor type";
+      // return a null pointer
+      return std::unique_ptr<TensorSummary<int32_t>>{};
+  }
+}
+
+#ifdef OFFLINE_DBG_MODE
+void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed) {
+  void *previous_tensor_ptr = nullptr;
+  std::shared_ptr<TensorData> tensor_prev;
+  if (previous_iter_tensor_needed && tensor->GetIteration() > 1) {
+    // read data in offline mode
+    std::vector<std::shared_ptr<TensorData>> result_list_prev;
+    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
+                     std::vector<unsigned int>{tensor->GetDeviceId()},
+                     std::vector<unsigned int>{tensor->GetIteration() - 1},
+                     std::vector<unsigned int>{tensor->GetRootGraphId()}, &result_list_prev);
+    tensor_prev = result_list_prev[0];
+    if (!tensor_prev->GetByteSize()) {
+      tensor_prev.reset();
+    } else {
+      previous_tensor_ptr = tensor_prev->GetDataPtr();
+    }
+  }
+  return previous_tensor_ptr;
+}
+#endif
+
+void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
+                                          const std::string &tensor_name, const std::string &tensor_name_no_slot,
+                                          bool *previous_iter_tensor_needed, std::string *qualified_tensor_name,
+                                          std::vector<watchpoint_t> *watchpoints_to_check) {
+  for (auto w_table_item : watchpoint_table) {
+    auto wp = std::get<1>(w_table_item);
+    // check ONLY init conditions on initial suspended state.
+    // skip other conditions on initial suspended state
+    if (init_dbg_suspend && (wp.condition.type != INIT)) continue;
+    // skip init condition if not init suspend
+    if ((wp.condition.type == INIT) && !init_dbg_suspend) continue;
+    // check change conditions only on step end.
+    if (wp.change_condition() && !step_end) continue;
+    // if recheck, ignore the cache results and reanalyze everything.
+    // if not a recheck, check only unanalyzed tensors
+    if (!recheck && wp_id_cache[tensor_name].count(wp.id)) continue;
+    std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
+    if (!found.empty()) {
+      *qualified_tensor_name = found;
+      watchpoints_to_check->push_back(w_table_item.second);
+#ifdef OFFLINE_DBG_MODE
+      if (wp.change_condition()) {
+        *previous_iter_tensor_needed = true;
+      }
+#endif
+    }
+  }
+}
+
+void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
+                                             const std::string &tensor_name) {
+  // add analyzed tensor to cache
+  if (!recheck) {
+    wp_id_cache[tensor_name].insert(id);
+  }
+}
+
 void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
                                      std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
                                      std::vector<std::vector<parameter_t>> *parameters,
                                      std::vector<int32_t> *error_codes, const std::vector<std::string> &op_overflows,
-                                     const std::vector<std::shared_ptr<TensorData>> &tensor_list,
-                                     const bool init_dbg_suspend, const bool step_end, const bool recheck) {
+                                     std::vector<std::shared_ptr<TensorData>> *tensor_list, const bool init_dbg_suspend,
+                                     const bool step_end, const bool recheck, std::vector<unsigned int> *device_id,
+                                     std::vector<unsigned int> *root_graph_id) {
   std::lock_guard<std::mutex> lg(lock_);
   if (watchpoint_table.empty()) return;
 
-  for (const auto &tensor : tensor_list) {
+  for (auto &tensor : *tensor_list) {
+#ifdef OFFLINE_DBG_MODE
+    // read data in offline mode
+    std::vector<std::shared_ptr<TensorData>> result_list;
+    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
+                     std::vector<unsigned int>{tensor->GetDeviceId()},
+                     std::vector<unsigned int>{tensor->GetIteration()},
+                     std::vector<unsigned int>{tensor->GetRootGraphId()}, &result_list);
+    tensor = result_list[0];
+    if (!tensor->GetByteSize()) {
+      tensor.reset();
+      continue;
+    }
+#endif
+
     const auto tensor_name = tensor->GetName();
     const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
     const auto tensor_slot = std::to_string(tensor->GetSlot());
-    mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor();
     // no elements to analyze
-    if (tensor_ptr->DataSize() == 0) continue;
-    int tensor_dtype = tensor_ptr->data_type_c();
+    if (tensor->GetByteSize() == 0) continue;
+    int tensor_dtype = tensor->GetType();
     std::vector<watchpoint_t> watchpoints_to_check;
     std::string qualified_tensor_name;
-    for (auto w_table_item : watchpoint_table) {
-      auto wp = std::get<1>(w_table_item);
-      // check ONLY init conditions on intial suspended state.
-      // skip other conditions on intial suspended state
-      if (init_dbg_suspend && (wp.condition.type != INIT)) continue;
-      // skip init condition if not init suspend
-      if ((wp.condition.type == INIT) && !init_dbg_suspend) continue;
-      // check change conditions only on step end.
-      if (wp.change_condition() && !step_end) continue;
-      // if recheck, ignore the cache results and reanalyze everything.
-      // if not a recheck, check only unanalyzed tensors
-      if (!recheck && wp_id_cache[tensor_name].count(wp.id)) continue;
-      std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
-      if (!found.empty()) {
-        qualified_tensor_name = found;
-        watchpoints_to_check.push_back(w_table_item.second);
-      }
-    }
+    bool previous_iter_tensor_needed = false;
+    // Add do nothing line in case offline debug is off, prevent unused var warning
+    (void)previous_iter_tensor_needed;
+    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor_name, tensor_name_no_slot,
+                          &previous_iter_tensor_needed, &qualified_tensor_name, &watchpoints_to_check);
     // no wp set on current tensor
     if (watchpoints_to_check.empty()) continue;
-    uint32_t num_elements = tensor_ptr->DataSize();
-    void *previous_tensor_ptr = tensor_loader_->GetPrevTensor(tensor_name)
-                                  ? tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()
-                                  : nullptr;
+    uint32_t num_elements = tensor->GetNumElements();
+
+#ifdef OFFLINE_DBG_MODE
+    void *previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed);
+#else
+    void *previous_tensor_ptr =
+      tensor_loader_->GetPrevTensor(tensor_name) ? tensor_loader_->GetPrevTensor(tensor_name)->GetDataPtr() : nullptr;
+#endif
+
     std::unique_ptr<ITensorSummary> base_summary_ptr;
     if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
-      switch (tensor_dtype) {
-        case kNumberTypeUInt8: {
-          base_summary_ptr =
-            std::make_unique<TensorSummary<uint8_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
-          break;
-        }
-        case kNumberTypeInt8: {
-          base_summary_ptr =
-            std::make_unique<TensorSummary<int8_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
-          break;
-        }
-        case kNumberTypeUInt16: {
-          base_summary_ptr =
-            std::make_unique<TensorSummary<uint16_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
-          break;
-        }
-        case kNumberTypeInt16: {
-          base_summary_ptr =
-            std::make_unique<TensorSummary<int16_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
-          break;
-        }
-        case kNumberTypeUInt32: {
-          base_summary_ptr =
-            std::make_unique<TensorSummary<uint32_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
-          break;
-        }
-        case kNumberTypeInt32:
-        case kNumberTypeInt: {
-          base_summary_ptr =
-            std::make_unique<TensorSummary<int32_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
-          break;
-        }
-        case kNumberTypeUInt64: {
-          base_summary_ptr =
-            std::make_unique<TensorSummary<uint64_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
-          break;
-        }
-        case kNumberTypeInt64: {
-          base_summary_ptr =
-            std::make_unique<TensorSummary<int64_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
-          break;
-        }
-        case kNumberTypeFloat16: {
-          base_summary_ptr =
-            std::make_unique<TensorSummary<float16>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
-          break;
-        }
-        case kNumberTypeFloat32:
-        case kNumberTypeFloat: {
-          base_summary_ptr =
-            std::make_unique<TensorSummary<float>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
-          break;
-        }
-        case kNumberTypeFloat64: {
-          base_summary_ptr =
-            std::make_unique<TensorSummary<double>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
-          break;
-        }
-        case kNumberTypeBool: {
-          base_summary_ptr =
-            std::make_unique<TensorSummary<bool>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
-          break;
-        }
-        default:
-          MS_LOG(INFO) << "Unsupported tensor type";
-          continue;
+      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, tensor_dtype);
+      if (base_summary_ptr != nullptr) {
+        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
       }
-      base_summary_ptr->SummarizeTensor(watchpoints_to_check);
     }
-
     for (auto &wp : watchpoints_to_check) {
       bool is_hit = false;
       int error_code = 0;
@@ -189,26 +250,439 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
         error_code = std::get<1>(item);
         parameter_list = std::get<2>(item);
       }
-      // add analyzed tensor to cache
-      if (!recheck) {
-        wp_id_cache[tensor_name].insert(wp.id);
-      }
+      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
       if (is_hit || error_code) {
         name->push_back(qualified_tensor_name);
         slot->push_back(tensor_slot);
         condition->push_back(wp.condition.type);
         watchpoint_id->push_back(wp.id);
+        if (device_id != nullptr) {
+          device_id->push_back(tensor->GetDeviceId());
+        }
+        if (root_graph_id != nullptr) {
+          root_graph_id->push_back(tensor->GetRootGraphId());
+        }
         parameters->push_back(parameter_list);
         error_codes->push_back(error_code);
       }
     }
+
+#ifdef OFFLINE_DBG_MODE
+    // in offline mode remove the need for the data
+    tensor.reset();
+#endif
   }
 }
 
+#ifdef OFFLINE_DBG_MODE
+void DebugServices::GetSlotInfo(const std::string &file_name, const std::string &dump_name,
+                                const std::string &specific_dump_dir, std::vector<size_t> *slot_list) {
+  if (is_sync_mode) {
+    // get the slot from the name
+    std::string delimiter = "_";
+    unsigned int start_pos = dump_name.length();
+    unsigned int end_pos = file_name.find(delimiter, start_pos);
+    std::string item = file_name.substr(start_pos, end_pos - start_pos);
+    slot_list->push_back(std::stoul(item));
+  } else {
+    std::string out_dir = "/tmp/" + file_name;
+    std::string input_file = specific_dump_dir + "/" + file_name;
+    std::string log_enabled = DbgLogger::verbose ? "" : "> /dev/null";
+    std::string convert_command =
+      "python /usr/local/Ascend/toolkit/tools/operator_cmp/compare/msaccucmp.pyc convert -d " + input_file + " -out " +
+      out_dir + " -t bin " + log_enabled;
+    (void)(system(convert_command.c_str()) + 1);
+    convert_command = "python /usr/local/Ascend/toolkit/tools/operator_cmp/compare/msaccucmp.pyc convert -d " +
+                      input_file + " -out " + out_dir + " -f NCHW -t bin " + log_enabled;
+    (void)(system(convert_command.c_str()) + 1);
+
+    std::string prefix_converted_dump_file_name = file_name + ".output.";
+    DIR *convert_dir_ptr = opendir(out_dir.c_str());
+    if (convert_dir_ptr != nullptr) {
+      struct dirent *convert_dir_contents = nullptr;
+      while ((convert_dir_contents = readdir(convert_dir_ptr)) != NULL) {
+        if (convert_dir_contents->d_type == DT_REG) {
+          std::string converted_file_name = convert_dir_contents->d_name;
+          std::size_t nd_file = converted_file_name.rfind(".ND.bin");
+          std::size_t fractal_z_file = converted_file_name.rfind(".FRACTAL_Z.bin");
+          std::size_t nchw_file = converted_file_name.rfind(".NCHW.bin");
+          if (nd_file == std::string::npos && nchw_file == std::string::npos && fractal_z_file == std::string::npos) {
+            continue;
+          }
+          std::size_t found_c = converted_file_name.find(prefix_converted_dump_file_name);
+          if (found_c != 0) {
+            continue;
+          }
+          std::size_t slot_start_pos = prefix_converted_dump_file_name.length();
+          std::size_t slot_end_pos = converted_file_name.find(".", slot_start_pos) - 1;
+          std::string slot_item = converted_file_name.substr(slot_start_pos, slot_end_pos - slot_start_pos + 1);
+          slot_list->push_back(std::stoul(slot_item));
+        }
+      }
+      closedir(convert_dir_ptr);
+    } else {
+      MS_LOG(INFO) << out_dir << " directory does not exist!";
+    }
+
+    // std::string delete_cmd = "rm -rf " + out_dir;
+    // system(delete_cmd.c_str());
+  }
+}
+
+std::size_t DebugServices::GetShapeTypeInfo(const std::string &specific_dump_dir, std::size_t slot,
+                                            const std::string &prefix_dump_file_name, std::string *file_name,
+                                            std::string *type_name, std::string *out_dir,
+                                            std::vector<int64_t> *shape) {
+  std::size_t found = 0;
+  if (is_sync_mode) {
+    found = file_name->rfind(prefix_dump_file_name, 0);
+  } else {
+    std::string file_name_w_o_prefix = file_name->substr(file_name->find('.') + 1);
+    found = file_name_w_o_prefix.rfind(prefix_dump_file_name, 0);
+  }
+  if (found != 0) {
+    return found;
+  }
+  if (is_sync_mode) {
+    // found a file, now get the shape and type
+    // find "_shape_" in the filename
+    std::string shape_delimiter = "_shape_";
+    unsigned int str_pos = file_name->find(shape_delimiter) + shape_delimiter.length();
+
+    // read numbers with '_' delimiter until you read a non-number, that will be the type name
+    bool number_found = true;
+    std::string delimiter = "_";
+    while (number_found) {
+      unsigned int end_pos = file_name->find(delimiter, str_pos);
+      std::string item = file_name->substr(str_pos, end_pos - str_pos);
+      bool is_number = !item.empty() && std::find_if(item.begin(), item.end(),
+                                                     [](unsigned char c) { return !std::isdigit(c); }) == item.end();
+
+      if (is_number) {
+        shape->push_back(std::stoul(item));
+        str_pos = end_pos + 1;
+      } else {
+        *type_name = item;
+        number_found = false;
+      }
+    }
+  } else {
+    *out_dir = "/tmp/" + *file_name;
+    std::string input_file = specific_dump_dir + "/" + *file_name;
+    std::string log_enabled = DbgLogger::verbose ? "" : "> /dev/null";
+    std::string convert_command =
+      "python /usr/local/Ascend/toolkit/tools/operator_cmp/compare/msaccucmp.pyc convert -d " + input_file + " -out " +
+      *out_dir + " -t bin " + log_enabled;
+    (void)(system(convert_command.c_str()) + 1);
+    convert_command = "python /usr/local/Ascend/toolkit/tools/operator_cmp/compare/msaccucmp.pyc convert -d " +
+                      input_file + " -out " + *out_dir + " -f NCHW -t bin " + log_enabled;
+    (void)(system(convert_command.c_str()) + 1);
+
+    std::string prefix_converted_dump_file_name = *file_name + ".output." + std::to_string(slot);
+    *file_name = "";
+    DIR *convert_dir_ptr = opendir(out_dir->c_str());
+    if (convert_dir_ptr != nullptr) {
+      struct dirent *convert_dir_contents = nullptr;
+      while ((convert_dir_contents = readdir(convert_dir_ptr)) != NULL) {
+        if (convert_dir_contents->d_type == DT_REG) {
+          std::string converted_file_name = convert_dir_contents->d_name;
+          std::size_t nd_file = converted_file_name.rfind(".ND.bin");
+          std::size_t fractal_z_file = converted_file_name.rfind(".FRACTAL_Z.bin");
+          std::size_t nchw_file = converted_file_name.rfind(".NCHW.bin");
+          if (nd_file == std::string::npos && nchw_file == std::string::npos && fractal_z_file == std::string::npos) {
+            continue;
+          }
+          std::size_t found_c = converted_file_name.rfind(prefix_converted_dump_file_name, 0);
+          if (found_c != 0) {
+            continue;
+          }
+          *file_name = converted_file_name;
+        }
+      }
+      closedir(convert_dir_ptr);
+    } else {
+      MS_LOG(INFO) << *out_dir << " directory does not exist!";
+    }
+
+    if (*file_name == "") {
+      MS_LOG(WARNING) << *out_dir << ": no valid files found post msaccucmp exec";
+      return 1;
+    }
+
+    // std::string delete_cmd = "rm -rf " + out_dir;
+    // system(delete_cmd.c_str());
+
+    // found a file, now get the shape and type
+    std::stringstream check_filename(*file_name);
+    std::vector<std::string> tokens;
+    std::string intermediate;
+
+    while (getline(check_filename, intermediate, '.')) {
+      tokens.push_back(intermediate);
+    }
+    *type_name = tokens[8];
+
+    std::string shape_str = tokens[7];
+    std::stringstream check_shape(shape_str);
+    while (getline(check_shape, intermediate, '_')) {
+      shape->push_back(std::stoul(intermediate));
+    }
+  }
+  return 0;
+}
+
+void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
+                                     std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
+                                     std::vector<unsigned int> root_graph_id,
+                                     std::vector<std::shared_ptr<TensorData>> *result_list) {
+  for (unsigned int i = 0; i < backend_name.size(); i++) {
+    // form prefix of the tensor file to read from graph pb node name
+    std::string dump_style_kernel_name = backend_name[i];
+    const std::string strsrc = "/";
+
+    std::string strdst;
+    if (is_sync_mode) {
+      strdst = "--";
+    } else {
+      strdst = "_";
+    }
+
+    std::string::size_type pos = 0;
+    std::string::size_type srclen = strsrc.size();
+    std::string::size_type dstlen = strdst.size();
+
+    // remove slot from name
+    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
+    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
+
+    while ((pos = dump_style_kernel_name.find(strsrc, pos)) != std::string::npos) {
+      dump_style_kernel_name.replace(pos, srclen, strdst);
+      pos += dstlen;
+    }
+
+    std::string prefix_dump_file_name = dump_style_kernel_name;
+    if (is_sync_mode) {
+      prefix_dump_file_name += "_output_" + std::to_string(slot[i]) + "_";
+    }
+
+    std::string specific_dump_dir;
+    if (is_sync_mode) {
+      specific_dump_dir =
+        dump_dir + "/device_" + std::to_string(device_id[i]) + "/iteration_" + std::to_string(iteration[i]);
+    } else {
+      specific_dump_dir = dump_dir + "/device_" + std::to_string(device_id[i]) + "/" + net_name + "_graph_" +
+                          std::to_string(root_graph_id[i]) + "/" + std::to_string(root_graph_id[i]) + "/" +
+                          std::to_string(iteration[i]);
+    }
+
+    // search files in dir for the one that meets the filename prefix and read the file into memory
+    DIR *d;
+    d = opendir(specific_dump_dir.c_str());
+    std::vector<char> *buffer = NULL;
+    std::string type_name = "";
+    std::vector<int64_t> shape;
+    uint64_t data_size = 0;
+    if (d != nullptr) {
+      struct dirent *dir = nullptr;
+      while ((dir = readdir(d)) != NULL) {
+        if (dir->d_type == DT_REG) {
+          std::string file_name = dir->d_name;
+          std::string out_dir;
+          std::size_t found = GetShapeTypeInfo(specific_dump_dir, slot[i], prefix_dump_file_name, &file_name,
+                                               &type_name, &out_dir, &shape);
+          if (found != 0) {
+            continue;
+          }
+
+          // read the tensor data from the file
+          std::string file_path;
+          if (is_sync_mode) {
+            file_path = specific_dump_dir + "/" + file_name;
+          } else {
+            file_path = out_dir + "/" + file_name;
+          }
+
+          std::ifstream infile;
+          infile.open(file_path.c_str(), std::ios::binary | std::ios::ate);
+          if (!infile.is_open()) {
+            MS_LOG(ERROR) << "Failed to open bin file " << file_name;
+            break;
+          }
+          uint64_t file_size = infile.tellg();
+          infile.seekg(0, std::ios::beg);
+          buffer = new std::vector<char>(file_size);
+          if (!infile.read(buffer->data(), file_size)) {
+            MS_LOG(ERROR) << "Failed to read in bin file " << file_name;
+            break;
+          }
+          data_size = file_size;
+          infile.close();
+        }
+      }
+      closedir(d);
+    } else {
+      MS_LOG(INFO) << specific_dump_dir << " directory does not exist!";
+    }
+
+    // call LoadNewTensor to store tensor in internal cache
+    auto tensor_data = std::make_shared<TensorData>();
+    tensor_data->SetName(backend_name[i]);
+    tensor_data->SetExecutionOrder(0);
+    tensor_data->SetSlot(slot[i]);
+    tensor_data->SetIteration(iteration[i]);
+    tensor_data->SetDeviceId(device_id[i]);
+    tensor_data->SetRootGraphId(root_graph_id[i]);
+    if (data_size) {
+      tensor_data->SetDataPtr(buffer->data());
+    } else {
+      tensor_data->SetDataPtr(NULL);
+    }
+    tensor_data->SetByteSize(data_size);
+    tensor_data->SetType(type_name);
+    tensor_data->SetShape(shape);
+    if (data_size) {
+      tensor_loader_->LoadNewTensor(tensor_data, false);
+    }
+
+    // add to result_list
+    result_list->push_back(tensor_data);
+  }
+}
+
+void ReplaceSrcFileName(const bool is_sync_mode, std::string *dump_style_name) {
+  const std::string strsrc = "/";
+  std::string strdst;
+  if (is_sync_mode) {
+    strdst = "--";
+  } else {
+    strdst = "_";
+  }
+  std::string::size_type pos = 0;
+  std::string::size_type srclen = strsrc.size();
+  std::string::size_type dstlen = strdst.size();
+
+  while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) {
+    dump_style_name->replace(pos, srclen, strdst);
+    pos += dstlen;
+  }
+}
+
+std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(unsigned int iteration) {
+  // get a list of nodes and the devices they are on to monitor
+  std::vector<std::shared_ptr<TensorData>> tensor_list;
+  std::map<std::tuple<uint32_t, uint32_t>, std::unordered_set<std::string>> device_and_graph_to_nodes;
+  for (auto w_table_item : watchpoint_table) {
+    auto wp = std::get<1>(w_table_item);
+    unsigned int index = 0;
+    for (auto check_node : wp.check_node_list) {
+      std::string w_name = std::get<0>(check_node);
+      bool w_is_param = std::get<1>(check_node);
+
+      std::string node_name = w_name;
+      if (w_is_param) {
+        std::size_t found = node_name.find_last_of("/");
+        node_name = node_name.substr(found + 1);
+      }
+
+      std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
+      std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
+      for (auto device : devices) {
+        for (auto graph : graphs) {
+          std::tuple<uint32_t, uint32_t> key(device, graph);
+          device_and_graph_to_nodes[key].insert(node_name);
+        }
+      }
+
+      index++;
+    }
+  }
+
+  // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
+  // as they are found
+  for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
+    std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
+    uint32_t device_id = std::get<0>(device_and_graph);
+    uint32_t root_graph_id = std::get<1>(device_and_graph);
+    std::unordered_set<std::string> wp_nodes = device_and_graph_item.second;
+    std::vector<std::tuple<std::string, std::string>> proto_to_dump;
+
+    std::string specific_dump_dir;
+    if (is_sync_mode) {
+      specific_dump_dir = dump_dir + "/device_" + std::to_string(device_id) + "/iteration_" + std::to_string(iteration);
+    } else {
+      specific_dump_dir = dump_dir + "/device_" + std::to_string(device_id) + "/" + net_name + "_graph_" +
+                          std::to_string(root_graph_id) + "/" + std::to_string(root_graph_id) + "/" +
+                          std::to_string(iteration);
+    }
+
+    // convert node names to dump style
+    for (auto node : wp_nodes) {
+      std::string orig_name = node;
+      std::string dump_style_name = node;
+      ReplaceSrcFileName(is_sync_mode, &dump_style_name);
+
+      if (is_sync_mode) {
+        dump_style_name.append("_output_");
+      }
+
+      proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
+    }
+
+    // search files in dir for the one that meets the filename prefix and read the file into memory
+    DIR *d;
+    d = opendir(specific_dump_dir.c_str());
+    if (d != nullptr) {
+      struct dirent *dir = nullptr;
+      while ((dir = readdir(d)) != NULL) {
+        if (dir->d_type == DT_REG) {
+          std::string file_name = dir->d_name;
+          for (auto &node : proto_to_dump) {
+            std::string dump_name = std::get<1>(node);
+            std::size_t found = 0;
+
+            if (is_sync_mode) {
+              found = file_name.rfind(dump_name, 0);
+            } else {
+              std::string file_name_w_o_prefix = file_name.substr(file_name.find('.') + 1);
+              found = file_name_w_o_prefix.rfind(dump_name, 0);
+            }
+
+            if (found == 0) {
+              std::vector<size_t> slot_list;
+              GetSlotInfo(file_name, dump_name, specific_dump_dir, &slot_list);
+              for (auto slot : slot_list) {
+                // add a TensorData entry (data will be read when needed)
+                std::vector<int64_t> shape;
+                std::string orig_name = std::get<0>(node);
+                auto tensor_data = std::make_shared<TensorData>();
+                tensor_data->SetName(orig_name);
+                tensor_data->SetExecutionOrder(0);
+                tensor_data->SetSlot(slot);
+                tensor_data->SetIteration(iteration);
+                tensor_data->SetDeviceId(device_id);
+                tensor_data->SetRootGraphId(root_graph_id);
+                tensor_data->SetDataPtr(NULL);
+                tensor_data->SetByteSize(0);
+                tensor_data->SetType("");
+                tensor_data->SetShape(shape);
+
+                tensor_list.push_back(tensor_data);
+              }
+              break;
+            }
+          }
+        }
+      }
+      closedir(d);
+    }
+  }
+
+  return tensor_list;
+}
+#endif
+
 void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
                                      std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size,
-                                     std::vector<TypePtr> *dtype, std::vector<std::vector<int64_t>> *shape) {
+                                     std::vector<unsigned int> *dtype, std::vector<std::vector<int64_t>> *shape) {
   std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
   tensor_loader_->SearchTensors(name, &result_list);
 
@@ -217,13 +691,14 @@ void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
       continue;
     }
     ret_name->push_back(std::get<0>(result));
-    data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetTensor()->data_c()));
-    data_size->push_back(std::get<1>(result)->GetTensor()->data().nbytes());
-    dtype->push_back(std::get<1>(result)->GetTensor()->Dtype());
-    shape->push_back(std::get<1>(result)->GetTensor()->shape());
+    data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetDataPtr()));
+    data_size->push_back(std::get<1>(result)->GetByteSize());
+    dtype->push_back(std::get<1>(result)->GetType());
+    shape->push_back(std::get<1>(result)->GetShape());
   }
 }
 
+#ifdef ONLINE_DBG_MODE
 bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
   bool ret = false;
   for (auto w_table_item : watchpoint_table) {
@@ -256,6 +731,7 @@ bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
     return false;
   }
 }
+#endif
 
 void DebugServices::EmptyTensor() { tensor_loader_->EmptyTensor(); }
 
@@ -273,6 +749,7 @@ void DebugServices::EmptyPrevTensor() { tensor_loader_->EmptyPrevTensor(); }
 
 void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
 
+#ifdef ONLINE_DBG_MODE
 bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                                      const std::string &host_fmt, const std::vector<int64_t> &host_shape,
                                      TypeId host_type, TypeId addr_type_id, const std::string &addr_format,
@@ -280,6 +757,7 @@ bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_
   return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
                                           addr_type_id, addr_format, slot);
 }
+#endif
 
 bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
   return tensor_loader_->LoadNewTensor(tensor, keep_prev);
@@ -298,6 +776,7 @@ void DebugServices::ResetLoadedTensors() {
   tensor_loader_->SwapCurrentPrev();
 }
 
+#ifdef ONLINE_DBG_MODE
 std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
   MS_EXCEPTION_IF_NULL(kernel);
   std::vector<std::shared_ptr<TensorData>> result;
@@ -310,6 +789,8 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
   }
   return result;
 }
+#endif
+
 bool DebugServices::TensorExistsInCurrent(std::string tensor_name) {
   return tensor_loader_->TensorExistsInCurrent(tensor_name);
 }
@@ -317,4 +798,18 @@ void DebugServices::MoveTensorCurrentToPrev(std::string tensor_name) {
   tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
 }
 
+void DebugServices::SetNetName(std::string net_name) { this->net_name = net_name; }
+
+std::string DebugServices::GetNetName() { return net_name; }
+
+void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir = dump_dir; }
+
+std::string DebugServices::GetDumpDir() { return dump_dir; }
+
+void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode = is_sync_mode; }
+
+bool DebugServices::GetSyncMode() { return is_sync_mode; }
+
+#ifdef ONLINE_DBG_MODE
 }  // namespace mindspore
+#endif
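For reference, sync-mode dump files encode the slot, shape, and dtype directly in the file name, which is what GetSlotInfo and GetShapeTypeInfo above walk through. A rough Python equivalent of that parsing, assuming a name of the form {op}_output_{slot}_..._shape_{dims}_{dtype}_{format}.bin (the example file name below is made up for illustration):

    # Illustrative only: mirrors the sync-mode parsing in GetSlotInfo/GetShapeTypeInfo.
    def parse_sync_dump_name(file_name, dump_name):
        """dump_name is the node name already in dump style, e.g. 'Conv2D--op168_output_'."""
        # slot: digits between the dump-style prefix and the next '_'
        start = len(dump_name)
        end = file_name.find("_", start)
        slot = int(file_name[start:end])

        # shape: numbers after '_shape_' until the first non-numeric token (the dtype)
        tokens = file_name[file_name.find("_shape_") + len("_shape_"):].split("_")
        shape, dtype = [], None
        for tok in tokens:
            if tok.isdigit():
                shape.append(int(tok))
            else:
                dtype = tok
                break
        return slot, shape, dtype

    print(parse_sync_dump_name(
        "Conv2D--op168_output_0_shape_32_12_13_13_16_Float16_DefaultFormat.bin",
        "Conv2D--op168_output_"))
    # -> (0, [32, 12, 13, 13, 16], 'Float16')
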
diff --git a/mindspore/ccsrc/debug/debug_services.h b/mindspore/ccsrc/debug/debug_services.h
index 1031beace7e..032ea3f3a09 100644
--- a/mindspore/ccsrc/debug/debug_services.h
+++ b/mindspore/ccsrc/debug/debug_services.h
@@ -16,6 +16,17 @@
 #ifndef MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
 #define MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
 
+#ifndef OFFLINE_DBG_MODE
+#define ONLINE_DBG_MODE
+#endif
+
+#ifdef OFFLINE_DBG_MODE
+#include "Eigen/Core"
+#include "Eigen/src/Core/arch/CUDA/Half.h"
+using float16 = Eigen::half;
+#include "debugger/offline_debug/offline_logger.h"
+#endif
+
 #include <vector>
 #include <string>
 #include <memory>
@@ -26,11 +37,13 @@
 #include <tuple>
 #include <unordered_map>
 #include <mutex>
+#include <map>
 #include "debug/tensor_load.h"
 #include "debug/tensor_data.h"
-#include "ir/dtype.h"
 
+#ifdef ONLINE_DBG_MODE
 namespace mindspore {
+#endif
 class DebugServices {
  public:
   DebugServices();
@@ -103,6 +116,8 @@
     unsigned int id;
     condition_t condition;
     std::vector<std::tuple<std::string, bool>> check_node_list;
+    std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_device_list;
+    std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_graph_list;
     std::vector<parameter_t> parameter_list;
     size_t location = 0;
@@ -167,30 +182,55 @@
   } watchpoint_t;
 
-  void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
-                     const std::vector<std::tuple<std::string, bool>> &check_node_list,
-                     const std::vector<parameter_t> &parameter_list);
+  void AddWatchpoint(
+    unsigned int id, unsigned int watch_condition, float parameter,
+    const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
+    const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list = nullptr,
+    const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list = nullptr);
 
   void RemoveWatchpoint(unsigned int id);
 
   void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
                         std::vector<unsigned int> *watchpoint_id, std::vector<std::vector<parameter_t>> *parameters,
                         std::vector<int32_t> *error_code, const std::vector<std::string> &op_overflows,
-                        const std::vector<std::shared_ptr<TensorData>> &tensor_list, bool init_dbg_suspend,
-                        const bool step_end, const bool recheck);
+                        std::vector<std::shared_ptr<TensorData>> *tensor_list, bool init_dbg_suspend,
+                        const bool step_end, const bool recheck, std::vector<unsigned int> *device_id = nullptr,
+                        std::vector<unsigned int> *root_graph_id = nullptr);
+
+  void AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck, const std::string &tensor_name,
+                             const std::string &tensor_name_no_slot, bool *previous_iter_tensor_needed,
+                             std::string *qualified_tensor_name, std::vector<watchpoint_t> *watchpoints_to_check);
+
+#ifdef OFFLINE_DBG_MODE
+  void GetSlotInfo(const std::string &file_name, const std::string &dump_name, const std::string &specific_dump_dir,
+                   std::vector<size_t> *slot_list);
+
+  std::size_t GetShapeTypeInfo(const std::string &specific_dump_dir, std::size_t slot,
+                               const std::string &prefix_dump_file_name, std::string *file_name, std::string *type_name,
+                               std::string *out_dir, std::vector<int64_t> *shape);
+
+  void ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
+                        std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
+                        std::vector<unsigned int> root_graph_id, std::vector<std::shared_ptr<TensorData>> *result_list);
+
+  std::vector<std::shared_ptr<TensorData>> ReadNeededDumpedTensors(unsigned int iteration);
+
+  void *GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed);
+#endif
 
   void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
-                        std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size, std::vector<TypePtr> *dtype,
-                        std::vector<std::vector<int64_t>> *shape);
-
+                        std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size,
+                        std::vector<unsigned int> *dtype, std::vector<std::vector<int64_t>> *shape);
+#ifdef ONLINE_DBG_MODE
   bool IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel = nullptr) const;
 
   bool IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const;
-
+#endif
   void EmptyTensor();
 
   std::vector<std::shared_ptr<TensorData>> GetTensor() const;
 
+  void AddAnalyzedTensorToCache(const bool recheck, const unsigned int id, const std::string &tensor_name);
+
   std::vector<std::shared_ptr<TensorData>> GetNodeTensorMap(const std::string &node_name) const;
 
   uint32_t GetTensorLoaderIterNum() const;
@@ -201,31 +241,51 @@
 
   void EmptyCurrentTensor();
 
+#ifdef ONLINE_DBG_MODE
   bool DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                         const std::string &host_fmt, const std::vector<int64_t> &host_shape, TypeId host_type,
                         TypeId addr_type_id, const std::string &addr_format, size_t slot) const;
+#endif
 
   bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);
 
   std::unordered_map<unsigned int, watchpoint_t> GetWatchpointTable();
 
   void ResetLoadedTensors();
-
+#ifdef ONLINE_DBG_MODE
   std::vector<std::shared_ptr<TensorData>> GetNodeTensor(const CNodePtr &kernel);
+#endif
 
   bool TensorExistsInCurrent(std::string tensor_name);
 
   void MoveTensorCurrentToPrev(std::string tensor_name);
 
+  void SetNetName(std::string net_name);
+
+  std::string GetNetName();
+
+  void SetDumpDir(std::string dump_dir);
+
+  std::string GetDumpDir();
+
+  void SetSyncMode(bool is_sync_mode);
+
+  bool GetSyncMode();
+
  private:
   std::mutex lock_;
 
   // to keep track of watchpoints that have been checked already for a tensor in current step
   std::unordered_map<std::string, std::unordered_set<unsigned int>> wp_id_cache;
   std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
+  std::string net_name;
+  std::string dump_dir;
+  bool is_sync_mode;
 
   TensorLoader *tensor_loader_;
 };
+#ifdef ONLINE_DBG_MODE
 }  // namespace mindspore
+#endif
 
 #endif  // MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
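The ascend_tests scripts added below exercise this API end to end through the dbg_services Python bindings; in condensed form the offline flow looks like this (the dump path and node name are placeholders, not values from the patch):

    import mindspore.offline_debug.dbg_services as d

    # Placeholder path; a real run needs a directory produced by MindSpore dump.
    backend = d.DbgServices(dump_file_path="/path/to/dump_dir")
    backend.initialize(net_name="alexnet", is_sync_mode=False)

    # watch_condition=6 is MIN_LT; "Default/.../Conv2D-op169" stands in for a full node name
    param = d.Parameter(name="param", disabled=False, value=0.0)
    backend.add_watchpoint(watchpoint_id=1, watch_condition=6,
                           check_node_list={"Default/.../Conv2D-op169":
                                            {"device_id": [0], "root_graph_id": [1],
                                             "is_parameter": False}},
                           parameter_list=[param])
    hits = backend.check_watchpoints(iteration=2)

    # read the watched tensor back for the same iteration
    info = d.TensorInfo(node_name="Default/.../Conv2D-op169", slot=0, iteration=2,
                        device_id=0, root_graph_id=1, is_parameter=False)
    tensors = backend.read_tensors([info])
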
diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc
index 4be643983c3..521c27b883e 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -755,7 +755,7 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &tensors) const {
   std::vector<std::string> ret_name;
   std::vector<char *> data_ptr;
   std::vector<ssize_t> data_size;
-  std::vector<TypePtr> dtype;
+  std::vector<unsigned int> dtype;
   std::vector<std::vector<int64_t>> shape;
 
   std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
@@ -789,7 +789,7 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &tensors) const {
 
     tensor_item.set_tensor_content(data_ptr[result_index] + size_iter, chunk_size);
 
-    tensor_item.set_data_type(GetDebuggerNumberDataType(dtype[result_index]));
+    tensor_item.set_data_type((debugger::DataType)dtype[result_index]);
     for (auto &elem : shape[result_index]) {
       tensor_item.add_dims(elem);
     }
@@ -827,7 +827,7 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode) {
     tensor_list = debug_services_->GetNodeTensor(kernel);
   }
   debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
-                                    tensor_list, initial_suspend_, watchnode.empty(), recheck);
+                                    &tensor_list, initial_suspend_, watchnode.empty(), recheck);
   std::list<WatchpointHit> hits;
   for (unsigned int i = 0; i < name.size(); i++) {
     WatchpointHit hit;
diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/async_sink_mode_true_read_tensors.expected b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/async_sink_mode_true_read_tensors.expected
new file mode 100644
index 00000000000..c17f62ce9bb
--- /dev/null
+++ b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/async_sink_mode_true_read_tensors.expected
@@ -0,0 +1,28 @@
+-----------------------------------------------------------
+tensor_info_1 attributes:
+node name = Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op169
+slot = 0
+iteration = 2
+device_id = None
+root_graph_id = 1
+is_parameter = False
+
+tensor_data_1 attributes:
+data (printed in uint8) = [149 167 124 ... 158 212 164]
+size in bytes = 2076672
+debugger dtype = 10
+shape = [32, 192, 13, 13]
+-----------------------------------------------------------
+tensor_info_2 attributes:
+node name = Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/ReLUV2-op348
+slot = 1
+iteration = 2
+device_id = None
+root_graph_id = 1
+is_parameter = False
+
+tensor_data_2 attributes:
+data (printed in uint8) = [ 20 21 18 ... 
126 98 25] +size in bytes = 129792 +debugger dtype = 6 +shape = [32, 12, 13, 13, 2] diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/async_sink_mode_true_read_tensors.py b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/async_sink_mode_true_read_tensors.py new file mode 100644 index 00000000000..47e2db9c746 --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/async_sink_mode_true_read_tensors.py @@ -0,0 +1,72 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Read tensor test script for offline debugger APIs. +""" + +import mindspore.offline_debug.dbg_services as d +import numpy as np + + +def main(): + + debugger_backend = d.DbgServices( + dump_file_path="/opt/nvme2n1/j00455527/dumps/async_sink_true/032421") + + _ = debugger_backend.initialize(net_name="alexnet", is_sync_mode=False) + + # output tensor with zero slot + info1 = d.TensorInfo(node_name="Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/" + "conv3-Conv2d/Conv2D-op169", + slot=0, iteration=2, device_id=0, root_graph_id=1, is_parameter=False) + # output tensor with non-zero slot + info2 = d.TensorInfo(node_name="Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/" + "ReLUV2-op348", + slot=1, iteration=2, device_id=0, root_graph_id=1, is_parameter=False) + + tensor_info = [info1, info2] + + tensor_data = debugger_backend.read_tensors(tensor_info) + + print_read_tensors(tensor_info, tensor_data) + + +def print_read_tensors(tensor_info, tensor_data): + """Print read tensors.""" + for x, _ in enumerate(tensor_info): + print("-----------------------------------------------------------") + print("tensor_info_" + str(x+1) + " attributes:") + print("node name = ", tensor_info[x].node_name) + print("slot = ", tensor_info[x].slot) + print("iteration = ", tensor_info[x].iteration) + print("device_id = ", tensor_info[x].device_id) + print("root_graph_id = ", tensor_info[x].root_graph_id) + print("is_parameter = ", tensor_info[x].is_parameter) + print() + print("tensor_data_" + str(x+1) + " attributes:") + print("data (printed in uint8) = ", np.frombuffer( + tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + py_byte_size = len(tensor_data[x].data_ptr) + c_byte_size = tensor_data[x].data_size + if c_byte_size != py_byte_size: + print("The python byte size of ", py_byte_size, + " does not match the C++ byte size of ", c_byte_size) + print("size in bytes = ", tensor_data[x].data_size) + print("debugger dtype = ", tensor_data[x].dtype) + print("shape = ", tensor_data[x].shape) + + +if __name__ == "__main__": + main() diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/async_sink_mode_true_watchpoints.expected b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/async_sink_mode_true_watchpoints.expected new file mode 100644 index 00000000000..4e6f066f5ef --- /dev/null +++ 
b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/async_sink_mode_true_watchpoints.expected @@ -0,0 +1,14 @@ +----------------------------------------------------------- +watchpoint_hit for test_1 attributes: +name = Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op169 +slot = 0 +condition = 6 +watchpoint_id = 1 +parameter 0 name = param +parameter 0 disabled = False +parameter 0 value = 0.0 +parameter 0 hit = True +parameter 0 actual_value = -0.1417236328125 +error code = 0 +device_id = 0 +root_graph_id = 1 diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/async_sink_mode_true_watchpoints.py b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/async_sink_mode_true_watchpoints.py new file mode 100644 index 00000000000..e9041d2950c --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/async_sink_mode_true_watchpoints.py @@ -0,0 +1,92 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Watchpoints test script for offline debugger APIs. +""" + +import mindspore.offline_debug.dbg_services as d + + +def main(): + + debugger_backend = d.DbgServices( + dump_file_path="/opt/nvme2n1/j00455527/dumps/async_sink_true/032421") + + _ = debugger_backend.initialize(net_name="alexnet", is_sync_mode=False) + + # NOTES: + # -> watch_condition=6 is MIN_LT + # -> watch_condition=18 is CHANGE_TOO_LARGE + + # test 1: watchpoint set and hit (watch_condition=6) + param1 = d.Parameter(name="param", disabled=False, value=0.0) + _ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6, + check_node_list={"Default/network-TrainOneStepCell/network-WithLossCell/" + "_backbone-AlexNet/conv3-Conv2d/Conv2D-op169": + {"device_id": [0], "root_graph_id": [1], "is_parameter": False + }}, parameter_list=[param1]) + + watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2) + if len(watchpoint_hits_test_1) != 1: + print("ERROR -> test 1: watchpoint set but not hit just once") + print_watchpoint_hits(watchpoint_hits_test_1, 1) + + # test 2: watchpoint remove and ensure it's not hit + _ = debugger_backend.remove_watchpoint(watchpoint_id=1) + watchpoint_hits_test_2 = debugger_backend.check_watchpoints(iteration=2) + if watchpoint_hits_test_2: + print("ERROR -> test 2: watchpoint removed but hit") + + # test 3: watchpoint set and not hit, then remove + param2 = d.Parameter(name="param", disabled=False, value=-1000.0) + _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6, + check_node_list={"Default/network-TrainOneStepCell/network-WithLossCell/" + "_backbone-AlexNet/conv3-Conv2d/Conv2D-op169": + {"device_id": [0], "root_graph_id": [1], "is_parameter": False + }}, parameter_list=[param2]) + + watchpoint_hits_test_3 = debugger_backend.check_watchpoints(iteration=2) + if watchpoint_hits_test_3: + print("ERROR -> test 3: watchpoint set but not supposed to 
be hit") + _ = debugger_backend.remove_watchpoint(watchpoint_id=2) + + +def print_watchpoint_hits(watchpoint_hits, test_id): + """Print watchpoint hits.""" + for x, _ in enumerate(watchpoint_hits): + print("-----------------------------------------------------------") + print("watchpoint_hit for test_%u attributes:" % test_id) + print("name = ", watchpoint_hits[x].name) + print("slot = ", watchpoint_hits[x].slot) + print("condition = ", watchpoint_hits[x].condition) + print("watchpoint_id = ", watchpoint_hits[x].watchpoint_id) + for p, _ in enumerate(watchpoint_hits[x].parameters): + print("parameter ", p, " name = ", + watchpoint_hits[x].parameters[p].name) + print("parameter ", p, " disabled = ", + watchpoint_hits[x].parameters[p].disabled) + print("parameter ", p, " value = ", + watchpoint_hits[x].parameters[p].value) + print("parameter ", p, " hit = ", + watchpoint_hits[x].parameters[p].hit) + print("parameter ", p, " actual_value = ", + watchpoint_hits[x].parameters[p].actual_value) + print("error code = ", watchpoint_hits[x].error_code) + print("device_id = ", watchpoint_hits[x].device_id) + print("root_graph_id = ", watchpoint_hits[x].root_graph_id) + + +if __name__ == "__main__": + main() diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/run_tests b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/run_tests new file mode 100755 index 00000000000..299a61eadcc --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/run_tests @@ -0,0 +1,49 @@ +python sync_trans_false_read_tensors.py > sync_trans_false_read_tensors.actual +sed -i '/\[WARNING\]/d' sync_trans_false_read_tensors.actual +sed -i '/Deprecated/d' sync_trans_false_read_tensors.actual +diff sync_trans_false_read_tensors.actual sync_trans_false_read_tensors.expected +if [ $? -eq 0 ]; then + echo sync_trans_false_read_tensors PASSED +else + echo sync_trans_false_read_tensors FAILED +fi + +python sync_trans_true_read_tensors.py > sync_trans_true_read_tensors.actual +sed -i '/\[WARNING\]/d' sync_trans_true_read_tensors.actual +sed -i '/Deprecated/d' sync_trans_true_read_tensors.actual +diff sync_trans_true_read_tensors.actual sync_trans_true_read_tensors.expected +if [ $? -eq 0 ]; then + echo sync_trans_true_read_tensors PASSED +else + echo sync_trans_true_read_tensors FAILED +fi + +python sync_trans_false_watchpoints.py > sync_trans_false_watchpoints.actual +sed -i '/\[WARNING\]/d' sync_trans_false_watchpoints.actual +sed -i '/Deprecated/d' sync_trans_false_watchpoints.actual +diff sync_trans_false_watchpoints.actual sync_trans_false_watchpoints.expected +if [ $? -eq 0 ]; then + echo sync_trans_false_watchpoints PASSED +else + echo sync_trans_false_watchpoints FAILED +fi + +python async_sink_mode_true_read_tensors.py > async_sink_mode_true_read_tensors.actual +sed -i '/\[WARNING\]/d' async_sink_mode_true_read_tensors.actual +sed -i '/Deprecated/d' async_sink_mode_true_read_tensors.actual +diff async_sink_mode_true_read_tensors.actual async_sink_mode_true_read_tensors.expected +if [ $? -eq 0 ]; then + echo async_sink_mode_true_read_tensors PASSED +else + echo async_sink_mode_true_read_tensors FAILED +fi + +python async_sink_mode_true_watchpoints.py > async_sink_mode_true_watchpoints.actual +sed -i '/\[WARNING\]/d' async_sink_mode_true_watchpoints.actual +sed -i '/Deprecated/d' async_sink_mode_true_watchpoints.actual +diff async_sink_mode_true_watchpoints.actual async_sink_mode_true_watchpoints.expected +if [ $? 
-eq 0 ]; then + echo async_sink_mode_true_watchpoints PASSED +else + echo async_sink_mode_true_watchpoints FAILED +fi diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_false_read_tensors.expected b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_false_read_tensors.expected new file mode 100644 index 00000000000..ef4977b8fe3 --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_false_read_tensors.expected @@ -0,0 +1,70 @@ +----------------------------------------------------------- +tensor_info_1 attributes: +node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias +slot = 0 +iteration = 2 +device_id = None +root_graph_id = 0 +is_parameter = True + +tensor_data_1 attributes: +data (printed in uint8) = [170 19 44 181 254 212 16 52 52 162 148 180 130 115 226 180 183 243 + 101 52 224 79 189 51 10 70 69 51 199 75 159 52 79 98 104 52 + 106 77 19 52 129 183 8 180 252 58 48 180 35 219 9 52 240 201 + 179 51 142 151 158 51 210 145 182 53 140 219 0 53 140 219 22 181 + 46 33 87 180 238 90 122 180 166 10 38 179 202 195 4 53 166 10 + 150 51 214 120 209 52 235 115 37 180 92 177 215 180 0 136 84 51 + 72 114 145 180 43 169 255 180 114 27 61 52 76 225 122 50 126 72 + 159 51 58 35 202 51 114 61 106 51 60 223 63 52 209 179 1 52 + 232 217 44 178 130 158 109 179 213 231 10 179 37 40 94 179 208 68 + 64 53 6 52 249 52 162 35 1 181 231 29 155 52 30 201 69 180 + 229 131 126 51 18 165 109 180 164 112 163 181 116 172 11 178 6 129 + 37 52 54 205 203 180 115 104 145 52 232 106 219 179 36 40 214 52 + 202 50 204 52 76 89 38 179 230 140 232 178 168 53 77 52 180 191 + 108 51 128 183 64 51 56 137 161 180 247 6 143 180 126 63 197 180 + 198 177 94 52 140 185 139 51 150 178 228 180 255 67 150 52 134 201 + 164 52 107 43 14 53 174 216 63 179 40 160 41 53 120 88 72 179 + 218 172 234 52 234 38 25 52 85 159 155 180 254 67 138 180 34 253 + 118 180 218 61 17 52 242 133 253 52 175 37 180 52 171 62 163 52 + 202 195 86 53 160 171 45 52 34 31 176 180 156 85 5 53 178 191 + 68 180 42 203 140 52 248 117 72 52 248 253 212 176 195 100 202 51 + 87 14 141 52 91 100 235 51 48 221 136 52 143 117 17 180 51 196 + 25 52 127 29 112 180 152 144 207 178 219 104 64 52 21 174 251 52 + 164 78 138 181 20 63 6 52 10 249 96 179 163 146 18 53 200 186 + 236 52 2 188 85 52 124 140 121 179 246 185 22 181 246 74 249 51 + 70 182 135 53 189 227 76 52 249 160 159 180 134 235 65 53 64 164 + 255 51 224 156 41 53 142 117 69 181 247 151 101 53 185 175 35 52 + 164 112 21 53 30 31 212 179 142 151 110 179 176 148 29 181 206 204 + 88 53 116 215 214 180 172 173 216 51 106 222 153 180 200 152 19 181 + 176 3 7 52 215 52 87 52] +size in bytes = 512 +debugger dtype = 11 +shape = [128] +----------------------------------------------------------- +tensor_info_2 attributes: +node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op168 +slot = 0 +iteration = 2 +device_id = None +root_graph_id = 0 +is_parameter = False + +tensor_data_2 attributes: +data (printed in uint8) = [181 167 46 ... 12 204 164] +size in bytes = 2076672 +debugger dtype = 10 +shape = [32, 12, 13, 13, 16] +----------------------------------------------------------- +tensor_info_3 attributes: +node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op346 +slot = 1 +iteration = 2 +device_id = None +root_graph_id = 0 +is_parameter = False + +tensor_data_3 attributes: +data (printed in uint8) = [ 50 17 122 ... 
94 42 90] +size in bytes = 129792 +debugger dtype = 6 +shape = [32, 12, 13, 13, 2] diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_false_read_tensors.py b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_false_read_tensors.py new file mode 100644 index 00000000000..0bf1f1896f1 --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_false_read_tensors.py @@ -0,0 +1,74 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Read tensor test script for offline debugger APIs. +""" + +import mindspore.offline_debug.dbg_services as d +import numpy as np + + +def main(): + + debugger_backend = d.DbgServices( + dump_file_path="/opt/nvme2n1/j00455527/dumps/sync_trans_false/032421/alexnet") + + _ = debugger_backend.initialize( + net_name="Network Name goes here!", is_sync_mode=True) + + # parameter + info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias", + slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True) + # output tensor with zero slot + info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op168", + slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False) + # output tensor with non-zero slot + info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op346", + slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False) + + tensor_info = [info1, info2, info3] + + tensor_data = debugger_backend.read_tensors(tensor_info) + + print_read_tensors(tensor_info, tensor_data) + + +def print_read_tensors(tensor_info, tensor_data): + """Print read tensors.""" + for x, _ in enumerate(tensor_info): + print("-----------------------------------------------------------") + print("tensor_info_" + str(x+1) + " attributes:") + print("node name = ", tensor_info[x].node_name) + print("slot = ", tensor_info[x].slot) + print("iteration = ", tensor_info[x].iteration) + print("device_id = ", tensor_info[x].device_id) + print("root_graph_id = ", tensor_info[x].root_graph_id) + print("is_parameter = ", tensor_info[x].is_parameter) + print() + print("tensor_data_" + str(x+1) + " attributes:") + print("data (printed in uint8) = ", np.frombuffer( + tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + py_byte_size = len(tensor_data[x].data_ptr) + c_byte_size = tensor_data[x].data_size + if c_byte_size != py_byte_size: + print("The python byte size of ", py_byte_size, + " does not match the C++ byte size of ", c_byte_size) + print("size in bytes = ", tensor_data[x].data_size) + print("debugger dtype = ", tensor_data[x].dtype) + print("shape = ", tensor_data[x].shape) + + +if __name__ == "__main__": + main() diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_false_watchpoints.expected 
b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_false_watchpoints.expected new file mode 100644 index 00000000000..f7f209ca2c5 --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_false_watchpoints.expected @@ -0,0 +1,33 @@ +----------------------------------------------------------- +watchpoint_hit for test_1 attributes: +name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op168 +slot = 0 +condition = 6 +watchpoint_id = 1 +parameter 0 name = param +parameter 0 disabled = False +parameter 0 value = 0.0 +parameter 0 hit = True +parameter 0 actual_value = -0.14013671875 +error code = 0 +device_id = 0 +root_graph_id = 0 +----------------------------------------------------------- +watchpoint_hit for test_4 attributes: +name = Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc3.bias +slot = 0 +condition = 18 +watchpoint_id = 3 +parameter 0 name = abs_mean_update_ratio_gt +parameter 0 disabled = False +parameter 0 value = 0.0 +parameter 0 hit = True +parameter 0 actual_value = 0.5243796973599475 +parameter 1 name = epsilon +parameter 1 disabled = True +parameter 1 value = 0.0 +parameter 1 hit = False +parameter 1 actual_value = 0.0 +error code = 0 +device_id = 0 +root_graph_id = 0 diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_false_watchpoints.py b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_false_watchpoints.py new file mode 100644 index 00000000000..c2059024585 --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_false_watchpoints.py @@ -0,0 +1,109 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Watchpoints test script for offline debugger APIs. 
+""" + +import mindspore.offline_debug.dbg_services as d + + +def main(): + + debugger_backend = d.DbgServices( + dump_file_path="/opt/nvme2n1/j00455527/dumps/sync_trans_false/032421/alexnet") + + _ = debugger_backend.initialize( + net_name="Network Name goes here!", is_sync_mode=True) + + # NOTES: + # -> watch_condition=6 is MIN_LT + # -> watch_condition=18 is CHANGE_TOO_LARGE + + # test 1: watchpoint set and hit (watch_condition=6) + param1 = d.Parameter(name="param", disabled=False, value=0.0) + _ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6, + check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/" + "Conv2D-op168": + {"device_id": [0], "root_graph_id": [0], "is_parameter": False + }}, parameter_list=[param1]) + + watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2) + if len(watchpoint_hits_test_1) != 1: + print("ERROR -> test 1: watchpoint set but not hit just once") + print_watchpoint_hits(watchpoint_hits_test_1, 1) + + # test 2: watchpoint remove and ensure it's not hit + _ = debugger_backend.remove_watchpoint(watchpoint_id=1) + watchpoint_hits_test_2 = debugger_backend.check_watchpoints(iteration=2) + if watchpoint_hits_test_2: + print("ERROR -> test 2: watchpoint removed but hit") + + # test 3: watchpoint set and not hit, then remove + param2 = d.Parameter(name="param", disabled=False, value=-1000.0) + _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6, + check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/" + "Conv2D-op308": + {"device_id": [0], "root_graph_id": [0], "is_parameter": False + }}, parameter_list=[param2]) + + watchpoint_hits_test_3 = debugger_backend.check_watchpoints(iteration=2) + if watchpoint_hits_test_3: + print("ERROR -> test 3: watchpoint set but not supposed to be hit") + _ = debugger_backend.remove_watchpoint(watchpoint_id=2) + + # test 4: weight change watchpoint set and hit + param_abs_mean_update_ratio_gt = d.Parameter( + name="abs_mean_update_ratio_gt", disabled=False, value=0.0) + param_epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0) + _ = debugger_backend.add_watchpoint(watchpoint_id=3, watch_condition=18, + check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/" + "Parameter[6]_11/fc3.bias": + {"device_id": [0], "root_graph_id": [0], "is_parameter": True + }}, parameter_list=[param_abs_mean_update_ratio_gt, + param_epsilon]) + + watchpoint_hits_test_4 = debugger_backend.check_watchpoints(iteration=3) + if len(watchpoint_hits_test_4) != 1: + print("ERROR -> test 4: watchpoint weight change set but not hit just once") + print_watchpoint_hits(watchpoint_hits_test_4, 4) + + +def print_watchpoint_hits(watchpoint_hits, test_id): + """Print watchpoint hits.""" + for x, _ in enumerate(watchpoint_hits): + print("-----------------------------------------------------------") + print("watchpoint_hit for test_%u attributes:" % test_id) + print("name = ", watchpoint_hits[x].name) + print("slot = ", watchpoint_hits[x].slot) + print("condition = ", watchpoint_hits[x].condition) + print("watchpoint_id = ", watchpoint_hits[x].watchpoint_id) + for p, _ in enumerate(watchpoint_hits[x].parameters): + print("parameter ", p, " name = ", + watchpoint_hits[x].parameters[p].name) + print("parameter ", p, " disabled = ", + watchpoint_hits[x].parameters[p].disabled) + print("parameter ", p, " value = ", + watchpoint_hits[x].parameters[p].value) + print("parameter ", p, " hit = ", + watchpoint_hits[x].parameters[p].hit) + 
print("parameter ", p, " actual_value = ", + watchpoint_hits[x].parameters[p].actual_value) + print("error code = ", watchpoint_hits[x].error_code) + print("device_id = ", watchpoint_hits[x].device_id) + print("root_graph_id = ", watchpoint_hits[x].root_graph_id) + + +if __name__ == "__main__": + main() diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_true_read_tensors.expected b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_true_read_tensors.expected new file mode 100644 index 00000000000..46e8db511b4 --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_true_read_tensors.expected @@ -0,0 +1,70 @@ +----------------------------------------------------------- +tensor_info_1 attributes: +node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias +slot = 0 +iteration = 2 +device_id = None +root_graph_id = 0 +is_parameter = True + +tensor_data_1 attributes: +data (printed in uint8) = [230 208 10 52 104 34 252 52 4 231 144 52 188 150 64 180 88 236 + 15 180 254 135 180 51 131 226 147 52 88 202 62 53 2 43 55 53 + 231 29 87 180 220 249 30 180 157 17 177 180 81 107 140 181 8 95 + 192 180 89 134 112 180 96 238 90 178 156 196 212 180 206 25 15 181 + 212 154 6 180 91 211 116 52 191 14 140 51 128 106 124 53 28 158 + 70 181 182 21 251 50 100 204 157 179 88 202 42 180 7 95 8 53 + 128 251 238 52 241 133 241 52 111 86 157 179 48 221 148 180 200 7 + 141 180 236 226 182 51 190 82 158 180 140 108 179 180 195 134 215 179 + 103 213 39 179 89 168 149 180 42 58 58 180 64 53 62 179 250 126 + 158 52 38 83 117 52 0 0 136 180 136 133 122 51 110 18 131 179 + 238 13 94 51 102 136 15 181 134 90 227 180 16 11 117 180 35 74 + 163 52 105 0 87 181 112 18 131 50 226 233 67 181 217 172 10 52 + 206 25 217 52 208 213 22 52 146 203 87 180 74 46 207 52 178 191 + 4 180 100 93 216 52 119 190 171 180 223 2 5 181 128 72 207 179 + 58 146 11 179 224 79 137 52 143 228 154 180 246 219 215 179 14 79 + 195 52 126 29 64 52 132 192 42 51 94 220 86 52 94 109 1 181 + 72 37 117 178 110 197 94 180 160 94 153 179 118 224 80 181 156 17 + 37 50 120 156 162 53 26 115 135 180 228 20 29 53 145 126 147 52 + 99 16 48 180 211 188 199 180 52 51 99 180 93 254 227 52 152 126 + 123 49 6 18 16 181 5 163 130 51 27 158 98 53 134 235 189 52 + 119 45 9 180 130 115 110 52 158 128 162 52 232 251 197 180 178 46 + 158 179 57 214 157 52 172 207 161 180 208 0 222 49 242 99 32 53 + 20 174 135 50 247 117 176 52 194 57 43 180 140 108 135 51 243 65 + 175 51 187 73 156 51 63 232 217 50 180 234 115 52 194 168 148 52 + 27 192 183 180 45 178 157 52 125 208 17 53 236 192 65 53 190 193 + 7 53 254 246 57 53 3 43 199 51 64 164 215 180 220 104 240 51 + 23 72 24 180 68 173 9 51 72 114 29 53 105 0 57 181 188 150 + 8 53 229 97 131 53 0 34 189 51 163 146 74 53 31 244 204 51 + 86 193 220 180 156 51 146 179] +size in bytes = 512 +debugger dtype = 11 +shape = [128] +----------------------------------------------------------- +tensor_info_2 attributes: +node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op171 +slot = 0 +iteration = 2 +device_id = None +root_graph_id = 0 +is_parameter = False + +tensor_data_2 attributes: +data (printed in uint8) = [ 99 26 69 ... 
154 218 164] +size in bytes = 2076672 +debugger dtype = 10 +shape = [32, 192, 13, 13] +----------------------------------------------------------- +tensor_info_3 attributes: +node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op353 +slot = 1 +iteration = 2 +device_id = None +root_graph_id = 0 +is_parameter = False + +tensor_data_3 attributes: +data (printed in uint8) = [19 17 27 ... 94 42 90] +size in bytes = 129792 +debugger dtype = 6 +shape = [32, 12, 13, 13, 2] diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_true_read_tensors.py b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_true_read_tensors.py new file mode 100644 index 00000000000..7d87ef32402 --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/ascend_tests/sync_trans_true_read_tensors.py @@ -0,0 +1,74 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Read tensor test script for offline debugger APIs. +""" + +import mindspore.offline_debug.dbg_services as d +import numpy as np + + +def main(): + + debugger_backend = d.DbgServices( + dump_file_path="/opt/nvme2n1/j00455527/dumps/sync_trans_true/032421/alexnet") + + _ = debugger_backend.initialize( + net_name="Network Name goes here!", is_sync_mode=True) + + # parameter + info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias", + slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True) + # output tensor with zero slot + info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op171", + slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False) + # output tensor with non-zero slot + info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op353", + slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False) + + tensor_info = [info1, info2, info3] + + tensor_data = debugger_backend.read_tensors(tensor_info) + + print_read_tensors(tensor_info, tensor_data) + + +def print_read_tensors(tensor_info, tensor_data): + """Print read tensors.""" + for x, _ in enumerate(tensor_info): + print("-----------------------------------------------------------") + print("tensor_info_" + str(x+1) + " attributes:") + print("node name = ", tensor_info[x].node_name) + print("slot = ", tensor_info[x].slot) + print("iteration = ", tensor_info[x].iteration) + print("device_id = ", tensor_info[x].device_id) + print("root_graph_id = ", tensor_info[x].root_graph_id) + print("is_parameter = ", tensor_info[x].is_parameter) + print() + print("tensor_data_" + str(x+1) + " attributes:") + print("data (printed in uint8) = ", np.frombuffer( + tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + py_byte_size = len(tensor_data[x].data_ptr) + c_byte_size = tensor_data[x].data_size + if c_byte_size != py_byte_size: + print("The python byte size of ", 
py_byte_size, + " does not match the C++ byte size of ", c_byte_size) + print("size in bytes = ", tensor_data[x].data_size) + print("debugger dtype = ", tensor_data[x].dtype) + print("shape = ", tensor_data[x].shape) + + +if __name__ == "__main__": + main() diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc new file mode 100644 index 00000000000..6786872ac1f --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc @@ -0,0 +1,261 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "debugger/offline_debug/dbg_services.h" + +#include + +DbgServices::DbgServices(bool verbose) { + DbgLogger::verbose = verbose; + char *dbg_log_path = getenv("OFFLINE_DBG_LOG"); + if (dbg_log_path != NULL) { + DbgLogger::verbose = true; + } + debug_services = new DebugServices(); +} + +DbgServices::DbgServices(const DbgServices &other) { + MS_LOG(INFO) << "cpp DbgServices object is created via copy"; + debug_services = new DebugServices(*other.debug_services); +} + +DbgServices &DbgServices::operator=(const DbgServices &other) { + MS_LOG(INFO) << "cpp DbgServices object is being assigned a different state"; + if (this != &other) { + delete debug_services; + debug_services = new DebugServices(*other.debug_services); + } + return *this; +} + +DbgServices::~DbgServices() { + MS_LOG(INFO) << "cpp DbgServices object is deleted"; + delete debug_services; +} + +std::string DbgServices::GetVersion() { + MS_LOG(INFO) << "get version is called"; + return "1.2.0"; +} + +int32_t DbgServices::Initialize(std::string net_name, std::string dump_folder_path, bool is_sync_mode) { + MS_LOG(INFO) << "cpp DbgServices initialize network name " << net_name; + MS_LOG(INFO) << "cpp DbgServices initialize dump folder path " << dump_folder_path; + MS_LOG(INFO) << "cpp DbgServices initialize sync mode " << is_sync_mode; + debug_services->SetNetName(net_name); + debug_services->SetDumpDir(dump_folder_path); + debug_services->SetSyncMode(is_sync_mode); + return 0; +} + +int32_t DbgServices::AddWatchpoint( + unsigned int id, unsigned int watch_condition, + std::map>>> check_nodes, + std::vector parameter_list) { + MS_LOG(INFO) << "cpp start"; + + MS_LOG(INFO) << "cpp DbgServices AddWatchpoint id " << id; + MS_LOG(INFO) << "cpp DbgServices AddWatchpoint watch_condition " << watch_condition; + for (auto const &node : check_nodes) { + MS_LOG(INFO) << "cpp DbgServices AddWatchpoint name " << node.first; + auto attr_map = node.second; + + bool is_parameter = std::get(attr_map["is_parameter"]); + MS_LOG(INFO) << "cpp DbgServices AddWatchpoint is_parameter " << is_parameter; + + // std::vector device_id = std::get>(attr_map["device_id"]); + std::vector device_id_str = std::get>(attr_map["device_id"]); + std::vector device_id; + std::transform(device_id_str.begin(), device_id_str.end(), std::back_inserter(device_id), + [](std::string &id_str) -> std::uint32_t { return 
static_cast(std::stoul(id_str)); }); + MS_LOG(INFO) << "cpp DbgServices AddWatchpoint device_id "; + for (auto const &i : device_id) { + MS_LOG(INFO) << i << " "; + } + + // std::vector root_graph_id = std::get>(attr_map["root_graph_id"]); + std::vector root_graph_id_str = std::get>(attr_map["root_graph_id"]); + std::vector root_graph_id; + std::transform( + root_graph_id_str.begin(), root_graph_id_str.end(), std::back_inserter(root_graph_id), + [](std::string &graph_str) -> std::uint32_t { return static_cast(std::stoul(graph_str)); }); + MS_LOG(INFO) << "cpp DbgServices AddWatchpoint root_graph_id"; + for (auto const &j : root_graph_id) { + MS_LOG(INFO) << j << " "; + } + } + + for (auto const ¶meter : parameter_list) { + MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter name " << parameter.name; + MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter disabled " << parameter.disabled; + MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter value " << parameter.value; + MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter hit " << parameter.hit; + MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter actual_value " << parameter.actual_value; + } + + std::vector> check_node_list; + std::vector>> check_node_device_list; + std::vector>> check_node_graph_list; + std::vector parameter_list_backend; + + std::transform(check_nodes.begin(), check_nodes.end(), std::back_inserter(check_node_list), + [](auto &node) -> std::tuple { + auto attr_map = node.second; + return std::make_tuple(node.first, std::get(attr_map["is_parameter"])); + }); + + std::transform(check_nodes.begin(), check_nodes.end(), std::back_inserter(check_node_device_list), + [](auto &node) -> std::tuple> { + auto attr_map = node.second; + std::vector device_id_str = std::get>(attr_map["device_id"]); + std::vector device_id; + std::transform( + device_id_str.begin(), device_id_str.end(), std::back_inserter(device_id), + [](std::string &id_str) -> std::uint32_t { return static_cast(std::stoul(id_str)); }); + return std::make_tuple(node.first, device_id); + }); + + std::transform( + check_nodes.begin(), check_nodes.end(), std::back_inserter(check_node_graph_list), + [](auto &node) -> std::tuple> { + auto attr_map = node.second; + std::vector root_graph_id_str = std::get>(attr_map["root_graph_id"]); + std::vector root_graph_id; + std::transform( + root_graph_id_str.begin(), root_graph_id_str.end(), std::back_inserter(root_graph_id), + [](std::string &graph_str) -> std::uint32_t { return static_cast(std::stoul(graph_str)); }); + return std::make_tuple(node.first, root_graph_id); + }); + + std::transform( + parameter_list.begin(), parameter_list.end(), std::back_inserter(parameter_list_backend), + [](const parameter_t ¶meter) -> DebugServices::parameter_t { + return DebugServices::parameter_t{parameter.name, parameter.disabled, parameter.value, parameter.hit}; + }); + + debug_services->AddWatchpoint(id, watch_condition, 0, check_node_list, parameter_list_backend, + &check_node_device_list, &check_node_graph_list); + MS_LOG(INFO) << "cpp end"; + return 0; +} + +int32_t DbgServices::RemoveWatchpoint(unsigned int id) { + MS_LOG(INFO) << "cpp DbgServices RemoveWatchpoint id " << id; + debug_services->RemoveWatchpoint(id); + return 0; +} + +std::vector DbgServices::CheckWatchpoints(unsigned int iteration) { + MS_LOG(INFO) << "cpp DbgServices CheckWatchpoint iteration " << iteration; + + std::vector name; + std::vector slot; + std::vector condition; + std::vector watchpoint_id; + std::vector overflow_ops; + std::vector> 
parameters; + std::vector error_codes; + std::vector device_id; + std::vector root_graph_id; + // #ifdef ENABLE_D + // overflow_ops = CheckOpOverflow(); + // #endif + + std::vector> tensor_list; + tensor_list = debug_services->ReadNeededDumpedTensors(iteration); + + debug_services->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, ¶meters, &error_codes, overflow_ops, + &tensor_list, false, true, true, &device_id, &root_graph_id); + + std::vector hits; + for (unsigned int i = 0; i < name.size(); i++) { + std::vector ¶meter = parameters[i]; + std::vector api_parameter_vector; + for (const auto &p : parameter) { + parameter_t api_parameter(p.name, p.disabled, p.value, p.hit, p.actual_value); + api_parameter_vector.push_back(api_parameter); + } + watchpoint_hit_t hit(name[i], std::stoi(slot[i]), condition[i], watchpoint_id[i], api_parameter_vector, + error_codes[i], device_id[i], root_graph_id[i]); + + MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t name " << hit.name; + MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t slot " << hit.slot; + MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t watchpoint_id " << hit.watchpoint_id; + MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t error_code " << hit.error_code; + MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t device_id " << hit.device_id; + MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t root_graph_id " << hit.root_graph_id; + + for (auto const ¶meter_i : api_parameter_vector) { + MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter name " << parameter_i.name; + MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter disabled " << parameter_i.disabled; + MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter value " << parameter_i.value; + MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter hit " << parameter_i.hit; + MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter actual_value " << parameter_i.actual_value; + } + + hits.push_back(hit); + } + return hits; +} + +std::string GetTensorFullName(tensor_info_t info) { + std::string node_name = info.node_name; + if (info.is_parameter) { + // scopes in node name are separated by '/' + // use the name without scope if truncate is true + std::size_t found = node_name.find_last_of("/"); + node_name = node_name.substr(found + 1); + } + return node_name + ":" + std::to_string(info.slot); +} + +unsigned int GetTensorDeviceId(tensor_info_t info) { return info.device_id; } + +unsigned int GetTensorRootGraphId(tensor_info_t info) { return info.root_graph_id; } + +unsigned int GetTensorIteration(tensor_info_t info) { return info.iteration; } + +unsigned int GetTensorSlot(tensor_info_t info) { return info.slot; } + +std::vector DbgServices::ReadTensors(std::vector info) { + for (auto i : info) { + MS_LOG(INFO) << "cpp DbgServices ReadTensor info name " << i.node_name << ", slot " << i.slot << ", iteration " + << i.iteration << ", device_id " << i.device_id << ", root_graph_id " << i.root_graph_id; + } + std::vector backend_name; + std::vector device_id; + std::vector root_graph_id; + std::vector iteration; + std::vector slot; + std::vector> result_list; + std::vector tensors_read; + + std::transform(info.begin(), info.end(), std::back_inserter(backend_name), GetTensorFullName); + std::transform(info.begin(), info.end(), std::back_inserter(slot), GetTensorSlot); + std::transform(info.begin(), info.end(), std::back_inserter(device_id), GetTensorDeviceId); + std::transform(info.begin(), info.end(), std::back_inserter(root_graph_id), GetTensorRootGraphId); + 
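// the backend API takes one parallel vector per tensor_info_t field rather than the structs themselves +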
std::transform(info.begin(), info.end(), std::back_inserter(iteration), GetTensorIteration); + + MS_LOG(INFO) << "cpp before"; + debug_services->ReadDumpedTensor(backend_name, slot, device_id, iteration, root_graph_id, &result_list); + MS_LOG(INFO) << "cpp after"; + + for (auto result : result_list) { + tensor_data_t tensor_data_item(result->GetDataPtr(), result->GetByteSize(), result->GetType(), result->GetShape()); + tensors_read.push_back(tensor_data_item); + } + MS_LOG(INFO) << "cpp end"; + return tensors_read; +} diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.h b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.h new file mode 100644 index 00000000000..e64fd801832 --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.h @@ -0,0 +1,149 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DEBUG_DBG_SERVICES_H_ +#define DEBUG_DBG_SERVICES_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "pybind11/stl_bind.h" + +#include "debug/debug_services.h" +namespace py = pybind11; + +typedef struct parameter { + parameter(const std::string &name, bool disabled, double value, bool hit, double actual_value) + : name(name), disabled(disabled), value(value), hit(hit), actual_value(actual_value) {} + const std::string get_name() const { return name; } + const bool get_disabled() const { return disabled; } + const double get_value() const { return value; } + const bool get_hit() const { return hit; } + const double get_actual_value() const { return actual_value; } + std::string name; + bool disabled; + double value; + bool hit; + double actual_value; +} parameter_t; + +typedef struct watchpoint_hit { + watchpoint_hit(const std::string &name, uint32_t slot, int condition, uint32_t watchpoint_id, + const std::vector ¶meters, int32_t error_code, uint32_t device_id, + uint32_t root_graph_id) + : name(name), + slot(slot), + condition(condition), + watchpoint_id(watchpoint_id), + parameters(parameters), + error_code(error_code), + device_id(device_id), + root_graph_id(root_graph_id) {} + const std::string get_name() const { return name; } + const uint32_t get_slot() const { return slot; } + const int get_condition() const { return condition; } + const uint32_t get_watchpoint_id() const { return watchpoint_id; } + const std::vector get_parameters() const { return parameters; } + const int32_t get_error_code() const { return error_code; } + const uint32_t get_device_id() const { return device_id; } + const uint32_t get_root_graph_id() const { return root_graph_id; } + std::string name; + uint32_t slot; + int condition; + uint32_t watchpoint_id; + std::vector parameters; + int32_t error_code; + uint32_t device_id; + uint32_t root_graph_id; +} watchpoint_hit_t; + +typedef struct tensor_info { + tensor_info(const std::string &node_name, uint32_t slot, uint32_t iteration, uint32_t device_id, + 
uint32_t root_graph_id, bool is_parameter) + : node_name(node_name), + slot(slot), + iteration(iteration), + device_id(device_id), + root_graph_id(root_graph_id), + is_parameter(is_parameter) {} + const std::string get_node_name() const { return node_name; } + const uint32_t get_slot() const { return slot; } + const uint32_t get_iteration() const { return iteration; } + const uint32_t get_device_id() const { return device_id; } + const uint32_t get_root_graph_id() const { return root_graph_id; } + const bool get_is_parameter() const { return is_parameter; } + std::string node_name; + uint32_t slot; + uint32_t iteration; + uint32_t device_id; + uint32_t root_graph_id; + bool is_parameter; +} tensor_info_t; + +typedef struct tensor_data { + tensor_data(char *data_ptr, uint64_t data_size, int dtype, const std::vector &shape) + : data_size(data_size), dtype(dtype), shape(shape) { + if (data_ptr != NULL) { + this->data_ptr = py::bytes(data_ptr, data_size); + } else { + this->data_ptr = py::bytes(); + } + } + const py::bytes get_data_ptr() const { return data_ptr; } + const uint64_t get_data_size() const { return data_size; } + const int get_dtype() const { return dtype; } + const std::vector &get_shape() const { return shape; } + py::bytes data_ptr; + uint64_t data_size; + int dtype; + std::vector shape; +} tensor_data_t; + +class DbgServices { + private: + DebugServices *debug_services; + + public: + explicit DbgServices(bool verbose = false); + + DbgServices(const DbgServices &other); + + DbgServices &operator=(const DbgServices &other); + + ~DbgServices(); + + int32_t Initialize(std::string net_name, std::string dump_folder_path, bool is_sync_mode); + + int32_t AddWatchpoint( + unsigned int id, unsigned int watch_condition, + std::map>>> check_nodes, + std::vector parameter_list); + + int32_t RemoveWatchpoint(unsigned int id); + + std::vector CheckWatchpoints(unsigned int iteration); + + std::vector ReadTensors(std::vector info); + + std::string GetVersion(); +}; + +#endif // DEBUG_DBG_SERVICES_H_ diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.py b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.py new file mode 100644 index 00000000000..cb82a38908d --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.py @@ -0,0 +1,865 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +The module DbgServices provides offline debugger APIs. +""" + +import mindspore._mindspore_offline_debug as cds +from mi_validators import check_init, check_initialize, check_add_watchpoint, check_remove_watchpoint, check_check_watchpoints, check_read_tensors, check_initialize_done, check_tensor_info_init, check_tensor_data_init, check_watchpoint_hit_init, check_parameter_init + + +def get_version(): + """ + Function to return offline Debug Services version. + + Returns: + version (str): dbgServices version. 
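+ + Note: + The version string is supplied by the C++ backend (DbgServices::GetVersion), which reports "1.2.0" in this change.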
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> version = dbg_services.get_version() + """ + return cds.DbgServices(False).GetVersion() + +class DbgLogger: + """ + Offline Debug Services Logger. + + Args: + verbose (bool): whether to print logs. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> logger = dbg_services.DbgLogger(verbose=False) + """ + def __init__(self, verbose): + self.verbose = verbose + + def __call__(self, *logs): + if self.verbose: + print(logs) + + +log = DbgLogger(False) + + +class DbgServices(): + """ + Offline Debug Services class. + + Args: + dump_file_path (str): directory where the dump files are saved. + verbose (bool): whether to print logs (default: False). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path", + >>> verbose=True) + """ + + @check_init + def __init__(self, dump_file_path, verbose=False): + log.verbose = verbose + log("in Python __init__, file path is ", dump_file_path) + self.dump_file_path = dump_file_path + self.dbg_instance = cds.DbgServices(verbose) + self.version = self.dbg_instance.GetVersion() + self.verbose = verbose + self.initialized = False + + @check_initialize + def initialize(self, net_name, is_sync_mode=True): + """ + Initialize Debug Service. + + Args: + net_name (str): Network name. + is_sync_mode (bool): Whether to process dump files in synchronous mode + (default: True (synchronous)). + + Returns: + Initialized Debug Service instance. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path", + >>> verbose=True) + >>> d_init = d.initialize(net_name="network name", is_sync_mode=True) + """ + + log("in Python Initialize dump_file_path ", self.dump_file_path) + self.initialized = True + return self.dbg_instance.Initialize(net_name, self.dump_file_path, is_sync_mode) + + @check_initialize_done + @check_add_watchpoint + def add_watchpoint(self, watchpoint_id, watch_condition, check_node_list, parameter_list): + """ + Adding watchpoint to Debug Service instance. + + Args: + watchpoint_id (int): Watchpoint id. + watch_condition (int): A representation of the condition to be checked. + check_node_list (dict): Dictionary of node names (str) as key, + mapping to device_id (list of ints), root_graph_id (list of ints) and is_parameter + (bool). + parameter_list (list): List of parameters in watchpoint. Parameters should be instances of Parameter class. + Each parameter describes the value to be checked in watchpoint. + + Returns: + Debug Service instance with added watchpoint.
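+ + Note: + Watch condition codes are defined by the backend; the bundled offline tests exercise watch_condition=6 (MIN_LT) and watch_condition=18 (CHANGE_TOO_LARGE).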
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path", + >>> verbose=True) + >>> d_init = d.initialize(net_name="network name", is_sync_mode=True) + >>> d_wp = d_init.add_watchpoint(watchpoint_id=1, + >>> watch_condition=6, + >>> check_node_list={"conv2.bias" : {"device_id": [0], + "root_graph_id": [0], "is_parameter": True}}, + >>> parameter_list=[dbg_services.Parameter(name="param", + >>> disabled=False, + >>> value=0.0, + >>> hit=False, + >>> actual_value=0.0)]) + """ + + log("in Python AddWatchpoint") + parameter_list_inst = [] + for elem in parameter_list: + parameter_list_inst.append(elem.instance) + return self.dbg_instance.AddWatchpoint(watchpoint_id, watch_condition, check_node_list, parameter_list_inst) + + @check_initialize_done + @check_remove_watchpoint + def remove_watchpoint(self, watchpoint_id): + """ + Removing watchpoint from Debug Service instance. + + Args: + watchpoint_id (int): Watchpoint id. + + Returns: + Debug Service instance with removed watchpoint. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path", + >>> verbose=True) + >>> d_init = d.initialize(net_name="network name", is_sync_mode=True) + >>> d_wp = d_init.add_watchpoint(watchpoint_id=1, + >>> watch_condition=6, + >>> check_node_list={"conv2.bias" : {"device_id": [5], + "root_graph_id": [0], "is_parameter": True}}, + >>> parameter_list=[dbg_services.Parameter(name="param", + >>> disabled=False, + >>> value=0.0, + >>> hit=False, + >>> actual_value=0.0)]) + >>> d_wp = d_wp.remove_watchpoint(watchpoint_id=1) + """ + + log("in Python Remove Watchpoint id ", watchpoint_id) + return self.dbg_instance.RemoveWatchpoint(watchpoint_id) + + @check_initialize_done + @check_check_watchpoints + def check_watchpoints(self, iteration): + """ + Checking watchpoint at given iteration. + + Args: + iteration (int): Watchpoint check iteration. + + Returns: + Watchpoint hit list.
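+ + Note: + Watchpoints are evaluated against the dump files of the given iteration; hits from all devices and root graphs are returned in a single list.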
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path", + >>> verbose=True) + >>> d_init = d.initialize(net_name="network name", is_sync_mode=True) + >>> d_wp = d_init.add_watchpoint(watchpoint_id=1, + >>> watch_condition=6, + >>> check_node_list={"conv2.bias" : {"device_id": [5], + "root_graph_id": [0], "is_parameter": True}}, + >>> parameter_list=[dbg_services.Parameter(name="param", + >>> disabled=False, + >>> value=0.0, + >>> hit=False, + >>> actual_value=0.0)]) + >>> watchpoints = d_wp.check_watchpoints(iteration=8) + """ + + log("in Python CheckWatchpoints iteration ", iteration) + watchpoint_list = self.dbg_instance.CheckWatchpoints(iteration) + watchpoint_hit_list = [] + for watchpoint in watchpoint_list: + name = watchpoint.get_name() + slot = watchpoint.get_slot() + condition = watchpoint.get_condition() + watchpoint_id = watchpoint.get_watchpoint_id() + parameters = watchpoint.get_parameters() + error_code = watchpoint.get_error_code() + device_id = watchpoint.get_device_id() + root_graph_id = watchpoint.get_root_graph_id() + param_list = [] + for param in parameters: + p_name = param.get_name() + disabled = param.get_disabled() + value = param.get_value() + hit = param.get_hit() + actual_value = param.get_actual_value() + param_list.append(Parameter(p_name, disabled, value, hit, actual_value)) + watchpoint_hit_list.append(WatchpointHit(name, slot, condition, watchpoint_id, + param_list, error_code, device_id, root_graph_id)) + return watchpoint_hit_list + + @check_initialize_done + @check_read_tensors + def read_tensors(self, info): + """ + Returning a list of tensor data objects describing the requested tensors. + + Args: + info (list): List of TensorInfo objects. + + Returns: + TensorData list (list). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path", + >>> verbose=True) + >>> d_init = d.initialize(net_name="network name", is_sync_mode=True) + >>> tensor_data_list = d_init.read_tensors([dbg_services.TensorInfo(node_name="conv2.bias", + >>> slot=0, + >>> iteration=8, + >>> device_id=5, + >>> root_graph_id=0, + >>> is_parameter=True)]) + """ + + log("in Python ReadTensors info ", info) + info_list_inst = [] + for elem in info: + info_list_inst.append(elem.instance) + tensor_data_list = self.dbg_instance.ReadTensors(info_list_inst) + tensor_data_list_ret = [] + for elem in tensor_data_list: + if elem.get_data_size() == 0: + tensor_data = TensorData(b'', elem.get_data_size(), elem.get_dtype(), elem.get_shape()) + else: + tensor_data = TensorData(elem.get_data_ptr(), elem.get_data_size(), elem.get_dtype(), elem.get_shape()) + tensor_data_list_ret.append(tensor_data) + return tensor_data_list_ret + +class TensorInfo(): + """ + Tensor Information class. + + Args: + node_name (str): Fully qualified name of the desired node. + slot (int): The particular output for the requested node. + iteration (int): The desired iteration to gather tensor information. + device_id (int): The desired device id to gather tensor information. + root_graph_id (int): The root graph id to gather tensor information. + is_parameter (bool): Whether node is a parameter (input, constant, bias, parameter).
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias", + >>> slot=0, + >>> iteration=8, + >>> device_id=5, + >>> root_graph_id=0, + >>> is_parameter=True) + """ + + @check_tensor_info_init + def __init__(self, node_name, slot, iteration, device_id, root_graph_id, is_parameter): + self.instance = cds.tensor_info(node_name, slot, iteration, device_id, root_graph_id, is_parameter) + + @property + def node_name(self): + """ + Function to receive TensorInfo node_name. + + Returns: + node_name of TensorInfo instance (str). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias", + >>> slot=0, + >>> iteration=8, + >>> device_id=5, + >>> root_graph_id=0, + >>> is_parameter=True) + >>> name = tensor_info.node_name + """ + + return self.instance.get_node_name() + + @property + def slot(self): + """ + Function to receive TensorInfo slot. + + Returns: + slot of TensorInfo instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias", + >>> slot=0, + >>> iteration=8, + >>> device_id=5, + >>> root_graph_id=0, + >>> is_parameter=True) + >>> slot = tensor_info.slot + """ + + return self.instance.get_slot() + + @property + def iteration(self): + """ + Function to receive TensorInfo iteration. + + Returns: + iteration of TensorInfo instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias", + >>> slot=0, + >>> iteration=8, + >>> device_id=5, + >>> root_graph_id=0, + >>> is_parameter=True) + >>> iteration = tensor_info.iteration + """ + + return self.instance.get_iteration() + + @property + def device_id(self): + """ + Function to receive TensorInfo device_id. + + Returns: + device_id of TensorInfo instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias", + >>> slot=0, + >>> iteration=8, + >>> device_id=5, + >>> root_graph_id=0, + >>> is_parameter=True) + >>> device_id = tensor_info.device_id + """ + + return self.instance.get_device_id() + + @property + def root_graph_id(self): + """ + Function to receive TensorInfo root_graph_id. + + Returns: + root_graph_id of TensorInfo instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias", + >>> slot=0, + >>> iteration=8, + >>> device_id=5, + >>> root_graph_id=0, + >>> is_parameter=True) + >>> root_graph_id = tensor_info.root_graph_id + """ + + return self.instance.get_root_graph_id() + + @property + def is_parameter(self): + """ + Function to receive TensorInfo is_parameter. + + Returns: + is_parameter of TensorInfo instance (bool). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias", + >>> slot=0, + >>> iteration=8, + >>> device_id=5, + >>> root_graph_id=0, + >>> is_parameter=True) + >>> is_parameter = tensor_info.is_parameter + """ + + return self.instance.get_is_parameter() + +class TensorData(): + """ + TensorData class. + + Args: + data_ptr (byte): Data pointer. + data_size (int): Size of data in bytes.
+ dtype (int): An encoding representing the type of TensorData. + shape (list): Shape of tensor. + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0', + >>> data_size=4, + >>> dtype=0, + >>> shape=[2, 2]) + """ + + @check_tensor_data_init + def __init__(self, data_ptr, data_size, dtype, shape): + self.instance = cds.tensor_data(data_ptr, data_size, dtype, shape) + + @property + def data_ptr(self): + """ + Function to receive TensorData data_ptr. + + Returns: + data_ptr of TensorData instance (byte). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0', + >>> data_size=4, + >>> dtype=0, + >>> shape=[2, 2]) + >>> data_ptr = tensor_data.data_ptr + """ + + return self.instance.get_data_ptr() + + @property + def data_size(self): + """ + Function to receive TensorData data_size. + + Returns: + data_size of TensorData instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0', + >>> data_size=4, + >>> dtype=0, + >>> shape=[2, 2]) + >>> data_size = tensor_data.data_size + """ + + return self.instance.get_data_size() + + @property + def dtype(self): + """ + Function to receive TensorData dtype. + + Returns: + dtype of TensorData instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0', + >>> data_size=4, + >>> dtype=0, + >>> shape=[2, 2]) + >>> dtype = tensor_data.dtype + """ + + return self.instance.get_dtype() + + @property + def shape(self): + """ + Function to receive TensorData shape. + + Returns: + shape of TensorData instance (list). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0', + >>> data_size=4, + >>> dtype=0, + >>> shape=[2, 2]) + >>> shape = tensor_data.shape + """ + + return self.instance.get_shape() + +class WatchpointHit(): + """ + WatchpointHit class. + + Args: + name (str): Name of WatchpointHit instance. + slot (int): The numerical label of an output. + condition (int): A representation of the condition to be checked. + watchpoint_id (int): Watchpoint id. + parameters (list): A list of all parameters for WatchpointHit instance. + Parameters have to be instances of Parameter class. + error_code (int): An explanation of certain scenarios where watchpoint could not be checked. + device_id (int): Device id where the watchpoint is hit. + root_graph_id (int): Root graph id where the watchpoint is hit. 
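+ + Note: + An error_code of 0 indicates the watchpoint was evaluated normally; nonzero codes flag the scenarios where it could not be checked.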
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1", + >>> slot=1, + >>> condition=2, + >>> watchpoint_id=3, + >>> parameters=[param1, param2], + >>> error_code=0, + >>> device_id=1, + >>> root_graph_id=1) + """ + + @check_watchpoint_hit_init + def __init__(self, name, slot, condition, watchpoint_id, parameters, error_code, device_id, root_graph_id): + parameter_list_inst = [] + for elem in parameters: + parameter_list_inst.append(elem.instance) + self.instance = cds.watchpoint_hit(name, slot, condition, watchpoint_id, + parameter_list_inst, error_code, device_id, root_graph_id) + + @property + def name(self): + """ + Function to receive WatchpointHit name. + + Returns: + name of WatchpointHit instance (str). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1", + >>> slot=1, + >>> condition=2, + >>> watchpoint_id=3, + >>> parameters=[param1, param2], + >>> error_code=0, + >>> device_id=1, + >>> root_graph_id=1) + >>> name = watchpoint_hit.name + """ + + return self.instance.get_name() + + @property + def slot(self): + """ + Function to receive WatchpointHit slot. + + Returns: + slot of WatchpointHit instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1", + >>> slot=1, + >>> condition=2, + >>> watchpoint_id=3, + >>> parameters=[param1, param2], + >>> error_code=0, + >>> device_id=1, + >>> root_graph_id=1) + >>> slot = watchpoint_hit.slot + """ + + return self.instance.get_slot() + + @property + def condition(self): + """ + Function to receive WatchpointHit condition. + + Returns: + condition of WatchpointHit instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1", + >>> slot=1, + >>> condition=2, + >>> watchpoint_id=3, + >>> parameters=[param1, param2], + >>> error_code=0, + >>> device_id=1, + >>> root_graph_id=1) + >>> condition = watchpoint_hit.condition + """ + + return self.instance.get_condition() + + @property + def watchpoint_id(self): + """ + Function to receive WatchpointHit watchpoint_id. + + Returns: + watchpoint_id of WatchpointHit instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1", + >>> slot=1, + >>> condition=2, + >>> watchpoint_id=3, + >>> parameters=[param1, param2], + >>> error_code=0, + >>> device_id=1, + >>> root_graph_id=1) + >>> watchpoint_id = watchpoint_hit.watchpoint_id + """ + + return self.instance.get_watchpoint_id() + + @property + def parameters(self): + """ + Function to receive WatchpointHit parameters. + + Returns: + List of parameters of WatchpointHit instance (list). 
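+ + Note: + Each access rebuilds Python Parameter wrappers from the backend parameter objects, so the returned list is a fresh copy.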
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1", + >>> slot=1, + >>> condition=2, + >>> watchpoint_id=3, + >>> parameters=[param1, param2], + >>> error_code=0, + >>> device_id=1, + >>> root_graph_id=1) + >>> parameters = watchpoint_hit.parameters + """ + + params = self.instance.get_parameters() + param_list = [] + for elem in params: + tmp = Parameter(elem.get_name(), + elem.get_disabled(), + elem.get_value(), + elem.get_hit(), + elem.get_actual_value()) + param_list.append(tmp) + return param_list + + @property + def error_code(self): + """ + Function to receive WatchpointHit error_code. + + Returns: + error_code of WatchpointHit instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1", + >>> slot=1, + >>> condition=2, + >>> watchpoint_id=3, + >>> parameters=[param1, param2], + >>> error_code=0, + >>> device_id=1, + >>> root_graph_id=1) + >>> error_code = watchpoint_hit.error_code + """ + + return self.instance.get_error_code() + + @property + def device_id(self): + """ + Function to receive WatchpointHit device_id. + + Returns: + device_id of WatchpointHit instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1", + >>> slot=1, + >>> condition=2, + >>> watchpoint_id=3, + >>> parameters=[param1, param2], + >>> error_code=0, + >>> device_id=1, + >>> root_graph_id=1) + >>> device_id = watchpoint_hit.device_id + """ + + return self.instance.get_device_id() + + @property + def root_graph_id(self): + """ + Function to receive WatchpointHit root_graph_id. + + Returns: + root_graph_id of WatchpointHit instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1", + >>> slot=1, + >>> condition=2, + >>> watchpoint_id=3, + >>> parameters=[param1, param2], + >>> error_code=0, + >>> device_id=1, + >>> root_graph_id=1) + >>> root_graph_id = watchpoint_hit.root_graph_id + """ + + return self.instance.get_root_graph_id() + +class Parameter(): + """ + Parameter class. + + Args: + name (str): Name of the parameter. + disabled (bool): Whether the parameter is disabled and thus not checked by the backend. + value (float): Threshold value of the parameter. + hit (bool): Whether this parameter triggered watchpoint (default is False). + actual_value (float): Actual value of the parameter (default is 0.0). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> parameter = dbg_services.Parameter(name="param", + >>> disabled=False, + >>> value=0.0, + >>> hit=False, + >>> actual_value=0.0) + """ + + @check_parameter_init + def __init__(self, name, disabled, value, hit=False, actual_value=0.0): + self.instance = cds.parameter(name, disabled, value, hit, actual_value) + + @property + def name(self): + """ + Function to receive Parameter name. + + Returns: + name of Parameter instance (str). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> parameter = dbg_services.Parameter(name="param", + >>> disabled=False, + >>> value=0.0, + >>> hit=False, + >>> actual_value=0.0) + >>> name = parameter.name + """ + + return self.instance.get_name() + + @property + def disabled(self): + """ + Function to receive Parameter disabled value.
+ + Returns: + disabled of Parameter instance (bool). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> parameter = dbg_services.Parameter(name="param", + >>> disabled=False, + >>> value=0.0, + >>> hit=False, + >>> actual_value=0.0) + >>> disabled = parameter.disabled + """ + + return self.instance.get_disabled() + + @property + def value(self): + """ + Function to receive Parameter value. + + Returns: + value of Parameter instance (float). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> parameter = dbg_services.Parameter(name="param", + >>> disabled=False, + >>> value=0.0, + >>> hit=False, + >>> actual_value=0.0) + >>> value = parameter.value + """ + + return self.instance.get_value() + + @property + def hit(self): + """ + Function to receive Parameter hit value. + + Returns: + hit of Parameter instance (bool). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> parameter = dbg_services.Parameter(name="param", + >>> disabled=False, + >>> value=0.0, + >>> hit=False, + >>> actual_value=0.0) + >>> hit = parameter.hit + """ + + return self.instance.get_hit() + + @property + def actual_value(self): + """ + Function to receive Parameter actual_value. + + Returns: + actual_value of Parameter instance (float). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> parameter = dbg_services.Parameter(name="param", + >>> disabled=False, + >>> value=0.0, + >>> hit=False, + >>> actual_value=0.0) + >>> actual_value = parameter.actual_value + """ + + return self.instance.get_actual_value() diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/run_tests b/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/run_tests new file mode 100755 index 00000000000..e8713c1ed8d --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/run_tests @@ -0,0 +1,24 @@ +python sync_trans_false_read_tensors.py > sync_trans_false_read_tensors.actual +diff sync_trans_false_read_tensors.actual sync_trans_false_read_tensors.expected +if [ $? -eq 0 ]; then + echo sync_trans_false_read_tensors PASSED +else + echo sync_trans_false_read_tensors FAILED +fi + +python sync_trans_true_read_tensors.py > sync_trans_true_read_tensors.actual +diff sync_trans_true_read_tensors.actual sync_trans_true_read_tensors.expected +if [ $? -eq 0 ]; then + echo sync_trans_true_read_tensors PASSED +else + echo sync_trans_true_read_tensors FAILED +fi + +python sync_trans_false_watchpoints.py > sync_trans_false_watchpoints.actual +diff sync_trans_false_watchpoints.actual sync_trans_false_watchpoints.expected +if [ $?
-eq 0 ]; then + echo sync_trans_false_watchpoints PASSED +else + echo sync_trans_false_watchpoints FAILED +fi + diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_false_read_tensors.expected b/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_false_read_tensors.expected new file mode 100644 index 00000000000..afcae6da915 --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_false_read_tensors.expected @@ -0,0 +1,70 @@ +----------------------------------------------------------- +tensor_info_1 attributes: +node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias +slot = 0 +iteration = 2 +device_id = None +root_graph_id = 0 +is_parameter = True + +tensor_data_1 attributes: +data (printed in uint8) = [ 0 0 0 0 195 127 0 0 176 202 195 248 194 127 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 58 196 248 + 194 127 0 0 17 0 0 0 0 0 0 0 160 76 6 140 195 127 + 0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127 0 0 + 64 195 195 248 194 127 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 88 1 196 248 194 127 0 0 18 0 0 0 + 0 0 0 0 160 47 6 140 195 127 0 0 69 0 0 0 0 0 + 0 0 1 0 0 0 195 127 0 0 176 203 195 248 194 127 0 0 + 176 204 195 248 194 127 0 0 0 0 0 0 0 0 0 0 216 241 + 195 248 194 127 0 0 19 0 0 0 0 0 0 0 96 39 6 140 + 195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127 + 0 0 112 52 196 248 194 127 0 0 176 52 196 248 194 127 0 0 + 0 0 0 0 0 0 0 0 88 250 195 248 194 127 0 0 20 0 + 0 0 0 0 0 0 128 130 5 140 195 127 0 0 69 0 0 0 + 0 0 0 0 0 0 0 0 195 127 0 0 208 136 195 248 194 127 + 0 0 176 202 195 248 194 127 0 0 48 52 196 248 194 127 0 0 + 184 247 195 248 194 127 0 0 21 0 0 0 0 0 0 0 176 213 + 4 140 195 127 0 0 69 0 0 0 0 0 0 0 0 0 0 0 + 195 127 0 0 48 52 196 248 194 127 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 8 249 195 248 194 127 0 0 + 22 0 0 0 0 0 0 0 16 46 4 140 195 127 0 0 69 0 + 0 0 0 0 0 0 1 0 0 0 195 127 0 0 64 137 195 248 + 194 127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 88 12 196 248 194 127 0 0 23 0 0 0 0 0 0 0 + 32 137 3 140 195 127 0 0 85 0 0 0 0 0 0 0 0 0 + 0 0 195 127 0 0 176 202 195 248 194 127 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 104 246 195 248 194 127 + 0 0 24 0 0 0 0 0 0 0 48 104 15 140 195 127 0 0 + 32 104 15 140 195 127 0 0] +size in bytes = 512 +debugger dtype = 11 +shape = [128] +----------------------------------------------------------- +tensor_info_2 attributes: +node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308 +slot = 0 +iteration = 2 +device_id = None +root_graph_id = 0 +is_parameter = False + +tensor_data_2 attributes: +data (printed in uint8) = [ 0 169 0 ... 152 242 63] +size in bytes = 4153344 +debugger dtype = 11 +shape = [32, 192, 13, 13] +----------------------------------------------------------- +tensor_info_3 attributes: +node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300 +slot = 1 +iteration = 2 +device_id = None +root_graph_id = 0 +is_parameter = False + +tensor_data_3 attributes: +data (printed in uint8) = [ 0 169 0 ... 
217 4 52] +size in bytes = 831744 +debugger dtype = 8 +shape = [207936] diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_false_read_tensors.py b/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_false_read_tensors.py new file mode 100644 index 00000000000..f963e6b3a6e --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_false_read_tensors.py @@ -0,0 +1,74 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Read tensor test script for offline debugger APIs. +""" + +import mindspore.offline_debug.dbg_services as d +import numpy as np + + +def main(): + + debugger_backend = d.DbgServices( + dump_file_path="/home/jtzanaka/dumps/sync_trans_false/032421/alexnet") + + _ = debugger_backend.initialize( + net_name="Network Name goes here!", is_sync_mode=True) + + # parameter + info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias", + slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True) + # output tensor with zero slot + info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308", + slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False) + # output tensor with non-zero slot + info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300", + slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False) + + tensor_info = [info1, info2, info3] + + tensor_data = debugger_backend.read_tensors(tensor_info) + + print_read_tensors(tensor_info, tensor_data) + + +def print_read_tensors(tensor_info, tensor_data): + """Print read tensors.""" + for x, _ in enumerate(tensor_info): + print("-----------------------------------------------------------") + print("tensor_info_" + str(x+1) + " attributes:") + print("node name = ", tensor_info[x].node_name) + print("slot = ", tensor_info[x].slot) + print("iteration = ", tensor_info[x].iteration) + print("device_id = ", tensor_info[x].device_id) + print("root_graph_id = ", tensor_info[x].root_graph_id) + print("is_parameter = ", tensor_info[x].is_parameter) + print() + print("tensor_data_" + str(x+1) + " attributes:") + print("data (printed in uint8) = ", np.frombuffer( + tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + py_byte_size = len(tensor_data[x].data_ptr) + c_byte_size = tensor_data[x].data_size + if c_byte_size != py_byte_size: + print("The python byte size of ", py_byte_size, + " does not match the C++ byte size of ", c_byte_size) + print("size in bytes = ", tensor_data[x].data_size) + print("debugger dtype = ", tensor_data[x].dtype) + print("shape = ", tensor_data[x].shape) + + +if __name__ == "__main__": + main() diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_false_watchpoints.expected 
b/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_false_watchpoints.expected new file mode 100644 index 00000000000..5928d44d547 --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_false_watchpoints.expected @@ -0,0 +1,33 @@ +----------------------------------------------------------- +watchpoint_hit for test_1 attributes: +name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308 +slot = 0 +condition = 6 +watchpoint_id = 1 +parameter 0 name = param +parameter 0 disabled = False +parameter 0 value = 0.0 +parameter 0 hit = True +parameter 0 actual_value = -2.429065704345703 +error code = 0 +device_id = 0 +root_graph_id = 0 +----------------------------------------------------------- +watchpoint_hit for test_4 attributes: +name = Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc3.bias +slot = 0 +condition = 18 +watchpoint_id = 3 +parameter 0 name = abs_mean_update_ratio_gt +parameter 0 disabled = False +parameter 0 value = 0.0 +parameter 0 hit = True +parameter 0 actual_value = 1.793662034335766e-35 +parameter 1 name = epsilon +parameter 1 disabled = True +parameter 1 value = 0.0 +parameter 1 hit = False +parameter 1 actual_value = 0.0 +error code = 0 +device_id = 0 +root_graph_id = 0 diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_false_watchpoints.py b/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_false_watchpoints.py new file mode 100644 index 00000000000..870fcf75cda --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_false_watchpoints.py @@ -0,0 +1,109 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Watchpoints test script for offline debugger APIs. 
+""" + +import mindspore.offline_debug.dbg_services as d + + +def main(): + + debugger_backend = d.DbgServices( + dump_file_path="/home/jtzanaka/dumps/sync_trans_false/032421/alexnet") + + _ = debugger_backend.initialize( + net_name="Network Name goes here!", is_sync_mode=True) + + # NOTES: + # -> watch_condition=6 is MIN_LT + # -> watch_condition=18 is CHANGE_TOO_LARGE + + # test 1: watchpoint set and hit (watch_condition=6) + param1 = d.Parameter(name="param", disabled=False, value=0.0) + _ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6, + check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/" + "Conv2D-op308": + {"device_id": [0], "root_graph_id": [0], "is_parameter": False + }}, parameter_list=[param1]) + + watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2) + if len(watchpoint_hits_test_1) != 1: + print("ERROR -> test 1: watchpoint set but not hit just once") + print_watchpoint_hits(watchpoint_hits_test_1, 1) + + # test 2: watchpoint remove and ensure it's not hit + _ = debugger_backend.remove_watchpoint(watchpoint_id=1) + watchpoint_hits_test_2 = debugger_backend.check_watchpoints(iteration=2) + if watchpoint_hits_test_2: + print("ERROR -> test 2: watchpoint removed but hit") + + # test 3: watchpoint set and not hit, then remove + param2 = d.Parameter(name="param", disabled=False, value=-1000.0) + _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6, + check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/" + "Conv2D-op308": + {"device_id": [0], "root_graph_id": [0], "is_parameter": False + }}, parameter_list=[param2]) + + watchpoint_hits_test_3 = debugger_backend.check_watchpoints(iteration=2) + if watchpoint_hits_test_3: + print("ERROR -> test 3: watchpoint set but not supposed to be hit") + _ = debugger_backend.remove_watchpoint(watchpoint_id=2) + + # test 4: weight change watchpoint set and hit + param_abs_mean_update_ratio_gt = d.Parameter( + name="abs_mean_update_ratio_gt", disabled=False, value=0.0) + param_epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0) + _ = debugger_backend.add_watchpoint(watchpoint_id=3, watch_condition=18, + check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/" + "Parameter[6]_11/fc3.bias": + {"device_id": [0], "root_graph_id": [0], "is_parameter": True + }}, parameter_list=[param_abs_mean_update_ratio_gt, + param_epsilon]) + + watchpoint_hits_test_4 = debugger_backend.check_watchpoints(iteration=3) + if len(watchpoint_hits_test_4) != 1: + print("ERROR -> test 4: watchpoint weight change set but not hit just once") + print_watchpoint_hits(watchpoint_hits_test_4, 4) + + +def print_watchpoint_hits(watchpoint_hits, test_id): + """Print watchpoint hits.""" + for x, _ in enumerate(watchpoint_hits): + print("-----------------------------------------------------------") + print("watchpoint_hit for test_%u attributes:" % test_id) + print("name = ", watchpoint_hits[x].name) + print("slot = ", watchpoint_hits[x].slot) + print("condition = ", watchpoint_hits[x].condition) + print("watchpoint_id = ", watchpoint_hits[x].watchpoint_id) + for p, _ in enumerate(watchpoint_hits[x].parameters): + print("parameter ", p, " name = ", + watchpoint_hits[x].parameters[p].name) + print("parameter ", p, " disabled = ", + watchpoint_hits[x].parameters[p].disabled) + print("parameter ", p, " value = ", + watchpoint_hits[x].parameters[p].value) + print("parameter ", p, " hit = ", + watchpoint_hits[x].parameters[p].hit) + print("parameter 
", p, " actual_value = ", + watchpoint_hits[x].parameters[p].actual_value) + print("error code = ", watchpoint_hits[x].error_code) + print("device_id = ", watchpoint_hits[x].device_id) + print("root_graph_id = ", watchpoint_hits[x].root_graph_id) + + +if __name__ == "__main__": + main() diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_true_read_tensors.expected b/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_true_read_tensors.expected new file mode 100644 index 00000000000..286072a0431 --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_true_read_tensors.expected @@ -0,0 +1,70 @@ +----------------------------------------------------------- +tensor_info_1 attributes: +node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias +slot = 0 +iteration = 2 +device_id = None +root_graph_id = 0 +is_parameter = True + +tensor_data_1 attributes: +data (printed in uint8) = [ 1 0 0 0 195 127 0 0 80 58 118 65 195 127 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 40 186 117 65 + 195 127 0 0 5 0 0 0 0 0 0 0 160 76 6 204 195 127 + 0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127 0 0 + 48 135 117 65 195 127 0 0 16 58 118 65 195 127 0 0 144 58 + 118 65 195 127 0 0 168 186 117 65 195 127 0 0 6 0 0 0 + 0 0 0 0 160 47 6 204 195 127 0 0 69 0 0 0 0 0 + 0 0 1 0 0 0 195 127 0 0 80 58 118 65 195 127 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 184 249 + 117 65 195 127 0 0 7 0 0 0 0 0 0 0 96 39 6 204 + 195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127 + 0 0 224 218 117 65 195 127 0 0 0 0 0 0 0 0 0 0 + 224 219 117 65 195 127 0 0 200 17 118 65 195 127 0 0 8 0 + 0 0 0 0 0 0 128 130 5 204 195 127 0 0 69 0 0 0 + 0 0 0 0 1 0 0 0 195 127 0 0 120 233 255 59 196 127 + 0 0 224 217 117 65 195 127 0 0 224 214 117 65 195 127 0 0 + 120 250 117 65 195 127 0 0 9 0 0 0 0 0 0 0 176 213 + 4 204 195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0 + 195 127 0 0 240 66 118 65 195 127 0 0 160 218 117 65 195 127 + 0 0 224 215 117 65 195 127 0 0 40 9 118 65 195 127 0 0 + 10 0 0 0 0 0 0 0 16 46 4 204 195 127 0 0 69 0 + 0 0 0 0 0 0 1 0 0 0 195 127 0 0 208 59 118 65 + 195 127 0 0 0 0 0 0 0 0 0 0 96 218 117 65 195 127 + 0 0 56 251 117 65 195 127 0 0 11 0 0 0 0 0 0 0 + 32 137 3 204 195 127 0 0 85 0 0 0 0 0 0 0 1 0 + 0 0 195 127 0 0 224 214 117 65 195 127 0 0 144 59 118 65 + 195 127 0 0 160 214 117 65 195 127 0 0 136 62 118 65 195 127 + 0 0 12 0 0 0 0 0 0 0 48 104 15 204 195 127 0 0 + 32 104 15 204 195 127 0 0] +size in bytes = 512 +debugger dtype = 11 +shape = [128] +----------------------------------------------------------- +tensor_info_2 attributes: +node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308 +slot = 0 +iteration = 2 +device_id = None +root_graph_id = 0 +is_parameter = False + +tensor_data_2 attributes: +data (printed in uint8) = [206 239 74 ... 53 201 62] +size in bytes = 4153344 +debugger dtype = 11 +shape = [32, 192, 13, 13] +----------------------------------------------------------- +tensor_info_3 attributes: +node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300 +slot = 1 +iteration = 2 +device_id = None +root_graph_id = 0 +is_parameter = False + +tensor_data_3 attributes: +data (printed in uint8) = [206 239 74 ... 
16 239 51] +size in bytes = 831744 +debugger dtype = 8 +shape = [207936] diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_true_read_tensors.py b/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_true_read_tensors.py new file mode 100644 index 00000000000..54ef2bd5c1d --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/gpu_tests/sync_trans_true_read_tensors.py @@ -0,0 +1,74 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Read tensor test script for offline debugger APIs. +""" + +import mindspore.offline_debug.dbg_services as d +import numpy as np + + +def main(): + + debugger_backend = d.DbgServices( + dump_file_path="/home/jtzanaka/dumps/sync_trans_true/032421/alexnet") + + _ = debugger_backend.initialize( + net_name="Network Name goes here!", is_sync_mode=True) + + # parameter + info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias", + slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True) + # output tensor with zero slot + info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308", + slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False) + # output tensor with non-zero slot + info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300", + slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False) + + tensor_info = [info1, info2, info3] + + tensor_data = debugger_backend.read_tensors(tensor_info) + + print_read_tensors(tensor_info, tensor_data) + + +def print_read_tensors(tensor_info, tensor_data): + """Print read tensors.""" + for x, _ in enumerate(tensor_info): + print("-----------------------------------------------------------") + print("tensor_info_" + str(x+1) + " attributes:") + print("node name = ", tensor_info[x].node_name) + print("slot = ", tensor_info[x].slot) + print("iteration = ", tensor_info[x].iteration) + print("device_id = ", tensor_info[x].device_id) + print("root_graph_id = ", tensor_info[x].root_graph_id) + print("is_parameter = ", tensor_info[x].is_parameter) + print() + print("tensor_data_" + str(x+1) + " attributes:") + print("data (printed in uint8) = ", np.frombuffer( + tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + py_byte_size = len(tensor_data[x].data_ptr) + c_byte_size = tensor_data[x].data_size + if c_byte_size != py_byte_size: + print("The python byte size of ", py_byte_size, + " does not match the C++ byte size of ", c_byte_size) + print("size in bytes = ", tensor_data[x].data_size) + print("debugger dtype = ", tensor_data[x].dtype) + print("shape = ", tensor_data[x].shape) + + +if __name__ == "__main__": + main() diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/mi_pybind_register.cc b/mindspore/ccsrc/debug/debugger/offline_debug/mi_pybind_register.cc new file mode 100644 
index 00000000000..d95e9efec31
--- /dev/null
+++ b/mindspore/ccsrc/debug/debugger/offline_debug/mi_pybind_register.cc
@@ -0,0 +1,66 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+#include "pybind11/stl_bind.h"
+#include "debugger/offline_debug/dbg_services.h"
+
+PYBIND11_MODULE(_mindspore_offline_debug, m) {
+  m.doc() = "pybind11 debug services api";
+  py::class_<DbgServices>(m, "DbgServices")
+    .def(py::init<bool>())
+    .def("Initialize", &DbgServices::Initialize)
+    .def("AddWatchpoint", &DbgServices::AddWatchpoint)
+    .def("RemoveWatchpoint", &DbgServices::RemoveWatchpoint)
+    .def("CheckWatchpoints", &DbgServices::CheckWatchpoints)
+    .def("ReadTensors", &DbgServices::ReadTensors)
+    .def("GetVersion", &DbgServices::GetVersion);
+
+  py::class_<parameter>(m, "parameter")
+    .def(py::init<std::string, bool, double, bool, double>())
+    .def("get_name", &parameter::get_name)
+    .def("get_disabled", &parameter::get_disabled)
+    .def("get_value", &parameter::get_value)
+    .def("get_hit", &parameter::get_hit)
+    .def("get_actual_value", &parameter::get_actual_value);
+
+  py::class_<watchpoint_hit>(m, "watchpoint_hit")
+    .def(py::init<std::string, uint32_t, int32_t, uint32_t, std::vector<parameter>, int32_t, uint32_t, uint32_t>())
+    .def("get_name", &watchpoint_hit::get_name)
+    .def("get_slot", &watchpoint_hit::get_slot)
+    .def("get_condition", &watchpoint_hit::get_condition)
+    .def("get_watchpoint_id", &watchpoint_hit::get_watchpoint_id)
+    .def("get_parameters", &watchpoint_hit::get_parameters)
+    .def("get_error_code", &watchpoint_hit::get_error_code)
+    .def("get_device_id", &watchpoint_hit::get_device_id)
+    .def("get_root_graph_id", &watchpoint_hit::get_root_graph_id);
+
+  py::class_<tensor_info>(m, "tensor_info")
+    .def(py::init<std::string, uint32_t, uint32_t, uint32_t, uint32_t, bool>())
+    .def("get_node_name", &tensor_info::get_node_name)
+    .def("get_slot", &tensor_info::get_slot)
+    .def("get_iteration", &tensor_info::get_iteration)
+    .def("get_device_id", &tensor_info::get_device_id)
+    .def("get_root_graph_id", &tensor_info::get_root_graph_id)
+    .def("get_is_parameter", &tensor_info::get_is_parameter);
+
+  py::class_<tensor_data>(m, "tensor_data")
+    .def(py::init<char *, uint64_t, int, std::vector<int64_t>>())
+    .def("get_data_ptr", &tensor_data::get_data_ptr)
+    .def("get_data_size", &tensor_data::get_data_size)
+    .def("get_dtype", &tensor_data::get_dtype)
+    .def("get_shape", &tensor_data::get_shape);
+}
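For reference, the lowercase pybind classes registered above map one-to-one onto the Python wrapper classes in dbg_services.py. A minimal smoke test of the binding, as a sketch only (the constructor signatures are reconstructed from the wrapper calls and may differ in exact C++ types), could look like:

```python
# Smoke test for the raw pybind module; dbg_services.py remains the supported
# surface. Assumes _mindspore_offline_debug has been built and installed.
import mindspore._mindspore_offline_debug as cds

raw = cds.DbgServices(False)      # verbose flag, mirrors py::init<bool>()
print(raw.GetVersion())           # version string reported by the backend

# Assumed ctor order (name, disabled, value, hit, actual_value):
param = cds.parameter("param", False, 0.0, False, 0.0)
print(param.get_name(), param.get_disabled())
```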
diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/mi_validator_helpers.py b/mindspore/ccsrc/debug/debugger/offline_debug/mi_validator_helpers.py
new file mode 100644
index 00000000000..abc4b38223f
--- /dev/null
+++ b/mindspore/ccsrc/debug/debugger/offline_debug/mi_validator_helpers.py
@@ -0,0 +1,123 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+General Validator Helper Functions.
+"""
+import os
+import inspect
+
+UINT32_MAX = 4294967295
+UINT32_MIN = 0
+UINT64_MAX = 18446744073709551615
+UINT64_MIN = 0
+
+
+def pad_arg_name(arg_name):
+    if arg_name != "":
+        arg_name = arg_name + " "
+    return arg_name
+
+
+def check_value(arg, valid_range, arg_name=""):
+    arg_name = pad_arg_name(arg_name)
+    if arg < valid_range[0] or arg > valid_range[1]:
+        raise ValueError(
+            "Input {0}is not within the required interval of ({1} to {2}).".format(arg_name,
+                                                                                   valid_range[0], valid_range[1]))
+
+
+def check_uint32(arg, arg_name=""):
+    type_check(arg, (int,), arg_name)
+    check_value(arg, [UINT32_MIN, UINT32_MAX], arg_name)
+
+
+def check_uint64(arg, arg_name=""):
+    type_check(arg, (int,), arg_name)
+    check_value(arg, [UINT64_MIN, UINT64_MAX], arg_name)
+
+
+def check_dir(dataset_dir):
+    if not os.path.isdir(dataset_dir) or not os.access(dataset_dir, os.R_OK):
+        raise ValueError("The folder {} does not exist or permission denied!".format(dataset_dir))
+
+
+def parse_user_args(method, *args, **kwargs):
+    """
+    Parse user arguments in a function.
+
+    Args:
+        method (method): a callable function.
+        args: user passed args.
+        kwargs: user passed kwargs.
+
+    Returns:
+        user_filled_args (list): values of what the user passed in for the arguments.
+        ba.arguments (Ordered Dict): ordered dict of parameter and argument for what the user has passed.
+    """
+    sig = inspect.signature(method)
+    if 'self' in sig.parameters or 'cls' in sig.parameters:
+        ba = sig.bind(method, *args, **kwargs)
+        ba.apply_defaults()
+        params = list(sig.parameters.keys())[1:]
+    else:
+        ba = sig.bind(*args, **kwargs)
+        ba.apply_defaults()
+        params = list(sig.parameters.keys())
+
+    user_filled_args = [ba.arguments.get(arg_value) for arg_value in params]
+    return user_filled_args, ba.arguments
+
+
+def type_check(arg, types, arg_name):
+    """
+    Check the type of the parameter.
+
+    Args:
+        arg (Any) : any variable.
+        types (tuple): tuple of all valid types for arg.
+        arg_name (str): the name of arg.
+
+    Raises:
+        TypeError: when the type of arg is not within types.
+    """
+    # handle special case of booleans being a subclass of ints
+    print_value = '\"\"' if repr(arg) == repr('') else arg
+
+    if int in types and bool not in types:
+        if isinstance(arg, bool):
+            raise TypeError("Argument {0} with value {1} is not of type {2}.".format(arg_name, print_value, types))
+    if not isinstance(arg, types):
+        raise TypeError("Argument {0} with value {1} is not of type {2}.".format(arg_name, print_value, types))
+
+
+def type_check_list(args, types, arg_names):
+    """
+    Check the type of each parameter in the list.
+
+    Args:
+        args (Union[list, tuple]): a list or tuple of any variable.
+        types (tuple): tuple of all valid types for arg.
+        arg_names (Union[list, tuple of str]): the names of args.
+
+    Raises:
+        TypeError or ValueError: when the types or the number of arguments do not match.
+    """
+    type_check(args, (list, tuple,), arg_names)
+    if len(args) != len(arg_names) and not isinstance(arg_names, str):
+        raise ValueError("List of arguments is not the same length as argument_names.")
+    if isinstance(arg_names, str):
+        arg_names = ["{0}[{1}]".format(arg_names, i) for i in range(len(args))]
+    for arg, arg_name in zip(args, arg_names):
+        type_check(arg, types, arg_name)
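The helpers above are the entire validation vocabulary used by the offline debugger wrappers. A short sketch of how they behave, assuming the installed package layout that dbg_services.py uses:

```python
# Illustrative only: exercises the range and directory checks.
from mindspore.offline_debug.mi_validator_helpers import check_uint32, check_dir

check_uint32(5, "slot")                    # passes silently
try:
    check_uint32(-1, "slot")               # below UINT32_MIN
except ValueError as err:
    print(err)                             # interval violation names the argument
try:
    check_dir("/nonexistent/dump/path")    # hypothetical path
except ValueError as err:
    print(err)                             # missing or unreadable folder
```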
+ """ + type_check(args, (list, tuple,), arg_names) + if len(args) != len(arg_names) and not isinstance(arg_names, str): + raise ValueError("List of arguments is not the same length as argument_names.") + if isinstance(arg_names, str): + arg_names = ["{0}[{1}]".format(arg_names, i) for i in range(len(args))] + for arg, arg_name in zip(args, arg_names): + type_check(arg, types, arg_name) diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/mi_validators.py b/mindspore/ccsrc/debug/debugger/offline_debug/mi_validators.py new file mode 100644 index 00000000000..d30515cd412 --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/mi_validators.py @@ -0,0 +1,223 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Validator Functions for Offline Debugger APIs. +""" +from functools import wraps + +import dbg_services as cds +from mi_validator_helpers import parse_user_args, type_check, type_check_list, check_dir, check_uint32, check_uint64 + + +def check_init(method): + """Wrapper method to check the parameters of DbgServices init.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [dump_file_path, verbose], _ = parse_user_args(method, *args, **kwargs) + + type_check(dump_file_path, (str,), "dump_file_path") + type_check(verbose, (bool,), "verbose") + check_dir(dump_file_path) + + return method(self, *args, **kwargs) + + return new_method + + +def check_initialize(method): + """Wrapper method to check the parameters of DbgServices Initialize method.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [net_name, is_sync_mode], _ = parse_user_args(method, *args, **kwargs) + + type_check(net_name, (str,), "net_name") + type_check(is_sync_mode, (bool,), "is_sync_mode") + + return method(self, *args, **kwargs) + + return new_method + + +def check_add_watchpoint(method): + """Wrapper method to check the parameters of DbgServices AddWatchpoint.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [id_value, watch_condition, check_node_list, parameter_list], _ = parse_user_args(method, *args, **kwargs) + + check_uint32(id_value, "id") + check_uint32(watch_condition, "watch_condition") + type_check(check_node_list, (dict,), "check_node_list") + for node_name, node_info in check_node_list.items(): + type_check(node_name, (str,), "node_name") + type_check(node_info, (dict,), "node_info") + for info_name, info_param in node_info.items(): + type_check(info_name, (str,), "node parameter name") + if info_name in ["device_id"]: + for param in info_param: + check_uint32(param, "device_id") + elif info_name in ["root_graph_id"]: + for param in info_param: + check_uint32(param, "root_graph_id") + elif info_name in ["is_parameter"]: + type_check(info_param, (bool,), "is_parameter") + else: + raise ValueError("Node parameter {} is not defined.".format(info_name)) + param_names = ["param_{0}".format(i) for i in range(len(parameter_list))] + 
+
+
+def check_remove_watchpoint(method):
+    """Wrapper method to check the parameters of DbgServices RemoveWatchpoint."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [id_value], _ = parse_user_args(method, *args, **kwargs)
+
+        check_uint32(id_value, "id")
+
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_check_watchpoints(method):
+    """Wrapper method to check the parameters of DbgServices CheckWatchpoints."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [iteration], _ = parse_user_args(method, *args, **kwargs)
+
+        check_uint32(iteration, "iteration")
+
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_read_tensors(method):
+    """Wrapper method to check the parameters of DbgServices ReadTensors."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [info_list], _ = parse_user_args(method, *args, **kwargs)
+
+        info_names = ["info_{0}".format(i) for i in range(len(info_list))]
+        type_check_list(info_list, (cds.TensorInfo,), info_names)
+
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_initialize_done(method):
+    """Wrapper method to check if initialize is done for DbgServices."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+
+        if not self.initialized:
+            raise RuntimeError("initialize() should be called before any other methods of DbgServices!")
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_tensor_info_init(method):
+    """Wrapper method to check the parameters of DbgServices TensorInfo init."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [node_name, slot, iteration, device_id, root_graph_id,
+         is_parameter], _ = parse_user_args(method, *args, **kwargs)
+
+        type_check(node_name, (str,), "node_name")
+        check_uint32(slot, "slot")
+        check_uint32(iteration, "iteration")
+        check_uint32(device_id, "device_id")
+        check_uint32(root_graph_id, "root_graph_id")
+        type_check(is_parameter, (bool,), "is_parameter")
+
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_tensor_data_init(method):
+    """Wrapper method to check the parameters of DbgServices TensorData init."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [data_ptr, data_size, dtype, shape], _ = parse_user_args(method, *args, **kwargs)
+
+        type_check(data_ptr, (bytes,), "data_ptr")
+        check_uint64(data_size, "data_size")
+        type_check(dtype, (int,), "dtype")
+        shape_names = ["shape_{0}".format(i) for i in range(len(shape))]
+        type_check_list(shape, (int,), shape_names)
+
+        if len(data_ptr) != data_size:
+            raise ValueError("data_ptr length ({0}) is not equal to data_size ({1}).".format(len(data_ptr), data_size))
+
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_watchpoint_hit_init(method):
+    """Wrapper method to check the parameters of DbgServices WatchpointHit init."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [name, slot, condition, watchpoint_id,
+         parameters, error_code, device_id, root_graph_id], _ = parse_user_args(method, *args, **kwargs)
+
+        type_check(name, (str,), "name")
+        check_uint32(slot, "slot")
+        type_check(condition, (int,), "condition")
+        check_uint32(watchpoint_id, "watchpoint_id")
+        param_names = ["param_{0}".format(i) for i in range(len(parameters))]
+        type_check_list(parameters, (cds.Parameter,), param_names)
+        type_check(error_code, (int,),
"error_code") + check_uint32(device_id, "device_id") + check_uint32(root_graph_id, "root_graph_id") + + return method(self, *args, **kwargs) + + return new_method + + +def check_parameter_init(method): + """Wrapper method to check the parameters of DbgServices Parameter init.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [name, disabled, value, hit, actual_value], _ = parse_user_args(method, *args, **kwargs) + + type_check(name, (str,), "name") + type_check(disabled, (bool,), "disabled") + type_check(value, (float,), "value") + type_check(hit, (bool,), "hit") + type_check(actual_value, (float,), "actual_value") + + return method(self, *args, **kwargs) + + return new_method diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/offline_logger.cc b/mindspore/ccsrc/debug/debugger/offline_debug/offline_logger.cc new file mode 100644 index 00000000000..bb6ebbe509a --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/offline_logger.cc @@ -0,0 +1,19 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "debugger/offline_debug/offline_logger.h" + +bool DbgLogger::verbose = false; diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/offline_logger.h b/mindspore/ccsrc/debug/debugger/offline_debug/offline_logger.h new file mode 100644 index 00000000000..3b02b06ead8 --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/offline_debug/offline_logger.h @@ -0,0 +1,59 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef OFFLINE_LOGGER_H_ +#define OFFLINE_LOGGER_H_ + +#include + +#define MS_LOG(level) MS_LOG_##level + +#define MS_LOG_INFO static_cast(0), !(DbgLogger::verbose) ? void(0) : DbgLogger(DbgLoggerLvl::INFO) < std::cout + +#define MS_LOG_ERROR MS_LOG_INFO + +#define MS_LOG_DEBUG MS_LOG_INFO + +#define MS_LOG_WARNING MS_LOG_INFO + +#define MS_LOG_EXCEPTION \ + static_cast(0), !(DbgLogger::verbose) ? 
void(0) : DbgLogger(DbgLoggerLvl::EXCEPTION) < std::cout + +enum DbgLoggerLvl : int { DEBUG = 0, INFO, WARNING, ERROR, EXCEPTION }; + +class DbgLogger { + public: + explicit DbgLogger(DbgLoggerLvl lvl) : lvl_(lvl) {} + ~DbgLogger() = default; + void operator<(std::ostream &os) const { + char *dbg_log_path = getenv("OFFLINE_DBG_LOG"); + if (dbg_log_path != NULL) { + FILE *fp; + fp = freopen(dbg_log_path, "a", stdout); + if (fp == nullptr) { + std::cout << "ERROR: DbgLogger could not redirect all stdout to a file"; + } + } + os << std::endl; + if (lvl_ == DbgLoggerLvl::EXCEPTION) { + throw; + } + } + static bool verbose; + + private: + DbgLoggerLvl lvl_; +}; +#endif // OFFLINE_LOGGER_H_ diff --git a/mindspore/ccsrc/debug/debugger/tensor_summary.cc b/mindspore/ccsrc/debug/debugger/tensor_summary.cc index afd46404020..f480aa2cfd3 100644 --- a/mindspore/ccsrc/debug/debugger/tensor_summary.cc +++ b/mindspore/ccsrc/debug/debugger/tensor_summary.cc @@ -22,7 +22,16 @@ #include #include "debug/debugger/tensor_summary.h" +#ifdef OFFLINE_DBG_MODE +#include "Eigen/Core" +#include "Eigen/src/Core/arch/CUDA/Half.h" +using float16 = Eigen::half; +#include "offline_debug/offline_logger.h" +#endif + +#ifdef ONLINE_DBG_MODE namespace mindspore { +#endif using CONDITION_TYPE = DebugServices::CONDITION_TYPE; RangeCountCalculator::RangeCountCalculator() @@ -281,4 +290,6 @@ template class TensorSummary; template class TensorSummary; template class TensorSummary; template class TensorSummary; +#ifdef ONLINE_DBG_MODE } // namespace mindspore +#endif diff --git a/mindspore/ccsrc/debug/debugger/tensor_summary.h b/mindspore/ccsrc/debug/debugger/tensor_summary.h index 0f3aff96142..5c853d7e25b 100644 --- a/mindspore/ccsrc/debug/debugger/tensor_summary.h +++ b/mindspore/ccsrc/debug/debugger/tensor_summary.h @@ -24,7 +24,9 @@ #include "debug/debug_services.h" +#ifdef ONLINE_DBG_MODE namespace mindspore { +#endif class RangeCountCalculator { public: RangeCountCalculator(); @@ -121,5 +123,7 @@ class TensorSummary : public ITensorSummary { double_t GetZeroValPercent(); void InitCalculators(const std::vector &); }; +#ifdef ONLINE_DBG_MODE } // namespace mindspore +#endif #endif // MINDSPORE_TENSOR_SUMMARY_H diff --git a/mindspore/ccsrc/debug/tensor_data.h b/mindspore/ccsrc/debug/tensor_data.h index 00af2032083..27bdba3ff23 100644 --- a/mindspore/ccsrc/debug/tensor_data.h +++ b/mindspore/ccsrc/debug/tensor_data.h @@ -16,37 +16,170 @@ #ifndef MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_ #define MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_ +#include #include #include #include #include +#ifdef OFFLINE_DBG_MODE +#include "debugger/offline_debug/offline_logger.h" +#else #include "ir/tensor.h" +#include "mindspore/core/utils/log_adapter.h" +#endif +#ifdef ONLINE_DBG_MODE namespace mindspore { -class TensorData { - private: - mindspore::tensor::TensorPtr tensor_ptr; - std::string name; - size_t slot; - int execution_order; +#endif +namespace MsTypeId { +typedef enum MsTypeId : unsigned int { + kTypeUnknown = 0, + kMetaTypeBegin = kTypeUnknown, + kMetaTypeType, // Type + kMetaTypeAnything, + kMetaTypeObject, + kMetaTypeTypeType, // TypeType + kMetaTypeProblem, + kMetaTypeExternal, + kMetaTypeNone, + kMetaTypeNull, + kMetaTypeEllipsis, + kMetaTypeEnd, + // + // Object types + // + kObjectTypeBegin = kMetaTypeEnd, + kObjectTypeNumber, + kObjectTypeString, + kObjectTypeList, + kObjectTypeTuple, + kObjectTypeSlice, + kObjectTypeKeyword, + kObjectTypeTensorType, + kObjectTypeRowTensorType, + kObjectTypeSparseTensorType, + kObjectTypeUndeterminedType, + 
kObjectTypeClass, + kObjectTypeDictionary, + kObjectTypeFunction, + kObjectTypeJTagged, + kObjectTypeSymbolicKeyType, + kObjectTypeEnvType, + kObjectTypeRefKey, + kObjectTypeRef, + kObjectTypeEnd, + // + // Number Types + // + kNumberTypeBegin = kObjectTypeEnd, + kNumberTypeBool, + kNumberTypeInt, + kNumberTypeInt8, + kNumberTypeInt16, + kNumberTypeInt32, + kNumberTypeInt64, + kNumberTypeUInt, + kNumberTypeUInt8, + kNumberTypeUInt16, + kNumberTypeUInt32, + kNumberTypeUInt64, + kNumberTypeFloat, + kNumberTypeFloat16, + kNumberTypeFloat32, + kNumberTypeFloat64, + kNumberTypeComplex64, + kNumberTypeEnd +} MsTypeId; +} // namespace MsTypeId + +typedef enum DbgDataType : unsigned int { + DT_UNDEFINED = 0, + // Basic types. + DT_BOOL = 1, // bool + + DT_INT8 = 2, // int8_t + DT_INT16 = 3, // int16_t + DT_INT32 = 4, // int32_t + DT_INT64 = 5, // int64_t + + DT_UINT8 = 6, // uint8_t + DT_UINT16 = 7, // uint16_t + DT_UINT32 = 8, // uint32_t + DT_UINT64 = 9, // uint64_t + + DT_FLOAT16 = 10, // float 16 + DT_FLOAT32 = 11, // float 32 + DT_FLOAT64 = 12, // float 64 + + DT_STRING = 13, // string + DT_TENSOR = 14, // tensor + DT_GRAPH = 15, // graph + + // list type + DT_BOOLS = 16, // list of bool + + DT_INTS8 = 17, // list of int8_t + DT_INTS16 = 18, // list of int16_t + DT_INTS32 = 19, // list of int32_t + DT_INTS64 = 20, // list of int64_t + + DT_UINTS8 = 21, // list of uint8_t + DT_UINTS16 = 22, // list of uint16_t + DT_UINTS32 = 23, // list of uint32_t + DT_UINTS64 = 24, // list of uint64_t + + DT_FLOATS16 = 25, // list of float16 + DT_FLOATS32 = 26, // list of float32 + DT_FLOATS64 = 27, // list of float64 + + DT_STRINGS = 28, // list of string + DT_TENSORS = 29, // list of tensor + DT_GRAPHS = 30, // list of graph + + DT_TUPLE = 31, // tuple + DT_LIST = 32, // list + DT_DICT = 33, // dictionary + + // other types + DT_NONE = 34, // None + DT_SYM_INST = 35, // Symbolic Key Instance + + // type related type + DT_BASE_INT = 36, // type generic int + DT_BASE_UINT = 37, // type generate unsigned int + DT_BASE_FLOAT = 38, // type generate float + DT_TYPE = 39, // type type + DT_ANYTHING = 40, // type anything + DT_REFKEY = 41, // type refkey + DT_REF = 42 // type ref +} DbgDataType; + +class TensorData { public: TensorData() : slot(0), execution_order(-1) {} TensorData(const TensorData &obj) { - std::cout << "Copy Constructor" << std::endl; + MS_LOG(INFO) << "Copy Constructor"; this->name = obj.name; this->execution_order = obj.execution_order; this->slot = obj.slot; + this->data_ptr = obj.data_ptr; + this->size = obj.size; + this->data_type = obj.data_type; + this->data_type_size = obj.data_type_size; + this->shape = obj.shape; + this->iteration = obj.iteration; + this->device_id = obj.device_id; +#ifdef ONLINE_DBG_MODE this->tensor_ptr = obj.tensor_ptr; +#endif } ~TensorData() {} std::string GetName() { return this->name; } - mindspore::tensor::TensorPtr GetTensor() { return this->tensor_ptr; } - size_t GetSlot() { return this->slot; } int GetExecutionOrder() { return this->execution_order; } @@ -55,9 +188,179 @@ class TensorData { void SetName(const std::string &name) { this->name = name; } +#ifdef ONLINE_DBG_MODE void SetTensor(mindspore::tensor::TensorPtr out_tensor) { this->tensor_ptr = out_tensor; } +#endif void SetSlot(size_t slot) { this->slot = slot; } + + char *GetDataPtr() { return data_ptr; } + + void SetDataPtr(char *data_ptr) { this->data_ptr = data_ptr; } + + uint32_t GetNumElements() { return size / data_type_size; } + + uint64_t GetByteSize() { return size; } + + void 
SetByteSize(uint64_t size) { this->size = size; } + + std::vector GetShape() { return shape; } + + void SetShape(std::vector shape) { this->shape = shape; } + + unsigned int GetIteration() { return iteration; } + + void SetIteration(unsigned int iteration) { this->iteration = iteration; } + + unsigned int GetDeviceId() { return device_id; } + + void SetDeviceId(unsigned int device_id) { this->device_id = device_id; } + + unsigned int GetRootGraphId() { return root_graph_id; } + + void SetRootGraphId(unsigned int root_graph_id) { this->root_graph_id = root_graph_id; } + + DbgDataType GetType() { return data_type; } + + void SetType(unsigned int type) { ConvertMsToDbgType(type); } + + void SetType(std::string type_name) { ConvertStringToDbgType(type_name); } + + void ConvertMsToDbgType(uint32_t type) { + switch (type) { + case MsTypeId::kNumberTypeBool: + this->data_type = DbgDataType::DT_BOOL; + this->data_type_size = 1; + break; + case MsTypeId::kNumberTypeInt8: + this->data_type = DbgDataType::DT_INT8; + this->data_type_size = 1; + break; + case MsTypeId::kNumberTypeInt16: + this->data_type = DbgDataType::DT_INT16; + this->data_type_size = 2; + break; + case MsTypeId::kNumberTypeInt32: + this->data_type = DbgDataType::DT_INT32; + this->data_type_size = 4; + break; + case MsTypeId::kNumberTypeInt64: + this->data_type = DbgDataType::DT_INT64; + this->data_type_size = 8; + break; + case MsTypeId::kNumberTypeUInt8: + this->data_type = DbgDataType::DT_UINT8; + this->data_type_size = 1; + break; + case MsTypeId::kNumberTypeUInt16: + this->data_type = DbgDataType::DT_UINT16; + this->data_type_size = 2; + break; + case MsTypeId::kNumberTypeUInt32: + this->data_type = DbgDataType::DT_UINT32; + this->data_type_size = 4; + break; + case MsTypeId::kNumberTypeUInt64: + this->data_type = DbgDataType::DT_UINT64; + this->data_type_size = 8; + break; + case MsTypeId::kNumberTypeFloat16: + this->data_type = DbgDataType::DT_FLOAT16; + this->data_type_size = 2; + break; + case MsTypeId::kNumberTypeFloat32: + this->data_type = DbgDataType::DT_FLOAT32; + this->data_type_size = 4; + break; + case MsTypeId::kNumberTypeFloat64: + this->data_type = DbgDataType::DT_FLOAT64; + this->data_type_size = 8; + break; + case MsTypeId::kNumberTypeInt: + this->data_type = DbgDataType::DT_BASE_INT; + this->data_type_size = 4; + break; + case MsTypeId::kNumberTypeUInt: + this->data_type = DbgDataType::DT_BASE_UINT; + this->data_type_size = 4; + break; + case MsTypeId::kNumberTypeFloat: + this->data_type = DbgDataType::DT_BASE_FLOAT; + this->data_type_size = 4; + break; + default: + MS_LOG(EXCEPTION) << "Unexpected type id: " << type; + } + } + + void ConvertStringToDbgType(const std::string &type_name) { + std::string type_name_lower = type_name; + std::string trans_true_prefix = "kNumberType"; + if (type_name.find(trans_true_prefix) == 0) { + type_name_lower = type_name.substr(trans_true_prefix.length()); + } + (void)std::transform(type_name_lower.begin(), type_name_lower.end(), type_name_lower.begin(), ::tolower); + if (type_name_lower == "bool") { + this->data_type = DbgDataType::DT_BOOL; + this->data_type_size = 1; + } else if (type_name_lower == "int8") { + this->data_type = DbgDataType::DT_INT8; + this->data_type_size = 1; + } else if (type_name_lower == "int16") { + this->data_type = DbgDataType::DT_INT16; + this->data_type_size = 2; + } else if (type_name_lower == "int32") { + this->data_type = DbgDataType::DT_INT32; + this->data_type_size = 4; + } else if (type_name_lower == "int64") { + this->data_type = 
DbgDataType::DT_INT64; + this->data_type_size = 8; + } else if (type_name_lower == "uint8") { + this->data_type = DbgDataType::DT_UINT8; + this->data_type_size = 1; + } else if (type_name_lower == "uint16") { + this->data_type = DbgDataType::DT_UINT16; + this->data_type_size = 2; + } else if (type_name_lower == "uint32") { + this->data_type = DbgDataType::DT_UINT32; + this->data_type_size = 4; + } else if (type_name_lower == "uint64") { + this->data_type = DbgDataType::DT_UINT64; + this->data_type_size = 8; + } else if (type_name_lower == "float16") { + this->data_type = DbgDataType::DT_FLOAT16; + this->data_type_size = 2; + } else if (type_name_lower == "float32") { + this->data_type = DbgDataType::DT_FLOAT32; + this->data_type_size = 4; + } else if (type_name_lower == "float64") { + this->data_type = DbgDataType::DT_FLOAT64; + this->data_type_size = 8; + } else if (type_name_lower == "") { + this->data_type = DbgDataType::DT_UNDEFINED; + this->data_type_size = 0; + } else { + MS_LOG(EXCEPTION) << "Unexpected type name: " << type_name; + } + } + + private: + char *data_ptr; // pointer to the pre-allocated memory + uint64_t size; // size in bytes + DbgDataType data_type; // internal debugger type + unsigned int data_type_size; + std::vector shape; + std::string name; + uint64_t slot; + unsigned int iteration; + unsigned int device_id; + unsigned int root_graph_id; + int execution_order; +#ifdef ONLINE_DBG_MODE + mindspore::tensor::TensorPtr tensor_ptr; +#endif }; +#ifdef ONLINE_DBG_MODE } // namespace mindspore +#endif #endif // MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_ diff --git a/mindspore/ccsrc/debug/tensor_load.h b/mindspore/ccsrc/debug/tensor_load.h index af69519b80b..680bd39cd01 100644 --- a/mindspore/ccsrc/debug/tensor_load.h +++ b/mindspore/ccsrc/debug/tensor_load.h @@ -23,10 +23,14 @@ #include #include #include +#ifdef OFFLINE_DBG_MODE +#include "debugger/offline_debug/offline_logger.h" +#endif #include "debug/tensor_data.h" +#ifdef ONLINE_DBG_MODE #include "debug/data_dump/dump_json_parser.h" -#include "ir/dtype.h" namespace mindspore { +#endif class TensorLoader { public: TensorLoader() : iter_num(-1) {} @@ -152,9 +156,10 @@ class TensorLoader { void set_iter_num(uint32_t iter_num) { this->iter_num = iter_num; } +#ifdef ONLINE_DBG_MODE bool DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath, const std::string &host_fmt, const std::vector &host_shape, TypeId host_type, - TypeId addr_type_id, const std::string &addr_format, size_t slot) const { + TypeId addr_type_id, const std::string &addr_format, size_t slot) { if (filepath.empty()) { MS_LOG(ERROR) << "Dump file path is null!"; return false; @@ -181,21 +186,24 @@ class TensorLoader { auto iter = tensor_list_map.find(tensor_loader_name); if (iter != tensor_list_map.end()) { std::shared_ptr node = iter->second; - mindspore::tensor::TensorPtr out_tensor = node->GetTensor(); - size_t host_size = out_tensor->data().nbytes(); + size_t host_size = node->GetByteSize(); - return DumpJsonParser::DumpToFile(path, out_tensor->data_c(), host_size); + return DumpJsonParser::DumpToFile(path, node->GetDataPtr(), host_size); } MS_LOG(INFO) << "Tensor name:" << tensor_name << " not found in tensor_list_map"; return true; } +#endif private: + // the pair is (device_id, iteration) std::map> tensor_list_map; std::multimap> node_tensor_map; std::map> prev_tensor_list_map; uint32_t iter_num; std::mutex lock_; }; +#ifdef ONLINE_DBG_MODE } // namespace mindspore +#endif #endif // 
MINDSPORE_CCSRC_DEBUG_TENSOR_LOAD_H_ diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc index b2aab5f15f9..f6c1dcf10a7 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc @@ -713,6 +713,10 @@ bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int exec } MS_LOG(INFO) << "E2E tensor name is " << tensor_name; tensor_data->SetTensor(out_tensor); + tensor_data->SetDataPtr(static_cast(out_tensor->data_c())); + tensor_data->SetByteSize(out_tensor->data().nbytes()); + tensor_data->SetType((unsigned int)host_type); + tensor_data->SetShape(out_tensor->shape()); ret = Debugger::GetInstance()->LoadNewTensor(tensor_data, keep_prev); return ret; } diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc index a5e79b20a46..78e7c6cc654 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc @@ -93,7 +93,7 @@ void GPUDeviceAddress::ClearDeviceMemory() { } GPUDeviceAddress::~GPUDeviceAddress() { ClearDeviceMemory(); } -#ifdef ENABLE_DEBUGGER + bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const { @@ -117,13 +117,16 @@ bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int executi auto tensor_data = std::make_shared(); tensor_data->SetName(tensor_name); tensor_data->SetExecutionOrder(execution_order); - tensor_data->SetTensor(out_tensor); tensor_data->SetSlot(slot); + tensor_data->SetTensor(out_tensor); + tensor_data->SetDataPtr(static_cast(out_tensor->data_c())); + tensor_data->SetByteSize(out_tensor->data().nbytes()); + tensor_data->SetType((unsigned int)host_type); + tensor_data->SetShape(out_tensor->shape()); ret = Debugger::GetInstance()->LoadNewTensor(tensor_data, keep_prev); MS_LOG(INFO) << "E2E tensor name is " << tensor_name; return ret; } -#endif } // namespace gpu } // namespace device } // namespace mindspore diff --git a/mindspore/core/utils/log_adapter.cc b/mindspore/core/utils/log_adapter.cc index 2da8d7306d9..d7d0eb9c598 100644 --- a/mindspore/core/utils/log_adapter.cc +++ b/mindspore/core/utils/log_adapter.cc @@ -114,32 +114,33 @@ static int GetSlogLevel(MsLogLevel level) { static const char *GetSubModuleName(SubModuleId module_id) { static const char *sub_module_names[NUM_SUBMODUES] = { - "UNKNOWN", // SM_UNKNOWN - "CORE", // SM_CORE - "ANALYZER", // SM_ANALYZER - "COMMON", // SM_COMMON - "DEBUG", // SM_DEBUG - "DEVICE", // SM_DEVICE - "GE_ADPT", // SM_GE_ADPT - "IR", // SM_IR - "KERNEL", // SM_KERNEL - "MD", // SM_MD - "ME", // SM_ME - "EXPRESS", // SM_EXPRESS - "OPTIMIZER", // SM_OPTIMIZER - "PARALLEL", // SM_PARALLEL - "PARSER", // SM_PARSER - "PIPELINE", // SM_PIPELINE - "PRE_ACT", // SM_PRE_ACT - "PYNATIVE", // SM_PYNATIVE - "SESSION", // SM_SESSION - "UTILS", // SM_UTILS - "VM", // SM_VM - "PROFILER", // SM_PROFILER - "PS", // SM_PS - "LITE", // SM_LITE - "HCCL_ADPT", // SM_HCCL_ADPT - "MINDQUANTUM" // SM_MINDQUANTUM + "UNKNOWN", // SM_UNKNOWN + "CORE", // SM_CORE + "ANALYZER", // SM_ANALYZER + "COMMON", // SM_COMMON + "DEBUG", // SM_DEBUG + "OFFLINE_DEBUG", // SM_OFFLINE_DEBUG + "DEVICE", // SM_DEVICE + "GE_ADPT", // SM_GE_ADPT + "IR", // SM_IR + "KERNEL", // SM_KERNEL + 
"MD", // SM_MD + "ME", // SM_ME + "EXPRESS", // SM_EXPRESS + "OPTIMIZER", // SM_OPTIMIZER + "PARALLEL", // SM_PARALLEL + "PARSER", // SM_PARSER + "PIPELINE", // SM_PIPELINE + "PRE_ACT", // SM_PRE_ACT + "PYNATIVE", // SM_PYNATIVE + "SESSION", // SM_SESSION + "UTILS", // SM_UTILS + "VM", // SM_VM + "PROFILER", // SM_PROFILER + "PS", // SM_PS + "LITE", // SM_LITE + "HCCL_ADPT", // SM_HCCL_ADPT + "MINDQUANTUM" // SM_MINDQUANTUM }; return sub_module_names[module_id % NUM_SUBMODUES]; diff --git a/mindspore/core/utils/log_adapter.h b/mindspore/core/utils/log_adapter.h index 1429f73fb33..b64b125cb13 100644 --- a/mindspore/core/utils/log_adapter.h +++ b/mindspore/core/utils/log_adapter.h @@ -111,6 +111,7 @@ enum SubModuleId : int { SM_ANALYZER, // static analyzer SM_COMMON, // common SM_DEBUG, // debug + SM_OFFLINE_DEBUG, // offline debug SM_DEVICE, // device SM_GE_ADPT, // ge adapter SM_IR, // IR diff --git a/mindspore/offline_debug/__init__.py b/mindspore/offline_debug/__init__.py new file mode 100644 index 00000000000..9d7758bb719 --- /dev/null +++ b/mindspore/offline_debug/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module provides APIs to load and process dump data, i.e. read tensors, check +for watchpoints and other debugging services. +""" + +from . import dbg_services +from . import mi_validator_helpers +from . import mi_validators diff --git a/mindspore/offline_debug/dbg_services.py b/mindspore/offline_debug/dbg_services.py new file mode 100644 index 00000000000..10f8c2e4934 --- /dev/null +++ b/mindspore/offline_debug/dbg_services.py @@ -0,0 +1,870 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +The module DbgServices provides offline debugger APIs. +""" + +import mindspore._mindspore_offline_debug as cds +from mindspore.offline_debug.mi_validators import check_init, check_initialize, check_add_watchpoint, check_remove_watchpoint, check_check_watchpoints, check_read_tensors, check_initialize_done, check_tensor_info_init, check_tensor_data_init, check_watchpoint_hit_init, check_parameter_init + + +def get_version(): + """ + Function to return offline Debug Services version. + + Returns: + version (str): dbgServices version. 
+
+    Examples:
+        >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+        >>> version = dbg_services.get_version()
+    """
+    return cds.DbgServices(False).GetVersion()
+
+
+class DbgLogger:
+    """
+    Offline Debug Services Logger.
+
+    Args:
+        verbose (bool): whether to print logs.
+
+    Examples:
+        >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+        >>> logger = dbg_services.DbgLogger(verbose=False)
+    """
+    def __init__(self, verbose):
+        self.verbose = verbose
+
+    def __call__(self, *logs):
+        if self.verbose:
+            print(logs)
+
+
+log = DbgLogger(False)
+
+
+class DbgServices():
+    """
+    Offline Debug Services class.
+
+    Args:
+        dump_file_path (str): directory where the dump files are saved.
+        verbose (bool): whether to print logs (default: False).
+
+    Examples:
+        >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+        >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
+        >>>                              verbose=True)
+    """
+
+    @check_init
+    def __init__(self, dump_file_path, verbose=False):
+        log.verbose = verbose
+        log("in Python __init__, file path is ", dump_file_path)
+        self.dump_file_path = dump_file_path
+        self.dbg_instance = cds.DbgServices(verbose)
+        self.version = self.dbg_instance.GetVersion()
+        self.verbose = verbose
+        self.initialized = False
+
+    @check_initialize
+    def initialize(self, net_name, is_sync_mode=True):
+        """
+        Initialize Debug Service.
+
+        Args:
+            net_name (str): Network name.
+            is_sync_mode (bool): Whether the dump files were generated in synchronous
+                or asynchronous mode (default: True, i.e. synchronous).
+
+        Returns:
+            Initialized Debug Service instance.
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
+            >>>                              verbose=True)
+            >>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
+        """
+
+        log("in Python Initialize dump_file_path ", self.dump_file_path)
+        self.initialized = True
+        return self.dbg_instance.Initialize(net_name, self.dump_file_path, is_sync_mode)
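Every API method below is guarded by check_initialize_done, so a session always follows construct, initialize, then use. A minimal session sketch (the dump path and network name are placeholders):

```python
# Assumes a synchronous-mode dump tree already exists at the given path.
import mindspore.offline_debug.dbg_services as dbg_services

d = dbg_services.DbgServices(dump_file_path="/path/to/dumps/alexnet", verbose=False)
d.initialize(net_name="alexnet", is_sync_mode=True)   # required before any other call
print(dbg_services.get_version())                     # module-level version query
```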
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
+            >>>                              verbose=True)
+            >>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
+            >>> d_wp = d_init.add_watchpoint(watchpoint_id=1,
+            >>>                              watch_condition=6,
+            >>>                              check_node_list={"conv2.bias" : {"device_id": [0],
+            >>>                                                               "root_graph_id": [0],
+            >>>                                                               "is_parameter": True}},
+            >>>                              parameter_list=[dbg_services.Parameter(name="param",
+            >>>                                                                     disabled=False,
+            >>>                                                                     value=0.0,
+            >>>                                                                     hit=False,
+            >>>                                                                     actual_value=0.0)])
+        """
+
+        log("in Python AddWatchpoint")
+        for node_name, node_info in check_node_list.items():
+            for info_name, info_param in node_info.items():
+                if info_name in ["device_id", "root_graph_id"]:
+                    if info_param in ["*"]:
+                        check_node_list[node_name][info_name] = ["*"]
+                    else:
+                        check_node_list[node_name][info_name] = list(map(str, info_param))
+        parameter_list_inst = []
+        for elem in parameter_list:
+            parameter_list_inst.append(elem.instance)
+        return self.dbg_instance.AddWatchpoint(watchpoint_id, watch_condition, check_node_list, parameter_list_inst)
+
+    @check_initialize_done
+    @check_remove_watchpoint
+    def remove_watchpoint(self, watchpoint_id):
+        """
+        Remove a watchpoint from the Debug Service instance.
+
+        Args:
+            watchpoint_id (int): Watchpoint id.
+
+        Returns:
+            Debug Service instance with removed watchpoint.
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
+            >>>                              verbose=True)
+            >>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
+            >>> d_wp = d_init.add_watchpoint(watchpoint_id=1,
+            >>>                              watch_condition=6,
+            >>>                              check_node_list={"conv2.bias" : {"device_id": [5],
+            >>>                                                               "root_graph_id": [0],
+            >>>                                                               "is_parameter": True}},
+            >>>                              parameter_list=[dbg_services.Parameter(name="param",
+            >>>                                                                     disabled=False,
+            >>>                                                                     value=0.0,
+            >>>                                                                     hit=False,
+            >>>                                                                     actual_value=0.0)])
+            >>> d_wp = d_wp.remove_watchpoint(watchpoint_id=1)
+        """
+
+        log("in Python Remove Watchpoint id ", watchpoint_id)
+        return self.dbg_instance.RemoveWatchpoint(watchpoint_id)
+
+    @check_initialize_done
+    @check_check_watchpoints
+    def check_watchpoints(self, iteration):
+        """
+        Check watchpoints at the given iteration.
+
+        Args:
+            iteration (int): Watchpoint check iteration.
+
+        Returns:
+            Watchpoint hit list, as a list of WatchpointHit instances.
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
+            >>>                              verbose=True)
+            >>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
+            >>> d_wp = d_init.add_watchpoint(watchpoint_id=1,
+            >>>                              watch_condition=6,
+            >>>                              check_node_list={"conv2.bias" : {"device_id": [5],
+            >>>                                                               "root_graph_id": [0],
+            >>>                                                               "is_parameter": True}},
+            >>>                              parameter_list=[dbg_services.Parameter(name="param",
+            >>>                                                                     disabled=False,
+            >>>                                                                     value=0.0,
+            >>>                                                                     hit=False,
+            >>>                                                                     actual_value=0.0)])
+            >>> watchpoints = d_wp.check_watchpoints(iteration=8)
+        """
+
+        log("in Python CheckWatchpoints iteration ", iteration)
+        watchpoint_list = self.dbg_instance.CheckWatchpoints(iteration)
+        watchpoint_hit_list = []
+        for watchpoint in watchpoint_list:
+            name = watchpoint.get_name()
+            slot = watchpoint.get_slot()
+            condition = watchpoint.get_condition()
+            watchpoint_id = watchpoint.get_watchpoint_id()
+            parameters = watchpoint.get_parameters()
+            error_code = watchpoint.get_error_code()
+            device_id = watchpoint.get_device_id()
+            root_graph_id = watchpoint.get_root_graph_id()
+            param_list = []
+            for param in parameters:
+                p_name = param.get_name()
+                disabled = param.get_disabled()
+                value = param.get_value()
+                hit = param.get_hit()
+                actual_value = param.get_actual_value()
+                param_list.append(Parameter(p_name, disabled, value, hit, actual_value))
+            watchpoint_hit_list.append(WatchpointHit(name, slot, condition, watchpoint_id,
+                                                     param_list, error_code, device_id, root_graph_id))
+        return watchpoint_hit_list
+
+    @check_initialize_done
+    @check_read_tensors
+    def read_tensors(self, info):
+        """
+        Return the tensor data objects describing the requested tensors.
+
+        Args:
+            info (list): List of TensorInfo objects.
+
+        Returns:
+            TensorData list (list).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
+            >>>                              verbose=True)
+            >>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
+            >>> tensor_data_list = d_init.read_tensors([dbg_services.TensorInfo(node_name="conv2.bias",
+            >>>                                                                 slot=0,
+            >>>                                                                 iteration=8,
+            >>>                                                                 device_id=5,
+            >>>                                                                 root_graph_id=0,
+            >>>                                                                 is_parameter=True)])
+        """
+
+        log("in Python ReadTensors info ", info)
+        info_list_inst = []
+        for elem in info:
+            log("in Python ReadTensors info ", elem)
+            info_list_inst.append(elem.instance)
+        tensor_data_list = self.dbg_instance.ReadTensors(info_list_inst)
+        tensor_data_list_ret = []
+        for elem in tensor_data_list:
+            if elem.get_data_size() == 0:
+                tensor_data = TensorData(b'', elem.get_data_size(), elem.get_dtype(), elem.get_shape())
+            else:
+                tensor_data = TensorData(elem.get_data_ptr(), elem.get_data_size(), elem.get_dtype(), elem.get_shape())
+            tensor_data_list_ret.append(tensor_data)
+        return tensor_data_list_ret
+
+class TensorInfo():
+    """
+    Tensor Information class.
+
+    Args:
+        node_name (str): Fully qualified name of the desired node.
+        slot (int): The particular output for the requested node.
+        iteration (int): The desired iteration to gather tensor information.
+        device_id (int): The desired device id to gather tensor information.
+        root_graph_id (int): The root graph id of the graph that contains the node.
+        is_parameter (bool): Whether node is a parameter (input, constant, bias, parameter).
+
+    Examples:
+        >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+        >>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
+        >>>                                       slot=0,
+        >>>                                       iteration=8,
+        >>>                                       device_id=5,
+        >>>                                       root_graph_id=0,
+        >>>                                       is_parameter=True)
+    """
+
+    @check_tensor_info_init
+    def __init__(self, node_name, slot, iteration, device_id, root_graph_id, is_parameter):
+        self.instance = cds.tensor_info(node_name, slot, iteration, device_id, root_graph_id, is_parameter)
+
+    @property
+    def node_name(self):
+        """
+        Function to receive TensorInfo node_name.
+
+        Returns:
+            node_name of TensorInfo instance (str).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
+            >>>                                       slot=0,
+            >>>                                       iteration=8,
+            >>>                                       device_id=5,
+            >>>                                       root_graph_id=0,
+            >>>                                       is_parameter=True)
+            >>> name = tensor_info.node_name
+        """
+
+        return self.instance.get_node_name()
+
+    @property
+    def slot(self):
+        """
+        Function to receive TensorInfo slot.
+
+        Returns:
+            slot of TensorInfo instance (int).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
+            >>>                                       slot=0,
+            >>>                                       iteration=8,
+            >>>                                       device_id=5,
+            >>>                                       root_graph_id=0,
+            >>>                                       is_parameter=True)
+            >>> slot = tensor_info.slot
+        """
+
+        return self.instance.get_slot()
+
+    @property
+    def iteration(self):
+        """
+        Function to receive TensorInfo iteration.
+
+        Returns:
+            iteration of TensorInfo instance (int).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
+            >>>                                       slot=0,
+            >>>                                       iteration=8,
+            >>>                                       device_id=5,
+            >>>                                       root_graph_id=0,
+            >>>                                       is_parameter=True)
+            >>> iteration = tensor_info.iteration
+        """
+
+        return self.instance.get_iteration()
+
+    @property
+    def device_id(self):
+        """
+        Function to receive TensorInfo device_id.
+
+        Returns:
+            device_id of TensorInfo instance (int).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
+            >>>                                       slot=0,
+            >>>                                       iteration=8,
+            >>>                                       device_id=5,
+            >>>                                       root_graph_id=0,
+            >>>                                       is_parameter=True)
+            >>> device_id = tensor_info.device_id
+        """
+
+        return self.instance.get_device_id()
+
+    @property
+    def root_graph_id(self):
+        """
+        Function to receive TensorInfo root_graph_id.
+
+        Returns:
+            root_graph_id of TensorInfo instance (int).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
+            >>>                                       slot=0,
+            >>>                                       iteration=8,
+            >>>                                       device_id=5,
+            >>>                                       root_graph_id=0,
+            >>>                                       is_parameter=True)
+            >>> root_graph_id = tensor_info.root_graph_id
+        """
+
+        return self.instance.get_root_graph_id()
+
+    @property
+    def is_parameter(self):
+        """
+        Function to receive TensorInfo is_parameter.
+
+        Returns:
+            is_parameter of TensorInfo instance (bool).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
+            >>>                                       slot=0,
+            >>>                                       iteration=8,
+            >>>                                       device_id=5,
+            >>>                                       root_graph_id=0,
+            >>>                                       is_parameter=True)
+            >>> is_parameter = tensor_info.is_parameter
+        """
+
+        return self.instance.get_is_parameter()
+
+class TensorData():
+    """
+    TensorData class.
+
+    Args:
+        data_ptr (bytes): Data pointer.
+        data_size (int): Size of data in bytes.
+        dtype (int): An encoding representing the type of TensorData.
+        shape (list): Shape of tensor.
+
+    Examples:
+        >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+        >>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
+        >>>                                       data_size=4,
+        >>>                                       dtype=0,
+        >>>                                       shape=[2, 2])
+    """
+
+    @check_tensor_data_init
+    def __init__(self, data_ptr, data_size, dtype, shape):
+        self.instance = cds.tensor_data(data_ptr, data_size, dtype, shape)
+
+    @property
+    def data_ptr(self):
+        """
+        Function to receive TensorData data_ptr.
+
+        Returns:
+            data_ptr of TensorData instance (bytes).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
+            >>>                                       data_size=4,
+            >>>                                       dtype=0,
+            >>>                                       shape=[2, 2])
+            >>> data_ptr = tensor_data.data_ptr
+        """
+
+        return self.instance.get_data_ptr()
+
+    @property
+    def data_size(self):
+        """
+        Function to receive TensorData data_size.
+
+        Returns:
+            data_size of TensorData instance (int).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
+            >>>                                       data_size=4,
+            >>>                                       dtype=0,
+            >>>                                       shape=[2, 2])
+            >>> data_size = tensor_data.data_size
+        """
+
+        return self.instance.get_data_size()
+
+    @property
+    def dtype(self):
+        """
+        Function to receive TensorData dtype.
+
+        Returns:
+            dtype of TensorData instance (int).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
+            >>>                                       data_size=4,
+            >>>                                       dtype=0,
+            >>>                                       shape=[2, 2])
+            >>> dtype = tensor_data.dtype
+        """
+
+        return self.instance.get_dtype()
+
+    @property
+    def shape(self):
+        """
+        Function to receive TensorData shape.
+
+        Returns:
+            shape of TensorData instance (list).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
+            >>>                                       data_size=4,
+            >>>                                       dtype=0,
+            >>>                                       shape=[2, 2])
+            >>> shape = tensor_data.shape
+        """
+
+        return self.instance.get_shape()
+
+class WatchpointHit():
+    """
+    WatchpointHit class.
+
+    Args:
+        name (str): Name of WatchpointHit instance.
+        slot (int): The numerical label of an output.
+        condition (int): A representation of the condition to be checked.
+        watchpoint_id (int): Watchpoint id.
+        parameters (list): A list of all parameters for WatchpointHit instance.
+                           Parameters have to be instances of Parameter class.
+        error_code (int): An explanation of certain scenarios where watchpoint could not be checked.
+        device_id (int): Device id where the watchpoint is hit.
+        root_graph_id (int): Root graph id where the watchpoint is hit.
+ + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1", + >>> slot=1, + >>> condition=2, + >>> watchpoint_id=3, + >>> parameters=[param1, param2], + >>> error_code=0, + >>> device_id=1, + >>> root_graph_id=1) + """ + + @check_watchpoint_hit_init + def __init__(self, name, slot, condition, watchpoint_id, parameters, error_code, device_id, root_graph_id): + parameter_list_inst = [] + for elem in parameters: + parameter_list_inst.append(elem.instance) + self.instance = cds.watchpoint_hit(name, slot, condition, watchpoint_id, + parameter_list_inst, error_code, device_id, root_graph_id) + + @property + def name(self): + """ + Function to receive WatchpointHit name. + + Returns: + name of WatchpointHit instance (str). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1", + >>> slot=1, + >>> condition=2, + >>> watchpoint_id=3, + >>> parameters=[param1, param2], + >>> error_code=0, + >>> device_id=1, + >>> root_graph_id=1) + >>> name = watchpoint_hit.name + """ + + return self.instance.get_name() + + @property + def slot(self): + """ + Function to receive WatchpointHit slot. + + Returns: + slot of WatchpointHit instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1", + >>> slot=1, + >>> condition=2, + >>> watchpoint_id=3, + >>> parameters=[param1, param2], + >>> error_code=0, + >>> device_id=1, + >>> root_graph_id=1) + >>> slot = watchpoint_hit.slot + """ + + return self.instance.get_slot() + + @property + def condition(self): + """ + Function to receive WatchpointHit condition. + + Returns: + condition of WatchpointHit instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1", + >>> slot=1, + >>> condition=2, + >>> watchpoint_id=3, + >>> parameters=[param1, param2], + >>> error_code=0, + >>> device_id=1, + >>> root_graph_id=1) + >>> condition = watchpoint_hit.condition + """ + + return self.instance.get_condition() + + @property + def watchpoint_id(self): + """ + Function to receive WatchpointHit watchpoint_id. + + Returns: + watchpoint_id of WatchpointHit instance (int). + + Examples: + >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services + >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1", + >>> slot=1, + >>> condition=2, + >>> watchpoint_id=3, + >>> parameters=[param1, param2], + >>> error_code=0, + >>> device_id=1, + >>> root_graph_id=1) + >>> watchpoint_id = watchpoint_hit.watchpoint_id + """ + + return self.instance.get_watchpoint_id() + + @property + def parameters(self): + """ + Function to receive WatchpointHit parameters. + + Returns: + List of parameters of WatchpointHit instance (list). 
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
+            >>>                                             slot=1,
+            >>>                                             condition=2,
+            >>>                                             watchpoint_id=3,
+            >>>                                             parameters=[param1, param2],
+            >>>                                             error_code=0,
+            >>>                                             device_id=1,
+            >>>                                             root_graph_id=1)
+            >>> parameters = watchpoint_hit.parameters
+        """
+
+        params = self.instance.get_parameters()
+        param_list = []
+        for elem in params:
+            tmp = Parameter(elem.get_name(),
+                            elem.get_disabled(),
+                            elem.get_value(),
+                            elem.get_hit(),
+                            elem.get_actual_value())
+            param_list.append(tmp)
+        return param_list
+
+    @property
+    def error_code(self):
+        """
+        Function to receive WatchpointHit error_code.
+
+        Returns:
+            error_code of WatchpointHit instance (int).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
+            >>>                                             slot=1,
+            >>>                                             condition=2,
+            >>>                                             watchpoint_id=3,
+            >>>                                             parameters=[param1, param2],
+            >>>                                             error_code=0,
+            >>>                                             device_id=1,
+            >>>                                             root_graph_id=1)
+            >>> error_code = watchpoint_hit.error_code
+        """
+
+        return self.instance.get_error_code()
+
+    @property
+    def device_id(self):
+        """
+        Function to receive WatchpointHit device_id.
+
+        Returns:
+            device_id of WatchpointHit instance (int).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
+            >>>                                             slot=1,
+            >>>                                             condition=2,
+            >>>                                             watchpoint_id=3,
+            >>>                                             parameters=[param1, param2],
+            >>>                                             error_code=0,
+            >>>                                             device_id=1,
+            >>>                                             root_graph_id=1)
+            >>> device_id = watchpoint_hit.device_id
+        """
+
+        return self.instance.get_device_id()
+
+    @property
+    def root_graph_id(self):
+        """
+        Function to receive WatchpointHit root_graph_id.
+
+        Returns:
+            root_graph_id of WatchpointHit instance (int).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
+            >>>                                             slot=1,
+            >>>                                             condition=2,
+            >>>                                             watchpoint_id=3,
+            >>>                                             parameters=[param1, param2],
+            >>>                                             error_code=0,
+            >>>                                             device_id=1,
+            >>>                                             root_graph_id=1)
+            >>> root_graph_id = watchpoint_hit.root_graph_id
+        """
+
+        return self.instance.get_root_graph_id()
+
+class Parameter():
+    """
+    Parameter class.
+
+    Args:
+        name (str): Name of the parameter.
+        disabled (bool): Whether the parameter is used in the backend.
+        value (float): Threshold value of the parameter.
+        hit (bool): Whether this parameter triggered the watchpoint (default is False).
+        actual_value (float): Actual value of the parameter (default is 0.0).
+
+    Examples:
+        >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+        >>> parameter = dbg_services.Parameter(name="param",
+        >>>                                    disabled=False,
+        >>>                                    value=0.0,
+        >>>                                    hit=False,
+        >>>                                    actual_value=0.0)
+    """
+
+    @check_parameter_init
+    def __init__(self, name, disabled, value, hit=False, actual_value=0.0):
+        self.instance = cds.parameter(name, disabled, value, hit, actual_value)
+
+    @property
+    def name(self):
+        """
+        Function to receive Parameter name.
+
+        Returns:
+            name of Parameter instance (str).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> parameter = dbg_services.Parameter(name="param",
+            >>>                                    disabled=False,
+            >>>                                    value=0.0,
+            >>>                                    hit=False,
+            >>>                                    actual_value=0.0)
+            >>> name = parameter.name
+        """
+
+        return self.instance.get_name()
+
+    @property
+    def disabled(self):
+        """
+        Function to receive Parameter disabled value.
+
+        Returns:
+            disabled of Parameter instance (bool).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> parameter = dbg_services.Parameter(name="param",
+            >>>                                    disabled=False,
+            >>>                                    value=0.0,
+            >>>                                    hit=False,
+            >>>                                    actual_value=0.0)
+            >>> disabled = parameter.disabled
+        """
+
+        return self.instance.get_disabled()
+
+    @property
+    def value(self):
+        """
+        Function to receive Parameter value.
+
+        Returns:
+            value of Parameter instance (float).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> parameter = dbg_services.Parameter(name="param",
+            >>>                                    disabled=False,
+            >>>                                    value=0.0,
+            >>>                                    hit=False,
+            >>>                                    actual_value=0.0)
+            >>> value = parameter.value
+        """
+
+        return self.instance.get_value()
+
+    @property
+    def hit(self):
+        """
+        Function to receive Parameter hit value.
+
+        Returns:
+            hit of Parameter instance (bool).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> parameter = dbg_services.Parameter(name="param",
+            >>>                                    disabled=False,
+            >>>                                    value=0.0,
+            >>>                                    hit=False,
+            >>>                                    actual_value=0.0)
+            >>> hit = parameter.hit
+        """
+
+        return self.instance.get_hit()
+
+    @property
+    def actual_value(self):
+        """
+        Function to receive Parameter actual_value.
+
+        Returns:
+            actual_value of Parameter instance (float).
+
+        Examples:
+            >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
+            >>> parameter = dbg_services.Parameter(name="param",
+            >>>                                    disabled=False,
+            >>>                                    value=0.0,
+            >>>                                    hit=False,
+            >>>                                    actual_value=0.0)
+            >>> actual_value = parameter.actual_value
+        """
+
+        return self.instance.get_actual_value()
diff --git a/mindspore/offline_debug/mi_validator_helpers.py b/mindspore/offline_debug/mi_validator_helpers.py
new file mode 100644
index 00000000000..abc4b38223f
--- /dev/null
+++ b/mindspore/offline_debug/mi_validator_helpers.py
@@ -0,0 +1,123 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+General Validator Helper Functions.
+""" +import os +import inspect + +UINT32_MAX = 4294967295 +UINT32_MIN = 0 +UINT64_MAX = 18446744073709551615 +UINT64_MIN = 0 + + +def pad_arg_name(arg_name): + if arg_name != "": + arg_name = arg_name + " " + return arg_name + + +def check_value(arg, valid_range, arg_name=""): + arg_name = pad_arg_name(arg_name) + if arg < valid_range[0] or arg > valid_range[1]: + raise ValueError( + "Input {0}is not within the required interval of ({1} to {2}).".format(arg_name, + valid_range[0], valid_range[1])) + + +def check_uint32(arg, arg_name=""): + type_check(arg, (int,), arg_name) + check_value(arg, [UINT32_MIN, UINT32_MAX]) + + +def check_uint64(arg, arg_name=""): + type_check(arg, (int,), arg_name) + check_value(arg, [UINT64_MIN, UINT64_MAX]) + + +def check_dir(dataset_dir): + if not os.path.isdir(dataset_dir) or not os.access(dataset_dir, os.R_OK): + raise ValueError("The folder {} does not exist or permission denied!".format(dataset_dir)) + + +def parse_user_args(method, *args, **kwargs): + """ + Parse user arguments in a function. + + Args: + method (method): a callable function. + args: user passed args. + kwargs: user passed kwargs. + + Returns: + user_filled_args (list): values of what the user passed in for the arguments. + ba.arguments (Ordered Dict): ordered dict of parameter and argument for what the user has passed. + """ + sig = inspect.signature(method) + if 'self' in sig.parameters or 'cls' in sig.parameters: + ba = sig.bind(method, *args, **kwargs) + ba.apply_defaults() + params = list(sig.parameters.keys())[1:] + else: + ba = sig.bind(*args, **kwargs) + ba.apply_defaults() + params = list(sig.parameters.keys()) + + user_filled_args = [ba.arguments.get(arg_value) for arg_value in params] + return user_filled_args, ba.arguments + + +def type_check(arg, types, arg_name): + """ + Check the type of the parameter. + + Args: + arg (Any) : any variable. + types (tuple): tuple of all valid types for arg. + arg_name (str): the name of arg. + + Returns: + Exception: when the type is not correct, otherwise nothing. + """ + # handle special case of booleans being a subclass of ints + print_value = '\"\"' if repr(arg) == repr('') else arg + + if int in types and bool not in types: + if isinstance(arg, bool): + raise TypeError("Argument {0} with value {1} is not of type {2}.".format(arg_name, print_value, types)) + if not isinstance(arg, types): + raise TypeError("Argument {0} with value {1} is not of type {2}.".format(arg_name, print_value, types)) + + +def type_check_list(args, types, arg_names): + """ + Check the type of each parameter in the list. + + Args: + args (Union[list, tuple]): a list or tuple of any variable. + types (tuple): tuple of all valid types for arg. + arg_names (Union[list, tuple of str]): the names of args. + + Returns: + Exception: when the type is not correct, otherwise nothing. 
+ """ + type_check(args, (list, tuple,), arg_names) + if len(args) != len(arg_names) and not isinstance(arg_names, str): + raise ValueError("List of arguments is not the same length as argument_names.") + if isinstance(arg_names, str): + arg_names = ["{0}[{1}]".format(arg_names, i) for i in range(len(args))] + for arg, arg_name in zip(args, arg_names): + type_check(arg, types, arg_name) diff --git a/mindspore/offline_debug/mi_validators.py b/mindspore/offline_debug/mi_validators.py new file mode 100644 index 00000000000..669c99be774 --- /dev/null +++ b/mindspore/offline_debug/mi_validators.py @@ -0,0 +1,231 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Validator Functions for Offline Debugger APIs. +""" +from functools import wraps + +import mindspore.offline_debug.dbg_services as cds +from mindspore.offline_debug.mi_validator_helpers import parse_user_args, type_check, type_check_list, check_dir, check_uint32, check_uint64 + + +def check_init(method): + """Wrapper method to check the parameters of DbgServices init.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [dump_file_path, verbose], _ = parse_user_args(method, *args, **kwargs) + + type_check(dump_file_path, (str,), "dump_file_path") + type_check(verbose, (bool,), "verbose") + check_dir(dump_file_path) + + return method(self, *args, **kwargs) + + return new_method + + +def check_initialize(method): + """Wrapper method to check the parameters of DbgServices Initialize method.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [net_name, is_sync_mode], _ = parse_user_args(method, *args, **kwargs) + + type_check(net_name, (str,), "net_name") + type_check(is_sync_mode, (bool,), "is_sync_mode") + + return method(self, *args, **kwargs) + + return new_method + + +def check_add_watchpoint(method): + """Wrapper method to check the parameters of DbgServices AddWatchpoint.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [id_value, watch_condition, check_node_list, parameter_list], _ = parse_user_args(method, *args, **kwargs) + + check_uint32(id_value, "id") + check_uint32(watch_condition, "watch_condition") + type_check(check_node_list, (dict,), "check_node_list") + for node_name, node_info in check_node_list.items(): + type_check(node_name, (str,), "node_name") + type_check(node_info, (dict,), "node_info") + for info_name, info_param in node_info.items(): + type_check(info_name, (str,), "node parameter name") + if info_name in ["device_id"]: + if isinstance(info_param, str): + if info_param not in ["*"]: + raise ValueError("Node parameter {} only accepts '*' as string.".format(info_name)) + else: + for param in info_param: + check_uint32(param, "device_id") + elif info_name in ["root_graph_id"]: + if isinstance(info_param, str): + if info_param not in ["*"]: + raise ValueError("Node parameter {} only accepts '*' as string.".format(info_name)) + else: + for param 
in info_param:
+                            check_uint32(param, "root_graph_id")
+                elif info_name in ["is_parameter"]:
+                    type_check(info_param, (bool,), "is_parameter")
+                else:
+                    raise ValueError("Node parameter {} is not defined.".format(info_name))
+        param_names = ["param_{0}".format(i) for i in range(len(parameter_list))]
+        type_check_list(parameter_list, (cds.Parameter,), param_names)
+
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_remove_watchpoint(method):
+    """Wrapper method to check the parameters of DbgServices RemoveWatchpoint."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [id_value], _ = parse_user_args(method, *args, **kwargs)
+
+        check_uint32(id_value, "id")
+
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_check_watchpoints(method):
+    """Wrapper method to check the parameters of DbgServices CheckWatchpoints."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [iteration], _ = parse_user_args(method, *args, **kwargs)
+
+        check_uint32(iteration, "iteration")
+
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_read_tensors(method):
+    """Wrapper method to check the parameters of DbgServices ReadTensors."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [info_list], _ = parse_user_args(method, *args, **kwargs)
+
+        info_names = ["info_{0}".format(i) for i in range(len(info_list))]
+        type_check_list(info_list, (cds.TensorInfo,), info_names)
+
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_initialize_done(method):
+    """Wrapper method to check that initialize has been called for DbgServices."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        if not self.initialized:
+            raise RuntimeError("initialize() should be called before any other methods of DbgServices!")
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_tensor_info_init(method):
+    """Wrapper method to check the parameters of DbgServices TensorInfo init."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [node_name, slot, iteration, device_id, root_graph_id,
+         is_parameter], _ = parse_user_args(method, *args, **kwargs)
+
+        type_check(node_name, (str,), "node_name")
+        check_uint32(slot, "slot")
+        check_uint32(iteration, "iteration")
+        check_uint32(device_id, "device_id")
+        check_uint32(root_graph_id, "root_graph_id")
+        type_check(is_parameter, (bool,), "is_parameter")
+
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_tensor_data_init(method):
+    """Wrapper method to check the parameters of DbgServices TensorData init."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [data_ptr, data_size, dtype, shape], _ = parse_user_args(method, *args, **kwargs)
+
+        type_check(data_ptr, (bytes,), "data_ptr")
+        check_uint64(data_size, "data_size")
+        type_check(dtype, (int,), "dtype")
+        shape_names = ["shape_{0}".format(i) for i in range(len(shape))]
+        type_check_list(shape, (int,), shape_names)
+
+        if len(data_ptr) != data_size:
+            raise ValueError("data_ptr length ({0}) is not equal to data_size ({1}).".format(len(data_ptr), data_size))
+
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_watchpoint_hit_init(method):
+    """Wrapper method to check the parameters of DbgServices WatchpointHit init."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [name, slot, condition, watchpoint_id,
+         parameters, error_code, device_id, root_graph_id], _ = parse_user_args(method, *args, **kwargs)
+
+        type_check(name,
(str,), "name") + check_uint32(slot, "slot") + type_check(condition, (int,), "condition") + check_uint32(watchpoint_id, "watchpoint_id") + param_names = ["param_{0}".format(i) for i in range(len(parameters))] + type_check_list(parameters, (cds.Parameter,), param_names) + type_check(error_code, (int,), "error_code") + check_uint32(device_id, "device_id") + check_uint32(root_graph_id, "root_graph_id") + + return method(self, *args, **kwargs) + + return new_method + + +def check_parameter_init(method): + """Wrapper method to check the parameters of DbgServices Parameter init.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [name, disabled, value, hit, actual_value], _ = parse_user_args(method, *args, **kwargs) + + type_check(name, (str,), "name") + type_check(disabled, (bool,), "disabled") + type_check(value, (float,), "value") + type_check(hit, (bool,), "hit") + type_check(actual_value, (float,), "actual_value") + + return method(self, *args, **kwargs) + + return new_method
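
Taken together, the APIs added in this diff support a complete offline-debugging session: construct a DbgServices object over a dump directory, initialize it, register a watchpoint, then check for hits and read tensors. The sketch below is a minimal usage example rather than part of the patch; the dump path "./dump" and net name "LeNet" are hypothetical placeholders, while watch_condition=6 and the "conv2.bias" node are carried over from the docstring examples above.

    # Minimal offline-debugger session using only the APIs added in this diff.
    # "./dump" and "LeNet" are hypothetical placeholders for a real dump
    # directory and dumped network name.
    from mindspore.offline_debug import dbg_services

    d = dbg_services.DbgServices(dump_file_path="./dump", verbose=False)
    d.initialize(net_name="LeNet", is_sync_mode=True)

    # Watch conv2.bias on device 0, root graph 0 (watch_condition=6 mirrors
    # the docstring examples above).
    d.add_watchpoint(watchpoint_id=1,
                     watch_condition=6,
                     check_node_list={"conv2.bias": {"device_id": [0],
                                                     "root_graph_id": [0],
                                                     "is_parameter": True}},
                     parameter_list=[dbg_services.Parameter(name="param",
                                                            disabled=False,
                                                            value=0.0)])

    # Evaluate the watchpoint at iteration 8; hits come back as WatchpointHit objects.
    for hit in d.check_watchpoints(iteration=8):
        print(hit.name, hit.slot, hit.condition, hit.error_code)

    # Fetch the watched tensor's raw bytes as TensorData objects.
    infos = [dbg_services.TensorInfo(node_name="conv2.bias", slot=0, iteration=8,
                                     device_id=0, root_graph_id=0, is_parameter=True)]
    for t in d.read_tensors(infos):
        print(t.data_size, t.dtype, t.shape)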