Offline debugger

Authors: John Tzanakakis, Adel Shafiei, Amir Lashkari, Islam Amin
John Tzanakakis 2021-04-01 14:24:05 -04:00
parent 3d4a1aaff1
commit da3b13a0e1
43 changed files with 5250 additions and 168 deletions

View File

@@ -63,6 +63,16 @@ install(
COMPONENT mindspore
)
if(CMAKE_SYSTEM_NAME MATCHES "Windows")
message("offline debugger does not support windows system temporarily")
else()
install(
TARGETS _mindspore_offline_debug
DESTINATION ${INSTALL_BASE_DIR}
COMPONENT mindspore
)
endif()
install(
TARGETS mindspore_shared_lib
DESTINATION ${INSTALL_LIB_DIR}
@@ -317,6 +327,18 @@ if(EXISTS ${CMAKE_SOURCE_DIR}/mindspore/dataset)
)
endif()
if(CMAKE_SYSTEM_NAME MATCHES "Windows")
message("offline debugger does not support windows system temporarily")
else()
if(EXISTS ${CMAKE_SOURCE_DIR}/mindspore/offline_debug)
install(
DIRECTORY ${CMAKE_SOURCE_DIR}/mindspore/offline_debug
DESTINATION ${INSTALL_PY_DIR}
COMPONENT mindspore
)
endif()
endif()
## Public header files
install(
DIRECTORY ${CMAKE_SOURCE_DIR}/include

View File

@@ -1,3 +1,6 @@
include_directories(${CMAKE_SOURCE_DIR}/mindspore/ccsrc/debug/)
include_directories(${CMAKE_BINARY_DIR})
set(_DEBUG_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/anf_ir_dump.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/anf_ir_utils.cc"
@@ -8,6 +11,14 @@ set(_DEBUG_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/env_config_parser.cc"
)
set(_OFFLINE_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/tensor_summary.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/offline_debug/offline_logger.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/offline_debug/dbg_services.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/offline_debug/mi_pybind_register.cc"
)
if(ENABLE_DUMP_IR)
file(GLOB_RECURSE _RDR_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "rdr/*.cc")
if(NOT ENABLE_D)
@@ -38,3 +49,13 @@ endif()
set_property(SOURCE ${_DEBUG_SRC_LIST} ${_RDR_SRC_LIST} PROPERTY COMPILE_DEFINITIONS
SUBMODULE_ID=mindspore::SubModuleId::SM_DEBUG)
add_library(_mindspore_debug_obj OBJECT ${_DEBUG_SRC_LIST} ${_RDR_SRC_LIST})
if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
add_compile_options(-Wall -DOFFLINE_DBG_MODE -fPIC -O2)
set_property(SOURCE ${_OFFLINE_SRC_LIST} PROPERTY COMPILE_DEFINITIONS
SUBMODULE_ID=mindspore::SubModuleId::SM_OFFLINE_DEBUG)
add_library(_mindspore_offline_debug SHARED ${_OFFLINE_SRC_LIST})
set_target_properties(_mindspore_offline_debug PROPERTIES
PREFIX "${PYTHON_MODULE_PREFIX}"
SUFFIX "${PYTHON_MODULE_EXTENSION}"
)
endif()
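
Note: the block above builds _mindspore_offline_debug as a Python extension module (hence the PYTHON_MODULE_PREFIX/PYTHON_MODULE_EXTENSION properties), and the install rules earlier skip it on Windows. A minimal sketch for checking whether a given install actually shipped it, importing the same module path the test scripts below use:

import importlib.util

# The offline debugger library is only built and installed on non-Windows
# hosts, so a missing module simply means the feature is unavailable.
try:
    spec = importlib.util.find_spec("mindspore.offline_debug.dbg_services")
except ModuleNotFoundError:
    spec = None
print("offline debugger available:", spec is not None)
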

View File

@@ -13,14 +13,19 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "debug/debug_services.h"
#include <dirent.h>
#include <fstream>
#include <algorithm>
#include <map>
#include <unordered_set>
#ifdef ONLINE_DBG_MODE
#include "backend/session/anf_runtime_algorithm.h"
#include "debug/debug_services.h"
#endif
#include "debug/debugger/tensor_summary.h"
#ifdef ONLINE_DBG_MODE
namespace mindspore {
#endif
DebugServices::DebugServices() {
tensor_loader_ = new TensorLoader();
uint32_t iter_num = -1;
@@ -42,9 +47,11 @@ DebugServices &DebugServices::operator=(const DebugServices &other) {
DebugServices::~DebugServices() { delete tensor_loader_; }
void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
const std::vector<std::tuple<std::string, bool>> &check_node_list,
const std::vector<parameter_t> &parameter_list) {
void DebugServices::AddWatchpoint(
unsigned int id, unsigned int watch_condition, float parameter,
const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
std::lock_guard<std::mutex> lg(lock_);
watchpoint_t watchpoint_item;
@@ -52,6 +59,12 @@ void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition,
watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
watchpoint_item.condition.parameter = parameter;
watchpoint_item.check_node_list = check_node_list;
if (check_node_device_list != nullptr) {
watchpoint_item.check_node_device_list = *check_node_device_list;
}
if (check_node_graph_list != nullptr) {
watchpoint_item.check_node_graph_list = *check_node_graph_list;
}
watchpoint_item.parameter_list = parameter_list;
watchpoint_table[id] = watchpoint_item;
}
@@ -61,122 +74,170 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {
watchpoint_table.erase(id);
}
std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor, void *previous_tensor_ptr,
uint32_t num_elements, int tensor_dtype) {
switch (tensor_dtype) {
case DbgDataType::DT_UINT8: {
return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
}
case DbgDataType::DT_INT8: {
return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
}
case DbgDataType::DT_UINT16: {
return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
}
case DbgDataType::DT_INT16: {
return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
}
case DbgDataType::DT_UINT32: {
return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
}
case DbgDataType::DT_INT32:
case DbgDataType::DT_BASE_INT: {
return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
}
case DbgDataType::DT_UINT64: {
return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
}
case DbgDataType::DT_INT64: {
return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
}
case DbgDataType::DT_FLOAT16: {
return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
}
case DbgDataType::DT_FLOAT32:
case DbgDataType::DT_BASE_FLOAT: {
return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
}
case DbgDataType::DT_FLOAT64: {
return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
}
case DbgDataType::DT_BOOL: {
return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
}
default:
MS_LOG(INFO) << "Unsupported tensor type";
// return a null pointer
return std::unique_ptr<TensorSummary<int32_t>>{};
}
}
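
GetSummaryPtr pulls the dtype dispatch out of CheckWatchpoints: each supported DbgDataType maps to one TensorSummary<T> instantiation, and any other type returns a null pointer, which the caller now checks before calling SummarizeTensor. A rough Python-side sketch of the same supported-type set, using numpy type names purely as stand-ins for the DbgDataType enum:

import numpy as np

# Types with a TensorSummary<T> instantiation in GetSummaryPtr above;
# anything else yields a null summary and the tensor is skipped.
SUMMARIZABLE = {np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32,
                np.uint64, np.int64, np.float16, np.float32, np.float64, np.bool_}

def has_summary(dtype):
    return np.dtype(dtype).type in SUMMARIZABLE
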
#ifdef OFFLINE_DBG_MODE
void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed) {
void *previous_tensor_ptr = nullptr;
std::shared_ptr<TensorData> tensor_prev;
if (previous_iter_tensor_needed && tensor->GetIteration() > 1) {
// read data in offline mode
std::vector<std::shared_ptr<TensorData>> result_list_prev;
ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
std::vector<unsigned int>{tensor->GetDeviceId()},
std::vector<unsigned int>{tensor->GetIteration() - 1},
std::vector<unsigned int>{tensor->GetRootGraphId()}, &result_list_prev);
tensor_prev = result_list_prev[0];
if (!tensor_prev->GetByteSize()) {
tensor_prev.reset();
} else {
previous_tensor_ptr = tensor_prev->GetDataPtr();
}
}
return previous_tensor_ptr;
}
#endif
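
GetPrevTensor reads the iteration N-1 dump only when a change-style condition is being evaluated and N > 1, so change watchpoints (for example watch_condition=18, CHANGE_TOO_LARGE, per the test scripts below) can only hit from the second iteration onward. A one-line sketch of that guard:

def previous_iteration_needed(has_change_condition, iteration):
    # Mirrors the guard in DebugServices::GetPrevTensor above.
    return has_change_condition and iteration > 1
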
void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
const std::string &tensor_name, const std::string &tensor_name_no_slot,
bool *previous_iter_tensor_needed, std::string *qualified_tensor_name,
std::vector<watchpoint_t> *watchpoints_to_check) {
for (auto w_table_item : watchpoint_table) {
auto wp = std::get<1>(w_table_item);
// check ONLY init conditions on initial suspended state.
// skip other conditions on initial suspended state
if (init_dbg_suspend && (wp.condition.type != INIT)) continue;
// skip init condition if not init suspend
if ((wp.condition.type == INIT) && !init_dbg_suspend) continue;
// check change conditions only on step end.
if (wp.change_condition() && !step_end) continue;
// if recheck, ignore the cache results and reanalyze everything.
// if not a recheck, check only unanalyzed tensors
if (!recheck && wp_id_cache[tensor_name].count(wp.id)) continue;
std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
if (!found.empty()) {
*qualified_tensor_name = found;
watchpoints_to_check->push_back(w_table_item.second);
#ifdef OFFLINE_DBG_MODE
if (wp.change_condition()) {
*previous_iter_tensor_needed = true;
}
#endif
}
}
}
void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
const std::string &tensor_name) {
// add analyzed tensor to cache
if (!recheck) {
wp_id_cache[tensor_name].insert(id);
}
}
void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
std::vector<std::vector<parameter_t>> *parameters,
std::vector<int32_t> *error_codes, const std::vector<std::string> &op_overflows,
const std::vector<std::shared_ptr<TensorData>> &tensor_list,
const bool init_dbg_suspend, const bool step_end, const bool recheck) {
std::vector<std::shared_ptr<TensorData>> *tensor_list, const bool init_dbg_suspend,
const bool step_end, const bool recheck, std::vector<unsigned int> *device_id,
std::vector<unsigned int> *root_graph_id) {
std::lock_guard<std::mutex> lg(lock_);
if (watchpoint_table.empty()) return;
for (const auto &tensor : tensor_list) {
for (auto &tensor : *tensor_list) {
#ifdef OFFLINE_DBG_MODE
// read data in offline mode
std::vector<std::shared_ptr<TensorData>> result_list;
ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
std::vector<unsigned int>{tensor->GetDeviceId()},
std::vector<unsigned int>{tensor->GetIteration()},
std::vector<unsigned int>{tensor->GetRootGraphId()}, &result_list);
tensor = result_list[0];
if (!tensor->GetByteSize()) {
tensor.reset();
continue;
}
#endif
const auto tensor_name = tensor->GetName();
const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
const auto tensor_slot = std::to_string(tensor->GetSlot());
mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor();
// no elements to analyze
if (tensor_ptr->DataSize() == 0) continue;
int tensor_dtype = tensor_ptr->data_type_c();
if (tensor->GetByteSize() == 0) continue;
int tensor_dtype = tensor->GetType();
std::vector<watchpoint_t> watchpoints_to_check;
std::string qualified_tensor_name;
for (auto w_table_item : watchpoint_table) {
auto wp = std::get<1>(w_table_item);
// check ONLY init conditions on initial suspended state.
// skip other conditions on initial suspended state
if (init_dbg_suspend && (wp.condition.type != INIT)) continue;
// skip init condition if not init suspend
if ((wp.condition.type == INIT) && !init_dbg_suspend) continue;
// check change conditions only on step end.
if (wp.change_condition() && !step_end) continue;
// if recheck, ignore the cache results and reanalyze everything.
// if not a recheck, check only unanalyzed tensors
if (!recheck && wp_id_cache[tensor_name].count(wp.id)) continue;
std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
if (!found.empty()) {
qualified_tensor_name = found;
watchpoints_to_check.push_back(w_table_item.second);
}
}
bool previous_iter_tensor_needed = false;
// No-op use of the variable when offline debug is off, to prevent an unused-variable warning
(void)previous_iter_tensor_needed;
AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor_name, tensor_name_no_slot,
&previous_iter_tensor_needed, &qualified_tensor_name, &watchpoints_to_check);
// no wp set on current tensor
if (watchpoints_to_check.empty()) continue;
uint32_t num_elements = tensor_ptr->DataSize();
void *previous_tensor_ptr = tensor_loader_->GetPrevTensor(tensor_name)
? tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()
: nullptr;
uint32_t num_elements = tensor->GetNumElements();
#ifdef OFFLINE_DBG_MODE
void *previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed);
#else
void *previous_tensor_ptr =
tensor_loader_->GetPrevTensor(tensor_name) ? tensor_loader_->GetPrevTensor(tensor_name)->GetDataPtr() : nullptr;
#endif
std::unique_ptr<ITensorSummary> base_summary_ptr;
if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
switch (tensor_dtype) {
case kNumberTypeUInt8: {
base_summary_ptr =
std::make_unique<TensorSummary<uint8_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeInt8: {
base_summary_ptr =
std::make_unique<TensorSummary<int8_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeUInt16: {
base_summary_ptr =
std::make_unique<TensorSummary<uint16_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeInt16: {
base_summary_ptr =
std::make_unique<TensorSummary<int16_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeUInt32: {
base_summary_ptr =
std::make_unique<TensorSummary<uint32_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeInt32:
case kNumberTypeInt: {
base_summary_ptr =
std::make_unique<TensorSummary<int32_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeUInt64: {
base_summary_ptr =
std::make_unique<TensorSummary<uint64_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeInt64: {
base_summary_ptr =
std::make_unique<TensorSummary<int64_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeFloat16: {
base_summary_ptr =
std::make_unique<TensorSummary<float16>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeFloat32:
case kNumberTypeFloat: {
base_summary_ptr =
std::make_unique<TensorSummary<float>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeFloat64: {
base_summary_ptr =
std::make_unique<TensorSummary<double>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeBool: {
base_summary_ptr =
std::make_unique<TensorSummary<bool>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
default:
MS_LOG(INFO) << "Unsupported tensor type";
continue;
base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, tensor_dtype);
if (base_summary_ptr != nullptr) {
base_summary_ptr->SummarizeTensor(watchpoints_to_check);
}
base_summary_ptr->SummarizeTensor(watchpoints_to_check);
}
for (auto &wp : watchpoints_to_check) {
bool is_hit = false;
int error_code = 0;
@@ -189,26 +250,439 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
error_code = std::get<1>(item);
parameter_list = std::get<2>(item);
}
// add analyzed tensor to cache
if (!recheck) {
wp_id_cache[tensor_name].insert(wp.id);
}
AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
if (is_hit || error_code) {
name->push_back(qualified_tensor_name);
slot->push_back(tensor_slot);
condition->push_back(wp.condition.type);
watchpoint_id->push_back(wp.id);
if (device_id != nullptr) {
device_id->push_back(tensor->GetDeviceId());
}
if (root_graph_id != nullptr) {
root_graph_id->push_back(tensor->GetRootGraphId());
}
parameters->push_back(parameter_list);
error_codes->push_back(error_code);
}
}
#ifdef OFFLINE_DBG_MODE
// in offline mode, release the tensor data once it has been checked
tensor.reset();
#endif
}
}
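
With the new device_id and root_graph_id output vectors, offline-mode hits identify which device and root graph produced the offending tensor. From Python these surface as attributes on each hit, as the expected outputs further down show; a minimal sketch, reusing the debugger_backend handle from the test scripts below:

hits = debugger_backend.check_watchpoints(iteration=2)
for hit in hits:
    # device_id / root_graph_id are the new offline-mode fields.
    print(hit.name, hit.slot, hit.condition, hit.device_id, hit.root_graph_id)
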
#ifdef OFFLINE_DBG_MODE
void DebugServices::GetSlotInfo(const std::string &file_name, const std::string &dump_name,
const std::string &specific_dump_dir, std::vector<size_t> *slot_list) {
if (is_sync_mode) {
// get the slot from the name
std::string delimiter = "_";
unsigned int start_pos = dump_name.length();
unsigned int end_pos = file_name.find(delimiter, start_pos);
std::string item = file_name.substr(start_pos, end_pos - start_pos);
slot_list->push_back(std::stoul(item));
} else {
std::string out_dir = "/tmp/" + file_name;
std::string input_file = specific_dump_dir + "/" + file_name;
std::string log_enabled = DbgLogger::verbose ? "" : "> /dev/null";
std::string convert_command =
"python /usr/local/Ascend/toolkit/tools/operator_cmp/compare/msaccucmp.pyc convert -d " + input_file + " -out " +
out_dir + " -t bin " + log_enabled;
(void)(system(convert_command.c_str()) + 1);
convert_command = "python /usr/local/Ascend/toolkit/tools/operator_cmp/compare/msaccucmp.pyc convert -d " +
input_file + " -out " + out_dir + " -f NCHW -t bin " + log_enabled;
(void)(system(convert_command.c_str()) + 1);
std::string prefix_converted_dump_file_name = file_name + ".output.";
DIR *convert_dir_ptr = opendir(out_dir.c_str());
if (convert_dir_ptr != nullptr) {
struct dirent *convert_dir_contents = nullptr;
while ((convert_dir_contents = readdir(convert_dir_ptr)) != NULL) {
if (convert_dir_contents->d_type == DT_REG) {
std::string converted_file_name = convert_dir_contents->d_name;
std::size_t nd_file = converted_file_name.rfind(".ND.bin");
std::size_t fractal_z_file = converted_file_name.rfind(".FRACTAL_Z.bin");
std::size_t nchw_file = converted_file_name.rfind(".NCHW.bin");
if (nd_file == std::string::npos && nchw_file == std::string::npos && fractal_z_file == std::string::npos) {
continue;
}
std::size_t found_c = converted_file_name.find(prefix_converted_dump_file_name);
if (found_c != 0) {
continue;
}
std::size_t slot_start_pos = prefix_converted_dump_file_name.length();
std::size_t slot_end_pos = converted_file_name.find(".", slot_start_pos) - 1;
std::string slot_item = converted_file_name.substr(slot_start_pos, slot_end_pos - slot_start_pos + 1);
slot_list->push_back(std::stoul(slot_item));
}
}
closedir(convert_dir_ptr);
} else {
MS_LOG(INFO) << out_dir << " directory does not exist!";
}
// std::string delete_cmd = "rm -rf " + out_dir;
// system(delete_cmd.c_str());
}
}
std::size_t DebugServices::GetShapeTypeInfo(const std::string &specific_dump_dir, std::size_t slot,
const std::string &prefix_dump_file_name, std::string *file_name,
std::string *type_name, std::string *out_dir, std::vector<int64_t> *shape) {
std::size_t found = 0;
if (is_sync_mode) {
found = file_name->rfind(prefix_dump_file_name, 0);
} else {
std::string file_name_w_o_prefix = file_name->substr(file_name->find('.') + 1);
found = file_name_w_o_prefix.rfind(prefix_dump_file_name, 0);
}
if (found != 0) {
return found;
}
if (is_sync_mode) {
// found a file, now get the shape and type
// find "_shape_" in the filename
std::string shape_delimiter = "_shape_";
unsigned int str_pos = file_name->find(shape_delimiter) + shape_delimiter.length();
// read '_'-delimited numbers until a non-number is read; that token is the type name
bool number_found = true;
std::string delimiter = "_";
while (number_found) {
unsigned int end_pos = file_name->find(delimiter, str_pos);
std::string item = file_name->substr(str_pos, end_pos - str_pos);
bool is_number = !item.empty() && std::find_if(item.begin(), item.end(),
[](unsigned char c) { return !std::isdigit(c); }) == item.end();
if (is_number) {
shape->push_back(std::stoul(item));
str_pos = end_pos + 1;
} else {
*type_name = item;
number_found = false;
}
}
} else {
*out_dir = "/tmp/" + *file_name;
std::string input_file = specific_dump_dir + "/" + *file_name;
std::string log_enabled = DbgLogger::verbose ? "" : "> /dev/null";
std::string convert_command =
"python /usr/local/Ascend/toolkit/tools/operator_cmp/compare/msaccucmp.pyc convert -d " + input_file + " -out " +
*out_dir + " -t bin " + log_enabled;
(void)(system(convert_command.c_str()) + 1);
convert_command = "python /usr/local/Ascend/toolkit/tools/operator_cmp/compare/msaccucmp.pyc convert -d " +
input_file + " -out " + *out_dir + " -f NCHW -t bin " + log_enabled;
(void)(system(convert_command.c_str()) + 1);
std::string prefix_converted_dump_file_name = *file_name + ".output." + std::to_string(slot);
*file_name = "";
DIR *convert_dir_ptr = opendir(out_dir->c_str());
if (convert_dir_ptr != nullptr) {
struct dirent *convert_dir_contents = nullptr;
while ((convert_dir_contents = readdir(convert_dir_ptr)) != NULL) {
if (convert_dir_contents->d_type == DT_REG) {
std::string converted_file_name = convert_dir_contents->d_name;
std::size_t nd_file = converted_file_name.rfind(".ND.bin");
std::size_t fractal_z_file = converted_file_name.rfind(".FRACTAL_Z.bin");
std::size_t nchw_file = converted_file_name.rfind(".NCHW.bin");
if (nd_file == std::string::npos && nchw_file == std::string::npos && fractal_z_file == std::string::npos) {
continue;
}
std::size_t found_c = converted_file_name.rfind(prefix_converted_dump_file_name, 0);
if (found_c != 0) {
continue;
}
*file_name = converted_file_name;
}
}
closedir(convert_dir_ptr);
} else {
MS_LOG(INFO) << *out_dir << " directory does not exist!";
}
if (*file_name == "") {
MS_LOG(WARNING) << *out_dir << ": no valid files found after msaccucmp conversion";
return 1;
}
// std::string delete_cmd = "rm -rf " + out_dir;
// system(delete_cmd.c_str());
// found a file, now get the shape and type
std::stringstream check_filename(*file_name);
std::vector<std::string> tokens;
std::string intermediate;
while (getline(check_filename, intermediate, '.')) {
tokens.push_back(intermediate);
}
*type_name = tokens[8];
std::string shape_str = tokens[7];
std::stringstream check_shape(shape_str);
while (getline(check_shape, intermediate, '_')) {
shape->push_back(std::stoul(intermediate));
}
}
return 0;
}
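
Both branches recover shape and type purely from the file name. A Python transcription of the two naming conventions the parser assumes (sync dump names carry a literal "_shape_" marker followed by '_'-separated dims and then the type; async names are the dot-separated msaccucmp output, where the C++ above takes token 7 as the shape and token 8 as the type); the example name in the comment is illustrative:

def parse_sync_name(file_name):
    # e.g. "..._shape_32_192_13_13_float16_...": collect dims until the
    # first non-numeric token, which is the type name.
    rest = file_name.split("_shape_", 1)[1].split("_")
    shape, type_name = [], ""
    for item in rest:
        if item.isdigit():
            shape.append(int(item))
        else:
            type_name = item
            break
    return shape, type_name

def parse_async_converted_name(file_name):
    # msaccucmp-converted names are dot-separated; token 7 holds the
    # '_'-joined shape and token 8 the type name, as in the C++ above.
    tokens = file_name.split(".")
    return [int(dim) for dim in tokens[7].split("_")], tokens[8]
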
void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
std::vector<unsigned int> root_graph_id,
std::vector<std::shared_ptr<TensorData>> *result_list) {
for (unsigned int i = 0; i < backend_name.size(); i++) {
// form prefix of the tensor file to read from graph pb node name
std::string dump_style_kernel_name = backend_name[i];
const std::string strsrc = "/";
std::string strdst;
if (is_sync_mode) {
strdst = "--";
} else {
strdst = "_";
}
std::string::size_type pos = 0;
std::string::size_type srclen = strsrc.size();
std::string::size_type dstlen = strdst.size();
// remove slot from name
std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
while ((pos = dump_style_kernel_name.find(strsrc, pos)) != std::string::npos) {
dump_style_kernel_name.replace(pos, srclen, strdst);
pos += dstlen;
}
std::string prefix_dump_file_name = dump_style_kernel_name;
if (is_sync_mode) {
prefix_dump_file_name += "_output_" + std::to_string(slot[i]) + "_";
}
std::string specific_dump_dir;
if (is_sync_mode) {
specific_dump_dir =
dump_dir + "/device_" + std::to_string(device_id[i]) + "/iteration_" + std::to_string(iteration[i]);
} else {
specific_dump_dir = dump_dir + "/device_" + std::to_string(device_id[i]) + "/" + net_name + "_graph_" +
std::to_string(root_graph_id[i]) + "/" + std::to_string(root_graph_id[i]) + "/" +
std::to_string(iteration[i]);
}
// search files in dir for the one that meets the filename prefix and read the file into memory
DIR *d;
d = opendir(specific_dump_dir.c_str());
std::vector<char> *buffer = NULL;
std::string type_name = "";
std::vector<int64_t> shape;
uint64_t data_size = 0;
if (d != nullptr) {
struct dirent *dir = nullptr;
while ((dir = readdir(d)) != NULL) {
if (dir->d_type == DT_REG) {
std::string file_name = dir->d_name;
std::string out_dir;
std::size_t found = GetShapeTypeInfo(specific_dump_dir, slot[i], prefix_dump_file_name, &file_name,
&type_name, &out_dir, &shape);
if (found != 0) {
continue;
}
// read the tensor data from the file
std::string file_path;
if (is_sync_mode) {
file_path = specific_dump_dir + "/" + file_name;
} else {
file_path = out_dir + "/" + file_name;
}
std::ifstream infile;
infile.open(file_path.c_str(), std::ios::binary | std::ios::ate);
if (!infile.is_open()) {
MS_LOG(ERROR) << "Failed to open bin file " << file_name;
break;
}
uint64_t file_size = infile.tellg();
infile.seekg(0, std::ios::beg);
buffer = new std::vector<char>(file_size);
if (!infile.read(buffer->data(), file_size)) {
MS_LOG(ERROR) << "Failed to read in bin file " << file_name;
break;
}
data_size = file_size;
infile.close();
}
}
closedir(d);
} else {
MS_LOG(INFO) << specific_dump_dir << " directory does not exist!";
}
// call LoadNewTensor to store tensor in internal cache
auto tensor_data = std::make_shared<TensorData>();
tensor_data->SetName(backend_name[i]);
tensor_data->SetExecutionOrder(0);
tensor_data->SetSlot(slot[i]);
tensor_data->SetIteration(iteration[i]);
tensor_data->SetDeviceId(device_id[i]);
tensor_data->SetRootGraphId(root_graph_id[i]);
if (data_size) {
tensor_data->SetDataPtr(buffer->data());
} else {
tensor_data->SetDataPtr(NULL);
}
tensor_data->SetByteSize(data_size);
tensor_data->SetType(type_name);
tensor_data->SetShape(shape);
if (data_size) {
tensor_loader_->LoadNewTensor(tensor_data, false);
}
// add to result_list
result_list->push_back(tensor_data);
}
}
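
The directory scanned above differs per mode. A small helper mirroring the two layouts (dump_dir and net_name are the values passed to SetDumpDir/SetNetName):

def specific_dump_dir(dump_dir, net_name, device_id, root_graph_id, iteration, is_sync_mode):
    # Mirrors the path construction in DebugServices::ReadDumpedTensor.
    if is_sync_mode:
        return "{}/device_{}/iteration_{}".format(dump_dir, device_id, iteration)
    return "{}/device_{}/{}_graph_{}/{}/{}".format(
        dump_dir, device_id, net_name, root_graph_id, root_graph_id, iteration)
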
void ReplaceSrcFileName(const bool is_sync_mode, std::string *dump_style_name) {
const std::string strsrc = "/";
std::string strdst;
if (is_sync_mode) {
strdst = "--";
} else {
strdst = "_";
}
std::string::size_type pos = 0;
std::string::size_type srclen = strsrc.size();
std::string::size_type dstlen = strdst.size();
while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) {
dump_style_name->replace(pos, srclen, strdst);
pos += dstlen;
}
}
std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(unsigned int iteration) {
// get a list of nodes and the devices they are on to monitor
std::vector<std::shared_ptr<TensorData>> tensor_list;
std::map<std::tuple<uint32_t, uint32_t>, std::unordered_set<std::string>> device_and_graph_to_nodes;
for (auto w_table_item : watchpoint_table) {
auto wp = std::get<1>(w_table_item);
unsigned int index = 0;
for (auto check_node : wp.check_node_list) {
std::string w_name = std::get<0>(check_node);
bool w_is_param = std::get<1>(check_node);
std::string node_name = w_name;
if (w_is_param) {
std::size_t found = node_name.find_last_of("/");
node_name = node_name.substr(found + 1);
}
std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
for (auto device : devices) {
for (auto graph : graphs) {
std::tuple<uint32_t, uint32_t> key(device, graph);
device_and_graph_to_nodes[key].insert(node_name);
}
}
index++;
}
}
// scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
// as they are found
for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
uint32_t device_id = std::get<0>(device_and_graph);
uint32_t root_graph_id = std::get<1>(device_and_graph);
std::unordered_set<std::string> wp_nodes = device_and_graph_item.second;
std::vector<std::tuple<std::string, std::string>> proto_to_dump;
std::string specific_dump_dir;
if (is_sync_mode) {
specific_dump_dir = dump_dir + "/device_" + std::to_string(device_id) + "/iteration_" + std::to_string(iteration);
} else {
specific_dump_dir = dump_dir + "/device_" + std::to_string(device_id) + "/" + net_name + "_graph_" +
std::to_string(root_graph_id) + "/" + std::to_string(root_graph_id) + "/" +
std::to_string(iteration);
}
// convert node names to dump style
for (auto node : wp_nodes) {
std::string orig_name = node;
std::string dump_style_name = node;
ReplaceSrcFileName(is_sync_mode, &dump_style_name);
if (is_sync_mode) {
dump_style_name.append("_output_");
}
proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
}
// search files in dir for the one that meets the filename prefix and read the file into memory
DIR *d;
d = opendir(specific_dump_dir.c_str());
if (d != nullptr) {
struct dirent *dir = nullptr;
while ((dir = readdir(d)) != NULL) {
if (dir->d_type == DT_REG) {
std::string file_name = dir->d_name;
for (auto &node : proto_to_dump) {
std::string dump_name = std::get<1>(node);
std::size_t found = 0;
if (is_sync_mode) {
found = file_name.rfind(dump_name, 0);
} else {
std::string file_name_w_o_prefix = file_name.substr(file_name.find('.') + 1);
found = file_name_w_o_prefix.rfind(dump_name, 0);
}
if (found == 0) {
std::vector<size_t> slot_list;
GetSlotInfo(file_name, dump_name, specific_dump_dir, &slot_list);
for (auto slot : slot_list) {
// add a TensorData entry (data will be read when needed)
std::vector<int64_t> shape;
std::string orig_name = std::get<0>(node);
auto tensor_data = std::make_shared<TensorData>();
tensor_data->SetName(orig_name);
tensor_data->SetExecutionOrder(0);
tensor_data->SetSlot(slot);
tensor_data->SetIteration(iteration);
tensor_data->SetDeviceId(device_id);
tensor_data->SetRootGraphId(root_graph_id);
tensor_data->SetDataPtr(NULL);
tensor_data->SetByteSize(0);
tensor_data->SetType("");
tensor_data->SetShape(shape);
tensor_list.push_back(tensor_data);
}
break;
}
}
}
}
}
}
return tensor_list;
}
#endif
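
ReadNeededDumpedTensors first inverts the watchpoint table into a (device, graph) -> node-names map and then scans only those dump directories, creating placeholder TensorData entries (byte size 0) whose data is read later in CheckWatchpoints. Roughly, the inversion step in Python terms (the tuple layout here is illustrative):

from collections import defaultdict

def device_graph_to_nodes(check_nodes):
    # check_nodes: iterable of (node_name, is_parameter, devices, graphs),
    # mirroring check_node_list / check_node_device_list / check_node_graph_list.
    mapping = defaultdict(set)
    for node_name, is_parameter, devices, graphs in check_nodes:
        if is_parameter:
            node_name = node_name.rsplit("/", 1)[-1]  # parameters match on short name
        for device in devices:
            for graph in graphs:
                mapping[(device, graph)].add(node_name)
    return mapping
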
void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size,
std::vector<TypePtr> *dtype, std::vector<std::vector<int64_t>> *shape) {
std::vector<unsigned int> *dtype, std::vector<std::vector<int64_t>> *shape) {
std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
tensor_loader_->SearchTensors(name, &result_list);
@@ -217,13 +691,14 @@ void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector<
continue;
}
ret_name->push_back(std::get<0>(result));
data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetTensor()->data_c()));
data_size->push_back(std::get<1>(result)->GetTensor()->data().nbytes());
dtype->push_back(std::get<1>(result)->GetTensor()->Dtype());
shape->push_back(std::get<1>(result)->GetTensor()->shape());
data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetDataPtr()));
data_size->push_back(std::get<1>(result)->GetByteSize());
dtype->push_back(std::get<1>(result)->GetType());
shape->push_back(std::get<1>(result)->GetShape());
}
}
#ifdef ONLINE_DBG_MODE
bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
bool ret = false;
for (auto w_table_item : watchpoint_table) {
@@ -256,6 +731,7 @@ bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNode
return false;
}
}
#endif
void DebugServices::EmptyTensor() { tensor_loader_->EmptyTensor(); }
@@ -273,6 +749,7 @@ void DebugServices::EmptyPrevTensor() { tensor_loader_->EmptyPrevTensor(); }
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
#ifdef ONLINE_DBG_MODE
bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
const std::string &host_fmt, const std::vector<int64_t> &host_shape,
TypeId host_type, TypeId addr_type_id, const std::string &addr_format,
@@ -280,6 +757,7 @@ bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_
return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
addr_type_id, addr_format, slot);
}
#endif
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
return tensor_loader_->LoadNewTensor(tensor, keep_prev);
@@ -298,6 +776,7 @@ void DebugServices::ResetLoadedTensors() {
tensor_loader_->SwapCurrentPrev();
}
#ifdef ONLINE_DBG_MODE
std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
MS_EXCEPTION_IF_NULL(kernel);
std::vector<std::shared_ptr<TensorData>> result;
@@ -310,6 +789,8 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNod
}
return result;
}
#endif
bool DebugServices::TensorExistsInCurrent(std::string tensor_name) {
return tensor_loader_->TensorExistsInCurrent(tensor_name);
}
@@ -317,4 +798,18 @@ void DebugServices::MoveTensorCurrentToPrev(std::string tensor_name) {
tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
void DebugServices::SetNetName(std::string net_name) { this->net_name = net_name; }
std::string DebugServices::GetNetName() { return net_name; }
void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir = dump_dir; }
std::string DebugServices::GetDumpDir() { return dump_dir; }
void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode = is_sync_mode; }
bool DebugServices::GetSyncMode() { return is_sync_mode; }
#ifdef ONLINE_DBG_MODE
} // namespace mindspore
#endif

View File

@@ -16,6 +16,17 @@
#ifndef MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
#define MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
#ifndef OFFLINE_DBG_MODE
#define ONLINE_DBG_MODE
#endif
#ifdef OFFLINE_DBG_MODE
#include "Eigen/Core"
#include "Eigen/src/Core/arch/CUDA/Half.h"
using float16 = Eigen::half;
#include "debugger/offline_debug/offline_logger.h"
#endif
#include <math.h>
#include <vector>
#include <string>
@@ -26,11 +37,13 @@
#include <mutex>
#include <map>
#include <limits>
#include <sstream>
#include "debug/tensor_load.h"
#include "debug/tensor_data.h"
#include "ir/dtype.h"
#ifdef ONLINE_DBG_MODE
namespace mindspore {
#endif
class DebugServices {
public:
DebugServices();
@@ -103,6 +116,8 @@ class DebugServices {
unsigned int id;
condition_t condition;
std::vector<std::tuple<std::string, bool>> check_node_list;
std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_device_list;
std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_graph_list;
std::vector<parameter_t> parameter_list;
size_t location = 0;
@@ -167,30 +182,55 @@
}
} watchpoint_t;
void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
const std::vector<std::tuple<std::string, bool>> &check_node_list,
const std::vector<parameter_t> &parameter_list);
void AddWatchpoint(
unsigned int id, unsigned int watch_condition, float parameter,
const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list = nullptr,
const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list = nullptr);
void RemoveWatchpoint(unsigned int id);
void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
std::vector<unsigned int> *watchpoint_id, std::vector<std::vector<parameter_t>> *parameters,
std::vector<int32_t> *error_code, const std::vector<std::string> &op_overflows,
const std::vector<std::shared_ptr<TensorData>> &tensor_list, bool init_dbg_suspend,
const bool step_end, const bool recheck);
std::vector<std::shared_ptr<TensorData>> *tensor_list, bool init_dbg_suspend,
const bool step_end, const bool recheck, std::vector<unsigned int> *device_id = nullptr,
std::vector<unsigned int> *root_graph_id = nullptr);
void AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck, const std::string &tensor_name,
const std::string &tensor_name_no_slot, bool *previous_iter_tensor_needed,
std::string *qualified_tensor_name, std::vector<watchpoint_t> *watchpoints_to_check);
#ifdef OFFLINE_DBG_MODE
void GetSlotInfo(const std::string &file_name, const std::string &dump_name, const std::string &specific_dump_dir,
std::vector<size_t> *slot_list);
std::size_t GetShapeTypeInfo(const std::string &specific_dump_dir, std::size_t slot,
const std::string &prefix_dump_file_name, std::string *file_name, std::string *type_name,
std::string *out_dir, std::vector<int64_t> *shape);
void ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
std::vector<unsigned int> root_graph_id, std::vector<std::shared_ptr<TensorData>> *result_list);
std::vector<std::shared_ptr<TensorData>> ReadNeededDumpedTensors(unsigned int iteration);
void *GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed);
#endif
void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size, std::vector<TypePtr> *dtype,
std::vector<std::vector<int64_t>> *shape);
std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size,
std::vector<unsigned int> *dtype, std::vector<std::vector<int64_t>> *shape);
#ifdef ONLINE_DBG_MODE
bool IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel = nullptr) const;
bool IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const;
#endif
void EmptyTensor();
std::vector<std::shared_ptr<TensorData>> GetTensor() const;
void AddAnalyzedTensorToCache(const bool recheck, const unsigned int id, const std::string &tensor_name);
std::vector<std::shared_ptr<TensorData>> GetNodeTensorMap(const std::string &node_name) const;
uint32_t GetTensorLoaderIterNum() const;
@@ -201,31 +241,51 @@
void EmptyCurrentTensor();
#ifdef ONLINE_DBG_MODE
bool DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
const std::string &host_fmt, const std::vector<int64_t> &host_shape, TypeId host_type,
TypeId addr_type_id, const std::string &addr_format, size_t slot) const;
#endif
bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);
std::unordered_map<unsigned int, watchpoint_t> GetWatchpointTable();
void ResetLoadedTensors();
#ifdef ONLINE_DBG_MODE
std::vector<std::shared_ptr<TensorData>> GetNodeTensor(const CNodePtr &kernel);
#endif
bool TensorExistsInCurrent(std::string tensor_name);
void MoveTensorCurrentToPrev(std::string tensor_name);
void SetNetName(std::string net_name);
std::string GetNetName();
void SetDumpDir(std::string dump_dir);
std::string GetDumpDir();
void SetSyncMode(bool is_sync_mode);
bool GetSyncMode();
private:
std::mutex lock_;
// to keep track of watchpoints that have been checked already for a tensor in current step
std::unordered_map<std::string, std::set<int32_t>> wp_id_cache;
std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
std::string net_name;
std::string dump_dir;
bool is_sync_mode;
TensorLoader *tensor_loader_;
};
#ifdef ONLINE_DBG_MODE
} // namespace mindspore
#endif
#endif // MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_

View File

@@ -755,7 +755,7 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
std::vector<std::string> ret_name;
std::vector<char *> data_ptr;
std::vector<ssize_t> data_size;
std::vector<TypePtr> dtype;
std::vector<unsigned int> dtype;
std::vector<std::vector<int64_t>> shape;
std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
@@ -789,7 +789,7 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
tensor_item.set_tensor_content(data_ptr[result_index] + size_iter, chunk_size);
tensor_item.set_data_type(GetDebuggerNumberDataType(dtype[result_index]));
tensor_item.set_data_type((debugger::DataType)dtype[result_index]);
for (auto &elem : shape[result_index]) {
tensor_item.add_dims(elem);
}
@@ -827,7 +827,7 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
tensor_list = debug_services_->GetNodeTensor(kernel);
}
debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
tensor_list, initial_suspend_, watchnode.empty(), recheck);
&tensor_list, initial_suspend_, watchnode.empty(), recheck);
std::list<WatchpointHit> hits;
for (unsigned int i = 0; i < name.size(); i++) {
WatchpointHit hit;

View File

@@ -0,0 +1,28 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op169
slot = 0
iteration = 2
device_id = None
root_graph_id = 1
is_parameter = False
tensor_data_1 attributes:
data (printed in uint8) = [149 167 124 ... 158 212 164]
size in bytes = 2076672
debugger dtype = 10
shape = [32, 192, 13, 13]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/ReLUV2-op348
slot = 1
iteration = 2
device_id = None
root_graph_id = 1
is_parameter = False
tensor_data_2 attributes:
data (printed in uint8) = [ 20 21 18 ... 126 98 25]
size in bytes = 129792
debugger dtype = 6
shape = [32, 12, 13, 13, 2]

View File

@@ -0,0 +1,72 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
import numpy as np
def main():
debugger_backend = d.DbgServices(
dump_file_path="/opt/nvme2n1/j00455527/dumps/async_sink_true/032421")
_ = debugger_backend.initialize(net_name="alexnet", is_sync_mode=False)
# output tensor with zero slot
info1 = d.TensorInfo(node_name="Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/"
"conv3-Conv2d/Conv2D-op169",
slot=0, iteration=2, device_id=0, root_graph_id=1, is_parameter=False)
# output tensor with non-zero slot
info2 = d.TensorInfo(node_name="Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/"
"ReLUV2-op348",
slot=1, iteration=2, device_id=0, root_graph_id=1, is_parameter=False)
tensor_info = [info1, info2]
tensor_data = debugger_backend.read_tensors(tensor_info)
print_read_tensors(tensor_info, tensor_data)
def print_read_tensors(tensor_info, tensor_data):
"""Print read tensors."""
for x, _ in enumerate(tensor_info):
print("-----------------------------------------------------------")
print("tensor_info_" + str(x+1) + " attributes:")
print("node name = ", tensor_info[x].node_name)
print("slot = ", tensor_info[x].slot)
print("iteration = ", tensor_info[x].iteration)
print("device_id = ", tensor_info[x].device_id)
print("root_graph_id = ", tensor_info[x].root_graph_id)
print("is_parameter = ", tensor_info[x].is_parameter)
print()
print("tensor_data_" + str(x+1) + " attributes:")
print("data (printed in uint8) = ", np.frombuffer(
tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size))
py_byte_size = len(tensor_data[x].data_ptr)
c_byte_size = tensor_data[x].data_size
if c_byte_size != py_byte_size:
print("The python byte size of ", py_byte_size,
" does not match the C++ byte size of ", c_byte_size)
print("size in bytes = ", tensor_data[x].data_size)
print("debugger dtype = ", tensor_data[x].dtype)
print("shape = ", tensor_data[x].shape)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,14 @@
-----------------------------------------------------------
watchpoint_hit for test_1 attributes:
name = Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op169
slot = 0
condition = 6
watchpoint_id = 1
parameter 0 name = param
parameter 0 disabled = False
parameter 0 value = 0.0
parameter 0 hit = True
parameter 0 actual_value = -0.1417236328125
error code = 0
device_id = 0
root_graph_id = 1

View File

@@ -0,0 +1,92 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Watchpoints test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
def main():
debugger_backend = d.DbgServices(
dump_file_path="/opt/nvme2n1/j00455527/dumps/async_sink_true/032421")
_ = debugger_backend.initialize(net_name="alexnet", is_sync_mode=False)
# NOTES:
# -> watch_condition=6 is MIN_LT
# -> watch_condition=18 is CHANGE_TOO_LARGE
# test 1: watchpoint set and hit (watch_condition=6)
param1 = d.Parameter(name="param", disabled=False, value=0.0)
_ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6,
check_node_list={"Default/network-TrainOneStepCell/network-WithLossCell/"
"_backbone-AlexNet/conv3-Conv2d/Conv2D-op169":
{"device_id": [0], "root_graph_id": [1], "is_parameter": False
}}, parameter_list=[param1])
watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2)
if len(watchpoint_hits_test_1) != 1:
print("ERROR -> test 1: watchpoint set but not hit just once")
print_watchpoint_hits(watchpoint_hits_test_1, 1)
# test 2: watchpoint remove and ensure it's not hit
_ = debugger_backend.remove_watchpoint(watchpoint_id=1)
watchpoint_hits_test_2 = debugger_backend.check_watchpoints(iteration=2)
if watchpoint_hits_test_2:
print("ERROR -> test 2: watchpoint removed but hit")
# test 3: watchpoint set and not hit, then remove
param2 = d.Parameter(name="param", disabled=False, value=-1000.0)
_ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6,
check_node_list={"Default/network-TrainOneStepCell/network-WithLossCell/"
"_backbone-AlexNet/conv3-Conv2d/Conv2D-op169":
{"device_id": [0], "root_graph_id": [1], "is_parameter": False
}}, parameter_list=[param2])
watchpoint_hits_test_3 = debugger_backend.check_watchpoints(iteration=2)
if watchpoint_hits_test_3:
print("ERROR -> test 3: watchpoint set but not supposed to be hit")
_ = debugger_backend.remove_watchpoint(watchpoint_id=2)
def print_watchpoint_hits(watchpoint_hits, test_id):
"""Print watchpoint hits."""
for x, _ in enumerate(watchpoint_hits):
print("-----------------------------------------------------------")
print("watchpoint_hit for test_%u attributes:" % test_id)
print("name = ", watchpoint_hits[x].name)
print("slot = ", watchpoint_hits[x].slot)
print("condition = ", watchpoint_hits[x].condition)
print("watchpoint_id = ", watchpoint_hits[x].watchpoint_id)
for p, _ in enumerate(watchpoint_hits[x].parameters):
print("parameter ", p, " name = ",
watchpoint_hits[x].parameters[p].name)
print("parameter ", p, " disabled = ",
watchpoint_hits[x].parameters[p].disabled)
print("parameter ", p, " value = ",
watchpoint_hits[x].parameters[p].value)
print("parameter ", p, " hit = ",
watchpoint_hits[x].parameters[p].hit)
print("parameter ", p, " actual_value = ",
watchpoint_hits[x].parameters[p].actual_value)
print("error code = ", watchpoint_hits[x].error_code)
print("device_id = ", watchpoint_hits[x].device_id)
print("root_graph_id = ", watchpoint_hits[x].root_graph_id)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,49 @@
python sync_trans_false_read_tensors.py > sync_trans_false_read_tensors.actual
sed -i '/\[WARNING\]/d' sync_trans_false_read_tensors.actual
sed -i '/Deprecated/d' sync_trans_false_read_tensors.actual
diff sync_trans_false_read_tensors.actual sync_trans_false_read_tensors.expected
if [ $? -eq 0 ]; then
echo sync_trans_false_read_tensors PASSED
else
echo sync_trans_false_read_tensors FAILED
fi
python sync_trans_true_read_tensors.py > sync_trans_true_read_tensors.actual
sed -i '/\[WARNING\]/d' sync_trans_true_read_tensors.actual
sed -i '/Deprecated/d' sync_trans_true_read_tensors.actual
diff sync_trans_true_read_tensors.actual sync_trans_true_read_tensors.expected
if [ $? -eq 0 ]; then
echo sync_trans_true_read_tensors PASSED
else
echo sync_trans_true_read_tensors FAILED
fi
python sync_trans_false_watchpoints.py > sync_trans_false_watchpoints.actual
sed -i '/\[WARNING\]/d' sync_trans_false_watchpoints.actual
sed -i '/Deprecated/d' sync_trans_false_watchpoints.actual
diff sync_trans_false_watchpoints.actual sync_trans_false_watchpoints.expected
if [ $? -eq 0 ]; then
echo sync_trans_false_watchpoints PASSED
else
echo sync_trans_false_watchpoints FAILED
fi
python async_sink_mode_true_read_tensors.py > async_sink_mode_true_read_tensors.actual
sed -i '/\[WARNING\]/d' async_sink_mode_true_read_tensors.actual
sed -i '/Deprecated/d' async_sink_mode_true_read_tensors.actual
diff async_sink_mode_true_read_tensors.actual async_sink_mode_true_read_tensors.expected
if [ $? -eq 0 ]; then
echo async_sink_mode_true_read_tensors PASSED
else
echo async_sink_mode_true_read_tensors FAILED
fi
python async_sink_mode_true_watchpoints.py > async_sink_mode_true_watchpoints.actual
sed -i '/\[WARNING\]/d' async_sink_mode_true_watchpoints.actual
sed -i '/Deprecated/d' async_sink_mode_true_watchpoints.actual
diff async_sink_mode_true_watchpoints.actual async_sink_mode_true_watchpoints.expected
if [ $? -eq 0 ]; then
echo async_sink_mode_true_watchpoints PASSED
else
echo async_sink_mode_true_watchpoints FAILED
fi

View File

@@ -0,0 +1,70 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = True
tensor_data_1 attributes:
data (printed in uint8) = [170 19 44 181 254 212 16 52 52 162 148 180 130 115 226 180 183 243
101 52 224 79 189 51 10 70 69 51 199 75 159 52 79 98 104 52
106 77 19 52 129 183 8 180 252 58 48 180 35 219 9 52 240 201
179 51 142 151 158 51 210 145 182 53 140 219 0 53 140 219 22 181
46 33 87 180 238 90 122 180 166 10 38 179 202 195 4 53 166 10
150 51 214 120 209 52 235 115 37 180 92 177 215 180 0 136 84 51
72 114 145 180 43 169 255 180 114 27 61 52 76 225 122 50 126 72
159 51 58 35 202 51 114 61 106 51 60 223 63 52 209 179 1 52
232 217 44 178 130 158 109 179 213 231 10 179 37 40 94 179 208 68
64 53 6 52 249 52 162 35 1 181 231 29 155 52 30 201 69 180
229 131 126 51 18 165 109 180 164 112 163 181 116 172 11 178 6 129
37 52 54 205 203 180 115 104 145 52 232 106 219 179 36 40 214 52
202 50 204 52 76 89 38 179 230 140 232 178 168 53 77 52 180 191
108 51 128 183 64 51 56 137 161 180 247 6 143 180 126 63 197 180
198 177 94 52 140 185 139 51 150 178 228 180 255 67 150 52 134 201
164 52 107 43 14 53 174 216 63 179 40 160 41 53 120 88 72 179
218 172 234 52 234 38 25 52 85 159 155 180 254 67 138 180 34 253
118 180 218 61 17 52 242 133 253 52 175 37 180 52 171 62 163 52
202 195 86 53 160 171 45 52 34 31 176 180 156 85 5 53 178 191
68 180 42 203 140 52 248 117 72 52 248 253 212 176 195 100 202 51
87 14 141 52 91 100 235 51 48 221 136 52 143 117 17 180 51 196
25 52 127 29 112 180 152 144 207 178 219 104 64 52 21 174 251 52
164 78 138 181 20 63 6 52 10 249 96 179 163 146 18 53 200 186
236 52 2 188 85 52 124 140 121 179 246 185 22 181 246 74 249 51
70 182 135 53 189 227 76 52 249 160 159 180 134 235 65 53 64 164
255 51 224 156 41 53 142 117 69 181 247 151 101 53 185 175 35 52
164 112 21 53 30 31 212 179 142 151 110 179 176 148 29 181 206 204
88 53 116 215 214 180 172 173 216 51 106 222 153 180 200 152 19 181
176 3 7 52 215 52 87 52]
size in bytes = 512
debugger dtype = 11
shape = [128]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op168
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_2 attributes:
data (printed in uint8) = [181 167 46 ... 12 204 164]
size in bytes = 2076672
debugger dtype = 10
shape = [32, 12, 13, 13, 16]
-----------------------------------------------------------
tensor_info_3 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op346
slot = 1
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_3 attributes:
data (printed in uint8) = [ 50 17 122 ... 94 42 90]
size in bytes = 129792
debugger dtype = 6
shape = [32, 12, 13, 13, 2]

View File

@@ -0,0 +1,74 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
import numpy as np
def main():
debugger_backend = d.DbgServices(
dump_file_path="/opt/nvme2n1/j00455527/dumps/sync_trans_false/032421/alexnet")
_ = debugger_backend.initialize(
net_name="Network Name goes here!", is_sync_mode=True)
# parameter
info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias",
slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True)
# output tensor with zero slot
info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op168",
slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)
# output tensor with non-zero slot
info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op346",
slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)
tensor_info = [info1, info2, info3]
tensor_data = debugger_backend.read_tensors(tensor_info)
print_read_tensors(tensor_info, tensor_data)
def print_read_tensors(tensor_info, tensor_data):
"""Print read tensors."""
for x, _ in enumerate(tensor_info):
print("-----------------------------------------------------------")
print("tensor_info_" + str(x+1) + " attributes:")
print("node name = ", tensor_info[x].node_name)
print("slot = ", tensor_info[x].slot)
print("iteration = ", tensor_info[x].iteration)
print("device_id = ", tensor_info[x].device_id)
print("root_graph_id = ", tensor_info[x].root_graph_id)
print("is_parameter = ", tensor_info[x].is_parameter)
print()
print("tensor_data_" + str(x+1) + " attributes:")
print("data (printed in uint8) = ", np.frombuffer(
tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size))
py_byte_size = len(tensor_data[x].data_ptr)
c_byte_size = tensor_data[x].data_size
if c_byte_size != py_byte_size:
print("The python byte size of ", py_byte_size,
" does not match the C++ byte size of ", c_byte_size)
print("size in bytes = ", tensor_data[x].data_size)
print("debugger dtype = ", tensor_data[x].dtype)
print("shape = ", tensor_data[x].shape)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,33 @@
-----------------------------------------------------------
watchpoint_hit for test_1 attributes:
name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op168
slot = 0
condition = 6
watchpoint_id = 1
parameter 0 name = param
parameter 0 disabled = False
parameter 0 value = 0.0
parameter 0 hit = True
parameter 0 actual_value = -0.14013671875
error code = 0
device_id = 0
root_graph_id = 0
-----------------------------------------------------------
watchpoint_hit for test_4 attributes:
name = Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc3.bias
slot = 0
condition = 18
watchpoint_id = 3
parameter 0 name = abs_mean_update_ratio_gt
parameter 0 disabled = False
parameter 0 value = 0.0
parameter 0 hit = True
parameter 0 actual_value = 0.5243796973599475
parameter 1 name = epsilon
parameter 1 disabled = True
parameter 1 value = 0.0
parameter 1 hit = False
parameter 1 actual_value = 0.0
error code = 0
device_id = 0
root_graph_id = 0

View File

@@ -0,0 +1,109 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Watchpoints test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
def main():
debugger_backend = d.DbgServices(
dump_file_path="/opt/nvme2n1/j00455527/dumps/sync_trans_false/032421/alexnet")
_ = debugger_backend.initialize(
net_name="Network Name goes here!", is_sync_mode=True)
# NOTES:
# -> watch_condition=6 is MIN_LT
# -> watch_condition=18 is CHANGE_TOO_LARGE
# test 1: watchpoint set and hit (watch_condition=6)
param1 = d.Parameter(name="param", disabled=False, value=0.0)
_ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6,
check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/"
"Conv2D-op168":
{"device_id": [0], "root_graph_id": [0], "is_parameter": False
}}, parameter_list=[param1])
watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2)
if len(watchpoint_hits_test_1) != 1:
print("ERROR -> test 1: watchpoint set but not hit just once")
print_watchpoint_hits(watchpoint_hits_test_1, 1)
# test 2: watchpoint remove and ensure it's not hit
_ = debugger_backend.remove_watchpoint(watchpoint_id=1)
watchpoint_hits_test_2 = debugger_backend.check_watchpoints(iteration=2)
if watchpoint_hits_test_2:
print("ERROR -> test 2: watchpoint removed but hit")
# test 3: watchpoint set and not hit, then remove
param2 = d.Parameter(name="param", disabled=False, value=-1000.0)
_ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6,
check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/"
"Conv2D-op308":
{"device_id": [0], "root_graph_id": [0], "is_parameter": False
}}, parameter_list=[param2])
watchpoint_hits_test_3 = debugger_backend.check_watchpoints(iteration=2)
if watchpoint_hits_test_3:
print("ERROR -> test 3: watchpoint set but not supposed to be hit")
_ = debugger_backend.remove_watchpoint(watchpoint_id=2)
# test 4: weight change watchpoint set and hit
param_abs_mean_update_ratio_gt = d.Parameter(
name="abs_mean_update_ratio_gt", disabled=False, value=0.0)
param_epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0)
_ = debugger_backend.add_watchpoint(watchpoint_id=3, watch_condition=18,
check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/"
"Parameter[6]_11/fc3.bias":
{"device_id": [0], "root_graph_id": [0], "is_parameter": True
}}, parameter_list=[param_abs_mean_update_ratio_gt,
param_epsilon])
watchpoint_hits_test_4 = debugger_backend.check_watchpoints(iteration=3)
if len(watchpoint_hits_test_4) != 1:
print("ERROR -> test 4: watchpoint weight change set but not hit just once")
print_watchpoint_hits(watchpoint_hits_test_4, 4)
def print_watchpoint_hits(watchpoint_hits, test_id):
"""Print watchpoint hits."""
for x, _ in enumerate(watchpoint_hits):
print("-----------------------------------------------------------")
print("watchpoint_hit for test_%u attributes:" % test_id)
print("name = ", watchpoint_hits[x].name)
print("slot = ", watchpoint_hits[x].slot)
print("condition = ", watchpoint_hits[x].condition)
print("watchpoint_id = ", watchpoint_hits[x].watchpoint_id)
for p, _ in enumerate(watchpoint_hits[x].parameters):
print("parameter ", p, " name = ",
watchpoint_hits[x].parameters[p].name)
print("parameter ", p, " disabled = ",
watchpoint_hits[x].parameters[p].disabled)
print("parameter ", p, " value = ",
watchpoint_hits[x].parameters[p].value)
print("parameter ", p, " hit = ",
watchpoint_hits[x].parameters[p].hit)
print("parameter ", p, " actual_value = ",
watchpoint_hits[x].parameters[p].actual_value)
print("error code = ", watchpoint_hits[x].error_code)
print("device_id = ", watchpoint_hits[x].device_id)
print("root_graph_id = ", watchpoint_hits[x].root_graph_id)
if __name__ == "__main__":
main()
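A note on the condition codes used above: only two values are exercised, and they are documented solely by the script's own NOTES comment. A small naming shim like the following (a local convenience for readability, not the backend's full enum) keeps additional tests legible:

# Named watch conditions, per the NOTES comment in this script.
MIN_LT = 6             # hits when the tensor minimum drops below the parameter value
CHANGE_TOO_LARGE = 18  # hits when the mean weight-update ratio exceeds the threshold

# e.g. debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=MIN_LT, ...)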

View File

@@ -0,0 +1,70 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias
slot = 0
iteration = 2
device_id = 0
root_graph_id = 0
is_parameter = True
tensor_data_1 attributes:
data (printed in uint8) = [230 208 10 52 104 34 252 52 4 231 144 52 188 150 64 180 88 236
15 180 254 135 180 51 131 226 147 52 88 202 62 53 2 43 55 53
231 29 87 180 220 249 30 180 157 17 177 180 81 107 140 181 8 95
192 180 89 134 112 180 96 238 90 178 156 196 212 180 206 25 15 181
212 154 6 180 91 211 116 52 191 14 140 51 128 106 124 53 28 158
70 181 182 21 251 50 100 204 157 179 88 202 42 180 7 95 8 53
128 251 238 52 241 133 241 52 111 86 157 179 48 221 148 180 200 7
141 180 236 226 182 51 190 82 158 180 140 108 179 180 195 134 215 179
103 213 39 179 89 168 149 180 42 58 58 180 64 53 62 179 250 126
158 52 38 83 117 52 0 0 136 180 136 133 122 51 110 18 131 179
238 13 94 51 102 136 15 181 134 90 227 180 16 11 117 180 35 74
163 52 105 0 87 181 112 18 131 50 226 233 67 181 217 172 10 52
206 25 217 52 208 213 22 52 146 203 87 180 74 46 207 52 178 191
4 180 100 93 216 52 119 190 171 180 223 2 5 181 128 72 207 179
58 146 11 179 224 79 137 52 143 228 154 180 246 219 215 179 14 79
195 52 126 29 64 52 132 192 42 51 94 220 86 52 94 109 1 181
72 37 117 178 110 197 94 180 160 94 153 179 118 224 80 181 156 17
37 50 120 156 162 53 26 115 135 180 228 20 29 53 145 126 147 52
99 16 48 180 211 188 199 180 52 51 99 180 93 254 227 52 152 126
123 49 6 18 16 181 5 163 130 51 27 158 98 53 134 235 189 52
119 45 9 180 130 115 110 52 158 128 162 52 232 251 197 180 178 46
158 179 57 214 157 52 172 207 161 180 208 0 222 49 242 99 32 53
20 174 135 50 247 117 176 52 194 57 43 180 140 108 135 51 243 65
175 51 187 73 156 51 63 232 217 50 180 234 115 52 194 168 148 52
27 192 183 180 45 178 157 52 125 208 17 53 236 192 65 53 190 193
7 53 254 246 57 53 3 43 199 51 64 164 215 180 220 104 240 51
23 72 24 180 68 173 9 51 72 114 29 53 105 0 57 181 188 150
8 53 229 97 131 53 0 34 189 51 163 146 74 53 31 244 204 51
86 193 220 180 156 51 146 179]
size in bytes = 512
debugger dtype = 11
shape = [128]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op171
slot = 0
iteration = 2
device_id = 0
root_graph_id = 0
is_parameter = False
tensor_data_2 attributes:
data (printed in uint8) = [ 99 26 69 ... 154 218 164]
size in bytes = 2076672
debugger dtype = 10
shape = [32, 192, 13, 13]
-----------------------------------------------------------
tensor_info_3 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op353
slot = 1
iteration = 2
device_id = 0
root_graph_id = 0
is_parameter = False
tensor_data_3 attributes:
data (printed in uint8) = [19 17 27 ... 94 42 90]
size in bytes = 129792
debugger dtype = 6
shape = [32, 12, 13, 13, 2]
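The byte counts in this listing pin down the element width of each debugger dtype code: code 11 stores 4 bytes per element (512 bytes for shape [128]), code 10 stores 2 (2076672 bytes for [32, 192, 13, 13]), and code 6 stores 1 (129792 bytes for [32, 12, 13, 13, 2]). A minimal sketch for viewing the raw buffer in the tensor's own dtype, assuming a float32/float16/uint8 mapping inferred only from these sizes (verify against the backend's dtype enum before relying on it):

import numpy as np

# Assumed mapping, inferred from byte sizes in the expected outputs above;
# unlisted codes fall back to a raw uint8 view.
DEBUGGER_DTYPE_MAP = {6: np.uint8, 10: np.float16, 11: np.float32}

def to_ndarray(tensor_data):
    """Reinterpret a TensorData buffer using its dtype code and shape."""
    np_dtype = DEBUGGER_DTYPE_MAP.get(tensor_data.dtype)
    if np_dtype is None:
        return np.frombuffer(tensor_data.data_ptr, np.uint8)  # unknown code: raw bytes
    return np.frombuffer(tensor_data.data_ptr, np_dtype).reshape(tensor_data.shape)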

View File

@@ -0,0 +1,74 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
import numpy as np
def main():
debugger_backend = d.DbgServices(
dump_file_path="/opt/nvme2n1/j00455527/dumps/sync_trans_true/032421/alexnet")
_ = debugger_backend.initialize(
net_name="Network Name goes here!", is_sync_mode=True)
# parameter
info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias",
slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True)
# output tensor with zero slot
info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op171",
slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)
# output tensor with non-zero slot
info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op353",
slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)
tensor_info = [info1, info2, info3]
tensor_data = debugger_backend.read_tensors(tensor_info)
print_read_tensors(tensor_info, tensor_data)
def print_read_tensors(tensor_info, tensor_data):
"""Print read tensors."""
for x, _ in enumerate(tensor_info):
print("-----------------------------------------------------------")
print("tensor_info_" + str(x+1) + " attributes:")
print("node name = ", tensor_info[x].node_name)
print("slot = ", tensor_info[x].slot)
print("iteration = ", tensor_info[x].iteration)
print("device_id = ", tensor_info[x].device_id)
print("root_graph_id = ", tensor_info[x].root_graph_id)
print("is_parameter = ", tensor_info[x].is_parameter)
print()
print("tensor_data_" + str(x+1) + " attributes:")
print("data (printed in uint8) = ", np.frombuffer(
tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size))
py_byte_size = len(tensor_data[x].data_ptr)
c_byte_size = tensor_data[x].data_size
if c_byte_size != py_byte_size:
print("The python byte size of ", py_byte_size,
" does not match the C++ byte size of ", c_byte_size)
print("size in bytes = ", tensor_data[x].data_size)
print("debugger dtype = ", tensor_data[x].dtype)
print("shape = ", tensor_data[x].shape)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,261 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "debugger/offline_debug/dbg_services.h"
#include <algorithm>
DbgServices::DbgServices(bool verbose) {
DbgLogger::verbose = verbose;
char *dbg_log_path = getenv("OFFLINE_DBG_LOG");
if (dbg_log_path != nullptr) {
DbgLogger::verbose = true;
}
debug_services = new DebugServices();
}
DbgServices::DbgServices(const DbgServices &other) {
MS_LOG(INFO) << "cpp DbgServices object is created via copy";
debug_services = new DebugServices(*other.debug_services);
}
DbgServices &DbgServices::operator=(const DbgServices &other) {
MS_LOG(INFO) << "cpp DbgServices object is being assigned a different state";
if (this != &other) {
delete debug_services;
debug_services = new DebugServices(*other.debug_services);
}
return *this;
}
DbgServices::~DbgServices() {
MS_LOG(INFO) << "cpp DbgServices object is deleted";
delete debug_services;
}
std::string DbgServices::GetVersion() {
MS_LOG(INFO) << "get version is called";
return "1.2.0";
}
int32_t DbgServices::Initialize(std::string net_name, std::string dump_folder_path, bool is_sync_mode) {
MS_LOG(INFO) << "cpp DbgServices initialize network name " << net_name;
MS_LOG(INFO) << "cpp DbgServices initialize dump folder path " << dump_folder_path;
MS_LOG(INFO) << "cpp DbgServices initialize sync mode " << is_sync_mode;
debug_services->SetNetName(net_name);
debug_services->SetDumpDir(dump_folder_path);
debug_services->SetSyncMode(is_sync_mode);
return 0;
}
int32_t DbgServices::AddWatchpoint(
unsigned int id, unsigned int watch_condition,
std::map<std::string, std::map<std::string, std::variant<bool, std::vector<std::string>>>> check_nodes,
std::vector<parameter_t> parameter_list) {
MS_LOG(INFO) << "cpp start";
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint id " << id;
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint watch_condition " << watch_condition;
for (auto const &node : check_nodes) {
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint name " << node.first;
auto attr_map = node.second;
bool is_parameter = std::get<bool>(attr_map["is_parameter"]);
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint is_parameter " << is_parameter;
// std::vector<uint32_t> device_id = std::get<std::vector<uint32_t>>(attr_map["device_id"]);
std::vector<std::string> device_id_str = std::get<std::vector<std::string>>(attr_map["device_id"]);
std::vector<std::uint32_t> device_id;
std::transform(device_id_str.begin(), device_id_str.end(), std::back_inserter(device_id),
[](std::string &id_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(id_str)); });
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint device_id ";
for (auto const &i : device_id) {
MS_LOG(INFO) << i << " ";
}
// std::vector<uint32_t> root_graph_id = std::get<std::vector<uint32_t>>(attr_map["root_graph_id"]);
std::vector<std::string> root_graph_id_str = std::get<std::vector<std::string>>(attr_map["root_graph_id"]);
std::vector<std::uint32_t> root_graph_id;
std::transform(
root_graph_id_str.begin(), root_graph_id_str.end(), std::back_inserter(root_graph_id),
[](std::string &graph_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(graph_str)); });
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint root_graph_id";
for (auto const &j : root_graph_id) {
MS_LOG(INFO) << j << " ";
}
}
for (auto const &parameter : parameter_list) {
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter name " << parameter.name;
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter disabled " << parameter.disabled;
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter value " << parameter.value;
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter hit " << parameter.hit;
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter actual_value " << parameter.actual_value;
}
std::vector<std::tuple<std::string, bool>> check_node_list;
std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_device_list;
std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_graph_list;
std::vector<DebugServices::parameter_t> parameter_list_backend;
std::transform(check_nodes.begin(), check_nodes.end(), std::back_inserter(check_node_list),
[](auto &node) -> std::tuple<std::string, bool> {
auto attr_map = node.second;
return std::make_tuple(node.first, std::get<bool>(attr_map["is_parameter"]));
});
std::transform(check_nodes.begin(), check_nodes.end(), std::back_inserter(check_node_device_list),
[](auto &node) -> std::tuple<std::string, std::vector<uint32_t>> {
auto attr_map = node.second;
std::vector<std::string> device_id_str = std::get<std::vector<std::string>>(attr_map["device_id"]);
std::vector<std::uint32_t> device_id;
std::transform(
device_id_str.begin(), device_id_str.end(), std::back_inserter(device_id),
[](std::string &id_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(id_str)); });
return std::make_tuple(node.first, device_id);
});
std::transform(
check_nodes.begin(), check_nodes.end(), std::back_inserter(check_node_graph_list),
[](auto &node) -> std::tuple<std::string, std::vector<uint32_t>> {
auto attr_map = node.second;
std::vector<std::string> root_graph_id_str = std::get<std::vector<std::string>>(attr_map["root_graph_id"]);
std::vector<std::uint32_t> root_graph_id;
std::transform(
root_graph_id_str.begin(), root_graph_id_str.end(), std::back_inserter(root_graph_id),
[](std::string &graph_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(graph_str)); });
return std::make_tuple(node.first, root_graph_id);
});
std::transform(
parameter_list.begin(), parameter_list.end(), std::back_inserter(parameter_list_backend),
[](const parameter_t &parameter) -> DebugServices::parameter_t {
return DebugServices::parameter_t{parameter.name, parameter.disabled, parameter.value, parameter.hit};
});
debug_services->AddWatchpoint(id, watch_condition, 0, check_node_list, parameter_list_backend,
&check_node_device_list, &check_node_graph_list);
MS_LOG(INFO) << "cpp end";
return 0;
}
int32_t DbgServices::RemoveWatchpoint(unsigned int id) {
MS_LOG(INFO) << "cpp DbgServices RemoveWatchpoint id " << id;
debug_services->RemoveWatchpoint(id);
return 0;
}
std::vector<watchpoint_hit_t> DbgServices::CheckWatchpoints(unsigned int iteration) {
MS_LOG(INFO) << "cpp DbgServices CheckWatchpoint iteration " << iteration;
std::vector<std::string> name;
std::vector<std::string> slot;
std::vector<int> condition;
std::vector<unsigned int> watchpoint_id;
std::vector<std::string> overflow_ops;
std::vector<std::vector<DebugServices::parameter_t>> parameters;
std::vector<int32_t> error_codes;
std::vector<unsigned int> device_id;
std::vector<unsigned int> root_graph_id;
// #ifdef ENABLE_D
// overflow_ops = CheckOpOverflow();
// #endif
std::vector<std::shared_ptr<TensorData>> tensor_list;
tensor_list = debug_services->ReadNeededDumpedTensors(iteration);
debug_services->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
&tensor_list, false, true, true, &device_id, &root_graph_id);
std::vector<watchpoint_hit_t> hits;
for (unsigned int i = 0; i < name.size(); i++) {
std::vector<DebugServices::parameter_t> &parameter = parameters[i];
std::vector<parameter_t> api_parameter_vector;
for (const auto &p : parameter) {
parameter_t api_parameter(p.name, p.disabled, p.value, p.hit, p.actual_value);
api_parameter_vector.push_back(api_parameter);
}
watchpoint_hit_t hit(name[i], std::stoi(slot[i]), condition[i], watchpoint_id[i], api_parameter_vector,
error_codes[i], device_id[i], root_graph_id[i]);
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t name " << hit.name;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t slot " << hit.slot;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t watchpoint_id " << hit.watchpoint_id;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t error_code " << hit.error_code;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t device_id " << hit.device_id;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t root_graph_id " << hit.root_graph_id;
for (auto const &parameter_i : api_parameter_vector) {
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter name " << parameter_i.name;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter disabled " << parameter_i.disabled;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter value " << parameter_i.value;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter hit " << parameter_i.hit;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter actual_value " << parameter_i.actual_value;
}
hits.push_back(hit);
}
return hits;
}
std::string GetTensorFullName(tensor_info_t info) {
std::string node_name = info.node_name;
if (info.is_parameter) {
// scopes in the node name are separated by '/'
// for parameters, drop the scope prefix and use the plain name
std::size_t found = node_name.find_last_of("/");
node_name = node_name.substr(found + 1);
}
return node_name + ":" + std::to_string(info.slot);
}
unsigned int GetTensorDeviceId(tensor_info_t info) { return info.device_id; }
unsigned int GetTensorRootGraphId(tensor_info_t info) { return info.root_graph_id; }
unsigned int GetTensorIteration(tensor_info_t info) { return info.iteration; }
unsigned int GetTensorSlot(tensor_info_t info) { return info.slot; }
std::vector<tensor_data_t> DbgServices::ReadTensors(std::vector<tensor_info_t> info) {
for (auto i : info) {
MS_LOG(INFO) << "cpp DbgServices ReadTensor info name " << i.node_name << ", slot " << i.slot << ", iteration "
<< i.iteration << ", device_id " << i.device_id << ", root_graph_id " << i.root_graph_id;
}
std::vector<std::string> backend_name;
std::vector<unsigned int> device_id;
std::vector<unsigned int> root_graph_id;
std::vector<unsigned int> iteration;
std::vector<size_t> slot;
std::vector<std::shared_ptr<TensorData>> result_list;
std::vector<tensor_data_t> tensors_read;
std::transform(info.begin(), info.end(), std::back_inserter(backend_name), GetTensorFullName);
std::transform(info.begin(), info.end(), std::back_inserter(slot), GetTensorSlot);
std::transform(info.begin(), info.end(), std::back_inserter(device_id), GetTensorDeviceId);
std::transform(info.begin(), info.end(), std::back_inserter(root_graph_id), GetTensorRootGraphId);
std::transform(info.begin(), info.end(), std::back_inserter(iteration), GetTensorIteration);
MS_LOG(INFO) << "cpp before";
debug_services->ReadDumpedTensor(backend_name, slot, device_id, iteration, root_graph_id, &result_list);
MS_LOG(INFO) << "cpp after";
for (auto result : result_list) {
tensor_data_t tensor_data_item(result->GetDataPtr(), result->GetByteSize(), result->GetType(), result->GetShape());
tensors_read.push_back(tensor_data_item);
}
MS_LOG(INFO) << "cpp end";
return tensors_read;
}
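One subtlety worth noting in AddWatchpoint above: device_id and root_graph_id come out of the std::variant as std::vector<std::string> and are converted with std::stoul, while the test scripts pass plain Python ints. Presumably the Python-side validators normalize those lists to strings before the dict crosses pybind11; a hedged sketch of that normalization (the helper name is illustrative, not part of the module):

def normalize_check_node_attrs(attrs):
    """Coerce list values to strings to match the C++ side's
    std::variant<bool, std::vector<std::string>>; bools pass through."""
    normalized = {}
    for key, value in attrs.items():
        if isinstance(value, list):
            normalized[key] = [str(v) for v in value]  # ints -> strings for std::stoul
        else:
            normalized[key] = value  # e.g. is_parameter stays a bool
    return normalized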

View File

@@ -0,0 +1,149 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DEBUG_DBG_SERVICES_H_
#define DEBUG_DBG_SERVICES_H_
#include <vector>
#include <string>
#include <map>
#include <memory>
#include <tuple>
#include <iostream>
#include <variant>
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include "pybind11/stl_bind.h"
#include "debug/debug_services.h"
namespace py = pybind11;
typedef struct parameter {
parameter(const std::string &name, bool disabled, double value, bool hit, double actual_value)
: name(name), disabled(disabled), value(value), hit(hit), actual_value(actual_value) {}
const std::string get_name() const { return name; }
const bool get_disabled() const { return disabled; }
const double get_value() const { return value; }
const bool get_hit() const { return hit; }
const double get_actual_value() const { return actual_value; }
std::string name;
bool disabled;
double value;
bool hit;
double actual_value;
} parameter_t;
typedef struct watchpoint_hit {
watchpoint_hit(const std::string &name, uint32_t slot, int condition, uint32_t watchpoint_id,
const std::vector<parameter_t> &parameters, int32_t error_code, uint32_t device_id,
uint32_t root_graph_id)
: name(name),
slot(slot),
condition(condition),
watchpoint_id(watchpoint_id),
parameters(parameters),
error_code(error_code),
device_id(device_id),
root_graph_id(root_graph_id) {}
const std::string get_name() const { return name; }
const uint32_t get_slot() const { return slot; }
const int get_condition() const { return condition; }
const uint32_t get_watchpoint_id() const { return watchpoint_id; }
const std::vector<parameter_t> get_parameters() const { return parameters; }
const int32_t get_error_code() const { return error_code; }
const uint32_t get_device_id() const { return device_id; }
const uint32_t get_root_graph_id() const { return root_graph_id; }
std::string name;
uint32_t slot;
int condition;
uint32_t watchpoint_id;
std::vector<parameter_t> parameters;
int32_t error_code;
uint32_t device_id;
uint32_t root_graph_id;
} watchpoint_hit_t;
typedef struct tensor_info {
tensor_info(const std::string &node_name, uint32_t slot, uint32_t iteration, uint32_t device_id,
uint32_t root_graph_id, bool is_parameter)
: node_name(node_name),
slot(slot),
iteration(iteration),
device_id(device_id),
root_graph_id(root_graph_id),
is_parameter(is_parameter) {}
const std::string get_node_name() const { return node_name; }
const uint32_t get_slot() const { return slot; }
const uint32_t get_iteration() const { return iteration; }
const uint32_t get_device_id() const { return device_id; }
const uint32_t get_root_graph_id() const { return root_graph_id; }
const bool get_is_parameter() const { return is_parameter; }
std::string node_name;
uint32_t slot;
uint32_t iteration;
uint32_t device_id;
uint32_t root_graph_id;
bool is_parameter;
} tensor_info_t;
typedef struct tensor_data {
tensor_data(char *data_ptr, uint64_t data_size, int dtype, const std::vector<int64_t> &shape)
: data_size(data_size), dtype(dtype), shape(shape) {
if (data_ptr != nullptr) {
this->data_ptr = py::bytes(data_ptr, data_size);
} else {
this->data_ptr = py::bytes();
}
}
const py::bytes get_data_ptr() const { return data_ptr; }
const uint64_t get_data_size() const { return data_size; }
const int get_dtype() const { return dtype; }
const std::vector<int64_t> &get_shape() const { return shape; }
py::bytes data_ptr;
uint64_t data_size;
int dtype;
std::vector<int64_t> shape;
} tensor_data_t;
class DbgServices {
private:
DebugServices *debug_services;
public:
explicit DbgServices(bool verbose = false);
DbgServices(const DbgServices &other);
DbgServices &operator=(const DbgServices &other);
~DbgServices();
int32_t Initialize(std::string net_name, std::string dump_folder_path, bool is_sync_mode);
int32_t AddWatchpoint(
unsigned int id, unsigned int watch_condition,
std::map<std::string, std::map<std::string, std::variant<bool, std::vector<std::string>>>> check_nodes,
std::vector<parameter_t> parameter_list);
int32_t RemoveWatchpoint(unsigned int id);
std::vector<watchpoint_hit_t> CheckWatchpoints(unsigned int iteration);
std::vector<tensor_data_t> ReadTensors(std::vector<tensor_info_t> info);
std::string GetVersion();
};
#endif // DEBUG_DBG_SERVICES_H_
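The structs declared here are bound into Python by mi_pybind_register.cc (listed in the CMake sources) and wrapped by the dbg_services.py module that follows. The raw binding can be smoke-tested directly; GetVersion() is hard-coded to "1.2.0" in dbg_services.cc above:

import mindspore._mindspore_offline_debug as cds

print(cds.DbgServices(False).GetVersion())  # expected: 1.2.0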

View File

@@ -0,0 +1,865 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
The module DbgServices provides offline debugger APIs.
"""
import mindspore._mindspore_offline_debug as cds
from mi_validators import (check_init, check_initialize, check_add_watchpoint, check_remove_watchpoint,
                           check_check_watchpoints, check_read_tensors, check_initialize_done,
                           check_tensor_info_init, check_tensor_data_init, check_watchpoint_hit_init,
                           check_parameter_init)
def get_version():
"""
Function to return offline Debug Services version.
Returns:
version (str): DbgServices version.
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> version = dbg_services.get_version()
"""
return cds.DbgServices(False).GetVersion()
class DbgLogger:
"""
Offline Debug Services Logger
Args:
verbose (bool): whether to print logs.
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> version = dbg_services.DbgLogger(verbose=False)
"""
def __init__(self, verbose):
self.verbose = verbose
def __call__(self, *logs):
if self.verbose:
print(logs)
log = DbgLogger(False)
class DbgServices():
"""
Offline Debug Services class.
Args:
dump_file_path (str): directory where the dump files are saved.
verbose (bool): whether to print logs (default: False).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
>>> verbose=True)
"""
@check_init
def __init__(self, dump_file_path, verbose=False):
log.verbose = verbose
log("in Python __init__, file path is ", dump_file_path)
self.dump_file_path = dump_file_path
self.dbg_instance = cds.DbgServices(verbose)
self.version = self.dbg_instance.GetVersion()
self.verbose = verbose
self.initialized = False
@check_initialize
def initialize(self, net_name, is_sync_mode=True):
"""
Initialize Debug Service.
Args:
net_name (str): Network name.
is_sync_mode (bool): Whether the dump files were generated in synchronous
or asynchronous mode (default: True, i.e. synchronous).
Returns:
int, 0 on successful initialization.
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
>>> verbose=True)
>>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
"""
log("in Python Initialize dump_file_path ", self.dump_file_path)
self.initialized = True
return self.dbg_instance.Initialize(net_name, self.dump_file_path, is_sync_mode)
@check_initialize_done
@check_add_watchpoint
def add_watchpoint(self, watchpoint_id, watch_condition, check_node_list, parameter_list):
"""
Adding watchpoint to Debug Service instance.
Args:
watchpoint_id (int): Watchpoint id
watch_condition (int): A representation of the condition to be checked.
check_node_list (dict): Dictionary of node names (str) as key,
mapping to device_id (list of ints), root_graph_id (list of ints) and is_parameter
(bool).
parameter_list (list): List of parameters in watchpoint. Parameters should be instances of Parameter class.
Each parameter describes the value to be checked in watchpoint.
Returns:
int, 0 on successfully adding the watchpoint.
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
>>> verbose=True)
>>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
>>> d_wp = d_init.add_watchpoint(watchpoint_id=1,
>>> watch_condition=6,
>>> check_node_list={"conv2.bias" : {"device_id": [0],
>>> "root_graph_id": [0], "is_parameter": True}},
>>> parameter_list=[dbg_services.Parameter(name="param",
>>> disabled=False,
>>> value=0.0,
>>> hit=False,
>>> actual_value=0.0)])
"""
print("Amir: ", check_node_list)
log("in Python AddWatchpoint")
parameter_list_inst = []
for elem in parameter_list:
parameter_list_inst.append(elem.instance)
return self.dbg_instance.AddWatchpoint(watchpoint_id, watch_condition, check_node_list, parameter_list_inst)
@check_initialize_done
@check_remove_watchpoint
def remove_watchpoint(self, watchpoint_id):
"""
Removing watchpoint from Debug Service instance.
Args:
watchpoint_id (int): Watchpoint id
Returns:
int, 0 on successfully removing the watchpoint.
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
>>> verbose=True)
>>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
>>> d_wp = d_init.add_watchpoint(watchpoint_id=1,
>>> watch_condition=6,
>>> check_node_list={"conv2.bias" : {"device_id": [5],
>>> "root_graph_id": [0], "is_parameter": True}},
>>> parameter_list=[dbg_services.Parameter(name="param",
>>> disabled=False,
>>> value=0.0,
>>> hit=False,
>>> actual_value=0.0)])
>>> d_wp = d_wp.remove_watchpoint(watchpoint_id=1)
"""
log("in Python Remove Watchpoint id ", watchpoint_id)
return self.dbg_instance.RemoveWatchpoint(watchpoint_id)
@check_initialize_done
@check_check_watchpoints
def check_watchpoints(self, iteration):
"""
Checking watchpoint at given iteration.
Args:
iteration (int): Watchpoint check iteration.
Returns:
Watchpoint hit list.
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
>>> verbose=True)
>>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
>>> d_wp = d_init.add_watchpoint(watchpoint_id=1,
>>> watch_condition=6,
>>> check_node_list={"conv2.bias" : {"device_id": [5],
>>> "root_graph_id": [0], "is_parameter": True}},
>>> parameter_list=[dbg_services.Parameter(name="param",
>>> disabled=False,
>>> value=0.0,
>>> hit=False,
>>> actual_value=0.0)])
>>> watchpoints = d_wp.check_watchpoints(iteration=8)
"""
log("in Python CheckWatchpoints iteration ", iteration)
watchpoint_list = self.dbg_instance.CheckWatchpoints(iteration)
watchpoint_hit_list = []
for watchpoint in watchpoint_list:
name = watchpoint.get_name()
slot = watchpoint.get_slot()
condition = watchpoint.get_condition()
watchpoint_id = watchpoint.get_watchpoint_id()
parameters = watchpoint.get_parameters()
error_code = watchpoint.get_error_code()
device_id = watchpoint.get_device_id()
root_graph_id = watchpoint.get_root_graph_id()
param_list = []
for param in parameters:
p_name = param.get_name()
disabled = param.get_disabled()
value = param.get_value()
hit = param.get_hit()
actual_value = param.get_actual_value()
param_list.append(Parameter(p_name, disabled, value, hit, actual_value))
watchpoint_hit_list.append(WatchpointHit(name, slot, condition, watchpoint_id,
param_list, error_code, device_id, root_graph_id))
return watchpoint_hit_list
@check_initialize_done
@check_read_tensors
def read_tensors(self, info):
"""
Returning tensor data objects describing the requested tensors.
Args:
info (list): List of TensorInfo objects.
Returns:
TensorData list (list).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
>>> verbose=True)
>>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
>>> tensor_data_list = d_init.read_tensors([dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)])
"""
log("in Python ReadTensors info ", info)
info_list_inst = []
for elem in info:
log("in Python ReadTensors info ", info)
info_list_inst.append(elem.instance)
tensor_data_list = self.dbg_instance.ReadTensors(info_list_inst)
tensor_data_list_ret = []
for elem in tensor_data_list:
if elem.get_data_size() == 0:
tensor_data = TensorData(b'', elem.get_data_size(), elem.get_dtype(), elem.get_shape())
else:
tensor_data = TensorData(elem.get_data_ptr(), elem.get_data_size(), elem.get_dtype(), elem.get_shape())
tensor_data_list_ret.append(tensor_data)
return tensor_data_list_ret
class TensorInfo():
"""
Tensor Information class.
Args:
node_name (str): Fully qualified name of the desired node.
slot (int): The particular output for the requested node.
iteration (int): The desired iteration to gather tensor information.
device_id (int): The desired device id to gather tensor information.
root_graph_id (int): The desired root graph id to gather tensor information.
is_parameter (bool): Whether node is a parameter (input, constant, bias, parameter).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)
"""
@check_tensor_info_init
def __init__(self, node_name, slot, iteration, device_id, root_graph_id, is_parameter):
self.instance = cds.tensor_info(node_name, slot, iteration, device_id, root_graph_id, is_parameter)
@property
def node_name(self):
"""
Function to receive TensorInfo node_name.
Returns:
node_name of TensorInfo instance (str).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)
>>> name = tensor_info.node_name
"""
return self.instance.get_node_name()
@property
def slot(self):
"""
Function to receive TensorInfo slot.
Returns:
slot of TensorInfo instance (int).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)
>>> slot = tensor_info.slot
"""
return self.instance.get_slot()
@property
def iteration(self):
"""
Function to receive TensorInfo iteration.
Returns:
iteration of TensorInfo instance (int).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)
>>> iteration = tensor_info.iteration
"""
return self.instance.get_iteration()
@property
def device_id(self):
"""
Function to receive TensorInfo device_id.
Returns:
device_id of TensorInfo instance (int).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)
>>> device_id = tensor_info.device_id
"""
@property
def root_graph_id(self):
"""
Function to receive TensorInfo root_graph_id.
Returns:
root_graph_id of TensorInfo instance (int).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)
>>> root_graph_id = tensor_info.root_graph_id
"""
return self.instance.get_root_graph_id()
@property
def is_parameter(self):
"""
Function to receive TensorInfo is_parameter.
Returns:
is_parameter of TensorInfo instance (bool).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)
>>> is_parameter = tensor_info.is_parameter
"""
return self.instance.get_is_parameter()
class TensorData():
"""
TensorData class.
Args:
data_ptr (bytes): Data pointer.
data_size (int): Size of data in bytes.
dtype (int): An encoding representing the type of TensorData.
shape (list): Shape of tensor.
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
>>> data_size=4,
>>> dtype=0,
>>> shape=[2, 2])
"""
@check_tensor_data_init
def __init__(self, data_ptr, data_size, dtype, shape):
self.instance = cds.tensor_data(data_ptr, data_size, dtype, shape)
@property
def data_ptr(self):
"""
Function to receive TensorData data_ptr.
Returns:
data_ptr of TensorData instance (bytes).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
>>> data_size=4,
>>> dtype=0,
>>> shape=[2, 2])
>>> data_ptr = tensor_data.data_ptr
"""
return self.instance.get_data_ptr()
@property
def data_size(self):
"""
Function to receive TensorData data_size.
Returns:
data_size of TensorData instance (int).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
>>> data_size=4,
>>> dtype=0,
>>> shape=[2, 2])
>>> data_size = tensor_data.data_size
"""
return self.instance.get_data_size()
@property
def dtype(self):
"""
Function to receive TensorData dtype.
Returns:
dtype of TensorData instance (int).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
>>> data_size=4,
>>> dtype=0,
>>> shape=[2, 2])
>>> dtype = tensor_data.dtype
"""
return self.instance.get_dtype()
@property
def shape(self):
"""
Function to receive TensorData shape.
Returns:
shape of TensorData instance (list).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
>>> data_size=4,
>>> dtype=0,
>>> shape=[2, 2])
>>> shape = tensor_data.shape
"""
return self.instance.get_shape()
class WatchpointHit():
"""
WatchpointHit class.
Args:
name (str): Name of WatchpointHit instance.
slot (int): The numerical label of an output.
condition (int): A representation of the condition to be checked.
watchpoint_id (int): Watchpoint id.
parameters (list): A list of all parameters for WatchpointHit instance.
Parameters have to be instances of Parameter class.
error_code (int): An explanation of certain scenarios where watchpoint could not be checked.
device_id (int): Device id where the watchpoint is hit.
root_graph_id (int): Root graph id where the watchpoint is hit.
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
>>> slot=1,
>>> condition=2,
>>> watchpoint_id=3,
>>> parameters=[param1, param2],
>>> error_code=0,
>>> device_id=1,
>>> root_graph_id=1)
"""
@check_watchpoint_hit_init
def __init__(self, name, slot, condition, watchpoint_id, parameters, error_code, device_id, root_graph_id):
parameter_list_inst = []
for elem in parameters:
parameter_list_inst.append(elem.instance)
self.instance = cds.watchpoint_hit(name, slot, condition, watchpoint_id,
parameter_list_inst, error_code, device_id, root_graph_id)
@property
def name(self):
"""
Function to receive WatchpointHit name.
Returns:
name of WatchpointHit instance (str).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
>>> slot=1,
>>> condition=2,
>>> watchpoint_id=3,
>>> parameters=[param1, param2],
>>> error_code=0,
>>> device_id=1,
>>> root_graph_id=1)
>>> name = watchpoint_hit.name
"""
return self.instance.get_name()
@property
def slot(self):
"""
Function to receive WatchpointHit slot.
Returns:
slot of WatchpointHit instance (int).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
>>> slot=1,
>>> condition=2,
>>> watchpoint_id=3,
>>> parameters=[param1, param2],
>>> error_code=0,
>>> device_id=1,
>>> root_graph_id=1)
>>> slot = watchpoint_hit.slot
"""
return self.instance.get_slot()
@property
def condition(self):
"""
Function to receive WatchpointHit condition.
Returns:
condition of WatchpointHit instance (int).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
>>> slot=1,
>>> condition=2,
>>> watchpoint_id=3,
>>> parameters=[param1, param2],
>>> error_code=0,
>>> device_id=1,
>>> root_graph_id=1)
>>> condition = watchpoint_hit.condition
"""
return self.instance.get_condition()
@property
def watchpoint_id(self):
"""
Function to receive WatchpointHit watchpoint_id.
Returns:
watchpoint_id of WatchpointHit instance (int).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
>>> slot=1,
>>> condition=2,
>>> watchpoint_id=3,
>>> parameters=[param1, param2],
>>> error_code=0,
>>> device_id=1,
>>> root_graph_id=1)
>>> watchpoint_id = watchpoint_hit.watchpoint_id
"""
return self.instance.get_watchpoint_id()
@property
def parameters(self):
"""
Function to receive WatchpointHit parameters.
Returns:
List of parameters of WatchpointHit instance (list).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
>>> slot=1,
>>> condition=2,
>>> watchpoint_id=3,
>>> parameters=[param1, param2],
>>> error_code=0,
>>> device_id=1,
>>> root_graph_id=1)
>>> parameters = watchpoint_hit.parameters
"""
params = self.instance.get_parameters()
param_list = []
for elem in params:
tmp = Parameter(elem.get_name(),
elem.get_disabled(),
elem.get_value(),
elem.get_hit(),
elem.get_actual_value())
param_list.append(tmp)
return param_list
@property
def error_code(self):
"""
Function to receive WatchpointHit error_code.
Returns:
error_code of WatchpointHit instance (int).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
>>> slot=1,
>>> condition=2,
>>> watchpoint_id=3,
>>> parameters=[param1, param2],
>>> error_code=0,
>>> device_id=1,
>>> root_graph_id=1)
>>> error_code = watchpoint_hit.error_code
"""
return self.instance.get_error_code()
@property
def device_id(self):
"""
Function to receive WatchpointHit device_id.
Returns:
device_id of WatchpointHit instance (int).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
>>> slot=1,
>>> condition=2,
>>> watchpoint_id=3,
>>> parameters=[param1, param2],
>>> error_code=0,
>>> device_id=1,
>>> root_graph_id=1)
>>> device_id = watchpoint_hit.device_id
"""
return self.instance.get_device_id()
@property
def root_graph_id(self):
"""
Function to receive WatchpointHit root_graph_id.
Returns:
root_graph_id of WatchpointHit instance (int).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
>>> slot=1,
>>> condition=2,
>>> watchpoint_id=3,
>>> parameters=[param1, param2],
>>> error_code=0,
>>> device_id=1,
>>> root_graph_id=1)
>>> root_graph_id = watchpoint_hit.root_graph_id
"""
return self.instance.get_root_graph_id()
class Parameter():
"""
Parameter class.
Args:
name (str): Name of the parameter.
disabled (bool): Whether the parameter is disabled in the backend check.
value (float): Threshold value of the parameter.
hit (bool): Whether this parameter triggered watchpoint (default is False).
actual_value (float): Actual value of the parameter (default is 0.0).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> parameter = dbg_services.Parameter(name="param",
>>> disabled=False,
>>> value=0.0,
>>> hit=False,
>>> actual_value=0.0)
"""
@check_parameter_init
def __init__(self, name, disabled, value, hit=False, actual_value=0.0):
self.instance = cds.parameter(name, disabled, value, hit, actual_value)
@property
def name(self):
"""
Function to receive Parameter name.
Returns:
name of Parameter instance (str).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> parameter = dbg_services.Parameter(name="param",
>>> disabled=False,
>>> value=0.0,
>>> hit=False,
>>> actual_value=0.0)
>>> name = parameter.name
"""
return self.instance.get_name()
@property
def disabled(self):
"""
Function to receive Parameter disabled value.
Returns:
disabled of Parameter instance (bool).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> parameter = dbg_services.Parameter(name="param",
>>> disabled=False,
>>> value=0.0,
>>> hit=False,
>>> actual_value=0.0)
>>> disabled = parameter.disabled
"""
return self.instance.get_disabled()
@property
def value(self):
"""
Function to receive Parameter value.
Returns:
value of Parameter instance (float).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> parameter = dbg_services.Parameter(name="param",
>>> disabled=False,
>>> value=0.0,
>>> hit=False,
>>> actual_value=0.0)
>>> value = parameter.value
"""
return self.instance.get_value()
@property
def hit(self):
"""
Function to receive Parameter hit value.
Returns:
hit of Parameter instance (bool).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> parameter = dbg_services.Parameter(name="param",
>>> disabled=False,
>>> value=0.0,
>>> hit=False,
>>> actual_value=0.0)
>>> hit = parameter.hit
"""
return self.instance.get_hit()
@property
def actual_value(self):
"""
Function to receive Parameter actual_value value.
Returns:
actual_value of Parameter instance (float).
Examples:
>>> from mindspore.offline_debug import dbg_services
>>> parameter = dbg_services.Parameter(name="param",
>>> disabled=False,
>>> value=0.0,
>>> hit=False,
>>> actual_value=0.0)
>>> actual_value = parameter.actual_value
"""
return self.instance.get_actual_value()
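Putting the module together, a minimal end-to-end session mirrors the test scripts earlier in this commit; the dump path is a placeholder, the node name is taken from the sync_trans_false tests, and condition 6 is MIN_LT per the test comments:

import mindspore.offline_debug.dbg_services as d

debugger = d.DbgServices(dump_file_path="/path/to/dumps/alexnet")
debugger.initialize(net_name="alexnet", is_sync_mode=True)

# Hit when the tensor minimum drops below 0.0 (condition 6 = MIN_LT).
debugger.add_watchpoint(
    watchpoint_id=1, watch_condition=6,
    check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308":
                     {"device_id": [0], "root_graph_id": [0], "is_parameter": False}},
    parameter_list=[d.Parameter(name="param", disabled=False, value=0.0)])

for hit in debugger.check_watchpoints(iteration=2):
    print(hit.name, hit.condition, hit.watchpoint_id)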

View File

@@ -0,0 +1,24 @@
python sync_trans_false_read_tensors.py > sync_trans_false_read_tensors.actual
diff sync_trans_false_read_tensors.actual sync_trans_false_read_tensors.expected
if [ $? -eq 0 ]; then
echo sync_trans_false_read_tensors PASSED
else
echo sync_trans_false_read_tensors FAILED
fi
python sync_trans_true_read_tensors.py > sync_trans_true_read_tensors.actual
diff sync_trans_true_read_tensors.actual sync_trans_true_read_tensors.expected
if [ $? -eq 0 ]; then
echo sync_trans_true_read_tensors PASSED
else
echo sync_trans_true_read_tensors FAILED
fi
python sync_trans_false_watchpoints.py > sync_trans_false_watchpoints.actual
diff sync_trans_false_watchpoints.actual sync_trans_false_watchpoints.expected
if [ $? -eq 0 ]; then
echo sync_trans_false_watchpoints PASSED
else
echo sync_trans_false_watchpoints FAILED
fi
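The same checks can be driven from Python when a shell is unavailable; this sketch assumes, as the script above does, that each .expected file sits next to its test script:

import subprocess
import sys

TESTS = ["sync_trans_false_read_tensors",
         "sync_trans_true_read_tensors",
         "sync_trans_false_watchpoints"]

for name in TESTS:
    actual = subprocess.run([sys.executable, name + ".py"],
                            capture_output=True, text=True).stdout
    with open(name + ".expected") as f:
        expected = f.read()
    print(name, "PASSED" if actual == expected else "FAILED")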

View File

@@ -0,0 +1,70 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias
slot = 0
iteration = 2
device_id = 0
root_graph_id = 0
is_parameter = True
tensor_data_1 attributes:
data (printed in uint8) = [ 0 0 0 0 195 127 0 0 176 202 195 248 194 127 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 58 196 248
194 127 0 0 17 0 0 0 0 0 0 0 160 76 6 140 195 127
0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127 0 0
64 195 195 248 194 127 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 88 1 196 248 194 127 0 0 18 0 0 0
0 0 0 0 160 47 6 140 195 127 0 0 69 0 0 0 0 0
0 0 1 0 0 0 195 127 0 0 176 203 195 248 194 127 0 0
176 204 195 248 194 127 0 0 0 0 0 0 0 0 0 0 216 241
195 248 194 127 0 0 19 0 0 0 0 0 0 0 96 39 6 140
195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127
0 0 112 52 196 248 194 127 0 0 176 52 196 248 194 127 0 0
0 0 0 0 0 0 0 0 88 250 195 248 194 127 0 0 20 0
0 0 0 0 0 0 128 130 5 140 195 127 0 0 69 0 0 0
0 0 0 0 0 0 0 0 195 127 0 0 208 136 195 248 194 127
0 0 176 202 195 248 194 127 0 0 48 52 196 248 194 127 0 0
184 247 195 248 194 127 0 0 21 0 0 0 0 0 0 0 176 213
4 140 195 127 0 0 69 0 0 0 0 0 0 0 0 0 0 0
195 127 0 0 48 52 196 248 194 127 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 8 249 195 248 194 127 0 0
22 0 0 0 0 0 0 0 16 46 4 140 195 127 0 0 69 0
0 0 0 0 0 0 1 0 0 0 195 127 0 0 64 137 195 248
194 127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 88 12 196 248 194 127 0 0 23 0 0 0 0 0 0 0
32 137 3 140 195 127 0 0 85 0 0 0 0 0 0 0 0 0
0 0 195 127 0 0 176 202 195 248 194 127 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 104 246 195 248 194 127
0 0 24 0 0 0 0 0 0 0 48 104 15 140 195 127 0 0
32 104 15 140 195 127 0 0]
size in bytes = 512
debugger dtype = 11
shape = [128]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308
slot = 0
iteration = 2
device_id = 0
root_graph_id = 0
is_parameter = False
tensor_data_2 attributes:
data (printed in uint8) = [ 0 169 0 ... 152 242 63]
size in bytes = 4153344
debugger dtype = 11
shape = [32, 192, 13, 13]
-----------------------------------------------------------
tensor_info_3 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300
slot = 1
iteration = 2
device_id = 0
root_graph_id = 0
is_parameter = False
tensor_data_3 attributes:
data (printed in uint8) = [ 0 169 0 ... 217 4 52]
size in bytes = 831744
debugger dtype = 8
shape = [207936]

View File

@@ -0,0 +1,74 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
import numpy as np
def main():
debugger_backend = d.DbgServices(
dump_file_path="/home/jtzanaka/dumps/sync_trans_false/032421/alexnet")
_ = debugger_backend.initialize(
net_name="Network Name goes here!", is_sync_mode=True)
# parameter
info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias",
slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True)
# output tensor with zero slot
info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308",
slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)
# output tensor with non-zero slot
info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300",
slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)
tensor_info = [info1, info2, info3]
tensor_data = debugger_backend.read_tensors(tensor_info)
print_read_tensors(tensor_info, tensor_data)
def print_read_tensors(tensor_info, tensor_data):
"""Print read tensors."""
for x, _ in enumerate(tensor_info):
print("-----------------------------------------------------------")
print("tensor_info_" + str(x+1) + " attributes:")
print("node name = ", tensor_info[x].node_name)
print("slot = ", tensor_info[x].slot)
print("iteration = ", tensor_info[x].iteration)
print("device_id = ", tensor_info[x].device_id)
print("root_graph_id = ", tensor_info[x].root_graph_id)
print("is_parameter = ", tensor_info[x].is_parameter)
print()
print("tensor_data_" + str(x+1) + " attributes:")
print("data (printed in uint8) = ", np.frombuffer(
tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size))
py_byte_size = len(tensor_data[x].data_ptr)
c_byte_size = tensor_data[x].data_size
if c_byte_size != py_byte_size:
print("The python byte size of ", py_byte_size,
" does not match the C++ byte size of ", c_byte_size)
print("size in bytes = ", tensor_data[x].data_size)
print("debugger dtype = ", tensor_data[x].dtype)
print("shape = ", tensor_data[x].shape)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,33 @@
-----------------------------------------------------------
watchpoint_hit for test_1 attributes:
name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308
slot = 0
condition = 6
watchpoint_id = 1
parameter 0 name = param
parameter 0 disabled = False
parameter 0 value = 0.0
parameter 0 hit = True
parameter 0 actual_value = -2.429065704345703
error code = 0
device_id = 0
root_graph_id = 0
-----------------------------------------------------------
watchpoint_hit for test_4 attributes:
name = Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc3.bias
slot = 0
condition = 18
watchpoint_id = 3
parameter 0 name = abs_mean_update_ratio_gt
parameter 0 disabled = False
parameter 0 value = 0.0
parameter 0 hit = True
parameter 0 actual_value = 1.793662034335766e-35
parameter 1 name = epsilon
parameter 1 disabled = True
parameter 1 value = 0.0
parameter 1 hit = False
parameter 1 actual_value = 0.0
error code = 0
device_id = 0
root_graph_id = 0

View File

@@ -0,0 +1,109 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Watchpoints test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
def main():
debugger_backend = d.DbgServices(
dump_file_path="/home/jtzanaka/dumps/sync_trans_false/032421/alexnet")
_ = debugger_backend.initialize(
net_name="Network Name goes here!", is_sync_mode=True)
# NOTES:
# -> watch_condition=6 is MIN_LT
# -> watch_condition=18 is CHANGE_TOO_LARGE
# test 1: watchpoint set and hit (watch_condition=6)
param1 = d.Parameter(name="param", disabled=False, value=0.0)
_ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6,
check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/"
"Conv2D-op308":
{"device_id": [0], "root_graph_id": [0], "is_parameter": False
}}, parameter_list=[param1])
watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2)
if len(watchpoint_hits_test_1) != 1:
print("ERROR -> test 1: watchpoint set but not hit just once")
print_watchpoint_hits(watchpoint_hits_test_1, 1)
# test 2: watchpoint remove and ensure it's not hit
_ = debugger_backend.remove_watchpoint(watchpoint_id=1)
watchpoint_hits_test_2 = debugger_backend.check_watchpoints(iteration=2)
if watchpoint_hits_test_2:
print("ERROR -> test 2: watchpoint removed but hit")
# test 3: watchpoint set and not hit, then remove
param2 = d.Parameter(name="param", disabled=False, value=-1000.0)
_ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6,
check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/"
"Conv2D-op308":
{"device_id": [0], "root_graph_id": [0], "is_parameter": False
}}, parameter_list=[param2])
watchpoint_hits_test_3 = debugger_backend.check_watchpoints(iteration=2)
if watchpoint_hits_test_3:
print("ERROR -> test 3: watchpoint set but not supposed to be hit")
_ = debugger_backend.remove_watchpoint(watchpoint_id=2)
# test 4: weight change watchpoint set and hit
param_abs_mean_update_ratio_gt = d.Parameter(
name="abs_mean_update_ratio_gt", disabled=False, value=0.0)
param_epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0)
_ = debugger_backend.add_watchpoint(watchpoint_id=3, watch_condition=18,
check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/"
"Parameter[6]_11/fc3.bias":
{"device_id": [0], "root_graph_id": [0], "is_parameter": True
}}, parameter_list=[param_abs_mean_update_ratio_gt,
param_epsilon])
watchpoint_hits_test_4 = debugger_backend.check_watchpoints(iteration=3)
if len(watchpoint_hits_test_4) != 1:
print("ERROR -> test 4: watchpoint weight change set but not hit just once")
print_watchpoint_hits(watchpoint_hits_test_4, 4)
def print_watchpoint_hits(watchpoint_hits, test_id):
"""Print watchpoint hits."""
for x, _ in enumerate(watchpoint_hits):
print("-----------------------------------------------------------")
print("watchpoint_hit for test_%u attributes:" % test_id)
print("name = ", watchpoint_hits[x].name)
print("slot = ", watchpoint_hits[x].slot)
print("condition = ", watchpoint_hits[x].condition)
print("watchpoint_id = ", watchpoint_hits[x].watchpoint_id)
for p, _ in enumerate(watchpoint_hits[x].parameters):
print("parameter ", p, " name = ",
watchpoint_hits[x].parameters[p].name)
print("parameter ", p, " disabled = ",
watchpoint_hits[x].parameters[p].disabled)
print("parameter ", p, " value = ",
watchpoint_hits[x].parameters[p].value)
print("parameter ", p, " hit = ",
watchpoint_hits[x].parameters[p].hit)
print("parameter ", p, " actual_value = ",
watchpoint_hits[x].parameters[p].actual_value)
print("error code = ", watchpoint_hits[x].error_code)
print("device_id = ", watchpoint_hits[x].device_id)
print("root_graph_id = ", watchpoint_hits[x].root_graph_id)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,70 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = True
tensor_data_1 attributes:
data (printed in uint8) = [ 1 0 0 0 195 127 0 0 80 58 118 65 195 127 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 40 186 117 65
195 127 0 0 5 0 0 0 0 0 0 0 160 76 6 204 195 127
0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127 0 0
48 135 117 65 195 127 0 0 16 58 118 65 195 127 0 0 144 58
118 65 195 127 0 0 168 186 117 65 195 127 0 0 6 0 0 0
0 0 0 0 160 47 6 204 195 127 0 0 69 0 0 0 0 0
0 0 1 0 0 0 195 127 0 0 80 58 118 65 195 127 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 184 249
117 65 195 127 0 0 7 0 0 0 0 0 0 0 96 39 6 204
195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127
0 0 224 218 117 65 195 127 0 0 0 0 0 0 0 0 0 0
224 219 117 65 195 127 0 0 200 17 118 65 195 127 0 0 8 0
0 0 0 0 0 0 128 130 5 204 195 127 0 0 69 0 0 0
0 0 0 0 1 0 0 0 195 127 0 0 120 233 255 59 196 127
0 0 224 217 117 65 195 127 0 0 224 214 117 65 195 127 0 0
120 250 117 65 195 127 0 0 9 0 0 0 0 0 0 0 176 213
4 204 195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0
195 127 0 0 240 66 118 65 195 127 0 0 160 218 117 65 195 127
0 0 224 215 117 65 195 127 0 0 40 9 118 65 195 127 0 0
10 0 0 0 0 0 0 0 16 46 4 204 195 127 0 0 69 0
0 0 0 0 0 0 1 0 0 0 195 127 0 0 208 59 118 65
195 127 0 0 0 0 0 0 0 0 0 0 96 218 117 65 195 127
0 0 56 251 117 65 195 127 0 0 11 0 0 0 0 0 0 0
32 137 3 204 195 127 0 0 85 0 0 0 0 0 0 0 1 0
0 0 195 127 0 0 224 214 117 65 195 127 0 0 144 59 118 65
195 127 0 0 160 214 117 65 195 127 0 0 136 62 118 65 195 127
0 0 12 0 0 0 0 0 0 0 48 104 15 204 195 127 0 0
32 104 15 204 195 127 0 0]
size in bytes = 512
debugger dtype = 11
shape = [128]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_2 attributes:
data (printed in uint8) = [206 239 74 ... 53 201 62]
size in bytes = 4153344
debugger dtype = 11
shape = [32, 192, 13, 13]
-----------------------------------------------------------
tensor_info_3 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300
slot = 1
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_3 attributes:
data (printed in uint8) = [206 239 74 ... 16 239 51]
size in bytes = 831744
debugger dtype = 8
shape = [207936]
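(A quick consistency check on the numbers above: tensor_data_1 has shape [128] and debugger dtype 11, which is DT_FLOAT32 in the DbgDataType encoding defined later in this commit, i.e. 4 bytes per element, so 128 x 4 = 512 bytes; tensor_data_2 has shape [32, 192, 13, 13], giving 32 x 192 x 13 x 13 x 4 = 4153344 bytes; tensor_data_3 has dtype 8, DT_UINT32, also 4 bytes, giving 207936 x 4 = 831744 bytes. All three match the reported sizes.)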

View File

@ -0,0 +1,74 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
import numpy as np
def main():
debugger_backend = d.DbgServices(
dump_file_path="/home/jtzanaka/dumps/sync_trans_true/032421/alexnet")
_ = debugger_backend.initialize(
net_name="Network Name goes here!", is_sync_mode=True)
# parameter
info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias",
slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True)
# output tensor with zero slot
info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308",
slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)
# output tensor with non-zero slot
info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300",
slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)
tensor_info = [info1, info2, info3]
tensor_data = debugger_backend.read_tensors(tensor_info)
print_read_tensors(tensor_info, tensor_data)
def print_read_tensors(tensor_info, tensor_data):
"""Print read tensors."""
for x, _ in enumerate(tensor_info):
print("-----------------------------------------------------------")
print("tensor_info_" + str(x+1) + " attributes:")
print("node name = ", tensor_info[x].node_name)
print("slot = ", tensor_info[x].slot)
print("iteration = ", tensor_info[x].iteration)
print("device_id = ", tensor_info[x].device_id)
print("root_graph_id = ", tensor_info[x].root_graph_id)
print("is_parameter = ", tensor_info[x].is_parameter)
print()
print("tensor_data_" + str(x+1) + " attributes:")
print("data (printed in uint8) = ", np.frombuffer(
tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size))
py_byte_size = len(tensor_data[x].data_ptr)
c_byte_size = tensor_data[x].data_size
if c_byte_size != py_byte_size:
print("The python byte size of ", py_byte_size,
" does not match the C++ byte size of ", c_byte_size)
print("size in bytes = ", tensor_data[x].data_size)
print("debugger dtype = ", tensor_data[x].dtype)
print("shape = ", tensor_data[x].shape)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,66 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include "pybind11/stl_bind.h"
#include "debugger/offline_debug/dbg_services.h"
PYBIND11_MODULE(_mindspore_offline_debug, m) {
m.doc() = "pybind11 debug services api";
py::class_<DbgServices>(m, "DbgServices")
.def(py::init<bool>())
.def("Initialize", &DbgServices::Initialize)
.def("AddWatchpoint", &DbgServices::AddWatchpoint)
.def("RemoveWatchpoint", &DbgServices::RemoveWatchpoint)
.def("CheckWatchpoints", &DbgServices::CheckWatchpoints)
.def("ReadTensors", &DbgServices::ReadTensors)
.def("GetVersion", &DbgServices::GetVersion);
py::class_<parameter>(m, "parameter")
.def(py::init<std::string, bool, double, bool, double>())
.def("get_name", &parameter::get_name)
.def("get_disabled", &parameter::get_disabled)
.def("get_value", &parameter::get_value)
.def("get_hit", &parameter::get_hit)
.def("get_actual_value", &parameter::get_actual_value);
py::class_<watchpoint_hit>(m, "watchpoint_hit")
.def(py::init<std::string, uint32_t, int, uint32_t, std::vector<parameter_t>, int32_t, uint32_t, uint32_t>())
.def("get_name", &watchpoint_hit::get_name)
.def("get_slot", &watchpoint_hit::get_slot)
.def("get_condition", &watchpoint_hit::get_condition)
.def("get_watchpoint_id", &watchpoint_hit::get_watchpoint_id)
.def("get_parameters", &watchpoint_hit::get_parameters)
.def("get_error_code", &watchpoint_hit::get_error_code)
.def("get_device_id", &watchpoint_hit::get_device_id)
.def("get_root_graph_id", &watchpoint_hit::get_root_graph_id);
py::class_<tensor_info>(m, "tensor_info")
.def(py::init<std::string, uint32_t, uint32_t, uint32_t, uint32_t, bool>())
.def("get_node_name", &tensor_info::get_node_name)
.def("get_slot", &tensor_info::get_slot)
.def("get_iteration", &tensor_info::get_iteration)
.def("get_device_id", &tensor_info::get_device_id)
.def("get_root_graph_id", &tensor_info::get_root_graph_id)
.def("get_is_parameter", &tensor_info::get_is_parameter);
py::class_<tensor_data>(m, "tensor_data")
.def(py::init<char *, uint64_t, int, std::vector<int64_t>>())
.def("get_data_ptr", &tensor_data::get_data_ptr)
.def("get_data_size", &tensor_data::get_data_size)
.def("get_dtype", &tensor_data::get_dtype)
.def("get_shape", &tensor_data::get_shape);
}
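For orientation, a minimal sketch of how these raw bindings surface in Python once the shared library is built. The module name matches the PYBIND11_MODULE declaration above; in practice the dbg_services.py wrapper later in this commit is the intended entry point, so this is illustrative only:

import mindspore._mindspore_offline_debug as cds

raw = cds.DbgServices(False)  # the constructor argument is the verbose flag
print(raw.GetVersion())       # same call the Python wrapper delegates to

# The snake_case value classes are constructed positionally, e.g.
# tensor_info(node_name, slot, iteration, device_id, root_graph_id, is_parameter):
info = cds.tensor_info("conv2.bias", 0, 2, 0, 0, True)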

View File

@ -0,0 +1,123 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
General Validator Helper Functions.
"""
import os
import inspect
UINT32_MAX = 4294967295
UINT32_MIN = 0
UINT64_MAX = 18446744073709551615
UINT64_MIN = 0
def pad_arg_name(arg_name):
if arg_name != "":
arg_name = arg_name + " "
return arg_name
def check_value(arg, valid_range, arg_name=""):
arg_name = pad_arg_name(arg_name)
if arg < valid_range[0] or arg > valid_range[1]:
raise ValueError(
"Input {0}is not within the required interval of ({1} to {2}).".format(arg_name,
valid_range[0], valid_range[1]))
def check_uint32(arg, arg_name=""):
type_check(arg, (int,), arg_name)
check_value(arg, [UINT32_MIN, UINT32_MAX], arg_name)
def check_uint64(arg, arg_name=""):
type_check(arg, (int,), arg_name)
check_value(arg, [UINT64_MIN, UINT64_MAX], arg_name)
def check_dir(dataset_dir):
if not os.path.isdir(dataset_dir) or not os.access(dataset_dir, os.R_OK):
raise ValueError("The folder {} does not exist or permission denied!".format(dataset_dir))
def parse_user_args(method, *args, **kwargs):
"""
Parse user arguments in a function.
Args:
method (method): a callable function.
args: user passed args.
kwargs: user passed kwargs.
Returns:
user_filled_args (list): values of what the user passed in for the arguments.
ba.arguments (Ordered Dict): ordered dict of parameter and argument for what the user has passed.
"""
sig = inspect.signature(method)
if 'self' in sig.parameters or 'cls' in sig.parameters:
ba = sig.bind(method, *args, **kwargs)
ba.apply_defaults()
params = list(sig.parameters.keys())[1:]
else:
ba = sig.bind(*args, **kwargs)
ba.apply_defaults()
params = list(sig.parameters.keys())
user_filled_args = [ba.arguments.get(arg_value) for arg_value in params]
return user_filled_args, ba.arguments
def type_check(arg, types, arg_name):
"""
Check the type of the parameter.
Args:
arg (Any) : any variable.
types (tuple): tuple of all valid types for arg.
arg_name (str): the name of arg.
Raises:
TypeError: when the type is not correct.
"""
# handle special case of booleans being a subclass of ints
print_value = '\"\"' if repr(arg) == repr('') else arg
if int in types and bool not in types:
if isinstance(arg, bool):
raise TypeError("Argument {0} with value {1} is not of type {2}.".format(arg_name, print_value, types))
if not isinstance(arg, types):
raise TypeError("Argument {0} with value {1} is not of type {2}.".format(arg_name, print_value, types))
def type_check_list(args, types, arg_names):
"""
Check the type of each parameter in the list.
Args:
args (Union[list, tuple]): a list or tuple of any variable.
types (tuple): tuple of all valid types for arg.
arg_names (Union[list, tuple of str]): the names of args.
Raises:
TypeError: when the type of any element is not correct.
ValueError: when the number of args does not match the number of arg_names.
"""
type_check(args, (list, tuple,), arg_names)
if len(args) != len(arg_names) and not isinstance(arg_names, str):
raise ValueError("List of arguments is not the same length as argument_names.")
if isinstance(arg_names, str):
arg_names = ["{0}[{1}]".format(arg_names, i) for i in range(len(args))]
for arg, arg_name in zip(args, arg_names):
type_check(arg, types, arg_name)
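A small, hypothetical illustration of how these helpers behave (the values are made up; the import assumes the module is importable as mi_validator_helpers, the name mi_validators uses below):

from mi_validator_helpers import check_uint32, type_check, type_check_list

check_uint32(7, "slot")                   # within [0, 4294967295]: passes silently
type_check_list([1, 2], (int,), "shape")  # each element checked as shape[0], shape[1]
try:
    type_check(True, (int,), "iteration")  # bool is rejected when only int is allowed
except TypeError as err:
    print(err)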

View File

@ -0,0 +1,223 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Validator Functions for Offline Debugger APIs.
"""
from functools import wraps
import dbg_services as cds
from mi_validator_helpers import parse_user_args, type_check, type_check_list, check_dir, check_uint32, check_uint64
def check_init(method):
"""Wrapper method to check the parameters of DbgServices init."""
@wraps(method)
def new_method(self, *args, **kwargs):
[dump_file_path, verbose], _ = parse_user_args(method, *args, **kwargs)
type_check(dump_file_path, (str,), "dump_file_path")
type_check(verbose, (bool,), "verbose")
check_dir(dump_file_path)
return method(self, *args, **kwargs)
return new_method
def check_initialize(method):
"""Wrapper method to check the parameters of DbgServices Initialize method."""
@wraps(method)
def new_method(self, *args, **kwargs):
[net_name, is_sync_mode], _ = parse_user_args(method, *args, **kwargs)
type_check(net_name, (str,), "net_name")
type_check(is_sync_mode, (bool,), "is_sync_mode")
return method(self, *args, **kwargs)
return new_method
def check_add_watchpoint(method):
"""Wrapper method to check the parameters of DbgServices AddWatchpoint."""
@wraps(method)
def new_method(self, *args, **kwargs):
[id_value, watch_condition, check_node_list, parameter_list], _ = parse_user_args(method, *args, **kwargs)
check_uint32(id_value, "id")
check_uint32(watch_condition, "watch_condition")
type_check(check_node_list, (dict,), "check_node_list")
for node_name, node_info in check_node_list.items():
type_check(node_name, (str,), "node_name")
type_check(node_info, (dict,), "node_info")
for info_name, info_param in node_info.items():
type_check(info_name, (str,), "node parameter name")
if info_name in ["device_id"]:
for param in info_param:
check_uint32(param, "device_id")
elif info_name in ["root_graph_id"]:
for param in info_param:
check_uint32(param, "root_graph_id")
elif info_name in ["is_parameter"]:
type_check(info_param, (bool,), "is_parameter")
else:
raise ValueError("Node parameter {} is not defined.".format(info_name))
param_names = ["param_{0}".format(i) for i in range(len(parameter_list))]
type_check_list(parameter_list, (cds.Parameter,), param_names)
return method(self, *args, **kwargs)
return new_method
def check_remove_watchpoint(method):
"""Wrapper method to check the parameters of DbgServices RemoveWatchpoint."""
@wraps(method)
def new_method(self, *args, **kwargs):
[id_value], _ = parse_user_args(method, *args, **kwargs)
check_uint32(id_value, "id")
return method(self, *args, **kwargs)
return new_method
def check_check_watchpoints(method):
"""Wrapper method to check the parameters of DbgServices CheckWatchpoint."""
@wraps(method)
def new_method(self, *args, **kwargs):
[iteration], _ = parse_user_args(method, *args, **kwargs)
check_uint32(iteration, "iteration")
return method(self, *args, **kwargs)
return new_method
def check_read_tensors(method):
"""Wrapper method to check the parameters of DbgServices ReadTensors."""
@wraps(method)
def new_method(self, *args, **kwargs):
[info_list], _ = parse_user_args(method, *args, **kwargs)
info_names = ["info_{0}".format(i) for i in range(len(info_list))]
type_check_list(info_list, (cds.TensorInfo,), info_names)
return method(self, *args, **kwargs)
return new_method
def check_initialize_done(method):
"""Wrapper method to check if initlize is done for DbgServices."""
@wraps(method)
def new_method(self, *args, **kwargs):
if not self.initialized:
raise RuntimeError("Inilize should be called before any other methods of DbgServices!")
return method(self, *args, **kwargs)
return new_method
def check_tensor_info_init(method):
"""Wrapper method to check the parameters of DbgServices TensorInfo init."""
@wraps(method)
def new_method(self, *args, **kwargs):
[node_name, slot, iteration, device_id, root_graph_id,
is_parameter], _ = parse_user_args(method, *args, **kwargs)
type_check(node_name, (str,), "node_name")
check_uint32(slot, "slot")
check_uint32(iteration, "iteration")
check_uint32(device_id, "device_id")
check_uint32(root_graph_id, "root_graph_id")
type_check(is_parameter, (bool,), "is_parameter")
return method(self, *args, **kwargs)
return new_method
def check_tensor_data_init(method):
"""Wrapper method to check the parameters of DbgServices TensorData init."""
@wraps(method)
def new_method(self, *args, **kwargs):
[data_ptr, data_size, dtype, shape], _ = parse_user_args(method, *args, **kwargs)
type_check(data_ptr, (bytes,), "data_ptr")
check_uint64(data_size, "data_size")
type_check(dtype, (int,), "dtype")
shape_names = ["shape_{0}".format(i) for i in range(len(shape))]
type_check_list(shape, (int,), shape_names)
if len(data_ptr) != data_size:
raise ValueError("data_ptr length ({0}) is not equal to data_size ({1}).".format(len(data_ptr), data_size))
return method(self, *args, **kwargs)
return new_method
def check_watchpoint_hit_init(method):
"""Wrapper method to check the parameters of DbgServices WatchpointHit init."""
@wraps(method)
def new_method(self, *args, **kwargs):
[name, slot, condition, watchpoint_id,
parameters, error_code, device_id, root_graph_id], _ = parse_user_args(method, *args, **kwargs)
type_check(name, (str,), "name")
check_uint32(slot, "slot")
type_check(condition, (int,), "condition")
check_uint32(watchpoint_id, "watchpoint_id")
param_names = ["param_{0}".format(i) for i in range(len(parameters))]
type_check_list(parameters, (cds.Parameter,), param_names)
type_check(error_code, (int,), "error_code")
check_uint32(device_id, "device_id")
check_uint32(root_graph_id, "root_graph_id")
return method(self, *args, **kwargs)
return new_method
def check_parameter_init(method):
"""Wrapper method to check the parameters of DbgServices Parameter init."""
@wraps(method)
def new_method(self, *args, **kwargs):
[name, disabled, value, hit, actual_value], _ = parse_user_args(method, *args, **kwargs)
type_check(name, (str,), "name")
type_check(disabled, (bool,), "disabled")
type_check(value, (float,), "value")
type_check(hit, (bool,), "hit")
type_check(actual_value, (float,), "actual_value")
return method(self, *args, **kwargs)
return new_method
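These wrappers are meant to be stacked as decorators on the user-facing methods, exactly as dbg_services.py does later in this commit. A schematic sketch of the pattern (bodies elided):

class DbgServices:
    @check_init
    def __init__(self, dump_file_path, verbose=False):
        ...

    @check_initialize_done   # outermost decorator runs first, so the
    @check_read_tensors      # initialize() guard fires before argument checks
    def read_tensors(self, info):
        ...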

View File

@ -0,0 +1,19 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "debugger/offline_debug/offline_logger.h"
bool DbgLogger::verbose = false;

View File

@ -0,0 +1,59 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef OFFLINE_LOGGER_H_
#define OFFLINE_LOGGER_H_
#include <iostream>
#define MS_LOG(level) MS_LOG_##level
#define MS_LOG_INFO static_cast<void>(0), !(DbgLogger::verbose) ? void(0) : DbgLogger(DbgLoggerLvl::INFO) < std::cout
#define MS_LOG_ERROR MS_LOG_INFO
#define MS_LOG_DEBUG MS_LOG_INFO
#define MS_LOG_WARNING MS_LOG_INFO
#define MS_LOG_EXCEPTION \
static_cast<void>(0), !(DbgLogger::verbose) ? void(0) : DbgLogger(DbgLoggerLvl::EXCEPTION) < std::cout
enum DbgLoggerLvl : int { DEBUG = 0, INFO, WARNING, ERROR, EXCEPTION };
class DbgLogger {
public:
explicit DbgLogger(DbgLoggerLvl lvl) : lvl_(lvl) {}
~DbgLogger() = default;
void operator<(std::ostream &os) const {
char *dbg_log_path = getenv("OFFLINE_DBG_LOG");
if (dbg_log_path != NULL) {
FILE *fp;
fp = freopen(dbg_log_path, "a", stdout);
if (fp == nullptr) {
std::cout << "ERROR: DbgLogger could not redirect all stdout to a file";
}
}
os << std::endl;
if (lvl_ == DbgLoggerLvl::EXCEPTION) {
throw;
}
}
static bool verbose;
private:
DbgLoggerLvl lvl_;
};
#endif // OFFLINE_LOGGER_H_

View File

@ -22,7 +22,16 @@
#include <tuple>
#include "debug/debugger/tensor_summary.h"
#ifdef OFFLINE_DBG_MODE
#include "Eigen/Core"
#include "Eigen/src/Core/arch/CUDA/Half.h"
using float16 = Eigen::half;
#include "offline_debug/offline_logger.h"
#endif
#ifdef ONLINE_DBG_MODE
namespace mindspore {
#endif
using CONDITION_TYPE = DebugServices::CONDITION_TYPE;
RangeCountCalculator::RangeCountCalculator()
@ -281,4 +290,6 @@ template class TensorSummary<float16>;
template class TensorSummary<float>;
template class TensorSummary<double>;
template class TensorSummary<bool>;
#ifdef ONLINE_DBG_MODE
} // namespace mindspore
#endif

View File

@ -24,7 +24,9 @@
#include "debug/debug_services.h"
#ifdef ONLINE_DBG_MODE
namespace mindspore {
#endif
class RangeCountCalculator {
public:
RangeCountCalculator();
@ -121,5 +123,7 @@ class TensorSummary : public ITensorSummary {
double_t GetZeroValPercent();
void InitCalculators(const std::vector<DebugServices::watchpoint_t> &);
};
#ifdef ONLINE_DBG_MODE
} // namespace mindspore
#endif
#endif // MINDSPORE_TENSOR_SUMMARY_H

View File

@ -16,37 +16,170 @@
#ifndef MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_
#define MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_
#include <algorithm>
#include <vector>
#include <string>
#include <cstring>
#include <iostream>
#ifdef OFFLINE_DBG_MODE
#include "debugger/offline_debug/offline_logger.h"
#else
#include "ir/tensor.h"
#include "mindspore/core/utils/log_adapter.h"
#endif
#ifdef ONLINE_DBG_MODE
namespace mindspore {
class TensorData {
private:
mindspore::tensor::TensorPtr tensor_ptr;
std::string name;
size_t slot;
int execution_order;
#endif
namespace MsTypeId {
typedef enum MsTypeId : unsigned int {
kTypeUnknown = 0,
kMetaTypeBegin = kTypeUnknown,
kMetaTypeType, // Type
kMetaTypeAnything,
kMetaTypeObject,
kMetaTypeTypeType, // TypeType
kMetaTypeProblem,
kMetaTypeExternal,
kMetaTypeNone,
kMetaTypeNull,
kMetaTypeEllipsis,
kMetaTypeEnd,
//
// Object types
//
kObjectTypeBegin = kMetaTypeEnd,
kObjectTypeNumber,
kObjectTypeString,
kObjectTypeList,
kObjectTypeTuple,
kObjectTypeSlice,
kObjectTypeKeyword,
kObjectTypeTensorType,
kObjectTypeRowTensorType,
kObjectTypeSparseTensorType,
kObjectTypeUndeterminedType,
kObjectTypeClass,
kObjectTypeDictionary,
kObjectTypeFunction,
kObjectTypeJTagged,
kObjectTypeSymbolicKeyType,
kObjectTypeEnvType,
kObjectTypeRefKey,
kObjectTypeRef,
kObjectTypeEnd,
//
// Number Types
//
kNumberTypeBegin = kObjectTypeEnd,
kNumberTypeBool,
kNumberTypeInt,
kNumberTypeInt8,
kNumberTypeInt16,
kNumberTypeInt32,
kNumberTypeInt64,
kNumberTypeUInt,
kNumberTypeUInt8,
kNumberTypeUInt16,
kNumberTypeUInt32,
kNumberTypeUInt64,
kNumberTypeFloat,
kNumberTypeFloat16,
kNumberTypeFloat32,
kNumberTypeFloat64,
kNumberTypeComplex64,
kNumberTypeEnd
} MsTypeId;
} // namespace MsTypeId
typedef enum DbgDataType : unsigned int {
DT_UNDEFINED = 0,
// Basic types.
DT_BOOL = 1, // bool
DT_INT8 = 2, // int8_t
DT_INT16 = 3, // int16_t
DT_INT32 = 4, // int32_t
DT_INT64 = 5, // int64_t
DT_UINT8 = 6, // uint8_t
DT_UINT16 = 7, // uint16_t
DT_UINT32 = 8, // uint32_t
DT_UINT64 = 9, // uint64_t
DT_FLOAT16 = 10, // float 16
DT_FLOAT32 = 11, // float 32
DT_FLOAT64 = 12, // float 64
DT_STRING = 13, // string
DT_TENSOR = 14, // tensor
DT_GRAPH = 15, // graph
// list type
DT_BOOLS = 16, // list of bool
DT_INTS8 = 17, // list of int8_t
DT_INTS16 = 18, // list of int16_t
DT_INTS32 = 19, // list of int32_t
DT_INTS64 = 20, // list of int64_t
DT_UINTS8 = 21, // list of uint8_t
DT_UINTS16 = 22, // list of uint16_t
DT_UINTS32 = 23, // list of uint32_t
DT_UINTS64 = 24, // list of uint64_t
DT_FLOATS16 = 25, // list of float16
DT_FLOATS32 = 26, // list of float32
DT_FLOATS64 = 27, // list of float64
DT_STRINGS = 28, // list of string
DT_TENSORS = 29, // list of tensor
DT_GRAPHS = 30, // list of graph
DT_TUPLE = 31, // tuple
DT_LIST = 32, // list
DT_DICT = 33, // dictionary
// other types
DT_NONE = 34, // None
DT_SYM_INST = 35, // Symbolic Key Instance
// type related type
DT_BASE_INT = 36, // type generic int
DT_BASE_UINT = 37, // type generic unsigned int
DT_BASE_FLOAT = 38, // type generic float
DT_TYPE = 39, // type type
DT_ANYTHING = 40, // type anything
DT_REFKEY = 41, // type refkey
DT_REF = 42 // type ref
} DbgDataType;
class TensorData {
public:
TensorData() : slot(0), execution_order(-1) {}
TensorData(const TensorData &obj) {
std::cout << "Copy Constructor" << std::endl;
MS_LOG(INFO) << "Copy Constructor";
this->name = obj.name;
this->execution_order = obj.execution_order;
this->slot = obj.slot;
this->data_ptr = obj.data_ptr;
this->size = obj.size;
this->data_type = obj.data_type;
this->data_type_size = obj.data_type_size;
this->shape = obj.shape;
this->iteration = obj.iteration;
this->device_id = obj.device_id;
#ifdef ONLINE_DBG_MODE
this->tensor_ptr = obj.tensor_ptr;
#endif
}
~TensorData() {}
std::string GetName() { return this->name; }
mindspore::tensor::TensorPtr GetTensor() { return this->tensor_ptr; }
size_t GetSlot() { return this->slot; }
int GetExecutionOrder() { return this->execution_order; }
@ -55,9 +188,179 @@ class TensorData {
void SetName(const std::string &name) { this->name = name; }
#ifdef ONLINE_DBG_MODE
void SetTensor(mindspore::tensor::TensorPtr out_tensor) { this->tensor_ptr = out_tensor; }
#endif
void SetSlot(size_t slot) { this->slot = slot; }
char *GetDataPtr() { return data_ptr; }
void SetDataPtr(char *data_ptr) { this->data_ptr = data_ptr; }
uint32_t GetNumElements() { return size / data_type_size; }
uint64_t GetByteSize() { return size; }
void SetByteSize(uint64_t size) { this->size = size; }
std::vector<int64_t> GetShape() { return shape; }
void SetShape(std::vector<int64_t> shape) { this->shape = shape; }
unsigned int GetIteration() { return iteration; }
void SetIteration(unsigned int iteration) { this->iteration = iteration; }
unsigned int GetDeviceId() { return device_id; }
void SetDeviceId(unsigned int device_id) { this->device_id = device_id; }
unsigned int GetRootGraphId() { return root_graph_id; }
void SetRootGraphId(unsigned int root_graph_id) { this->root_graph_id = root_graph_id; }
DbgDataType GetType() { return data_type; }
void SetType(unsigned int type) { ConvertMsToDbgType(type); }
void SetType(std::string type_name) { ConvertStringToDbgType(type_name); }
void ConvertMsToDbgType(uint32_t type) {
switch (type) {
case MsTypeId::kNumberTypeBool:
this->data_type = DbgDataType::DT_BOOL;
this->data_type_size = 1;
break;
case MsTypeId::kNumberTypeInt8:
this->data_type = DbgDataType::DT_INT8;
this->data_type_size = 1;
break;
case MsTypeId::kNumberTypeInt16:
this->data_type = DbgDataType::DT_INT16;
this->data_type_size = 2;
break;
case MsTypeId::kNumberTypeInt32:
this->data_type = DbgDataType::DT_INT32;
this->data_type_size = 4;
break;
case MsTypeId::kNumberTypeInt64:
this->data_type = DbgDataType::DT_INT64;
this->data_type_size = 8;
break;
case MsTypeId::kNumberTypeUInt8:
this->data_type = DbgDataType::DT_UINT8;
this->data_type_size = 1;
break;
case MsTypeId::kNumberTypeUInt16:
this->data_type = DbgDataType::DT_UINT16;
this->data_type_size = 2;
break;
case MsTypeId::kNumberTypeUInt32:
this->data_type = DbgDataType::DT_UINT32;
this->data_type_size = 4;
break;
case MsTypeId::kNumberTypeUInt64:
this->data_type = DbgDataType::DT_UINT64;
this->data_type_size = 8;
break;
case MsTypeId::kNumberTypeFloat16:
this->data_type = DbgDataType::DT_FLOAT16;
this->data_type_size = 2;
break;
case MsTypeId::kNumberTypeFloat32:
this->data_type = DbgDataType::DT_FLOAT32;
this->data_type_size = 4;
break;
case MsTypeId::kNumberTypeFloat64:
this->data_type = DbgDataType::DT_FLOAT64;
this->data_type_size = 8;
break;
case MsTypeId::kNumberTypeInt:
this->data_type = DbgDataType::DT_BASE_INT;
this->data_type_size = 4;
break;
case MsTypeId::kNumberTypeUInt:
this->data_type = DbgDataType::DT_BASE_UINT;
this->data_type_size = 4;
break;
case MsTypeId::kNumberTypeFloat:
this->data_type = DbgDataType::DT_BASE_FLOAT;
this->data_type_size = 4;
break;
default:
MS_LOG(EXCEPTION) << "Unexpected type id: " << type;
}
}
void ConvertStringToDbgType(const std::string &type_name) {
std::string type_name_lower = type_name;
std::string trans_true_prefix = "kNumberType";
if (type_name.find(trans_true_prefix) == 0) {
type_name_lower = type_name.substr(trans_true_prefix.length());
}
(void)std::transform(type_name_lower.begin(), type_name_lower.end(), type_name_lower.begin(), ::tolower);
if (type_name_lower == "bool") {
this->data_type = DbgDataType::DT_BOOL;
this->data_type_size = 1;
} else if (type_name_lower == "int8") {
this->data_type = DbgDataType::DT_INT8;
this->data_type_size = 1;
} else if (type_name_lower == "int16") {
this->data_type = DbgDataType::DT_INT16;
this->data_type_size = 2;
} else if (type_name_lower == "int32") {
this->data_type = DbgDataType::DT_INT32;
this->data_type_size = 4;
} else if (type_name_lower == "int64") {
this->data_type = DbgDataType::DT_INT64;
this->data_type_size = 8;
} else if (type_name_lower == "uint8") {
this->data_type = DbgDataType::DT_UINT8;
this->data_type_size = 1;
} else if (type_name_lower == "uint16") {
this->data_type = DbgDataType::DT_UINT16;
this->data_type_size = 2;
} else if (type_name_lower == "uint32") {
this->data_type = DbgDataType::DT_UINT32;
this->data_type_size = 4;
} else if (type_name_lower == "uint64") {
this->data_type = DbgDataType::DT_UINT64;
this->data_type_size = 8;
} else if (type_name_lower == "float16") {
this->data_type = DbgDataType::DT_FLOAT16;
this->data_type_size = 2;
} else if (type_name_lower == "float32") {
this->data_type = DbgDataType::DT_FLOAT32;
this->data_type_size = 4;
} else if (type_name_lower == "float64") {
this->data_type = DbgDataType::DT_FLOAT64;
this->data_type_size = 8;
} else if (type_name_lower == "") {
this->data_type = DbgDataType::DT_UNDEFINED;
this->data_type_size = 0;
} else {
MS_LOG(EXCEPTION) << "Unexpected type name: " << type_name;
}
}
private:
char *data_ptr; // pointer to the pre-allocated memory
uint64_t size; // size in bytes
DbgDataType data_type; // internal debugger type
unsigned int data_type_size;
std::vector<int64_t> shape;
std::string name;
uint64_t slot;
unsigned int iteration;
unsigned int device_id;
unsigned int root_graph_id;
int execution_order;
#ifdef ONLINE_DBG_MODE
mindspore::tensor::TensorPtr tensor_ptr;
#endif
};
#ifdef ONLINE_DBG_MODE
} // namespace mindspore
#endif
#endif // MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_
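The DbgDataType codes above are what the offline APIs report as the debugger dtype (the expected outputs earlier show 11 for float32 and 8 for uint32 data). A minimal sketch of decoding a returned buffer with numpy; the mapping below is an illustrative subset written for this example, not part of the API:

import numpy as np

# Illustrative subset of the DbgDataType encoding defined above.
DBG_TO_NP = {4: np.int32, 8: np.uint32, 11: np.float32, 12: np.float64}

def to_ndarray(tensor_data):
    """Reinterpret a TensorData buffer using its debugger dtype and shape."""
    np_type = DBG_TO_NP[tensor_data.dtype]
    return np.frombuffer(tensor_data.data_ptr, np_type).reshape(tensor_data.shape)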

View File

@ -23,10 +23,14 @@
#include <tuple>
#include <string>
#include <utility>
#ifdef OFFLINE_DBG_MODE
#include "debugger/offline_debug/offline_logger.h"
#endif
#include "debug/tensor_data.h"
#ifdef ONLINE_DBG_MODE
#include "debug/data_dump/dump_json_parser.h"
#include "ir/dtype.h"
namespace mindspore {
#endif
class TensorLoader {
public:
TensorLoader() : iter_num(-1) {}
@ -152,9 +156,10 @@ class TensorLoader {
void set_iter_num(uint32_t iter_num) { this->iter_num = iter_num; }
#ifdef ONLINE_DBG_MODE
bool DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
const std::string &host_fmt, const std::vector<int64_t> &host_shape, TypeId host_type,
TypeId addr_type_id, const std::string &addr_format, size_t slot) const {
TypeId addr_type_id, const std::string &addr_format, size_t slot) {
if (filepath.empty()) {
MS_LOG(ERROR) << "Dump file path is null!";
return false;
@ -181,21 +186,24 @@ class TensorLoader {
auto iter = tensor_list_map.find(tensor_loader_name);
if (iter != tensor_list_map.end()) {
std::shared_ptr<TensorData> node = iter->second;
mindspore::tensor::TensorPtr out_tensor = node->GetTensor();
size_t host_size = out_tensor->data().nbytes();
size_t host_size = node->GetByteSize();
return DumpJsonParser::DumpToFile(path, out_tensor->data_c(), host_size);
return DumpJsonParser::DumpToFile(path, node->GetDataPtr(), host_size);
}
MS_LOG(INFO) << "Tensor name:" << tensor_name << " not found in tensor_list_map";
return true;
}
#endif
private:
// the pair is (device_id, iteration)
std::map<std::string, std::shared_ptr<TensorData>> tensor_list_map;
std::multimap<std::string, std::shared_ptr<TensorData>> node_tensor_map;
std::map<std::string, std::shared_ptr<TensorData>> prev_tensor_list_map;
uint32_t iter_num;
std::mutex lock_;
};
#ifdef ONLINE_DBG_MODE
} // namespace mindspore
#endif
#endif // MINDSPORE_CCSRC_DEBUG_TENSOR_LOAD_H_

View File

@ -713,6 +713,10 @@ bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int exec
}
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
tensor_data->SetTensor(out_tensor);
tensor_data->SetDataPtr(static_cast<char *>(out_tensor->data_c()));
tensor_data->SetByteSize(out_tensor->data().nbytes());
tensor_data->SetType((unsigned int)host_type);
tensor_data->SetShape(out_tensor->shape());
ret = Debugger::GetInstance()->LoadNewTensor(tensor_data, keep_prev);
return ret;
}

View File

@ -93,7 +93,7 @@ void GPUDeviceAddress::ClearDeviceMemory() {
}
GPUDeviceAddress::~GPUDeviceAddress() { ClearDeviceMemory(); }
#ifdef ENABLE_DEBUGGER
bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot,
bool keep_prev) const {
@ -117,13 +117,16 @@ bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int executi
auto tensor_data = std::make_shared<mindspore::TensorData>();
tensor_data->SetName(tensor_name);
tensor_data->SetExecutionOrder(execution_order);
tensor_data->SetTensor(out_tensor);
tensor_data->SetSlot(slot);
tensor_data->SetTensor(out_tensor);
tensor_data->SetDataPtr(static_cast<char *>(out_tensor->data_c()));
tensor_data->SetByteSize(out_tensor->data().nbytes());
tensor_data->SetType((unsigned int)host_type);
tensor_data->SetShape(out_tensor->shape());
ret = Debugger::GetInstance()->LoadNewTensor(tensor_data, keep_prev);
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
return ret;
}
#endif
} // namespace gpu
} // namespace device
} // namespace mindspore

View File

@ -114,32 +114,33 @@ static int GetSlogLevel(MsLogLevel level) {
static const char *GetSubModuleName(SubModuleId module_id) {
static const char *sub_module_names[NUM_SUBMODUES] = {
"UNKNOWN", // SM_UNKNOWN
"CORE", // SM_CORE
"ANALYZER", // SM_ANALYZER
"COMMON", // SM_COMMON
"DEBUG", // SM_DEBUG
"DEVICE", // SM_DEVICE
"GE_ADPT", // SM_GE_ADPT
"IR", // SM_IR
"KERNEL", // SM_KERNEL
"MD", // SM_MD
"ME", // SM_ME
"EXPRESS", // SM_EXPRESS
"OPTIMIZER", // SM_OPTIMIZER
"PARALLEL", // SM_PARALLEL
"PARSER", // SM_PARSER
"PIPELINE", // SM_PIPELINE
"PRE_ACT", // SM_PRE_ACT
"PYNATIVE", // SM_PYNATIVE
"SESSION", // SM_SESSION
"UTILS", // SM_UTILS
"VM", // SM_VM
"PROFILER", // SM_PROFILER
"PS", // SM_PS
"LITE", // SM_LITE
"HCCL_ADPT", // SM_HCCL_ADPT
"MINDQUANTUM" // SM_MINDQUANTUM
"UNKNOWN", // SM_UNKNOWN
"CORE", // SM_CORE
"ANALYZER", // SM_ANALYZER
"COMMON", // SM_COMMON
"DEBUG", // SM_DEBUG
"OFFLINE_DEBUG", // SM_OFFLINE_DEBUG
"DEVICE", // SM_DEVICE
"GE_ADPT", // SM_GE_ADPT
"IR", // SM_IR
"KERNEL", // SM_KERNEL
"MD", // SM_MD
"ME", // SM_ME
"EXPRESS", // SM_EXPRESS
"OPTIMIZER", // SM_OPTIMIZER
"PARALLEL", // SM_PARALLEL
"PARSER", // SM_PARSER
"PIPELINE", // SM_PIPELINE
"PRE_ACT", // SM_PRE_ACT
"PYNATIVE", // SM_PYNATIVE
"SESSION", // SM_SESSION
"UTILS", // SM_UTILS
"VM", // SM_VM
"PROFILER", // SM_PROFILER
"PS", // SM_PS
"LITE", // SM_LITE
"HCCL_ADPT", // SM_HCCL_ADPT
"MINDQUANTUM" // SM_MINDQUANTUM
};
return sub_module_names[module_id % NUM_SUBMODUES];

View File

@ -111,6 +111,7 @@ enum SubModuleId : int {
SM_ANALYZER, // static analyzer
SM_COMMON, // common
SM_DEBUG, // debug
SM_OFFLINE_DEBUG, // offline debug
SM_DEVICE, // device
SM_GE_ADPT, // ge adapter
SM_IR, // IR

View File

@ -0,0 +1,21 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module provides APIs to load and process dump data, i.e. read tensors, check
watchpoints, and other debugging services.
"""
from . import dbg_services
from . import mi_validator_helpers
from . import mi_validators
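A minimal end-to-end sketch of the intended flow (the path and node name are placeholders; the calls mirror the test scripts earlier in this commit):

import mindspore.offline_debug.dbg_services as d

dbg = d.DbgServices(dump_file_path="/path/to/dump")  # placeholder path
dbg.initialize(net_name="my_net", is_sync_mode=True)
param = d.Parameter(name="param", disabled=False, value=0.0)
dbg.add_watchpoint(watchpoint_id=1, watch_condition=6,  # 6 == MIN_LT
                   check_node_list={"node/name": {"device_id": [0],
                                                  "root_graph_id": [0],
                                                  "is_parameter": False}},
                   parameter_list=[param])
hits = dbg.check_watchpoints(iteration=2)  # list of WatchpointHit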

View File

@ -0,0 +1,870 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
The module DbgServices provides offline debugger APIs.
"""
import mindspore._mindspore_offline_debug as cds
from mindspore.offline_debug.mi_validators import check_init, check_initialize, check_add_watchpoint, check_remove_watchpoint, check_check_watchpoints, check_read_tensors, check_initialize_done, check_tensor_info_init, check_tensor_data_init, check_watchpoint_hit_init, check_parameter_init
def get_version():
"""
Function to return offline Debug Services version.
Returns:
version (str): DbgServices version.
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> version = dbg_services.get_version()
"""
return cds.DbgServices(False).GetVersion()
class DbgLogger:
"""
Offline Debug Services Logger
Args:
verbose (bool): whether to print logs.
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> logger = dbg_services.DbgLogger(verbose=False)
"""
def __init__(self, verbose):
self.verbose = verbose
def __call__(self, *logs):
if self.verbose:
print(logs)
log = DbgLogger(False)
class DbgServices():
"""
Offline Debug Services class.
Args:
dump_file_path (str): directory where the dump files are saved.
verbose (bool): whether to print logs (default: False).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
>>> verbose=True)
"""
@check_init
def __init__(self, dump_file_path, verbose=False):
log.verbose = verbose
log("in Python __init__, file path is ", dump_file_path)
self.dump_file_path = dump_file_path
self.dbg_instance = cds.DbgServices(verbose)
self.version = self.dbg_instance.GetVersion()
self.verbose = verbose
self.initialized = False
@check_initialize
def initialize(self, net_name, is_sync_mode=True):
"""
Initialize Debug Service.
Args:
net_name (str): Network name.
is_sync_mode (bool): Whether the dump files were generated in synchronous or asynchronous mode
(default: True, i.e. synchronous).
Returns:
Initialized Debug Service instance.
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
>>> verbose=True)
>>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
"""
log("in Python Initialize dump_file_path ", self.dump_file_path)
self.initialized = True
return self.dbg_instance.Initialize(net_name, self.dump_file_path, is_sync_mode)
@check_initialize_done
@check_add_watchpoint
def add_watchpoint(self, watchpoint_id, watch_condition, check_node_list, parameter_list):
"""
Adding watchpoint to Debug Service instance.
Args:
watchpoint_id (int): Watchpoint id.
watch_condition (int): A representation of the condition to be checked.
check_node_list (dict): Dictionary of node names (str or '*' to check all nodes) as key,
mapping to device_id (list of ints or '*' to check all devices),
root_graph_id (list of ints or '*' to check all graphs) and is_parameter (bool).
parameter_list (list): List of parameters in watchpoint. Parameters should be instances of Parameter class.
Each parameter describes the value to be checked in watchpoint.
Returns:
Debug Service instance with added watchpoint.
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
>>> verbose=True)
>>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
>>> d_wp = d_init.add_watchpoint(watchpoint_id=1,
>>> watch_condition=6,
>>> check_node_list={"conv2.bias" : {"device_id": [0],
root_graph_id: [0], "is_parameter": True}},
>>> parameter_list=[dbg_services.Parameter(name="param",
>>> disabled=False,
>>> value=0.0,
>>> hit=False,
>>> actual_value=0.0)])
"""
log("in Python AddWatchpoint")
for node_name, node_info in check_node_list.items():
for info_name, info_param in node_info.items():
if info_name in ["device_id", "root_graph_id"]:
if info_param in ["*"]:
check_node_list[node_name][info_name] = ["*"]
else:
check_node_list[node_name][info_name] = list(map(str, info_param))
parameter_list_inst = []
for elem in parameter_list:
parameter_list_inst.append(elem.instance)
return self.dbg_instance.AddWatchpoint(watchpoint_id, watch_condition, check_node_list, parameter_list_inst)
@check_initialize_done
@check_remove_watchpoint
def remove_watchpoint(self, watchpoint_id):
"""
Removing watchpoint from Debug Service instance.
Args:
watchpoint_id (int): Watchpoint id.
Returns:
Debug Service instance with removed watchpoint.
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
>>> verbose=True)
>>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
>>> d_wp = d_init.add_watchpoint(watchpoint_id=1,
>>> watch_condition=6,
>>> check_node_list={"conv2.bias" : {"device_id": [5],
root_graph_id: [0], "is_parameter": True}},
>>> parameter_list=[dbg_services.Parameter(name="param",
>>> disabled=False,
>>> value=0.0,
>>> hit=False,
>>> actual_value=0.0)])
>>> d_wp = d_wp.remove_watchpoint(watchpoint_id=1)
"""
log("in Python Remove Watchpoint id ", watchpoint_id)
return self.dbg_instance.RemoveWatchpoint(watchpoint_id)
@check_initialize_done
@check_check_watchpoints
def check_watchpoints(self, iteration):
"""
Checking watchpoint at given iteration.
Args:
iteration (int): Watchpoint check iteration.
Returns:
Watchpoint hit list.
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
>>> verbose=True)
>>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
>>> d_wp = d_init.add_watchpoint(watchpoint_id=1,
>>> watch_condition=6,
>>> check_node_list={"conv2.bias" : {"device_id": [5],
root_graph_id: [0], "is_parameter": True}},
>>> parameter_list=[dbg_services.Parameter(name="param",
>>> disabled=False,
>>> value=0.0,
>>> hit=False,
>>> actual_value=0.0)])
>>> watchpoints = d_wp.check_watchpoints(iteration=8)
"""
log("in Python CheckWatchpoints iteration ", iteration)
watchpoint_list = self.dbg_instance.CheckWatchpoints(iteration)
watchpoint_hit_list = []
for watchpoint in watchpoint_list:
name = watchpoint.get_name()
slot = watchpoint.get_slot()
condition = watchpoint.get_condition()
watchpoint_id = watchpoint.get_watchpoint_id()
parameters = watchpoint.get_parameters()
error_code = watchpoint.get_error_code()
device_id = watchpoint.get_device_id()
root_graph_id = watchpoint.get_root_graph_id()
param_list = []
for param in parameters:
p_name = param.get_name()
disabled = param.get_disabled()
value = param.get_value()
hit = param.get_hit()
actual_value = param.get_actual_value()
param_list.append(Parameter(p_name, disabled, value, hit, actual_value))
watchpoint_hit_list.append(WatchpointHit(name, slot, condition, watchpoint_id,
param_list, error_code, device_id, root_graph_id))
return watchpoint_hit_list
@check_initialize_done
@check_read_tensors
def read_tensors(self, info):
"""
Returning tensor data objects describing the requested tensors.
Args:
info (list): List of TensorInfo objects.
Returns:
TensorData list (list).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> d = dbg_services.DbgServices(dump_file_path="dump_file_path",
>>> verbose=True)
>>> d_init = d.initialize(net_name="network name", is_sync_mode=True)
>>> tensor_data_list = d_init.read_tensors([dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)])
"""
log("in Python ReadTensors info ", info)
info_list_inst = []
for elem in info:
log("in Python ReadTensors info ", info)
info_list_inst.append(elem.instance)
tensor_data_list = self.dbg_instance.ReadTensors(info_list_inst)
tensor_data_list_ret = []
for elem in tensor_data_list:
if elem.get_data_size() == 0:
tensor_data = TensorData(b'', elem.get_data_size(), elem.get_dtype(), elem.get_shape())
else:
tensor_data = TensorData(elem.get_data_ptr(), elem.get_data_size(), elem.get_dtype(), elem.get_shape())
tensor_data_list_ret.append(tensor_data)
return tensor_data_list_ret
class TensorInfo():
"""
Tensor Information class.
Args:
node_name (str): Fully qualified name of the desired node.
slot (int): The particular output for the requested node.
iteration (int): The desired iteration to gather tensor information.
device_id (int): The desired device id to gather tensor information.
root_graph_id (int): The desired root graph id to gather tensor information.
is_parameter (bool): Whether node is a parameter (input, constant, bias, parameter).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)
"""
@check_tensor_info_init
def __init__(self, node_name, slot, iteration, device_id, root_graph_id, is_parameter):
self.instance = cds.tensor_info(node_name, slot, iteration, device_id, root_graph_id, is_parameter)
@property
def node_name(self):
"""
Function to receive TensorInfo node_name.
Returns:
node_name of TensorInfo instance (str).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)
>>> name = tensor_info.node_name
"""
return self.instance.get_node_name()
@property
def slot(self):
"""
Function to receive TensorInfo slot.
Returns:
slot of TensorInfo instance (int).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)
>>> slot = tensor_info.slot
"""
return self.instance.get_slot()
@property
def iteration(self):
"""
Function to receive TensorInfo iteration.
Returns:
iteration of TensorInfo instance (int).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)
>>> iteration = tensor_info.iteration
"""
return self.instance.get_iteration()
@property
def device_id(self):
"""
Function to receive TensorInfo device_id.
Returns:
device_id of TensorInfo instance (int).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)
>>> device_id = tensor_info.device_id
"""
@property
def root_graph_id(self):
"""
Function to receive TensorInfo root_graph_id.
Returns:
root_graph_id of TensorInfo instance (int).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)
>>> root_graph_id = tensor_info.root_graph_id
"""
return self.instance.get_root_graph_id()
@property
def is_parameter(self):
"""
Function to receive TensorInfo is_parameter.
Returns:
is_parameter of TensorInfo instance (bool).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> tensor_info = dbg_services.TensorInfo(node_name="conv2.bias",
>>> slot=0,
>>> iteration=8,
>>> device_id=5,
>>> root_graph_id=0,
>>> is_parameter=True)
>>> is_parameter = tensor_info.is_parameter
"""
return self.instance.get_is_parameter()
class TensorData():
"""
TensorData class.
Args:
data_ptr (byte): Data pointer.
data_size (int): Size of data in bytes.
dtype (int): An encoding representing the type of TensorData.
shape (list): Shape of tensor.
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
>>> data_size=4,
>>> dtype=0,
>>> shape=[2, 2])
"""
@check_tensor_data_init
def __init__(self, data_ptr, data_size, dtype, shape):
self.instance = cds.tensor_data(data_ptr, data_size, dtype, shape)
@property
def data_ptr(self):
"""
Function to receive TensorData data_ptr.
Returns:
data_ptr of TensorData instance (byte).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
>>> data_size=4,
>>> dtype=0,
>>> shape=[2, 2])
>>> data_ptr = tensor_data.data_ptr
"""
return self.instance.get_data_ptr()
@property
def data_size(self):
"""
Function to receive TensorData data_size.
Returns:
data_size of TensorData instance (int).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
>>> data_size=4,
>>> dtype=0,
>>> shape=[2, 2])
>>> data_size = tensor_data.data_size
"""
return self.instance.get_data_size()
@property
def dtype(self):
"""
Function to receive TensorData dtype.
Returns:
dtype of TensorData instance (int).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
>>> data_size=4,
>>> dtype=0,
>>> shape=[2, 2])
>>> dtype = tensor_data.dtype
"""
return self.instance.get_dtype()
@property
def shape(self):
"""
Function to receive TensorData shape.
Returns:
shape of TensorData instance (list).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> tensor_data = dbg_services.TensorData(data_ptr=b'\xba\xd0\xba\xd0',
>>> data_size=4,
>>> dtype=0,
>>> shape=[2, 2])
>>> shape = tensor_data.shape
"""
return self.instance.get_shape()
class WatchpointHit():
"""
WatchpointHit class.
Args:
name (str): Name of WatchpointHit instance.
slot (int): The numerical label of an output.
condition (int): A representation of the condition to be checked.
watchpoint_id (int): Watchpoint id.
parameters (list): A list of all parameters for WatchpointHit instance.
Parameters have to be instances of Parameter class.
error_code (int): An explanation of certain scenarios where watchpoint could not be checked.
device_id (int): Device id where the watchpoint is hit.
root_graph_id (int): Root graph id where the watchpoint is hit.
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
>>> slot=1,
>>> condition=2,
>>> watchpoint_id=3,
>>> parameters=[param1, param2],
>>> error_code=0,
>>> device_id=1,
>>> root_graph_id=1)
"""
@check_watchpoint_hit_init
def __init__(self, name, slot, condition, watchpoint_id, parameters, error_code, device_id, root_graph_id):
parameter_list_inst = []
for elem in parameters:
parameter_list_inst.append(elem.instance)
self.instance = cds.watchpoint_hit(name, slot, condition, watchpoint_id,
parameter_list_inst, error_code, device_id, root_graph_id)
@property
def name(self):
"""
Function to receive WatchpointHit name.
Returns:
name of WatchpointHit instance (str).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
>>> slot=1,
>>> condition=2,
>>> watchpoint_id=3,
>>> parameters=[param1, param2],
>>> error_code=0,
>>> device_id=1,
>>> root_graph_id=1)
>>> name = watchpoint_hit.name
"""
return self.instance.get_name()
@property
def slot(self):
"""
Function to receive WatchpointHit slot.
Returns:
slot of WatchpointHit instance (int).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
>>> slot=1,
>>> condition=2,
>>> watchpoint_id=3,
>>> parameters=[param1, param2],
>>> error_code=0,
>>> device_id=1,
>>> root_graph_id=1)
>>> slot = watchpoint_hit.slot
"""
return self.instance.get_slot()
@property
def condition(self):
"""
Function to receive WatchpointHit condition.
Returns:
condition of WatchpointHit instance (int).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
... slot=1,
... condition=2,
... watchpoint_id=3,
... parameters=[param1, param2],
... error_code=0,
... device_id=1,
... root_graph_id=1)
>>> condition = watchpoint_hit.condition
"""
return self.instance.get_condition()
@property
def watchpoint_id(self):
"""
Function to receive WatchpointHit watchpoint_id.
Returns:
watchpoint_id of WatchpointHit instance (int).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
... slot=1,
... condition=2,
... watchpoint_id=3,
... parameters=[param1, param2],
... error_code=0,
... device_id=1,
... root_graph_id=1)
>>> watchpoint_id = watchpoint_hit.watchpoint_id
"""
return self.instance.get_watchpoint_id()
@property
def parameters(self):
"""
Function to receive WatchpointHit parameters.
Returns:
List of parameters of WatchpointHit instance (list).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
... slot=1,
... condition=2,
... watchpoint_id=3,
... parameters=[param1, param2],
... error_code=0,
... device_id=1,
... root_graph_id=1)
>>> parameters = watchpoint_hit.parameters
"""
params = self.instance.get_parameters()
param_list = []
for elem in params:
tmp = Parameter(elem.get_name(),
elem.get_disabled(),
elem.get_value(),
elem.get_hit(),
elem.get_actual_value())
param_list.append(tmp)
return param_list
@property
def error_code(self):
"""
Function to receive WatchpointHit error_code.
Returns:
error_code of WatchpointHit instance (int).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
... slot=1,
... condition=2,
... watchpoint_id=3,
... parameters=[param1, param2],
... error_code=0,
... device_id=1,
... root_graph_id=1)
>>> error_code = watchpoint_hit.error_code
"""
return self.instance.get_error_code()
@property
def device_id(self):
"""
Function to receive WatchpointHit device_id.
Returns:
device_id of WatchpointHit instance (int).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
... slot=1,
... condition=2,
... watchpoint_id=3,
... parameters=[param1, param2],
... error_code=0,
... device_id=1,
... root_graph_id=1)
>>> device_id = watchpoint_hit.device_id
"""
return self.instance.get_device_id()
@property
def root_graph_id(self):
"""
Function to receive WatchpointHit root_graph_id.
Returns:
root_graph_id of WatchpointHit instance (int).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> watchpoint_hit = dbg_services.WatchpointHit(name="hit1",
... slot=1,
... condition=2,
... watchpoint_id=3,
... parameters=[param1, param2],
... error_code=0,
... device_id=1,
... root_graph_id=1)
>>> root_graph_id = watchpoint_hit.root_graph_id
"""
return self.instance.get_root_graph_id()
class Parameter():
"""
Parameter class.
Args:
name (str): Name of the parameter.
disabled (bool): Whether the parameter is used in the backend.
value (float): Threshold value of the parameter.
hit (bool): Whether this parameter triggered the watchpoint (default is False).
actual_value (float): Actual value of the parameter (default is 0.0).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> parameter = dbg_services.Parameter(name="param",
... disabled=False,
... value=0.0,
... hit=False,
... actual_value=0.0)
"""
@check_parameter_init
def __init__(self, name, disabled, value, hit=False, actual_value=0.0):
self.instance = cds.parameter(name, disabled, value, hit, actual_value)
@property
def name(self):
"""
Function to receive Parameter name.
Returns:
name of Parameter instance (str).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> parameter = dbg_services.Parameter(name="param",
... disabled=False,
... value=0.0,
... hit=False,
... actual_value=0.0)
>>> name = parameter.name
"""
return self.instance.get_name()
@property
def disabled(self):
"""
Function to receive Parameter disabled value.
Returns:
disabled of Parameter instance (bool).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> parameter = dbg_services.Parameter(name="param",
... disabled=False,
... value=0.0,
... hit=False,
... actual_value=0.0)
>>> disabled = parameter.disabled
"""
return self.instance.get_disabled()
@property
def value(self):
"""
Function to receive Parameter value.
Returns:
value of Parameter instance (float).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> parameter = dbg_services.Parameter(name="param",
... disabled=False,
... value=0.0,
... hit=False,
... actual_value=0.0)
>>> value = parameter.value
"""
return self.instance.get_value()
@property
def hit(self):
"""
Function to receive Parameter hit value.
Returns:
hit of Parameter instance (bool).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> parameter = dbg_services.Parameter(name="param",
... disabled=False,
... value=0.0,
... hit=False,
... actual_value=0.0)
>>> hit = parameter.hit
"""
return self.instance.get_hit()
@property
def actual_value(self):
"""
Function to receive Parameter actual_value.
Returns:
actual_value of Parameter instance (float).
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
>>> parameter = dbg_services.Parameter(name="param",
... disabled=False,
... value=0.0,
... hit=False,
... actual_value=0.0)
>>> actual_value = parameter.actual_value
"""
return self.instance.get_actual_value()
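# A minimal usage sketch (illustrative addition, not part of the original
# module): round-tripping values through the wrapper classes above. Assumes
# the compiled `cds` bindings (_mindspore_offline_debug) are available.
if __name__ == "__main__":
    example_param = Parameter(name="max_gt", disabled=False, value=0.5)
    # Each property reads back from the underlying C++ instance.
    print(example_param.name, example_param.disabled, example_param.value,
          example_param.hit, example_param.actual_value)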

View File

@ -0,0 +1,123 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
General Validator Helper Functions.
"""
import os
import inspect
UINT32_MAX = 4294967295
UINT32_MIN = 0
UINT64_MAX = 18446744073709551615
UINT64_MIN = 0
def pad_arg_name(arg_name):
if arg_name != "":
arg_name = arg_name + " "
return arg_name
def check_value(arg, valid_range, arg_name=""):
arg_name = pad_arg_name(arg_name)
if arg < valid_range[0] or arg > valid_range[1]:
raise ValueError(
"Input {0}is not within the required interval of ({1} to {2}).".format(arg_name,
valid_range[0], valid_range[1]))
def check_uint32(arg, arg_name=""):
type_check(arg, (int,), arg_name)
check_value(arg, [UINT32_MIN, UINT32_MAX])
def check_uint64(arg, arg_name=""):
type_check(arg, (int,), arg_name)
check_value(arg, [UINT64_MIN, UINT64_MAX])
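# Illustrative sketch (not part of the original module): boundary behaviour of
# the uint checks. Note that type_check below special-cases bool, so booleans
# are rejected for uint arguments even though bool subclasses int.
def _example_uint32_checks():
    check_uint32(0, "iteration")           # lower bound passes
    check_uint32(UINT32_MAX, "iteration")  # upper bound passes
    try:
        check_uint32(UINT32_MAX + 1, "iteration")
    except ValueError as err:
        print(err)                         # value out of range
    try:
        check_uint32(True, "iteration")
    except TypeError as err:
        print(err)                         # bool rejected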
def check_dir(dataset_dir):
if not os.path.isdir(dataset_dir) or not os.access(dataset_dir, os.R_OK):
raise ValueError("The folder {} does not exist or permission denied!".format(dataset_dir))
def parse_user_args(method, *args, **kwargs):
"""
Parse user arguments in a function.
Args:
method (method): a callable function.
args: user passed args.
kwargs: user passed kwargs.
Returns:
user_filled_args (list): values the user passed in for each parameter, in signature order.
ba.arguments (OrderedDict): mapping of parameter names to the arguments the user passed.
"""
sig = inspect.signature(method)
if 'self' in sig.parameters or 'cls' in sig.parameters:
ba = sig.bind(method, *args, **kwargs)
ba.apply_defaults()
params = list(sig.parameters.keys())[1:]
else:
ba = sig.bind(*args, **kwargs)
ba.apply_defaults()
params = list(sig.parameters.keys())
user_filled_args = [ba.arguments.get(arg_value) for arg_value in params]
return user_filled_args, ba.arguments
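# Sketch with a hypothetical target function: parse_user_args applies the
# signature's defaults and skips self/cls, so only user-facing parameters
# come back, in declaration order.
def _example_parse_user_args():
    def target(net_name, is_sync_mode=True):
        pass
    user_args, bound = parse_user_args(target, "LeNet")
    assert user_args == ["LeNet", True]
    assert list(bound.keys()) == ["net_name", "is_sync_mode"]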
def type_check(arg, types, arg_name):
"""
Check the type of the parameter.
Args:
arg (Any) : any variable.
types (tuple): tuple of all valid types for arg.
arg_name (str): the name of arg.
Raises:
TypeError: if the type is not correct; otherwise nothing is raised.
"""
# handle special case of booleans being a subclass of ints
print_value = '\"\"' if repr(arg) == repr('') else arg
if int in types and bool not in types:
if isinstance(arg, bool):
raise TypeError("Argument {0} with value {1} is not of type {2}.".format(arg_name, print_value, types))
if not isinstance(arg, types):
raise TypeError("Argument {0} with value {1} is not of type {2}.".format(arg_name, print_value, types))
def type_check_list(args, types, arg_names):
"""
Check the type of each parameter in the list.
Args:
args (Union[list, tuple]): a list or tuple of any variable.
types (tuple): tuple of all valid types for arg.
arg_names (Union[list, tuple of str]): the names of args.
Raises:
TypeError: if the type of any element is not correct; otherwise nothing is raised.
"""
type_check(args, (list, tuple,), arg_names)
if len(args) != len(arg_names) and not isinstance(arg_names, str):
raise ValueError("List of arguments is not the same length as argument_names.")
if isinstance(arg_names, str):
arg_names = ["{0}[{1}]".format(arg_names, i) for i in range(len(args))]
for arg, arg_name in zip(args, arg_names):
type_check(arg, types, arg_name)
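# Sketch: when arg_names is a single string, it fans out to indexed names so
# the error message points at the offending element (here "shape[1]").
def _example_type_check_list():
    type_check_list([2, 2], (int,), "shape")  # passes
    try:
        type_check_list([2, "2"], (int,), "shape")
    except TypeError as err:
        print(err)                            # reports shape[1]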

View File

@ -0,0 +1,231 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Validator Functions for Offline Debugger APIs.
"""
from functools import wraps
import mindspore.offline_debug.dbg_services as cds
from mindspore.offline_debug.mi_validator_helpers import parse_user_args, type_check, type_check_list, check_dir, check_uint32, check_uint64
def check_init(method):
"""Wrapper method to check the parameters of DbgServices init."""
@wraps(method)
def new_method(self, *args, **kwargs):
[dump_file_path, verbose], _ = parse_user_args(method, *args, **kwargs)
type_check(dump_file_path, (str,), "dump_file_path")
type_check(verbose, (bool,), "verbose")
check_dir(dump_file_path)
return method(self, *args, **kwargs)
return new_method
def check_initialize(method):
"""Wrapper method to check the parameters of DbgServices Initialize method."""
@wraps(method)
def new_method(self, *args, **kwargs):
[net_name, is_sync_mode], _ = parse_user_args(method, *args, **kwargs)
type_check(net_name, (str,), "net_name")
type_check(is_sync_mode, (bool,), "is_sync_mode")
return method(self, *args, **kwargs)
return new_method
def check_add_watchpoint(method):
"""Wrapper method to check the parameters of DbgServices AddWatchpoint."""
@wraps(method)
def new_method(self, *args, **kwargs):
[id_value, watch_condition, check_node_list, parameter_list], _ = parse_user_args(method, *args, **kwargs)
check_uint32(id_value, "id")
check_uint32(watch_condition, "watch_condition")
type_check(check_node_list, (dict,), "check_node_list")
for node_name, node_info in check_node_list.items():
type_check(node_name, (str,), "node_name")
type_check(node_info, (dict,), "node_info")
for info_name, info_param in node_info.items():
type_check(info_name, (str,), "node parameter name")
if info_name in ["device_id"]:
if isinstance(info_param, str):
if info_param not in ["*"]:
raise ValueError("Node parameter {} only accepts '*' as string.".format(info_name))
else:
for param in info_param:
check_uint32(param, "device_id")
elif info_name in ["root_graph_id"]:
if isinstance(info_param, str):
if info_param not in ["*"]:
raise ValueError("Node parameter {} only accepts '*' as string.".format(info_name))
else:
for param in info_param:
check_uint32(param, "root_graph_id")
elif info_name in ["is_parameter"]:
type_check(info_param, (bool,), "is_parameter")
else:
raise ValueError("Node parameter {} is not defined.".format(info_name))
param_names = ["param_{0}".format(i) for i in range(len(parameter_list))]
type_check_list(parameter_list, (cds.Parameter,), param_names)
return method(self, *args, **kwargs)
return new_method
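# An example check_node_list that satisfies the validation above (the node
# name is illustrative): device_id and root_graph_id accept either the string
# "*" or an iterable of uint32 values, and is_parameter must be a bool.
_EXAMPLE_CHECK_NODE_LIST = {
    "Default/network/conv1/Conv2D-op1": {
        "device_id": [0],
        "root_graph_id": "*",
        "is_parameter": False,
    },
}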
def check_remove_watchpoint(method):
"""Wrapper method to check the parameters of DbgServices RemoveWatchpoint."""
@wraps(method)
def new_method(self, *args, **kwargs):
[id_value], _ = parse_user_args(method, *args, **kwargs)
check_uint32(id_value, "id")
return method(self, *args, **kwargs)
return new_method
def check_check_watchpoints(method):
"""Wrapper method to check the parameters of DbgServices CheckWatchpoint."""
@wraps(method)
def new_method(self, *args, **kwargs):
[iteration], _ = parse_user_args(method, *args, **kwargs)
check_uint32(iteration, "iteration")
return method(self, *args, **kwargs)
return new_method
def check_read_tensors(method):
"""Wrapper method to check the parameters of DbgServices ReadTensors."""
@wraps(method)
def new_method(self, *args, **kwargs):
[info_list], _ = parse_user_args(method, *args, **kwargs)
info_names = ["info_{0}".format(i) for i in range(len(info_list))]
type_check_list(info_list, (cds.TensorInfo,), info_names)
return method(self, *args, **kwargs)
return new_method
def check_initialize_done(method):
"""Wrapper method to check if initlize is done for DbgServices."""
@wraps(method)
def new_method(self, *args, **kwargs):
if not self.initialized:
raise RuntimeError("Inilize should be called before any other methods of DbgServices!")
return method(self, *args, **kwargs)
return new_method
def check_tensor_info_init(method):
"""Wrapper method to check the parameters of DbgServices TensorInfo init."""
@wraps(method)
def new_method(self, *args, **kwargs):
[node_name, slot, iteration, device_id, root_graph_id,
is_parameter], _ = parse_user_args(method, *args, **kwargs)
type_check(node_name, (str,), "node_name")
check_uint32(slot, "slot")
check_uint32(iteration, "iteration")
check_uint32(device_id, "device_id")
check_uint32(root_graph_id, "root_graph_id")
type_check(is_parameter, (bool,), "is_parameter")
return method(self, *args, **kwargs)
return new_method
def check_tensor_data_init(method):
"""Wrapper method to check the parameters of DbgServices TensorData init."""
@wraps(method)
def new_method(self, *args, **kwargs):
[data_ptr, data_size, dtype, shape], _ = parse_user_args(method, *args, **kwargs)
type_check(data_ptr, (bytes,), "data_ptr")
check_uint64(data_size, "data_size")
type_check(dtype, (int,), "dtype")
shape_names = ["shape_{0}".format(i) for i in range(len(shape))]
type_check_list(shape, (int,), shape_names)
if len(data_ptr) != data_size:
raise ValueError("data_ptr length ({0}) is not equal to data_size ({1}).".format(len(data_ptr), data_size))
return method(self, *args, **kwargs)
return new_method
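# Design note: the length check above ties the raw buffer to its declared
# size, so a TensorData such as data_ptr=b'\xba\xd0\xba\xd0' must declare
# data_size=4; any mismatch raises ValueError before the C++ object is built.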
def check_watchpoint_hit_init(method):
"""Wrapper method to check the parameters of DbgServices WatchpointHit init."""
@wraps(method)
def new_method(self, *args, **kwargs):
[name, slot, condition, watchpoint_id,
parameters, error_code, device_id, root_graph_id], _ = parse_user_args(method, *args, **kwargs)
type_check(name, (str,), "name")
check_uint32(slot, "slot")
type_check(condition, (int,), "condition")
check_uint32(watchpoint_id, "watchpoint_id")
param_names = ["param_{0}".format(i) for i in range(len(parameters))]
type_check_list(parameters, (cds.Parameter,), param_names)
type_check(error_code, (int,), "error_code")
check_uint32(device_id, "device_id")
check_uint32(root_graph_id, "root_graph_id")
return method(self, *args, **kwargs)
return new_method
def check_parameter_init(method):
"""Wrapper method to check the parameters of DbgServices Parameter init."""
@wraps(method)
def new_method(self, *args, **kwargs):
[name, disabled, value, hit, actual_value], _ = parse_user_args(method, *args, **kwargs)
type_check(name, (str,), "name")
type_check(disabled, (bool,), "disabled")
type_check(value, (float,), "value")
type_check(hit, (bool,), "hit")
type_check(actual_value, (float,), "actual_value")
return method(self, *args, **kwargs)
return new_method
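# Minimal sketch with a hypothetical class: each decorator validates the
# arguments before the wrapped method body runs, so invalid input never
# reaches the underlying C++ layer.
class _ExampleParameterHolder:
    @check_parameter_init
    def __init__(self, name, disabled, value, hit=False, actual_value=0.0):
        self.value = value

# _ExampleParameterHolder("p", False, 0.5)     # passes validation
# _ExampleParameterHolder("p", False, "0.5")   # raises TypeError for value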