bugfix: duplicate wp hit results in offline dbg

This commit is contained in:
TinaMengtingZhang 2022-01-28 15:09:43 -05:00
parent dab15fb156
commit 23eb886d11
2 changed files with 46 additions and 10 deletions

View File

@ -891,7 +891,7 @@ void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &f
if (last_slash_pos != std::string::npos) {
file_n = file_to_find.substr(last_slash_pos + 1);
}
if (candidate.find(file_n) != std::string::npos && candidate.rfind(kNpyExt) != std::string::npos) {
if (candidate.find(file_n + ".") != std::string::npos && candidate.rfind(kNpyExt) != std::string::npos) {
// we found a converted file for this op
std::string found_file = dump_key + "/" + candidate;
result_list->insert(found_file);
@ -1044,8 +1044,8 @@ void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::str
if (delim != std::string::npos) {
file_name_to_check = file_name.substr(delim + 1);
}
std::size_t found = file_name_to_check.find(dump_name);
std::size_t found_out = file_name_to_check.find(output_str);
std::size_t found = file_name_to_check.find("." + dump_name + ".");
std::size_t found_out = file_name_to_check.find(output_str, found + dump_name.length());
std::size_t found_dot_start = file_name_to_check.find(".", found_out);
std::size_t found_dot_end = file_name_to_check.find(".", found_dot_start);
@ -1518,8 +1518,8 @@ void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir,
file_name_to_check = file_path.substr(delim + 1);
}
if (file_path.find(specific_dump_dir) != std::string::npos &&
file_name_to_check.find(prefix_dump_to_check) != std::string::npos &&
file_name_to_check.find(slot_string_to_check) != std::string::npos) {
file_name_to_check.find("." + prefix_dump_to_check + ".") != std::string::npos &&
file_name_to_check.find(slot_string_to_check + ".") != std::string::npos) {
matched_paths.push_back(file_path);
found = true;
}
@ -1663,7 +1663,7 @@ void DebugServices::ProcessTensorDataSync(const std::vector<std::tuple<std::stri
if (stripped_file_name.empty() || stripped_file_name.length() <= dump_name.length()) {
continue;
}
std::size_t found = stripped_file_name.rfind(dump_name, 0);
std::size_t found = stripped_file_name.rfind(dump_name + ".", 0);
if (found == 0) {
size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1));
std::vector<int64_t> shape;

View File

@ -1,4 +1,4 @@
# Copyright 2021 Huawei Technologies Co., Ltd
# Copyright 2021-2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -66,9 +66,23 @@ class TestOfflineWatchpoints:
info4 = d.TensorInfo(node_name="Default/CudnnUniformReal-op391",
slot=0, iteration=2, rank_id=0, root_graph_id=0, is_output=False)
tensor_info = [info1, info2, info3, info4]
tensor_name = [name1, name2, name2, name3]
tensor_list = [tensor1, tensor2, tensor3, tensor4]
name4 = "Cast.Cast-op4.0.0.1"
tensor_all_zero = np.array([[[0, 0, 0],
[0, 0, 0],
[0, 0, 0]]], np.float32)
info5 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/Cast-op4",
slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
name5 = "Cast.Cast-op40.0.0.1"
tensor_all_one = np.array([[[1, 1, 1],
[1, 1, 1],
[1, 1, 1]]], np.float32)
info6 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/Cast-op40",
slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
tensor_info = [info1, info2, info3, info4, info5, info6]
tensor_name = [name1, name2, name2, name3, name4, name5]
tensor_list = [tensor1, tensor2, tensor3, tensor4, tensor_all_zero, tensor_all_one]
cls.temp_dir = build_dump_structure(tensor_name, tensor_list, "Test", tensor_info)
@classmethod
@ -181,6 +195,28 @@ class TestOfflineWatchpoints:
assert not watchpoint_hits_test
_ = debugger_backend.remove_watchpoint(watchpoint_id=2)
@security_off_wrap
def test_async_watchpoints_no_duplicate_wp_hit(self):
    """
    Feature: Offline Debugger CheckWatchpoint.
    Description: Check watchpoints on two ops whose names share a prefix (Cast-op4 and Cast-op40).
    Expectation: Exactly one watchpoint hit is returned, i.e. no duplicated hit entries.
    """
    # Backend over the dump directory, initialized in async (non-sync) mode.
    backend = d.DbgServices(dump_file_path=self.temp_dir)
    _ = backend.initialize(net_name="Test", is_sync_mode=False)

    threshold = d.Parameter(name="max_gt", disabled=False, value=0.0)

    # Both nodes are watched with the same filter; a fresh dict is built per node.
    watched_nodes = (
        "Default/network-WithLossCell/_backbone-AlexNet/Cast-op4",
        "Default/network-WithLossCell/_backbone-AlexNet/Cast-op40",
    )
    nodes_to_check = {
        node: {"rank_id": [0], "root_graph_id": [0], "is_output": True}
        for node in watched_nodes
    }
    backend.add_watchpoint(watchpoint_id=3, watch_condition=3,
                           check_node_list=nodes_to_check,
                           parameter_list=[threshold])

    # Only a single hit is expected for iteration 0.
    hits = backend.check_watchpoints(iteration=0)
    assert len(hits) == 1
def compare_expect_actual_result(self, watchpoint_hits_list, test_index):
"""Compare actual result with golden file."""
golden_file = os.path.realpath(os.path.join("../data/dump/gpu_dumps/golden/",