From c737dca9abaf46fce41f5ddece11f82053971e38 Mon Sep 17 00:00:00 2001 From: gaojing Date: Fri, 24 Sep 2021 00:27:24 +0800 Subject: [PATCH] reenable st offline debugger test --- tests/st/debugger/dump_test_utils.py | 31 ++- .../golden/async_read_tensors_expected.json | 253 ++++++++++++++++++ ...async_sink_mode_true_read_tensors.expected | 28 -- .../async_sink_mode_true_watchpoints.expected | 14 - .../golden/async_watchpoints_expected.json | 55 ++++ .../golden/sync_read_tensors_expected.json | 253 ++++++++++++++++++ .../sync_trans_false_read_tensors.expected | 73 ----- .../sync_trans_false_watchpoints.expected | 33 --- .../sync_trans_true_read_tensors.expected | 98 ------- .../golden/sync_watchpoints_expected.json | 55 ++++ .../test_async_sink_mode_true_read_tensors.py | 86 ------ .../test_async_sink_mode_true_watchpoints.py | 109 -------- tests/st/debugger/test_read_tensors.py | 159 +++++++++++ .../test_sync_trans_false_read_tensors.py | 89 ------ .../test_sync_trans_false_watchpoints.py | 128 --------- .../test_sync_trans_true_read_tensor.py | 89 ------ tests/st/debugger/test_watchpoints.py | 204 ++++++++++++++ 17 files changed, 1001 insertions(+), 756 deletions(-) create mode 100644 tests/st/debugger/golden/async_read_tensors_expected.json delete mode 100644 tests/st/debugger/golden/async_sink_mode_true_read_tensors.expected delete mode 100644 tests/st/debugger/golden/async_sink_mode_true_watchpoints.expected create mode 100644 tests/st/debugger/golden/async_watchpoints_expected.json create mode 100644 tests/st/debugger/golden/sync_read_tensors_expected.json delete mode 100644 tests/st/debugger/golden/sync_trans_false_read_tensors.expected delete mode 100644 tests/st/debugger/golden/sync_trans_false_watchpoints.expected delete mode 100644 tests/st/debugger/golden/sync_trans_true_read_tensors.expected create mode 100644 tests/st/debugger/golden/sync_watchpoints_expected.json delete mode 100644 tests/st/debugger/test_async_sink_mode_true_read_tensors.py delete mode 100644 tests/st/debugger/test_async_sink_mode_true_watchpoints.py create mode 100644 tests/st/debugger/test_read_tensors.py delete mode 100644 tests/st/debugger/test_sync_trans_false_read_tensors.py delete mode 100644 tests/st/debugger/test_sync_trans_false_watchpoints.py delete mode 100644 tests/st/debugger/test_sync_trans_true_read_tensor.py create mode 100644 tests/st/debugger/test_watchpoints.py diff --git a/tests/st/debugger/dump_test_utils.py b/tests/st/debugger/dump_test_utils.py index 73fbb712e93..77a41113fde 100644 --- a/tests/st/debugger/dump_test_utils.py +++ b/tests/st/debugger/dump_test_utils.py @@ -16,15 +16,28 @@ Utils for testing offline debugger. """ -import filecmp import os +import tempfile +import numpy as np -def compare_actual_with_expected(test_name): - """Compare actual file with expected.""" - pwd = os.getcwd() - is_eq = filecmp.cmp(pwd + "/golden/" + - test_name + ".expected", test_name + ".actual", shallow=False) - if os.path.exists(test_name + ".actual"): - os.remove(test_name + ".actual") - return is_eq +def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_info_list): + """Build dump file structure from tensor_list.""" + temp_dir = tempfile.mkdtemp(prefix=net_name, dir=path) + for tensor_name, tensor, tensor_info in zip(tensor_name_list, tensor_list, tensor_info_list): + slot = str(tensor_info.slot) + iteration = str(tensor_info.iteration) + rank_id = str(tensor_info.rank_id) + root_graph_id = str(tensor_info.root_graph_id) + is_output = str(tensor_info.is_output) + path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration) + os.makedirs(path, exist_ok=True) + if is_output == "True": + file = tempfile.mkstemp(prefix=tensor_name, suffix=".output." + slot + + ".DefaultFormat.npy", dir=path) + else: + file = tempfile.mkstemp(prefix=tensor_name, suffix=".input." + slot + + ".DefaultFormat.npy", dir=path) + full_path = file[1] + np.save(full_path, tensor) + return temp_dir diff --git a/tests/st/debugger/golden/async_read_tensors_expected.json b/tests/st/debugger/golden/async_read_tensors_expected.json new file mode 100644 index 00000000000..1af0f0b2590 --- /dev/null +++ b/tests/st/debugger/golden/async_read_tensors_expected.json @@ -0,0 +1,253 @@ +[ + { + "tensor_1": { + "tensor_info": { + "node_name": "Default/CudnnUniformReal-op391", + "slot": 0, + "iteration": 0, + "rank_id": 0, + "root_graph_id": 0, + "is_output": false + }, + "tensor_data": { + "data": [ + 0, + 0, + 0, + 66, + 0, + 0, + 128, + 69 + ], + "size_in_bytes": 8, + "debugger_dtype": 11, + "shape": [ + 2 + ] + } + } + }, + { + "tensor_2": { + "tensor_info": { + "node_name": "Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406", + "slot": 1, + "iteration": 1, + "rank_id": 0, + "root_graph_id": 0, + "is_output": false + }, + "tensor_data": { + "data": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 66, + 0, + 0, + 128, + 69, + 0, + 0, + 144, + 64, + 195, + 245, + 216, + 64, + 0, + 0, + 48, + 193 + ], + "size_in_bytes": 24, + "debugger_dtype": 11, + "shape": [ + 2, + 3 + ] + } + } + }, + { + "tensor_3": { + "tensor_info": { + "node_name": "Gradients/Default/network-WithLossCell/_backbone-AlexNet/conv5-Conv2d/gradConv2D/Conv2DBackpropFilter-op424", + "slot": 0, + "iteration": 1, + "rank_id": 0, + "root_graph_id": 0, + "is_output": true + }, + "tensor_data": { + "data": [ + 8, + 255, + 166, + 56, + 189, + 58, + 71, + 56, + 103, + 3, + 217, + 55, + 170, + 225, + 174, + 56, + 135, + 195, + 82, + 56, + 54, + 253, + 225, + 55, + 254, + 158, + 179, + 56, + 33, + 66, + 88, + 56, + 30, + 248, + 222, + 55, + 241, + 32, + 168, + 56, + 143, + 126, + 73, + 56, + 116, + 129, + 228, + 55, + 53, + 254, + 175, + 56, + 2, + 0, + 87, + 56, + 246, + 124, + 238, + 55, + 177, + 160, + 180, + 56, + 156, + 126, + 92, + 56, + 144, + 121, + 236, + 55, + 117, + 189, + 159, + 56, + 25, + 132, + 32, + 56, + 154, + 1, + 178, + 54, + 187, + 189, + 156, + 56, + 117, + 252, + 27, + 56, + 205, + 2, + 76, + 54, + 212, + 127, + 148, + 56, + 129, + 1, + 12, + 56, + 53, + 253, + 11, + 182 + ], + "size_in_bytes": 108, + "debugger_dtype": 11, + "shape": [ + 3, + 3, + 3 + ] + } + } + }, + { + "tensor_4": { + "tensor_info": { + "node_name": "Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op381", + "slot": 1, + "iteration": 0, + "rank_id": 0, + "root_graph_id": 0, + "is_output": true + }, + "tensor_data": { + "data": [ + 104, + 60, + 33, + 79, + 53, + 6, + 131, + 78, + 78, + 232, + 126, + 79, + 154, + 198, + 85, + 79, + 245, + 52, + 84, + 78, + 70, + 207, + 222, + 78 + ], + "size_in_bytes": 24, + "debugger_dtype": 11, + "shape": [ + 6 + ] + } + } + } +] \ No newline at end of file diff --git a/tests/st/debugger/golden/async_sink_mode_true_read_tensors.expected b/tests/st/debugger/golden/async_sink_mode_true_read_tensors.expected deleted file mode 100644 index 4c8a206cc5f..00000000000 --- a/tests/st/debugger/golden/async_sink_mode_true_read_tensors.expected +++ /dev/null @@ -1,28 +0,0 @@ ------------------------------------------------------------ -tensor_info_1 attributes: -node name = Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op169 -slot = 0 -iteration = 2 -device_id = None -root_graph_id = 1 -is_parameter = False - -tensor_data_1 attributes: -data (printed in uint8) = [149 167 122 ... 160 212 164] -size in bytes = 2076672 -debugger dtype = 10 -shape = [32, 12, 13, 13, 16] ------------------------------------------------------------ -tensor_info_2 attributes: -node name = Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/ReLUV2-op348 -slot = 1 -iteration = 2 -device_id = None -root_graph_id = 1 -is_parameter = False - -tensor_data_2 attributes: -data (printed in uint8) = [ 20 21 18 ... 126 98 25] -size in bytes = 129792 -debugger dtype = 6 -shape = [32, 12, 13, 13, 2] diff --git a/tests/st/debugger/golden/async_sink_mode_true_watchpoints.expected b/tests/st/debugger/golden/async_sink_mode_true_watchpoints.expected deleted file mode 100644 index 4e6f066f5ef..00000000000 --- a/tests/st/debugger/golden/async_sink_mode_true_watchpoints.expected +++ /dev/null @@ -1,14 +0,0 @@ ------------------------------------------------------------ -watchpoint_hit for test_1 attributes: -name = Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op169 -slot = 0 -condition = 6 -watchpoint_id = 1 -parameter 0 name = param -parameter 0 disabled = False -parameter 0 value = 0.0 -parameter 0 hit = True -parameter 0 actual_value = -0.1417236328125 -error code = 0 -device_id = 0 -root_graph_id = 1 diff --git a/tests/st/debugger/golden/async_watchpoints_expected.json b/tests/st/debugger/golden/async_watchpoints_expected.json new file mode 100644 index 00000000000..49f57b4b204 --- /dev/null +++ b/tests/st/debugger/golden/async_watchpoints_expected.json @@ -0,0 +1,55 @@ +[ + { + "watchpoint_hit1": { + "name": "Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/Conv2D-op369", + "slot": 1, + "condition": 6, + "watchpoint_id": 1, + "parameter": [ + { + "parameter0": { + "name": "param", + "disabled": false, + "value": 0.0, + "hit": true, + "actual_value": -0.020966000854969025 + } + } + ], + "error_code": 0, + "rank_id": 0, + "root_graph_id": 0 + } + }, + { + "watchpoint_hit2": { + "name": "Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc2.bias", + "slot": 0, + "condition": 18, + "watchpoint_id": 3, + "parameter": [ + { + "parameter0": { + "name": "abs_mean_update_ratio_gt", + "disabled": false, + "value": 0.0, + "hit": true, + "actual_value": 1.0156775705209766 + } + }, + { + "parameter1": { + "name": "epsilon", + "disabled": true, + "value": 0.0, + "hit": false, + "actual_value": 0.0 + } + } + ], + "error_code": 0, + "rank_id": 0, + "root_graph_id": 0 + } + } +] \ No newline at end of file diff --git a/tests/st/debugger/golden/sync_read_tensors_expected.json b/tests/st/debugger/golden/sync_read_tensors_expected.json new file mode 100644 index 00000000000..1af0f0b2590 --- /dev/null +++ b/tests/st/debugger/golden/sync_read_tensors_expected.json @@ -0,0 +1,253 @@ +[ + { + "tensor_1": { + "tensor_info": { + "node_name": "Default/CudnnUniformReal-op391", + "slot": 0, + "iteration": 0, + "rank_id": 0, + "root_graph_id": 0, + "is_output": false + }, + "tensor_data": { + "data": [ + 0, + 0, + 0, + 66, + 0, + 0, + 128, + 69 + ], + "size_in_bytes": 8, + "debugger_dtype": 11, + "shape": [ + 2 + ] + } + } + }, + { + "tensor_2": { + "tensor_info": { + "node_name": "Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406", + "slot": 1, + "iteration": 1, + "rank_id": 0, + "root_graph_id": 0, + "is_output": false + }, + "tensor_data": { + "data": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 66, + 0, + 0, + 128, + 69, + 0, + 0, + 144, + 64, + 195, + 245, + 216, + 64, + 0, + 0, + 48, + 193 + ], + "size_in_bytes": 24, + "debugger_dtype": 11, + "shape": [ + 2, + 3 + ] + } + } + }, + { + "tensor_3": { + "tensor_info": { + "node_name": "Gradients/Default/network-WithLossCell/_backbone-AlexNet/conv5-Conv2d/gradConv2D/Conv2DBackpropFilter-op424", + "slot": 0, + "iteration": 1, + "rank_id": 0, + "root_graph_id": 0, + "is_output": true + }, + "tensor_data": { + "data": [ + 8, + 255, + 166, + 56, + 189, + 58, + 71, + 56, + 103, + 3, + 217, + 55, + 170, + 225, + 174, + 56, + 135, + 195, + 82, + 56, + 54, + 253, + 225, + 55, + 254, + 158, + 179, + 56, + 33, + 66, + 88, + 56, + 30, + 248, + 222, + 55, + 241, + 32, + 168, + 56, + 143, + 126, + 73, + 56, + 116, + 129, + 228, + 55, + 53, + 254, + 175, + 56, + 2, + 0, + 87, + 56, + 246, + 124, + 238, + 55, + 177, + 160, + 180, + 56, + 156, + 126, + 92, + 56, + 144, + 121, + 236, + 55, + 117, + 189, + 159, + 56, + 25, + 132, + 32, + 56, + 154, + 1, + 178, + 54, + 187, + 189, + 156, + 56, + 117, + 252, + 27, + 56, + 205, + 2, + 76, + 54, + 212, + 127, + 148, + 56, + 129, + 1, + 12, + 56, + 53, + 253, + 11, + 182 + ], + "size_in_bytes": 108, + "debugger_dtype": 11, + "shape": [ + 3, + 3, + 3 + ] + } + } + }, + { + "tensor_4": { + "tensor_info": { + "node_name": "Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op381", + "slot": 1, + "iteration": 0, + "rank_id": 0, + "root_graph_id": 0, + "is_output": true + }, + "tensor_data": { + "data": [ + 104, + 60, + 33, + 79, + 53, + 6, + 131, + 78, + 78, + 232, + 126, + 79, + 154, + 198, + 85, + 79, + 245, + 52, + 84, + 78, + 70, + 207, + 222, + 78 + ], + "size_in_bytes": 24, + "debugger_dtype": 11, + "shape": [ + 6 + ] + } + } + } +] \ No newline at end of file diff --git a/tests/st/debugger/golden/sync_trans_false_read_tensors.expected b/tests/st/debugger/golden/sync_trans_false_read_tensors.expected deleted file mode 100644 index 246bb535e71..00000000000 --- a/tests/st/debugger/golden/sync_trans_false_read_tensors.expected +++ /dev/null @@ -1,73 +0,0 @@ ------------------------------------------------------------ -tensor_info_1 attributes: -node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias -slot = 0 -iteration = 2 -device_id = None -root_graph_id = 0 -is_parameter = True - -tensor_data_1 attributes: -data (printed in uint8) = [170 19 44 181 254 212 16 52 52 162 148 180 130 115 226 180 183 243 - 101 52 224 79 189 51 10 70 69 51 199 75 159 52 79 98 104 52 - 106 77 19 52 129 183 8 180 252 58 48 180 35 219 9 52 240 201 - 179 51 142 151 158 51 210 145 182 53 140 219 0 53 140 219 22 181 - 46 33 87 180 238 90 122 180 166 10 38 179 202 195 4 53 166 10 - 150 51 214 120 209 52 235 115 37 180 92 177 215 180 0 136 84 51 - 72 114 145 180 43 169 255 180 114 27 61 52 76 225 122 50 126 72 - 159 51 58 35 202 51 114 61 106 51 60 223 63 52 209 179 1 52 - 232 217 44 178 130 158 109 179 213 231 10 179 37 40 94 179 208 68 - 64 53 6 52 249 52 162 35 1 181 231 29 155 52 30 201 69 180 - 229 131 126 51 18 165 109 180 164 112 163 181 116 172 11 178 6 129 - 37 52 54 205 203 180 115 104 145 52 232 106 219 179 36 40 214 52 - 202 50 204 52 76 89 38 179 230 140 232 178 168 53 77 52 180 191 - 108 51 128 183 64 51 56 137 161 180 247 6 143 180 126 63 197 180 - 198 177 94 52 140 185 139 51 150 178 228 180 255 67 150 52 134 201 - 164 52 107 43 14 53 174 216 63 179 40 160 41 53 120 88 72 179 - 218 172 234 52 234 38 25 52 85 159 155 180 254 67 138 180 34 253 - 118 180 218 61 17 52 242 133 253 52 175 37 180 52 171 62 163 52 - 202 195 86 53 160 171 45 52 34 31 176 180 156 85 5 53 178 191 - 68 180 42 203 140 52 248 117 72 52 248 253 212 176 195 100 202 51 - 87 14 141 52 91 100 235 51 48 221 136 52 143 117 17 180 51 196 - 25 52 127 29 112 180 152 144 207 178 219 104 64 52 21 174 251 52 - 164 78 138 181 20 63 6 52 10 249 96 179 163 146 18 53 200 186 - 236 52 2 188 85 52 124 140 121 179 246 185 22 181 246 74 249 51 - 70 182 135 53 189 227 76 52 249 160 159 180 134 235 65 53 64 164 - 255 51 224 156 41 53 142 117 69 181 247 151 101 53 185 175 35 52 - 164 112 21 53 30 31 212 179 142 151 110 179 176 148 29 181 206 204 - 88 53 116 215 214 180 172 173 216 51 106 222 153 180 200 152 19 181 - 176 3 7 52 215 52 87 52] -size in bytes = 512 -debugger dtype = 11 -shape = [128] ------------------------------------------------------------ -tensor_info_2 attributes: -node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op168 -slot = 0 -iteration = 2 -device_id = None -root_graph_id = 0 -is_parameter = False - -tensor_data_2 attributes: -data (printed in uint8) = [181 167 46 26 122 155 141 164 212 39 111 27 247 156 1 152 189 36 - 15 161 254 167 82 163 33 42 101 158 225 161 24 167 103 140 45 42 - 178 170 173 29 48 42 39 32 56 25 216 170 128 41 216 23 153 154 - 39 173 193 42 84 160 111 22 61 144] -size in bytes = 64 -debugger dtype = 10 -shape = [2, 2, 2, 2, 2] ------------------------------------------------------------ -tensor_info_3 attributes: -node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op346 -slot = 1 -iteration = 2 -device_id = None -root_graph_id = 0 -is_parameter = False - -tensor_data_3 attributes: -data (printed in uint8) = [ 50 17 122 ... 94 42 90] -size in bytes = 129792 -debugger dtype = 6 -shape = [32, 12, 13, 13, 2] diff --git a/tests/st/debugger/golden/sync_trans_false_watchpoints.expected b/tests/st/debugger/golden/sync_trans_false_watchpoints.expected deleted file mode 100644 index 74be045cad3..00000000000 --- a/tests/st/debugger/golden/sync_trans_false_watchpoints.expected +++ /dev/null @@ -1,33 +0,0 @@ ------------------------------------------------------------ -watchpoint_hit for test_1 attributes: -name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op168 -slot = 0 -condition = 6 -watchpoint_id = 1 -parameter 0 name = param -parameter 0 disabled = False -parameter 0 value = 0.0 -parameter 0 hit = True -parameter 0 actual_value = -0.08050537109375 -error code = 0 -device_id = 0 -root_graph_id = 0 ------------------------------------------------------------ -watchpoint_hit for test_4 attributes: -name = Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc3.bias -slot = 0 -condition = 18 -watchpoint_id = 3 -parameter 0 name = abs_mean_update_ratio_gt -parameter 0 disabled = False -parameter 0 value = 0.0 -parameter 0 hit = True -parameter 0 actual_value = 0.5243796973599475 -parameter 1 name = epsilon -parameter 1 disabled = True -parameter 1 value = 0.0 -parameter 1 hit = False -parameter 1 actual_value = 0.0 -error code = 0 -device_id = 0 -root_graph_id = 0 diff --git a/tests/st/debugger/golden/sync_trans_true_read_tensors.expected b/tests/st/debugger/golden/sync_trans_true_read_tensors.expected deleted file mode 100644 index d4700336390..00000000000 --- a/tests/st/debugger/golden/sync_trans_true_read_tensors.expected +++ /dev/null @@ -1,98 +0,0 @@ ------------------------------------------------------------ -tensor_info_1 attributes: -node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias -slot = 0 -iteration = 2 -device_id = None -root_graph_id = 0 -is_parameter = True - -tensor_data_1 attributes: -data (printed in uint8) = [230 208 10 52 104 34 252 52 4 231 144 52 188 150 64 180 88 236 - 15 180 254 135 180 51 131 226 147 52 88 202 62 53 2 43 55 53 - 231 29 87 180 220 249 30 180 157 17 177 180 81 107 140 181 8 95 - 192 180 89 134 112 180 96 238 90 178 156 196 212 180 206 25 15 181 - 212 154 6 180 91 211 116 52 191 14 140 51 128 106 124 53 28 158 - 70 181 182 21 251 50 100 204 157 179 88 202 42 180 7 95 8 53 - 128 251 238 52 241 133 241 52 111 86 157 179 48 221 148 180 200 7 - 141 180 236 226 182 51 190 82 158 180 140 108 179 180 195 134 215 179 - 103 213 39 179 89 168 149 180 42 58 58 180 64 53 62 179 250 126 - 158 52 38 83 117 52 0 0 136 180 136 133 122 51 110 18 131 179 - 238 13 94 51 102 136 15 181 134 90 227 180 16 11 117 180 35 74 - 163 52 105 0 87 181 112 18 131 50 226 233 67 181 217 172 10 52 - 206 25 217 52 208 213 22 52 146 203 87 180 74 46 207 52 178 191 - 4 180 100 93 216 52 119 190 171 180 223 2 5 181 128 72 207 179 - 58 146 11 179 224 79 137 52 143 228 154 180 246 219 215 179 14 79 - 195 52 126 29 64 52 132 192 42 51 94 220 86 52 94 109 1 181 - 72 37 117 178 110 197 94 180 160 94 153 179 118 224 80 181 156 17 - 37 50 120 156 162 53 26 115 135 180 228 20 29 53 145 126 147 52 - 99 16 48 180 211 188 199 180 52 51 99 180 93 254 227 52 152 126 - 123 49 6 18 16 181 5 163 130 51 27 158 98 53 134 235 189 52 - 119 45 9 180 130 115 110 52 158 128 162 52 232 251 197 180 178 46 - 158 179 57 214 157 52 172 207 161 180 208 0 222 49 242 99 32 53 - 20 174 135 50 247 117 176 52 194 57 43 180 140 108 135 51 243 65 - 175 51 187 73 156 51 63 232 217 50 180 234 115 52 194 168 148 52 - 27 192 183 180 45 178 157 52 125 208 17 53 236 192 65 53 190 193 - 7 53 254 246 57 53 3 43 199 51 64 164 215 180 220 104 240 51 - 23 72 24 180 68 173 9 51 72 114 29 53 105 0 57 181 188 150 - 8 53 229 97 131 53 0 34 189 51 163 146 74 53 31 244 204 51 - 86 193 220 180 156 51 146 179] -size in bytes = 512 -debugger dtype = 11 -shape = [128] ------------------------------------------------------------ -tensor_info_2 attributes: -node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op171 -slot = 0 -iteration = 2 -device_id = None -root_graph_id = 0 -is_parameter = False - -tensor_data_2 attributes: -data (printed in uint8) = [ 99 26 69 41 190 38 128 38 232 38 16 39 5 39 24 39 1 39 - 218 38 219 38 43 39 241 33 21 165 159 32 15 145 191 28 66 30 - 110 30 149 31 14 29 179 29 249 28 94 29 141 156 210 36 143 166 - 201 162 5 165 54 166 100 165 57 165 81 165 25 166 150 165 236 164 - 20 164 238 165 170 20 200 168 16 168 36 169 9 169 195 168 64 168 - 248 168 10 169 20 168 56 167 137 167 124 168 221 152 35 168 163 167 - 110 169 147 168 198 167 52 168 91 168 14 168 30 168 240 167 171 168 - 235 168 37 161 222 165 16 161 88 164 68 162 156 152 109 151 181 156 - 0 152 84 158 112 154 193 161 13 162 172 28 38 163 16 31 255 26 - 102 21 64 31 177 28 102 156 77 20 62 25 177 26 26 22 241 24 - 188 33 149 160 67 36 171 35 38 36 68 34 148 19 54 162 53 161 - 174 156 195 134 139 24 210 35 175 36 206 158 136 37 88 36 31 36 - 78 20 203 159 6 165 235 163 83 162 7 157 76 31 240 35 38 37 - 20 160 193 38 130 29 95 23 177 161 143 162 46 165 103 164 106 163 - 167 162 36 158 130 161 149 33 171 157 138 37 252 27 198 164 116 166 - 60 165 36 165 47 165 150 166 188 166 112 167 58 166 33 140 141 163 - 93 32 38 159 13 168 194 166 78 166 8 166 201 165 115 166 128 166 - 77 166 29 166 131 157 150 31 46 32 124 164 239 166 219 165 96 166 - 216 166 21 167 28 167 35 167 237 165 202 164 57 32 75 26 208 40 - 148 40 205 40 162 40 187 40 181 40 181 40 155 40 124 40 129 40 - 157 40 186 29 253 32 138 44 226 43 43 43 237 42 164 42 137 42 - 174 42 179 42 160 42 104 42 30 42 53 38 140 25 240 44 120 44 - 236 42 19 43 143 42 6 42 181 41 83 42 0 43 112 42 97 41 - 27 32 177 32 254 44 105 43 242 40 239 40 71 41 223 40 237 40 - 93 41 22 41 211 40 227 40 187 20 71 30 4 44 188 40 79 36 - 133 38 62 39 209 38 15 38 83 38 136 38 146 38 100 37 118 152 - 185 149 165 42 99 41 61 36 241 37 34 38 170 38 62 38 69 39 - 215 39 128 39 49 38 54 33 141 161 184 41 34 40 100 36 230 37 - 133 38 57 37 224 35 7 37] -size in bytes = 512 -debugger dtype = 10 -shape = [4, 4, 4, 4] ------------------------------------------------------------ -tensor_info_3 attributes: -node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op353 -slot = 1 -iteration = 2 -device_id = None -root_graph_id = 0 -is_parameter = False - -tensor_data_3 attributes: -data (printed in uint8) = [19 17 27 ... 94 42 90] -size in bytes = 129792 -debugger dtype = 6 -shape = [32, 12, 13, 13, 2] diff --git a/tests/st/debugger/golden/sync_watchpoints_expected.json b/tests/st/debugger/golden/sync_watchpoints_expected.json new file mode 100644 index 00000000000..49f57b4b204 --- /dev/null +++ b/tests/st/debugger/golden/sync_watchpoints_expected.json @@ -0,0 +1,55 @@ +[ + { + "watchpoint_hit1": { + "name": "Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/Conv2D-op369", + "slot": 1, + "condition": 6, + "watchpoint_id": 1, + "parameter": [ + { + "parameter0": { + "name": "param", + "disabled": false, + "value": 0.0, + "hit": true, + "actual_value": -0.020966000854969025 + } + } + ], + "error_code": 0, + "rank_id": 0, + "root_graph_id": 0 + } + }, + { + "watchpoint_hit2": { + "name": "Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc2.bias", + "slot": 0, + "condition": 18, + "watchpoint_id": 3, + "parameter": [ + { + "parameter0": { + "name": "abs_mean_update_ratio_gt", + "disabled": false, + "value": 0.0, + "hit": true, + "actual_value": 1.0156775705209766 + } + }, + { + "parameter1": { + "name": "epsilon", + "disabled": true, + "value": 0.0, + "hit": false, + "actual_value": 0.0 + } + } + ], + "error_code": 0, + "rank_id": 0, + "root_graph_id": 0 + } + } +] \ No newline at end of file diff --git a/tests/st/debugger/test_async_sink_mode_true_read_tensors.py b/tests/st/debugger/test_async_sink_mode_true_read_tensors.py deleted file mode 100644 index b1d7abf30be..00000000000 --- a/tests/st/debugger/test_async_sink_mode_true_read_tensors.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -""" -Read tensor test script for offline debugger APIs. -""" - -import mindspore.offline_debug.dbg_services as d -import numpy as np -import pytest -from dump_test_utils import compare_actual_with_expected -from tests.security_utils import security_off_wrap - -GENERATE_GOLDEN = False -test_name = "async_sink_mode_true_read_tensors" - - -@pytest.mark.level0 -@pytest.mark.platform_arm_ascend_training -@pytest.mark.platform_x86_ascend_training -@pytest.mark.env_onecard -@pytest.mark.skip(reason="needs updating") -@security_off_wrap -def test_async_sink_mode_true_read_tensors(): - debugger_backend = d.DbgServices( - dump_file_path="/home/workspace/mindspore_dataset/dumps/async_sink_true/") - - _ = debugger_backend.initialize(net_name="alexnet", is_sync_mode=False) - - # output tensor with zero slot - info1 = d.TensorInfo(node_name="Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/" - "conv3-Conv2d/Conv2D-op169", - slot=0, iteration=2, device_id=0, root_graph_id=1, is_parameter=False) - # output tensor with non-zero slot - info2 = d.TensorInfo(node_name="Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/" - "ReLUV2-op348", - slot=1, iteration=2, device_id=0, root_graph_id=1, is_parameter=False) - - tensor_info = [info1, info2] - - tensor_data = debugger_backend.read_tensors(tensor_info) - - print_read_tensors(tensor_info, tensor_data) - if not GENERATE_GOLDEN: - assert compare_actual_with_expected(test_name) - - -def print_read_tensors(tensor_info, tensor_data): - """Print read tensors.""" - if GENERATE_GOLDEN: - f_write = open(test_name + ".expected", "w") - else: - f_write = open(test_name + ".actual", "w") - for x, _ in enumerate(tensor_info): - f_write.write("-----------------------------------------------------------\n") - f_write.write("tensor_info_" + str(x + 1) + " attributes:\n") - f_write.write("node name = " + tensor_info[x].node_name + "\n") - f_write.write("slot = " + str(tensor_info[x].slot) + "\n") - f_write.write("iteration = " + str(tensor_info[x].iteration) + "\n") - f_write.write("device_id = " + str(tensor_info[x].device_id) + "\n") - f_write.write("root_graph_id = " + str(tensor_info[x].root_graph_id) + "\n") - f_write.write("is_parameter = " + str(tensor_info[x].is_parameter) + "\n") - f_write.write("\n") - f_write.write("tensor_data_" + str(x + 1) + " attributes:\n") - f_write.write("data (printed in uint8) = " + str(np.frombuffer( - tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + "\n") - py_byte_size = len(tensor_data[x].data_ptr) - c_byte_size = tensor_data[x].data_size - if c_byte_size != py_byte_size: - f_write.write("The python byte size of " + str(py_byte_size) + - " does not match the C++ byte size of " + str(c_byte_size) + "\n") - f_write.write("size in bytes = " + str(tensor_data[x].data_size) + "\n") - f_write.write("debugger dtype = " + str(tensor_data[x].dtype) + "\n") - f_write.write("shape = " + str(tensor_data[x].shape) + "\n") - f_write.close() diff --git a/tests/st/debugger/test_async_sink_mode_true_watchpoints.py b/tests/st/debugger/test_async_sink_mode_true_watchpoints.py deleted file mode 100644 index c40ab092463..00000000000 --- a/tests/st/debugger/test_async_sink_mode_true_watchpoints.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -""" -Watchpoints test script for offline debugger APIs. -""" - -import mindspore.offline_debug.dbg_services as d -import pytest -from dump_test_utils import compare_actual_with_expected -from tests.security_utils import security_off_wrap - -GENERATE_GOLDEN = False -test_name = "async_sink_mode_true_watchpoints" - - -@pytest.mark.level0 -@pytest.mark.platform_arm_ascend_training -@pytest.mark.platform_x86_ascend_training -@pytest.mark.env_onecard -@pytest.mark.skip(reason="needs updating") -@security_off_wrap -def test_async_sink_mode_true_watchpoints(): - if GENERATE_GOLDEN: - f_write = open(test_name + ".expected", "w") - else: - f_write = open(test_name + ".actual", "w") - - debugger_backend = d.DbgServices( - dump_file_path="/home/workspace/mindspore_dataset/dumps/async_sink_true/") - - _ = debugger_backend.initialize(net_name="alexnet", is_sync_mode=False) - - # NOTES: - # -> watch_condition=6 is MIN_LT - # -> watch_condition=18 is CHANGE_TOO_LARGE - - # test 1: watchpoint set and hit (watch_condition=6) - param1 = d.Parameter(name="param", disabled=False, value=0.0) - _ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6, - check_node_list={"Default/network-TrainOneStepCell/network-WithLossCell/" - "_backbone-AlexNet/conv3-Conv2d/Conv2D-op169": - {"device_id": [0], "root_graph_id": [1], - "is_parameter": False - }}, parameter_list=[param1]) - - watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2) - if len(watchpoint_hits_test_1) != 1: - f_write.write("ERROR -> test 1: watchpoint set but not hit just once\n") - print_watchpoint_hits(watchpoint_hits_test_1, 1, f_write) - - # test 2: watchpoint remove and ensure it's not hit - _ = debugger_backend.remove_watchpoint(watchpoint_id=1) - watchpoint_hits_test_2 = debugger_backend.check_watchpoints(iteration=2) - if watchpoint_hits_test_2: - f_write.write("ERROR -> test 2: watchpoint removed but hit\n") - - # test 3: watchpoint set and not hit, then remove - param2 = d.Parameter(name="param", disabled=False, value=-1000.0) - _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6, - check_node_list={"Default/network-TrainOneStepCell/network-WithLossCell/" - "_backbone-AlexNet/conv3-Conv2d/Conv2D-op169": - {"device_id": [0], "root_graph_id": [1], - "is_parameter": False - }}, parameter_list=[param2]) - - watchpoint_hits_test_3 = debugger_backend.check_watchpoints(iteration=2) - if watchpoint_hits_test_3: - f_write.write("ERROR -> test 3: watchpoint set but not supposed to be hit\n") - _ = debugger_backend.remove_watchpoint(watchpoint_id=2) - f_write.close() - if not GENERATE_GOLDEN: - assert compare_actual_with_expected(test_name) - - -def print_watchpoint_hits(watchpoint_hits, test_id, f_write): - """Print watchpoint hits.""" - for x, _ in enumerate(watchpoint_hits): - f_write.write("-----------------------------------------------------------\n") - f_write.write("watchpoint_hit for test_%u attributes:" % test_id + "\n") - f_write.write("name = " + watchpoint_hits[x].name + "\n") - f_write.write("slot = " + str(watchpoint_hits[x].slot) + "\n") - f_write.write("condition = " + str(watchpoint_hits[x].condition) + "\n") - f_write.write("watchpoint_id = " + str(watchpoint_hits[x].watchpoint_id) + "\n") - for p, _ in enumerate(watchpoint_hits[x].parameters): - f_write.write("parameter " + str(p) + " name = " + - watchpoint_hits[x].parameters[p].name + "\n") - f_write.write("parameter " + str(p) + " disabled = " + - str(watchpoint_hits[x].parameters[p].disabled) + "\n") - f_write.write("parameter " + str(p) + " value = " + - str(watchpoint_hits[x].parameters[p].value) + "\n") - f_write.write("parameter " + str(p) + " hit = " + - str(watchpoint_hits[x].parameters[p].hit) + "\n") - f_write.write("parameter " + str(p) + " actual_value = " + - str(watchpoint_hits[x].parameters[p].actual_value) + "\n") - f_write.write("error code = " + str(watchpoint_hits[x].error_code) + "\n") - f_write.write("device_id = " + str(watchpoint_hits[x].device_id) + "\n") - f_write.write("root_graph_id = " + str(watchpoint_hits[x].root_graph_id) + "\n") diff --git a/tests/st/debugger/test_read_tensors.py b/tests/st/debugger/test_read_tensors.py new file mode 100644 index 00000000000..daf53d45a17 --- /dev/null +++ b/tests/st/debugger/test_read_tensors.py @@ -0,0 +1,159 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Read tensor test script for offline debugger APIs. +""" + +import os +import json +import tempfile +import mindspore.offline_debug.dbg_services as d +import numpy as np +import pytest +from tests.security_utils import security_off_wrap +from dump_test_utils import build_dump_structure + +GENERATE_GOLDEN = False +tensor_json = [] + + +def run_read_tensors(is_sync): + if is_sync: + test_name = "sync_read_tensors" + else: + test_name = "async_read_tensors" + + # input tensor with zero slot + tensor1 = np.array([32.0, 4096.0], np.float32) + name1 = "CudnnUniformReal.CudnnUniformReal-op391.0.0." + info1 = d.TensorInfo(node_name="Default/CudnnUniformReal-op391", + slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=False) + # input tensor with non-zero slot + tensor2 = np.array([[0.0, 32.0, 4096.0], [4.5, 6.78, -11.0]], np.float32) + name2 = "ReluGradV2.ReluGradV2-op406.0.0." + info2 = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet/" + "gradReLU/ReluGradV2-op406", + slot=1, iteration=1, rank_id=0, root_graph_id=0, is_output=False) + # output tensor with zero slot + tensor3 = np.array([[[7.963e-05, 4.750e-05, 2.587e-05], + [8.339e-05, 5.025e-05, 2.694e-05], + [8.565e-05, 5.156e-05, 2.658e-05]], + [[8.017e-05, 4.804e-05, 2.724e-05], + [8.392e-05, 5.126e-05, 2.843e-05], + [8.613e-05, 5.257e-05, 2.819e-05]], + [[7.617e-05, 3.827e-05, 5.305e-06], + [7.474e-05, 3.719e-05, 3.040e-06], + [7.081e-05, 3.338e-05, -2.086e-06]]], np.float32) + name3 = "Conv2DBackpropFilter.Conv2DBackpropFilter-op424.0.0." + info3 = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet/conv5-Conv2d/" + "gradConv2D/Conv2DBackpropFilter-op424", + slot=0, iteration=1, rank_id=0, root_graph_id=0, is_output=True) + # output tensor with non-zero slot + tensor4 = np.array([2705090541, 1099111076, 4276637100, 3586562544, 890060077, 1869062900], np.float32) + name4 = "ReLUV2.ReLUV2-op381.0.0." + info4 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op381", + slot=1, iteration=0, rank_id=0, root_graph_id=0, is_output=True) + + tensor_name = [name1, name2, name3, name4] + tensor_list = [tensor1, tensor2, tensor3, tensor4] + tensor_info = [info1, info2, info3, info4] + + pwd = os.getcwd() + with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir: + temp_dir = build_dump_structure(tmp_dir, tensor_name, tensor_list, "Test", tensor_info) + + debugger_backend = d.DbgServices(dump_file_path=temp_dir) + debugger_backend.initialize(net_name="Test", is_sync_mode=is_sync) + tensor_data = debugger_backend.read_tensors(tensor_info) + + if GENERATE_GOLDEN: + print_read_tensors(tensor_info, tensor_data, 0, True, test_name) + else: + compare_expect_actual_result(tensor_info, tensor_data, 0, test_name) + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +@security_off_wrap +def test_sync_read_tensors(): + run_read_tensors(True) + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +@security_off_wrap +def test_async_read_tensors(): + run_read_tensors(False) + + +def compare_expect_actual_result(tensor_info_list, tensor_data_list, test_index, test_name): + """Compare actual result with golden file.""" + pwd = os.getcwd() + golden_file = os.path.realpath(os.path.join(pwd, "golden", test_name + "_expected.json")) + with open(golden_file) as f: + expected_list = json.load(f) + for x, (tensor_info, tensor_data) in enumerate(zip(tensor_info_list, tensor_data_list)): + test_id = "tensor_"+ str(test_index+x+1) + info = expected_list[x+test_index][test_id] + assert tensor_info.node_name == info['tensor_info']['node_name'] + assert tensor_info.slot == info['tensor_info']['slot'] + assert tensor_info.iteration == info['tensor_info']['iteration'] + assert tensor_info.rank_id == info['tensor_info']['rank_id'] + assert tensor_info.root_graph_id == info['tensor_info']['root_graph_id'] + assert tensor_info.is_output == info['tensor_info']['is_output'] + actual_data = np.frombuffer( + tensor_data.data_ptr, np.uint8, tensor_data.data_size).tolist() + assert actual_data == info['tensor_data']['data'] + assert tensor_data.data_size == info['tensor_data']['size_in_bytes'] + assert tensor_data.dtype == info['tensor_data']['debugger_dtype'] + assert tensor_data.shape == info['tensor_data']['shape'] + + +def print_read_tensors(tensor_info_list, tensor_data_list, test_index, is_print, test_name): + """Print read tensors result if GENERATE_GOLDEN is True.""" + for x, (tensor_info, tensor_data) in enumerate(zip(tensor_info_list, tensor_data_list)): + tensor = "tensor_" + str(test_index+x+1) + data = np.frombuffer( + tensor_data.data_ptr, np.uint8, tensor_data.data_size).tolist() + py_byte_size = len(tensor_data.data_ptr) + c_byte_size = tensor_data.data_size + if c_byte_size != py_byte_size: + print("The python byte size of " + str(py_byte_size) + + " does not match the C++ byte size of " + str(c_byte_size) + "\n") + tensor_json.append({ + tensor: { + 'tensor_info': { + 'node_name': tensor_info.node_name, + 'slot': tensor_info.slot, + 'iteration': tensor_info.iteration, + 'rank_id': tensor_info.rank_id, + 'root_graph_id': tensor_info.root_graph_id, + 'is_output': tensor_info.is_output + }, + 'tensor_data': { + 'data': data, + 'size_in_bytes': tensor_data.data_size, + 'debugger_dtype': tensor_data.dtype, + 'shape': tensor_data.shape + } + } + }) + if is_print: + with open(test_name + "_expected.json", "w") as dump_f: + json.dump(tensor_json, dump_f, indent=4, separators=(',', ': ')) diff --git a/tests/st/debugger/test_sync_trans_false_read_tensors.py b/tests/st/debugger/test_sync_trans_false_read_tensors.py deleted file mode 100644 index 993b27f2862..00000000000 --- a/tests/st/debugger/test_sync_trans_false_read_tensors.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -""" -Read tensor test script for offline debugger APIs. -""" - -import mindspore.offline_debug.dbg_services as d -import numpy as np -import pytest -from dump_test_utils import compare_actual_with_expected -from tests.security_utils import security_off_wrap - -GENERATE_GOLDEN = False -test_name = "sync_trans_false_read_tensors" - - -@pytest.mark.level0 -@pytest.mark.platform_arm_ascend_training -@pytest.mark.platform_x86_ascend_training -@pytest.mark.env_onecard -@pytest.mark.skip(reason="needs updating") -@security_off_wrap -def test_sync_trans_false_read_tensors(): - - debugger_backend = d.DbgServices( - dump_file_path="/home/workspace/mindspore_dataset/dumps/sync_trans_false/alexnet/") - - _ = debugger_backend.initialize( - net_name="Network Name goes here!", is_sync_mode=True) - - # parameter - info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias", - slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True) - # output tensor with zero slot - info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op168", - slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False) - # output tensor with non-zero slot - info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op346", - slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False) - - tensor_info = [info1, info2, info3] - - tensor_data = debugger_backend.read_tensors(tensor_info) - - print_read_tensors(tensor_info, tensor_data) - if not GENERATE_GOLDEN: - assert compare_actual_with_expected(test_name) - - -def print_read_tensors(tensor_info, tensor_data): - """Print read tensors.""" - if GENERATE_GOLDEN: - f_write = open(test_name + ".expected", "w") - else: - f_write = open(test_name + ".actual", "w") - for x, _ in enumerate(tensor_info): - f_write.write("-----------------------------------------------------------\n") - f_write.write("tensor_info_" + str(x + 1) + " attributes:\n") - f_write.write("node name = " + tensor_info[x].node_name + "\n") - f_write.write("slot = " + str(tensor_info[x].slot) + "\n") - f_write.write("iteration = " + str(tensor_info[x].iteration) + "\n") - f_write.write("device_id = " + str(tensor_info[x].device_id) + "\n") - f_write.write("root_graph_id = " + str(tensor_info[x].root_graph_id) + "\n") - f_write.write("is_parameter = " + str(tensor_info[x].is_parameter) + "\n") - f_write.write("\n") - f_write.write("tensor_data_" + str(x + 1) + " attributes:\n") - f_write.write("data (printed in uint8) = " + str(np.frombuffer( - tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + "\n") - py_byte_size = len(tensor_data[x].data_ptr) - c_byte_size = tensor_data[x].data_size - if c_byte_size != py_byte_size: - f_write.write("The python byte size of " + str(py_byte_size) + - " does not match the C++ byte size of " + str(c_byte_size) + "\n") - f_write.write("size in bytes = " + str(tensor_data[x].data_size) + "\n") - f_write.write("debugger dtype = " + str(tensor_data[x].dtype) + "\n") - f_write.write("shape = " + str(tensor_data[x].shape) + "\n") - f_write.close() diff --git a/tests/st/debugger/test_sync_trans_false_watchpoints.py b/tests/st/debugger/test_sync_trans_false_watchpoints.py deleted file mode 100644 index 1af39771f1a..00000000000 --- a/tests/st/debugger/test_sync_trans_false_watchpoints.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -""" -Watchpoints test script for offline debugger APIs. -""" - -import mindspore.offline_debug.dbg_services as d -import pytest -from dump_test_utils import compare_actual_with_expected -from tests.security_utils import security_off_wrap - -GENERATE_GOLDEN = False -test_name = "sync_trans_false_watchpoints" - - -@pytest.mark.level0 -@pytest.mark.platform_arm_ascend_training -@pytest.mark.platform_x86_ascend_training -@pytest.mark.env_onecard -@pytest.mark.skip(reason="needs updating") -@security_off_wrap -def test_sync_trans_false_watchpoints(): - - if GENERATE_GOLDEN: - f_write = open(test_name + ".expected", "w") - else: - f_write = open(test_name + ".actual", "w") - - debugger_backend = d.DbgServices( - dump_file_path="/home/workspace/mindspore_dataset/dumps/sync_trans_false/alexnet/") - - _ = debugger_backend.initialize( - net_name="Network Name goes here!", is_sync_mode=True) - - # NOTES: - # -> watch_condition=6 is MIN_LT - # -> watch_condition=18 is CHANGE_TOO_LARGE - - # test 1: watchpoint set and hit (watch_condition=6) - param1 = d.Parameter(name="param", disabled=False, value=0.0) - _ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6, - check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/" - "Conv2D-op168": - {"device_id": [0], "root_graph_id": [0], - "is_parameter": False - }}, parameter_list=[param1]) - - watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2) - if len(watchpoint_hits_test_1) != 1: - f_write.write("ERROR -> test 1: watchpoint set but not hit just once") - print_watchpoint_hits(watchpoint_hits_test_1, 1, f_write) - - # test 2: watchpoint remove and ensure it's not hit - _ = debugger_backend.remove_watchpoint(watchpoint_id=1) - watchpoint_hits_test_2 = debugger_backend.check_watchpoints(iteration=2) - if watchpoint_hits_test_2: - f_write.write("ERROR -> test 2: watchpoint removed but hit") - - # test 3: watchpoint set and not hit, then remove - param2 = d.Parameter(name="param", disabled=False, value=-1000.0) - _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6, - check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/" - "Conv2D-op308": - {"device_id": [0], "root_graph_id": [0], - "is_parameter": False - }}, parameter_list=[param2]) - - watchpoint_hits_test_3 = debugger_backend.check_watchpoints(iteration=2) - if watchpoint_hits_test_3: - f_write.write("ERROR -> test 3: watchpoint set but not supposed to be hit") - _ = debugger_backend.remove_watchpoint(watchpoint_id=2) - - # test 4: weight change watchpoint set and hit - param_abs_mean_update_ratio_gt = d.Parameter( - name="abs_mean_update_ratio_gt", disabled=False, value=0.0) - param_epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0) - _ = debugger_backend.add_watchpoint(watchpoint_id=3, watch_condition=18, - check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/" - "Parameter[6]_11/fc3.bias": - {"device_id": [0], "root_graph_id": [0], - "is_parameter": True - }}, parameter_list=[param_abs_mean_update_ratio_gt, - param_epsilon]) - - watchpoint_hits_test_4 = debugger_backend.check_watchpoints(iteration=3) - if len(watchpoint_hits_test_4) != 1: - f_write.write("ERROR -> test 4: watchpoint weight change set but not hit just once") - print_watchpoint_hits(watchpoint_hits_test_4, 4, f_write) - f_write.close() - if not GENERATE_GOLDEN: - assert compare_actual_with_expected(test_name) - - -def print_watchpoint_hits(watchpoint_hits, test_id, f_write): - """Print watchpoint hits.""" - for x, _ in enumerate(watchpoint_hits): - f_write.write("-----------------------------------------------------------\n") - f_write.write("watchpoint_hit for test_%u attributes:" % test_id + "\n") - f_write.write("name = " + watchpoint_hits[x].name + "\n") - f_write.write("slot = " + str(watchpoint_hits[x].slot) + "\n") - f_write.write("condition = " + str(watchpoint_hits[x].condition) + "\n") - f_write.write("watchpoint_id = " + str(watchpoint_hits[x].watchpoint_id) + "\n") - for p, _ in enumerate(watchpoint_hits[x].parameters): - f_write.write("parameter " + str(p) + " name = " + - watchpoint_hits[x].parameters[p].name + "\n") - f_write.write("parameter " + str(p) + " disabled = " + - str(watchpoint_hits[x].parameters[p].disabled) + "\n") - f_write.write("parameter " + str(p) + " value = " + - str(watchpoint_hits[x].parameters[p].value) + "\n") - f_write.write("parameter " + str(p) + " hit = " + - str(watchpoint_hits[x].parameters[p].hit) + "\n") - f_write.write("parameter " + str(p) + " actual_value = " + - str(watchpoint_hits[x].parameters[p].actual_value) + "\n") - f_write.write("error code = " + str(watchpoint_hits[x].error_code) + "\n") - f_write.write("device_id = " + str(watchpoint_hits[x].device_id) + "\n") - f_write.write("root_graph_id = " + str(watchpoint_hits[x].root_graph_id) + "\n") diff --git a/tests/st/debugger/test_sync_trans_true_read_tensor.py b/tests/st/debugger/test_sync_trans_true_read_tensor.py deleted file mode 100644 index c892ec18874..00000000000 --- a/tests/st/debugger/test_sync_trans_true_read_tensor.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -""" -Read tensor test script for offline debugger APIs. -""" - -import mindspore.offline_debug.dbg_services as d -import numpy as np -import pytest -from dump_test_utils import compare_actual_with_expected -from tests.security_utils import security_off_wrap - -GENERATE_GOLDEN = False -test_name = "sync_trans_true_read_tensors" - - -@pytest.mark.level0 -@pytest.mark.platform_arm_ascend_training -@pytest.mark.platform_x86_ascend_training -@pytest.mark.env_onecard -@pytest.mark.skip(reason="needs updating") -@security_off_wrap -def test_sync_trans_true_read_tensors(): - - debugger_backend = d.DbgServices( - dump_file_path="/home/workspace/mindspore_dataset/dumps/sync_trans_true/alexnet/") - - _ = debugger_backend.initialize( - net_name="Network Name goes here!", is_sync_mode=True) - - # parameter - info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias", - slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True) - # output tensor with zero slot - info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op171", - slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False) - # output tensor with non-zero slot - info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op353", - slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False) - - tensor_info = [info1, info2, info3] - - tensor_data = debugger_backend.read_tensors(tensor_info) - - print_read_tensors(tensor_info, tensor_data) - if not GENERATE_GOLDEN: - assert compare_actual_with_expected(test_name) - - -def print_read_tensors(tensor_info, tensor_data): - """Print read tensors.""" - if GENERATE_GOLDEN: - f_write = open(test_name + ".expected", "w") - else: - f_write = open(test_name + ".actual", "w") - for x, _ in enumerate(tensor_info): - f_write.write("-----------------------------------------------------------\n") - f_write.write("tensor_info_" + str(x + 1) + " attributes:\n") - f_write.write("node name = " + tensor_info[x].node_name + "\n") - f_write.write("slot = " + str(tensor_info[x].slot) + "\n") - f_write.write("iteration = " + str(tensor_info[x].iteration) + "\n") - f_write.write("device_id = " + str(tensor_info[x].device_id) + "\n") - f_write.write("root_graph_id = " + str(tensor_info[x].root_graph_id) + "\n") - f_write.write("is_parameter = " + str(tensor_info[x].is_parameter) + "\n") - f_write.write("\n") - f_write.write("tensor_data_" + str(x + 1) + " attributes:\n") - f_write.write("data (printed in uint8) = " + str(np.frombuffer( - tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + "\n") - py_byte_size = len(tensor_data[x].data_ptr) - c_byte_size = tensor_data[x].data_size - if c_byte_size != py_byte_size: - f_write.write("The python byte size of " + str(py_byte_size) + - " does not match the C++ byte size of " + str(c_byte_size) + "\n") - f_write.write("size in bytes = " + str(tensor_data[x].data_size) + "\n") - f_write.write("debugger dtype = " + str(tensor_data[x].dtype) + "\n") - f_write.write("shape = " + str(tensor_data[x].shape) + "\n") - f_write.close() diff --git a/tests/st/debugger/test_watchpoints.py b/tests/st/debugger/test_watchpoints.py new file mode 100644 index 00000000000..400ea07589f --- /dev/null +++ b/tests/st/debugger/test_watchpoints.py @@ -0,0 +1,204 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Watchpoints test script for offline debugger APIs. +""" + +import os +import json +import tempfile +import numpy as np +import mindspore.offline_debug.dbg_services as d +import pytest +from tests.security_utils import security_off_wrap +from dump_test_utils import build_dump_structure + +GENERATE_GOLDEN = False +watchpoint_hits_json = [] + + +def run_watchpoints(is_sync): + if is_sync: + test_name = "sync_watchpoints" + else: + test_name = "async_watchpoints" + + name1 = "Conv2D.Conv2D-op369.0.0.1" + tensor1 = np.array([[[-1.2808e-03, 7.7629e-03, 1.9241e-02], + [-1.3931e-02, 8.9359e-04, -1.1520e-02], + [-6.3248e-03, 1.8749e-03, 1.0132e-02]], + [[-2.5520e-03, -6.0005e-03, -5.1918e-03], + [-2.7866e-03, 2.5487e-04, 8.4782e-04], + [-4.6310e-03, -8.9111e-03, -8.1778e-05]], + [[1.3914e-03, 6.0844e-04, 1.0643e-03], + [-2.0966e-02, -1.2865e-03, -1.8692e-03], + [-1.6647e-02, 1.0233e-03, -4.1313e-03]]], np.float32) + info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/Conv2D-op369", + slot=1, iteration=2, rank_id=0, root_graph_id=0, is_output=False) + + name2 = "Parameter.fc2.bias.0.0.2" + tensor2 = np.array([-5.0167350e-06, 1.2509107e-05, -4.3148934e-06, 8.1415592e-06, + 2.1177532e-07, 2.9952851e-06], np.float32) + info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/" + "Parameter[6]_11/fc2.bias", + slot=0, iteration=2, rank_id=0, root_graph_id=0, is_output=True) + + tensor3 = np.array([2.9060817e-07, -5.1009415e-06, -2.8662325e-06, 2.6036503e-06, + -5.1546101e-07, 6.0798648e-06], np.float32) + info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/" + "Parameter[6]_11/fc2.bias", + slot=0, iteration=3, rank_id=0, root_graph_id=0, is_output=True) + + tensor_info = [info1, info2, info3] + tensor_name = [name1, name2, name2] + tensor_list = [tensor1, tensor2, tensor3] + + pwd = os.getcwd() + with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir: + temp_dir = build_dump_structure(tmp_dir, tensor_name, tensor_list, "Test", tensor_info) + + debugger_backend = d.DbgServices(dump_file_path=temp_dir) + debugger_backend.initialize(net_name="Test", is_sync_mode=False) + + # NOTES: + # -> watch_condition=6 is MIN_LT + # -> watch_condition=18 is CHANGE_TOO_LARGE + + # test 1: watchpoint set and hit (watch_condition=6) + param1 = d.Parameter(name="param", disabled=False, value=0.0) + debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6, + check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/" + "conv1-Conv2d/Conv2D-op369": + {"rank_id": [0], "root_graph_id": [0], "is_output": False + }}, parameter_list=[param1]) + + watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2) + assert len(watchpoint_hits_test_1) == 1 + if GENERATE_GOLDEN: + print_watchpoint_hits(watchpoint_hits_test_1, 0, False, test_name) + else: + compare_expect_actual_result(watchpoint_hits_test_1, 0, test_name) + + # test 2: watchpoint remove and ensure it's not hit + debugger_backend.remove_watchpoint(watchpoint_id=1) + watchpoint_hits_test_2 = debugger_backend.check_watchpoints(iteration=2) + assert not watchpoint_hits_test_2 + + # test 3: watchpoint set and not hit, then remove + param2 = d.Parameter(name="param", disabled=False, value=-1000.0) + debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6, + check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/" + "conv1-Conv2d/Conv2D-op369": + {"rank_id": [0], "root_graph_id": [0], "is_output": False + }}, parameter_list=[param2]) + + watchpoint_hits_test_3 = debugger_backend.check_watchpoints(iteration=2) + assert not watchpoint_hits_test_3 + _ = debugger_backend.remove_watchpoint(watchpoint_id=2) + + # test 4: weight change watchpoint set and hit + param_abs_mean_update_ratio_gt = d.Parameter( + name="abs_mean_update_ratio_gt", disabled=False, value=0.0) + param_epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0) + debugger_backend.add_watchpoint(watchpoint_id=3, watch_condition=18, + check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/" + "Parameter[6]_11/fc2.bias": + {"rank_id": [0], "root_graph_id": [0], "is_output": True + }}, parameter_list=[param_abs_mean_update_ratio_gt, + param_epsilon]) + + watchpoint_hits_test_4 = debugger_backend.check_watchpoints(iteration=3) + assert len(watchpoint_hits_test_4) == 1 + + if GENERATE_GOLDEN: + print_watchpoint_hits(watchpoint_hits_test_4, 1, True, test_name) + else: + compare_expect_actual_result(watchpoint_hits_test_4, 1, test_name) + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +@security_off_wrap +def test_sync_watchpoints(): + run_watchpoints(True) + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +@security_off_wrap +def test_async_watchpoints(): + run_watchpoints(False) + + +def compare_expect_actual_result(watchpoint_hits_list, test_index, test_name): + """Compare actual result with golden file.""" + pwd = os.getcwd() + golden_file = os.path.realpath(os.path.join(pwd, "golden", test_name + "_expected.json")) + with open(golden_file) as f: + expected_list = json.load(f) + for x, watchpoint_hits in enumerate(watchpoint_hits_list): + test_id = "watchpoint_hit" + str(test_index+x+1) + info = expected_list[x+test_index][test_id] + assert watchpoint_hits.name == info['name'] + assert watchpoint_hits.slot == info['slot'] + assert watchpoint_hits.condition == info['condition'] + assert watchpoint_hits.watchpoint_id == info['watchpoint_id'] + assert watchpoint_hits.error_code == info['error_code'] + assert watchpoint_hits.rank_id == info['rank_id'] + assert watchpoint_hits.root_graph_id == info['root_graph_id'] + for p, _ in enumerate(watchpoint_hits.parameters): + parameter = "parameter" + str(p) + assert watchpoint_hits.parameters[p].name == info['parameter'][p][parameter]['name'] + assert watchpoint_hits.parameters[p].disabled == info['parameter'][p][parameter]['disabled'] + assert watchpoint_hits.parameters[p].value == info['parameter'][p][parameter]['value'] + assert watchpoint_hits.parameters[p].hit == info['parameter'][p][parameter]['hit'] + assert watchpoint_hits.parameters[p].actual_value == info['parameter'][p][parameter]['actual_value'] + + +def print_watchpoint_hits(watchpoint_hits_list, test_index, is_print, test_name): + """Print watchpoint hits.""" + for x, watchpoint_hits in enumerate(watchpoint_hits_list): + parameter_json = [] + for p, _ in enumerate(watchpoint_hits.parameters): + parameter = "parameter" + str(p) + parameter_json.append({ + parameter: { + 'name': watchpoint_hits.parameters[p].name, + 'disabled': watchpoint_hits.parameters[p].disabled, + 'value': watchpoint_hits.parameters[p].value, + 'hit': watchpoint_hits.parameters[p].hit, + 'actual_value': watchpoint_hits.parameters[p].actual_value + } + }) + watchpoint_hit = "watchpoint_hit" + str(test_index+x+1) + watchpoint_hits_json.append({ + watchpoint_hit: { + 'name': watchpoint_hits.name, + 'slot': watchpoint_hits.slot, + 'condition': watchpoint_hits.condition, + 'watchpoint_id': watchpoint_hits.watchpoint_id, + 'parameter': parameter_json, + 'error_code': watchpoint_hits.error_code, + 'rank_id': watchpoint_hits.rank_id, + 'root_graph_id': watchpoint_hits.root_graph_id + } + }) + if is_print: + with open(test_name + "_expected.json", "w") as dump_f: + json.dump(watchpoint_hits_json, dump_f, indent=4, separators=(',', ': '))