re-enabling offline debugger UT tests

sabrinasun 2021-08-25 19:05:19 -04:00
parent 76a37daa43
commit a9c6bf45a3
28 changed files with 382 additions and 432 deletions

View File

@@ -1063,7 +1063,7 @@ class WatchpointHit():
             >>> name = watchpoint_hit.name
         """
-        return self.instance.name()
+        return self.instance.get_name()

     @property
     def slot(self):
@@ -1086,7 +1086,7 @@ class WatchpointHit():
             >>> slot = watchpoint_hit.slot
         """
-        return self.instance.slot()
+        return self.instance.get_slot()

     @property
     def condition(self):
@@ -1109,7 +1109,7 @@ class WatchpointHit():
             >>> condition = watchpoint_hit.condition
         """
-        return self.instance.condition()
+        return self.instance.get_condition()

     @property
     def watchpoint_id(self):
@@ -1132,7 +1132,7 @@ class WatchpointHit():
             >>> watchpoint_id = watchpoint_hit.watchpoint_id
         """
-        return self.instance.watchpoint_id()
+        return self.instance.get_watchpoint_id()

     @property
     def parameters(self):
@@ -1155,7 +1155,7 @@ class WatchpointHit():
             >>> parameters = watchpoint_hit.parameters
         """
-        params = self.instance.parameters()
+        params = self.instance.get_parameters()
         param_list = []
         for elem in params:
             tmp = Parameter(elem.get_name(),
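
The five hunks above all apply the same rename: each WatchpointHit property now delegates to a get_-prefixed accessor on the bound backend object. A minimal sketch of the pattern, with a hypothetical stand-in for the pybind11 instance (names here are illustrative, not the real binding):

class _FakeInstance:
    """Hypothetical stand-in for the C++ watchpoint-hit binding."""
    def get_name(self):
        return "Default/CudnnUniformReal-op391"

class _HitSketch:
    def __init__(self, instance):
        self.instance = instance

    @property
    def name(self):
        # was: self.instance.name(); this commit switches to the get_ getter
        return self.instance.get_name()

print(_HitSketch(_FakeInstance()).name)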

View File

@@ -0,0 +1,79 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/CudnnUniformReal-op391
slot = 0
iteration = 0
rank_id = 0
root_graph_id = 0
is_output = False
tensor_data_1 attributes:
data (printed in uint8) = [ 0 0 0 66 0 0 128 69]
size in bytes = 8
debugger dtype = 11
shape = [2]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406
slot = 1
iteration = 1
rank_id = 0
root_graph_id = 0
is_output = False
tensor_data_2 attributes:
data (printed in uint8) = [ 0 0 0 0 0 0 0 66 0 0 128 69 0 0 144 64 195 245
216 64 0 0 48 193]
size in bytes = 24
debugger dtype = 11
shape = [2, 3]
-----------------------------------------------------------
tensor_info_3 attributes:
node name = Gradients/Default/network-WithLossCell/_backbone-AlexNet/conv5-Conv2d/gradConv2D/Conv2DBackpropFilter-op424
slot = 0
iteration = 1
rank_id = 0
root_graph_id = 0
is_output = True
tensor_data_3 attributes:
data (printed in uint8) = [ 8 255 166 56 189 58 71 56 103 3 217 55 170 225 174 56 135 195
82 56 54 253 225 55 254 158 179 56 33 66 88 56 30 248 222 55
241 32 168 56 143 126 73 56 116 129 228 55 53 254 175 56 2 0
87 56 246 124 238 55 177 160 180 56 156 126 92 56 144 121 236 55
117 189 159 56 25 132 32 56 154 1 178 54 187 189 156 56 117 252
27 56 205 2 76 54 212 127 148 56 129 1 12 56 53 253 11 182]
size in bytes = 108
debugger dtype = 11
shape = [3, 3, 3]
-----------------------------------------------------------
tensor_info_4 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op381
slot = 1
iteration = 0
rank_id = 0
root_graph_id = 0
is_output = True
tensor_data_4 attributes:
data (printed in uint8) = [104 60 33 79 53 6 131 78 78 232 126 79 154 198 85 79 245 52
84 78 70 207 222 78]
size in bytes = 24
debugger dtype = 11
shape = [6]
-----------------------------------------------------------
tensor_info_5 attributes:
node name = Default/Reciprocal-op3
slot = 0
iteration = 0
rank_id = 0
root_graph_id = 0
is_output = True
tensor_data_5 attributes:
data (printed in uint8) = [ 0 0 128 63 0 0 128 255 0 0 128 127 0 0 128 255 0 0
128 127 0 0 128 127 0 0 128 63 0 0 128 255 0 0 128 127
0 0 128 127]
size in bytes = 40
debugger dtype = 11
shape = [2, 5]
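
The golden values above are raw little-endian float32 bytes reinterpreted as uint8; for example, tensor_data_1 is the pair [32.0, 4096.0] built by the test below. A quick standalone check with plain NumPy:

import numpy as np

t = np.array([32.0, 4096.0], np.float32)
print(t.view(np.uint8))  # [  0   0   0  66   0   0 128  69] -> matches tensor_data_1
print(t.nbytes)          # 8 -> matches "size in bytes = 8"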

View File

@@ -0,0 +1,28 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/CudnnUniformReal-op390
slot = 0
iteration = 0
rank_id = 0
root_graph_id = 0
is_output = False
tensor_data_1 attributes:
data (printed in uint8) = []
size in bytes = 0
debugger dtype = 0
shape = []
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406
slot = 1
iteration = 0
rank_id = 0
root_graph_id = 0
is_output = False
tensor_data_2 attributes:
data (printed in uint8) = []
size in bytes = 0
debugger dtype = 0
shape = []

View File

@@ -1,70 +0,0 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = True
tensor_data_1 attributes:
data (printed in uint8) = [ 0 0 0 0 195 127 0 0 176 202 195 248 194 127 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 58 196 248
194 127 0 0 17 0 0 0 0 0 0 0 160 76 6 140 195 127
0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127 0 0
64 195 195 248 194 127 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 88 1 196 248 194 127 0 0 18 0 0 0
0 0 0 0 160 47 6 140 195 127 0 0 69 0 0 0 0 0
0 0 1 0 0 0 195 127 0 0 176 203 195 248 194 127 0 0
176 204 195 248 194 127 0 0 0 0 0 0 0 0 0 0 216 241
195 248 194 127 0 0 19 0 0 0 0 0 0 0 96 39 6 140
195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127
0 0 112 52 196 248 194 127 0 0 176 52 196 248 194 127 0 0
0 0 0 0 0 0 0 0 88 250 195 248 194 127 0 0 20 0
0 0 0 0 0 0 128 130 5 140 195 127 0 0 69 0 0 0
0 0 0 0 0 0 0 0 195 127 0 0 208 136 195 248 194 127
0 0 176 202 195 248 194 127 0 0 48 52 196 248 194 127 0 0
184 247 195 248 194 127 0 0 21 0 0 0 0 0 0 0 176 213
4 140 195 127 0 0 69 0 0 0 0 0 0 0 0 0 0 0
195 127 0 0 48 52 196 248 194 127 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 8 249 195 248 194 127 0 0
22 0 0 0 0 0 0 0 16 46 4 140 195 127 0 0 69 0
0 0 0 0 0 0 1 0 0 0 195 127 0 0 64 137 195 248
194 127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 88 12 196 248 194 127 0 0 23 0 0 0 0 0 0 0
32 137 3 140 195 127 0 0 85 0 0 0 0 0 0 0 0 0
0 0 195 127 0 0 176 202 195 248 194 127 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 104 246 195 248 194 127
0 0 24 0 0 0 0 0 0 0 48 104 15 140 195 127 0 0
32 104 15 140 195 127 0 0]
size in bytes = 512
debugger dtype = 11
shape = [128]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_2 attributes:
data (printed in uint8) = [ 0 0 0 ... 0 0 192]
size in bytes = 1024
debugger dtype = 11
shape = [4, 4, 4, 4]
-----------------------------------------------------------
tensor_info_3 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300
slot = 1
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_3 attributes:
data (printed in uint8) = [ 0 169 0 ... 244 21 184]
size in bytes = 1024
debugger dtype = 8
shape = [256]

View File

@@ -1,70 +0,0 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = True
tensor_data_1 attributes:
data (printed in uint8) = [ 1 0 0 0 195 127 0 0 80 58 118 65 195 127 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 40 186 117 65
195 127 0 0 5 0 0 0 0 0 0 0 160 76 6 204 195 127
0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127 0 0
48 135 117 65 195 127 0 0 16 58 118 65 195 127 0 0 144 58
118 65 195 127 0 0 168 186 117 65 195 127 0 0 6 0 0 0
0 0 0 0 160 47 6 204 195 127 0 0 69 0 0 0 0 0
0 0 1 0 0 0 195 127 0 0 80 58 118 65 195 127 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 184 249
117 65 195 127 0 0 7 0 0 0 0 0 0 0 96 39 6 204
195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127
0 0 224 218 117 65 195 127 0 0 0 0 0 0 0 0 0 0
224 219 117 65 195 127 0 0 200 17 118 65 195 127 0 0 8 0
0 0 0 0 0 0 128 130 5 204 195 127 0 0 69 0 0 0
0 0 0 0 1 0 0 0 195 127 0 0 120 233 255 59 196 127
0 0 224 217 117 65 195 127 0 0 224 214 117 65 195 127 0 0
120 250 117 65 195 127 0 0 9 0 0 0 0 0 0 0 176 213
4 204 195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0
195 127 0 0 240 66 118 65 195 127 0 0 160 218 117 65 195 127
0 0 224 215 117 65 195 127 0 0 40 9 118 65 195 127 0 0
10 0 0 0 0 0 0 0 16 46 4 204 195 127 0 0 69 0
0 0 0 0 0 0 1 0 0 0 195 127 0 0 208 59 118 65
195 127 0 0 0 0 0 0 0 0 0 0 96 218 117 65 195 127
0 0 56 251 117 65 195 127 0 0 11 0 0 0 0 0 0 0
32 137 3 204 195 127 0 0 85 0 0 0 0 0 0 0 1 0
0 0 195 127 0 0 224 214 117 65 195 127 0 0 144 59 118 65
195 127 0 0 160 214 117 65 195 127 0 0 136 62 118 65 195 127
0 0 12 0 0 0 0 0 0 0 48 104 15 204 195 127 0 0
32 104 15 204 195 127 0 0]
size in bytes = 512
debugger dtype = 11
shape = [128]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_2 attributes:
data (printed in uint8) = [206 239 74 ... 76 157 184]
size in bytes = 1024
debugger dtype = 11
shape = [4, 4, 4, 4]
-----------------------------------------------------------
tensor_info_3 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300
slot = 1
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_3 attributes:
data (printed in uint8) = [206 239 74 ... 76 157 184]
size in bytes = 1024
debugger dtype = 8
shape = [256]

View File

@@ -1,14 +0,0 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op318
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_1 attributes:
data (printed in uint8) = []
size in bytes = 0
debugger dtype = 0
shape = []

View File

@@ -1,20 +1,20 @@
 -----------------------------------------------------------
 watchpoint_hit for test_1 attributes:
-name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308
-slot = 0
+name = Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/Conv2D-op369
+slot = 1
 condition = 6
 watchpoint_id = 1
 parameter 0 name = param
 parameter 0 disabled = False
 parameter 0 value = 0.0
 parameter 0 hit = True
-parameter 0 actual_value = -2.0
+parameter 0 actual_value = -0.020966000854969025
 error code = 0
-device_id = 0
+rank_id = 0
 root_graph_id = 0
 -----------------------------------------------------------
 watchpoint_hit for test_4 attributes:
-name = Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc3.bias
+name = Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc2.bias
 slot = 0
 condition = 18
 watchpoint_id = 3
@@ -22,12 +22,12 @@ parameter 0 name = abs_mean_update_ratio_gt
 parameter 0 disabled = False
 parameter 0 value = 0.0
 parameter 0 hit = True
-parameter 0 actual_value = 1.793662034335766e-35
+parameter 0 actual_value = 1.0156775705209766
 parameter 1 name = epsilon
 parameter 1 disabled = True
 parameter 1 value = 0.0
 parameter 1 hit = False
 parameter 1 actual_value = 0.0
 error code = 0
-device_id = 0
+rank_id = 0
 root_graph_id = 0

View File

@@ -18,6 +18,8 @@ Utils for testing offline debugger.
 import filecmp
 import os
 import tempfile
+import numpy as np


 def compare_actual_with_expected(test_name):
@@ -28,6 +30,23 @@ def compare_actual_with_expected(test_name):
         os.remove(test_name + ".actual")
     return is_eq

-def skip_test():
-    """Skips the test."""
-    return True
+
+def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_list):
+    """Build dump file structure from tensor_list."""
+    temp_dir = tempfile.mkdtemp(prefix=net_name, dir="./")
+    for x, _ in enumerate(tensor_info_list):
+        slot = str(tensor_info_list[x].slot)
+        iteration = str(tensor_info_list[x].iteration)
+        rank_id = str(tensor_info_list[x].rank_id)
+        root_graph_id = str(tensor_info_list[x].root_graph_id)
+        is_output = str(tensor_info_list[x].is_output)
+        path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration)
+        os.makedirs(path, exist_ok=True)
+        if is_output == "True":
+            file = tempfile.mkstemp(prefix=tensor_name_list[x], suffix=".output." + slot +
+                                    ".DefaultFormat.npy", dir=path)
+        else:
+            file = tempfile.mkstemp(prefix=tensor_name_list[x], suffix=".input." + slot +
+                                    ".DefaultFormat.npy", dir=path)
+        full_path = file[1]
+        np.save(full_path, tensor_list[x])
+    return temp_dir
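
A short usage sketch of the new helper, with values borrowed from the base-stat test below (assumes mindspore.offline_debug is importable):

import shutil
import numpy as np
import mindspore.offline_debug.dbg_services as d
from dump_test_utils import build_dump_structure

info = d.TensorInfo(node_name="Default/Add-op4",
                    slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
# Creates ./Add<rand>/rank_0/Add/0/0/Add.Add-op4.0.0.<rand>.output.0.DefaultFormat.npy
temp_dir = build_dump_structure(["Add.Add-op4.0.0."],
                                [np.array([1.0, 2.0], np.float32)],
                                "Add", [info])
shutil.rmtree(temp_dir)  # the tests clean up the tree the same way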

View File

@@ -0,0 +1,125 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor test script for offline debugger APIs.
"""
import shutil
import numpy as np
import mindspore.offline_debug.dbg_services as d
from dump_test_utils import compare_actual_with_expected, build_dump_structure
GENERATE_GOLDEN = False
test_name = "sync_read_tensors"
def test_sync_trans_false_read_tensors():
    # input tensor with zero slot
    tensor1 = np.array([32.0, 4096.0], np.float32)
    name1 = "CudnnUniformReal.CudnnUniformReal-op391.0.0."
    info1 = d.TensorInfo(node_name="Default/CudnnUniformReal-op391",
                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=False)
    # input tensor with non-zero slot
    tensor2 = np.array([[0.0, 32.0, 4096.0], [4.5, 6.78, -11.0]], np.float32)
    name2 = "ReluGradV2.ReluGradV2-op406.0.0."
    info2 = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406",
                         slot=1, iteration=1, rank_id=0, root_graph_id=0, is_output=False)
    # output tensor with zero slot
    tensor3 = np.array([[[7.963e-05, 4.750e-05, 2.587e-05],
                         [8.339e-05, 5.025e-05, 2.694e-05],
                         [8.565e-05, 5.156e-05, 2.658e-05]],
                        [[8.017e-05, 4.804e-05, 2.724e-05],
                         [8.392e-05, 5.126e-05, 2.843e-05],
                         [8.613e-05, 5.257e-05, 2.819e-05]],
                        [[7.617e-05, 3.827e-05, 5.305e-06],
                         [7.474e-05, 3.719e-05, 3.040e-06],
                         [7.081e-05, 3.338e-05, -2.086e-06]]], np.float32)
    name3 = "Conv2DBackpropFilter.Conv2DBackpropFilter-op424.0.0."
    info3 = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet/conv5-Conv2d/"
                         "gradConv2D/Conv2DBackpropFilter-op424",
                         slot=0, iteration=1, rank_id=0, root_graph_id=0, is_output=True)
    # output tensor with non-zero slot
    tensor4 = np.array([2705090541, 1099111076, 4276637100, 3586562544, 890060077, 1869062900], np.float32)
    name4 = "ReLUV2.ReLUV2-op381.0.0."
    info4 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op381",
                         slot=1, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
    # inf tensor
    inf_tensor = np.array([[1., -np.inf, np.inf, -np.inf, np.inf], [np.inf, 1., -np.inf, np.inf, np.inf]], np.float32)
    inf_name = "Reciprocal.Reciprocal-op3.0.0."
    inf_info = d.TensorInfo(node_name="Default/Reciprocal-op3",
                            slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)

    tensor_name = [name1, name2, name3, name4]
    tensor_list = [tensor1, tensor2, tensor3, tensor4]
    tensor_info = [info1, info2, info3, info4]
    temp_dir = build_dump_structure(tensor_name, tensor_list, "alexnet", tensor_info)
    inf_dir = build_dump_structure([inf_name], [inf_tensor], "Inf", [inf_info])

    debugger_backend1 = d.DbgServices(dump_file_path=temp_dir)
    _ = debugger_backend1.initialize(net_name="alexnet", is_sync_mode=True)
    tensor_data = debugger_backend1.read_tensors(tensor_info)

    debugger_backend2 = d.DbgServices(dump_file_path=inf_dir)
    _ = debugger_backend2.initialize(net_name="Inf", is_sync_mode=True)
    tensor_data_inf = debugger_backend2.read_tensors([inf_info])

    tensor_info.extend([inf_info])
    tensor_data.extend(tensor_data_inf)
    shutil.rmtree(temp_dir)
    shutil.rmtree(inf_dir)
    print_read_tensors(tensor_info, tensor_data)
    if not GENERATE_GOLDEN:
        assert compare_actual_with_expected(test_name)


def print_read_tensors(tensor_info, tensor_data):
    """Print read tensors."""
    if GENERATE_GOLDEN:
        f_write = open(test_name + ".expected", "w")
    else:
        f_write = open(test_name + ".actual", "w")
    for x, _ in enumerate(tensor_info):
        f_write.write(
            "-----------------------------------------------------------\n")
        f_write.write("tensor_info_" + str(x+1) + " attributes:\n")
        f_write.write("node name = " + tensor_info[x].node_name + "\n")
        f_write.write("slot = " + str(tensor_info[x].slot) + "\n")
        f_write.write("iteration = " + str(tensor_info[x].iteration) + "\n")
        f_write.write("rank_id = " + str(tensor_info[x].rank_id) + "\n")
        f_write.write("root_graph_id = " +
                      str(tensor_info[x].root_graph_id) + "\n")
        f_write.write("is_output = " +
                      str(tensor_info[x].is_output) + "\n")
        f_write.write("\n")
        f_write.write("tensor_data_" + str(x+1) + " attributes:\n")
        f_write.write("data (printed in uint8) = " + str(np.frombuffer(
            tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + "\n")
        py_byte_size = len(tensor_data[x].data_ptr)
        c_byte_size = tensor_data[x].data_size
        if c_byte_size != py_byte_size:
            f_write.write("The python byte size of " + str(py_byte_size) +
                          " does not match the C++ byte size of " + str(c_byte_size) + "\n")
        f_write.write("size in bytes = " +
                      str(tensor_data[x].data_size) + "\n")
        f_write.write("debugger dtype = " + str(tensor_data[x].dtype) + "\n")
        f_write.write("shape = " + str(tensor_data[x].shape) + "\n")
    f_write.close()


if __name__ == "__main__":
    test_sync_trans_false_read_tensors()
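
Since compare_actual_with_expected is skipped when GENERATE_GOLDEN is set, the .expected golden checked in above can be regenerated by flipping the module flag before running the test; a hedged sketch (module name hypothetical):

import sync_trans_false_read_tensors as t  # hypothetical module name
t.GENERATE_GOLDEN = True                   # writes sync_read_tensors.expected
t.test_sync_trans_false_read_tensors()     # instead of asserting against it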

View File

@@ -16,12 +16,10 @@
 Read tensor base and statistics test script for offline debugger APIs.
 """
-import tempfile
-import os
 import shutil
 import numpy as np
 import mindspore.offline_debug.dbg_services as d
-from dump_test_utils import compare_actual_with_expected
+from dump_test_utils import compare_actual_with_expected, build_dump_structure

 GENERATE_GOLDEN = False
 test_name = "sync_read_tensors_base_stat"
@@ -30,12 +28,27 @@ test_name = "sync_read_tensors_base_stat"

 def test_sync_read_tensors_base_stat():

     value_tensor = np.array([[7.5, 8.56, -9.78], [10.0, -11.0, 0.0]], np.float32)
-    inf_tensor = np.array([[1., -np.inf, np.inf, -np.inf, np.inf], [np.inf, 1., -np.inf, np.inf, np.inf]], np.float32)
-    nan_tensor = np.array([-2.1754317, 1.9901361, np.nan, np.nan, -1.8091936], np.float32)
+    name1 = "Add.Add-op4.0.0."
+    info1 = d.TensorInfo(node_name="Default/Add-op4",
+                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)

-    value_path = build_dump_file_structure(value_tensor, "Add", "Add.Add-op4.0.0.")
-    inf_path = build_dump_file_structure(inf_tensor, "Inf", "Reciprocal.Reciprocal-op3.0.0.")
-    nan_path = build_dump_file_structure(nan_tensor, "Nan", "ReduceMean.ReduceMean-op92.0.0.")
+    inf_tensor = np.array([[1., -np.inf, np.inf, -np.inf, np.inf], [np.inf, 1., -np.inf, np.inf, np.inf]], np.float32)
+    name2 = "Reciprocal.Reciprocal-op3.0.0."
+    info2 = d.TensorInfo(node_name="Default/Reciprocal-op3",
+                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
+
+    nan_tensor = np.array([-2.1754317, 1.9901361, np.nan, np.nan, -1.8091936], np.float32)
+    name3 = "ReduceMean.ReduceMean-op92.0.0."
+    info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-MockModel/ReduceMean-op92",
+                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
+
+    tensor_info_1 = [info1]
+    tensor_info_2 = [info2]
+    tensor_info_3 = [info3]
+    tensor_info = [info1, info2, info3]
+    value_path = build_dump_structure([name1], [value_tensor], "Add", tensor_info_1)
+    inf_path = build_dump_structure([name2], [inf_tensor], "Inf", tensor_info_2)
+    nan_path = build_dump_structure([name3], [nan_tensor], "Nan", tensor_info_3)

     debugger_backend = d.DbgServices(
         dump_file_path=value_path, verbose=True)
@@ -55,19 +68,6 @@ def test_sync_read_tensors_base_stat():
     _ = debugger_backend_3.initialize(
         net_name="Nan", is_sync_mode=True)

-    info1 = d.TensorInfo(node_name="Default/Add-op4",
-                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
-    info2 = d.TensorInfo(node_name="Default/Reciprocal-op3",
-                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
-    info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-MockModel/ReduceMean-op92",
-                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
-
-    tensor_info_1 = [info1]
-    tensor_info_2 = [info2]
-    tensor_info_3 = [info3]
-    tensor_info = [info1, info2, info3]
-
     tensor_base_data_list = debugger_backend.read_tensor_base(tensor_info_1)
     tensor_base_data_list_2 = debugger_backend_2.read_tensor_base(tensor_info_2)
     tensor_base_data_list.extend(tensor_base_data_list_2)
@@ -84,21 +84,10 @@ def test_sync_read_tensors_base_stat():
     shutil.rmtree(inf_path)
     shutil.rmtree(nan_path)
     print_read_tensors(tensor_info, tensor_base_data_list, tensor_stat_data_list)
-    assert compare_actual_with_expected(test_name)
+    if not GENERATE_GOLDEN:
+        assert compare_actual_with_expected(test_name)

-def build_dump_file_structure(tensor_array, net_name, tensor_name):
-    debugger_temp_dir = tempfile.mkdtemp(prefix=net_name, dir="./")
-    print(debugger_temp_dir)
-    path = os.path.join(debugger_temp_dir, "rank_0", net_name, "0", "0")
-    print(path)
-    os.makedirs(path, exist_ok=True)
-    file = tempfile.mkstemp(prefix=tensor_name, suffix=".output.0.DefaultFormat.npy", dir=path)
-    full_path = file[1]
-    np.save(full_path, tensor_array)
-    return debugger_temp_dir

 def print_read_tensors(tensor_info, tensor_base_data_list, tensor_stat_data_list):
     """Print read tensors info."""
     if GENERATE_GOLDEN:

View File

@@ -16,39 +16,54 @@
 Read tensor test script for offline debugger APIs.
 """
-import mindspore.offline_debug.dbg_services as d
+import shutil
 import numpy as np
-from dump_test_utils import compare_actual_with_expected, skip_test
+import mindspore.offline_debug.dbg_services as d
+from dump_test_utils import compare_actual_with_expected, build_dump_structure

 GENERATE_GOLDEN = False
-test_name = "sync_trans_true_read_tensors_nonexist_node"
+test_name = "sync_read_tensors_nonexist_node"


 def test_sync_trans_read_tensors_nonexist_node():
-    if skip_test():
-        return
+    tensor1 = np.array([32.0, 4096.0], np.float32)
+    name1 = "CudnnUniformReal.CudnnUniformReal-op391.0.0."
+    info1 = d.TensorInfo(node_name="Default/CudnnUniformReal-op391",
+                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=False)
+    tensor2 = np.array([[0.0, 32.0, 4096.0], [4.5, 6.78, -11.0]], np.float32)
+    name2 = "ReluGradV2.ReluGradV2-op406.0.0."
+    info2 = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406",
+                         slot=1, iteration=1, rank_id=0, root_graph_id=0, is_output=False)
+    # non-existing tensor with wrong op name
+    info3 = d.TensorInfo(node_name="Default/CudnnUniformReal-op390",
+                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=False)

-    debugger_backend = d.DbgServices(
-        dump_file_path="../data/dump/gpu_dumps/sync_trans_true/alexnet")
+    # non-existing tensor with wrong iteration number
+    info4 = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406",
+                         slot=1, iteration=0, rank_id=0, root_graph_id=0, is_output=False)
+    tensor_name = [name1, name2]
+    tensor_create_info = [info1, info2]
+    tensor_list = [tensor1, tensor2]
+    temp_dir = build_dump_structure(tensor_name, tensor_list, "alexnet", tensor_create_info)
+    tensor_check_info = [info3, info4]

+    debugger_backend = d.DbgServices(dump_file_path=temp_dir)
     _ = debugger_backend.initialize(
-        net_name="Network Name goes here!", is_sync_mode=True)
+        net_name="alexnet", is_sync_mode=True)

-    # non-existing tensor with wrong op name
-    info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op318",
-                         slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)
-    tensor_info = [info1]
-
-    tensor_data = debugger_backend.read_tensors(tensor_info)
+    tensor_data = debugger_backend.read_tensors(tensor_check_info)

     # Check the length of tensor list
-    assert len(tensor_info) == 1
-    assert len(tensor_data) == 1
+    assert len(tensor_check_info) == 2
+    assert len(tensor_data) == 2

-    print_read_tensors(tensor_info, tensor_data)
-    assert compare_actual_with_expected(test_name)
+    print_read_tensors(tensor_check_info, tensor_data)
+    shutil.rmtree(temp_dir)
+    if not GENERATE_GOLDEN:
+        assert compare_actual_with_expected(test_name)


 def print_read_tensors(tensor_info, tensor_data):
@@ -65,11 +80,11 @@ def print_read_tensors(tensor_info, tensor_data):
         f_write.write("node name = " + tensor_info[x].node_name + "\n")
         f_write.write("slot = " + str(tensor_info[x].slot) + "\n")
         f_write.write("iteration = " + str(tensor_info[x].iteration) + "\n")
-        f_write.write("device_id = " + str(tensor_info[x].device_id) + "\n")
+        f_write.write("rank_id = " + str(tensor_info[x].rank_id) + "\n")
         f_write.write("root_graph_id = " +
                       str(tensor_info[x].root_graph_id) + "\n")
-        f_write.write("is_parameter = " +
-                      str(tensor_info[x].is_parameter) + "\n")
+        f_write.write("is_output = " +
+                      str(tensor_info[x].is_output) + "\n")
         f_write.write("\n")
         f_write.write("tensor_data_" + str(x + 1) + " attributes:\n")
         f_write.write("data (printed in uint8) = " + str(np.frombuffer(

View File

@@ -1,92 +0,0 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
import numpy as np
from dump_test_utils import compare_actual_with_expected, skip_test
GENERATE_GOLDEN = False
test_name = "sync_trans_false_read_tensors"
def test_sync_trans_false_read_tensors():
    if skip_test():
        return

    debugger_backend = d.DbgServices(
        dump_file_path="../data/dump/gpu_dumps/sync_trans_false/alexnet")

    _ = debugger_backend.initialize(
        net_name="alexnet", is_sync_mode=True)

    # parameter
    info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias",
                         slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True)
    # output tensor with zero slot
    info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308",
                         slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)
    # output tensor with non-zero slot
    info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300",
                         slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)

    tensor_info = [info1, info2, info3]
    tensor_data = debugger_backend.read_tensors(tensor_info)

    print_read_tensors(tensor_info, tensor_data)
    assert compare_actual_with_expected(test_name)


def print_read_tensors(tensor_info, tensor_data):
    """Print read tensors."""
    if GENERATE_GOLDEN:
        f_write = open(test_name + ".expected", "w")
    else:
        f_write = open(test_name + ".actual", "w")
    for x, _ in enumerate(tensor_info):
        f_write.write(
            "-----------------------------------------------------------\n")
        f_write.write("tensor_info_" + str(x+1) + " attributes:\n")
        f_write.write("node name = " + tensor_info[x].node_name + "\n")
        f_write.write("slot = " + str(tensor_info[x].slot) + "\n")
        f_write.write("iteration = " + str(tensor_info[x].iteration) + "\n")
        f_write.write("device_id = " + str(tensor_info[x].device_id) + "\n")
        f_write.write("root_graph_id = " +
                      str(tensor_info[x].root_graph_id) + "\n")
        f_write.write("is_parameter = " +
                      str(tensor_info[x].is_parameter) + "\n")
        f_write.write("\n")
        f_write.write("tensor_data_" + str(x+1) + " attributes:\n")
        f_write.write("data (printed in uint8) = " + str(np.frombuffer(
            tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + "\n")
        py_byte_size = len(tensor_data[x].data_ptr)
        c_byte_size = tensor_data[x].data_size
        if c_byte_size != py_byte_size:
            f_write.write("The python byte size of " + str(py_byte_size) +
                          " does not match the C++ byte size of " + str(c_byte_size) + "\n")
        f_write.write("size in bytes = " +
                      str(tensor_data[x].data_size) + "\n")
        f_write.write("debugger dtype = " + str(tensor_data[x].dtype) + "\n")
        f_write.write("shape = " + str(tensor_data[x].shape) + "\n")
    f_write.close()


if __name__ == "__main__":
    test_sync_trans_false_read_tensors()

View File

@@ -1,92 +0,0 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
import numpy as np
from dump_test_utils import compare_actual_with_expected, skip_test
GENERATE_GOLDEN = False
test_name = "sync_trans_true_read_tensors"
def test_sync_trans_read_tensors():
    if skip_test():
        return

    debugger_backend = d.DbgServices(
        dump_file_path="../data/dump/gpu_dumps/sync_trans_true/alexnet")

    _ = debugger_backend.initialize(
        net_name="Network Name goes here!", is_sync_mode=True)

    # parameter
    info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias",
                         slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True)
    # output tensor with zero slot
    info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308",
                         slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)
    # output tensor with non-zero slot
    info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300",
                         slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)

    tensor_info = [info1, info2, info3]
    tensor_data = debugger_backend.read_tensors(tensor_info)

    print_read_tensors(tensor_info, tensor_data)
    assert compare_actual_with_expected(test_name)


def print_read_tensors(tensor_info, tensor_data):
    """Print read tensors."""
    if GENERATE_GOLDEN:
        f_write = open(test_name + ".expected", "w")
    else:
        f_write = open(test_name + ".actual", "w")
    for x, _ in enumerate(tensor_info):
        f_write.write(
            "-----------------------------------------------------------\n")
        f_write.write("tensor_info_" + str(x+1) + " attributes:\n")
        f_write.write("node name = " + tensor_info[x].node_name + "\n")
        f_write.write("slot = " + str(tensor_info[x].slot) + "\n")
        f_write.write("iteration = " + str(tensor_info[x].iteration) + "\n")
        f_write.write("device_id = " + str(tensor_info[x].device_id) + "\n")
        f_write.write("root_graph_id = " +
                      str(tensor_info[x].root_graph_id) + "\n")
        f_write.write("is_parameter = " +
                      str(tensor_info[x].is_parameter) + "\n")
        f_write.write("\n")
        f_write.write("tensor_data_" + str(x+1) + " attributes:\n")
        f_write.write("data (printed in uint8) = " + str(np.frombuffer(
            tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + "\n")
        py_byte_size = len(tensor_data[x].data_ptr)
        c_byte_size = tensor_data[x].data_size
        if c_byte_size != py_byte_size:
            f_write.write("The python byte size of " + str(py_byte_size) +
                          " does not match the C++ byte size of " + str(c_byte_size) + "\n")
        f_write.write("size in bytes = " +
                      str(tensor_data[x].data_size) + "\n")
        f_write.write("debugger dtype = " + str(tensor_data[x].dtype) + "\n")
        f_write.write("shape = " + str(tensor_data[x].shape) + "\n")
    f_write.close()


if __name__ == "__main__":
    test_sync_trans_read_tensors()

View File

@@ -16,39 +16,71 @@
 Watchpoints test script for offline debugger APIs.
 """
+import shutil
+import numpy as np
 import mindspore.offline_debug.dbg_services as d
-from dump_test_utils import compare_actual_with_expected, skip_test
+from dump_test_utils import compare_actual_with_expected, build_dump_structure

 GENERATE_GOLDEN = False
-test_name = "sync_trans_false_watchpoints"
+test_name = "sync_watchpoints"


 def test_sync_trans_false_watchpoints():
-    if skip_test():
-        return

     if GENERATE_GOLDEN:
         f_write = open(test_name + ".expected", "w")
     else:
         f_write = open(test_name + ".actual", "w")

-    debugger_backend = d.DbgServices(
-        dump_file_path="../data/dump/gpu_dumps/sync_trans_false/alexnet")
+    name1 = "Conv2D.Conv2D-op369.0.0."
+    tensor1 = np.array([[[-1.2808e-03, 7.7629e-03, 1.9241e-02],
+                         [-1.3931e-02, 8.9359e-04, -1.1520e-02],
+                         [-6.3248e-03, 1.8749e-03, 1.0132e-02]],
+                        [[-2.5520e-03, -6.0005e-03, -5.1918e-03],
+                         [-2.7866e-03, 2.5487e-04, 8.4782e-04],
+                         [-4.6310e-03, -8.9111e-03, -8.1778e-05]],
+                        [[1.3914e-03, 6.0844e-04, 1.0643e-03],
+                         [-2.0966e-02, -1.2865e-03, -1.8692e-03],
+                         [-1.6647e-02, 1.0233e-03, -4.1313e-03]]], np.float32)
+    info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/Conv2D-op369",
+                         slot=1, iteration=2, rank_id=0, root_graph_id=0, is_output=False)

-    _ = debugger_backend.initialize(
-        net_name="Alexnet", is_sync_mode=True)
+    name2 = "Parameter.fc2.bias.0.0."
+    tensor2 = np.array([-5.0167350e-06, 1.2509107e-05, -4.3148934e-06, 8.1415592e-06,
+                        2.1177532e-07, 2.9952851e-06], np.float32)
+    info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc2.bias",
+                         slot=0, iteration=2, rank_id=0, root_graph_id=0, is_output=True)
+    tensor3 = np.array([2.9060817e-07, -5.1009415e-06, -2.8662325e-06, 2.6036503e-06,
+                        -5.1546101e-07, 6.0798648e-06], np.float32)
+    info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc2.bias",
+                         slot=0, iteration=3, rank_id=0, root_graph_id=0, is_output=True)
+
+    name3 = "Parameter.fc3.bias.0.0."
+    tensor4 = np.array([2.2930422e-04, -3.6369250e-04, 7.1337068e-04, -1.9567949e-05], np.float32)
+    info4 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc3.bias",
+                         slot=0, iteration=2, rank_id=0, root_graph_id=0, is_output=True)
+
+    tensor_info = [info1, info2, info3, info4]
+    tensor_name = [name1, name2, name2, name3]
+    tensor_list = [tensor1, tensor2, tensor3, tensor4]
+    temp_dir = build_dump_structure(tensor_name, tensor_list, "alexnet", tensor_info)
+
+    debugger_backend = d.DbgServices(dump_file_path=temp_dir)
+    _ = debugger_backend.initialize(net_name="alexnet", is_sync_mode=True)

     # NOTES:
     # -> watch_condition=6 is MIN_LT
     # -> watch_condition=18 is CHANGE_TOO_LARGE
     # -> watch_condition=20 is NOT_CHANGE

     # test 1: watchpoint set and hit (watch_condition=6)
     param1 = d.Parameter(name="param", disabled=False, value=0.0)
     _ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6,
-                                        check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/"
-                                                         "Conv2D-op308":
-                                                         {"device_id": [0], "root_graph_id": [0], "is_parameter": False
+                                        check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/"
+                                                         "Conv2D-op369":
+                                                         {"rank_id": [0], "root_graph_id": [0], "is_output": False
                                                           }}, parameter_list=[param1])

     watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2)
@@ -66,9 +98,9 @@ def test_sync_trans_false_watchpoints():
     # test 3: watchpoint set and not hit, then remove
     param2 = d.Parameter(name="param", disabled=False, value=-1000.0)
     _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6,
-                                        check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/"
-                                                         "Conv2D-op308":
-                                                         {"device_id": [0], "root_graph_id": [0], "is_parameter": False
+                                        check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/"
+                                                         "Conv2D-op369":
+                                                         {"rank_id": [0], "root_graph_id": [0], "is_output": False
                                                           }}, parameter_list=[param2])

     watchpoint_hits_test_3 = debugger_backend.check_watchpoints(iteration=2)
@@ -83,18 +115,19 @@ def test_sync_trans_false_watchpoints():
     param_epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0)
     _ = debugger_backend.add_watchpoint(watchpoint_id=3, watch_condition=18,
                                         check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/"
-                                                         "Parameter[6]_11/fc3.bias":
-                                                         {"device_id": [0], "root_graph_id": [0], "is_parameter": True
+                                                         "Parameter[6]_11/fc2.bias":
+                                                         {"rank_id": [0], "root_graph_id": [0], "is_output": True
                                                           }}, parameter_list=[param_abs_mean_update_ratio_gt,
                                                                               param_epsilon])

     watchpoint_hits_test_4 = debugger_backend.check_watchpoints(iteration=3)

     if len(watchpoint_hits_test_4) != 1:
-        f_write.write(
-            "ERROR -> test 4: watchpoint weight change set but not hit just once\n")
+        f_write.write("ERROR -> test 4: watchpoint weight change set but not hit just once\n")
     print_watchpoint_hits(watchpoint_hits_test_4, 4, f_write)

     f_write.close()
-    assert compare_actual_with_expected(test_name)
+    shutil.rmtree(temp_dir)
+    if not GENERATE_GOLDEN:
+        assert compare_actual_with_expected(test_name)


 def print_watchpoint_hits(watchpoint_hits, test_id, f_write):
@@ -104,7 +137,7 @@ def print_watchpoint_hits(watchpoint_hits, test_id, f_write):
             "-----------------------------------------------------------\n")
         f_write.write("watchpoint_hit for test_%u attributes:" %
                       test_id + "\n")
-        f_write.write("name = " + str(watchpoint_hits[x].name) + "\n")
+        f_write.write("name = " + watchpoint_hits[x].name + "\n")
         f_write.write("slot = " + str(watchpoint_hits[x].slot) + "\n")
         f_write.write("condition = " +
                       str(watchpoint_hits[x].condition) + "\n")
@@ -123,8 +156,8 @@ def print_watchpoint_hits(watchpoint_hits, test_id, f_write):
                       str(watchpoint_hits[x].parameters[p].actual_value) + "\n")
         f_write.write("error code = " +
                       str(watchpoint_hits[x].error_code) + "\n")
-        f_write.write("device_id = " +
-                      str(watchpoint_hits[x].device_id) + "\n")
+        f_write.write("rank_id = " +
+                      str(watchpoint_hits[x].rank_id) + "\n")
         f_write.write("root_graph_id = " +
                       str(watchpoint_hits[x].root_graph_id) + "\n")
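
For reference, test 1 hits because MIN_LT (condition 6) compares a tensor's minimum against the parameter value: the minimum of tensor1 above is -2.0966e-02, below the 0.0 threshold, and it is exactly the actual_value recorded in the updated golden file. A standalone check with plain NumPy:

import numpy as np

# Values from the third block of tensor1 above, including its global minimum.
tensor1_slice = np.array([1.3914e-03, 6.0844e-04, 1.0643e-03,
                          -2.0966e-02, -1.2865e-03, -1.8692e-03,
                          -1.6647e-02, 1.0233e-03, -4.1313e-03], np.float32)
assert tensor1_slice.min() < 0.0  # MIN_LT fires against param value 0.0
print(tensor1_slice.min())        # -0.020966, matching the golden actual_value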