Fix the stream_id error for async dump: only add op debug info to OpMappingInfo when op debug has been registered

Revert "takedown dump related testcases to ensure gate stability"

This reverts commit 24321ded4d.
This commit is contained in:
maning202007 2022-06-22 17:44:13 +08:00
parent 1f63418d09
commit d3f2d391a7
6 changed files with 41 additions and 25 deletions

View File

@ -290,6 +290,10 @@ void DataDumper::ConstructDumpTask(NotNull<const CNodePtr &> kernel, NotNull<aic
}
void DataDumper::SetOpDebugMappingInfo(const NotNull<aicpu::dump::OpMappingInfo *> dump_info) const {
if (!is_op_debug_) {
MS_LOG(INFO) << "[DataDump] Not op debug mode, no need to set OpDebugMappingInfo.";
return;
}
MS_LOG(INFO) << "[DataDump] Add op debug info to OpMappingInfo, task id = " << debug_task_id_
<< ", stream id = " << debug_stream_id_;
aicpu::dump::Task task;
@ -333,6 +337,7 @@ void DataDumper::OpDebugRegister() {
return;
}
is_op_debug_ = true;
int64_t value = 0;
rtError_t rt_ret = rtGetRtCapability(FEATURE_TYPE_MEMORY, MEMORY_INFO_TS_LIMITED, &value);
if (rt_ret != RT_ERROR_NONE) {

View File

@ -40,15 +40,16 @@ class DataDumper {
public:
DataDumper(const session::KernelGraph *kernel_graph, NotNull<std::function<void *()>> model_handle)
: model_handle_(model_handle),
debug_task_id_(-1),
debug_stream_id_(-1),
debug_task_id_(0U),
debug_stream_id_(0U),
op_debug_buffer_addr_(nullptr),
op_debug_dump_args_(nullptr),
load_flag_(false),
dev_load_mem_(nullptr),
dev_unload_mem_(nullptr),
graph_id_(UINT32_MAX),
kernel_graph_(kernel_graph) {}
kernel_graph_(kernel_graph),
is_op_debug_(false) {}
~DataDumper();
void set_runtime_info(const std::map<std::string, std::shared_ptr<RuntimeInfo>> &runtime_info) {
runtime_info_map_ = runtime_info;
@ -88,6 +89,7 @@ class DataDumper {
std::vector<std::string> dump_kernel_names_;
const session::KernelGraph *kernel_graph_;
std::map<std::string, std::shared_ptr<RuntimeInfo>> runtime_info_map_;
bool is_op_debug_;
};
} // namespace ascend
} // namespace device

View File

@ -70,7 +70,7 @@ def run_async_dump(test_name):
del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level1
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -84,7 +84,7 @@ def test_async_dump_npy():
run_async_dump("test_async_dump_npy")
@pytest.mark.level1
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard

View File

@ -75,7 +75,7 @@ def run_multi_layer_train(is_set_dump):
train_network(inputs, label)
@pytest.mark.level1
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -118,7 +118,7 @@ def test_ascend_cell_dump():
del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level1
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -150,7 +150,7 @@ def test_ascend_not_cell_dump():
del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level1
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -179,7 +179,7 @@ def test_ascend_cell_empty_dump():
del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level1
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -222,7 +222,7 @@ class OperateSymbolNet(Cell):
return x
@pytest.mark.level1
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard

View File

@ -14,6 +14,7 @@
# ============================================================================
import os
import sys
import json
import tempfile
import time
import shutil
@ -205,7 +206,7 @@ class ReluReduceMeanDenseRelu(Cell):
return x_
@pytest.mark.level1
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -345,12 +346,20 @@ def run_overflow_dump():
overflow_stream_id = overflow_file_name[1][overflow_third_dot_pos+1:overflow_fourth_dot_pos]
assert output_task_id == overflow_task_id
assert output_stream_id == overflow_stream_id
# Convert the overflow file into json format
convert_tool = '/usr/local/Ascend/latest/tools/operator_cmp/compare/msaccucmp.py'
convert_to_json_cmd = 'python {0} convert -d {1} -out {2}'.format(convert_tool, overflow_path, exe_graph_path)
_ = os.system(convert_to_json_cmd)
overflow_json_path = overflow_path + '.output.0.json'
for _ in range(3):
if not os.path.exists(overflow_json_path):
time.sleep(2)
# check if overflow dump file contains same task and stream id as file name
with open(overflow_path, 'rb') as f:
f.seek(321, 0)
raw_data = f.read()
task_id_infile = int.from_bytes(raw_data[24:25], 'little')
stream_id_infile = int.from_bytes(raw_data[16:17], 'little')
with open(overflow_json_path) as f:
file_content = json.load(f)
ai_core = file_content["AI Core"]
task_id_infile = ai_core["task_id"]
stream_id_infile = ai_core["stream_id"]
assert output_task_id == str(task_id_infile)
assert output_stream_id == str(stream_id_infile)
del os.environ['MINDSPORE_DUMP_CONFIG']
@ -376,7 +385,7 @@ def run_not_overflow_dump():
assert not os.path.exists(exe_graph_path)
del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level1
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -415,13 +424,13 @@ def check_statistic_dump(dump_file_path):
num_tensors = len(stats)
assert num_tensors == 3
for tensor in stats:
if (tensor['IO'] == 'input' and tensor['Slot'] == 0):
if tensor['IO'] == 'input' and tensor['Slot'] == 0:
assert tensor['Min Value'] == '1'
assert tensor['Max Value'] == '6'
elif (tensor['IO'] == 'input' and tensor['Slot'] == 1):
elif tensor['IO'] == 'input' and tensor['Slot'] == 1:
assert tensor['Min Value'] == '7'
assert tensor['Max Value'] == '12'
elif (tensor['IO'] == 'output' and tensor['Slot'] == 0):
elif tensor['IO'] == 'output' and tensor['Slot'] == 0:
assert tensor['Min Value'] == '8'
assert tensor['Max Value'] == '18'
@ -548,7 +557,7 @@ def test_stat_dump_nulls():
assert output['Avg Value'] == 'null'
@pytest.mark.level1
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -581,7 +590,7 @@ def test_ascend_statistic_dump_kernel_by_kernel():
del os.environ['GRAPH_OP_RUN']
@pytest.mark.level1
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -596,7 +605,7 @@ def test_ascend_tensor_dump():
run_saved_data_dump_test('test_async_dump', 'tensor')
@pytest.mark.level1
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@ -642,7 +651,7 @@ class ConstantNet(nn.Cell):
return self.relu(construct_tensor(ops.shape(x_)))
@pytest.mark.level1
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard

View File

@ -144,7 +144,7 @@ def test_ascend_kernel_by_kernel_trans_false():
del os.environ['GRAPH_OP_RUN']
@pytest.mark.level1
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard