!19639 [r1.3] Dump failed when there are duplicate fullname_with_scope in graph

Merge pull request !19639 from caifubi/r1.3
i-robot 2021-07-09 01:18:49 +00:00 committed by Gitee
commit 1328bc5fba
26 changed files with 57 additions and 27 deletions
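
In short: distinct graph nodes can legally share the same fullname_with_scope, but that string was used both as the runtime task name and as the key of runtime_info_map_, so colliding tasks were lost and the async dump could not find their data. This PR adds AnfNode::UniqueName() (fullname_with_scope plus the node's UniqueId), threads it through KernelMod as unique_name_ for task naming, keeps the original name as fullname_ for matching the dump JSON configuration, and skips monad nodes during dump. Below is a stand-alone illustration of the naming scheme only; the values and the MakeUniqueName helper are invented for the example, and the real implementation is the one-line AnfNode::UniqueName() in the anf.h hunk at the end.

// Stand-alone illustration of the naming scheme (invented values, not
// MindSpore code). The real implementation is AnfNode::UniqueName() below.
#include <iostream>
#include <string>

std::string MakeUniqueName(const std::string &fullname_with_scope, const std::string &unique_id) {
  return fullname_with_scope + "_" + unique_id;
}

int main() {
  // Two distinct nodes may end up with the same fullname_with_scope...
  const std::string fullname = "Default/network/Add-op1";
  // ...but appending each node's own id keeps their task names apart.
  std::cout << MakeUniqueName(fullname, "1024") << "\n";  // Default/network/Add-op1_1024
  std::cout << MakeUniqueName(fullname, "1025") << "\n";  // Default/network/Add-op1_1025
  return 0;
}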


@ -194,7 +194,7 @@ std::vector<TaskInfoPtr> AicpuOpKernelMod::GenTask(const std::vector<AddressPtr>
}
AicpuTaskInfoPtr task_info_ptr = std::make_shared<mindspore::ge::model_runner::AicpuTaskInfo>(
- kernel_name_, stream_id, node_so_, node_name_, node_def_str_, ext_info_, input_data_addrs, output_data_addrs,
+ unique_name_, stream_id, node_so_, node_name_, node_def_str_, ext_info_, input_data_addrs, output_data_addrs,
NeedDump());
MS_LOG(INFO) << "AicpuOpKernelMod GenTask end";


@ -119,7 +119,7 @@ std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &in
MS_LOG(DEBUG) << "The block_dim is:" << block_dim;
TbeTaskInfoPtr task_info_ptr = std::make_shared<mindspore::ge::model_runner::TbeTaskInfo>(
- kernel_name_, stream_id, stub_func, block_dim, args, args_size, sm_desc, binary, binary_size, meta_data,
+ unique_name_, stream_id, stub_func, block_dim, args, args_size, sm_desc, binary, binary_size, meta_data,
input_data_addrs, output_data_addrs, workspace_addrs, NeedDump());
return {task_info_ptr};
}


@ -34,7 +34,8 @@ class AscendKernelMod : public KernelMod {
uint32_t stream_id() { return stream_id_; }
virtual bool NeedDump() {
const auto &dump_json = DumpJsonParser::GetInstance();
- return dump_json.NeedDump(kernel_name_) && dump_json.async_dump_enabled() && dump_json.op_debug_mode() == 0;
+ return dump_json.NeedDump(fullname_) && dump_json.async_dump_enabled() && dump_json.op_debug_mode() == 0 &&
+ !is_monad_;
}
protected:


@ -52,7 +52,7 @@ void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
reduce_type_ = kReduceAny;
reduce_func_ = [](const T *input, size_t pos, T *out) { *out |= input[pos]; };
} else {
MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name_ << " for bool.";
MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << fullname_ << " for bool.";
}
} else {
if (kernel_name == "ReduceMax") {


@ -234,7 +234,7 @@ std::vector<TaskInfoPtr> HcclKernel::GenTask(const std::vector<AddressPtr> &inpu
}
results.emplace_back(
- std::make_shared<HcclTaskInfo>(kernel_name_, stream_id, hccl::HcclAdapter::GetHcclType(anf_node), input_data_addr,
+ std::make_shared<HcclTaskInfo>(unique_name_, stream_id, hccl::HcclAdapter::GetHcclType(anf_node), input_data_addr,
output_data_addr, workspace_addr, task.workspace_size, task.stream_num,
private_def, hccl::HcclAdapter::GetInstance().GetHcclOpsKernelInfoStore(),
hccl_count_, root_id_, op_type_, data_type, group_, NeedDump()));


@ -142,10 +142,14 @@ class KernelMod {
virtual void ReleaseResource() {}
virtual ~KernelMod() = default;
- void set_kernel_name(const std::string &kernel_name) { kernel_name_ = kernel_name; }
+ void set_unique_name(const std::string &unique_name) { unique_name_ = unique_name; }
+ void set_fullname(const std::string &fullname) { fullname_ = fullname; }
+ void set_is_monad(bool is_monad) { is_monad_ = is_monad; }
protected:
- std::string kernel_name_;
+ std::string unique_name_;
+ std::string fullname_;
+ bool is_monad_{false};
};
using KernelModPtr = std::shared_ptr<KernelMod>;
} // namespace kernel
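
The three new fields split the responsibilities: unique_name_ (set from AnfNode::UniqueName()) is what GenTask passes to every TaskInfo constructor, so each runtime task is registered under a distinct key; fullname_ keeps the user-visible fullname_with_scope for matching the kernel list in the dump JSON configuration; and is_monad_ lets NeedDump() exclude monad nodes. A rough, self-contained sketch of that division of labor with stand-in types (not the real KernelMod or DumpJsonParser; the real wiring is in the NeedDump() and task_generator.cc hunks):

// Rough sketch with stand-in types (not the real KernelMod / DumpJsonParser):
// unique_name_ names the runtime task, fullname_ is matched against the dump
// configuration, is_monad_ excludes nodes that carry no dumpable data.
#include <iostream>
#include <set>
#include <string>

struct ToyKernelMod {
  std::string unique_name_;  // fullname_with_scope + "_" + unique id, used as task name
  std::string fullname_;     // fullname_with_scope, used to match the dump JSON kernel list
  bool is_monad_{false};

  bool NeedDump(const std::set<std::string> &dump_kernels) const {
    return dump_kernels.count(fullname_) > 0 && !is_monad_;
  }
  const std::string &TaskName() const { return unique_name_; }
};

int main() {
  std::set<std::string> dump_kernels = {"Default/network/Add-op1"};

  ToyKernelMod k;
  k.unique_name_ = "Default/network/Add-op1_1024";
  k.fullname_ = "Default/network/Add-op1";

  std::cout << "task name: " << k.TaskName() << "\n";              // unique per node
  std::cout << "need dump: " << k.NeedDump(dump_kernels) << "\n";  // 1, matched by fullname
  return 0;
}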


@ -55,7 +55,7 @@ std::vector<TaskInfoPtr> AssignKernel::GenTask(const std::vector<AddressPtr> &in
stream_id_ = stream_id;
std::shared_ptr<MemcpyAsyncTaskInfo> task_info_ptr =
- std::make_shared<MemcpyAsyncTaskInfo>(kernel_name_, stream_id, inputs[0]->addr, inputs[0]->size, inputs[1]->addr,
+ std::make_shared<MemcpyAsyncTaskInfo>(unique_name_, stream_id, inputs[0]->addr, inputs[0]->size, inputs[1]->addr,
inputs[1]->size, RT_MEMCPY_DEVICE_TO_DEVICE, false);
MS_EXCEPTION_IF_NULL(task_info_ptr);
return {task_info_ptr};


@ -54,7 +54,7 @@ std::vector<TaskInfoPtr> LabelGotoKernel::GenTask(const std::vector<AddressPtr>
MS_LOG(INFO) << "LabelGotoKernel GenTask label:" << label_ << ", stream id:" << stream_id;
std::vector<TaskInfoPtr> task_info_list;
std::shared_ptr<LabelGotoTaskInfo> task_info_ptr =
- std::make_shared<LabelGotoTaskInfo>(kernel_name_, stream_id, label_);
+ std::make_shared<LabelGotoTaskInfo>(unique_name_, stream_id, label_);
MS_EXCEPTION_IF_NULL(task_info_ptr);
task_info_list.emplace_back(task_info_ptr);
return task_info_list;


@ -53,7 +53,7 @@ std::vector<TaskInfoPtr> LabelSetKernel::GenTask(const std::vector<AddressPtr> &
const std::vector<AddressPtr> &, uint32_t stream_id) {
MS_LOG(INFO) << "LabelSetKernel GenTask label:" << label_ << ", stream id:" << stream_id;
std::vector<TaskInfoPtr> task_info_list;
- std::shared_ptr<LabelSetTaskInfo> task_info_ptr = std::make_shared<LabelSetTaskInfo>(kernel_name_, stream_id, label_);
+ std::shared_ptr<LabelSetTaskInfo> task_info_ptr = std::make_shared<LabelSetTaskInfo>(unique_name_, stream_id, label_);
MS_EXCEPTION_IF_NULL(task_info_ptr);
task_info_list.emplace_back(task_info_ptr);
return task_info_list;


@ -64,7 +64,7 @@ std::vector<TaskInfoPtr> LabelSwitchKernel::GenTask(const std::vector<AddressPtr
MS_LOG(INFO) << "LabelSwitchKernel GenTask label size:" << label_size_ << ", stream id:" << stream_id;
std::vector<TaskInfoPtr> task_info_list;
cond_ = inputs[0]->addr;
- auto task_info_ptr = std::make_shared<LabelSwitchTaskInfo>(kernel_name_, stream_id, label_size_, label_list_, cond_);
+ auto task_info_ptr = std::make_shared<LabelSwitchTaskInfo>(unique_name_, stream_id, label_size_, label_list_, cond_);
MS_EXCEPTION_IF_NULL(task_info_ptr);
task_info_list.emplace_back(task_info_ptr);
return task_info_list;


@ -122,7 +122,7 @@ std::vector<TaskInfoPtr> MemCpyAsyncKernel::GenTask(const std::vector<AddressPtr
stream_id_ = stream_id;
std::shared_ptr<MemcpyAsyncTaskInfo> task_info_ptr =
- std::make_shared<MemcpyAsyncTaskInfo>(kernel_name_, stream_id, outputs[0]->addr, outputs[0]->size, inputs[0]->addr,
+ std::make_shared<MemcpyAsyncTaskInfo>(unique_name_, stream_id, outputs[0]->addr, outputs[0]->size, inputs[0]->addr,
inputs[0]->size, RT_MEMCPY_DEVICE_TO_DEVICE, NeedDump());
MS_EXCEPTION_IF_NULL(task_info_ptr);
return {task_info_ptr};


@ -63,7 +63,7 @@ std::vector<TaskInfoPtr> ProfilingKernelMod::GenTask(const std::vector<AddressPt
<< ", outputs size:" << outputs.size();
stream_id_ = stream_id;
std::shared_ptr<ProfilerTraceTaskInfo> task_info_ptr =
- std::make_shared<ProfilerTraceTaskInfo>(kernel_name_, stream_id, log_id_, notify_, flags_);
+ std::make_shared<ProfilerTraceTaskInfo>(unique_name_, stream_id, log_id_, notify_, flags_);
return {task_info_ptr};
}


@ -57,7 +57,7 @@ std::vector<TaskInfoPtr> RecvKernel::GenTask(const std::vector<AddressPtr> &, co
const std::vector<AddressPtr> &, uint32_t stream_id) {
MS_LOG(INFO) << "RecvKernel GenTask event_id_:" << event_id_ << ", stream_id_:" << stream_id;
stream_id_ = stream_id;
- EventWaitTaskInfoPtr task_info_ptr = std::make_shared<EventWaitTaskInfo>(kernel_name_, stream_id, event_id_);
+ EventWaitTaskInfoPtr task_info_ptr = std::make_shared<EventWaitTaskInfo>(unique_name_, stream_id, event_id_);
MS_EXCEPTION_IF_NULL(task_info_ptr);
return {task_info_ptr};
}


@ -56,7 +56,7 @@ std::vector<TaskInfoPtr> SendKernel::GenTask(const std::vector<AddressPtr> &, co
const std::vector<AddressPtr> &, uint32_t stream_id) {
MS_LOG(INFO) << "SendKernel GenTask event id:" << event_id_ << ", stream id:" << stream_id;
stream_id_ = stream_id;
- EventRecordTaskInfoPtr task_info_ptr = std::make_shared<EventRecordTaskInfo>(kernel_name_, stream_id, event_id_);
+ EventRecordTaskInfoPtr task_info_ptr = std::make_shared<EventRecordTaskInfo>(unique_name_, stream_id, event_id_);
MS_EXCEPTION_IF_NULL(task_info_ptr);
return {task_info_ptr};
}


@ -71,7 +71,7 @@ std::vector<TaskInfoPtr> StreamActiveKernel::GenTask(const std::vector<AddressPt
std::vector<TaskInfoPtr> task_info_list;
for (auto &index : active_streams_index_) {
std::shared_ptr<StreamActiveTaskInfo> task_info_ptr =
- std::make_shared<StreamActiveTaskInfo>(kernel_name_, stream_id, index);
+ std::make_shared<StreamActiveTaskInfo>(unique_name_, stream_id, index);
MS_EXCEPTION_IF_NULL(task_info_ptr);
task_info_list.emplace_back(task_info_ptr);
MS_LOG(INFO) << "StreamActiveKernel GenTask: streamId:" << stream_id << ", Active streamId:" << index;


@ -91,7 +91,7 @@ std::vector<TaskInfoPtr> StreamSwitchKernel::GenTask(const std::vector<AddressPt
MS_LOG(INFO) << "cond_:" << static_cast<int>(cond_) << ", true_stream_index_:" << true_stream_index_
<< ", stream_id:" << stream_id;
std::shared_ptr<StreamSwitchTaskInfo> task_info_ptr = std::make_shared<StreamSwitchTaskInfo>(
- kernel_name_, stream_id, true_stream_index_, loop_cnt, ites_per_loop, cond_, data_type_);
+ unique_name_, stream_id, true_stream_index_, loop_cnt, ites_per_loop, cond_, data_type_);
MS_EXCEPTION_IF_NULL(task_info_ptr);
return {task_info_ptr};
}


@ -142,10 +142,10 @@ std::vector<TaskInfoPtr> TensorCopySlices::GenTask(const std::vector<AddressPtr>
stream_id_ = stream_id;
std::shared_ptr<MemcpyAsyncTaskInfo> task_info_ptr1 =
- std::make_shared<MemcpyAsyncTaskInfo>(kernel_name_, stream_id, outputs[0]->addr, outputs[0]->size, inputs[0]->addr,
+ std::make_shared<MemcpyAsyncTaskInfo>(unique_name_, stream_id, outputs[0]->addr, outputs[0]->size, inputs[0]->addr,
inputs[0]->size, RT_MEMCPY_DEVICE_TO_DEVICE, NeedDump());
std::shared_ptr<MemcpyAsyncTaskInfo> task_info_ptr2 = std::make_shared<MemcpyAsyncTaskInfo>(
- kernel_name_, stream_id, VoidPointerOffset(outputs[0]->addr, offset_), copy_size_, inputs[1]->addr, copy_size_,
+ unique_name_, stream_id, VoidPointerOffset(outputs[0]->addr, offset_), copy_size_, inputs[1]->addr, copy_size_,
RT_MEMCPY_DEVICE_TO_DEVICE, NeedDump());
return {task_info_ptr1, task_info_ptr2};
}


@ -103,7 +103,7 @@ std::vector<TaskInfoPtr> TbeKernelMod::GenTask(const std::vector<AddressPtr> &in
MS_LOG(INFO) << "block_dim is:" << block_dim_;
TbeTaskInfoPtr task_info_ptr = std::make_shared<mindspore::ge::model_runner::TbeTaskInfo>(
- kernel_name_, stream_id, stub_func, block_dim_, args, 0, sm_desc, nullptr, 0, meta_data, input_data_addrs,
+ unique_name_, stream_id, stub_func, block_dim_, args, 0, sm_desc, nullptr, 0, meta_data, input_data_addrs,
output_data_addrs, workspace_addrs, NeedDump());
return {task_info_ptr};
}


@ -2163,5 +2163,16 @@ bool AnfRuntimeAlgorithm::IsControlOpExecInBackend(const AnfNodePtr &node) {
static std::set<std::string> control_ops_exec_in_backend = {kBpropCutOpName};
return control_ops_exec_in_backend.find(AnfAlgo::GetCNodeName(node)) != control_ops_exec_in_backend.end();
}
+ bool AnfRuntimeAlgorithm::IsNodeInputContainMonad(const AnfNodePtr &node) {
+ auto input_size = GetInputTensorNum(node);
+ for (size_t i = 0; i < input_size; ++i) {
+ auto input_with_index = GetPrevNodeOutput(node, i);
+ if (HasAbstractMonad(input_with_index.first)) {
+ return true;
+ }
+ }
+ return false;
+ }
} // namespace session
} // namespace mindspore
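
Monad values (UMonad/IOMonad) are the ordering tokens MindSpore threads through side-effecting operators; they carry no tensor data, so the dumper now skips them instead of trying to read input/output addresses for them. The helper above flags kernels whose inputs include such a token, and the data_dumper.cc hunks further down use it. A toy, self-contained version of the same scan over stand-in types (not MindSpore APIs):

// Toy version of the scan (stand-in types, not MindSpore APIs): a node is
// flagged when any of its inputs is a monad token rather than tensor data.
#include <iostream>
#include <memory>
#include <vector>

struct ToyNode {
  bool is_monad{false};                          // stands in for HasAbstractMonad(node)
  std::vector<std::shared_ptr<ToyNode>> inputs;  // stands in for the real input edges
};

bool IsNodeInputContainMonad(const ToyNode &node) {
  for (const auto &input : node.inputs) {
    if (input && input->is_monad) {
      return true;
    }
  }
  return false;
}

int main() {
  auto data = std::make_shared<ToyNode>();
  auto monad = std::make_shared<ToyNode>();
  monad->is_monad = true;

  ToyNode assign;  // e.g. a side-effecting op ordered by a monad input
  assign.inputs = {data, monad};
  std::cout << IsNodeInputContainMonad(assign) << "\n";  // 1
  return 0;
}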


@ -300,6 +300,8 @@ class AnfRuntimeAlgorithm {
// executed in vm. For example, the operator "bprop_cut" will be compiled into kernel graph and be launch
// in backend in PyNative mode.
static bool IsControlOpExecInBackend(const AnfNodePtr &node);
+ static bool IsNodeInputContainMonad(const AnfNodePtr &node);
};
} // namespace session
using AnfAlgo = session::AnfRuntimeAlgorithm;


@ -549,7 +549,7 @@ CNodePtr AscendKernelRuntime::GetErrorNodeName(uint32_t streamid, uint32_t taski
if (task_id == taskid && stream_id == streamid) {
auto &execute_node = current_graph_->execution_order();
auto node = std::find_if(execute_node.begin(), execute_node.end(),
- [&iter](const auto &node) { return node->fullname_with_scope() == iter.first; });
+ [&iter](const auto &node) { return node->UniqueName() == iter.first; });
if (node != execute_node.end()) {
return *node;
}


@ -100,8 +100,8 @@ void DataDumper::LoadDumpInfo() {
if (!KernelNeedDump(kernel)) {
continue;
}
MS_LOG(INFO) << "[DataDump] LoadDumpInfo kernel:" << kernel->fullname_with_scope();
dump_kernel_names_.emplace_back(kernel->fullname_with_scope());
MS_LOG(INFO) << "[DataDump] LoadDumpInfo kernel:" << kernel->UniqueName();
dump_kernel_names_.emplace_back(kernel->UniqueName());
DumpJsonParser::GetInstance().MatchKernel(kernel->fullname_with_scope());
aicpu::dump::Task task;
@ -251,7 +251,7 @@ void DataDumper::ReleaseDevMem(void **ptr) const {
void DataDumper::ConstructDumpTask(NotNull<const CNodePtr &> kernel, NotNull<aicpu::dump::Task *> dump_task) const {
dump_task->set_end_graph(false);
- auto iter = runtime_info_map_.find(kernel->fullname_with_scope());
+ auto iter = runtime_info_map_.find(kernel->UniqueName());
if (iter == runtime_info_map_.end()) {
MS_LOG(EXCEPTION) << "[DataDump] kernel name not found in runtime_info_map";
}
@ -389,6 +389,10 @@ void DataDumper::DumpKernelOutput(const CNodePtr &kernel, void *args, NotNull<ai
MS_LOG(INFO) << "Skip dump output";
return;
}
+ if (HasAbstractMonad(kernel)) {
+ MS_LOG(WARNING) << "Skip Monad node output:" << kernel->fullname_with_scope();
+ return;
+ }
MS_LOG(INFO) << "[DataDump] DumpKernelOutput start. Kernel:" << kernel->fullname_with_scope();
auto input_size = AnfAlgo::GetInputTensorNum(kernel);
auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
@ -423,6 +427,10 @@ void DataDumper::DumpKernelInput(const CNodePtr &kernel, void *args, NotNull<aic
MS_LOG(INFO) << "Skip dump input";
return;
}
+ if (AnfAlgo::IsNodeInputContainMonad(kernel)) {
+ MS_LOG(WARNING) << "Skip Monad node:" << kernel->fullname_with_scope();
+ return;
+ }
MS_LOG(INFO) << "[DataDump] DumpKernelInput start. Kernel:" << kernel->fullname_with_scope();
auto input_size = AnfAlgo::GetInputTensorNum(kernel);
uint64_t offset = 0;


@ -203,7 +203,8 @@ void RuntimeModel::DistributeTask() {
std::shared_ptr<RuntimeInfo> runtime_tuple = std::make_shared<RuntimeInfo>(task_id, stream_id, task->Args());
auto emplace_ret = runtime_info_map_.emplace(task->task_name(), runtime_tuple);
if (!emplace_ret.second) {
MS_LOG(WARNING) << "Task name exist: " << task->task_name();
// The task_name is (fullname_with_scope + UniqueId). There should be no duplication.
MS_LOG(EXCEPTION) << "Task name exist: " << task->task_name();
}
}
}
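
For context on the WARNING-to-EXCEPTION change above: std::map::emplace never overwrites an existing key, it only reports failure, so under the old fullname-based task names the colliding task's runtime info was silently dropped and the dumper could not find it later. With task names now unique by construction, a duplicate can only mean a real bug, hence the hard error. A small stand-alone sketch of that map behaviour and the fail-fast policy (illustrative names, not the real RuntimeModel):

// Stand-alone sketch (illustrative names, not the real RuntimeModel): emplace
// keeps the first entry and reports failure for a duplicate key, so a silent
// warning used to mean lost runtime info; now duplicates are treated as bugs.
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

int main() {
  std::map<std::string, int> runtime_info_map;  // task_name -> runtime info (an int here)

  runtime_info_map.emplace("Default/network/Add-op1", 100);
  auto emplace_ret = runtime_info_map.emplace("Default/network/Add-op1", 200);

  std::cout << "second emplace succeeded: " << emplace_ret.second << "\n";                            // 0
  std::cout << "info kept for that key:   " << runtime_info_map["Default/network/Add-op1"] << "\n";   // 100

  try {
    if (!emplace_ret.second) {
      // With unique task names this should never trigger; if it does, fail fast.
      throw std::runtime_error("Task name exist: Default/network/Add-op1");
    }
  } catch (const std::runtime_error &e) {
    std::cout << "fail fast: " << e.what() << "\n";
  }
  return 0;
}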


@ -36,7 +36,7 @@ ge::proto::DataType GeTypesConvert::GetGeDataType(TypeId type_id) {
MS_LOG(INFO) << "Vm origin type_id:" << type_id;
auto iter = data_type_map.find(type_id);
if (iter == data_type_map.end()) {
MS_LOG(EXCEPTION) << "Invalid data type:" << type_id;
MS_LOG(EXCEPTION) << "MindSpore data type:" << TypeIdLabel(type_id) << " can't been found in GE.";
}
return iter->second;
}


@ -147,7 +147,9 @@ bool TaskGenerator::LaunchKernel(const CNodePtr &anf_node_ptr, uint32_t stream_i
AddressPtrList kernel_outputs;
auto kernel_mod = AnfAlgo::GetKernelMod(anf_node_ptr);
MS_EXCEPTION_IF_NULL(kernel_mod);
- kernel_mod->set_kernel_name(anf_node_ptr->fullname_with_scope());
+ kernel_mod->set_unique_name(anf_node_ptr->UniqueName());
+ kernel_mod->set_fullname(anf_node_ptr->fullname_with_scope());
+ kernel_mod->set_is_monad(AnfAlgo::IsNodeInputContainMonad(anf_node_ptr) && HasAbstractMonad(anf_node_ptr));
auto op_name = AnfAlgo::GetCNodeName(anf_node_ptr);
constexpr size_t kNonePlaceholderIdx = 3;
if ((op_name == kSplitOpName || op_name == kSplitVOpName) && AnfAlgo::HasNodeAttr(kAttrNonTask, anf_node_ptr)) {


@ -153,6 +153,7 @@ class AnfNode : public Base {
std::size_t hash() const override { return this->hash_(this); }
virtual std::string fullname_with_scope() { return ""; }
+ std::string UniqueName() { return fullname_with_scope() + "_" + UniqueId(); }
virtual std::string DebugString(int recursive_level = 1) const { return ToString(); }
virtual std::string DebugString(bool recursive) const { return DebugString(recursive ? 1 : 0); }