From 5ceaabbf6aa6d93fb618ca8dac33f1dd1bc7c2a9 Mon Sep 17 00:00:00 2001 From: hwjiaorui Date: Fri, 12 Nov 2021 16:12:49 +0800 Subject: [PATCH] MindRT bug fix --- .../kernel_compiler/tbe/tbe_kernel_compile.cc | 11 ++++---- mindspore/ccsrc/pipeline/jit/pipeline.cc | 1 + .../device/ascend/kernel_select_ascend.cc | 20 +++++++------- .../runtime/framework/actor/actor_common.cc | 10 +++++++ .../runtime/framework/actor/actor_common.h | 2 ++ .../framework/actor/data_source_actor.cc | 10 ++++--- .../runtime/framework/actor/output_actor.cc | 4 +-- .../hardware/ascend/ascend_device_context.cc | 26 ++++++++++++++++++- .../hardware/ascend/ascend_device_context.h | 3 +++ 9 files changed, 66 insertions(+), 21 deletions(-) diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_compile.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_compile.cc index baac3d0018d..a90a0a1c420 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_compile.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_compile.cc @@ -298,8 +298,8 @@ void TbeKernelCompileManager::ParseTargetJobStatus(const nlohmann::json &json, T MS_LOG(EXCEPTION) << "Parse query result error."; } auto json_name = GetJsonValue(query_result, kFusionOpName); - auto target_job_id = query_result.at(kJobId); - auto status = query_result.at(kStatus); + auto target_job_id = GetJsonValue(query_result, kJobId); + auto status = GetJsonValue(query_result, kStatus); auto all_logs = GetJsonValue>(query_result, kProcessInfo); auto message = FilterExceptionMessage(all_logs); // save job status and exception message @@ -354,8 +354,8 @@ void TbeKernelCompileManager::JsonAssemble(const std::string &job_type, const nl (*dst_json)[kJobContent] = job_info; } else if (job_type == kQuery) { nlohmann::json content; - content[kSourceId] = src_json[kSourceId]; - content[kJobId] = src_json[kJobId]; + content[kSourceId] = GetJsonValue(src_json, kSourceId); + content[kJobId] = GetJsonValue(src_json, kJobId); (*dst_json)[kJobContent] = content; } else { (*dst_json)[kJobContent] = src_json; @@ -438,7 +438,8 @@ void TbeKernelCompileManager::SaveIOSizeInfo(const nlohmann::json &json, const s std::vector input_size_list; std::vector output_size_list; if (!output_nodes.empty()) { - (void)TbeKernelBuild::GetIOSize(json[kOpList], output_nodes, &input_size_list, &output_size_list); + (void)TbeKernelBuild::GetIOSize(GetJsonValue(json, kOpList), output_nodes, &input_size_list, + &output_size_list); } else { (void)TbeKernelBuild::GetIOSize(json, &input_size_list, &output_size_list); } diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.cc b/mindspore/ccsrc/pipeline/jit/pipeline.cc index 34fa8b710e8..bcb63538b29 100644 --- a/mindspore/ccsrc/pipeline/jit/pipeline.cc +++ b/mindspore/ccsrc/pipeline/jit/pipeline.cc @@ -1641,6 +1641,7 @@ void FinalizeHccl() { (void)FinalizeBackend(); #else session::ExecutorManager::Instance().Clear(); + device::DeviceContextManager::GetInstance().ClearDeviceContexts(); device::KernelRuntimeManager::Instance().ClearRuntimeResource(); #endif } diff --git a/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc b/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc index 1374307297d..b03a3f25813 100644 --- a/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc +++ b/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc @@ -255,18 +255,17 @@ bool CheckHitTargetDtype(const std::map &type_map, const TypeId } bool TagRaiseReduce(const std::shared_ptr &kernel_build_info, const CNodePtr &cnode, - const std::map &type_map) { + const std::map &type_map, bool *int64_flag) { // filte kernel info that unsupported raise or reduce datatype MS_EXCEPTION_IF_NULL(cnode); MS_EXCEPTION_IF_NULL(kernel_build_info); - bool flag = false; for (size_t input_index = 0; input_index < kernel_build_info->GetInputNum(); ++input_index) { auto in_dtype = AnfAlgo::GetPrevNodeOutputInferDataType(cnode, input_index); auto device_dtype = kernel_build_info->GetInputDeviceType(input_index); if (device_dtype == kNumberTypeFloat || device_dtype == kNumberTypeFloat32) { device_dtype = kNumberTypeFloat32; } - if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, &flag)) { + if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, int64_flag)) { return false; } } @@ -278,14 +277,10 @@ bool TagRaiseReduce(const std::shared_ptr &kernel_build device_dtype = kNumberTypeFloat32; } - if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, &flag)) { + if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, int64_flag)) { return false; } } - if (flag) { - auto node_name = AnfAlgo::GetCNodeName(cnode); - MS_LOG(WARNING) << "Operator:[" << node_name << "] don't support int64, reduce precision from int64 to int32."; - } return true; } @@ -298,10 +293,11 @@ std::vector> FilterRaisedOrReducePrecis const std::map reduce_map = {{kNumberTypeInt64, kNumberTypeInt32}, {kNumberTypeFloat, kNumberTypeFloat16}, {kNumberTypeFloat32, kNumberTypeFloat16}}; + bool int64_reduce = false; // raise precision for (size_t info_index = 0; info_index < kernel_info_list.size(); ++info_index) { MS_EXCEPTION_IF_NULL(kernel_info_list[info_index]); - if (TagRaiseReduce(kernel_info_list[info_index], cnode, raise_map)) { + if (TagRaiseReduce(kernel_info_list[info_index], cnode, raise_map, &int64_reduce)) { filtered_kernel_info_list.push_back(kernel_info_list[info_index]); } } @@ -317,7 +313,7 @@ std::vector> FilterRaisedOrReducePrecis if (context_ptr->get_param(MS_CTX_ENABLE_REDUCE_PRECISION)) { for (size_t info_index = 0; info_index < kernel_info_list.size(); ++info_index) { MS_EXCEPTION_IF_NULL(kernel_info_list[info_index]); - if (TagRaiseReduce(kernel_info_list[info_index], cnode, reduce_map)) { + if (TagRaiseReduce(kernel_info_list[info_index], cnode, reduce_map, &int64_reduce)) { filtered_kernel_info_list.push_back(kernel_info_list[info_index]); } } @@ -325,6 +321,10 @@ std::vector> FilterRaisedOrReducePrecis if (!filtered_kernel_info_list.empty()) { *precision_reduce = true; } + if (int64_reduce) { + auto node_name = AnfAlgo::GetCNodeName(cnode); + MS_LOG(WARNING) << "Operator:[" << node_name << "] don't support int64, reduce precision from int64 to int32."; + } return filtered_kernel_info_list; } diff --git a/mindspore/ccsrc/runtime/framework/actor/actor_common.cc b/mindspore/ccsrc/runtime/framework/actor/actor_common.cc index 3b39dfd4f07..e56020bd35c 100644 --- a/mindspore/ccsrc/runtime/framework/actor/actor_common.cc +++ b/mindspore/ccsrc/runtime/framework/actor/actor_common.cc @@ -292,5 +292,15 @@ std::string FetchActorName(KernelTransformType kernel_type, const std::string &a } return actor_name; } + +bool CheckMemcpyInDevice(const DeviceTensor *dst_device_addr, const DeviceTensor *src_device_addr) { + MS_EXCEPTION_IF_NULL(dst_device_addr); + if (src_device_addr == nullptr) { + return false; + } + return (src_device_addr->DeviceType() == dst_device_addr->DeviceType() && + src_device_addr->format() == dst_device_addr->format() && + src_device_addr->type_id() == dst_device_addr->type_id()); +} } // namespace runtime } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/framework/actor/actor_common.h b/mindspore/ccsrc/runtime/framework/actor/actor_common.h index d27c119ebf9..6a9fa1b339b 100644 --- a/mindspore/ccsrc/runtime/framework/actor/actor_common.h +++ b/mindspore/ccsrc/runtime/framework/actor/actor_common.h @@ -210,6 +210,8 @@ KernelTransformType FetchKernelTransformType(const AnfNodePtr &node, const Kerne GraphExecutionStrategy strategy = GraphExecutionStrategy::kPipeline); std::string FetchActorName(KernelTransformType kernel_type, const std::string &actor_set_name, const AnfNodePtr &node = nullptr, const KernelGraphPtr &graph = nullptr); + +bool CheckMemcpyInDevice(const DeviceTensor *dst_device_tensor, const DeviceTensor *src_device_tensor); } // namespace runtime } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc b/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc index 7d0537db3f1..18cbcc2a6c9 100644 --- a/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc +++ b/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc @@ -239,10 +239,14 @@ void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext *cons auto tensor_device_address = std::dynamic_pointer_cast(host_tensor->device_address()); // Sync data from host_tensor_device_address to device_tensor. if (tensor_device_address != nullptr) { - if ((tensor_device_address.get() != device_tensor) && (!Copy(device_tensor, tensor_device_address.get()))) { - SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Copy data failed."); + if (CheckMemcpyInDevice(device_tensor, tensor_device_address.get())) { + if ((tensor_device_address.get() != device_tensor) && (!Copy(device_tensor, tensor_device_address.get()))) { + SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Copy data failed."); + } + continue; + } else { + host_tensor->data_sync(false); } - continue; } // Sync data from host_tensor to device_tensor. diff --git a/mindspore/ccsrc/runtime/framework/actor/output_actor.cc b/mindspore/ccsrc/runtime/framework/actor/output_actor.cc index 41f0636f2c8..cb068ca768d 100644 --- a/mindspore/ccsrc/runtime/framework/actor/output_actor.cc +++ b/mindspore/ccsrc/runtime/framework/actor/output_actor.cc @@ -109,7 +109,7 @@ TensorPtr OutputActor::CreateOutputTensor(const AnfNodePtr &output_node, size_t const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(output_node, output_index, false); MS_EXCEPTION_IF_NULL(device_tensor); // In the input as output scenario, use the device tensor of node. - if (output_node->isa() || output_node->isa()) { + if (IsPersistentDeviceTensor(output_node)) { tensor->set_device_address(device_tensor); return tensor; } @@ -151,7 +151,7 @@ void OutputActor::UpdateOutputDeviceAddress() { auto output_index = output_nodes_[i].second; auto &tensor = outputs_[i]; // In the input as output scenario, the output device tensor may come from the input tensor and can't be replaced. - if ((output_node == nullptr) || output_node->isa() || output_node->isa()) { + if ((output_node == nullptr) || IsPersistentDeviceTensor(output_node)) { continue; } diff --git a/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc b/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc index 6aebe46746f..4f3241be333 100644 --- a/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc +++ b/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc @@ -28,6 +28,7 @@ #include "runtime/hardware/ascend/ascend_graph_optimization.h" #include "backend/kernel_compiler/ascend_kernel_mod.h" #include "runtime/device/ascend/ascend_bucket.h" +#include "common/util/error_manager/error_manager.h" #ifndef ENABLE_SECURITY #include "debug/data_dump/dump_json_parser.h" @@ -65,6 +66,7 @@ namespace ascend { using KernelGraph = mindspore::session::KernelGraph; const char kMsVm[] = "vm"; constexpr size_t kAtomicCleanInputSize = 2; +constexpr auto kUnknowErrorString = "Unknown error occurred"; namespace { CNodePtr GetNextLabelSet(const std::vector &kernel_nodes, uint32_t index) { size_t node_sizes = kernel_nodes.size(); @@ -584,10 +586,30 @@ bool AscendDeviceContext::LaunchGraph(const KernelGraphPtr &graph) const { runtime_instance_->SetContext(); device::KernelAdjust::GetInstance().LoadDeviceLoopCtrlParameters(graph); auto ret = ExecuteGraph(graph); + if (!ret) { + MS_LOG(ERROR) << "run task error!"; + ReportErrorMessage(); + return ret; + } + ReportWarningMessage(); MS_LOG(INFO) << "Status record: end launch graph. graph id: " << graph->graph_id(); return ret; } +void AscendDeviceContext::ReportErrorMessage() const { + const string &error_message = ErrorManager::GetInstance().GetErrorMessage(); + if (!error_message.empty() && error_message.find(kUnknowErrorString) == string::npos) { + MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message; + } +} + +void AscendDeviceContext::ReportWarningMessage() const { + const string &warning_message = ErrorManager::GetInstance().GetWarningMessage(); + if (!warning_message.empty()) { + MS_LOG(WARNING) << "Ascend warning message:\n" << warning_message; + } +} + bool AscendDeviceContext::SyncStream(size_t stream_id) const { MS_EXCEPTION_IF_NULL(runtime_instance_); return runtime_instance_->SyncStream(); @@ -599,7 +621,9 @@ bool AscendDeviceContext::IsExecutingSink(const KernelGraphPtr &graph) const { return ms_context->get_param(MS_CTX_ENABLE_TASK_SINK) && IsGraphMode(); } -bool AscendDeviceContext::IsLoopCountSink(const KernelGraphPtr &graph) const { return IsGraphMode(); } +bool AscendDeviceContext::IsLoopCountSink(const KernelGraphPtr &graph) const { + return device::KernelAdjust::NeedLoopSink() && IsGraphMode(); +} // kernel by kernel mode interface void AscendDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) const { diff --git a/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.h b/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.h index 99d6c7c8641..8c6bb32bec5 100644 --- a/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.h +++ b/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.h @@ -138,6 +138,9 @@ class AscendDeviceContext : public DeviceContext { static bool IsGraphMode(); bool SyncRuning() const; + void ReportErrorMessage() const; + void ReportWarningMessage() const; + // Kernel Runtime --- only for task sink AscendKernelRuntime *runtime_instance_{nullptr}; std::shared_ptr mem_manager_{nullptr};