From 21c4bf1f27a55ce8e1b83b0cc0dd644a0c5077c1 Mon Sep 17 00:00:00 2001
From: tanghuikang
Date: Mon, 12 Jul 2021 16:47:37 +0800
Subject: [PATCH] Print ascend warning and error message using ErrorManager

---
# Reviewer notes. This free-text area after the "---" separator is neither part of the
# commit message nor of any hunk.
#
# Purpose: print, through MS_LOG, the warning and error text returned by
# ErrorManager::GetInstance().
#
# - mindspore/ccsrc/CMakeLists.txt: looks up the `error_manager` library in
#   ${ASCEND_RUNTIME_PATH} and adds ${ERROR_MANAGER} to the libraries linked into `mindspore`.
#   NOTE(review): RUNTIME_LIB and TSDCLIENT are also searched in ${ASCEND_TOOLKIT_RUNTIME_PATH};
#   ERROR_MANAGER is not -- confirm this is intentional.
# - tests/ut/cpp/CMakeLists.txt: `ut_tests` additionally links `error_manager`.
# - session_basic.h: adds two virtual hooks with empty bodies, ReportWarningMessage() and
#   ReportErrorMessage().
# - ascend_session.h / ascend_session.cc: AscendSession overrides both hooks. Each one reads
#   GetWarningMessage() / GetErrorMessage() from ErrorManager::GetInstance() and logs the text
#   only when it is non-empty (MS_LOG(WARNING) / MS_LOG(ERROR)).
# - executor.cc, Executor::WorkerLoop(): calls ReportWarningMessage() after task->Run() returns,
#   and ReportErrorMessage() first thing in the std::exception handler; both calls are guarded
#   by `task->session_ != nullptr`.
# - ascend_kernel_runtime.cc:
#     * PreInit() and Init() call ErrorManager::GetInstance().Init(); a non-zero result is only
#       logged as a WARNING and execution continues.
#     * PreInit() logs the ErrorManager error text (when non-empty) if StartupProfiling()
#       returns false, before raising via MS_EXCEPTION.
#     * The body of Init() is wrapped in a try block; the std::exception handler logs the
#       ErrorManager error text (when non-empty) and rethrows with `throw;`.
#     * The final statement of Init() changes from `return ret;` to `return true;`.
#   NOTE(review): the early `return ret;` taken when InitDevice() returns false leaves Init()
#   without logging the ErrorManager text -- confirm whether that path should report as well.
# - NOTE(review): this copy appears to have lost angle-bracketed text (for example
#   `std::make_shared()` and `std::enable_shared_from_this {` carry no template arguments, and
#   the From: header has no address), so the context lines will probably not match the real
#   sources; regenerate the patch from the commit before applying it.
#
 mindspore/ccsrc/CMakeLists.txt                |  5 +-
 .../ccsrc/backend/session/ascend_session.cc   | 15 ++++++
 .../ccsrc/backend/session/ascend_session.h    |  2 +
 mindspore/ccsrc/backend/session/executor.cc   |  6 +++
 .../ccsrc/backend/session/session_basic.h     |  2 +
 .../device/ascend/ascend_kernel_runtime.cc    | 51 +++++++++++++------
 tests/ut/cpp/CMakeLists.txt                   |  2 +-
 7 files changed, 65 insertions(+), 18 deletions(-)

diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt
index 76a50095e99..8ba0c815689 100644
--- a/mindspore/ccsrc/CMakeLists.txt
+++ b/mindspore/ccsrc/CMakeLists.txt
@@ -291,6 +291,7 @@ endif()
 
 if(MODE_ASCEND_ALL)
     MESSAGE("USE DAV LIB PATH: ${ASCEND_PATH}")
+    find_library(ERROR_MANAGER error_manager ${ASCEND_RUNTIME_PATH})
     find_library(RUNTIME_LIB runtime ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
     find_library(TSDCLIENT tsdclient HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}
             ${ASCEND_DRIVER_BACK_PATH})
@@ -308,8 +309,8 @@ if(MODE_ASCEND_ALL)
     target_link_options(ms_profile PRIVATE -Wl,-init,common_log_init)
     target_link_libraries(ms_profile -Wl,--start-group -Wl,--whole-archive ${PROFILING} -Wl,--no-whole-archive
             mindspore::protobuf -Wl,--end-group)
-    target_link_libraries(mindspore ${RUNTIME_LIB} ${TSDCLIENT} ${DATATRANSFER} -Wl,--no-as-needed ${OPTILING}
-            ${PLATFORM} ${ACL} ${OPT_FEATURE})
+    target_link_libraries(mindspore ${RUNTIME_LIB} ${TSDCLIENT} ${DATATRANSFER} ${ERROR_MANAGER} -Wl,--no-as-needed
+            ${OPTILING} ${PLATFORM} ${ACL} ${OPT_FEATURE})
     target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group)
 elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
     target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf mindspore::sentencepiece
diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc
index fe8f7790585..27f4e0b7528 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@@ -66,6 +66,7 @@
 #else
 #include "debug/debugger/proto_exporter_stub.h"
 #endif
+#include "common/util/error_manager/error_manager.h"
 #include "toolchain/adx_datadump_server.h"
 #ifdef ENABLE_DUMP_IR
 #include "debug/rdr/running_data_recorder.h"
@@ -1523,5 +1524,19 @@ std::shared_ptr AscendSession::CreateBucket(uint32_t bucket_id,
   bucket->Init({compute_stream}, {communication_stream});
   return bucket;
 }
+
+void AscendSession::ReportWarningMessage() {
+  const string &warning_message = ErrorManager::GetInstance().GetWarningMessage();
+  if (!warning_message.empty()) {
+    MS_LOG(WARNING) << "Ascend warning message:\n" << warning_message;
+  }
+}
+
+void AscendSession::ReportErrorMessage() {
+  const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
+  if (!error_message.empty()) {
+    MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
+  }
+}
 } // namespace session
 } // namespace mindspore
diff --git a/mindspore/ccsrc/backend/session/ascend_session.h b/mindspore/ccsrc/backend/session/ascend_session.h
index 0de65ac077f..14ba03bb28f 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.h
+++ b/mindspore/ccsrc/backend/session/ascend_session.h
@@ -66,6 +66,8 @@ class AscendSession : public SessionBasic {
                                const std::vector &graph_inputs,
                                const std::map &cnode_refcount) override;
   std::string GetCommWorldGroup() override { return kHcclWorldGroup; }
+  void ReportWarningMessage() override;
+  void ReportErrorMessage() override;
 
  private:
   // compile child graph when session have multiple child graphs
diff --git a/mindspore/ccsrc/backend/session/executor.cc b/mindspore/ccsrc/backend/session/executor.cc
index 01b9f230d14..70c482482e3 100644
--- a/mindspore/ccsrc/backend/session/executor.cc
+++ b/mindspore/ccsrc/backend/session/executor.cc
@@ -207,7 +207,13 @@ void Executor::WorkerLoop() {
     }
     try {
       task->Run();
+      if (task->session_ != nullptr) {
+        task->session_->ReportWarningMessage();
+      }
     } catch (const std::exception &e) {
+      if (task->session_ != nullptr) {
+        task->session_->ReportErrorMessage();
+      }
       ExecutorManager::Instance().OnEvent(ExecutorEvent::kException);
       MsException::Instance().SetException();
     }
diff --git a/mindspore/ccsrc/backend/session/session_basic.h b/mindspore/ccsrc/backend/session/session_basic.h
index 2ba31599b9d..10d3d11a962 100644
--- a/mindspore/ccsrc/backend/session/session_basic.h
+++ b/mindspore/ccsrc/backend/session/session_basic.h
@@ -150,6 +150,8 @@ class SessionBasic : public std::enable_shared_from_this {
   void EraseValueNodeTensor(const std::vector &tensors_mask, std::vector *input_tensors);
   void RunOpRemoveNopNode(const KernelGraphPtr &kernel_graph) const;
   void RunOpHideNopNode(const KernelGraphPtr &kernel_graph) const;
+  virtual void ReportWarningMessage() {}
+  virtual void ReportErrorMessage() {}
 #ifdef ENABLE_DEBUGGER
   // set debugger
   void SetDebugger() {
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
index 97d1dfda45d..c19002873ed 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@@ -39,6 +39,7 @@
 #include "toolchain/adx_datadump_server.h"
 #include "utils/trace_base.h"
 #include "graphengine/inc/external/acl/error_codes/rt_error_codes.h"
+#include "common/util/error_manager/error_manager.h"
 #include "debug/anf_ir_dump.h"
 #ifdef MEM_REUSE_DEBUG
 #include "backend/optimizer/mem_reuse/mem_reuse_checker.h"
@@ -267,8 +268,16 @@ void AscendKernelRuntime::ReleaseDeviceRes() {
 }
 
 void AscendKernelRuntime::PreInit() {
+  const auto error_manager_ret = ErrorManager::GetInstance().Init();
+  if (error_manager_ret != 0) {
+    MS_LOG(WARNING) << "Init ErrorManager failed.";
+  }
   auto ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
   if (!ret) {
+    const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
+    if (!error_message.empty()) {
+      MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
+    }
     MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed.";
   }
 }
@@ -278,27 +287,39 @@ bool AscendKernelRuntime::Init() {
     SetCurrentContext();
     return true;
   }
-  OpTilingCalculater::GetInstance().Init();
-  // Start up profiling before rtSetDevice
-
-  bool ret = InitDevice();
-  if (!ret) {
-    return ret;
+  const auto error_manager_ret = ErrorManager::GetInstance().Init();
+  if (error_manager_ret != 0) {
+    MS_LOG(WARNING) << "Init ErrorManager failed.";
   }
+  try {
+    OpTilingCalculater::GetInstance().Init();
+    // Start up profiling before rtSetDevice
 
-  SetDebugger();
-  mem_manager_ = std::make_shared();
-  MS_EXCEPTION_IF_NULL(mem_manager_);
-  mem_manager_->MallocDeviceMemory();
+    bool ret = InitDevice();
+    if (!ret) {
+      return ret;
+    }
 
-  // Set callback func when exception error
-  auto rt_ret = rtRegTaskFailCallbackByModule(kModuleName, TaskFailCallback);
-  if (rt_ret != RT_ERROR_NONE) {
-    MS_LOG(EXCEPTION) << "Reg SetTaskFailCallback failed, error: " << rt_ret;
+    SetDebugger();
+    mem_manager_ = std::make_shared();
+    MS_EXCEPTION_IF_NULL(mem_manager_);
+    mem_manager_->MallocDeviceMemory();
+
+    // Set callback func when exception error
+    auto rt_ret = rtRegTaskFailCallbackByModule(kModuleName, TaskFailCallback);
+    if (rt_ret != RT_ERROR_NONE) {
+      MS_LOG(EXCEPTION) << "Reg SetTaskFailCallback failed, error: " << rt_ret;
+    }
+  } catch (const std::exception &e) {
+    const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
+    if (!error_message.empty()) {
+      MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
+    }
+    throw;
   }
 
   initialized_ = true;
-  return ret;
+  return true;
 }
 
 bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph) {
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index 24ac0394dcd..4eb3e99a51f 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -218,4 +218,4 @@ if(USE_GLOG)
 endif()
 
 target_link_libraries(mindspore mindspore_core)
-target_link_libraries(ut_tests PRIVATE mindspore mindspore_shared_lib securec graph)
+target_link_libraries(ut_tests PRIVATE mindspore mindspore_shared_lib securec graph error_manager)