!20246 Print Ascend warning and error messages using ErrorManager

Merge pull request !20246 from tanghuikang/error_manager
This commit is contained in:
i-robot 2021-07-16 06:49:31 +00:00 committed by Gitee
commit 3d022c9db8
7 changed files with 65 additions and 18 deletions

View File

@ -291,6 +291,7 @@ endif()
if(MODE_ASCEND_ALL)
MESSAGE("USE DAV LIB PATH: ${ASCEND_PATH}")
find_library(ERROR_MANAGER error_manager ${ASCEND_RUNTIME_PATH})
find_library(RUNTIME_LIB runtime ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
find_library(TSDCLIENT tsdclient HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}
${ASCEND_DRIVER_BACK_PATH})
@ -308,8 +309,8 @@ if(MODE_ASCEND_ALL)
target_link_options(ms_profile PRIVATE -Wl,-init,common_log_init)
target_link_libraries(ms_profile -Wl,--start-group -Wl,--whole-archive ${PROFILING} -Wl,--no-whole-archive
mindspore::protobuf -Wl,--end-group)
target_link_libraries(mindspore ${RUNTIME_LIB} ${TSDCLIENT} ${DATATRANSFER} -Wl,--no-as-needed ${OPTILING}
${PLATFORM} ${ACL} ${OPT_FEATURE})
target_link_libraries(mindspore ${RUNTIME_LIB} ${TSDCLIENT} ${DATATRANSFER} ${ERROR_MANAGER} -Wl,--no-as-needed
${OPTILING} ${PLATFORM} ${ACL} ${OPT_FEATURE})
target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group)
elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf mindspore::sentencepiece

View File

@ -66,6 +66,7 @@
#else
#include "debug/debugger/proto_exporter_stub.h"
#endif
#include "common/util/error_manager/error_manager.h"
#include "toolchain/adx_datadump_server.h"
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/running_data_recorder.h"
@ -1523,5 +1524,19 @@ std::shared_ptr<device::Bucket> AscendSession::CreateBucket(uint32_t bucket_id,
bucket->Init({compute_stream}, {communication_stream});
return bucket;
}
void AscendSession::ReportWarningMessage() {
const string &warning_message = ErrorManager::GetInstance().GetWarningMessage();
if (!warning_message.empty()) {
MS_LOG(WARNING) << "Ascend warning message:\n" << warning_message;
}
}
void AscendSession::ReportErrorMessage() {
const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
if (!error_message.empty()) {
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
}
}
} // namespace session
} // namespace mindspore

View File

@ -66,6 +66,8 @@ class AscendSession : public SessionBasic {
const std::vector<tensor::TensorPtr> &graph_inputs,
const std::map<KernelWithIndex, size_t> &cnode_refcount) override;
std::string GetCommWorldGroup() override { return kHcclWorldGroup; }
// Overrides the base-class hook for reporting warning messages.
void ReportWarningMessage() override;
// Overrides the base-class hook for reporting error messages.
void ReportErrorMessage() override;
private:
// compile child graph when session have multiple child graphs

View File

@ -207,7 +207,13 @@ void Executor::WorkerLoop() {
}
try {
task->Run();
if (task->session_ != nullptr) {
task->session_->ReportWarningMessage();
}
} catch (const std::exception &e) {
if (task->session_ != nullptr) {
task->session_->ReportErrorMessage();
}
ExecutorManager::Instance().OnEvent(ExecutorEvent::kException);
MsException::Instance().SetException();
}

View File

@ -150,6 +150,8 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
void EraseValueNodeTensor(const std::vector<int64_t> &tensors_mask, std::vector<tensor::TensorPtr> *input_tensors);
void RunOpRemoveNopNode(const KernelGraphPtr &kernel_graph) const;
void RunOpHideNopNode(const KernelGraphPtr &kernel_graph) const;
// Hook for reporting warning messages. The base implementation does nothing;
// derived sessions may override it.
virtual void ReportWarningMessage() {}
// Hook for reporting error messages. The base implementation does nothing;
// derived sessions may override it.
virtual void ReportErrorMessage() {}
#ifdef ENABLE_DEBUGGER
// set debugger
void SetDebugger() {

View File

@ -40,6 +40,7 @@
#include "toolchain/adx_datadump_server.h"
#include "utils/trace_base.h"
#include "graphengine/inc/external/acl/error_codes/rt_error_codes.h"
#include "common/util/error_manager/error_manager.h"
#include "debug/anf_ir_dump.h"
#ifdef MEM_REUSE_DEBUG
#include "backend/optimizer/mem_reuse/mem_reuse_checker.h"
@ -275,8 +276,16 @@ void AscendKernelRuntime::ReleaseDeviceRes() {
}
void AscendKernelRuntime::PreInit() {
const auto error_manager_ret = ErrorManager::GetInstance().Init();
if (error_manager_ret != 0) {
MS_LOG(WARNING) << "Init ErrorManager failed.";
}
auto ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
if (!ret) {
const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
if (!error_message.empty()) {
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
}
MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed.";
}
}
@ -286,6 +295,11 @@ bool AscendKernelRuntime::Init() {
SetCurrentContext();
return true;
}
const auto error_manager_ret = ErrorManager::GetInstance().Init();
if (error_manager_ret != 0) {
MS_LOG(WARNING) << "Init ErrorManager failed.";
}
try {
OpTilingCalculater::GetInstance().Init();
// Start up profiling before rtSetDevice
@ -304,9 +318,16 @@ bool AscendKernelRuntime::Init() {
if (rt_ret != RT_ERROR_NONE) {
MS_LOG(EXCEPTION) << "Reg SetTaskFailCallback failed, error: " << rt_ret;
}
} catch (const std::exception &e) {
const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
if (!error_message.empty()) {
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
}
throw;
}
initialized_ = true;
return ret;
return true;
}
bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph) {

View File

@ -217,4 +217,4 @@ if(USE_GLOG)
endif()
target_link_libraries(mindspore mindspore_core)
target_link_libraries(ut_tests PRIVATE mindspore mindspore_shared_lib securec graph)
target_link_libraries(ut_tests PRIVATE mindspore mindspore_shared_lib securec graph error_manager)