!20246 Print ascend warning&error message using ErrorManager
Merge pull request !20246 from tanghuikang/error_manager
This commit is contained in:
commit
3d022c9db8
|
@ -291,6 +291,7 @@ endif()
|
|||
|
||||
if(MODE_ASCEND_ALL)
|
||||
MESSAGE("USE DAV LIB PATH: ${ASCEND_PATH}")
|
||||
# Locate the Ascend error_manager library. Search the toolkit runtime path in addition to
# the runtime path, consistent with the RUNTIME_LIB and TSDCLIENT lookups that follow.
find_library(ERROR_MANAGER error_manager ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(RUNTIME_LIB runtime ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(TSDCLIENT tsdclient HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}
|
||||
${ASCEND_DRIVER_BACK_PATH})
|
||||
|
@ -308,8 +309,8 @@ if(MODE_ASCEND_ALL)
|
|||
target_link_options(ms_profile PRIVATE -Wl,-init,common_log_init)
|
||||
target_link_libraries(ms_profile -Wl,--start-group -Wl,--whole-archive ${PROFILING} -Wl,--no-whole-archive
|
||||
mindspore::protobuf -Wl,--end-group)
|
||||
target_link_libraries(mindspore ${RUNTIME_LIB} ${TSDCLIENT} ${DATATRANSFER} -Wl,--no-as-needed ${OPTILING}
|
||||
${PLATFORM} ${ACL} ${OPT_FEATURE})
|
||||
target_link_libraries(mindspore ${RUNTIME_LIB} ${TSDCLIENT} ${DATATRANSFER} ${ERROR_MANAGER} -Wl,--no-as-needed
|
||||
${OPTILING} ${PLATFORM} ${ACL} ${OPT_FEATURE})
|
||||
target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group)
|
||||
elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
|
||||
target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf mindspore::sentencepiece
|
||||
|
|
|
@ -66,6 +66,7 @@
|
|||
#else
|
||||
#include "debug/debugger/proto_exporter_stub.h"
|
||||
#endif
|
||||
#include "common/util/error_manager/error_manager.h"
|
||||
#include "toolchain/adx_datadump_server.h"
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
#include "debug/rdr/running_data_recorder.h"
|
||||
|
@ -1523,5 +1524,19 @@ std::shared_ptr<device::Bucket> AscendSession::CreateBucket(uint32_t bucket_id,
|
|||
bucket->Init({compute_stream}, {communication_stream});
|
||||
return bucket;
|
||||
}
|
||||
|
||||
void AscendSession::ReportWarningMessage() {
|
||||
const string &warning_message = ErrorManager::GetInstance().GetWarningMessage();
|
||||
if (!warning_message.empty()) {
|
||||
MS_LOG(WARNING) << "Ascend warning message:\n" << warning_message;
|
||||
}
|
||||
}
|
||||
|
||||
void AscendSession::ReportErrorMessage() {
|
||||
const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
|
||||
if (!error_message.empty()) {
|
||||
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
|
||||
}
|
||||
}
|
||||
} // namespace session
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -66,6 +66,8 @@ class AscendSession : public SessionBasic {
|
|||
const std::vector<tensor::TensorPtr> &graph_inputs,
|
||||
const std::map<KernelWithIndex, size_t> &cnode_refcount) override;
|
||||
std::string GetCommWorldGroup() override { return kHcclWorldGroup; }
|
||||
void ReportWarningMessage() override;
|
||||
void ReportErrorMessage() override;
|
||||
|
||||
private:
|
||||
// compile child graph when session have multiple child graphs
|
||||
|
|
|
@ -207,7 +207,13 @@ void Executor::WorkerLoop() {
|
|||
}
|
||||
try {
|
||||
task->Run();
|
||||
if (task->session_ != nullptr) {
|
||||
task->session_->ReportWarningMessage();
|
||||
}
|
||||
} catch (const std::exception &e) {
|
||||
if (task->session_ != nullptr) {
|
||||
task->session_->ReportErrorMessage();
|
||||
}
|
||||
ExecutorManager::Instance().OnEvent(ExecutorEvent::kException);
|
||||
MsException::Instance().SetException();
|
||||
}
|
||||
|
|
|
@ -150,6 +150,8 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
|
|||
void EraseValueNodeTensor(const std::vector<int64_t> &tensors_mask, std::vector<tensor::TensorPtr> *input_tensors);
|
||||
void RunOpRemoveNopNode(const KernelGraphPtr &kernel_graph) const;
|
||||
void RunOpHideNopNode(const KernelGraphPtr &kernel_graph) const;
|
||||
virtual void ReportWarningMessage() {}
|
||||
virtual void ReportErrorMessage() {}
|
||||
#ifdef ENABLE_DEBUGGER
|
||||
// set debugger
|
||||
void SetDebugger() {
|
||||
|
|
|
@ -40,6 +40,7 @@
|
|||
#include "toolchain/adx_datadump_server.h"
|
||||
#include "utils/trace_base.h"
|
||||
#include "graphengine/inc/external/acl/error_codes/rt_error_codes.h"
|
||||
#include "common/util/error_manager/error_manager.h"
|
||||
#include "debug/anf_ir_dump.h"
|
||||
#ifdef MEM_REUSE_DEBUG
|
||||
#include "backend/optimizer/mem_reuse/mem_reuse_checker.h"
|
||||
|
@ -275,8 +276,16 @@ void AscendKernelRuntime::ReleaseDeviceRes() {
|
|||
}
|
||||
|
||||
void AscendKernelRuntime::PreInit() {
|
||||
const auto error_manager_ret = ErrorManager::GetInstance().Init();
|
||||
if (error_manager_ret != 0) {
|
||||
MS_LOG(WARNING) << "Init ErrorManager failed.";
|
||||
}
|
||||
auto ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
|
||||
if (!ret) {
|
||||
const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
|
||||
if (!error_message.empty()) {
|
||||
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
|
||||
}
|
||||
MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed.";
|
||||
}
|
||||
}
|
||||
|
@ -286,27 +295,39 @@ bool AscendKernelRuntime::Init() {
|
|||
SetCurrentContext();
|
||||
return true;
|
||||
}
|
||||
OpTilingCalculater::GetInstance().Init();
|
||||
// Start up profiling before rtSetDevice
|
||||
|
||||
bool ret = InitDevice();
|
||||
if (!ret) {
|
||||
return ret;
|
||||
const auto error_manager_ret = ErrorManager::GetInstance().Init();
|
||||
if (error_manager_ret != 0) {
|
||||
MS_LOG(WARNING) << "Init ErrorManager failed.";
|
||||
}
|
||||
try {
|
||||
OpTilingCalculater::GetInstance().Init();
|
||||
// Start up profiling before rtSetDevice
|
||||
|
||||
SetDebugger();
|
||||
mem_manager_ = std::make_shared<AscendMemoryManager>();
|
||||
MS_EXCEPTION_IF_NULL(mem_manager_);
|
||||
mem_manager_->MallocDeviceMemory();
|
||||
bool ret = InitDevice();
|
||||
if (!ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Set callback func when exception error
|
||||
auto rt_ret = rtRegTaskFailCallbackByModule(kModuleName, TaskFailCallback);
|
||||
if (rt_ret != RT_ERROR_NONE) {
|
||||
MS_LOG(EXCEPTION) << "Reg SetTaskFailCallback failed, error: " << rt_ret;
|
||||
SetDebugger();
|
||||
mem_manager_ = std::make_shared<AscendMemoryManager>();
|
||||
MS_EXCEPTION_IF_NULL(mem_manager_);
|
||||
mem_manager_->MallocDeviceMemory();
|
||||
|
||||
// Set callback func when exception error
|
||||
auto rt_ret = rtRegTaskFailCallbackByModule(kModuleName, TaskFailCallback);
|
||||
if (rt_ret != RT_ERROR_NONE) {
|
||||
MS_LOG(EXCEPTION) << "Reg SetTaskFailCallback failed, error: " << rt_ret;
|
||||
}
|
||||
} catch (const std::exception &e) {
|
||||
const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
|
||||
if (!error_message.empty()) {
|
||||
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
|
||||
}
|
||||
throw;
|
||||
}
|
||||
|
||||
initialized_ = true;
|
||||
return ret;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph) {
|
||||
|
|
|
@ -217,4 +217,4 @@ if(USE_GLOG)
|
|||
endif()
|
||||
|
||||
target_link_libraries(mindspore mindspore_core)
|
||||
target_link_libraries(ut_tests PRIVATE mindspore mindspore_shared_lib securec graph)
|
||||
target_link_libraries(ut_tests PRIVATE mindspore mindspore_shared_lib securec graph error_manager)
|
||||
|
|
Loading…
Reference in New Issue