!22134 Report tbe error code to ErrorManager

Merge pull request !22134 from tanghuikang/tbe_em
This commit is contained in:
i-robot 2021-08-23 03:22:46 +00:00 committed by Gitee
commit 163501e09c
5 changed files with 67 additions and 1 deletions

View File

@ -36,6 +36,7 @@ class LogLevel(Enum):
INFO = 1
WARNING = 2
ERROR = 3
EXCEPTION = 4
class JobStatus(Enum):
@ -140,6 +141,29 @@ class TbeJob:
self.process_info.append(message)
self._sys_logger.error(msg, *args, **kwargs)
def exception(self, msg, *args, **kwargs):
    """
    Log exception level info.

    The first element of ``msg`` must be a dict. It is tagged in place with an
    "op_name" entry, serialized with json.dumps and appended to
    ``self.process_info`` as a LogLevel.EXCEPTION message. The raw ``msg`` is
    then forwarded to the system logger.
    :param msg: indexable payload shaped like (exception_info_dict[, op_name])
    :param args: extra positional arguments forwarded to the system logger
    :param kwargs: extra keyword arguments forwarded to the system logger
    :return: None
    """
    if not msg:
        self.warning("Get empty exception message")
        return
    # A malformed payload must not crash the logging path: indexing a dict
    # raises KeyError and indexing a non-subscriptable object raises
    # TypeError, so both are reported as an illegal message instead.
    try:
        exception_info = msg[0]
    except (LookupError, TypeError):
        exception_info = None
    if not isinstance(exception_info, dict):
        self.warning("Get illegal exception message")
        return
    # Fall back to the job's fusion op name when the payload carries no name.
    op_name = self.fusion_op_name
    if len(msg) >= 2:
        op_name = msg[1]
    # NOTE: this writes into the caller's dict, so the forwarded msg below
    # also contains the "op_name" entry.
    exception_info["op_name"] = op_name
    processed_msg = json.dumps(exception_info)
    message = LogMessage(len(self.process_info), LogLevel.EXCEPTION, processed_msg)
    self.process_info.append(message)
    self._sys_logger.exception(msg, *args, **kwargs)
def get_result(self):
"""
Get the job process result string

View File

@ -384,7 +384,7 @@ class TbeJobManager:
if "except_msg" in new_job:
target_job.error("Query except_msg:{}".format(new_job["except_msg"]))
if "except_tuple_msg" in new_job:
target_job.error("Query except_tuple_msg:{}".format(new_job["except_tuple_msg"]))
target_job.exception(new_job["except_tuple_msg"])
target_job.error("\nOriginal compile json: \n {}\n".format(target_job.json_string))
post_job(self._raw_finish_jobs, target_job)
del_job(self._running_jobs, target_job.source_id, target_job.id)
@ -439,6 +439,10 @@ class DummyLogger:
def error(msg, *args, **kwargs):
pass
@staticmethod
def exception(msg, *args, **kwargs):
    """
    Accept an exception level record and do nothing with it.
    :param msg: ignored
    :param args: ignored
    :param kwargs: ignored
    :return: None
    """
    return None
def get_job(jobs, source_id, job_id):
"""

View File

@ -30,6 +30,7 @@
#include "backend/kernel_compiler/tbe/tbe_utils.h"
#include "backend/kernel_compiler/tbe/tbe_convert_utils.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "common/util/error_manager/error_manager.h"
#include "debug/anf_ir_dump.h"
#include "frontend/operator/ops.h"
#include "utils/ms_context.h"
@ -55,6 +56,7 @@ constexpr auto kSelectFormat = "SelectFormat";
constexpr auto kFullySupported = "FULLY_SUPPORTED";
constexpr auto kLevel = "level";
constexpr auto kMessage = "message";
constexpr auto kErrorCode = "errCode";
constexpr auto kIndex = "index";
constexpr auto kStatus = "status";
constexpr auto kJobType = "job_type";
@ -100,6 +102,27 @@ inline bool Order(const nlohmann::json &json1, const nlohmann::json &json2) {
return json1[kIndex].dump() > json2[kIndex].dump();
}
void ReportToErrorManager(const string &message) {
nlohmann::json exception_message;
if (!ParseJson(message, &exception_message)) {
MS_LOG(EXCEPTION) << "Parse tbe exception message error.";
}
const auto &error_code = GetJsonValue<std::string>(exception_message, kErrorCode);
std::map<std::string, std::string> arg_map;
for (auto it = exception_message.begin(); it != exception_message.end(); it++) {
const std::string arg_key = it.key();
if (it.key() == kErrorCode) {
continue;
}
const auto &arg_value = GetJsonValue<std::string>(exception_message, arg_key);
arg_map[arg_key] = arg_value;
}
const auto report_ret = ErrorManager::GetInstance().ReportErrMessage(error_code, arg_map);
if (report_ret != 0) {
MS_LOG(WARNING) << "Report error message failed, raw error message: " << message;
}
}
void PrintInfo(const nlohmann::json &info, const std::string &job_name, const int job_id, int adjust_log_level) {
auto level = GetJsonValue<int>(info, kLevel);
level = level > adjust_log_level ? adjust_log_level : level;
@ -112,6 +135,8 @@ void PrintInfo(const nlohmann::json &info, const std::string &job_name, const in
MS_LOG(WARNING) << "Job id:" << job_id << ", name :" << job_name << ", message:" << message;
} else if (level == 3) {
MS_LOG(ERROR) << "Job id:" << job_id << ", name :" << job_name << ", message:" << message;
} else if (level == 4) {
ReportToErrorManager(message);
}
}

View File

@ -93,6 +93,7 @@ const size_t kLoopSinkNextLoopIndex = 1;
const size_t kLoopSinkEpochIndex = 2;
constexpr char SR_TAG[] = "sr_tag";
constexpr char BACKWARD[] = "backward";
constexpr auto kUnknowErrorString = "Unknown error occurred";
namespace {
void DumpGraphExeOrder(const std::vector<CNodePtr> &execution_order, const std::string &tag = "") {
MS_LOG(INFO) << "Dump execution_order size " << execution_order.size();
@ -1546,6 +1547,9 @@ void AscendSession::ReportWarningMessage() {
void AscendSession::ReportErrorMessage() {
const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
if (error_message.find(kUnknowErrorString) != string::npos) {
return;
}
if (!error_message.empty()) {
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
}

View File

@ -26,6 +26,7 @@
#include "acl/acl_tdt.h"
#include "runtime/dev.h"
#include "toolchain/plog.h"
#include "common/util/error_manager/error_manager.h"
#endif
#ifdef ENABLE_GE
#include "transform/graph_ir/df_graph_manager.h"
@ -83,6 +84,10 @@ bool OpenTsd(const std::shared_ptr<MsContext> &ms_context_ptr) {
MS_LOG(INFO) << "Device id = " << device_id << ", rank size = " << rank_size << ".";
auto ret = rtSetDevice(static_cast<int32_t>(device_id));
if (ret != RT_ERROR_NONE) {
const std::string &error_message = ErrorManager::GetInstance().GetErrorMessage();
if (!error_message.empty()) {
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
}
MS_LOG(EXCEPTION) << "Device " << device_id << " call rtSetDevice failed, ret[" << static_cast<int>(ret) << "]";
}
ms_context_ptr->increase_param<uint32_t>(MS_CTX_TSD_REF);
@ -113,6 +118,10 @@ bool CloseTsd(const std::shared_ptr<MsContext> &ms_context_ptr, bool force) {
uint32_t device_id = ms_context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
auto ret = rtDeviceReset(static_cast<int32_t>(device_id));
if (ret != RT_ERROR_NONE) {
const std::string &error_message = ErrorManager::GetInstance().GetErrorMessage();
if (!error_message.empty()) {
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
}
MS_LOG(EXCEPTION) << "Device " << device_id << " call rtDeviceReset failed, ret[" << static_cast<int>(ret) << "]";
return false;
}