forked from mindspore-Ecosystem/mindspore
!22134 Report tbe error code to ErrorManager
Merge pull request !22134 from tanghuikang/tbe_em
This commit is contained in:
commit
163501e09c
|
@ -36,6 +36,7 @@ class LogLevel(Enum):
|
|||
INFO = 1
|
||||
WARNING = 2
|
||||
ERROR = 3
|
||||
EXCEPTION = 4
|
||||
|
||||
|
||||
class JobStatus(Enum):
|
||||
|
@ -140,6 +141,29 @@ class TbeJob:
|
|||
self.process_info.append(message)
|
||||
self._sys_logger.error(msg, *args, **kwargs)
|
||||
|
||||
def exception(self, msg, *args, **kwargs):
    """
    Record an exception-level entry for this job and forward it to the system logger.

    The first element of ``msg`` is serialized to JSON (with an ``"op_name"`` key added)
    and appended to ``self.process_info`` as a ``LogLevel.EXCEPTION`` message.

    :param msg: list or tuple whose first element is a dict describing the exception and
                whose optional second element is the op name; when the op name is absent
                ``self.fusion_op_name`` is used
    :param args: extra positional arguments passed through to the system logger
    :param kwargs: extra keyword arguments passed through to the system logger
    :return: None
    """
    if not msg:
        self.warning("Get empty exception message")
        return
    # Indexing a truthy non-sequence (for example a dict) with [0] would raise instead of
    # reporting an illegal message, so the container type is checked first.
    if not isinstance(msg, (list, tuple)):
        self.warning("Get illegal exception message")
        return
    exception_info = msg[0]
    if not isinstance(exception_info, dict):
        self.warning("Get illegal exception message")
        return
    op_name = msg[1] if len(msg) >= 2 else self.fusion_op_name
    # Work on a shallow copy so the caller's dict is not modified when op_name is injected.
    exception_info = dict(exception_info)
    exception_info["op_name"] = op_name
    # default=str keeps a non-JSON-serializable value from aborting the report.
    processed_msg = json.dumps(exception_info, default=str)
    message = LogMessage(len(self.process_info), LogLevel.EXCEPTION, processed_msg)
    self.process_info.append(message)
    # NOTE(review): if _sys_logger is a standard logging.Logger, calling exception() outside
    # an except block appends an empty traceback ("NoneType: None") - confirm this is wanted.
    self._sys_logger.exception(msg, *args, **kwargs)
|
||||
|
||||
def get_result(self):
|
||||
"""
|
||||
Get the job process result string
|
||||
|
|
|
@ -384,7 +384,7 @@ class TbeJobManager:
|
|||
if "except_msg" in new_job:
|
||||
target_job.error("Query except_msg:{}".format(new_job["except_msg"]))
|
||||
if "except_tuple_msg" in new_job:
|
||||
target_job.error("Query except_tuple_msg:{}".format(new_job["except_tuple_msg"]))
|
||||
target_job.exception(new_job["except_tuple_msg"])
|
||||
target_job.error("\nOriginal compile json: \n {}\n".format(target_job.json_string))
|
||||
post_job(self._raw_finish_jobs, target_job)
|
||||
del_job(self._running_jobs, target_job.source_id, target_job.id)
|
||||
|
@ -439,6 +439,10 @@ class DummyLogger:
|
|||
def error(msg, *args, **kwargs):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def exception(msg, *args, **kwargs):
|
||||
pass
|
||||
|
||||
|
||||
def get_job(jobs, source_id, job_id):
|
||||
"""
|
||||
|
|
|
@ -30,6 +30,7 @@
|
|||
#include "backend/kernel_compiler/tbe/tbe_utils.h"
|
||||
#include "backend/kernel_compiler/tbe/tbe_convert_utils.h"
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "common/util/error_manager/error_manager.h"
|
||||
#include "debug/anf_ir_dump.h"
|
||||
#include "frontend/operator/ops.h"
|
||||
#include "utils/ms_context.h"
|
||||
|
@ -55,6 +56,7 @@ constexpr auto kSelectFormat = "SelectFormat";
|
|||
constexpr auto kFullySupported = "FULLY_SUPPORTED";
|
||||
constexpr auto kLevel = "level";
|
||||
constexpr auto kMessage = "message";
|
||||
constexpr auto kErrorCode = "errCode";
|
||||
constexpr auto kIndex = "index";
|
||||
constexpr auto kStatus = "status";
|
||||
constexpr auto kJobType = "job_type";
|
||||
|
@ -100,6 +102,27 @@ inline bool Order(const nlohmann::json &json1, const nlohmann::json &json2) {
|
|||
return json1[kIndex].dump() > json2[kIndex].dump();
|
||||
}
|
||||
|
||||
void ReportToErrorManager(const string &message) {
|
||||
nlohmann::json exception_message;
|
||||
if (!ParseJson(message, &exception_message)) {
|
||||
MS_LOG(EXCEPTION) << "Parse tbe exception message error.";
|
||||
}
|
||||
const auto &error_code = GetJsonValue<std::string>(exception_message, kErrorCode);
|
||||
std::map<std::string, std::string> arg_map;
|
||||
for (auto it = exception_message.begin(); it != exception_message.end(); it++) {
|
||||
const std::string arg_key = it.key();
|
||||
if (it.key() == kErrorCode) {
|
||||
continue;
|
||||
}
|
||||
const auto &arg_value = GetJsonValue<std::string>(exception_message, arg_key);
|
||||
arg_map[arg_key] = arg_value;
|
||||
}
|
||||
const auto report_ret = ErrorManager::GetInstance().ReportErrMessage(error_code, arg_map);
|
||||
if (report_ret != 0) {
|
||||
MS_LOG(WARNING) << "Report error message failed, raw error message: " << message;
|
||||
}
|
||||
}
|
||||
|
||||
void PrintInfo(const nlohmann::json &info, const std::string &job_name, const int job_id, int adjust_log_level) {
|
||||
auto level = GetJsonValue<int>(info, kLevel);
|
||||
level = level > adjust_log_level ? adjust_log_level : level;
|
||||
|
@ -112,6 +135,8 @@ void PrintInfo(const nlohmann::json &info, const std::string &job_name, const in
|
|||
MS_LOG(WARNING) << "Job id:" << job_id << ", name :" << job_name << ", message:" << message;
|
||||
} else if (level == 3) {
|
||||
MS_LOG(ERROR) << "Job id:" << job_id << ", name :" << job_name << ", message:" << message;
|
||||
} else if (level == 4) {
|
||||
ReportToErrorManager(message);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -93,6 +93,7 @@ const size_t kLoopSinkNextLoopIndex = 1;
|
|||
const size_t kLoopSinkEpochIndex = 2;
|
||||
constexpr char SR_TAG[] = "sr_tag";
|
||||
constexpr char BACKWARD[] = "backward";
|
||||
constexpr auto kUnknowErrorString = "Unknown error occurred";
|
||||
namespace {
|
||||
void DumpGraphExeOrder(const std::vector<CNodePtr> &execution_order, const std::string &tag = "") {
|
||||
MS_LOG(INFO) << "Dump execution_order size " << execution_order.size();
|
||||
|
@ -1546,6 +1547,9 @@ void AscendSession::ReportWarningMessage() {
|
|||
|
||||
void AscendSession::ReportErrorMessage() {
|
||||
const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
|
||||
if (error_message.find(kUnknowErrorString) != string::npos) {
|
||||
return;
|
||||
}
|
||||
if (!error_message.empty()) {
|
||||
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
#include "acl/acl_tdt.h"
|
||||
#include "runtime/dev.h"
|
||||
#include "toolchain/plog.h"
|
||||
#include "common/util/error_manager/error_manager.h"
|
||||
#endif
|
||||
#ifdef ENABLE_GE
|
||||
#include "transform/graph_ir/df_graph_manager.h"
|
||||
|
@ -83,6 +84,10 @@ bool OpenTsd(const std::shared_ptr<MsContext> &ms_context_ptr) {
|
|||
MS_LOG(INFO) << "Device id = " << device_id << ", rank size = " << rank_size << ".";
|
||||
auto ret = rtSetDevice(static_cast<int32_t>(device_id));
|
||||
if (ret != RT_ERROR_NONE) {
|
||||
const std::string &error_message = ErrorManager::GetInstance().GetErrorMessage();
|
||||
if (!error_message.empty()) {
|
||||
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
|
||||
}
|
||||
MS_LOG(EXCEPTION) << "Device " << device_id << " call rtSetDevice failed, ret[" << static_cast<int>(ret) << "]";
|
||||
}
|
||||
ms_context_ptr->increase_param<uint32_t>(MS_CTX_TSD_REF);
|
||||
|
@ -113,6 +118,10 @@ bool CloseTsd(const std::shared_ptr<MsContext> &ms_context_ptr, bool force) {
|
|||
uint32_t device_id = ms_context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
auto ret = rtDeviceReset(static_cast<int32_t>(device_id));
|
||||
if (ret != RT_ERROR_NONE) {
|
||||
const std::string &error_message = ErrorManager::GetInstance().GetErrorMessage();
|
||||
if (!error_message.empty()) {
|
||||
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
|
||||
}
|
||||
MS_LOG(EXCEPTION) << "Device " << device_id << " call rtDeviceReset failed, ret[" << static_cast<int>(ret) << "]";
|
||||
return false;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue