def exception(self, msg, *args, **kwargs):
    """
    Log exception level info.

    Stores a structured exception report in ``self.process_info`` as a
    ``LogLevel.EXCEPTION`` message whose text is the JSON dump of the report,
    then forwards the raw ``msg`` to the system logger.

    :param msg: list or tuple whose first item is a dict describing the
        exception. An optional second item is used as the op name; when it
        is absent ``self.fusion_op_name`` is used instead.
    :param args: positional arguments passed through to the system logger.
    :param kwargs: keyword arguments passed through to the system logger.
    :return: None. Empty or malformed input is reported through
        ``self.warning`` and nothing is recorded.

    Side effect: the ``"op_name"`` key is written into ``msg[0]`` itself.
    """
    if not msg:
        self.warning("Get empty exception message")
        return
    # Only a list/tuple can safely be indexed with [0] and [1]. A bare dict or
    # a scalar would raise KeyError/TypeError here, so treat it as malformed
    # input instead of letting a logging call blow up.
    if not isinstance(msg, (list, tuple)):
        self.warning("Get illegal exception message")
        return
    exception_info = msg[0]
    if not isinstance(exception_info, dict):
        self.warning("Get illegal exception message")
        return
    # An explicit op name in the message wins over the job's fusion op name.
    op_name = msg[1] if len(msg) >= 2 else self.fusion_op_name
    exception_info["op_name"] = op_name
    processed_msg = json.dumps(exception_info)
    message = LogMessage(len(self.process_info), LogLevel.EXCEPTION, processed_msg)
    self.process_info.append(message)
    self._sys_logger.exception(msg, *args, **kwargs)
target_job) del_job(self._running_jobs, target_job.source_id, target_job.id) @@ -439,6 +439,10 @@ class DummyLogger: def error(msg, *args, **kwargs): pass + @staticmethod + def exception(msg, *args, **kwargs): + pass + def get_job(jobs, source_id, job_id): """ diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/ascend_kernel_compile.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/ascend_kernel_compile.cc index a31ccad2527..fa57a1f8b39 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/tbe/ascend_kernel_compile.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/ascend_kernel_compile.cc @@ -30,6 +30,7 @@ #include "backend/kernel_compiler/tbe/tbe_utils.h" #include "backend/kernel_compiler/tbe/tbe_convert_utils.h" #include "backend/session/anf_runtime_algorithm.h" +#include "common/util/error_manager/error_manager.h" #include "debug/anf_ir_dump.h" #include "frontend/operator/ops.h" #include "utils/ms_context.h" @@ -55,6 +56,7 @@ constexpr auto kSelectFormat = "SelectFormat"; constexpr auto kFullySupported = "FULLY_SUPPORTED"; constexpr auto kLevel = "level"; constexpr auto kMessage = "message"; +constexpr auto kErrorCode = "errCode"; constexpr auto kIndex = "index"; constexpr auto kStatus = "status"; constexpr auto kJobType = "job_type"; @@ -100,6 +102,27 @@ inline bool Order(const nlohmann::json &json1, const nlohmann::json &json2) { return json1[kIndex].dump() > json2[kIndex].dump(); } +void ReportToErrorManager(const string &message) { + nlohmann::json exception_message; + if (!ParseJson(message, &exception_message)) { + MS_LOG(EXCEPTION) << "Parse tbe exception message error."; + } + const auto &error_code = GetJsonValue(exception_message, kErrorCode); + std::map arg_map; + for (auto it = exception_message.begin(); it != exception_message.end(); it++) { + const std::string arg_key = it.key(); + if (it.key() == kErrorCode) { + continue; + } + const auto &arg_value = GetJsonValue(exception_message, arg_key); + arg_map[arg_key] = arg_value; + } + const auto 
report_ret = ErrorManager::GetInstance().ReportErrMessage(error_code, arg_map); + if (report_ret != 0) { + MS_LOG(WARNING) << "Report error message failed, raw error message: " << message; + } +} + void PrintInfo(const nlohmann::json &info, const std::string &job_name, const int job_id, int adjust_log_level) { auto level = GetJsonValue(info, kLevel); level = level > adjust_log_level ? adjust_log_level : level; @@ -112,6 +135,8 @@ void PrintInfo(const nlohmann::json &info, const std::string &job_name, const in MS_LOG(WARNING) << "Job id:" << job_id << ", name :" << job_name << ", message:" << message; } else if (level == 3) { MS_LOG(ERROR) << "Job id:" << job_id << ", name :" << job_name << ", message:" << message; + } else if (level == 4) { + ReportToErrorManager(message); } } diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc index 491634a623d..8cfd25e6ec5 100644 --- a/mindspore/ccsrc/backend/session/ascend_session.cc +++ b/mindspore/ccsrc/backend/session/ascend_session.cc @@ -93,6 +93,7 @@ const size_t kLoopSinkNextLoopIndex = 1; const size_t kLoopSinkEpochIndex = 2; constexpr char SR_TAG[] = "sr_tag"; constexpr char BACKWARD[] = "backward"; +constexpr auto kUnknowErrorString = "Unknown error occurred"; namespace { void DumpGraphExeOrder(const std::vector &execution_order, const std::string &tag = "") { MS_LOG(INFO) << "Dump execution_order size " << execution_order.size(); @@ -1546,6 +1547,9 @@ void AscendSession::ReportWarningMessage() { void AscendSession::ReportErrorMessage() { const string &error_message = ErrorManager::GetInstance().GetErrorMessage(); + if (error_message.find(kUnknowErrorString) != string::npos) { + return; + } if (!error_message.empty()) { MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message; } diff --git a/mindspore/ccsrc/utils/context/context_extends.cc b/mindspore/ccsrc/utils/context/context_extends.cc index 10d548e0409..669b8264e00 100644 --- 
a/mindspore/ccsrc/utils/context/context_extends.cc +++ b/mindspore/ccsrc/utils/context/context_extends.cc @@ -26,6 +26,7 @@ #include "acl/acl_tdt.h" #include "runtime/dev.h" #include "toolchain/plog.h" +#include "common/util/error_manager/error_manager.h" #endif #ifdef ENABLE_GE #include "transform/graph_ir/df_graph_manager.h" @@ -83,6 +84,10 @@ bool OpenTsd(const std::shared_ptr &ms_context_ptr) { MS_LOG(INFO) << "Device id = " << device_id << ", rank size = " << rank_size << "."; auto ret = rtSetDevice(static_cast(device_id)); if (ret != RT_ERROR_NONE) { + const std::string &error_message = ErrorManager::GetInstance().GetErrorMessage(); + if (!error_message.empty()) { + MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message; + } MS_LOG(EXCEPTION) << "Device " << device_id << " call rtSetDevice failed, ret[" << static_cast(ret) << "]"; } ms_context_ptr->increase_param(MS_CTX_TSD_REF); @@ -113,6 +118,10 @@ bool CloseTsd(const std::shared_ptr &ms_context_ptr, bool force) { uint32_t device_id = ms_context_ptr->get_param(MS_CTX_DEVICE_ID); auto ret = rtDeviceReset(static_cast(device_id)); if (ret != RT_ERROR_NONE) { + const std::string &error_message = ErrorManager::GetInstance().GetErrorMessage(); + if (!error_message.empty()) { + MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message; + } MS_LOG(EXCEPTION) << "Device " << device_id << " call rtDeviceReset failed, ret[" << static_cast(ret) << "]"; return false; }