!46849 Reconfigure to ACL kernel when TBE single-op compile fails

Merge pull request !46849 from laiyongqiang/tbe_acl
This commit is contained in:
i-robot 2022-12-19 06:16:58 +00:00 committed by Gitee
commit 79694611d9
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
4 changed files with 76 additions and 20 deletions

View File

@ -108,7 +108,28 @@ static bool KernelBuildParallelCompile(const std::vector<CNodePtr> &kernels) {
if (!tbe_nodes.empty()) {
std::lock_guard<std::mutex> lock(compile_mtx);
auto &build_manager = kernel::ascend::TbeKernelCompileManager::GetInstance();
build_manager.TbeSingleOpCompile(tbe_nodes);
build_manager.ClearFailedLog();
auto build_result = build_manager.TbeSingleOpCompile(tbe_nodes);
auto build_failed_nodes = build_result.second;
if (!build_failed_nodes.empty()) {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
bool enable_reconfig_to_acl = !ms_context->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
if (enable_reconfig_to_acl) {
for (const auto &node : build_failed_nodes) {
auto new_builder =
std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(AnfAlgo::GetSelectKernelBuildInfo(node));
MS_EXCEPTION_IF_NULL(new_builder);
new_builder->SetKernelType(ACL_KERNEL);
MS_LOG(INFO) << "SUCCESS SET ACL KERNEL FOR" << node->DebugString();
AnfAlgo::SetSelectKernelBuildInfo(new_builder->Build(), node.get());
(void)other_nodes.emplace_back(node);
}
} else {
MS_LOG(EXCEPTION) << "TBE Single op compile failed. Compile failed op number:" << build_failed_nodes.size()
<< ", failed log:" << build_manager.failed_log();
}
}
}
bool akg_ret = true;
if (!akg_nodes.empty()) {

View File

@ -466,8 +466,9 @@ void TbeKernelCompileManager::SaveTaskInfo(const bool is_dynamic, const nlohmann
}
void TbeKernelCompileManager::QueryProcess(const std::string &type, const std::string &job_result,
std::vector<int> *success_job) {
std::vector<int> *success_job, std::vector<int> *failed_job) {
MS_EXCEPTION_IF_NULL(success_job);
MS_EXCEPTION_IF_NULL(failed_job);
auto json_obj = TurnStrToJson(job_result);
// the query job' status.
if (json_obj.at(kStatus) == kSuccess) {
@ -488,16 +489,19 @@ void TbeKernelCompileManager::QueryProcess(const std::string &type, const std::s
if (type == kPreCompile) {
MS_LOG(INFO) << "Single op pre build failed, op: " << kernel_name
<< "\n except_msg : " << target_status.except_msg;
(void)success_job->emplace_back(target_status.target_job_id);
(void)failed_job->emplace_back(target_status.target_job_id);
} else if (type == kCompile) {
auto target_node = job_id_to_node_[target_status.target_job_id];
ClearOldTask();
MS_LOG(EXCEPTION) << "Single op compile failed, op: " << kernel_name
<< ".#dmsg#Operator Compilation Exception Message:#dmsg#" << target_status.except_msg
<< trace::DumpSourceLines(target_node);
auto target_job_id = target_status.target_job_id;
auto target_cnode = job_id_to_node_[target_job_id];
std::ostringstream oss;
(void)failed_job->emplace_back(target_job_id);
oss << "op: " << kernel_name << ".#dmsg#Operator Compilation Exception Message:#dmsg#"
<< target_status.except_msg << trace::DumpSourceLines(target_cnode) << "\n";
failed_log_ += oss.str();
MS_LOG(INFO) << "Single op compile failed. " << oss.str();
} else {
MS_LOG(INFO) << "Op " << kernel_name << " " << type << " failed,\n except_msg : " << target_status.except_msg;
(void)success_job->emplace_back(target_status.target_job_id);
(void)failed_job->emplace_back(target_status.target_job_id);
}
}
return;
@ -511,6 +515,7 @@ void TbeKernelCompileManager::Query(const std::string &type) {
size_t sleep_time = 0;
while (!task_map_.empty()) {
std::vector<int> success_job;
std::vector<int> failed_job;
auto iter = task_map_.begin();
while (iter != task_map_.end()) {
nlohmann::json query_json;
@ -519,15 +524,22 @@ void TbeKernelCompileManager::Query(const std::string &type) {
JsonAssemble(kQuery, kernel_json, &query_json);
auto job_result = DispatchCompileTask(query_json);
query_cnt++;
QueryProcess(type, job_result, &success_job);
QueryProcess(type, job_result, &success_job, &failed_job);
(void)iter++;
}
bool sleep_flag = true;
for (auto k : success_job) {
(void)task_map_.erase(k);
sleep_flag = false;
}
success_job.clear();
for (auto k : failed_job) {
(void)task_map_.erase(k);
sleep_flag = false;
}
failed_job.clear();
if (sleep_flag && !task_map_.empty()) {
if ((query_cnt - last_sleep) > KSleepInterval * task_map_.size()) {
MS_LOG(INFO) << "Querying Parallel Compilation Job. Current Query Count: " << query_cnt;
@ -539,11 +551,16 @@ void TbeKernelCompileManager::Query(const std::string &type) {
}
}
void TbeKernelCompileManager::GenKernelMod(const std::vector<CNodePtr> &node_list) {
std::pair<std::vector<CNodePtr>, std::vector<CNodePtr>> TbeKernelCompileManager::GenKernelMod(
const std::vector<CNodePtr> &node_list) {
MS_LOG(INFO) << "Gen kernel mod start!";
std::vector<CNodePtr> success_node;
std::vector<CNodePtr> failed_node;
for (auto &node : node_list) {
MS_EXCEPTION_IF_NULL(node);
if (AnfAlgo::GetKernelMod(node) != nullptr) {
(void)success_node.emplace_back(node);
continue; // kernel mod already exist, continue;
}
auto full_name = node->fullname_with_scope();
@ -554,8 +571,9 @@ void TbeKernelCompileManager::GenKernelMod(const std::vector<CNodePtr> &node_lis
MS_EXCEPTION_IF_NULL(bin_map);
kernel_pack = bin_map->SearchInFile(json_name);
if (kernel_pack == nullptr) {
MS_LOG(EXCEPTION) << "Can not find .json file or the .o file for op:" << json_name
<< trace::DumpSourceLines(node);
MS_LOG(INFO) << "Can not find .json file or the .o file for op:" << json_name << trace::DumpSourceLines(node);
(void)failed_node.emplace_back(node);
continue;
}
}
auto kernel_info_json = kernel_pack->kernel_json_info();
@ -575,9 +593,10 @@ void TbeKernelCompileManager::GenKernelMod(const std::vector<CNodePtr> &node_lis
kernel_mod_ptr->SetOutputSizeList(iter->second.output_size_list);
kernel_mod_ptr->SetWorkspaceSizeList(kernel_info_json.workspaces);
AnfAlgo::SetKernelMod(kernel_mod_ptr, node.get());
(void)success_node.emplace_back(node);
}
ClearOldTask();
MS_LOG(INFO) << "Gen kernel mod end!";
return std::make_pair(success_node, failed_node);
}
void TbeKernelCompileManager::UpdateFusionTypeAndOutputDataDesc(const std::vector<CNodePtr> &nodes) {
@ -772,12 +791,17 @@ void TbeKernelCompileManager::TbePreBuild(const KernelGraphPtr &kernel_graph) {
MS_LOG(INFO) << "Single op pre build end.";
}
void TbeKernelCompileManager::TbeSingleOpCompile(const std::vector<CNodePtr> &node_list) {
std::pair<std::vector<CNodePtr>, std::vector<CNodePtr>> TbeKernelCompileManager::TbeSingleOpCompile(
const std::vector<CNodePtr> &node_list) {
MS_LOG(INFO) << "Single op parallel build start.";
auto job_type = is_tune_flag_ ? kTune : kCompile;
DistributeCompileTask(node_list, job_type);
Query(job_type);
GenKernelMod(node_list);
auto ret = GenKernelMod(node_list);
ClearOldTask();
MS_LOG(INFO) << "TBE Single op parallel build result: all:" << node_list.size() << " success:" << ret.first.size()
<< " failed:" << ret.second.size() << ".";
return ret;
}
JsonNameMap TbeKernelCompileManager::TbeFusionOpCompile(const std::vector<FusionScopeInfo> &fusion_scopes) {

View File

@ -22,6 +22,7 @@
#include <set>
#include <memory>
#include <vector>
#include <utility>
#include "ir/anf.h"
#include "kernel/kernel.h"
#include "kernel/kernel_fusion.h"
@ -77,9 +78,11 @@ class TbeKernelCompileManager {
// pre build
void TbePreBuild(const KernelGraphPtr &kernel_graph);
// single op compile
void TbeSingleOpCompile(const std::vector<CNodePtr> &node_list);
std::pair<std::vector<CNodePtr>, std::vector<CNodePtr>> TbeSingleOpCompile(const std::vector<CNodePtr> &node_list);
// fusion op compile
JsonNameMap TbeFusionOpCompile(const std::vector<FusionScopeInfo> &fusion_scopes);
// reset the stored build failed log to an empty string
void ClearFailedLog() {
  failed_log_.clear();
}
// get a copy of the stored build failed log
std::string failed_log() {
  return failed_log_;
}
private:
TbeKernelCompileManager() = default;
@ -104,7 +107,8 @@ class TbeKernelCompileManager {
// query all build task
void Query(const std::string &type);
// single op build/pre-build
void QueryProcess(const std::string &type, const std::string &job_result, std::vector<int> *success_job);
void QueryProcess(const std::string &type, const std::string &job_result, std::vector<int> *success_job,
std::vector<int> *failed_job);
void GetAllTbeNodes(const std::shared_ptr<session::KernelGraph> &kernel_graph,
std::vector<CNodePtr> *tbe_nodes) const;
void PrintProcessLog(const nlohmann::json &json, int adjust_log_level) const;
@ -119,7 +123,7 @@ class TbeKernelCompileManager {
void ClearOldTask();
void UpdateFusionTypeAndOutputDataDesc(const std::vector<CNodePtr> &nodes);
JsonNameMap GetAllSuccessFusion();
void GenKernelMod(const std::vector<CNodePtr> &node_list);
std::pair<std::vector<CNodePtr>, std::vector<CNodePtr>> GenKernelMod(const std::vector<CNodePtr> &node_list);
void DistributeCompileTask(const std::vector<CNodePtr> &node_list, const std::string &job_type);
void DistributePreBuildTask(const std::vector<CNodePtr> &node_list);
@ -154,6 +158,8 @@ class TbeKernelCompileManager {
// for fusion op
JsonNameMap success_fusion_ops_;
JsonNameMap all_fusion_ops_;
// build failed log
std::string failed_log_;
};
} // namespace ascend
} // namespace kernel

View File

@ -30,7 +30,12 @@ void TbeKernelCompileManager::TbeInitialize() {}
// pre build
void TbeKernelCompileManager::TbePreBuild(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
  // empty implementation: kernel_graph is not used and nothing is built
}
// single op compile
void TbeKernelCompileManager::TbeSingleOpCompile(const std::vector<CNodePtr> &anf_nodes) {}
std::pair<std::vector<CNodePtr>, std::vector<CNodePtr>> TbeKernelCompileManager::TbeSingleOpCompile(
  const std::vector<CNodePtr> &anf_nodes) {
  // Nothing is compiled here: anf_nodes is ignored and the returned pair
  // (first: success nodes, second: failed nodes) holds two empty vectors.
  return {};
}
// fusion op compile
JsonNameMap TbeKernelCompileManager::TbeFusionOpCompile(const std::vector<FusionScopeInfo> &fusion_scopes) {
JsonNameMap json_name_map;