!49244 Optimize finalizing collective process.

Merge pull request !49244 from ZPaC/add-finalize-log-and-timeout-window
This commit is contained in:
i-robot 2023-02-23 06:22:45 +00:00 committed by Gitee
commit e03257ffff
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
1 changed file with 8 additions and 3 deletions

View File

@ -104,7 +104,7 @@ bool ExecuteFuncInThread(const std::function<bool()> &func, const int64_t timeou
if (!execute_success && !execute_fail) { if (!execute_success && !execute_fail) {
std::string node_id = common::GetEnv("MS_NODE_ID"); std::string node_id = common::GetEnv("MS_NODE_ID");
#if !defined(_WIN32) && !defined(_WIN64) #if !defined(_WIN32) && !defined(_WIN64)
MS_LOG(ERROR) << "Execute function asynchronously timeout, node id: " << node_id << " exit process"; MS_LOG(WARNING) << "Execute function asynchronously timeout, node id: " << node_id << " exit process";
(void)kill(getpid(), SIGTERM); (void)kill(getpid(), SIGTERM);
#endif #endif
} }
@ -340,15 +340,20 @@ bool CollectiveManager::Finalize() {
std::function<bool()> finalize_func = [&, this]() { std::function<bool()> finalize_func = [&, this]() {
if (need_host_collective_) { if (need_host_collective_) {
MS_EXCEPTION_IF_NULL(host_comm_lib_instance_); MS_EXCEPTION_IF_NULL(host_comm_lib_instance_);
MS_LOG(INFO) << "Start finalizing host communication lib.";
if (!host_comm_lib_instance_->Finalize()) { if (!host_comm_lib_instance_->Finalize()) {
MS_LOG(WARNING) << "Failed to finalize device communication library."; MS_LOG(WARNING) << "Failed to finalize device communication library.";
} }
MS_LOG(INFO) << "End finalizing host communication lib.";
} }
MS_EXCEPTION_IF_NULL(device_comm_lib_instance_); MS_EXCEPTION_IF_NULL(device_comm_lib_instance_);
MS_LOG(INFO) << "Start finalizing device communication lib.";
if (!device_comm_lib_instance_->Finalize()) { if (!device_comm_lib_instance_->Finalize()) {
MS_LOG(WARNING) << "Failed to finalize device communication library."; MS_LOG(WARNING) << "Failed to finalize device communication library.";
} }
MS_LOG(INFO) << "End finalizing device communication lib.";
inited_ = false; inited_ = false;
finalized_ = true; finalized_ = true;
@ -358,8 +363,8 @@ bool CollectiveManager::Finalize() {
MS_LOG(INFO) << "Begin finalize collective manager."; MS_LOG(INFO) << "Begin finalize collective manager.";
// Timeout limit 5 seconds to wait to finish finalizing device communication group. // Timeout limit 30 seconds to wait to finish finalizing device communication group.
const int64_t kTimeToWait = 5; const int64_t kTimeToWait = 30;
// Finalize collective manager in thread with timeout limit. // Finalize collective manager in thread with timeout limit.
bool ret = ExecuteFuncInThread(finalize_func, kTimeToWait); bool ret = ExecuteFuncInThread(finalize_func, kTimeToWait);