!19444 fixed node timeout

Merge pull request !19444 from anancds/issue
This commit is contained in:
i-robot 2021-07-07 01:08:05 +00:00 committed by Gitee
commit 187a987066
3 changed files with 7 additions and 7 deletions

View File

@ -91,7 +91,7 @@ bool HttpServer::InitServer() {
result = ::bind(fd_, (struct sockaddr *)&addr, sizeof(addr));
if (result < 0) {
MS_LOG(ERROR) << "Bind ip:" << server_address_ << " port:" << server_port_ << "failed!";
MS_LOG(ERROR) << "Bind ip:" << server_address_ << " port:" << server_port_ << " failed!";
close(fd_);
fd_ = -1;
return false;
@ -99,7 +99,7 @@ bool HttpServer::InitServer() {
result = ::listen(fd_, backlog_);
if (result < 0) {
MS_LOG(ERROR) << "Listen ip:" << server_address_ << " port:" << server_port_ << "failed!";
MS_LOG(ERROR) << "Listen ip:" << server_address_ << " port:" << server_port_ << " failed!";
close(fd_);
fd_ = -1;
return false;

View File

@ -148,6 +148,7 @@ void SchedulerNode::ProcessRegister(std::shared_ptr<TcpServer> server, std::shar
is_ready_ = true;
MS_LOG(INFO) << "There are " << node_manager_.worker_num() << " workers and " << node_manager_.server_num()
<< " servers registered to scheduer, so the scheduler send meta data to worker/server.";
node_manager_.UpdateClusterState(ClusterState::CLUSTER_READY);
if (node_manager_.GetClusterState() == ClusterState::CLUSTER_SCALE_IN) {
auto nodes = node_manager_.nodes_info();
for (const auto &id : scale_in_node_ids_) {
@ -162,7 +163,6 @@ void SchedulerNode::ProcessRegister(std::shared_ptr<TcpServer> server, std::shar
auto client = GetOrCreateClient(kvs.second);
SendMetadata(client, kvs.second.rank_id_);
}
node_manager_.UpdateClusterState(ClusterState::CLUSTER_READY);
wait_start_cond_.notify_all();
}
}

View File

@ -159,7 +159,7 @@ size_t Worker::GetParamKey(const std::string &param_name) {
}
// Stores the `init_in_server` flag for the parameter named `param_name` in
// `param_to_init_in_server_`, logging the name and flag value first.
void Worker::SetParamInitInServer(const std::string &param_name, bool init_in_server) {
// NOTE(review): the two log statements below are identical except for their level (INFO, then DEBUG).
// This text looks like a rendered diff without +/- markers, so they are presumably the before/after
// versions of a single line rather than two intended log calls -- confirm against the repository.
MS_LOG(INFO) << "Set parameter " << param_name << " init_in_server:" << init_in_server;
MS_LOG(DEBUG) << "Set parameter " << param_name << " init_in_server:" << init_in_server;
// Keyed by parameter name; presumably a map member declared in the class header (not visible here).
param_to_init_in_server_[param_name] = init_in_server;
}
@ -260,8 +260,8 @@ void Worker::InitPSParamAndOptim(const AnfNodePtr &input_node, const tensor::Ten
SetParamInitInServer(param_name, init_in_server);
bool init = IsKeyInit(param_key);
if (!init) {
MS_LOG(INFO) << "Init parameter key " << param_key << " and optimizer in parameter server side for " << param_name
<< ", whether init in server: " << init_in_server;
MS_LOG(DEBUG) << "Init parameter key " << param_key << " and optimizer in parameter server side for " << param_name
<< ", whether init in server: " << init_in_server;
AddKeyToServerId(param_key);
if (!PsDataPrefetch::GetInstance().cache_enable()) {
if (!init_in_server) {
@ -449,7 +449,7 @@ void Worker::AddKeyByHashMod(const Key &key) {
MS_LOG(EXCEPTION) << "Server number is invalid:0";
}
key_to_server_id_[key] = static_cast<int64_t>(key % server_num_);
MS_LOG(INFO) << "The server id of key " << key << " is " << key_to_server_id_[key];
MS_LOG(DEBUG) << "The server id of key " << key << " is " << key_to_server_id_[key];
}
void Worker::InitPSOptimId(const size_t param_key) {