fixed node timeout
This commit is contained in:
parent
ef08a78112
commit
f097f72798
|
@ -91,7 +91,7 @@ bool HttpServer::InitServer() {
|
|||
|
||||
result = ::bind(fd_, (struct sockaddr *)&addr, sizeof(addr));
|
||||
if (result < 0) {
|
||||
MS_LOG(ERROR) << "Bind ip:" << server_address_ << " port:" << server_port_ << "failed!";
|
||||
MS_LOG(ERROR) << "Bind ip:" << server_address_ << " port:" << server_port_ << " failed!";
|
||||
close(fd_);
|
||||
fd_ = -1;
|
||||
return false;
|
||||
|
@ -99,7 +99,7 @@ bool HttpServer::InitServer() {
|
|||
|
||||
result = ::listen(fd_, backlog_);
|
||||
if (result < 0) {
|
||||
MS_LOG(ERROR) << "Listen ip:" << server_address_ << " port:" << server_port_ << "failed!";
|
||||
MS_LOG(ERROR) << "Listen ip:" << server_address_ << " port:" << server_port_ << " failed!";
|
||||
close(fd_);
|
||||
fd_ = -1;
|
||||
return false;
|
||||
|
|
|
@ -148,6 +148,7 @@ void SchedulerNode::ProcessRegister(std::shared_ptr<TcpServer> server, std::shar
|
|||
is_ready_ = true;
|
||||
MS_LOG(INFO) << "There are " << node_manager_.worker_num() << " workers and " << node_manager_.server_num()
|
||||
<< " servers registered to scheduer, so the scheduler send meta data to worker/server.";
|
||||
node_manager_.UpdateClusterState(ClusterState::CLUSTER_READY);
|
||||
if (node_manager_.GetClusterState() == ClusterState::CLUSTER_SCALE_IN) {
|
||||
auto nodes = node_manager_.nodes_info();
|
||||
for (const auto &id : scale_in_node_ids_) {
|
||||
|
@ -162,7 +163,6 @@ void SchedulerNode::ProcessRegister(std::shared_ptr<TcpServer> server, std::shar
|
|||
auto client = GetOrCreateClient(kvs.second);
|
||||
SendMetadata(client, kvs.second.rank_id_);
|
||||
}
|
||||
node_manager_.UpdateClusterState(ClusterState::CLUSTER_READY);
|
||||
wait_start_cond_.notify_all();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -159,7 +159,7 @@ size_t Worker::GetParamKey(const std::string &param_name) {
|
|||
}
|
||||
|
||||
void Worker::SetParamInitInServer(const std::string &param_name, bool init_in_server) {
|
||||
MS_LOG(INFO) << "Set parameter " << param_name << " init_in_server:" << init_in_server;
|
||||
MS_LOG(DEBUG) << "Set parameter " << param_name << " init_in_server:" << init_in_server;
|
||||
param_to_init_in_server_[param_name] = init_in_server;
|
||||
}
|
||||
|
||||
|
@ -260,8 +260,8 @@ void Worker::InitPSParamAndOptim(const AnfNodePtr &input_node, const tensor::Ten
|
|||
SetParamInitInServer(param_name, init_in_server);
|
||||
bool init = IsKeyInit(param_key);
|
||||
if (!init) {
|
||||
MS_LOG(INFO) << "Init parameter key " << param_key << " and optimizer in parameter server side for " << param_name
|
||||
<< ", whether init in server: " << init_in_server;
|
||||
MS_LOG(DEBUG) << "Init parameter key " << param_key << " and optimizer in parameter server side for " << param_name
|
||||
<< ", whether init in server: " << init_in_server;
|
||||
AddKeyToServerId(param_key);
|
||||
if (!PsDataPrefetch::GetInstance().cache_enable()) {
|
||||
if (!init_in_server) {
|
||||
|
@ -449,7 +449,7 @@ void Worker::AddKeyByHashMod(const Key &key) {
|
|||
MS_LOG(EXCEPTION) << "Server number is invalid:0";
|
||||
}
|
||||
key_to_server_id_[key] = static_cast<int64_t>(key % server_num_);
|
||||
MS_LOG(INFO) << "The server id of key " << key << " is " << key_to_server_id_[key];
|
||||
MS_LOG(DEBUG) << "The server id of key " << key << " is " << key_to_server_id_[key];
|
||||
}
|
||||
|
||||
void Worker::InitPSOptimId(const size_t param_key) {
|
||||
|
|
Loading…
Reference in New Issue