!17803 fixed terminate called without an activate exception

From: @anancds
Reviewed-by: @cristoval,@wilfchen,@limingqi107
Signed-off-by: @limingqi107
This commit is contained in:
mindspore-ci-bot 2021-06-05 14:54:48 +08:00 committed by Gitee
commit 729dbd752f
9 changed files with 34 additions and 21 deletions

View File

@ -20,6 +20,11 @@
namespace mindspore { namespace mindspore {
namespace ps { namespace ps {
namespace core { namespace core {
CommunicatorBase::~CommunicatorBase() {
running_ = false;
Join();
}
bool CommunicatorBase::SendResponse(const void *rsp_data, size_t rsp_len, std::shared_ptr<MessageHandler> msg_handler) { bool CommunicatorBase::SendResponse(const void *rsp_data, size_t rsp_len, std::shared_ptr<MessageHandler> msg_handler) {
// The rsp_len could be 0 because of ProtoBuffer's feature. // The rsp_len could be 0 because of ProtoBuffer's feature.
if (rsp_data == nullptr || msg_handler == nullptr) { if (rsp_data == nullptr || msg_handler == nullptr) {

View File

@ -42,9 +42,9 @@ class CommunicatorBase {
using OnNodeEventCallback = std::function<void(const ClusterEvent &)>; using OnNodeEventCallback = std::function<void(const ClusterEvent &)>;
using TcpMsgCallback = std::function<void(std::shared_ptr<core::TcpConnection> conn, using TcpMsgCallback = std::function<void(std::shared_ptr<core::TcpConnection> conn,
std::shared_ptr<core::MessageMeta> meta, DataPtr data, size_t size)>; std::shared_ptr<core::MessageMeta> meta, DataPtr data, size_t size)>;
CommunicatorBase() = default; CommunicatorBase() : running_(false) {}
virtual ~CommunicatorBase() = default; virtual ~CommunicatorBase();
virtual bool Start() = 0; virtual bool Start() = 0;
virtual bool Stop() = 0; virtual bool Stop() = 0;
@ -59,6 +59,7 @@ class CommunicatorBase {
protected: protected:
std::unordered_map<std::string, MessageCallback> msg_callbacks_; std::unordered_map<std::string, MessageCallback> msg_callbacks_;
std::thread running_thread_; std::thread running_thread_;
bool running_;
}; };
} // namespace core } // namespace core
} // namespace ps } // namespace ps

View File

@ -31,10 +31,8 @@ bool HttpCommunicator::Start() {
MS_LOG(INFO) << "Http communicator started."; MS_LOG(INFO) << "Http communicator started.";
running_thread_ = std::thread([&]() { running_thread_ = std::thread([&]() {
try { while (running_) {
http_server_->Wait(); std::this_thread::yield();
} catch (const std::exception &e) {
MsException::Instance().SetException();
} }
}); });
return true; return true;
@ -42,7 +40,9 @@ bool HttpCommunicator::Start() {
bool HttpCommunicator::Stop() { bool HttpCommunicator::Stop() {
MS_EXCEPTION_IF_NULL(http_server_); MS_EXCEPTION_IF_NULL(http_server_);
return http_server_->Stop(); bool res = http_server_->Stop();
running_ = false;
return res;
} }
void HttpCommunicator::RegisterMsgCallBack(const std::string &msg_type, const MessageCallback &cb) { void HttpCommunicator::RegisterMsgCallBack(const std::string &msg_type, const MessageCallback &cb) {

View File

@ -115,13 +115,17 @@ bool HttpServer::RegisterRoute(const std::string &url, OnRequestReceive *functio
return true; return true;
} }
bool HttpServer::Start() { bool HttpServer::Start(bool is_detach) {
MS_LOG(INFO) << "Start http server!"; MS_LOG(INFO) << "Start http server!";
for (size_t i = 0; i < thread_num_; i++) { for (size_t i = 0; i < thread_num_; i++) {
auto http_request_handler = std::make_shared<HttpRequestHandler>(); auto http_request_handler = std::make_shared<HttpRequestHandler>();
http_request_handler->Initialize(fd_, request_handlers_); http_request_handler->Initialize(fd_, request_handlers_);
http_request_handlers.push_back(http_request_handler); http_request_handlers.push_back(http_request_handler);
worker_threads_.emplace_back(std::make_shared<std::thread>(&HttpRequestHandler::Run, http_request_handler)); auto thread = std::make_shared<std::thread>(&HttpRequestHandler::Run, http_request_handler);
if (is_detach) {
thread->detach();
}
worker_threads_.emplace_back(thread);
} }
return true; return true;
} }

View File

@ -64,7 +64,7 @@ class HttpServer {
// Return: true if success, false if failed, check log to find failure reason // Return: true if success, false if failed, check log to find failure reason
bool RegisterRoute(const std::string &url, OnRequestReceive *func); bool RegisterRoute(const std::string &url, OnRequestReceive *func);
bool Start(); bool Start(bool is_detach = true);
bool Wait(); bool Wait();
bool Stop(); bool Stop();

View File

@ -65,7 +65,6 @@ class TcpCommunicator : public CommunicatorBase {
public: public:
explicit TcpCommunicator(const std::shared_ptr<TaskExecutor> &task_executor, ServerNode *node) explicit TcpCommunicator(const std::shared_ptr<TaskExecutor> &task_executor, ServerNode *node)
: task_executor_(task_executor), : task_executor_(task_executor),
running_(false),
server_num_(0), server_num_(0),
worker_num_(0), worker_num_(0),
scheduler_ip_(""), scheduler_ip_(""),
@ -109,7 +108,6 @@ class TcpCommunicator : public CommunicatorBase {
private: private:
std::shared_ptr<TaskExecutor> task_executor_; std::shared_ptr<TaskExecutor> task_executor_;
bool running_;
TcpMsgCallback tcp_msg_callback_; TcpMsgCallback tcp_msg_callback_;
OnNodeEventCallback event_callback_; OnNodeEventCallback event_callback_;

View File

@ -429,6 +429,7 @@ void SchedulerNode::ProcessScaleOut(std::shared_ptr<HttpMessageHandler> resp) {
nlohmann::json js; nlohmann::json js;
js["message"] = "Cluster begin to scale out."; js["message"] = "Cluster begin to scale out.";
resp->AddRespString(js.dump()); resp->AddRespString(js.dump());
resp->AddRespHeadParam("Content_Type", "application/json");
resp->SetRespCode(HTTP_OK); resp->SetRespCode(HTTP_OK);
resp->SendResponse(); resp->SendResponse();
@ -507,6 +508,7 @@ void SchedulerNode::ProcessScaleIn(std::shared_ptr<HttpMessageHandler> resp) {
nlohmann::json js; nlohmann::json js;
js["message"] = "Cluster begin to scale in."; js["message"] = "Cluster begin to scale in.";
resp->AddRespString(js.dump()); resp->AddRespString(js.dump());
resp->AddRespHeadParam("Content_Type", "application/json");
resp->SetRespCode(HTTP_OK); resp->SetRespCode(HTTP_OK);
resp->SendResponse(); resp->SendResponse();
@ -543,6 +545,7 @@ void SchedulerNode::ProcessGetNodesInfo(std::shared_ptr<HttpMessageHandler> resp
} }
resp->AddRespString(js.dump()); resp->AddRespString(js.dump());
resp->AddRespHeadParam("Content_Type", "application/json");
resp->SetRespCode(HTTP_OK); resp->SetRespCode(HTTP_OK);
resp->SendResponse(); resp->SendResponse();
@ -562,6 +565,7 @@ void SchedulerNode::ProcessGetClusterState(std::shared_ptr<HttpMessageHandler> r
js["cluster_state"] = CommUtil::ClusterStateToString(cluster_state); js["cluster_state"] = CommUtil::ClusterStateToString(cluster_state);
resp->AddRespString(js.dump()); resp->AddRespString(js.dump());
resp->AddRespHeadParam("Content_Type", "application/json");
resp->SetRespCode(HTTP_OK); resp->SetRespCode(HTTP_OK);
resp->SendResponse(); resp->SendResponse();
@ -601,6 +605,13 @@ RequestProcessResult SchedulerNode::CheckIfNodeIdLegal(const std::vector<std::st
ERROR_STATUS(result, RequestProcessResultCode::kInvalidInputs, error_message); ERROR_STATUS(result, RequestProcessResultCode::kInvalidInputs, error_message);
return result; return result;
} }
if (node_infos[val].node_role_ == NodeRole::WORKER) {
std::string error_message = "The node id:" + val + " is the role of worker, should not be scale in.";
MS_LOG(ERROR) << error_message;
ERROR_STATUS(result, RequestProcessResultCode::kInvalidInputs, error_message);
return result;
}
} }
return result; return result;
@ -628,7 +639,7 @@ void SchedulerNode::StartRestfulServer(const std::string &address, std::uint16_t
http_server_->InitServer(); http_server_->InitServer();
http_server_->Start(); http_server_->Start(false);
restful_thread_ = std::make_unique<std::thread>([&]() { http_server_->Wait(); }); restful_thread_ = std::make_unique<std::thread>([&]() { http_server_->Wait(); });
} }

View File

@ -77,6 +77,7 @@ void ServerNode::CreateTcpServer() {
MS_LOG(INFO) << "The server node start a tcp server!"; MS_LOG(INFO) << "The server node start a tcp server!";
this->server_->Start(); this->server_->Start();
}); });
server_thread_->detach();
} }
void ServerNode::Initialize() { void ServerNode::Initialize() {
@ -158,20 +159,13 @@ bool ServerNode::Stop() {
if (!is_already_stopped_.load()) { if (!is_already_stopped_.load()) {
is_already_stopped_ = true; is_already_stopped_ = true;
is_finish_ = true; is_finish_ = true;
if (heart_beat_thread_->joinable()) {
heart_beat_thread_->join();
}
client_to_scheduler_->Stop(); client_to_scheduler_->Stop();
if (!connected_nodes_.empty()) { if (!connected_nodes_.empty()) {
for (auto &connected_node : connected_nodes_) { for (auto &connected_node : connected_nodes_) {
connected_node.second->Stop(); connected_node.second->Stop();
} }
} }
if (client_to_scheduler_thread_->joinable()) {
client_to_scheduler_thread_->join();
}
server_->Stop(); server_->Stop();
server_thread_->join();
} }
return true; return true;
} }

View File

@ -68,6 +68,7 @@ void WorkerNode::CreateTcpServer() {
MS_LOG(INFO) << "The worker node start a tcp server!"; MS_LOG(INFO) << "The worker node start a tcp server!";
server_->Start(); server_->Start();
}); });
server_thread_->detach();
} }
bool WorkerNode::Stop() { bool WorkerNode::Stop() {
@ -82,7 +83,6 @@ bool WorkerNode::Stop() {
} }
} }
server_->Stop(); server_->Stop();
server_thread_->join();
is_already_stopped_ = true; is_already_stopped_ = true;
} }
return true; return true;