forked from mindspore-Ecosystem/mindspore
commit
08f7e99c9e
|
@ -544,7 +544,11 @@ void GPUSession::UpdateOutputTensors(const VectorRef *outputs,
|
|||
// address, to prevent the tensor's device address context from being rewritten in the next step or next loop.
|
||||
// But one-time memory allocation scenarios need to be skipped, because the memory is not allocated again in the next step:
|
||||
// 1. Non cnode 2. Communication kernel.
|
||||
if (node->isa<CNode>() && !AnfAlgo::IsCommunicationOp(node) && !ps::PSContext::instance()->is_ps_mode()) {
|
||||
bool ps_mode = false;
|
||||
#if (ENABLE_CPU && !_WIN32)
|
||||
ps_mode = ps::PSContext::instance()->is_ps_mode();
|
||||
#endif
|
||||
if (node->isa<CNode>() && !AnfAlgo::IsCommunicationOp(node) && !ps_mode) {
|
||||
auto new_address = std::make_shared<device::gpu::GPUDeviceAddress>(nullptr, address->GetSize());
|
||||
AnfAlgo::SetOutputAddr(new_address, output_index, node.get());
|
||||
if (context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
|
||||
|
|
|
@ -44,6 +44,14 @@ namespace core {
|
|||
// Destructor: all teardown is delegated to Stop().
HttpServer::~HttpServer() {
  this->Stop();
}
|
||||
|
||||
bool HttpServer::InitServer() {
|
||||
if (server_address_ == "") {
|
||||
MS_LOG(INFO) << "The server ip is empty.";
|
||||
std::string interface;
|
||||
std::string server_ip;
|
||||
CommUtil::GetAvailableInterfaceAndIP(&interface, &server_ip);
|
||||
server_address_ = server_ip;
|
||||
}
|
||||
|
||||
if (!CommUtil::CheckIp(server_address_)) {
|
||||
MS_LOG(ERROR) << "The http server ip:" << server_address_ << " is illegal!";
|
||||
return false;
|
||||
|
|
|
@ -21,7 +21,7 @@ namespace ps {
|
|||
namespace core {
|
||||
bool FileConfiguration::Initialize() {
|
||||
if (!CommUtil::IsFileExists(file_path_)) {
|
||||
MS_LOG(ERROR) << "The file path:" << file_path_ << " is not exist.";
|
||||
MS_LOG(INFO) << "The file path:" << file_path_ << " is not exist.";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -25,6 +25,10 @@ uint32_t Node::rank_id() const { return node_info_.rank_id_; }
|
|||
|
||||
// Returns the role recorded in this node's NodeInfo (node_info_.node_role_).
NodeRole Node::role() const {
  const auto &info = node_info_;
  return info.node_role_;
}
|
||||
|
||||
// Returns the port recorded in this node's NodeInfo (node_info_.port_).
uint16_t Node::BoundPort() const {
  const auto &info = node_info_;
  return info.port_;
}
|
||||
|
||||
// Returns a copy of the ip recorded in this node's NodeInfo (node_info_.ip_).
std::string Node::BoundIp() const {
  const auto &info = node_info_;
  return info.ip_;
}
|
||||
|
||||
bool Node::WaitForStart(const uint32_t &timeout) {
|
||||
std::unique_lock<std::mutex> lock(wait_start_mutex_);
|
||||
bool res = wait_start_cond_.wait_for(lock, std::chrono::seconds(timeout), [&] {
|
||||
|
|
|
@ -64,6 +64,8 @@ class Node {
|
|||
std::string node_id() const;
|
||||
uint32_t rank_id() const;
|
||||
NodeRole role() const;
|
||||
uint16_t BoundPort() const;
|
||||
std::string BoundIp() const;
|
||||
|
||||
bool Wait(uint64_t request_id, const uint32_t &timeout = kCommTimeoutInSeconds);
|
||||
|
||||
|
|
|
@ -36,7 +36,7 @@ enum class ClusterEvent {
|
|||
};
|
||||
|
||||
struct NodeInfo {
|
||||
NodeInfo() : port_(0), node_role_(NodeRole::SCHEDULER), rank_id_(0), is_alive(false) {}
|
||||
// Default state: empty ip, port 0, SCHEDULER role, rank id 0, not alive.
NodeInfo()
    : ip_(""),
      port_(0),
      node_role_(NodeRole::SCHEDULER),
      rank_id_(0),
      is_alive(false) {}
|
||||
// ip
|
||||
std::string ip_;
|
||||
// the port of this node
|
||||
|
|
|
@ -154,7 +154,7 @@ bool Server::InitCommunicatorWithWorker() {
|
|||
communicators_with_worker_.push_back(tcp_comm);
|
||||
}
|
||||
if (use_http_) {
|
||||
auto http_comm = server_node_->GetOrCreateHttpComm("0.0.0.0", http_port_, task_executor_);
|
||||
auto http_comm = server_node_->GetOrCreateHttpComm(server_node_->BoundIp(), http_port_, task_executor_);
|
||||
MS_EXCEPTION_IF_NULL(http_comm);
|
||||
communicators_with_worker_.push_back(http_comm);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue