Fix partition bug

ZPaC 2022-10-19 14:48:28 +08:00
parent 51dfb054d0
commit e282ddfc01
8 changed files with 15 additions and 11 deletions

View File

@@ -199,12 +199,6 @@ void ClusterContext::InitNodeRole() {
MS_LOG(EXCEPTION) << "Role name '" << node_role_ << "' is invalid. " << kDetailedFailureReason;
}
-// If node role is valid, judge the execution mode.
-// MindSpore cluster does not support PyNative mode.
-if (MsContext::GetInstance()->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
-MS_LOG(EXCEPTION) << "PyNative mode is not supported in MindSpore cluster.";
-}
if (common::GetEnv(kEnvWorkerNum).empty()) {
if (node_role_ == kEnvRoleOfWorker) {
MS_LOG(EXCEPTION) << "Please set env 'WORKER_NUM' to a number greater than 0.";

View File

@@ -154,12 +154,13 @@ MessageBase *const MetaServerNode::HandleMessage(MessageBase *const message) {
return rpc::NULL_MSG;
}
const auto &result = (*message_handlers_[name])(message->Body());
-delete message;
if (result.length() > 0) {
auto rt_msg = CreateMessage(meta_server_addr_.GetUrl(), name, result);
MS_EXCEPTION_IF_NULL(rt_msg);
+delete message;
return rt_msg.release();
} else {
+delete message;
return rpc::NULL_MSG;
}
}

View File

@@ -426,6 +426,10 @@ class Primitive(Primitive_):
Validator.check_non_negative_int(rank_id, "rank_id", "Primitive.place")
Validator.check_string(role, "MS_WORKER", "role", "Primitive.place")
+if context.get_context("mode") == context.PYNATIVE_MODE:
+raise RuntimeError("You are calling Primitive.place in pynative mode."
+"It's only supported in graph mode. Please switch to graph mode.")
# Get the execution context and check whether calling of this 'place' method is valid.
# This is because placing operators to arbitrary processes while other distributed training mode
# is enabled is very unpredictable and may cause fatal error.

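A minimal sketch of the call path guarded above, assuming the place(role, rank_id) argument order implied by the Validator checks; MatMul is used here only as an illustrative primitive:

from mindspore import context, ops

# Primitive.place is now rejected under PyNative mode, so graph mode must be selected first.
context.set_context(mode=context.GRAPH_MODE)

matmul = ops.MatMul()
# Pin this operator instance to the worker process with rank 0.
matmul.place("MS_WORKER", 0)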
View File

@@ -48,6 +48,9 @@ def set_ps_enable(enable):
"""
Set ps enable flag.
"""
+if context.get_context("mode") == context.PYNATIVE_MODE:
+raise RuntimeError("Parameter server is not supported in pynative mode currently.")
ps_context().set_ps_enable(enable)
# If this is Server or Scheduler and device target is Ascend, reset the target to CPU
if _need_reset_device_target_for_ps(context.get_context("device_target")):

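A minimal caller-side sketch of the new requirement, mirroring the test updates later in this commit; the MS_PSERVER role value is taken from those tests and is illustrative:

import os
from mindspore import context

os.environ['MS_ROLE'] = 'MS_PSERVER'
# set_ps_enable now raises RuntimeError under PyNative mode,
# so switch to graph mode before enabling the parameter server.
context.set_context(mode=context.GRAPH_MODE)
context.set_ps_context(enable_ps=True)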
View File

@@ -16,7 +16,7 @@ import os
import pytest
-@pytest.mark.level2
+@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_single
def test_nccl_lenet():

View File

@@ -16,7 +16,7 @@ import os
import pytest
-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_x86_ascend_training
@pytest.mark.platform_arm_ascend_training
@pytest.mark.env_single
@@ -39,7 +39,7 @@ def test_full_ps_lenet_ascend():
assert return_code == 0
-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_full_ps_lenet_gpu():

View File

@@ -16,7 +16,7 @@ import os
import pytest
-@pytest.mark.level0
+@pytest.mark.level1
@pytest.mark.platform_x86_ascend_training
@pytest.mark.platform_arm_ascend_training
@pytest.mark.env_single

View File

@@ -28,6 +28,7 @@ def test_noop_pserver():
Expectation: Runs successfully
"""
os.environ['MS_ROLE'] = 'MS_PSERVER'
+context.set_context(mode=context.GRAPH_MODE)
context.set_ps_context(enable_ps=True)
data1 = ds.VOCDataset(DATA_DIR, task="Segmentation", usage="train", shuffle=False, decode=True)
num = 0
@@ -45,6 +45,7 @@ def test_noop_sched():
Expectation: Runs successfully
"""
os.environ['MS_ROLE'] = 'MS_SCHED'
+context.set_context(mode=context.GRAPH_MODE)
context.set_ps_context(enable_ps=True)
data1 = ds.VOCDataset(DATA_DIR, task="Segmentation", usage="train", shuffle=False, decode=True)
num = 0