From 11b674ba4416851742d563e11dd1401e5b2333ef Mon Sep 17 00:00:00 2001
From: ZPaC
Date: Tue, 25 Oct 2022 16:08:04 +0800
Subject: [PATCH] Catch exception when timeout

---
 docs/api/api_python/mindspore/mindspore.Parameter.rst        | 4 +++-
 docs/api/api_python/mindspore/mindspore.set_ps_context.rst   | 3 ++-
 docs/api/api_python/nn/mindspore.nn.Cell.rst                 | 4 +++-
 .../ccsrc/distributed/cluster/topology/compute_graph_node.cc | 1 -
 mindspore/python/mindspore/common/parameter.py               | 5 +++++
 mindspore/python/mindspore/context.py                        | 1 +
 mindspore/python/mindspore/nn/cell.py                        | 1 +
 mindspore/python/mindspore/parallel/_ps_context.py           | 3 ---
 8 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/docs/api/api_python/mindspore/mindspore.Parameter.rst b/docs/api/api_python/mindspore/mindspore.Parameter.rst
index fef2009a35d..b85eeccadd8 100644
--- a/docs/api/api_python/mindspore/mindspore.Parameter.rst
+++ b/docs/api/api_python/mindspore/mindspore.Parameter.rst
@@ -151,7 +151,9 @@
 
     表示可训练参数是否由参数服务器更新，以及可训练参数是否在服务器上初始化。
 
-    .. note:: 仅当运行的任务处于参数服务器模式下有效。
+    .. note::
+        仅当运行的任务处于参数服务器模式下有效。
+        只支持在图模式下调用。
 
     参数：
         - **init_in_server** (bool) - 表示参数服务器更新的可训练参数是否在服务器上初始化。默认值：False。
diff --git a/docs/api/api_python/mindspore/mindspore.set_ps_context.rst b/docs/api/api_python/mindspore/mindspore.set_ps_context.rst
index bf002197529..d7a1a68c4c5 100644
--- a/docs/api/api_python/mindspore/mindspore.set_ps_context.rst
+++ b/docs/api/api_python/mindspore/mindspore.set_ps_context.rst
@@ -6,7 +6,8 @@ mindspore.set_ps_context
     设置参数服务器训练模式的上下文。
 
     .. note::
-        需要给参数服务器训练模式设置其他的环境变量。些环境变量如下所示：
+        参数服务器训练模式只在图模式下支持。
+        需要给参数服务器训练模式设置其他的环境变量。这些环境变量如下所示：
 
         - MS_SERVER_NUM：表示参数服务器数量。
         - MS_WORKER_NUM：表示工作进程数量。
diff --git a/docs/api/api_python/nn/mindspore.nn.Cell.rst b/docs/api/api_python/nn/mindspore.nn.Cell.rst
index 7ee6f9bd64f..84c2f7bff50 100644
--- a/docs/api/api_python/nn/mindspore.nn.Cell.rst
+++ b/docs/api/api_python/nn/mindspore.nn.Cell.rst
@@ -517,7 +517,9 @@
 
     设置可训练参数是否由参数服务器更新，以及是否在服务器上初始化可训练参数。
 
-    .. note:: 只在运行的任务处于参数服务器模式时有效。
+    .. note::
+        只在运行的任务处于参数服务器模式时有效。
+        只支持在图模式下调用。
 
     参数：
         - **recurse** (bool) - 是否设置子网络的可训练参数。默认值：True。
diff --git a/mindspore/ccsrc/distributed/cluster/topology/compute_graph_node.cc b/mindspore/ccsrc/distributed/cluster/topology/compute_graph_node.cc
index f789bbed317..6a60c9f76d7 100644
--- a/mindspore/ccsrc/distributed/cluster/topology/compute_graph_node.cc
+++ b/mindspore/ccsrc/distributed/cluster/topology/compute_graph_node.cc
@@ -285,7 +285,6 @@ bool ComputeGraphNode::Heartbeat() {
       delete response;
       MS_LOG(EXCEPTION) << "The state of the cluster is error, total nodes num: " << nodes_num
                         << ", abnormal nodes num: " << abnormal_nodes_num;
-      return false;
     }
     delete response;
   }
diff --git a/mindspore/python/mindspore/common/parameter.py b/mindspore/python/mindspore/common/parameter.py
index 912ea05dad7..790188fcbfa 100644
--- a/mindspore/python/mindspore/common/parameter.py
+++ b/mindspore/python/mindspore/common/parameter.py
@@ -317,6 +317,7 @@ class Parameter(Tensor_):
 
         Note:
             It only works when a running task is in the parameter server mode.
+            It is supported only in graph mode.
 
         Args:
             init_in_server (bool): Whether trainable parameter updated by parameter server is
@@ -327,6 +328,10 @@ class Parameter(Tensor_):
                                "1. context.set_ps_context(enable_ps=True) \n"
                                "2. export MS_ROLE environment variable \n"
                                "Please refer to the official website for detailed usage.")
+
+        if context.get_context("mode") == context.PYNATIVE_MODE:
+            raise RuntimeError("Parameter server training is not supported in pynative mode currently."
+                               "Please switch to graph mode and retry.")
         self.is_param_ps = True
         self.init_in_server = init_in_server
         self.param_info.init_in_server = init_in_server
diff --git a/mindspore/python/mindspore/context.py b/mindspore/python/mindspore/context.py
index ce5a8a28077..3b9bd70d28f 100644
--- a/mindspore/python/mindspore/context.py
+++ b/mindspore/python/mindspore/context.py
@@ -1074,6 +1074,7 @@ def set_ps_context(**kwargs):
     Set parameter server training mode context.
 
     Note:
+        Parameter server mode is only supported in graph mode.
         Some other environment variables should also be set for parameter server training mode.
         These environment variables are listed below:
 
diff --git a/mindspore/python/mindspore/nn/cell.py b/mindspore/python/mindspore/nn/cell.py
index 89425055b73..035d39009a5 100755
--- a/mindspore/python/mindspore/nn/cell.py
+++ b/mindspore/python/mindspore/nn/cell.py
@@ -1996,6 +1996,7 @@ class Cell(Cell_):
 
         Note:
             It only works when a running task is in the parameter server mode.
+            It is only supported in graph mode.
 
         Args:
             recurse (bool): Whether sets the trainable parameters of subcells. Default: True.
diff --git a/mindspore/python/mindspore/parallel/_ps_context.py b/mindspore/python/mindspore/parallel/_ps_context.py
index 1085de4fe1b..9416e875abf 100644
--- a/mindspore/python/mindspore/parallel/_ps_context.py
+++ b/mindspore/python/mindspore/parallel/_ps_context.py
@@ -48,9 +48,6 @@ def set_ps_enable(enable):
     """
     Set ps enable flag.
    """
-    if context.get_context("mode") == context.PYNATIVE_MODE:
-        raise RuntimeError("Parameter server is not supported in pynative mode currently.")
-
     ps_context().set_ps_enable(enable)
     # If this is Server or Scheduler and device target is Ascend, reset the target to CPU
     if _need_reset_device_target_for_ps(context.get_context("device_target")):
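The following sketch is not part of the patch; it only illustrates the usage the patch now enforces: parameter server training has to be configured under graph mode, and marking parameters with set_param_ps under PyNative mode raises the new RuntimeError. It assumes a parameter-server cluster environment (MS_ROLE, MS_WORKER_NUM, MS_SERVER_NUM, scheduler address) has already been exported as described in the set_ps_context notes, and nn.Dense is only a placeholder network.

    import mindspore as ms
    from mindspore import nn

    # Parameter server training is only supported in graph mode, so set it explicitly.
    ms.set_context(mode=ms.GRAPH_MODE)
    ms.set_ps_context(enable_ps=True)

    # Placeholder network; mark its trainable parameters to be updated by the parameter server.
    net = nn.Dense(16, 8)
    net.set_param_ps(recurse=True, init_in_server=False)

    # Under ms.PYNATIVE_MODE, set_param_ps() would instead raise the RuntimeError
    # added in mindspore/python/mindspore/common/parameter.py above.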