!44542 Catch exception when timeout
Merge pull request !44542 from ZPaC/catch-exception-in-cgn
commit 18f73f81b1

@@ -151,7 +151,9 @@
 
         Indicates whether the trainable parameter is updated by the parameter server, and whether the trainable parameter is initialized on the server.
 
-        .. note:: This only takes effect when the running task is in parameter server mode.
+        .. note::
+            This only takes effect when the running task is in parameter server mode.
+            It can only be called in graph mode.
 
         Parameters:
             - **init_in_server** (bool) - Indicates whether the trainable parameter updated by the parameter server is initialized on the server. Default: False.
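
For orientation, a minimal usage sketch (not part of the patch): it assumes the process was launched as one worker of a parameter-server job with MS_ROLE, MS_SERVER_NUM and MS_WORKER_NUM already exported, and the parameter name and shape are illustrative.

    import numpy as np
    import mindspore as ms
    from mindspore import Parameter, Tensor

    # Parameter server training is only supported in graph mode (the restriction
    # this patch documents), so select graph mode before enabling PS.
    ms.set_context(mode=ms.GRAPH_MODE)
    ms.set_ps_context(enable_ps=True)

    # Mark an illustrative trainable parameter to be updated by the parameter
    # server; init_in_server=False keeps its initialization on the worker side.
    weight = Parameter(Tensor(np.ones((2, 2)), ms.float32), name="weight")
    weight.set_param_ps(init_in_server=False)
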
@@ -6,7 +6,8 @@ mindspore.set_ps_context
     Set the context for parameter server training mode.
 
     .. note::
-        Some other environment variables also need to be set for parameter server training mode. Environment variables are listed below:
+        Parameter server training mode is only supported in graph mode.
+        Some other environment variables also need to be set for parameter server training mode. These environment variables are listed below:
 
         - MS_SERVER_NUM: the number of parameter servers.
         - MS_WORKER_NUM: the number of worker processes.
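
As a rough illustration of those environment variables (values are placeholders; in a real job each process usually receives them from the launch script via export rather than setting them in Python):

    import os

    # Placeholder cluster topology for a parameter-server job.
    os.environ["MS_SERVER_NUM"] = "1"    # number of parameter servers
    os.environ["MS_WORKER_NUM"] = "2"    # number of worker processes
    os.environ["MS_ROLE"] = "MS_WORKER"  # role of this process (a worker here)
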
@@ -517,7 +517,9 @@
 
         Set whether the trainable parameters are updated by the parameter server, and whether the trainable parameters are initialized on the server.
 
-        .. note:: This is only effective when the running task is in parameter server mode.
+        .. note::
+            This is only effective when the running task is in parameter server mode.
+            It can only be called in graph mode.
 
         Parameters:
             - **recurse** (bool) - Whether to set the trainable parameters of subcells. Default: True.
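
A hedged sketch of the recursive variant on a cell, again assuming a configured parameter-server environment; the Dense layer and its sizes are illustrative, not from the patch.

    import mindspore as ms
    from mindspore import nn

    ms.set_context(mode=ms.GRAPH_MODE)   # must be graph mode, per the note above
    ms.set_ps_context(enable_ps=True)

    # recurse=True also marks the trainable parameters of all subcells
    # (here the Dense layer's weight and bias) for parameter-server updates.
    net = nn.Dense(16, 8)
    net.set_param_ps(recurse=True)
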
@@ -285,7 +285,6 @@ bool ComputeGraphNode::Heartbeat() {
         delete response;
         MS_LOG(EXCEPTION) << "The state of the cluster is error, total nodes num: " << nodes_num
                           << ", abnormal nodes num: " << abnormal_nodes_num;
-        return false;
       }
       delete response;
     }
@@ -317,6 +317,7 @@ class Parameter(Tensor_):
 
         Note:
             It only works when a running task is in the parameter server mode.
+            It is supported only in graph mode.
 
         Args:
             init_in_server (bool): Whether trainable parameter updated by parameter server is
@@ -327,6 +328,10 @@ class Parameter(Tensor_):
                                "1. context.set_ps_context(enable_ps=True) \n"
                                "2. export MS_ROLE environment variable \n"
                                "Please refer to the official website for detailed usage.")
+
+        if context.get_context("mode") == context.PYNATIVE_MODE:
+            raise RuntimeError("Parameter server training is not supported in pynative mode currently."
+                               "Please switch to graph mode and retry.")
         self.is_param_ps = True
         self.init_in_server = init_in_server
         self.param_info.init_in_server = init_in_server
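
To show the effect of the added check, a hedged sketch (assuming a launched parameter-server job, so the earlier role check passes): calling set_param_ps under PYNATIVE_MODE now raises a RuntimeError, while graph mode proceeds.

    import numpy as np
    import mindspore as ms
    from mindspore import Parameter, Tensor

    ms.set_ps_context(enable_ps=True)
    bias = Parameter(Tensor(np.zeros((4,)), ms.float32), name="bias")  # illustrative

    ms.set_context(mode=ms.PYNATIVE_MODE)
    try:
        bias.set_param_ps()
    except RuntimeError as err:
        print(err)  # the new "not supported in pynative mode" message

    ms.set_context(mode=ms.GRAPH_MODE)
    bias.set_param_ps()  # accepted once graph mode is selected
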
@@ -1074,6 +1074,7 @@ def set_ps_context(**kwargs):
     Set parameter server training mode context.
 
     Note:
+        Parameter server mode is only supported in graph mode.
        Some other environment variables should also be set for parameter server training mode.
        These environment variables are listed below:
 
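
A small usage sketch for set_ps_context itself; get_ps_context and reset_ps_context are the companion helpers in mindspore.context, shown here only for illustration and not touched by this patch.

    from mindspore import context

    context.set_context(mode=context.GRAPH_MODE)  # PS mode requires graph mode
    context.set_ps_context(enable_ps=True)        # enable parameter server training

    print(context.get_ps_context("enable_ps"))    # True
    context.reset_ps_context()                    # restore PS defaults afterwards
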
@@ -1996,6 +1996,7 @@ class Cell(Cell_):
 
         Note:
             It only works when a running task is in the parameter server mode.
+            It is only supported in graph mode.
 
         Args:
             recurse (bool): Whether sets the trainable parameters of subcells. Default: True.
@@ -48,9 +48,6 @@ def set_ps_enable(enable):
     """
     Set ps enable flag.
     """
-    if context.get_context("mode") == context.PYNATIVE_MODE:
-        raise RuntimeError("Parameter server is not supported in pynative mode currently.")
-
     ps_context().set_ps_enable(enable)
     # If this is Server or Scheduler and device target is Ascend, reset the target to CPU
     if _need_reset_device_target_for_ps(context.get_context("device_target")):