forked from mindspore-Ecosystem/mindspore
!40591 Reopen ps cases
Merge pull request !40591 from ZPaC/reopen-ps-cases
This commit is contained in:
commit 7fafa69610
@@ -31,7 +31,6 @@
 #include "mindspore/core/utils/file_utils.h"

 #ifdef WITH_BACKEND
-#include "ps/ps_context.h"
 #include "ps/core/node.h"
 #include "distributed/cluster/cluster_context.h"
 #endif
@@ -71,17 +70,6 @@ std::string GetCompileCacheDir() {

 std::string GetRole() {
 #ifdef WITH_BACKEND
-  const std::string &server_mode = ps::PSContext::instance()->server_mode();
-  if ((server_mode == ps::kServerModeFL || server_mode == ps::kServerModeHybrid) &&
-      ps::PSContext::instance()->is_server()) {
-    return kRoleServer;
-  }
-  if (ps::PSContext::instance()->is_server()) {
-    return kRolePServer;
-  }
-  if (ps::PSContext::instance()->is_scheduler()) {
-    return kRolePScheduler;
-  }
   if (distributed::cluster::ClusterContext::instance()->initialized()) {
     auto node = distributed::cluster::ClusterContext::instance()->node();
     MS_EXCEPTION_IF_NULL(node);
@@ -81,6 +81,7 @@ struct RpcActorSet {
   std::vector<RecvActorPtr> recv_actors_;
 };
 using RpcActorSetPtr = std::shared_ptr<RpcActorSet>;
+using RpcActorSetWeakPtr = std::weak_ptr<RpcActorSet>;
 #endif

 // The actor set generated by graph transformer is the execution unit of actor runtime.
@@ -249,8 +249,8 @@ void RpcActorStatusUpdater::set_rpc_actors(const RpcActorSetPtr &rpc_actors) {
 void RpcActorStatusUpdater::UpdateRpcActorStatus() const {
   // To ensure performance, only mux recv actor need to update ready status for embedding cache mode currently.
   if (is_embedding_cache_server()) {
-    MS_EXCEPTION_IF_NULL(rpc_actors_);
-    for (auto &recv_actor : rpc_actors_->recv_actors_) {
+    MS_EXCEPTION_IF_NULL(rpc_actors_.lock());
+    for (auto &recv_actor : rpc_actors_.lock()->recv_actors_) {
       MS_EXCEPTION_IF_NULL(recv_actor);
       recv_actor->UpdateStatus();
     }
@@ -105,7 +105,7 @@ class RpcActorStatusUpdater {
   DISABLE_COPY_AND_ASSIGN(RpcActorStatusUpdater);

   // Record rpc actors which need to update status.
-  RpcActorSetPtr rpc_actors_;
+  RpcActorSetWeakPtr rpc_actors_;
 };
 } // namespace runtime
 } // namespace mindspore
@@ -1931,8 +1931,10 @@ class Dataset:
         """
         # If this is in distributed execution mode,
         # the shard number and shard id might need to be updated according to the process's rank or role.
-        if _is_role_pserver() and _enable_distributed_mindrt():
-            num_shards = _get_ps_context("worker_num")
+        worker_num = _get_ps_context("worker_num")
+        server_num = _get_ps_context("server_num")
+        if _is_role_pserver() and _enable_distributed_mindrt() and (worker_num != server_num):
+            num_shards = worker_num
             shard_id = 0
         return num_shards, shard_id

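The hunk above changes how a parameter server process shards its dataset: instead of always taking one shard per worker, it now reshards only when the worker count and server count actually differ. Below is a minimal standalone sketch of that decision, assuming plain integers in place of the real _get_ps_context() lookups; the helper name is illustrative, not MindSpore API.

# A hedged sketch, not MindSpore code: the resharding decision implemented above,
# with plain values standing in for _get_ps_context() and the role checks.
def override_shards(num_shards, shard_id, is_pserver, distributed_mindrt,
                    worker_num, server_num):
    """Return the (num_shards, shard_id) a parameter server process should use."""
    # Reshard only on a server process, under the distributed MindRT backend,
    # and only when the worker and server counts differ.
    if is_pserver and distributed_mindrt and worker_num != server_num:
        num_shards = worker_num
        shard_id = 0
    return num_shards, shard_id

# Example: 8 workers, 1 server -> the server reads shard 0 of 8.
assert override_shards(1, 0, True, True, 8, 1) == (8, 0)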
@@ -27,6 +27,7 @@ from mindspore.nn.metrics import Accuracy
 from mindspore.train import Model
 from mindspore.train.callback import LossMonitor
 from mindspore.common.initializer import TruncatedNormal
+from mindspore.communication.management import init

 DATASET_PATH = "/home/workspace/mindspore_dataset/mnist"
 context.set_context(mode=context.GRAPH_MODE, enable_compile_cache=True, compile_cache_path=sys.argv[1])
@@ -120,6 +121,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1,


 if __name__ == "__main__":
+    init()
     network = LeNet5(10)
     network.set_param_ps()
     net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
@@ -192,9 +192,9 @@ def run_lenet_ps_twice(file_name, cache_path, log_file_name_first, log_file_name
     first_str_to_check = "Check the consistency of dependency files hash failed. Execute all the compilation actions."
     start_ps_subprocess(file_name, cache_path, first_str_to_check, log_file_name_first)
     assert os.path.exists(cache_path)
-    check_compile_cache_files(cache_path, "")
-    check_compile_cache_files(cache_path, "pserver_")
-    check_compile_cache_files(cache_path, "pscheduler_")
+    check_compile_cache_files(cache_path, "MS_WORKER")
+    check_compile_cache_files(cache_path, "MS_PSERVER")
+    check_compile_cache_files(cache_path, "MS_SCHED")
     # Second run
     os.environ['MS_SCHED_PORT'] = '8183'
     second_str_to_check = "Use the compilation cache and execute the backend actions only. Be aware of correctness" \
@@ -23,43 +23,58 @@ export MS_SERVER_NUM=$3
 export MS_SCHED_HOST=$4
 export MS_SCHED_PORT=$5
 export MS_ROLE=MS_SCHED
-for((i=0;i<1;i++));
-do
-  rm -rf ${execute_path}/sched_$i/
-  mkdir ${execute_path}/sched_$i/
-  cd ${execute_path}/sched_$i/ || exit
-  python ${self_path}/../test_cmp_sparse_embedding.py --device_target=$DEVICE_TARGET &
-done
+rm -rf ${execute_path}/sched/
+mkdir ${execute_path}/sched/
+cd ${execute_path}/sched/ || exit
+python ${self_path}/../test_cmp_sparse_embedding.py --device_target=$DEVICE_TARGET > sched.log 2>&1 &
+sched_pid=`echo $!`

 export MS_ROLE=MS_PSERVER
+server_pids=()
 for((i=0;i<$MS_SERVER_NUM;i++));
 do
   rm -rf ${execute_path}/server_$i/
   mkdir ${execute_path}/server_$i/
   cd ${execute_path}/server_$i/ || exit
-  python ${self_path}/../test_cmp_sparse_embedding.py --device_target=$DEVICE_TARGET &
+  python ${self_path}/../test_cmp_sparse_embedding.py --device_target=$DEVICE_TARGET > server_$i.log 2>&1 &
+  server_pids[${i}]=`echo $!`
 done

 export MS_ROLE=MS_WORKER
-process_pid=()
+worker_pids=()
 for((i=0;i<$MS_WORKER_NUM;i++));
 do
   rm -rf ${execute_path}/worker_$i/
   mkdir ${execute_path}/worker_$i/
   cd ${execute_path}/worker_$i/ || exit
-  python ${self_path}/../test_cmp_sparse_embedding.py --device_target=$DEVICE_TARGET &
-  process_pid[${i}]=`echo $!`
+  python ${self_path}/../test_cmp_sparse_embedding.py --device_target=$DEVICE_TARGET > worker_$i.log 2>&1 &
+  worker_pids[${i}]=`echo $!`
 done

 for((i=0; i<${MS_WORKER_NUM}; i++)); do
-  wait ${process_pid[i]}
+  wait ${worker_pids[i]}
   status=`echo $?`
   if [ "${status}" != "0" ]; then
-    echo "[ERROR] test_cmp_sparse_embedding failed. status: ${status}"
+    echo "[ERROR] test_cmp_sparse_embedding failed. Failed to wait worker_{$i}, status: ${status}"
     exit 1
-  else
-    echo "[INFO] test_cmp_sparse_embedding success."
   fi
 done

+for((i=0; i<${MS_SERVER_NUM}; i++)); do
+  wait ${server_pids[i]}
+  status=`echo $?`
+  if [ "${status}" != "0" ]; then
+    echo "[ERROR] test_cmp_sparse_embedding failed. Failed to wait server_{$i}, status: ${status}"
+    exit 1
+  fi
+done
+
+wait ${sched_pid}
+status=`echo $?`
+if [ "${status}" != "0" ]; then
+  echo "[ERROR] test_cmp_sparse_embedding failed. Failed to wait scheduler, status: ${status}"
+  exit 1
+fi

 exit 0
@@ -25,7 +25,7 @@ from mindspore.nn import TrainOneStepCell, WithLossCell
 from mindspore.nn.optim import Adam
 from mindspore.common import set_seed
 from mindspore.ops import operations as P
-from mindspore.parallel._ps_context import _is_role_worker
+from mindspore.parallel._ps_context import _is_role_worker, _is_role_sched
 from mindspore.communication.management import init

 parser = argparse.ArgumentParser(description="test_sparse_embedding")
@@ -56,7 +56,7 @@ class LeNet5(nn.Cell):


 def do_sparse_embedding(ps=False):
-    epoch = 10
+    epoch = 5
     net = LeNet5(10)
     if ps:
         net.embedding.embedding_table.set_param_ps()
@@ -71,9 +71,11 @@ def do_sparse_embedding(ps=False):
     for _ in range(epoch):
         data = Tensor(np.random.randint(-5, 15, (32, 3), np.int32))
         label = Tensor(np.random.randint(0, 9, (32), np.int32))
-        loss = train_network(data, label).asnumpy()
-        losses.append(loss)
-    print(losses)
+        loss = train_network(data, label)
+        if _is_role_sched():
+            return None
+        losses.append(loss.asnumpy())
+    print("Do sprase embedding losses:", losses)
     return losses


@@ -13,8 +13,25 @@
 # limitations under the License.
 # ============================================================================
 import os
+import pytest


+@pytest.mark.level0
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.env_single
 def test_cmp_sparse_embedding():
-    return_code = os.system("bash shell_run_test.sh GPU 1 1 127.0.0.1 8081")
+    """
+    Feature: Parameter Server.
+    Description: Test sparse optimizer for ps.
+    Expectation: success.
+    """
+    return_code = os.system("bash shell_run_test.sh Ascend 1 1 127.0.0.1 8081")
+    if return_code != 0:
+        os.system(f"echo '\n**************** Worker Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./worker*/worker*.log")
+        os.system(f"echo '\n**************** Server Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./server*/server*.log")
+        os.system(f"echo '\n**************** Scheduler Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./sched/sched.log")
     assert return_code == 0
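The reopened test above repeats a pattern used throughout this commit: run the shell harness with os.system, and on a non-zero exit echo matching error lines from the worker, server, and scheduler logs before asserting. A hedged sketch of that pattern factored into a reusable helper follows; the helper name and the log paths are illustrative, not part of the patch.

import os

def run_ps_case(cmd, log_globs):
    """Run a PS test harness; on failure, dump error lines from its logs."""
    return_code = os.system(cmd)
    if return_code != 0:
        for title, pattern in log_globs:
            os.system(f"echo '\n**************** {title} ****************'")
            os.system(f"grep -E 'ERROR|Error|error' {pattern}")
    assert return_code == 0

# Example usage mirroring test_cmp_sparse_embedding:
# run_ps_case("bash shell_run_test.sh Ascend 1 1 127.0.0.1 8081",
#             [("Worker Log", "./worker*/worker*.log"),
#              ("Server Log", "./server*/server*.log"),
#              ("Scheduler Log", "./sched/sched.log")])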
@@ -25,43 +25,58 @@ export MS_SCHED_HOST=$5
 export MS_SCHED_PORT=$6
 export MS_ROLE=MS_SCHED
 export fusion=True
-for((i=0;i<1;i++));
-do
-  rm -rf ${execute_path}/sched_$i/
-  mkdir ${execute_path}/sched_$i/
-  cd ${execute_path}/sched_$i/ || exit
-  python ${self_path}/../test_full_ps_lenet.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH &
-done
+rm -rf ${execute_path}/sched/
+mkdir ${execute_path}/sched/
+cd ${execute_path}/sched/ || exit
+python ${self_path}/../test_full_ps_lenet.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH > sched.log 2>&1 &
+sched_pid=`echo $!`

 export MS_ROLE=MS_PSERVER
+server_pids=()
 for((i=0;i<$MS_SERVER_NUM;i++));
 do
   rm -rf ${execute_path}/server_$i/
   mkdir ${execute_path}/server_$i/
   cd ${execute_path}/server_$i/ || exit
-  python ${self_path}/../test_full_ps_lenet.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH &
+  python ${self_path}/../test_full_ps_lenet.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH > server_$i.log 2>&1 &
+  server_pids[${i}]=`echo $!`
 done

 export MS_ROLE=MS_WORKER
-process_pid=()
+worker_pids=()
 for((i=0;i<$MS_WORKER_NUM;i++));
 do
   rm -rf ${execute_path}/worker_$i/
   mkdir ${execute_path}/worker_$i/
   cd ${execute_path}/worker_$i/ || exit
-  python ${self_path}/../test_full_ps_lenet.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH &
-  process_pid[${i}]=`echo $!`
+  python ${self_path}/../test_full_ps_lenet.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH > worker_$i.log 2>&1 &
+  worker_pids[${i}]=`echo $!`
 done

 for((i=0; i<${MS_WORKER_NUM}; i++)); do
-  wait ${process_pid[i]}
+  wait ${worker_pids[i]}
   status=`echo $?`
   if [ "${status}" != "0" ]; then
-    echo "[ERROR] test_full_ps_lenet failed. status: ${status}"
+    echo "[ERROR] test_full_ps_lenet failed. Failed to wait worker_{$i}, status: ${status}"
     exit 1
-  else
-    echo "[INFO] test_full_ps_lenet success."
   fi
 done

+for((i=0; i<${MS_SERVER_NUM}; i++)); do
+  wait ${server_pids[i]}
+  status=`echo $?`
+  if [ "${status}" != "0" ]; then
+    echo "[ERROR] test_full_ps_lenet failed. Failed to wait server_{$i}, status: ${status}"
+    exit 1
+  fi
+done
+
+wait ${sched_pid}
+status=`echo $?`
+if [ "${status}" != "0" ]; then
+  echo "[ERROR] test_full_ps_lenet failed. Failed to wait scheduler, status: ${status}"
+  exit 1
+fi

 exit 0
@@ -13,10 +13,27 @@
 # limitations under the License.
 # ============================================================================
 import os
+import pytest


-def test_full_ps_ascend_lenet():
+@pytest.mark.level0
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.env_single
+def test_full_ps_lenet():
+    """
+    Feature: Parameter Server.
+    Description: Test LeNet accuracy in ps mode.
+    Expectation: success.
+    """
     return_code = os.system(
-        "bash shell_run_test.sh GPU /home/workspace/mindspore_dataset/mnist 1 1 127.0.0.1 8082"
+        "bash shell_run_test.sh Ascend /home/workspace/mindspore_dataset/mnist 1 1 127.0.0.1 8082"
     )
+    if return_code != 0:
+        os.system(f"echo '\n**************** Worker Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./worker*/worker*.log")
+        os.system(f"echo '\n**************** Server Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./server*/server*.log")
+        os.system(f"echo '\n**************** Scheduler Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./sched/sched.log")
     assert return_code == 0
@@ -1,35 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-import os
-
-
-def test_ps_ascend_multi_worker_multi_server():
-    return_code = os.system("bash shell_run_test.sh Ascend 8 8 127.0.0.1 8088")
-    assert return_code == 0
-
-
-def test_ps_ascend():
-    return_code = os.system("bash shell_run_test.sh Ascend 1 1 127.0.0.1 8088")
-    assert return_code == 0
-
-
-def test_ps_gpu_multi_worker_multi_server():
-    return_code = os.system("bash shell_run_test.sh GPU 8 8 127.0.0.1 8088")
-    assert return_code == 0
-
-
-def test_ps_gpu():
-    return_code = os.system("bash shell_run_test.sh GPU 1 1 127.0.0.1 8088")
-    assert return_code == 0
@@ -1,283 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""ResNet."""
-import numpy as np
-import mindspore.nn as nn
-from mindspore.ops import operations as P
-from mindspore.common.tensor import Tensor
-
-
-def _weight_variable(shape, factor=0.01):
-    init_value = np.random.randn(*shape).astype(np.float32) * factor
-    return Tensor(init_value)
-
-
-def _conv3x3(in_channel, out_channel, stride=1):
-    weight_shape = (out_channel, in_channel, 3, 3)
-    weight = _weight_variable(weight_shape)
-    return nn.Conv2d(in_channel, out_channel,
-                     kernel_size=3, stride=stride, padding=0, pad_mode='same', weight_init=weight)
-
-
-def _conv1x1(in_channel, out_channel, stride=1):
-    weight_shape = (out_channel, in_channel, 1, 1)
-    weight = _weight_variable(weight_shape)
-    return nn.Conv2d(in_channel, out_channel,
-                     kernel_size=1, stride=stride, padding=0, pad_mode='same', weight_init=weight)
-
-
-def _conv7x7(in_channel, out_channel, stride=1):
-    weight_shape = (out_channel, in_channel, 7, 7)
-    weight = _weight_variable(weight_shape)
-    return nn.Conv2d(in_channel, out_channel,
-                     kernel_size=7, stride=stride, padding=0, pad_mode='same', weight_init=weight)
-
-
-def _bn(channel):
-    return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9,
-                          gamma_init=1, beta_init=0, moving_mean_init=0, moving_var_init=1)
-
-
-def _bn_last(channel):
-    return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9,
-                          gamma_init=0, beta_init=0, moving_mean_init=0, moving_var_init=1)
-
-
-def _fc(in_channel, out_channel):
-    weight_shape = (out_channel, in_channel)
-    weight = _weight_variable(weight_shape)
-    return nn.Dense(in_channel, out_channel, has_bias=True, weight_init=weight, bias_init=0)
-
-
-class ResidualBlock(nn.Cell):
-    """
-    ResNet V1 residual block definition.
-
-    Args:
-        in_channel (int): Input channel.
-        out_channel (int): Output channel.
-        stride (int): Stride size for the first convolutional layer. Default: 1.
-
-    Returns:
-        Tensor, output tensor.
-
-    Examples:
-        >>> ResidualBlock(3, 256, stride=2)
-    """
-    expansion = 4
-
-    def __init__(self,
-                 in_channel,
-                 out_channel,
-                 stride=1):
-        super(ResidualBlock, self).__init__()
-
-        channel = out_channel // self.expansion
-        self.conv1 = _conv1x1(in_channel, channel, stride=1)
-        self.bn1 = _bn(channel)
-
-        self.conv2 = _conv3x3(channel, channel, stride=stride)
-        self.bn2 = _bn(channel)
-
-        self.conv3 = _conv1x1(channel, out_channel, stride=1)
-        self.bn3 = _bn_last(out_channel)
-
-        self.relu = nn.ReLU()
-
-        self.down_sample = False
-
-        if stride != 1 or in_channel != out_channel:
-            self.down_sample = True
-        self.down_sample_layer = None
-
-        if self.down_sample:
-            self.down_sample_layer = nn.SequentialCell([_conv1x1(in_channel, out_channel, stride),
-                                                        _bn(out_channel)])
-        self.add = P.Add()
-
-    def construct(self, x):
-        identity = x
-
-        out = self.conv1(x)
-        out = self.bn1(out)
-        out = self.relu(out)
-
-        out = self.conv2(out)
-        out = self.bn2(out)
-        out = self.relu(out)
-
-        out = self.conv3(out)
-        out = self.bn3(out)
-
-        if self.down_sample:
-            identity = self.down_sample_layer(identity)
-
-        out = self.add(out, identity)
-        out = self.relu(out)
-
-        return out
-
-
-class ResNet(nn.Cell):
-    """
-    ResNet architecture.
-
-    Args:
-        block (Cell): Block for network.
-        layer_nums (list): Numbers of block in different layers.
-        in_channels (list): Input channel in each layer.
-        out_channels (list): Output channel in each layer.
-        strides (list): Stride size in each layer.
-        num_classes (int): The number of classes that the training images are belonging to.
-    Returns:
-        Tensor, output tensor.
-
-    Examples:
-        >>> ResNet(ResidualBlock,
-        >>>        [3, 4, 6, 3],
-        >>>        [64, 256, 512, 1024],
-        >>>        [256, 512, 1024, 2048],
-        >>>        [1, 2, 2, 2],
-        >>>        10)
-    """
-
-    def __init__(self,
-                 block,
-                 layer_nums,
-                 in_channels,
-                 out_channels,
-                 strides,
-                 num_classes):
-        super(ResNet, self).__init__()
-
-        if not len(layer_nums) == len(in_channels) == len(out_channels) == 4:
-            raise ValueError("the length of layer_num, in_channels, out_channels list must be 4!")
-
-        self.conv1 = _conv7x7(3, 64, stride=2)
-        self.bn1 = _bn(64)
-        self.relu = P.ReLU()
-        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")
-
-        self.layer1 = self._make_layer(block,
-                                       layer_nums[0],
-                                       in_channel=in_channels[0],
-                                       out_channel=out_channels[0],
-                                       stride=strides[0])
-        self.layer2 = self._make_layer(block,
-                                       layer_nums[1],
-                                       in_channel=in_channels[1],
-                                       out_channel=out_channels[1],
-                                       stride=strides[1])
-        self.layer3 = self._make_layer(block,
-                                       layer_nums[2],
-                                       in_channel=in_channels[2],
-                                       out_channel=out_channels[2],
-                                       stride=strides[2])
-        self.layer4 = self._make_layer(block,
-                                       layer_nums[3],
-                                       in_channel=in_channels[3],
-                                       out_channel=out_channels[3],
-                                       stride=strides[3])
-
-        self.mean = P.ReduceMean(keep_dims=True)
-        self.flatten = nn.Flatten()
-        self.end_point = _fc(out_channels[3], num_classes)
-
-    def _make_layer(self, block, layer_num, in_channel, out_channel, stride):
-        """
-        Make stage network of ResNet.
-
-        Args:
-            block (Cell): Resnet block.
-            layer_num (int): Layer number.
-            in_channel (int): Input channel.
-            out_channel (int): Output channel.
-            stride (int): Stride size for the first convolutional layer.
-
-        Returns:
-            SequentialCell, the output layer.
-
-        Examples:
-            >>> _make_layer(ResidualBlock, 3, 128, 256, 2)
-        """
-        layers = []
-
-        resnet_block = block(in_channel, out_channel, stride=stride)
-        layers.append(resnet_block)
-
-        for _ in range(1, layer_num):
-            resnet_block = block(out_channel, out_channel, stride=1)
-            layers.append(resnet_block)
-
-        return nn.SequentialCell(layers)
-
-    def construct(self, x):
-        x = self.conv1(x)
-        x = self.bn1(x)
-        x = self.relu(x)
-        c1 = self.maxpool(x)
-
-        c2 = self.layer1(c1)
-        c3 = self.layer2(c2)
-        c4 = self.layer3(c3)
-        c5 = self.layer4(c4)
-
-        out = self.mean(c5, (2, 3))
-        out = self.flatten(out)
-        out = self.end_point(out)
-
-        return out
-
-
-def resnet50(class_num=10):
-    """
-    Get ResNet50 neural network.
-
-    Args:
-        class_num (int): Class number.
-
-    Returns:
-        Cell, cell instance of ResNet50 neural network.
-
-    Examples:
-        >>> net = resnet50(10)
-    """
-    return ResNet(ResidualBlock,
-                  [3, 4, 6, 3],
-                  [64, 256, 512, 1024],
-                  [256, 512, 1024, 2048],
-                  [1, 2, 2, 2],
-                  class_num)
-
-
-def resnet101(class_num=1001):
-    """
-    Get ResNet101 neural network.
-
-    Args:
-        class_num (int): Class number.
-
-    Returns:
-        Cell, cell instance of ResNet101 neural network.
-
-    Examples:
-        >>> net = resnet101(1001)
-    """
-    return ResNet(ResidualBlock,
-                  [3, 4, 23, 3],
-                  [64, 256, 512, 1024],
-                  [256, 512, 1024, 2048],
-                  [1, 2, 2, 2],
-                  class_num)
@@ -1,80 +0,0 @@
-#!/bin/bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-execute_path=$(pwd)
-self_path=$(dirname $0)
-export MS_SCHED_NUM=1
-DEVICE_TARGET=$1
-export MS_WORKER_NUM=$2
-export MS_SERVER_NUM=$3
-export MS_SCHED_HOST=$4
-export MS_SCHED_PORT=$5
-
-export MS_ROLE=MS_SCHED
-for((i=0;i<1;i++));
-do
-  rm -rf ${execute_path}/sched_$i/
-  mkdir ${execute_path}/sched_$i/
-  cd ${execute_path}/sched_$i/ || exit
-  python ${self_path}/../test_multi_full_ps.py --device_target=$DEVICE_TARGET &
-done
-
-export MS_ROLE=MS_PSERVER
-for((i=0;i<$MS_SERVER_NUM;i++));
-do
-  rm -rf ${execute_path}/server_$i/
-  mkdir ${execute_path}/server_$i/
-  cd ${execute_path}/server_$i/ || exit
-  export RANK_ID=$i
-  export DEVICE_ID=$i
-  python ${self_path}/../test_multi_full_ps.py --device_target=$DEVICE_TARGET &
-done
-
-export MS_ROLE=MS_WORKER
-process_pid=()
-if [ $DEVICE_TARGET == "Ascend" ];then
-  for((i=0;i<$MS_WORKER_NUM;i++));
-  do
-    rm -rf ${execute_path}/worker_$i/
-    mkdir ${execute_path}/worker_$i/
-    cd ${execute_path}/worker_$i/ || exit
-    export RANK_ID=$i
-    export DEVICE_ID=$i
-    python ${self_path}/../test_multi_full_ps.py --device_target=$DEVICE_TARGET &
-    process_pid[${i}]=`echo $!`
-  done
-
-  for((i=0; i<${MS_WORKER_NUM}; i++)); do
-    wait ${process_pid[i]}
-    status=`echo $?`
-    if [ "${status}" != "0" ]; then
-      echo "[ERROR] test_multi_full_ps failed. status: ${status}"
-      exit 1
-    else
-      echo "[INFO] test_multi_full_ps success."
-    fi
-  done
-fi
-
-if [ $DEVICE_TARGET == "GPU" ];then
-  rm -rf ${execute_path}/worker/
-  mkdir ${execute_path}/worker/
-  cd ${execute_path}/worker/ || exit
-  mpirun -n $MS_WORKER_NUM python ${self_path}/../test_multi_full_ps.py --device_target=$DEVICE_TARGET &
-  wait $!
-fi
-
-exit 0
@@ -1,118 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-import sys
-import argparse
-import numpy as np
-
-import mindspore.context as context
-import mindspore.nn as nn
-from mindspore.common.initializer import TruncatedNormal
-from mindspore import Tensor
-from mindspore.nn import TrainOneStepCell, WithLossCell
-from mindspore.communication.management import init, get_group_size
-from mindspore.parallel._ps_context import _is_role_pserver
-# from resnet import resnet50
-
-parser = argparse.ArgumentParser(description="test_ps_lenet")
-parser.add_argument("--device_target", type=str, default="Ascend")
-args, _ = parser.parse_known_args()
-device_target = args.device_target
-context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
-context.set_ps_context(enable_ps=True)
-if device_target == "GPU":
-    init()
-
-
-def conv(in_channels, out_channels, kernel_size, stride=1, padding=0):
-    """weight initial for conv layer"""
-    weight = weight_variable()
-    return nn.Conv2d(
-        in_channels,
-        out_channels,
-        kernel_size=kernel_size,
-        stride=stride,
-        padding=padding,
-        weight_init=weight,
-        has_bias=False,
-        pad_mode="valid",
-    )
-
-
-def fc_with_initialize(input_channels, out_channels):
-    """weight initial for fc layer"""
-    weight = weight_variable()
-    bias = weight_variable()
-    return nn.Dense(input_channels, out_channels, weight, bias)
-
-
-def weight_variable():
-    """weight initial"""
-    return TruncatedNormal(0.02)
-
-
-class LeNet5(nn.Cell):
-    def __init__(self, num_class=10, channel=3):
-        super(LeNet5, self).__init__()
-        self.num_class = num_class
-        self.conv1 = conv(channel, 6, 5)
-        self.conv2 = conv(6, 16, 5)
-        self.fc1 = fc_with_initialize(16 * 5 * 5, 120)
-        self.fc2 = fc_with_initialize(120, 84)
-        self.fc3 = fc_with_initialize(84, self.num_class)
-        self.relu = nn.ReLU()
-        self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)
-        self.flatten = nn.Flatten()
-
-    def construct(self, x):
-        x = self.conv1(x)
-        x = self.relu(x)
-        x = self.max_pool2d(x)
-        x = self.conv2(x)
-        x = self.relu(x)
-        x = self.max_pool2d(x)
-        x = self.flatten(x)
-        x = self.fc1(x)
-        x = self.relu(x)
-        x = self.fc2(x)
-        x = self.relu(x)
-        x = self.fc3(x)
-        return x
-
-
-if __name__ == "__main__":
-    epoch = 5
-    np.random.seed(0)
-    network = LeNet5(10)
-    network.set_param_ps()
-    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
-    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
-    if device_target == "GPU":
-        context.set_auto_parallel_context(parallel_mode="data_parallel", gradients_mean=True,
-                                          device_num=get_group_size())
-    net_with_criterion = WithLossCell(network, criterion)
-    train_network = TrainOneStepCell(net_with_criterion, net_opt)
-    train_network.set_train()
-    losses = []
-    for _ in range(epoch):
-        data = Tensor(np.random.rand(32, 3, 32, 32).astype(np.float32))
-        label = Tensor(np.random.randint(0, 9, (32)).astype(np.int32))
-        if _is_role_pserver():
-            train_network(data, label)
-            sys.exit()
-        else:
-            loss = train_network(data, label).asnumpy()
-            losses.append(loss)
-    print(losses)
@@ -24,45 +24,59 @@ export MS_SERVER_NUM=$4
 export MS_SCHED_HOST=$5
 export MS_SCHED_PORT=$6
 export MS_ROLE=MS_SCHED
-export fusion=True
-for((i=0;i<1;i++));
-do
-  rm -rf ${execute_path}/sched_$i/
-  mkdir ${execute_path}/sched_$i/
-  cd ${execute_path}/sched_$i/ || exit
-  python ${self_path}/../test_ps_embedding_heterogeneous_conv2d_adam.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH &
-done
+rm -rf ${execute_path}/sched/
+mkdir ${execute_path}/sched/
+cd ${execute_path}/sched/ || exit
+python ${self_path}/../test_ps_embedding_heterogeneous_conv2d_adam.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH > sched.log 2>&1 &
+sched_pid=`echo $!`

 export MS_ROLE=MS_PSERVER
+server_pids=()
 for((i=0;i<$MS_SERVER_NUM;i++));
 do
   rm -rf ${execute_path}/server_$i/
   mkdir ${execute_path}/server_$i/
   cd ${execute_path}/server_$i/ || exit
-  python ${self_path}/../test_ps_embedding_heterogeneous_conv2d_adam.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH &
+  python ${self_path}/../test_ps_embedding_heterogeneous_conv2d_adam.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH > server_$i.log 2>&1 &
+  server_pids[${i}]=`echo $!`
 done

 export MS_ROLE=MS_WORKER
-process_pid=()
+worker_pids=()
 for((i=0;i<$MS_WORKER_NUM;i++));
 do
   rm -rf ${execute_path}/worker_$i/
   mkdir ${execute_path}/worker_$i/
   cd ${execute_path}/worker_$i/ || exit
-  python ${self_path}/../test_ps_embedding_heterogeneous_conv2d_adam.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH &
-  process_pid[${i}]=`echo $!`
+  python ${self_path}/../test_ps_embedding_heterogeneous_conv2d_adam.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH > worker_$i.log 2>&1 &
+  worker_pids[${i}]=`echo $!`
 done


 for((i=0; i<${MS_WORKER_NUM}; i++)); do
-  wait ${process_pid[i]}
+  wait ${worker_pids[i]}
   status=`echo $?`
   if [ "${status}" != "0" ]; then
-    echo "[ERROR] test_ps_embedding_heterogeneous_conv2d_adam failed. status: ${status}"
+    echo "[ERROR] test_ps_embedding_heterogeneous_conv2d_adam failed. Failed to wait worker_{$i}, status: ${status}"
     exit 1
-  else
-    echo "[INFO] test_ps_embedding_heterogeneous_conv2d_adam success."
   fi
 done

+for((i=0; i<${MS_SERVER_NUM}; i++)); do
+  wait ${server_pids[i]}
+  status=`echo $?`
+  if [ "${status}" != "0" ]; then
+    echo "[ERROR] test_ps_embedding_heterogeneous_conv2d_adam failed. Failed to wait server_{$i}, status: ${status}"
+    exit 1
+  fi
+done
+
+wait ${sched_pid}
+status=`echo $?`
+if [ "${status}" != "0" ]; then
+  echo "[ERROR] test_ps_embedding_heterogeneous_conv2d_adam failed. Failed to wait scheduler, status: ${status}"
+  exit 1
+fi

 exit 0
@@ -13,10 +13,27 @@
 # limitations under the License.
 # ============================================================================
 import os
+import pytest


+@pytest.mark.level0
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.env_single
 def test_ps_embedding_heterogeneous_conv2d_adam():
+    """
+    Feature: Parameter Server.
+    Description: Test embedding with heterogeneous conv2d.
+    Expectation: success.
+    """
     return_code = os.system(
-        "bash shell_run_test.sh GPU /home/workspace/mindspore_dataset/mnist 1 1 127.0.0.1 8085"
+        "bash shell_run_test.sh Ascend /home/workspace/mindspore_dataset/mnist 1 1 127.0.0.1 8085"
     )
+    if return_code != 0:
+        os.system(f"echo '\n**************** Worker Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./worker*/worker*.log")
+        os.system(f"echo '\n**************** Server Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./server*/server*.log")
+        os.system(f"echo '\n**************** Scheduler Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./sched/sched.log")
     assert return_code == 0
@@ -85,7 +85,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1,
     create dataset for train or test
     """
     # define dataset
-    mnist_ds = ds.MnistDataset(data_path)
+    mnist_ds = ds.MnistDataset(data_path, num_shards=512, shard_id=0)

     resize_height, resize_width = 32, 32
     rescale = 1.0 / 255.0