forked from mindspore-Ecosystem/mindspore
!40591 Reopen ps cases
Merge pull request !40591 from ZPaC/reopen-ps-cases
This commit is contained in:
commit 7fafa69610
@@ -31,7 +31,6 @@
 #include "mindspore/core/utils/file_utils.h"

 #ifdef WITH_BACKEND
-#include "ps/ps_context.h"
 #include "ps/core/node.h"
 #include "distributed/cluster/cluster_context.h"
 #endif
@@ -71,17 +70,6 @@ std::string GetCompileCacheDir() {

 std::string GetRole() {
 #ifdef WITH_BACKEND
-  const std::string &server_mode = ps::PSContext::instance()->server_mode();
-  if ((server_mode == ps::kServerModeFL || server_mode == ps::kServerModeHybrid) &&
-      ps::PSContext::instance()->is_server()) {
-    return kRoleServer;
-  }
-  if (ps::PSContext::instance()->is_server()) {
-    return kRolePServer;
-  }
-  if (ps::PSContext::instance()->is_scheduler()) {
-    return kRolePScheduler;
-  }
   if (distributed::cluster::ClusterContext::instance()->initialized()) {
     auto node = distributed::cluster::ClusterContext::instance()->node();
     MS_EXCEPTION_IF_NULL(node);
@@ -81,6 +81,7 @@ struct RpcActorSet {
   std::vector<RecvActorPtr> recv_actors_;
 };
 using RpcActorSetPtr = std::shared_ptr<RpcActorSet>;
+using RpcActorSetWeakPtr = std::weak_ptr<RpcActorSet>;
 #endif

 // The actor set generated by graph transformer is the execution unit of actor runtime.
@@ -249,8 +249,8 @@ void RpcActorStatusUpdater::set_rpc_actors(const RpcActorSetPtr &rpc_actors) {
 void RpcActorStatusUpdater::UpdateRpcActorStatus() const {
   // To ensure performance, only mux recv actor need to update ready status for embedding cache mode currently.
   if (is_embedding_cache_server()) {
-    MS_EXCEPTION_IF_NULL(rpc_actors_);
-    for (auto &recv_actor : rpc_actors_->recv_actors_) {
+    MS_EXCEPTION_IF_NULL(rpc_actors_.lock());
+    for (auto &recv_actor : rpc_actors_.lock()->recv_actors_) {
       MS_EXCEPTION_IF_NULL(recv_actor);
       recv_actor->UpdateStatus();
     }
@@ -105,7 +105,7 @@ class RpcActorStatusUpdater {
   DISABLE_COPY_AND_ASSIGN(RpcActorStatusUpdater);

   // Record rpc actors which need to update status.
-  RpcActorSetPtr rpc_actors_;
+  RpcActorSetWeakPtr rpc_actors_;
 };
 } // namespace runtime
 } // namespace mindspore
@@ -1931,8 +1931,10 @@ class Dataset:
         """
         # If this is in distributed execution mode,
         # the shard number and shard id might need to be updated according to the process's rank or role.
-        if _is_role_pserver() and _enable_distributed_mindrt():
-            num_shards = _get_ps_context("worker_num")
+        worker_num = _get_ps_context("worker_num")
+        server_num = _get_ps_context("server_num")
+        if _is_role_pserver() and _enable_distributed_mindrt() and (worker_num != server_num):
+            num_shards = worker_num
             shard_id = 0
         return num_shards, shard_id

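The hunk above changes how a parameter server process shards its dataset: instead of always taking one shard per worker, it now reshards only when the worker count and server count actually differ. Below is a minimal standalone sketch of that decision, assuming plain integers in place of the real _get_ps_context() lookups; the helper name is illustrative, not MindSpore API.

# A hedged sketch, not MindSpore code: the resharding decision implemented above,
# with plain values standing in for _get_ps_context() and the role checks.
def override_shards(num_shards, shard_id, is_pserver, distributed_mindrt,
                    worker_num, server_num):
    """Return the (num_shards, shard_id) a parameter server process should use."""
    # Reshard only on a server process, under the distributed MindRT backend,
    # and only when the worker and server counts differ.
    if is_pserver and distributed_mindrt and worker_num != server_num:
        num_shards = worker_num
        shard_id = 0
    return num_shards, shard_id

# Example: 8 workers, 1 server -> the server reads shard 0 of 8.
assert override_shards(1, 0, True, True, 8, 1) == (8, 0)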
@@ -27,6 +27,7 @@ from mindspore.nn.metrics import Accuracy
 from mindspore.train import Model
 from mindspore.train.callback import LossMonitor
 from mindspore.common.initializer import TruncatedNormal
+from mindspore.communication.management import init

 DATASET_PATH = "/home/workspace/mindspore_dataset/mnist"
 context.set_context(mode=context.GRAPH_MODE, enable_compile_cache=True, compile_cache_path=sys.argv[1])
@@ -120,6 +121,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1,


 if __name__ == "__main__":
+    init()
     network = LeNet5(10)
     network.set_param_ps()
     net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
@@ -192,9 +192,9 @@ def run_lenet_ps_twice(file_name, cache_path, log_file_name_first, log_file_name
     first_str_to_check = "Check the consistency of dependency files hash failed. Execute all the compilation actions."
     start_ps_subprocess(file_name, cache_path, first_str_to_check, log_file_name_first)
     assert os.path.exists(cache_path)
-    check_compile_cache_files(cache_path, "")
-    check_compile_cache_files(cache_path, "pserver_")
-    check_compile_cache_files(cache_path, "pscheduler_")
+    check_compile_cache_files(cache_path, "MS_WORKER")
+    check_compile_cache_files(cache_path, "MS_PSERVER")
+    check_compile_cache_files(cache_path, "MS_SCHED")
     # Second run
     os.environ['MS_SCHED_PORT'] = '8183'
     second_str_to_check = "Use the compilation cache and execute the backend actions only. Be aware of correctness" \
@@ -23,43 +23,58 @@ export MS_SERVER_NUM=$3
 export MS_SCHED_HOST=$4
 export MS_SCHED_PORT=$5
 export MS_ROLE=MS_SCHED
-for((i=0;i<1;i++));
-do
-  rm -rf ${execute_path}/sched_$i/
-  mkdir ${execute_path}/sched_$i/
-  cd ${execute_path}/sched_$i/ || exit
-  python ${self_path}/../test_cmp_sparse_embedding.py --device_target=$DEVICE_TARGET &
-done
+rm -rf ${execute_path}/sched/
+mkdir ${execute_path}/sched/
+cd ${execute_path}/sched/ || exit
+python ${self_path}/../test_cmp_sparse_embedding.py --device_target=$DEVICE_TARGET > sched.log 2>&1 &
+sched_pid=`echo $!`

 export MS_ROLE=MS_PSERVER
+server_pids=()
 for((i=0;i<$MS_SERVER_NUM;i++));
 do
   rm -rf ${execute_path}/server_$i/
   mkdir ${execute_path}/server_$i/
   cd ${execute_path}/server_$i/ || exit
-  python ${self_path}/../test_cmp_sparse_embedding.py --device_target=$DEVICE_TARGET &
+  python ${self_path}/../test_cmp_sparse_embedding.py --device_target=$DEVICE_TARGET > server_$i.log 2>&1 &
+  server_pids[${i}]=`echo $!`
 done

 export MS_ROLE=MS_WORKER
-process_pid=()
+worker_pids=()
 for((i=0;i<$MS_WORKER_NUM;i++));
 do
   rm -rf ${execute_path}/worker_$i/
   mkdir ${execute_path}/worker_$i/
   cd ${execute_path}/worker_$i/ || exit
-  python ${self_path}/../test_cmp_sparse_embedding.py --device_target=$DEVICE_TARGET &
-  process_pid[${i}]=`echo $!`
+  python ${self_path}/../test_cmp_sparse_embedding.py --device_target=$DEVICE_TARGET > worker_$i.log 2>&1 &
+  worker_pids[${i}]=`echo $!`
 done

 for((i=0; i<${MS_WORKER_NUM}; i++)); do
-  wait ${process_pid[i]}
+  wait ${worker_pids[i]}
   status=`echo $?`
   if [ "${status}" != "0" ]; then
-    echo "[ERROR] test_cmp_sparse_embedding failed. status: ${status}"
+    echo "[ERROR] test_cmp_sparse_embedding failed. Failed to wait worker_{$i}, status: ${status}"
     exit 1
-  else
-    echo "[INFO] test_cmp_sparse_embedding success."
   fi
 done

+for((i=0; i<${MS_SERVER_NUM}; i++)); do
+  wait ${server_pids[i]}
+  status=`echo $?`
+  if [ "${status}" != "0" ]; then
+    echo "[ERROR] test_cmp_sparse_embedding failed. Failed to wait server_{$i}, status: ${status}"
+    exit 1
+  fi
+done
+
+wait ${sched_pid}
+status=`echo $?`
+if [ "${status}" != "0" ]; then
+  echo "[ERROR] test_cmp_sparse_embedding failed. Failed to wait scheduler, status: ${status}"
+  exit 1
+fi

 exit 0
@@ -25,7 +25,7 @@ from mindspore.nn import TrainOneStepCell, WithLossCell
 from mindspore.nn.optim import Adam
 from mindspore.common import set_seed
 from mindspore.ops import operations as P
-from mindspore.parallel._ps_context import _is_role_worker
+from mindspore.parallel._ps_context import _is_role_worker, _is_role_sched
 from mindspore.communication.management import init

 parser = argparse.ArgumentParser(description="test_sparse_embedding")
@@ -56,7 +56,7 @@ class LeNet5(nn.Cell):


 def do_sparse_embedding(ps=False):
-    epoch = 10
+    epoch = 5
     net = LeNet5(10)
     if ps:
         net.embedding.embedding_table.set_param_ps()
@@ -71,9 +71,11 @@ def do_sparse_embedding(ps=False):
     for _ in range(epoch):
         data = Tensor(np.random.randint(-5, 15, (32, 3), np.int32))
         label = Tensor(np.random.randint(0, 9, (32), np.int32))
-        loss = train_network(data, label).asnumpy()
-        losses.append(loss)
-    print(losses)
+        loss = train_network(data, label)
+        if _is_role_sched():
+            return None
+        losses.append(loss.asnumpy())
+    print("Do sprase embedding losses:", losses)
     return losses


@@ -13,8 +13,25 @@
 # limitations under the License.
 # ============================================================================
 import os
+import pytest


+@pytest.mark.level0
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.env_single
 def test_cmp_sparse_embedding():
-    return_code = os.system("bash shell_run_test.sh GPU 1 1 127.0.0.1 8081")
+    """
+    Feature: Parameter Server.
+    Description: Test sparse optimizer for ps.
+    Expectation: success.
+    """
+    return_code = os.system("bash shell_run_test.sh Ascend 1 1 127.0.0.1 8081")
+    if return_code != 0:
+        os.system(f"echo '\n**************** Worker Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./worker*/worker*.log")
+        os.system(f"echo '\n**************** Server Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./server*/server*.log")
+        os.system(f"echo '\n**************** Scheduler Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./sched/sched.log")
     assert return_code == 0
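The reopened test above repeats a pattern used throughout this commit: run the shell harness with os.system, and on a non-zero exit echo matching error lines from the worker, server, and scheduler logs before asserting. A hedged sketch of that pattern factored into a reusable helper follows; the helper name and the log paths are illustrative, not part of the patch.

import os

def run_ps_case(cmd, log_globs):
    """Run a PS test harness; on failure, dump error lines from its logs."""
    return_code = os.system(cmd)
    if return_code != 0:
        for title, pattern in log_globs:
            os.system(f"echo '\n**************** {title} ****************'")
            os.system(f"grep -E 'ERROR|Error|error' {pattern}")
    assert return_code == 0

# Example usage mirroring test_cmp_sparse_embedding:
# run_ps_case("bash shell_run_test.sh Ascend 1 1 127.0.0.1 8081",
#             [("Worker Log", "./worker*/worker*.log"),
#              ("Server Log", "./server*/server*.log"),
#              ("Scheduler Log", "./sched/sched.log")])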
@@ -25,43 +25,58 @@ export MS_SCHED_HOST=$5
 export MS_SCHED_PORT=$6
 export MS_ROLE=MS_SCHED
 export fusion=True
-for((i=0;i<1;i++));
-do
-  rm -rf ${execute_path}/sched_$i/
-  mkdir ${execute_path}/sched_$i/
-  cd ${execute_path}/sched_$i/ || exit
-  python ${self_path}/../test_full_ps_lenet.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH &
-done
+rm -rf ${execute_path}/sched/
+mkdir ${execute_path}/sched/
+cd ${execute_path}/sched/ || exit
+python ${self_path}/../test_full_ps_lenet.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH > sched.log 2>&1 &
+sched_pid=`echo $!`

 export MS_ROLE=MS_PSERVER
+server_pids=()
 for((i=0;i<$MS_SERVER_NUM;i++));
 do
   rm -rf ${execute_path}/server_$i/
   mkdir ${execute_path}/server_$i/
   cd ${execute_path}/server_$i/ || exit
-  python ${self_path}/../test_full_ps_lenet.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH &
+  python ${self_path}/../test_full_ps_lenet.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH > server_$i.log 2>&1 &
+  server_pids[${i}]=`echo $!`
 done

 export MS_ROLE=MS_WORKER
-process_pid=()
+worker_pids=()
 for((i=0;i<$MS_WORKER_NUM;i++));
 do
   rm -rf ${execute_path}/worker_$i/
   mkdir ${execute_path}/worker_$i/
   cd ${execute_path}/worker_$i/ || exit
-  python ${self_path}/../test_full_ps_lenet.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH &
-  process_pid[${i}]=`echo $!`
+  python ${self_path}/../test_full_ps_lenet.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH > worker_$i.log 2>&1 &
+  worker_pids[${i}]=`echo $!`
 done

 for((i=0; i<${MS_WORKER_NUM}; i++)); do
-  wait ${process_pid[i]}
+  wait ${worker_pids[i]}
   status=`echo $?`
   if [ "${status}" != "0" ]; then
-    echo "[ERROR] test_full_ps_lenet failed. status: ${status}"
+    echo "[ERROR] test_full_ps_lenet failed. Failed to wait worker_{$i}, status: ${status}"
     exit 1
-  else
-    echo "[INFO] test_full_ps_lenet success."
   fi
 done

+for((i=0; i<${MS_SERVER_NUM}; i++)); do
+  wait ${server_pids[i]}
+  status=`echo $?`
+  if [ "${status}" != "0" ]; then
+    echo "[ERROR] test_full_ps_lenet failed. Failed to wait server_{$i}, status: ${status}"
+    exit 1
+  fi
+done
+
+wait ${sched_pid}
+status=`echo $?`
+if [ "${status}" != "0" ]; then
+  echo "[ERROR] test_full_ps_lenet failed. Failed to wait scheduler, status: ${status}"
+  exit 1
+fi

 exit 0
@@ -13,10 +13,27 @@
 # limitations under the License.
 # ============================================================================
 import os
+import pytest


-def test_full_ps_ascend_lenet():
+@pytest.mark.level0
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.env_single
+def test_full_ps_lenet():
+    """
+    Feature: Parameter Server.
+    Description: Test LeNet accuracy in ps mode.
+    Expectation: success.
+    """
     return_code = os.system(
-        "bash shell_run_test.sh GPU /home/workspace/mindspore_dataset/mnist 1 1 127.0.0.1 8082"
+        "bash shell_run_test.sh Ascend /home/workspace/mindspore_dataset/mnist 1 1 127.0.0.1 8082"
     )
+    if return_code != 0:
+        os.system(f"echo '\n**************** Worker Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./worker*/worker*.log")
+        os.system(f"echo '\n**************** Server Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./server*/server*.log")
+        os.system(f"echo '\n**************** Scheduler Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./sched/sched.log")
     assert return_code == 0
@@ -1,35 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-import os
-
-
-def test_ps_ascend_multi_worker_multi_server():
-    return_code = os.system("bash shell_run_test.sh Ascend 8 8 127.0.0.1 8088")
-    assert return_code == 0
-
-
-def test_ps_ascend():
-    return_code = os.system("bash shell_run_test.sh Ascend 1 1 127.0.0.1 8088")
-    assert return_code == 0
-
-
-def test_ps_gpu_multi_worker_multi_server():
-    return_code = os.system("bash shell_run_test.sh GPU 8 8 127.0.0.1 8088")
-    assert return_code == 0
-
-
-def test_ps_gpu():
-    return_code = os.system("bash shell_run_test.sh GPU 1 1 127.0.0.1 8088")
-    assert return_code == 0
@@ -1,283 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""ResNet."""
-import numpy as np
-import mindspore.nn as nn
-from mindspore.ops import operations as P
-from mindspore.common.tensor import Tensor
-
-
-def _weight_variable(shape, factor=0.01):
-    init_value = np.random.randn(*shape).astype(np.float32) * factor
-    return Tensor(init_value)
-
-
-def _conv3x3(in_channel, out_channel, stride=1):
-    weight_shape = (out_channel, in_channel, 3, 3)
-    weight = _weight_variable(weight_shape)
-    return nn.Conv2d(in_channel, out_channel,
-                     kernel_size=3, stride=stride, padding=0, pad_mode='same', weight_init=weight)
-
-
-def _conv1x1(in_channel, out_channel, stride=1):
-    weight_shape = (out_channel, in_channel, 1, 1)
-    weight = _weight_variable(weight_shape)
-    return nn.Conv2d(in_channel, out_channel,
-                     kernel_size=1, stride=stride, padding=0, pad_mode='same', weight_init=weight)
-
-
-def _conv7x7(in_channel, out_channel, stride=1):
-    weight_shape = (out_channel, in_channel, 7, 7)
-    weight = _weight_variable(weight_shape)
-    return nn.Conv2d(in_channel, out_channel,
-                     kernel_size=7, stride=stride, padding=0, pad_mode='same', weight_init=weight)
-
-
-def _bn(channel):
-    return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9,
-                          gamma_init=1, beta_init=0, moving_mean_init=0, moving_var_init=1)
-
-
-def _bn_last(channel):
-    return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9,
-                          gamma_init=0, beta_init=0, moving_mean_init=0, moving_var_init=1)
-
-
-def _fc(in_channel, out_channel):
-    weight_shape = (out_channel, in_channel)
-    weight = _weight_variable(weight_shape)
-    return nn.Dense(in_channel, out_channel, has_bias=True, weight_init=weight, bias_init=0)
-
-
-class ResidualBlock(nn.Cell):
-    """
-    ResNet V1 residual block definition.
-
-    Args:
-        in_channel (int): Input channel.
-        out_channel (int): Output channel.
-        stride (int): Stride size for the first convolutional layer. Default: 1.
-
-    Returns:
-        Tensor, output tensor.
-
-    Examples:
-        >>> ResidualBlock(3, 256, stride=2)
-    """
-    expansion = 4
-
-    def __init__(self,
-                 in_channel,
-                 out_channel,
-                 stride=1):
-        super(ResidualBlock, self).__init__()
-
-        channel = out_channel // self.expansion
-        self.conv1 = _conv1x1(in_channel, channel, stride=1)
-        self.bn1 = _bn(channel)
-
-        self.conv2 = _conv3x3(channel, channel, stride=stride)
-        self.bn2 = _bn(channel)
-
-        self.conv3 = _conv1x1(channel, out_channel, stride=1)
-        self.bn3 = _bn_last(out_channel)
-
-        self.relu = nn.ReLU()
-
-        self.down_sample = False
-
-        if stride != 1 or in_channel != out_channel:
-            self.down_sample = True
-        self.down_sample_layer = None
-
-        if self.down_sample:
-            self.down_sample_layer = nn.SequentialCell([_conv1x1(in_channel, out_channel, stride),
-                                                        _bn(out_channel)])
-        self.add = P.Add()
-
-    def construct(self, x):
-        identity = x
-
-        out = self.conv1(x)
-        out = self.bn1(out)
-        out = self.relu(out)
-
-        out = self.conv2(out)
-        out = self.bn2(out)
-        out = self.relu(out)
-
-        out = self.conv3(out)
-        out = self.bn3(out)
-
-        if self.down_sample:
-            identity = self.down_sample_layer(identity)
-
-        out = self.add(out, identity)
-        out = self.relu(out)
-
-        return out
-
-
-class ResNet(nn.Cell):
-    """
-    ResNet architecture.
-
-    Args:
-        block (Cell): Block for network.
-        layer_nums (list): Numbers of block in different layers.
-        in_channels (list): Input channel in each layer.
-        out_channels (list): Output channel in each layer.
-        strides (list): Stride size in each layer.
-        num_classes (int): The number of classes that the training images are belonging to.
-    Returns:
-        Tensor, output tensor.
-
-    Examples:
-        >>> ResNet(ResidualBlock,
-        >>>        [3, 4, 6, 3],
-        >>>        [64, 256, 512, 1024],
-        >>>        [256, 512, 1024, 2048],
-        >>>        [1, 2, 2, 2],
-        >>>        10)
-    """
-
-    def __init__(self,
-                 block,
-                 layer_nums,
-                 in_channels,
-                 out_channels,
-                 strides,
-                 num_classes):
-        super(ResNet, self).__init__()
-
-        if not len(layer_nums) == len(in_channels) == len(out_channels) == 4:
-            raise ValueError("the length of layer_num, in_channels, out_channels list must be 4!")
-
-        self.conv1 = _conv7x7(3, 64, stride=2)
-        self.bn1 = _bn(64)
-        self.relu = P.ReLU()
-        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")
-
-        self.layer1 = self._make_layer(block,
-                                       layer_nums[0],
-                                       in_channel=in_channels[0],
-                                       out_channel=out_channels[0],
-                                       stride=strides[0])
-        self.layer2 = self._make_layer(block,
-                                       layer_nums[1],
-                                       in_channel=in_channels[1],
-                                       out_channel=out_channels[1],
-                                       stride=strides[1])
-        self.layer3 = self._make_layer(block,
-                                       layer_nums[2],
-                                       in_channel=in_channels[2],
-                                       out_channel=out_channels[2],
-                                       stride=strides[2])
-        self.layer4 = self._make_layer(block,
-                                       layer_nums[3],
-                                       in_channel=in_channels[3],
-                                       out_channel=out_channels[3],
-                                       stride=strides[3])
-
-        self.mean = P.ReduceMean(keep_dims=True)
-        self.flatten = nn.Flatten()
-        self.end_point = _fc(out_channels[3], num_classes)
-
-    def _make_layer(self, block, layer_num, in_channel, out_channel, stride):
-        """
-        Make stage network of ResNet.
-
-        Args:
-            block (Cell): Resnet block.
-            layer_num (int): Layer number.
-            in_channel (int): Input channel.
-            out_channel (int): Output channel.
-            stride (int): Stride size for the first convolutional layer.
-
-        Returns:
-            SequentialCell, the output layer.
-
-        Examples:
-            >>> _make_layer(ResidualBlock, 3, 128, 256, 2)
-        """
-        layers = []
-
-        resnet_block = block(in_channel, out_channel, stride=stride)
-        layers.append(resnet_block)
-
-        for _ in range(1, layer_num):
-            resnet_block = block(out_channel, out_channel, stride=1)
-            layers.append(resnet_block)
-
-        return nn.SequentialCell(layers)
-
-    def construct(self, x):
-        x = self.conv1(x)
-        x = self.bn1(x)
-        x = self.relu(x)
-        c1 = self.maxpool(x)
-
-        c2 = self.layer1(c1)
-        c3 = self.layer2(c2)
-        c4 = self.layer3(c3)
-        c5 = self.layer4(c4)
-
-        out = self.mean(c5, (2, 3))
-        out = self.flatten(out)
-        out = self.end_point(out)
-
-        return out
-
-
-def resnet50(class_num=10):
-    """
-    Get ResNet50 neural network.
-
-    Args:
-        class_num (int): Class number.
-
-    Returns:
-        Cell, cell instance of ResNet50 neural network.
-
-    Examples:
-        >>> net = resnet50(10)
-    """
-    return ResNet(ResidualBlock,
-                  [3, 4, 6, 3],
-                  [64, 256, 512, 1024],
-                  [256, 512, 1024, 2048],
-                  [1, 2, 2, 2],
-                  class_num)
-
-
-def resnet101(class_num=1001):
-    """
-    Get ResNet101 neural network.
-
-    Args:
-        class_num (int): Class number.
-
-    Returns:
-        Cell, cell instance of ResNet101 neural network.
-
-    Examples:
-        >>> net = resnet101(1001)
-    """
-    return ResNet(ResidualBlock,
-                  [3, 4, 23, 3],
-                  [64, 256, 512, 1024],
-                  [256, 512, 1024, 2048],
-                  [1, 2, 2, 2],
-                  class_num)
@@ -1,80 +0,0 @@
-#!/bin/bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-execute_path=$(pwd)
-self_path=$(dirname $0)
-export MS_SCHED_NUM=1
-DEVICE_TARGET=$1
-export MS_WORKER_NUM=$2
-export MS_SERVER_NUM=$3
-export MS_SCHED_HOST=$4
-export MS_SCHED_PORT=$5
-
-export MS_ROLE=MS_SCHED
-for((i=0;i<1;i++));
-do
-  rm -rf ${execute_path}/sched_$i/
-  mkdir ${execute_path}/sched_$i/
-  cd ${execute_path}/sched_$i/ || exit
-  python ${self_path}/../test_multi_full_ps.py --device_target=$DEVICE_TARGET &
-done
-
-export MS_ROLE=MS_PSERVER
-for((i=0;i<$MS_SERVER_NUM;i++));
-do
-  rm -rf ${execute_path}/server_$i/
-  mkdir ${execute_path}/server_$i/
-  cd ${execute_path}/server_$i/ || exit
-  export RANK_ID=$i
-  export DEVICE_ID=$i
-  python ${self_path}/../test_multi_full_ps.py --device_target=$DEVICE_TARGET &
-done
-
-export MS_ROLE=MS_WORKER
-process_pid=()
-if [ $DEVICE_TARGET == "Ascend" ];then
-  for((i=0;i<$MS_WORKER_NUM;i++));
-  do
-    rm -rf ${execute_path}/worker_$i/
-    mkdir ${execute_path}/worker_$i/
-    cd ${execute_path}/worker_$i/ || exit
-    export RANK_ID=$i
-    export DEVICE_ID=$i
-    python ${self_path}/../test_multi_full_ps.py --device_target=$DEVICE_TARGET &
-    process_pid[${i}]=`echo $!`
-  done
-
-  for((i=0; i<${MS_WORKER_NUM}; i++)); do
-    wait ${process_pid[i]}
-    status=`echo $?`
-    if [ "${status}" != "0" ]; then
-      echo "[ERROR] test_multi_full_ps failed. status: ${status}"
-      exit 1
-    else
-      echo "[INFO] test_multi_full_ps success."
-    fi
-  done
-fi
-
-if [ $DEVICE_TARGET == "GPU" ];then
-  rm -rf ${execute_path}/worker/
-  mkdir ${execute_path}/worker/
-  cd ${execute_path}/worker/ || exit
-  mpirun -n $MS_WORKER_NUM python ${self_path}/../test_multi_full_ps.py --device_target=$DEVICE_TARGET &
-  wait $!
-fi
-
-exit 0
@@ -1,118 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-import sys
-import argparse
-import numpy as np
-
-import mindspore.context as context
-import mindspore.nn as nn
-from mindspore.common.initializer import TruncatedNormal
-from mindspore import Tensor
-from mindspore.nn import TrainOneStepCell, WithLossCell
-from mindspore.communication.management import init, get_group_size
-from mindspore.parallel._ps_context import _is_role_pserver
-# from resnet import resnet50
-
-parser = argparse.ArgumentParser(description="test_ps_lenet")
-parser.add_argument("--device_target", type=str, default="Ascend")
-args, _ = parser.parse_known_args()
-device_target = args.device_target
-context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
-context.set_ps_context(enable_ps=True)
-if device_target == "GPU":
-    init()
-
-
-def conv(in_channels, out_channels, kernel_size, stride=1, padding=0):
-    """weight initial for conv layer"""
-    weight = weight_variable()
-    return nn.Conv2d(
-        in_channels,
-        out_channels,
-        kernel_size=kernel_size,
-        stride=stride,
-        padding=padding,
-        weight_init=weight,
-        has_bias=False,
-        pad_mode="valid",
-    )
-
-
-def fc_with_initialize(input_channels, out_channels):
-    """weight initial for fc layer"""
-    weight = weight_variable()
-    bias = weight_variable()
-    return nn.Dense(input_channels, out_channels, weight, bias)
-
-
-def weight_variable():
-    """weight initial"""
-    return TruncatedNormal(0.02)
-
-
-class LeNet5(nn.Cell):
-    def __init__(self, num_class=10, channel=3):
-        super(LeNet5, self).__init__()
-        self.num_class = num_class
-        self.conv1 = conv(channel, 6, 5)
-        self.conv2 = conv(6, 16, 5)
-        self.fc1 = fc_with_initialize(16 * 5 * 5, 120)
-        self.fc2 = fc_with_initialize(120, 84)
-        self.fc3 = fc_with_initialize(84, self.num_class)
-        self.relu = nn.ReLU()
-        self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)
-        self.flatten = nn.Flatten()
-
-    def construct(self, x):
-        x = self.conv1(x)
-        x = self.relu(x)
-        x = self.max_pool2d(x)
-        x = self.conv2(x)
-        x = self.relu(x)
-        x = self.max_pool2d(x)
-        x = self.flatten(x)
-        x = self.fc1(x)
-        x = self.relu(x)
-        x = self.fc2(x)
-        x = self.relu(x)
-        x = self.fc3(x)
-        return x
-
-
-if __name__ == "__main__":
-    epoch = 5
-    np.random.seed(0)
-    network = LeNet5(10)
-    network.set_param_ps()
-    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
-    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
-    if device_target == "GPU":
-        context.set_auto_parallel_context(parallel_mode="data_parallel", gradients_mean=True,
-                                          device_num=get_group_size())
-    net_with_criterion = WithLossCell(network, criterion)
-    train_network = TrainOneStepCell(net_with_criterion, net_opt)
-    train_network.set_train()
-    losses = []
-    for _ in range(epoch):
-        data = Tensor(np.random.rand(32, 3, 32, 32).astype(np.float32))
-        label = Tensor(np.random.randint(0, 9, (32)).astype(np.int32))
-        if _is_role_pserver():
-            train_network(data, label)
-            sys.exit()
-        else:
-            loss = train_network(data, label).asnumpy()
-            losses.append(loss)
-    print(losses)
@@ -24,45 +24,59 @@ export MS_SERVER_NUM=$4
 export MS_SCHED_HOST=$5
 export MS_SCHED_PORT=$6
 export MS_ROLE=MS_SCHED
-export fusion=True
-for((i=0;i<1;i++));
-do
-  rm -rf ${execute_path}/sched_$i/
-  mkdir ${execute_path}/sched_$i/
-  cd ${execute_path}/sched_$i/ || exit
-  python ${self_path}/../test_ps_embedding_heterogeneous_conv2d_adam.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH &
-done
+rm -rf ${execute_path}/sched/
+mkdir ${execute_path}/sched/
+cd ${execute_path}/sched/ || exit
+python ${self_path}/../test_ps_embedding_heterogeneous_conv2d_adam.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH > sched.log 2>&1 &
+sched_pid=`echo $!`

 export MS_ROLE=MS_PSERVER
+server_pids=()
 for((i=0;i<$MS_SERVER_NUM;i++));
 do
   rm -rf ${execute_path}/server_$i/
   mkdir ${execute_path}/server_$i/
   cd ${execute_path}/server_$i/ || exit
-  python ${self_path}/../test_ps_embedding_heterogeneous_conv2d_adam.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH &
+  python ${self_path}/../test_ps_embedding_heterogeneous_conv2d_adam.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH > server_$i.log 2>&1 &
+  server_pids[${i}]=`echo $!`
 done

 export MS_ROLE=MS_WORKER
-process_pid=()
+worker_pids=()
 for((i=0;i<$MS_WORKER_NUM;i++));
 do
   rm -rf ${execute_path}/worker_$i/
   mkdir ${execute_path}/worker_$i/
   cd ${execute_path}/worker_$i/ || exit
-  python ${self_path}/../test_ps_embedding_heterogeneous_conv2d_adam.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH &
-  process_pid[${i}]=`echo $!`
+  python ${self_path}/../test_ps_embedding_heterogeneous_conv2d_adam.py --device_target=$DEVICE_TARGET --dataset_path=$DATASET_PATH > worker_$i.log 2>&1 &
+  worker_pids[${i}]=`echo $!`
 done


 for((i=0; i<${MS_WORKER_NUM}; i++)); do
-  wait ${process_pid[i]}
+  wait ${worker_pids[i]}
   status=`echo $?`
   if [ "${status}" != "0" ]; then
-    echo "[ERROR] test_ps_embedding_heterogeneous_conv2d_adam failed. status: ${status}"
+    echo "[ERROR] test_ps_embedding_heterogeneous_conv2d_adam failed. Failed to wait worker_{$i}, status: ${status}"
     exit 1
-  else
-    echo "[INFO] test_ps_embedding_heterogeneous_conv2d_adam success."
   fi
 done

+for((i=0; i<${MS_SERVER_NUM}; i++)); do
+  wait ${server_pids[i]}
+  status=`echo $?`
+  if [ "${status}" != "0" ]; then
+    echo "[ERROR] test_ps_embedding_heterogeneous_conv2d_adam failed. Failed to wait server_{$i}, status: ${status}"
+    exit 1
+  fi
+done
+
+wait ${sched_pid}
+status=`echo $?`
+if [ "${status}" != "0" ]; then
+  echo "[ERROR] test_ps_embedding_heterogeneous_conv2d_adam failed. Failed to wait scheduler, status: ${status}"
+  exit 1
+fi

 exit 0
@@ -13,10 +13,27 @@
 # limitations under the License.
 # ============================================================================
 import os
+import pytest


+@pytest.mark.level0
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.env_single
 def test_ps_embedding_heterogeneous_conv2d_adam():
+    """
+    Feature: Parameter Server.
+    Description: Test embedding with heterogeneous conv2d.
+    Expectation: success.
+    """
     return_code = os.system(
-        "bash shell_run_test.sh GPU /home/workspace/mindspore_dataset/mnist 1 1 127.0.0.1 8085"
+        "bash shell_run_test.sh Ascend /home/workspace/mindspore_dataset/mnist 1 1 127.0.0.1 8085"
     )
+    if return_code != 0:
+        os.system(f"echo '\n**************** Worker Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./worker*/worker*.log")
+        os.system(f"echo '\n**************** Server Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./server*/server*.log")
+        os.system(f"echo '\n**************** Scheduler Log ****************'")
+        os.system(f"grep -E 'ERROR|Error|error' ./sched/sched.log")
     assert return_code == 0
@@ -85,7 +85,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1,
     create dataset for train or test
     """
     # define dataset
-    mnist_ds = ds.MnistDataset(data_path)
+    mnist_ds = ds.MnistDataset(data_path, num_shards=512, shard_id=0)

     resize_height, resize_width = 32, 32
     rescale = 1.0 / 255.0