diff --git a/model_zoo/official/cv/cnn_direction_model/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/cnn_direction_model/scripts/run_distribute_train_gpu.sh
new file mode 100755
index 00000000000..5de92b472f8
--- /dev/null
+++ b/model_zoo/official/cv/cnn_direction_model/scripts/run_distribute_train_gpu.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Launch data-parallel training through mpirun with DEVICE_NUM processes.
+# Run from the directory holding train.py; output goes to ./train/train.log.
+if [ $# != 3 ] && [ $# != 4 ]
+then
+    echo "Usage:
+          bash scripts/run_distribute_train_gpu.sh [DEVICE_NUM] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+          "
+    exit 1
+fi
+
+# DEVICE_NUM must be a single integer between 1 and 8.
+if ! [[ "$1" =~ ^[1-8]$ ]]
+then
+    echo "error: DEVICE_NUM=$1 is not in (1-8)"
+    exit 1
+fi
+
+# Print $1 unchanged if it is absolute, otherwise resolve it against $PWD.
+get_real_path(){
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    realpath -m "$PWD/$1"
+  fi
+}
+
+PATH1=$(get_real_path "$3")
+
+if [ $# == 4 ]
+then
+    PATH2=$(get_real_path "$4")
+fi
+
+if [ ! -d "$PATH1" ]
+then
+    echo "error: DATASET_PATH=$PATH1 is not a directory"
+    exit 1
+fi
+
+if [ $# == 4 ] && [ ! -f "$PATH2" ]
+then
+    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
+    exit 1
+fi
+
+export DEVICE_NUM=$1
+export RANK_SIZE=$1
+
+# Start from a clean working directory; the absolute paths above survive the cd.
+if [ -d "./train" ];
+then
+    rm -rf ./train
+fi
+mkdir ./train
+cd ./train || exit
+
+export CUDA_VISIBLE_DEVICES="$2"
+
+if [ $# == 3 ]
+then
+    mpirun -n "$1" --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
+    python ../train.py \
+    --run_distribute=True \
+    --train_dataset_path="$PATH1" > train.log 2>&1 &
+fi
+
+if [ $# == 4 ]
+then
+    mpirun -n "$1" --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
+    python ../train.py \
+    --run_distribute=True \
+    --train_dataset_path="$PATH1" \
+    --pre_trained="$PATH2" > train.log 2>&1 &
+fi
diff --git a/model_zoo/official/cv/cnn_direction_model/scripts/run_standalone_eval_gpu.sh b/model_zoo/official/cv/cnn_direction_model/scripts/run_standalone_eval_gpu.sh
new file mode 100755
index 00000000000..ebe178f9db6
--- /dev/null
+++ b/model_zoo/official/cv/cnn_direction_model/scripts/run_standalone_eval_gpu.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Standalone evaluation launcher: run from the directory holding eval.py; logs to ./eval/eval.log.
+if [ $# != 2 ]; then
+    echo "Usage:
+          bash scripts/run_standalone_eval_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH]
+          "
+    exit 1
+fi
+
+# Print $1 unchanged if it is absolute, otherwise resolve it against $PWD.
+get_real_path(){
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    realpath -m "$PWD/$1"
+  fi
+}
+
+PATH1=$(get_real_path "$1")
+PATH2=$(get_real_path "$2")
+
+if [ ! -f "$PATH2" ]; then
+    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
+    exit 1
+fi
+
+# Start from a clean working directory; the absolute paths above survive the cd.
+if [ -d "./eval" ]; then
+    rm -rf ./eval
+fi
+mkdir ./eval
+cd ./eval || exit
+
+python ../eval.py --eval_dataset_path="$PATH1" --checkpoint_path="$PATH2" > eval.log 2>&1 &
diff --git a/model_zoo/official/cv/cnn_direction_model/scripts/run_standalone_train_gpu.sh b/model_zoo/official/cv/cnn_direction_model/scripts/run_standalone_train_gpu.sh
new file mode 100755
index 00000000000..7d5b5c9d820
--- /dev/null
+++ b/model_zoo/official/cv/cnn_direction_model/scripts/run_standalone_train_gpu.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Standalone training launcher: run from the directory holding train.py; logs to ./train/train.log.
+if [ $# != 1 ] && [ $# != 2 ]; then
+    echo "Usage:
+          bash scripts/run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+          "
+    exit 1
+fi
+
+# Print $1 unchanged if it is absolute, otherwise resolve it against $PWD.
+get_real_path(){
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    realpath -m "$PWD/$1"
+  fi
+}
+
+PATH1=$(get_real_path "$1")
+if [ ! -d "$PATH1" ]; then
+    echo "error: DATASET_PATH=$PATH1 is not a directory"
+    exit 1
+fi
+
+# The checkpoint is optional; validate it only when it was supplied.
+if [ $# == 2 ]; then
+    PATH2=$(get_real_path "$2")
+    if [ ! -f "$PATH2" ]; then
+        echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
+        exit 1
+    fi
+fi
+
+# Start from a clean working directory; the absolute paths above survive the cd.
+if [ -d "./train" ]; then
+    rm -rf ./train
+fi
+mkdir ./train
+cd ./train || exit
+
+if [ $# == 1 ]; then
+    python ../train.py --train_dataset_path="$PATH1" > train.log 2>&1 &
+fi
+
+if [ $# == 2 ]; then
+    python ../train.py --train_dataset_path="$PATH1" --pre_trained="$PATH2" > train.log 2>&1 &
+fi
diff --git a/model_zoo/official/cv/cnn_direction_model/train.py b/model_zoo/official/cv/cnn_direction_model/train.py
index 30b421ec61e..77ed32aed95 100644
--- a/model_zoo/official/cv/cnn_direction_model/train.py
+++ b/model_zoo/official/cv/cnn_direction_model/train.py
@@ -23,7 +23,7 @@ import mindspore as ms
 from mindspore import Tensor
 from mindspore import context
 from mindspore import dataset as de
-from mindspore.communication.management import init
+from mindspore.communication.management import init, get_rank
 from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
 from mindspore.nn.metrics import Accuracy
 from mindspore.nn.optim.adam import Adam
@@ -101,21 +101,32 @@ def train():
     target = config.device_target
     ckpt_save_dir = config.save_checkpoint_path

-    # init context
-    device_id = get_device_id()
-    rank_id = get_rank_id()
-    rank_size = get_device_num()
-    run_distribute = rank_size > 1
     context.set_context(mode=context.GRAPH_MODE,
                         device_target=target,
-                        device_id=device_id, save_graphs=False)
+                        save_graphs=False)
+    rank_size = get_device_num()
+    run_distribute = rank_size > 1
+    device_id = get_device_id()
+    # Default rank, so rank_id is always bound even for a target with no branch below.
+    rank_id = 0

+    if target == "Ascend":
+        # Ascend takes its rank from get_rank_id() and binds the device explicitly.
+        rank_id = get_rank_id()
+        context.set_context(device_id=device_id)
+
+        if run_distribute:
+            context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL)
+            init()
+    elif target == "GPU":
+        if run_distribute:
+            context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL)
+            init()
+            # The rank is queried only after init() has run.
+            rank_id = get_rank()
     print("train args: ", config, "\ncfg: ", config,
           "\nparallel args: rank_id {}, device_id {}, rank_size {}".format(rank_id, device_id, rank_size))

-    if run_distribute:
-        context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL)
-        init()

     config.rank_save_ckpt_flag = 0
     if config.is_save_on_master:
@@ -130,6 +141,8 @@ def train():
                                    ".mindrecord0", config=config, dataset_name=dataset_name)
     step_size = dataset.get_dataset_size()

+    print("step_size ", step_size, flush=True)
+
     # define net
     net = CNNDirectionModel([3, 64, 48, 48, 64], [64, 48, 48, 64, 64], [256, 64], [64, 512])
 