!19527 cnn_direction_model gpu version

Merge pull request !19527 from panfengfeng/add_cnn_direction_model_gpu_version
This commit is contained in:
i-robot 2021-07-09 07:14:49 +00:00 committed by Gitee
commit 17c9b7397d
4 changed files with 214 additions and 10 deletions

View File

@ -0,0 +1,84 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Launch distributed GPU training for cnn_direction_model via mpirun.
# Args: DEVICE_NUM VISIBLE_DEVICES DATASET_PATH [PRETRAINED_CKPT_PATH]
if [ $# != 3 ] && [ $# != 4 ]
then
    echo "Usage:
          sh scripts/run_distribute_train_gpu.sh [DEVICE_NUM] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
         "
    exit 1
fi

# BUG FIX: the original test used '&&' ([ $1 -lt 1 ] && [ $1 -gt 8 ]), which
# can never be true, so an invalid device count was never rejected. A count is
# invalid when it is below 1 OR above 8.
if [ "$1" -lt 1 ] || [ "$1" -gt 8 ]
then
    echo "error: DEVICE_NUM=$1 is not in (1-8)"
    exit 1
fi

# Resolve a possibly-relative path to an absolute one.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m "$PWD/$1")"
    fi
}

PATH1=$(get_real_path "$3")
if [ $# == 4 ]
then
    PATH2=$(get_real_path "$4")
fi

# Validate inputs before doing any destructive work (rm -rf below).
if [ ! -d "$PATH1" ]
then
    echo "error: DATASET_PATH=$PATH1 is not a directory"
    exit 1
fi

if [ $# == 4 ] && [ ! -f "$PATH2" ]
then
    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
    exit 1
fi

export DEVICE_NUM=$1
export RANK_SIZE=$1

# Start from a clean working directory so stale logs/checkpoints never mix in.
if [ -d "./train" ];
then
    rm -rf ./train
fi
mkdir ./train
cd ./train || exit

export CUDA_VISIBLE_DEVICES="$2"

# Launch one training process per device; output goes to train.log.
if [ $# == 3 ]
then
    mpirun -n "$1" --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
           python ../train.py \
           --run_distribute=True \
           --train_dataset_path="$PATH1" > train.log 2>&1 &
fi

if [ $# == 4 ]
then
    mpirun -n "$1" --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
           python ../train.py \
           --run_distribute=True \
           --train_dataset_path="$PATH1" \
           --pre_trained="$PATH2" > train.log 2>&1 &
fi

View File

@ -0,0 +1,48 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Launch GPU evaluation for cnn_direction_model.
# Args: DATASET_PATH PRETRAINED_CKPT_PATH
if [ $# != 2 ]
then
    echo "Usage:
          sh scripts/run_eval_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH]
         "
    exit 1
fi

# Resolve a possibly-relative path to an absolute one.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m "$PWD/$1")"
    fi
}

PATH1=$(get_real_path "$1")
PATH2=$(get_real_path "$2")

# ROBUSTNESS FIX: the original validated only the checkpoint; a bad dataset
# path was silently accepted and eval.py failed later. Check it up front,
# consistent with run_distribute_train_gpu.sh.
if [ ! -d "$PATH1" ]
then
    echo "error: DATASET_PATH=$PATH1 is not a directory"
    exit 1
fi

if [ ! -f "$PATH2" ]
then
    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
    exit 1
fi

# Start from a clean working directory so stale logs never mix in.
if [ -d "./eval" ];
then
    rm -rf ./eval
fi
mkdir ./eval
cd ./eval || exit

# Run evaluation in the background; output goes to eval.log.
python ../eval.py --eval_dataset_path="$PATH1" --checkpoint_path="$PATH2" > eval.log 2>&1 &

View File

@ -0,0 +1,61 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Launch single-device GPU training for cnn_direction_model.
# Args: DATASET_PATH [PRETRAINED_CKPT_PATH]
if [ $# != 1 ] && [ $# != 2 ]
then
    echo "Usage:
          sh scripts/run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
         "
    exit 1
fi

# Resolve a possibly-relative path to an absolute one.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m "$PWD/$1")"
    fi
}

PATH1=$(get_real_path "$1")
if [ $# == 2 ]
then
    PATH2=$(get_real_path "$2")
fi

# ROBUSTNESS FIX: the original never validated the dataset path, unlike
# run_distribute_train_gpu.sh. Check it before the destructive rm -rf below.
if [ ! -d "$PATH1" ]
then
    echo "error: DATASET_PATH=$PATH1 is not a directory"
    exit 1
fi

if [ $# == 2 ] && [ ! -f "$PATH2" ]
then
    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
    exit 1
fi

# Start from a clean working directory so stale logs/checkpoints never mix in.
if [ -d "./train" ];
then
    rm -rf ./train
fi
mkdir ./train
cd ./train || exit

# Run training in the background; output goes to train.log.
if [ $# == 1 ]
then
    python ../train.py --train_dataset_path="$PATH1" > train.log 2>&1 &
fi

if [ $# == 2 ]
then
    python ../train.py --train_dataset_path="$PATH1" --pre_trained="$PATH2" > train.log 2>&1 &
fi

View File

@ -23,7 +23,7 @@ import mindspore as ms
from mindspore import Tensor
from mindspore import context
from mindspore import dataset as de
from mindspore.communication.management import init
from mindspore.communication.management import init, get_rank
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.nn.metrics import Accuracy
from mindspore.nn.optim.adam import Adam
@ -101,21 +101,30 @@ def train():
target = config.device_target
ckpt_save_dir = config.save_checkpoint_path
# init context
device_id = get_device_id()
rank_id = get_rank_id()
rank_size = get_device_num()
run_distribute = rank_size > 1
context.set_context(mode=context.GRAPH_MODE,
device_target=target,
device_id=device_id, save_graphs=False)
save_graphs=False)
rank_size = get_device_num()
run_distribute = rank_size > 1
device_id = get_device_id()
if target == "Ascend":
# init context
rank_id = get_rank_id()
context.set_context(device_id=device_id)
if run_distribute:
context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL)
init()
elif target == "GPU":
rank_id = 0
if run_distribute:
context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL)
init()
rank_id = get_rank()
print("train args: ", config, "\ncfg: ", config,
"\nparallel args: rank_id {}, device_id {}, rank_size {}".format(rank_id, device_id, rank_size))
if run_distribute:
context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL)
init()
config.rank_save_ckpt_flag = 0
if config.is_save_on_master:
@ -130,6 +139,8 @@ def train():
".mindrecord0", config=config, dataset_name=dataset_name)
step_size = dataset.get_dataset_size()
print("step_size ", step_size, flush=True)
# define net
net = CNNDirectionModel([3, 64, 48, 48, 64], [64, 48, 48, 64, 64], [256, 64], [64, 512])