forked from mindspore-Ecosystem/mindspore
!19527 cnn_direction_model gpu version
Merge pull request !19527 from panfengfeng/add_cnn_direction_model_gpu_version
This commit is contained in:
commit
17c9b7397d
|
@ -0,0 +1,84 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
if [ $# != 3 ] && [ $# != 4 ]
|
||||
then
|
||||
echo "Usage:
|
||||
sh scripts/run_distribute_train_gpu.sh [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
|
||||
"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $1 -lt 1 ] && [ $1 -gt 8 ]
|
||||
then
|
||||
echo "error: DEVICE_NUM=$1 is not in (1-8)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
get_real_path(){
|
||||
if [ "${1:0:1}" == "/" ]; then
|
||||
echo "$1"
|
||||
else
|
||||
echo "$(realpath -m $PWD/$1)"
|
||||
fi
|
||||
}
|
||||
|
||||
PATH1=$(get_real_path $3)
|
||||
|
||||
if [ $# == 4 ]
|
||||
then
|
||||
PATH2=$(get_real_path $4)
|
||||
fi
|
||||
|
||||
if [ ! -d $PATH1 ]
|
||||
then
|
||||
echo "error: DATASET_PATH=$PATH1 is not a directory"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $# == 4 ] && [ ! -f $PATH2 ]
|
||||
then
|
||||
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
export DEVICE_NUM=$1
|
||||
export RANK_SIZE=$1
|
||||
|
||||
if [ -d "./train" ];
|
||||
then
|
||||
rm -rf ./train
|
||||
fi
|
||||
mkdir ./train
|
||||
cd ./train || exit
|
||||
|
||||
export CUDA_VISIBLE_DEVICES="$2"
|
||||
|
||||
if [ $# == 3 ]
|
||||
then
|
||||
mpirun -n $1 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
|
||||
python ../train.py \
|
||||
--run_distribute=True \
|
||||
--train_dataset_path=$PATH1 > train.log 2>&1 &
|
||||
fi
|
||||
|
||||
if [ $# == 4 ]
|
||||
then
|
||||
mpirun -n $1 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
|
||||
python ../train.py \
|
||||
--run_distribute=True \
|
||||
--train_dataset_path=$PATH1 \
|
||||
--pre_trained=$PATH2 > train.log 2>&1 &
|
||||
fi
|
|
@ -0,0 +1,48 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
if [ $# != 2 ]
|
||||
then
|
||||
echo "Usage:
|
||||
sh scripts/run_eval_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH]
|
||||
"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
get_real_path(){
|
||||
if [ "${1:0:1}" == "/" ]; then
|
||||
echo "$1"
|
||||
else
|
||||
echo "$(realpath -m $PWD/$1)"
|
||||
fi
|
||||
}
|
||||
|
||||
PATH1=$(get_real_path $1)
|
||||
PATH2=$(get_real_path $2)
|
||||
|
||||
if [ ! -f $PATH2 ]
|
||||
then
|
||||
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -d "./eval" ];
|
||||
then
|
||||
rm -rf ./eval
|
||||
fi
|
||||
mkdir ./eval
|
||||
cd ./eval || exit
|
||||
|
||||
python ../eval.py --eval_dataset_path=$PATH1 --checkpoint_path=$PATH2 > eval.log 2>&1 &
|
|
@ -0,0 +1,61 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
if [ $# != 1 ] && [ $# != 2 ]
|
||||
then
|
||||
echo "Usage:
|
||||
sh scripts/run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
|
||||
"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
get_real_path(){
|
||||
if [ "${1:0:1}" == "/" ]; then
|
||||
echo "$1"
|
||||
else
|
||||
echo "$(realpath -m $PWD/$1)"
|
||||
fi
|
||||
}
|
||||
|
||||
PATH1=$(get_real_path $1)
|
||||
|
||||
if [ $# == 2 ]
|
||||
then
|
||||
PATH2=$(get_real_path $2)
|
||||
fi
|
||||
|
||||
if [ $# == 2 ] && [ ! -f $PATH2 ]
|
||||
then
|
||||
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -d "./train" ];
|
||||
then
|
||||
rm -rf ./train
|
||||
fi
|
||||
mkdir ./train
|
||||
cd ./train || exit
|
||||
|
||||
|
||||
if [ $# == 1 ]
|
||||
then
|
||||
python ../train.py --train_dataset_path=$PATH1 > train.log 2>&1 &
|
||||
fi
|
||||
|
||||
if [ $# == 2 ]
|
||||
then
|
||||
python ../train.py --train_dataset_path=$PATH1 --pre_trained=$PATH2 > train.log 2>&1 &
|
||||
fi
|
|
@ -23,7 +23,7 @@ import mindspore as ms
|
|||
from mindspore import Tensor
|
||||
from mindspore import context
|
||||
from mindspore import dataset as de
|
||||
from mindspore.communication.management import init
|
||||
from mindspore.communication.management import init, get_rank
|
||||
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
|
||||
from mindspore.nn.metrics import Accuracy
|
||||
from mindspore.nn.optim.adam import Adam
|
||||
|
@ -101,21 +101,30 @@ def train():
|
|||
target = config.device_target
|
||||
ckpt_save_dir = config.save_checkpoint_path
|
||||
|
||||
# init context
|
||||
device_id = get_device_id()
|
||||
rank_id = get_rank_id()
|
||||
rank_size = get_device_num()
|
||||
run_distribute = rank_size > 1
|
||||
context.set_context(mode=context.GRAPH_MODE,
|
||||
device_target=target,
|
||||
device_id=device_id, save_graphs=False)
|
||||
save_graphs=False)
|
||||
rank_size = get_device_num()
|
||||
run_distribute = rank_size > 1
|
||||
device_id = get_device_id()
|
||||
|
||||
if target == "Ascend":
|
||||
# init context
|
||||
rank_id = get_rank_id()
|
||||
context.set_context(device_id=device_id)
|
||||
|
||||
if run_distribute:
|
||||
context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL)
|
||||
init()
|
||||
elif target == "GPU":
|
||||
rank_id = 0
|
||||
if run_distribute:
|
||||
context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL)
|
||||
init()
|
||||
rank_id = get_rank()
|
||||
print("train args: ", config, "\ncfg: ", config,
|
||||
"\nparallel args: rank_id {}, device_id {}, rank_size {}".format(rank_id, device_id, rank_size))
|
||||
|
||||
if run_distribute:
|
||||
context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL)
|
||||
init()
|
||||
|
||||
config.rank_save_ckpt_flag = 0
|
||||
if config.is_save_on_master:
|
||||
|
@ -130,6 +139,8 @@ def train():
|
|||
".mindrecord0", config=config, dataset_name=dataset_name)
|
||||
step_size = dataset.get_dataset_size()
|
||||
|
||||
print("step_size ", step_size, flush=True)
|
||||
|
||||
# define net
|
||||
net = CNNDirectionModel([3, 64, 48, 48, 64], [64, 48, 48, 64, 64], [256, 64], [64, 512])
|
||||
|
||||
|
|
Loading…
Reference in New Issue