Add MobileNetV1 CPU Script

This commit is contained in:
wuxuejian 2021-03-29 15:09:55 +08:00
parent 12a29ce040
commit 421c31858c
6 changed files with 179 additions and 39 deletions

View File

@ -99,6 +99,7 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil
You can start training using python or shell scripts. The usage of shell scripts as follows:
- Ascend: sh run_distribute_train.sh [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH] (optional)
- CPU: sh run_train_CPU.sh [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH] (optional)
For distributed training, a hccl configuration file with JSON format needs to be created in advance.
@ -109,10 +110,12 @@ Please follow the instructions in the link [hccn_tools](https://gitee.com/mindsp
```shell
# training example
python:
Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH]
Ascend: python train.py --device_target Ascend --dataset_path [TRAIN_DATASET_PATH]
CPU: python train.py --device_target CPU --dataset_path [TRAIN_DATASET_PATH]
shell:
Ascend: sh run_distribute_train.sh [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
CPU: sh run_train_CPU.sh [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
```
### Result
@ -133,6 +136,7 @@ Epoch time: 150950.623, per step time: 120.664
You can start training using python or shell scripts.If the train method is train or fine tune, should not input the `[CHECKPOINT_PATH]` The usage of shell scripts as follows:
- Ascend: sh run_eval.sh [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
- CPU: sh run_eval_CPU.sh [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
### Launch
@ -140,9 +144,11 @@ You can start training using python or shell scripts.If the train method is trai
# eval example
python:
Ascend: python eval.py --dataset [cifar10|imagenet2012] --dataset_path [VAL_DATASET_PATH] --pretrain_ckpt [CHECKPOINT_PATH]
CPU: python eval.py --dataset [cifar10|imagenet2012] --dataset_path [VAL_DATASET_PATH] --pretrain_ckpt [CHECKPOINT_PATH] --device_target CPU
shell:
Ascend: sh run_eval.sh [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
CPU: sh run_eval_CPU.sh [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
```
> checkpoint can be produced in training process.

View File

@ -45,7 +45,7 @@ if __name__ == '__main__':
# init context
context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
if target != "GPU":
if target == "Ascend":
device_id = int(os.getenv('DEVICE_ID'))
context.set_context(device_id=device_id)

View File

@ -0,0 +1,64 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 3 ]
then
echo "Usage: bash run_eval_cpu.sh [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]"
exit 1
fi
if [ $1 != "cifar10" ] && [ $1 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
PATH1=$(get_real_path $2)
PATH2=$(get_real_path $3)
if [ ! -d $PATH1 ]
then
echo "error: DATASET_PATH=$PATH1 is not a directory"
exit 1
fi
if [ ! -f $PATH2 ]
then
echo "error: CHECKPOINT_PATH=$PATH2 is not a file"
exit 1
fi
if [ -d "eval" ];
then
rm -rf ./eval
fi
mkdir ./eval
cp ../*.py ./eval
cp *.sh ./eval
cp -r ../src ./eval
cd ./eval || exit
env > env.log
python eval.py --dataset=$1 --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target=CPU &> log &
cd ..

View File

@ -0,0 +1,75 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 2 ] && [ $# != 3 ]
then
echo "Usage: bash run_train_cpu.sh [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
fi
if [ $1 != "cifar10" ] && [ $1 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
PATH1=$(get_real_path $2)
if [ $# == 3 ]
then
PATH2=$(get_real_path $3)
fi
if [ ! -d $PATH1 ]
then
echo "error: DATASET_PATH=$PATH1 is not a directory"
exit 1
fi
if [ $# == 3 ] && [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
exit 1
fi
if [ -d "train" ];
then
rm -rf ./train
fi
mkdir ./train
cp ../*.py ./train
cp *.sh ./train
cp -r ../src ./train
cd ./train || exit
env > env.log
if [ $# == 2 ]
then
python train.py --dataset=$1 --dataset_path=$PATH1 --device_target=CPU &> log &
fi
if [ $# == 3 ]
then
python train.py --dataset=$1 --dataset_path=$PATH1 --pre_trained=$PATH2 --device_target=CPU &> log &
fi
cd ..

View File

@ -16,12 +16,15 @@
create train or eval dataset.
"""
import os
from multiprocessing import cpu_count
import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size
THREAD_NUM = 12 if cpu_count() >= 12 else 8
def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
"""
@ -38,15 +41,17 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
"""
if target == "Ascend":
device_num, rank_id = _get_rank_info()
else:
elif target == "GPU":
init()
rank_id = get_rank()
device_num = get_group_size()
else:
device_num = 1
if device_num == 1:
data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=12, shuffle=True)
data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=THREAD_NUM, shuffle=True)
else:
data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=12, shuffle=True,
data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=THREAD_NUM, shuffle=True,
num_shards=device_num, shard_id=rank_id)
# define map operations
@ -66,8 +71,8 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
type_cast_op = C2.TypeCast(mstype.int32)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=12)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=THREAD_NUM)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=THREAD_NUM)
# apply batch operations
data_set = data_set.batch(batch_size, drop_remainder=True)
@ -99,9 +104,9 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
device_num = get_group_size()
if device_num == 1:
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=THREAD_NUM, shuffle=True)
else:
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True,
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=THREAD_NUM, shuffle=True,
num_shards=device_num, shard_id=rank_id)
image_size = 224
@ -127,8 +132,8 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
type_cast_op = C2.TypeCast(mstype.int32)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=12)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=THREAD_NUM)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=THREAD_NUM)
# apply batch operations
data_set = data_set.batch(batch_size, drop_remainder=True)

View File

@ -116,12 +116,15 @@ if __name__ == '__main__':
else:
no_decayed_params.append(param)
if target == "Ascend":
group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
{'params': no_decayed_params},
{'order_params': net.trainable_params()}]
opt = Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale)
else:
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay,
config.loss_scale)
# define loss, model
if target == "Ascend":
if args_opt.dataset == "imagenet2012":
if not config.use_label_smooth:
config.label_smooth_factor = 0.0
@ -130,24 +133,11 @@ if __name__ == '__main__':
else:
loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
if target != "CPU":
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'},
amp_level="O2", keep_batchnorm_fp32=False)
else:
# GPU target
if args_opt.dataset == "imagenet2012":
if not config.use_label_smooth:
config.label_smooth_factor = 0.0
loss = CrossEntropySmooth(sparse=True, reduction="mean",
smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
else:
loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay,
config.loss_scale)
loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
# Mixed precision
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'},
amp_level="O2", keep_batchnorm_fp32=False)
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})
# define callbacks
time_cb = TimeMonitor(data_size=step_size)