From 1c959b3489e98c95eed49b9a30e44602044d6acd Mon Sep 17 00:00:00 2001
From: Rescue
Date: Tue, 13 Jul 2021 19:29:57 +0000
Subject: [PATCH] Added GPU support for SE-Net

Updated README, addressed review comments, fixed pylint warnings,
fixed the standalone run script, and fixed checkpoint saving.
---
 model_zoo/research/cv/SE-Net/README.md              | 95 +++++++++++++------
 .../scripts/run_distribute_train_gpu.sh             | 43 +++++++++
 .../cv/SE-Net/scripts/run_eval_gpu.sh               | 45 +++++++++
 .../scripts/run_standalone_train_gpu.sh             | 46 +++++++++
 .../cv/SE-Net/src/CrossEntropySmooth.py             |  4 +-
 model_zoo/research/cv/SE-Net/src/config.py          |  2 +-
 model_zoo/research/cv/SE-Net/src/resnet.py          |  2 +-
 model_zoo/research/cv/SE-Net/train.py               | 23 +++--
 8 files changed, 220 insertions(+), 40 deletions(-)
 create mode 100644 model_zoo/research/cv/SE-Net/scripts/run_distribute_train_gpu.sh
 create mode 100644 model_zoo/research/cv/SE-Net/scripts/run_eval_gpu.sh
 create mode 100644 model_zoo/research/cv/SE-Net/scripts/run_standalone_train_gpu.sh

diff --git a/model_zoo/research/cv/SE-Net/README.md b/model_zoo/research/cv/SE-Net/README.md
index ef4e5447eb7..bbe5e145a5e 100644
--- a/model_zoo/research/cv/SE-Net/README.md
+++ b/model_zoo/research/cv/SE-Net/README.md
@@ -58,8 +58,8 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil
 
 # [Environment Requirements](#contents)
 
-- Hardware(Ascend)
-  - Prepare hardware environment with Ascend processor.
+- Hardware(Ascend/GPU)
+  - Prepare hardware environment with Ascend or GPU processor.
 - Framework
   - [MindSpore](https://www.mindspore.cn/install/en)
 - For more information, please check the resources below:
@@ -88,6 +88,22 @@ export DEVICE_ID=0
 python eval.py --net=se-resnet50 --dataset=imagenet2012 --checkpoint_path=[CHECKPOINT_PATH] --dataset_path=[DATASET_PATH]
 ```
 
+- Running on GPU
+
+```bash
+# distributed training
+Usage:
+bash run_distribute_train_gpu.sh se-resnet50 imagenet2012 [DATASET_PATH]
+
+# standalone training
+Usage:
+bash run_standalone_train_gpu.sh se-resnet50 imagenet2012 [DATASET_PATH]
+
+# run evaluation example
+Usage:
+bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
+```
+
 # [Script Description](#contents)
 
 ## [Script and Sample Code](#contents)
@@ -99,8 +115,11 @@ python eval.py --net=se-resnet50 --dataset=imagenet2012 --checkpoint_path=[CHECK
   ├── ascend310_infer                    # application for 310 inference
   ├── scripts
     ├── run_distribute_train.sh          # launch ascend distributed training(8 pcs)
+    ├── run_distribute_train_gpu.sh      # launch gpu distributed training(8 pcs)
     ├── run_eval.sh                      # launch ascend evaluation
+    ├── run_eval_gpu.sh                  # launch gpu evaluation
    ├── run_standalone_train.sh           # launch ascend standalone training(1 pcs)
+   ├── run_standalone_train_gpu.sh       # launch gpu standalone training(1 pcs)
    └─ run_infer_310.sh                   # shell script for 310inference on ascend
   ├── src
     ├── config.py                        # parameter configuration
@@ -159,6 +178,18 @@ export DEVICE_ID=0
 bash run_standalone_train.sh se-resnet50 imagenet2012 /data/imagenet/train/
 ```
 
+#### Running on GPU
+
+```bash
+# distributed training
+Usage:
+bash run_distribute_train_gpu.sh se-resnet50 imagenet2012 [DATASET_PATH]
+
+# standalone training
+Usage:
+bash run_standalone_train_gpu.sh se-resnet50 imagenet2012 [DATASET_PATH]
+```
+
 For distributed training, a hccl configuration file with JSON format needs to be created in advance.
 
 Please follow the instructions in the link [hccn_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
@@ -195,6 +226,12 @@ export DEVICE_ID=0
 bash run_eval.sh /imagenet/val/ /path/to/resnet-90_625.ckpt
 ```
 
+#### Running on GPU
+
+```bash
+bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
+```
+
 ### Result
 
 - Evaluating SE-ResNet50 with ImageNet2012 dataset
@@ -243,38 +280,38 @@ result: {'top_5_accuracy': 93.86%, 'top_1_accuracy': 77.80%}
 
 #### SE-ResNet50 on ImageNet2012
 
-| Parameters                 | Ascend 910
-| -------------------------- | ------------------------------------------------------------------------ |
-| Model Version              | SE-ResNet50                                                 |
-| Resource                   | CentOs 8.2, Ascend 910，CPU 2.60GHz 192cores，Memory 755G   |
-| uploaded Date              | 03/19/2021 (month/day/year)                                 |
-| MindSpore Version          | 0.7.0-alpha                                                 |
-| Dataset                    | ImageNet2012                                                |
-| Training Parameters        | epoch=90, steps per epoch=5004, batch_size = 256            |
-| Optimizer                  | Momentum                                                    |
-| Loss Function              | Softmax Cross Entropy                                       |
-| outputs                    | probability                                                 |
-| Loss                       | 1.5931969                                                   |
-| Speed                      | # ms/step(8pcs)                                             |
-| Total time                 | # mins                                                      |
-| Parameters (M)             | 285M                                                        |
-| Checkpoint for Fine tuning | # M (.ckpt file)                                            |
-| Scripts                    |                                                             |
+| Parameters                 | Ascend                                                      | GPU                                                         |
+| -------------------------- | ----------------------------------------------------------- | ----------------------------------------------------------- |
+| Model Version              | SE-ResNet50                                                 | SE-ResNet50                                                 |
+| Resource                   | CentOS 8.2, Ascend 910, CPU 2.60GHz 192 cores, Memory 755G  | V100-PCIE 32G                                               |
+| Uploaded Date              | 03/19/2021 (month/day/year)                                 | 07/14/2021 (month/day/year)                                 |
+| MindSpore Version          | 0.7.0-alpha                                                 | 1.3.0                                                       |
+| Dataset                    | ImageNet2012                                                | ImageNet2012                                                |
+| Training Parameters        | epoch=90, steps per epoch=5004, batch_size = 256            | epoch=90, steps per epoch=5004, batch_size = 256            |
+| Optimizer                  | Momentum                                                    | Momentum                                                    |
+| Loss Function              | Softmax Cross Entropy                                       | Softmax Cross Entropy                                       |
+| outputs                    | probability                                                 | probability                                                 |
+| Loss                       | 1.5931969                                                   | 1.6664593                                                   |
+| Speed                      | # ms/step(8pcs)                                             | 8pcs: 1016.9 ms/step                                        |
+| Total time                 | # mins                                                      | 8pcs: 15.9 hours                                            |
+| Parameters (M)             | 285M                                                        | 285M                                                        |
+| Checkpoint for Fine tuning | # M (.ckpt file)                                            | # M (.ckpt file)                                            |
+| Scripts                    |                                                             |                                                             |
 
 ### Inference Performance
 
 #### SE-ResNet50 on ImageNet2012
 
-| Parameters          | Ascend                      |
-| ------------------- | --------------------------- |
-| Model Version       | SE-ResNet50                 |
-| Resource            | Ascend 910                  |
-| Uploaded Date       | 03/19/2021 (month/day/year) |
-| MindSpore Version   | 0.7.0-alpha                 |
-| Dataset             | ImageNet2012                |
-| batch_size          | 256                         |
-| Accuracy            | 77.74%                      |
-| Model for inference | # (.air file)               |
+| Parameters          | Ascend                      | GPU                         |
+| ------------------- | --------------------------- | --------------------------- |
+| Model Version       | SE-ResNet50                 | SE-ResNet50                 |
+| Resource            | Ascend 910                  | V100-PCIE 32G               |
+| Uploaded Date       | 03/19/2021 (month/day/year) | 07/14/2021 (month/day/year) |
+| MindSpore Version   | 0.7.0-alpha                 | 1.3.0                       |
+| Dataset             | ImageNet2012                | ImageNet2012                |
+| batch_size          | 256                         | 256                         |
+| Accuracy            | 77.74%                      | 77.66%                      |
+| Model for inference | # (.air file)               | -                           |
 
 ### 310Inference Performance
 
diff --git a/model_zoo/research/cv/SE-Net/scripts/run_distribute_train_gpu.sh b/model_zoo/research/cv/SE-Net/scripts/run_distribute_train_gpu.sh
new file mode 100644
index 00000000000..e629f992d77
--- /dev/null
+++ b/model_zoo/research/cv/SE-Net/scripts/run_distribute_train_gpu.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Usage: bash run_distribute_train_gpu.sh [NET] [DATASET] [DATASET_PATH]
+
+
+ulimit -u unlimited
+export DEVICE_NUM=8
+export RANK_SIZE=8
+export NET=$1
+export DATASET=$2
+export DATASET_PATH=$3
+
+rm -rf ./train_parallel
+mkdir ./train_parallel
+cp ../*.py ./train_parallel
+cp *.sh ./train_parallel
+cp -r ../src ./train_parallel
+cd ./train_parallel || exit
+
+echo "start distributed training with $DEVICE_NUM GPUs."
+
+mpirun --allow-run-as-root -n $DEVICE_NUM \
+    python train.py \
+    --device_target="GPU" \
+    --net=$NET \
+    --dataset=$DATASET \
+    --run_distribute=True \
+    --device_num=$DEVICE_NUM \
+    --dataset_path=$DATASET_PATH > log 2>&1 &
diff --git a/model_zoo/research/cv/SE-Net/scripts/run_eval_gpu.sh b/model_zoo/research/cv/SE-Net/scripts/run_eval_gpu.sh
new file mode 100644
index 00000000000..1120c4d9ebb
--- /dev/null
+++ b/model_zoo/research/cv/SE-Net/scripts/run_eval_gpu.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 2 ]
+then
+    echo "Usage: bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]"
+    exit 1
+fi
+
+ulimit -u unlimited
+export DEVICE_NUM=1
+export DEVICE_ID=0
+export RANK_SIZE=$DEVICE_NUM
+export RANK_ID=0
+export DATA_PATH=$1
+export CKPT_PATH=$2
+
+if [ -d "eval" ];
+then
+    rm -rf ./eval
+fi
+mkdir ./eval
+cp ../*.py ./eval
+cp *.sh ./eval
+cp -r ../src ./eval
+cd ./eval || exit
+env > env.log
+echo "start evaluation for device $DEVICE_ID"
+
+python eval.py --device_target="GPU" --dataset_path=$DATA_PATH --checkpoint_path=$CKPT_PATH > log 2>&1 &
+
+cd ..
diff --git a/model_zoo/research/cv/SE-Net/scripts/run_standalone_train_gpu.sh b/model_zoo/research/cv/SE-Net/scripts/run_standalone_train_gpu.sh
new file mode 100644
index 00000000000..bf078df7039
--- /dev/null
+++ b/model_zoo/research/cv/SE-Net/scripts/run_standalone_train_gpu.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 3 ]
+then
+    echo "Usage: bash run_standalone_train_gpu.sh [NET] [DATASET_NAME] [DATASET_PATH]"
+    exit 1
+fi
+
+ulimit -u unlimited
+export DEVICE_NUM=1
+export DEVICE_ID=0
+export RANK_ID=0
+export RANK_SIZE=1
+export NET=$1
+export DATASET=$2
+export DATASET_PATH=$3
+
+
+if [ -d "train" ];
+then
+    rm -rf ./train
+fi
+
+mkdir ./train
+cp ../*.py ./train
+cp *.sh ./train
+cp -r ../src ./train
+cd ./train || exit
+echo "start training for GPU device $DEVICE_ID"
+env > env.log
+python train.py --device_target="GPU" --net=$NET --dataset=$DATASET --dataset_path=$DATASET_PATH > log 2>&1 &
+cd ..
diff --git a/model_zoo/research/cv/SE-Net/src/CrossEntropySmooth.py b/model_zoo/research/cv/SE-Net/src/CrossEntropySmooth.py
index 6d63b666946..8400b5413bf 100644
--- a/model_zoo/research/cv/SE-Net/src/CrossEntropySmooth.py
+++ b/model_zoo/research/cv/SE-Net/src/CrossEntropySmooth.py
@@ -16,12 +16,12 @@
 import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore.common import dtype as mstype
-from mindspore.nn.loss.loss import _Loss
+from mindspore.nn.loss.loss import LossBase
 from mindspore.ops import functional as F
 from mindspore.ops import operations as P
 
 
-class CrossEntropySmooth(_Loss):
+class CrossEntropySmooth(LossBase):
     """CrossEntropy"""
     def __init__(self, sparse=True, reduction='mean', smooth_factor=0., num_classes=1000):
         super(CrossEntropySmooth, self).__init__()
diff --git a/model_zoo/research/cv/SE-Net/src/config.py b/model_zoo/research/cv/SE-Net/src/config.py
index 28434a643b0..4817b51b219 100644
--- a/model_zoo/research/cv/SE-Net/src/config.py
+++ b/model_zoo/research/cv/SE-Net/src/config.py
@@ -27,7 +27,7 @@ config2 = ed({
     "pretrain_epoch_size": 0,
     "save_checkpoint": True,
     "save_checkpoint_epochs": 5,
-    "keep_checkpoint_max": 10,
+    "keep_checkpoint_max": 90,
     "save_checkpoint_path": "./",
     "warmup_epochs": 0,
     "lr_decay_mode": "linear",
diff --git a/model_zoo/research/cv/SE-Net/src/resnet.py b/model_zoo/research/cv/SE-Net/src/resnet.py
index bf14d2d5e49..7d7e80d6f42 100644
--- a/model_zoo/research/cv/SE-Net/src/resnet.py
+++ b/model_zoo/research/cv/SE-Net/src/resnet.py
@@ -116,7 +116,7 @@ class Se_ResidualBlock(nn.Cell):
         if self.down_sample:
             self.down_sample_layer = nn.SequentialCell([_conv1x1(in_channel, out_channel, stride),
                                                         _bn(out_channel)])#use_se=self.use_se
-        self.add = P.TensorAdd()
+        self.add = P.Add()
         self.se = SELayer(out_channel, reduction)
 
     def construct(self, x):
diff --git a/model_zoo/research/cv/SE-Net/train.py b/model_zoo/research/cv/SE-Net/train.py
index 1b1613e9d39..eaaca3d4a65 100644
--- a/model_zoo/research/cv/SE-Net/train.py
+++ b/model_zoo/research/cv/SE-Net/train.py
@@ -24,7 +24,7 @@ from mindspore.context import ParallelMode
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
 from mindspore.train.loss_scale_manager import FixedLossScaleManager
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
-from mindspore.communication.management import init
+from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.common import set_seed
 from mindspore.parallel import set_algo_parameters
 import mindspore.nn as nn
@@ -67,12 +67,19 @@ if __name__ == '__main__':
         context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                           gradients_mean=True)
         set_algo_parameters(elementwise_op_strategy_follow=True)
-        if args_opt.net == "se-resnet50":
-            context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
-        else:
-            context.set_auto_parallel_context(all_reduce_fusion_config=[180, 313])
         init()
-
+    elif target == "GPU":
+        init('nccl')
+        context.reset_auto_parallel_context()
+        rank = get_rank()
+        device_num = get_group_size()
+        context.set_auto_parallel_context(device_num=device_num,
+                                          parallel_mode=ParallelMode.DATA_PARALLEL,
+                                          gradients_mean=True)
+        if args_opt.net == "se-resnet50":
+            context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
+        else:
+            context.set_auto_parallel_context(all_reduce_fusion_config=[180, 313])
     # create dataset
     dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=1,
@@ -120,7 +127,7 @@ if __name__ == '__main__':
                     {'order_params': net.trainable_params()}]
     opt = Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale)
     # define loss, model
-    if target == "Ascend":
+    if target in ["Ascend", "GPU"]:
         if args_opt.dataset == "imagenet2012":
             if not config.use_label_smooth:
                 config.label_smooth_factor = 0.0
@@ -137,6 +144,8 @@ if __name__ == '__main__':
     if config.save_checkpoint:
         config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                                      keep_checkpoint_max=config.keep_checkpoint_max)
+        if target == "GPU" and args_opt.run_distribute:
+            ckpt_save_dir = os.path.join(config.save_checkpoint_path, "ckpt_" + str(rank) + "/")
         ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir, config=config_ck)
         cb += [ckpt_cb]
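
For reviewers who want to exercise the change end to end, here is a minimal usage sketch built only from the scripts and settings added above. The dataset paths are placeholders, the patch file name is illustrative, and the checkpoint file name `resnet-90_5004.ckpt` is an assumption derived from the `prefix="resnet"`, 90 epochs, and 5004 steps per epoch configured in this patch.

```bash
# Apply the patch and enter the scripts directory (patch file name is illustrative).
git am 0001-Added-GPU-support-for-SE-Net.patch
cd model_zoo/research/cv/SE-Net/scripts

# 8-GPU distributed training: mpirun spawns 8 train.py processes; the log is
# written to ./train_parallel/log and, per the train.py change above, each
# rank saves its checkpoints to ./train_parallel/ckpt_<rank>/.
bash run_distribute_train_gpu.sh se-resnet50 imagenet2012 /path/to/imagenet/train/

# Single-GPU training writes its log to ./train/log instead.
bash run_standalone_train_gpu.sh se-resnet50 imagenet2012 /path/to/imagenet/train/

# Evaluation; the checkpoint name is an assumed example (prefix "resnet",
# epoch 90, 5004 steps per epoch).
bash run_eval_gpu.sh /path/to/imagenet/val/ ./train_parallel/ckpt_0/resnet-90_5004.ckpt
```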