From 0752c566b1b0e2543f9094456b3d28921bb2cd78 Mon Sep 17 00:00:00 2001
From: ms_yan <6576637+ms_yan@user.noreply.gitee.com>
Date: Thu, 20 Aug 2020 22:52:23 +0800
Subject: [PATCH] modify format in vgg16

---
 model_zoo/official/cv/vgg16/README.md          | 312 +++++++++++++-----
 .../cv/vgg16/scripts/run_distribute_train.sh   |   0
 .../vgg16/scripts/run_distribute_train_gpu.sh  |   2 +-
 .../official/cv/vgg16/scripts/run_eval.sh      |  32 ++
 model_zoo/official/cv/vgg16/src/config.py      |   0
 .../official/cv/vgg16/src/crossentropy.py      |   0
 model_zoo/official/cv/vgg16/train.py           |  67 +---
 7 files changed, 268 insertions(+), 145 deletions(-)
 mode change 100755 => 100644 model_zoo/official/cv/vgg16/scripts/run_distribute_train.sh
 create mode 100644 model_zoo/official/cv/vgg16/scripts/run_eval.sh
 mode change 100755 => 100644 model_zoo/official/cv/vgg16/src/config.py
 mode change 100755 => 100644 model_zoo/official/cv/vgg16/src/crossentropy.py

diff --git a/model_zoo/official/cv/vgg16/README.md b/model_zoo/official/cv/vgg16/README.md
index 4ecd749a746..d6c492865b9 100644
--- a/model_zoo/official/cv/vgg16/README.md
+++ b/model_zoo/official/cv/vgg16/README.md
@@ -1,36 +1,196 @@
-# VGG16 Example
+# Contents
-## Description
+- [VGG Description](#vgg-description)
+- [Model Architecture](#model-architecture)
+- [Dataset](#dataset)
+- [Features](#features)
+    - [Mixed Precision](#mixed-precision)
+- [Environment Requirements](#environment-requirements)
+- [Quick Start](#quick-start)
+- [Script Description](#script-description)
+    - [Script and Sample Code](#script-and-sample-code)
+    - [Script Parameters](#script-parameters)
+    - [Parameter configuration](#parameter-configuration)
+    - [Training Process](#training-process)
+        - [Training](#training)
+    - [Evaluation Process](#evaluation-process)
+        - [Evaluation](#evaluation)
+- [Model Description](#model-description)
+    - [Performance](#performance)
+        - [Training Performance](#training-performance)
+        - [Evaluation Performance](#evaluation-performance)
+- [Description of Random Situation](#description-of-random-situation)
+- [ModelZoo Homepage](#modelzoo-homepage)
-This example is for VGG16 model training and evaluation.
-## Requirements
+# [VGG Description](#contents)
-- Install [MindSpore](https://www.mindspore.cn/install/en).
+VGG, a very deep convolutional network for large-scale image recognition, was proposed in 2014 and won first place in the object localization task and second place in the image classification task of the ImageNet Large-Scale Visual Recognition Challenge 2014 (ILSVRC14).
-- Download the dataset CIFAR-10 or ImageNet2012.
+[Paper](): Simonyan K, Zisserman A. Very Deep Convolutional Networks for Large-Scale Image Recognition[J]. arXiv preprint arXiv:1409.1556, 2014.
-CIFAR-10
+# [Model Architecture](#contents)
+The VGG16 network mainly consists of several basic modules (including convolution and pooling layers) followed by three consecutive dense layers.
+The basic modules mainly include operations such as **3×3 convolution** and **2×2 max pooling**.
-> Unzip the CIFAR-10 dataset to any path you want and the folder structure should be as follows:
-> ```
-> .
-> ├── cifar-10-batches-bin # train dataset
-> └── cifar-10-verify-bin # infer dataset
-> ```
-ImageNet2012
+# [Dataset](#contents)
-> Unzip the ImageNet2012 dataset to any path you want and the folder should include train and eval dataset as follows:
->
-> ```
-> .
-> └─dataset
-> ├─ilsvrc # train dataset
-> └─validation_preprocess # evaluate dataset
-> ```
+#### Dataset used: [CIFAR-10]()
-## Parameter configuration
+- CIFAR-10 Dataset size: 175M, 60,000 32×32 color images in 10 classes
+  - Train: 146M, 50,000 images
+  - Test: 29.3M, 10,000 images
+  - Data format: binary files
+  - Note: Data will be processed in src/dataset.py
+
+#### Dataset used: [ImageNet2012](http://www.image-net.org/)
+- Dataset size: ~146G, 1.28 million color images in 1000 classes
+  - Train: 140G, 1,281,167 images
+  - Test: 6.4G, 50,000 images
+  - Data format: RGB images
+  - Note: Data will be processed in src/dataset.py
+
+#### Dataset organization
+
+  CIFAR-10
+
+  > Unzip the CIFAR-10 dataset to any path you want and the folder structure should be as follows:
+  > ```
+  > .
+  > ├── cifar-10-batches-bin # train dataset
+  > └── cifar-10-verify-bin # infer dataset
+  > ```
+
+  ImageNet2012
+
+  > Unzip the ImageNet2012 dataset to any path you want and the folder should include train and eval dataset as follows:
+  >
+  > ```
+  > .
+  > └─dataset
+  >   ├─ilsvrc                # train dataset
+  >   └─validation_preprocess # evaluate dataset
+  > ```
+
+
+# [Features](#contents)
+
+## Mixed Precision
+
+The [mixed precision](https://www.mindspore.cn/tutorial/zh-CN/master/advanced_use/mixed_precision.html) training method accelerates the deep learning neural network training process by using both the single-precision and half-precision data formats, and maintains the network precision achieved by single-precision training at the same time. Mixed precision training can accelerate the computation process, reduce memory usage, and enable a larger model or batch size to be trained on specific hardware.
+For FP16 operators, if the input data type is FP32, the backend of MindSpore will automatically handle it with reduced precision. Users can check the reduced-precision operators by enabling the INFO log and then searching for 'reduce precision'.
+
+
+# [Environment Requirements](#contents)
+
+- Hardware (Ascend/GPU)
+  - Prepare hardware environment with Ascend or GPU processor. If you want to try Ascend, please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you can get the resources.
+- Framework
+  - [MindSpore](https://www.mindspore.cn/install/en)
+- For more information, please check the resources below:
+  - [MindSpore tutorials](https://www.mindspore.cn/tutorial/zh-CN/master/index.html)
+  - [MindSpore API](https://www.mindspore.cn/api/zh-CN/master/index.html)
+
+
+# [Quick Start](#contents)
+
+After installing MindSpore via the official website, you can start training and evaluation as follows:
+
+- Running on Ascend
+```python
+# run training example
+python train.py --data_path=[DATA_PATH] --device_id=[DEVICE_ID] > output.train.log 2>&1 &
+
+# run distributed training example
+sh run_distribute_train.sh [RANK_TABLE_JSON] [DATA_PATH]
+
+# run evaluation example
+python eval.py --data_path=[DATA_PATH] --pre_trained=[PRE_TRAINED] > output.eval.log 2>&1 &
+```
+For distributed training, an HCCL configuration file in JSON format needs to be created in advance.
Please follow the instructions in the link below:
https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools
+
+- Running on GPU
+```
+# run training example
+python train.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_TYPE] --data_path=[DATA_PATH] > output.train.log 2>&1 &
+
+# run distributed training example
+sh run_distribute_train_gpu.sh [DATA_PATH]
+
+# run evaluation example
+python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_TYPE] --data_path=[DATA_PATH] --pre_trained=[PRE_TRAINED] > output.eval.log 2>&1 &
+```
+
+# [Script Description](#contents)
+
+## [Script and Sample Code](#contents)
+
+
+```
+├── model_zoo
+    ├── README.md                              // descriptions about all the models
+    ├── vgg16
+        ├── README.md                          // descriptions about vgg16
+        ├── scripts
+        │   ├── run_distribute_train.sh        // shell script for distributed training on Ascend
+        │   ├── run_distribute_train_gpu.sh    // shell script for distributed training on GPU
+        ├── src
+        │   ├── utils
+        │   │   ├── logging.py                 // logging format setting
+        │   │   ├── sampler.py                 // create sampler for dataset
+        │   │   ├── util.py                    // util function
+        │   │   ├── var_init.py                // network parameter init method
+        │   ├── config.py                      // parameter configuration
+        │   ├── crossentropy.py                // loss calculation
+        │   ├── dataset.py                     // creating dataset
+        │   ├── linear_warmup.py               // linear learning rate
+        │   ├── warmup_cosine_annealing_lr.py  // cosine annealing learning rate
+        │   ├── warmup_step_lr.py              // step or multi-step learning rate
+        │   ├── vgg.py                         // vgg architecture
+        ├── train.py                           // training script
+        ├── eval.py                            // evaluation script
+```
+
+## [Script Parameters](#contents)
+
+### Training
+```
+usage: train.py [--device_target TARGET][--data_path DATA_PATH]
+                [--dataset DATASET_TYPE][--is_distributed VALUE]
+                [--device_id DEVICE_ID][--pre_trained PRE_TRAINED]
+                [--ckpt_path CHECKPOINT_PATH][--ckpt_interval INTERVAL_STEP]
+
+parameters/options:
+  --device_target       the training backend type, Ascend or GPU, default is Ascend.
+  --dataset             the dataset type, cifar10 or imagenet2012.
+  --is_distributed      whether to run distributed training, value can be 0 or 1.
+  --data_path           the storage path of the dataset.
+  --device_id           the device used to train the model.
+  --pre_trained         the pretrained checkpoint file path.
+  --ckpt_path           the path to save checkpoints.
+  --ckpt_interval       the epoch interval for saving checkpoints.
+
+```
+
+### Evaluation
+
+```
+usage: eval.py [--device_target TARGET][--data_path DATA_PATH]
+               [--dataset DATASET_TYPE][--pre_trained PRE_TRAINED]
+               [--device_id DEVICE_ID]
+
+parameters/options:
+  --device_target       the evaluation backend type, Ascend or GPU, default is Ascend.
+  --dataset             the dataset type, cifar10 or imagenet2012.
+  --data_path           the storage path of the dataset.
+  --device_id           the device used to evaluate the model.
+  --pre_trained         the checkpoint file path used to evaluate the model.
+```
+
+## [Parameter configuration](#contents)

 Parameters for both training and evaluation can be set in config.py.
@@ -90,12 +250,13 @@ Parameters for both training and evaluation can be set in config.py.
"has_dropout": True # wether using Dropout layer ``` -## Running the Example +## [Training Process](#contents) ### Training -**Run vgg16, using CIFAR-10 dataset** -- Training using single device(1p) +#### Run vgg16 on Ascend + +- Training using single device(1p), using CIFAR-10 dataset in default ``` python train.py --data_path=your_data_path --device_id=6 > out.train.log 2>&1 & ``` @@ -105,13 +266,13 @@ After training, you'll get some checkpoint files in specified ckpt_path, default You will get the loss value as following: ``` -# grep "loss is " out.train.log +# grep "loss is " output.train.log epoch: 1 step: 781, loss is 2.093086 epcoh: 2 step: 781, loss is 1.827582 ... ``` -- Distribute Training +- Distributed Training ``` sh run_distribute_train.sh rank_table.json your_data_path ``` @@ -131,37 +292,35 @@ train_parallel1/log:epcoh: 2 step: 97, loss is 1.7133579 > About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html). -**Run vgg16, using imagenet2012 dataset** +#### Run vgg16 on GPU - Training using single device(1p) ``` python train.py --device_target="GPU" --dataset="imagenet2012" --is_distributed=0 --data_path=$DATA_PATH > output.train.log 2>&1 & ``` -- Distribute Training +- Distributed Training ``` # distributed training(8p) bash scripts/run_distribute_train_gpu.sh /path/ImageNet2012/train" ``` +## [Evaluation Process](#contents) ### Evaluation - Do eval as follows, need to specify dataset type as "cifar10" or "imagenet2012" ``` # when using cifar10 dataset -python eval.py --data_path=your_data_path --dataset="cifar10" --device_target="Ascend" --pre_trained=./*-70-781.ckpt > out.eval.log 2>&1 & +python eval.py --data_path=your_data_path --dataset="cifar10" --device_target="Ascend" --pre_trained=./*-70-781.ckpt > output.eval.log 2>&1 & # when using imagenet2012 dataset -python eval.py --data_path=your_data_path --dataset="imagenet2012" --device_target="GPU" --pre_trained=./*-150-5004.ckpt > out.eval.log 2>&1 & +python eval.py --data_path=your_data_path --dataset="imagenet2012" --device_target="GPU" --pre_trained=./*-150-5004.ckpt > output.eval.log 2>&1 & ``` -- If the using dataset is -The above python command will run in the background, you can view the results through the file `out.eval.log`. - -You will get the accuracy as following: +- The above python command will run in the background, you can view the results through the file `output.eval.log`. You will get the accuracy as following: ``` # when using cifar10 dataset -# grep "result: " out.eval.log +# grep "result: " output.eval.log result: {'acc': 0.92} # when using the imagenet2012 dataset @@ -169,57 +328,46 @@ after allreduce eval: top1_correct=36636, tot=50000, acc=73.27% after allreduce eval: top5_correct=45582, tot=50000, acc=91.16% ``` -## Usage: -### Training -``` -usage: train.py [--device_target TARGET][--data_path DATA_PATH] - [--dataset DATASET_TYPE][--is_distributed VALUE] - [--device_id DEVICE_ID][--pre_trained PRE_TRAINED] - [--ckpt_path CHECKPOINT_PATH][--ckpt_interval INTERVAL_STEP] +# [Model Description](#contents) +## [Performance](#contents) -parameters/options: - --device_target the training backend type, Ascend or GPU, default is Ascend. - --dataset the dataset type, cifar10 or imagenet2012. - --is_distributed the way of traing, whether do distribute traing, value can be 0 or 1. - --data_path the storage path of dataset - --device_id the device which used to train model. 
- --pre_trained the pretrained checkpoint file path.
- --ckpt_path the path to save checkpoint.
- --ckpt_interval the epoch interval for saving checkpoint.
+### Training Performance

-```
+| Parameters                 | VGG16(Ascend)                                   | VGG16(GPU)                                      |
+| -------------------------- | ----------------------------------------------- | ----------------------------------------------- |
+| Model Version              | VGG16                                           | VGG16                                           |
+| Resource                   | Ascend 910; CPU 2.60GHz, 56 cores; Memory 314G  | NV SMX2 V100-32G                                |
+| Uploaded Date              | 08/20/2020                                      | 08/20/2020                                      |
+| MindSpore Version          | 0.5.0-alpha                                     | 0.5.0-alpha                                     |
+| Dataset                    | CIFAR-10                                        | ImageNet2012                                    |
+| Training Parameters        | epoch=70, steps=781, batch_size=64, lr=0.1      | epoch=150, steps=40036, batch_size=32, lr=0.1   |
+| Optimizer                  | Momentum                                        | Momentum                                        |
+| Loss Function              | SoftmaxCrossEntropy                             | SoftmaxCrossEntropy                             |
+| outputs                    | probability                                     | probability                                     |
+| Loss                       | 0.01                                            | 1.5~2.0                                         |
+| Speed                      | 1pc: 79 ms/step; 8pcs: 104 ms/step              | 1pc: 81 ms/step; 8pcs: 94.4 ms/step             |
+| Total time                 | 1pc: 72 mins; 8pcs: 11.8 mins                   | 8pcs: 19.7 hours                                |
+| Checkpoint for Fine tuning | 1.1G (.ckpt file)                               | 1.1G (.ckpt file)                               |
+| Scripts                    | [vgg16](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/vgg16) | |

-### Evaluation
-```
-usage: eval.py [--device_target TARGET][--data_path DATA_PATH]
-               [--dataset DATASET_TYPE][--pre_trained PRE_TRAINED]
-               [--device_id DEVICE_ID]
+### Evaluation Performance

-parameters/options:
- --device_target the evaluation backend type, Ascend or GPU, default is Ascend.
- --dataset the dataset type, cifar10 or imagenet2012.
- --data_path the storage path of dataset.
- --device_id the device which used to evaluate model.
- --pre_trained the checkpoint file path used to evaluate model.
-```
+| Parameters          | VGG16(Ascend)            | VGG16(GPU)                  |
+| ------------------- | ------------------------ | --------------------------- |
+| Model Version       | VGG16                    | VGG16                       |
+| Resource            | Ascend 910               | GPU                         |
+| Uploaded Date       | 08/20/2020               | 08/20/2020                  |
+| MindSpore Version   | 0.5.0-alpha              | 0.5.0-alpha                 |
+| Dataset             | CIFAR-10, 10,000 images  | ImageNet2012, 50,000 images |
+| batch_size          | 64                       | 32                          |
+| outputs             | probability              | probability                 |
+| Accuracy            | 1pc: 93.4%               | 1pc: 73.0%                  |

-### Distribute Training
-- Train on Ascend.

-```
-Usage: sh script/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]
+# [Description of Random Situation](#contents)

-parameters/options:
- RANK_TABLE_FILE HCCL configuration file path.
- DATA_PATH the storage path of dataset.
-```
+In dataset.py, we set the seed inside the "create_dataset" function. We also use a random seed in train.py.

-- Train on GPU.
-```
-Usage: bash run_distribute_train_gpu.sh [DATA_PATH]

-parameters/options:
- DATA_PATH the storage path of dataset.
-```
\ No newline at end of file
+# [ModelZoo Homepage](#contents)
+Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo).
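As a side note on the "Description of Random Situation" section above, the snippet below sketches the kind of seeding that section refers to. It is only an illustration: train.py (see its hunk later in this patch) does call `random.seed(1)` and `np.random.seed(1)`, but the `ds.config.set_seed` line is an assumed placement of the "create_dataset" seed, not a line copied from src/dataset.py.

```python
# Hedged sketch of the seeding described in "Description of Random Situation".
# Only random.seed(1) and np.random.seed(1) are visible in this patch; the
# dataset seed call below is an assumption, not copied from dataset.py.
import random

import numpy as np
import mindspore.dataset as ds

random.seed(1)         # Python RNG (shuffling helpers, augmentation choices)
np.random.seed(1)      # NumPy RNG (sampling, parameter initialization)
ds.config.set_seed(1)  # global seed for MindSpore dataset shuffle operations
```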
\ No newline at end of file diff --git a/model_zoo/official/cv/vgg16/scripts/run_distribute_train.sh b/model_zoo/official/cv/vgg16/scripts/run_distribute_train.sh old mode 100755 new mode 100644 diff --git a/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh index 51be33a53af..2784822d430 100644 --- a/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh @@ -15,7 +15,7 @@ # ============================================================================ echo "==============================================================================================================" -echo "Please run the scipt as: " +echo "Please run the script as: " echo "bash run_distribute_train_gpu.sh DATA_PATH" echo "for example: bash run_distribute_train_gpu.sh /path/ImageNet2012/train" echo "==============================================================================================================" diff --git a/model_zoo/official/cv/vgg16/scripts/run_eval.sh b/model_zoo/official/cv/vgg16/scripts/run_eval.sh new file mode 100644 index 00000000000..717c2befcc2 --- /dev/null +++ b/model_zoo/official/cv/vgg16/scripts/run_eval.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+echo "=============================================================================================================="
+echo "Please run the script as: "
+echo "bash run_eval.sh DATA_PATH DATASET_TYPE DEVICE_TYPE CHECKPOINT_PATH"
+echo "for example: bash run_eval.sh /path/cifar-10-verify-bin cifar10 Ascend /path/a.ckpt"
+echo "=============================================================================================================="
+
+DATA_PATH=$1
+DATASET_TYPE=$2
+DEVICE_TYPE=$3
+CHECKPOINT_PATH=$4
+
+python eval.py \
+    --data_path=$DATA_PATH \
+    --dataset=$DATASET_TYPE \
+    --device_target=$DEVICE_TYPE \
+    --pre_trained=$CHECKPOINT_PATH > output.eval.log 2>&1 &
\ No newline at end of file
diff --git a/model_zoo/official/cv/vgg16/src/config.py b/model_zoo/official/cv/vgg16/src/config.py
old mode 100755
new mode 100644
diff --git a/model_zoo/official/cv/vgg16/src/crossentropy.py b/model_zoo/official/cv/vgg16/src/crossentropy.py
old mode 100755
new mode 100644
diff --git a/model_zoo/official/cv/vgg16/train.py b/model_zoo/official/cv/vgg16/train.py
index c65d64e2c32..ae2f934e1e7 100644
--- a/model_zoo/official/cv/vgg16/train.py
+++ b/model_zoo/official/cv/vgg16/train.py
@@ -18,7 +18,6 @@ python train.py --data_path=$DATA_HOME --device_id=$DEVICE_ID
 """
 import argparse
 import datetime
-import time
 import os
 import random
@@ -29,7 +28,7 @@ from mindspore import Tensor
 from mindspore import context
 from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.nn.optim.momentum import Momentum
-from mindspore.train.callback import Callback, ModelCheckpoint, CheckpointConfig
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
 from mindspore.train.model import Model, ParallelMode
 from mindspore.train.serialization import load_param_into_net, load_checkpoint
 from mindspore.train.loss_scale_manager import FixedLossScaleManager
@@ -49,63 +48,6 @@ random.seed(1)
 np.random.seed(1)

-class ProgressMonitor(Callback):
-    """monitor loss and time"""
-    def __init__(self, args_param):
-        super(ProgressMonitor, self).__init__()
-        self.me_epoch_start_time = 0
-        self.me_epoch_start_step_num = 0
-        self.args = args_param
-        self.ckpt_history = []
-
-    def begin(self, run_context):
-        self.args.logger.info('start network train...')
-
-    def epoch_begin(self, run_context):
-        pass
-
-    def epoch_end(self, run_context):
-        """
-        Called after each epoch finished.
-
-        Args:
-            run_context (RunContext): Include some information of the model.
- """ - cb_params = run_context.original_args() - me_step = cb_params.cur_step_num - 1 - - real_epoch = me_step // self.args.steps_per_epoch - time_used = time.time() - self.me_epoch_start_time - fps_mean = self.args.per_batch_size * (me_step-self.me_epoch_start_step_num) * self.args.group_size / time_used - self.args.logger.info('epoch[{}], iter[{}], loss:{}, mean_fps:{:.2f}' - 'imgs/sec'.format(real_epoch, me_step, cb_params.net_outputs, fps_mean)) - - if self.args.rank_save_ckpt_flag: - import glob - ckpts = glob.glob(os.path.join(self.args.outputs_dir, '*.ckpt')) - for ckpt in ckpts: - ckpt_fn = os.path.basename(ckpt) - if not ckpt_fn.startswith('{}-'.format(self.args.rank)): - continue - if ckpt in self.ckpt_history: - continue - self.ckpt_history.append(ckpt) - self.args.logger.info('epoch[{}], iter[{}], loss:{}, ckpt:{},' - 'ckpt_fn:{}'.format(real_epoch, me_step, cb_params.net_outputs, ckpt, ckpt_fn)) - - self.me_epoch_start_step_num = me_step - self.me_epoch_start_time = time.time() - - def step_begin(self, run_context): - pass - - def step_end(self, run_context, *me_args): - pass - - def end(self, run_context): - self.args.logger.info('end network train...') - - def parse_args(cloud_args=None): """parameters""" parser = argparse.ArgumentParser('mindspore classification training') @@ -279,9 +221,10 @@ if __name__ == '__main__': loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False) model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager, amp_level="O2") - # checkpoint save - progress_cb = ProgressMonitor(args) - callbacks = [progress_cb,] + # define callbacks + time_cb = TimeMonitor(data_size=batch_num) + loss_cb = LossMonitor(per_print_times=batch_num) + callbacks = [time_cb, loss_cb] if args.rank_save_ckpt_flag: ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch, keep_checkpoint_max=args.ckpt_save_max)