diff --git a/model_zoo/official/cv/vgg16/README.md b/model_zoo/official/cv/vgg16/README.md index 9ba1ae49b54..06bf6d7b82b 100644 --- a/model_zoo/official/cv/vgg16/README.md +++ b/model_zoo/official/cv/vgg16/README.md @@ -8,7 +8,9 @@ This example is for VGG16 model training and evaluation. - Install [MindSpore](https://www.mindspore.cn/install/en). -- Download the CIFAR-10 binary version dataset. +- Download the dataset CIFAR-10 or ImageNet2012. + +CIFAR-10 > Unzip the CIFAR-10 dataset to any path you want and the folder structure should be as follows: > ``` @@ -17,16 +19,89 @@ This example is for VGG16 model training and evaluation. > └── cifar-10-verify-bin # infer dataset > ``` +ImageNet2012 + +> Unzip the ImageNet2012 dataset to any path you want and the folder should include train and eval dataset as follows: +> +> ``` +> . +> └─dataset +> ├─ilsvrc # train dataset +> └─validation_preprocess # evaluate dataset +> ``` + +## Parameter configuration + +Parameters for both training and evaluation can be set in config.py. + +- config for vgg16, CIFAR-10 dataset + +``` +"num_classes": 10, # dataset class num +"lr": 0.01, # learning rate +"lr_init": 0.01, # initial learning rate +"lr_max": 0.1, # max learning rate +"lr_epochs": '30,60,90,120', # lr changing based epochs +"lr_scheduler": "step", # learning rate mode +"warmup_epochs": 5, # number of warmup epoch +"batch_size": 64, # batch size of input tensor +"max_epoch": 70, # only valid for taining, which is always 1 for inference +"momentum": 0.9, # momentum +"weight_decay": 5e-4, # weight decay +"loss_scale": 1.0, # loss scale +"label_smooth": 0, # label smooth +"label_smooth_factor": 0, # label smooth factor +"buffer_size": 10, # shuffle buffer size +"image_size": '224,224', # image size +"pad_mode": 'same', # pad mode for conv2d +"padding": 0, # padding value for conv2d +"has_bias": False, # whether has bias in conv2d +"batch_norm": True, # wether has batch_norm in conv2d +"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint +"initialize_mode": "XavierUniform", # conv2d init mode +"has_dropout": True # wether using Dropout layer +``` + +- config for vgg16, ImageNet2012 dataset + +``` +"num_classes": 1000, # dataset class num +"lr": 0.01, # learning rate +"lr_init": 0.01, # initial learning rate +"lr_max": 0.1, # max learning rate +"lr_epochs": '30,60,90,120', # lr changing based epochs +"lr_scheduler": "cosine_annealing", # learning rate mode +"warmup_epochs": 0, # number of warmup epoch +"batch_size": 32, # batch size of input tensor +"max_epoch": 150, # only valid for taining, which is always 1 for inference +"momentum": 0.9, # momentum +"weight_decay": 1e-4, # weight decay +"loss_scale": 1024, # loss scale +"label_smooth": 1, # label smooth +"label_smooth_factor": 0.1, # label smooth factor +"buffer_size": 10, # shuffle buffer size +"image_size": '224,224', # image size +"pad_mode": 'pad', # pad mode for conv2d +"padding": 1, # padding value for conv2d +"has_bias": True, # whether has bias in conv2d +"batch_norm": False, # wether has batch_norm in conv2d +"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint +"initialize_mode": "KaimingNormal", # conv2d init mode +"has_dropout": True # wether using Dropout layer +``` + ## Running the Example ### Training +**Run vgg16, using CIFAR-10 dataset** +- Training using single device(1p) ``` python train.py --data_path=your_data_path --device_id=6 > out.train.log 2>&1 & ``` The python command above will run in the background, you can view the results through the file `out.train.log`. -After training, you'll get some checkpoint files under the script folder by default. +After training, you'll get some checkpoint files in specified ckpt_path, default in ./output directory. You will get the loss value as following: ``` @@ -36,22 +111,7 @@ epcoh: 2 step: 781, loss is 1.827582 ... ``` -### Evaluation - -- Do eval as follows, need to specify dataset type as "cifar10" or "imagenet2012" -``` -python eval.py --data_path=your_data_path --dataset="cifar10" --pre_trained=./train_vgg_cifar10-70-781.ckpt > out.eval.log 2>&1 & -``` -- If the using dataset is -The above python command will run in the background, you can view the results through the file `out.eval.log`. - -You will get the accuracy as following: -``` -# grep "result: " out.eval.log -result: {'acc': 0.92} -``` - -### Distribute Training +- Distribute Training ``` sh run_distribute_train.sh rank_table.json your_data_path ``` @@ -70,18 +130,63 @@ train_parallel1/log:epcoh: 2 step: 97, loss is 1.7133579 ``` > About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html). + +**Run vgg16, using imagenet2012 dataset** + +- Training using single device(1p) +``` +python train.py --device_target="GPU" --dataset="imagenet2012" --is_distributed=0 --data_path=$DATA_PATH > output.train.log 2>&1 & +``` + +- Distribute Training +``` +# distributed training(8p) +bash scripts/run_distribute_train_gpu.sh /path/ImageNet2012/train" +``` + + +### Evaluation + +- Do eval as follows, need to specify dataset type as "cifar10" or "imagenet2012" +``` +# when using cifar10 dataset +python eval.py --data_path=your_data_path --dataset="cifar10" --device_target="Ascend" --pre_trained=./*-70-781.ckpt > out.eval.log 2>&1 & + +# when using imagenet2012 dataset +python eval.py --data_path=your_data_path --dataset="imagenet2012" --device_target="GPU" --pre_trained=./*-150-5004.ckpt > out.eval.log 2>&1 & +``` +- If the using dataset is +The above python command will run in the background, you can view the results through the file `out.eval.log`. + +You will get the accuracy as following: +``` +# when using cifar10 dataset +# grep "result: " out.eval.log +result: {'acc': 0.92} + +# when using the imagenet2012 dataset +after allreduce eval: top1_correct=36636, tot=50000, acc=73.27% +after allreduce eval: top5_correct=45582, tot=50000, acc=91.16% +``` + ## Usage: ### Training ``` usage: train.py [--device_target TARGET][--data_path DATA_PATH] + [--dataset DATASET_TYPE][--is_distributed VALUE] [--device_id DEVICE_ID][--pre_trained PRE_TRAINED] + [--ckpt_path CHECKPOINT_PATH][--ckpt_interval INTERVAL_STEP] parameters/options: - --device_target the training backend type, default is Ascend. + --device_target the training backend type, Ascend or GPU, default is Ascend. + --dataset the dataset type, cifar10 or imagenet2012. + --is_distributed the way of traing, whether do distribute traing, value can be 0 or 1. --data_path the storage path of dataset --device_id the device which used to train model. --pre_trained the pretrained checkpoint file path. + --ckpt_path the path to save checkpoint. + --ckpt_interval the epoch interval for saving checkpoint. ``` @@ -89,16 +194,19 @@ parameters/options: ``` usage: eval.py [--device_target TARGET][--data_path DATA_PATH] - [--device_id DEVICE_ID][--checkpoint_path CKPT_PATH] + [--dataset DATASET_TYPE][--pre_trained PRE_TRAINED] + [--device_id DEVICE_ID] parameters/options: - --device_target the evaluation backend type, default is Ascend. - --data_path the storage path of datasetd + --device_target the evaluation backend type, Ascend or GPU, default is Ascend. + --dataset the dataset type, cifar10 or imagenet2012. + --data_path the storage path of dataset. --device_id the device which used to evaluate model. - --pre_trained the checkpoint file path used to evaluate model. + --pre_trained the checkpoint file path used to evaluate model. ``` ### Distribute Training +- Train on Ascend. ``` Usage: sh script/run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH] @@ -107,3 +215,11 @@ parameters/options: MINDSPORE_HCCL_CONFIG_PATH HCCL configuration file path. DATA_PATH the storage path of dataset. ``` + +- Train on GPU. +``` +Usage: bash run_distribute_train_gpu.sh [DATA_PATH] + +parameters/options: + DATA_PATH the storage path of dataset. +``` \ No newline at end of file diff --git a/model_zoo/official/cv/vgg16/eval.py b/model_zoo/official/cv/vgg16/eval.py index 2c42dbea6da..86ce02187b4 100644 --- a/model_zoo/official/cv/vgg16/eval.py +++ b/model_zoo/official/cv/vgg16/eval.py @@ -86,6 +86,8 @@ def parse_args(cloud_args=None): args_opt.padding = cfg.padding args_opt.has_bias = cfg.has_bias args_opt.batch_norm = cfg.batch_norm + args_opt.initialize_mode = cfg.initialize_mode + args_opt.has_dropout = cfg.has_dropout args_opt.image_size = list(map(int, args_opt.image_size.split(','))) diff --git a/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh new file mode 100644 index 00000000000..51be33a53af --- /dev/null +++ b/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the scipt as: " +echo "bash run_distribute_train_gpu.sh DATA_PATH" +echo "for example: bash run_distribute_train_gpu.sh /path/ImageNet2012/train" +echo "==============================================================================================================" + +DATA_PATH=$1 + +mpirun -n 8 python train.py \ + --device_target="GPU" \ + --dataset="imagenet2012" \ + --is_distributed=1 \ + --data_path=$DATA_PATH > output.train.log 2>&1 & diff --git a/model_zoo/official/cv/vgg16/src/config.py b/model_zoo/official/cv/vgg16/src/config.py index d8bfab71e7a..0861897ed2e 100755 --- a/model_zoo/official/cv/vgg16/src/config.py +++ b/model_zoo/official/cv/vgg16/src/config.py @@ -19,50 +19,54 @@ from easydict import EasyDict as edict # config for vgg16, cifar10 cifar_cfg = edict({ - 'num_classes': 10, + "num_classes": 10, "lr": 0.01, - 'lr_init': 0.01, - 'lr_max': 0.1, + "lr_init": 0.01, + "lr_max": 0.1, "lr_epochs": '30,60,90,120', "lr_scheduler": "step", - 'warmup_epochs': 5, - 'batch_size': 64, - 'max_epoch': 70, - 'momentum': 0.9, - 'weight_decay': 5e-4, + "warmup_epochs": 5, + "batch_size": 64, + "max_epoch": 70, + "momentum": 0.9, + "weight_decay": 5e-4, "loss_scale": 1.0, "label_smooth": 0, "label_smooth_factor": 0, - 'buffer_size': 10, + "buffer_size": 10, "image_size": '224,224', - 'pad_mode': 'same', - 'padding': 0, - 'has_bias': False, + "pad_mode": 'same', + "padding": 0, + "has_bias": False, "batch_norm": True, - 'keep_checkpoint_max': 10 + "keep_checkpoint_max": 10, + "initialize_mode": "XavierUniform", + "has_dropout": False }) # config for vgg16, imagenet2012 imagenet_cfg = edict({ - 'num_classes': 1000, + "num_classes": 1000, "lr": 0.01, - 'lr_init': 0.01, - 'lr_max': 0.1, + "lr_init": 0.01, + "lr_max": 0.1, "lr_epochs": '30,60,90,120', "lr_scheduler": 'cosine_annealing', - 'warmup_epochs': 0, - 'batch_size': 32, - 'max_epoch': 150, - 'momentum': 0.9, - 'weight_decay': 1e-4, + "warmup_epochs": 0, + "batch_size": 32, + "max_epoch": 150, + "momentum": 0.9, + "weight_decay": 1e-4, "loss_scale": 1024, "label_smooth": 1, "label_smooth_factor": 0.1, - 'buffer_size': 10, + "buffer_size": 10, "image_size": '224,224', - 'pad_mode': 'pad', - 'padding': 1, - 'has_bias': True, + "pad_mode": 'pad', + "padding": 1, + "has_bias": True, "batch_norm": False, - 'keep_checkpoint_max': 10 + "keep_checkpoint_max": 10, + "initialize_mode": "KaimingNormal", + "has_dropout": True }) diff --git a/model_zoo/official/cv/vgg16/src/vgg.py b/model_zoo/official/cv/vgg16/src/vgg.py index 5b94cc1d2e9..dc8c9e698d4 100644 --- a/model_zoo/official/cv/vgg16/src/vgg.py +++ b/model_zoo/official/cv/vgg16/src/vgg.py @@ -33,7 +33,7 @@ def _make_layer(base, args, batch_norm): else: weight_shape = (v, in_channels, 3, 3) weight = initializer('XavierUniform', shape=weight_shape, dtype=mstype.float32).to_tensor() - if args.dataset == "imagenet2012": + if args.initialize_mode == "KaimingNormal": weight = 'normal' conv2d = nn.Conv2d(in_channels=in_channels, out_channels=v, @@ -74,7 +74,7 @@ class Vgg(nn.Cell): self.layers = _make_layer(base, args, batch_norm=batch_norm) self.flatten = nn.Flatten() dropout_ratio = 0.5 - if args.dataset == "cifar10" or phase == "test": + if not args.has_dropout or phase == "test": dropout_ratio = 1.0 self.classifier = nn.SequentialCell([ nn.Dense(512 * 7 * 7, 4096), @@ -84,7 +84,7 @@ class Vgg(nn.Cell): nn.ReLU(), nn.Dropout(dropout_ratio), nn.Dense(4096, num_classes)]) - if args.dataset == "imagenet2012": + if args.initialize_mode == "KaimingNormal": default_recurisive_init(self) self.custom_init_weight() @@ -128,14 +128,14 @@ def vgg16(num_classes=1000, args=None, phase="train"): Args: num_classes (int): Class numbers. Default: 1000. - args(dict): param for net init. + args(namespace): param for net init. phase(str): train or test mode. Returns: Cell, cell instance of Vgg16 neural network with batch normalization. Examples: - >>> vgg16(num_classes=1000) + >>> vgg16(num_classes=1000, args=args) """ net = Vgg(cfg['16'], num_classes=num_classes, args=args, batch_norm=args.batch_norm, phase=phase) diff --git a/model_zoo/official/cv/vgg16/train.py b/model_zoo/official/cv/vgg16/train.py index dedb247aa0f..3b1d85e890d 100644 --- a/model_zoo/official/cv/vgg16/train.py +++ b/model_zoo/official/cv/vgg16/train.py @@ -161,6 +161,8 @@ def parse_args(cloud_args=None): args_opt.padding = cfg.padding args_opt.has_bias = cfg.has_bias args_opt.batch_norm = cfg.batch_norm + args_opt.initialize_mode = cfg.initialize_mode + args_opt.has_dropout = cfg.has_dropout args_opt.lr_epochs = list(map(int, cfg.lr_epochs.split(','))) args_opt.image_size = list(map(int, cfg.image_size.split(','))) diff --git a/tests/ut/python/model/test_vgg.py b/tests/ut/python/model/test_vgg.py new file mode 100644 index 00000000000..fe7f3a76c12 --- /dev/null +++ b/tests/ut/python/model/test_vgg.py @@ -0,0 +1,30 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test_vgg""" +import numpy as np +import pytest + +from mindspore import Tensor +from model_zoo.official.cv.vgg16.src.vgg import vgg16 +from model_zoo.official.cv.vgg16.src.config import cifar_cfg as cfg +from ..ut_filter import non_graph_engine + + +@non_graph_engine +def test_vgg16(): + inputs = Tensor(np.random.rand(1, 3, 112, 112).astype(np.float32)) + net = vgg16(args=cfg) + with pytest.raises(ValueError): + print(net.construct(inputs))