forked from mindspore-Ecosystem/mindspore
modify vgg16 network for cloud
This commit is contained in:
parent
4729bb1c58
commit
36ec89656a
|
@ -106,13 +106,13 @@ After installing MindSpore via the official website, you can start training and
|
|||
|
||||
```bash
|
||||
# run training example
|
||||
python train.py --data_path=[DATA_PATH] --device_id=[DEVICE_ID] --dataset=[DATASET_TYPE] > output.train.log 2>&1 &
|
||||
python train.py --config_path=[YAML_CONFIG_PATH] --data_dir=[DATA_PATH] --dataset=[DATASET_TYPE] > output.train.log 2>&1 &
|
||||
|
||||
# run distributed training example
|
||||
sh run_distribute_train.sh [RANL_TABLE_JSON] [DATA_PATH] --dataset=[DATASET_TYPE]
|
||||
sh scripts/run_distribute_train.sh [RANK_TABLE_JSON] [DATA_PATH] --dataset=[DATASET_TYPE]
|
||||
|
||||
# run evaluation example
|
||||
python eval.py --data_path=[DATA_PATH] --pre_trained=[PRE_TRAINED] --dataset=[DATASET_TYPE] > output.eval.log 2>&1 &
|
||||
python eval.py --config_path=[YAML_CONFIG_PATH] --data_dir=[DATA_PATH] --pre_trained=[PRE_TRAINED] --dataset=[DATASET_TYPE] > output.eval.log 2>&1 &
|
||||
```
|
||||
|
||||
For distributed training, a hccl configuration file with JSON format needs to be created in advance.
|
||||
|
@ -123,13 +123,118 @@ Please follow the instructions in the link below:
|
|||
|
||||
```bash
|
||||
# run training example
|
||||
python train.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_TYPE] --data_path=[DATA_PATH] > output.train.log 2>&1 &
|
||||
python train.py --config_path=[YAML_CONFIG_PATH] --device_target="GPU" --dataset=[DATASET_TYPE] --data_dir=[DATA_PATH] > output.train.log 2>&1 &
|
||||
|
||||
# run distributed training example
|
||||
sh run_distribute_train_gpu.sh [DATA_PATH] --dataset=[DATASET_TYPE]
|
||||
sh scripts/run_distribute_train_gpu.sh [DATA_PATH] --dataset=[DATASET_TYPE]
|
||||
|
||||
# run evaluation example
|
||||
python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_TYPE] --data_path=[DATA_PATH] --pre_trained=[PRE_TRAINED] > output.eval.log 2>&1 &
|
||||
python eval.py --config_path=[YAML_CONFIG_PATH] --device_target="GPU" --dataset=[DATASET_TYPE] --data_dir=[DATA_PATH] --pre_trained=[PRE_TRAINED] > output.eval.log 2>&1 &
|
||||
```
|
||||
|
||||
- Running on [ModelArts](https://support.huaweicloud.com/modelarts/)
|
||||
|
||||
```bash
|
||||
# Train Cifar10 1p on ModelArts
|
||||
# (1) Add "config_path=/path_to_code/cifar10_config.yaml" on the website UI interface.
|
||||
# (2) Perform a or b.
|
||||
# a. Set "enable_modelarts=True" on cifar10_config.yaml file.
|
||||
# Set "data_dir='/cache/data/cifar10'" on cifar10_config.yaml file.
|
||||
# Set "is_distributed=0" on cifar10_config.yaml file.
|
||||
# Set "dataset='cifar10'" on cifar10_config.yaml file.
|
||||
# Set other parameters on cifar10_config.yaml file you need.
|
||||
# b. Add "enable_modelarts=True" on the website UI interface.
|
||||
# Add "data_dir=/cache/data/cifar10" on the website UI interface.
|
||||
# Add "is_distributed=0" on the website UI interface.
|
||||
# Add "dataset=cifar10" on the website UI interface.
|
||||
# Add other parameters on the website UI interface.
|
||||
# (3) Upload a zipped dataset to the S3 bucket. (You could also upload the original dataset, but that can be very slow.)
|
||||
# (4) Set the code directory to "/path/vgg16" on the website UI interface.
|
||||
# (5) Set the startup file to "train.py" on the website UI interface.
|
||||
# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
|
||||
# (7) Create your job.
|
||||
#
|
||||
# Train Cifar10 8p on ModelArts
|
||||
# (1) Add "config_path=/path_to_code/cifar10_config.yaml" on the website UI interface.
|
||||
# (2) Perform a or b.
|
||||
# a. Set "enable_modelarts=True" on cifar10_config.yaml file.
|
||||
# Set "data_dir='/cache/data/cifar10'" on cifar10_config.yaml file.
|
||||
# Set "is_distributed=1" on cifar10_config.yaml file.
|
||||
# Set "dataset='cifar10'" on cifar10_config.yaml file.
|
||||
# Set other parameters on cifar10_config.yaml file you need.
|
||||
# b. Add "enable_modelarts=True" on the website UI interface.
|
||||
# Add "data_dir=/cache/data/cifar10" on the website UI interface.
|
||||
# Add "is_distributed=1" on the website UI interface.
|
||||
# Add "dataset=cifar10" on the website UI interface.
|
||||
# Add other parameters on the website UI interface.
|
||||
# (3) Upload a zipped dataset to the S3 bucket. (You could also upload the original dataset, but that can be very slow.)
|
||||
# (4) Set the code directory to "/path/vgg16" on the website UI interface.
|
||||
# (5) Set the startup file to "train.py" on the website UI interface.
|
||||
# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
|
||||
# (7) Create your job.
|
||||
#
|
||||
# Train Imagenet 8p on ModelArts
|
||||
# (1) Add "config_path=/path_to_code/imagenet2012_config.yaml" on the website UI interface.
|
||||
# (2) Perform a or b.
|
||||
# a. Set "enable_modelarts=True" on imagenet2012_config.yaml file.
|
||||
# Set "data_dir='/cache/data/ImageNet/train'" on imagenet2012_config.yaml file.
|
||||
# Set "is_distributed=1" on imagenet2012_config.yaml file.
|
||||
# Set "dataset='imagenet2012'" on imagenet2012_config.yaml file.
|
||||
# Set other parameters on imagenet2012_config.yaml file you need.
|
||||
# b. Add "enable_modelarts=True" on the website UI interface.
|
||||
# Add "data_dir=/cache/data/ImageNet/train" on the website UI interface.
|
||||
# Add "is_distributed=1" on the website UI interface.
|
||||
# Add "dataset=imagenet2012" on the website UI interface.
|
||||
# Add other parameters on the website UI interface.
|
||||
# (3) Upload a zipped dataset to the S3 bucket. (You could also upload the original dataset, but that can be very slow.)
|
||||
# (4) Set the code directory to "/path/vgg16" on the website UI interface.
|
||||
# (5) Set the startup file to "train.py" on the website UI interface.
|
||||
# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
|
||||
# (7) Create your job.
|
||||
#
|
||||
# Eval Cifar10 1p on ModelArts
|
||||
# (1) Add "config_path=/path_to_code/cifar10_config.yaml" on the website UI interface.
|
||||
# (2) Perform a or b.
|
||||
# a. Set "enable_modelarts=True" on cifar10_config.yaml file.
|
||||
# Set "data_dir='/cache/data/cifar10'" on cifar10_config.yaml file.
|
||||
# Set "dataset='cifar10'" on cifar10_config.yaml file.
|
||||
# Set "checkpoint_url='s3://dir_to_your_trained_model/'" on cifar10_config.yaml file.
|
||||
# Set "pre_trained='/cache/checkpoint_path/model.ckpt'" on cifar10_config.yaml file.
|
||||
# Set other parameters on cifar10_config.yaml file you need.
|
||||
# b. Add "enable_modelarts=True" on the website UI interface.
|
||||
# Add "data_dir=/cache/data/cifar10" on the website UI interface.
|
||||
# Add "dataset=cifar10" on the website UI interface.
|
||||
# Add "checkpoint_url=s3://dir_to_your_trained_model/" on the website UI interface.
|
||||
# Add "pre_trained=/cache/checkpoint_path/model.ckpt" on the website UI interface.
|
||||
# Add other parameters on the website UI interface.
|
||||
# (3) Upload or copy your pretrained model to S3 bucket.
|
||||
# (4) Upload a zipped dataset to the S3 bucket. (You could also upload the original dataset, but that can be very slow.)
|
||||
# (5) Set the code directory to "/path/vgg16" on the website UI interface.
|
||||
# (6) Set the startup file to "eval.py" on the website UI interface.
|
||||
# (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
|
||||
# (8) Create your job.
|
||||
#
|
||||
# Eval ImageNet 1p on ModelArts
|
||||
# (1) Add "config_path=/path_to_code/imagenet2012_config.yaml" on the website UI interface.
|
||||
# (2) Perform a or b.
|
||||
# a. Set "enable_modelarts=True" on imagenet2012_config.yaml file.
|
||||
# Set "data_dir='/cache/data/ImageNet/validation_preprocess'" on imagenet2012_config.yaml file.
|
||||
# Set "dataset='imagenet2012'" on imagenet2012_config.yaml file.
|
||||
# Set "checkpoint_url='s3://dir_to_your_trained_model/'" on imagenet2012_config.yaml file.
|
||||
# Set "pre_trained='/cache/checkpoint_path/model.ckpt'" on imagenet2012_config.yaml file.
|
||||
# Set other parameters on imagenet2012_config.yaml file you need.
|
||||
# b. Add "enable_modelarts=True" on the website UI interface.
|
||||
# Add "data_dir=/cache/data/ImageNet/validation_preprocess" on the website UI interface.
|
||||
# Add "dataset=imagenet2012" on the website UI interface.
|
||||
# Add "checkpoint_url=s3://dir_to_your_trained_model/" on the website UI interface.
|
||||
# Add "pre_trained=/cache/checkpoint_path/model.ckpt" on the website UI interface.
|
||||
# Add other parameters on the website UI interface.
|
||||
# (3) Upload or copy your pretrained model to S3 bucket.
|
||||
# (4) Upload a zipped dataset to the S3 bucket. (You could also upload the original dataset, but that can be very slow.)
|
||||
# (5) Set the code directory to "/path/vgg16" on the website UI interface.
|
||||
# (6) Set the startup file to "eval.py" on the website UI interface.
|
||||
# (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
|
||||
# (8) Create your job.
|
||||
```
|
||||
|
||||
## [Script Description](#contents)
|
||||
|
@ -140,17 +245,25 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_
|
|||
├── model_zoo
|
||||
├── README.md // descriptions about all the models
|
||||
├── vgg16
|
||||
├── README.md // descriptions about googlenet
|
||||
├── README.md // descriptions about vgg
|
||||
├── README_CN.md // descriptions about vgg with Chinese
|
||||
├── model_utils
|
||||
│ ├── __init__.py // init file
|
||||
│ ├── config.py // Parse arguments
|
||||
│ ├── device_adapter.py // Device adapter for ModelArts
|
||||
│ ├── local_adapter.py // Local adapter
|
||||
│ ├── moxing_adapter.py // Moxing adapter for ModelArts
|
||||
├── scripts
|
||||
│ ├── run_distribute_train.sh // shell script for distributed training on Ascend
|
||||
│ ├── run_distribute_train_gpu.sh // shell script for distributed training on GPU
|
||||
│ ├── run_eval.sh // shell script for eval on Ascend
|
||||
│ ├── run_infer_310.sh // shell script for infer on Ascend 310
|
||||
├── src
|
||||
│ ├── utils
|
||||
│ │ ├── logging.py // logging format setting
|
||||
│ │ ├── sampler.py // create sampler for dataset
|
||||
│ │ ├── util.py // util function
|
||||
│ │ ├── var_init.py // network parameter init method
|
||||
│ ├── config.py // parameter configuration
|
||||
│ ├── crossentropy.py // loss calculation
|
||||
│ ├── dataset.py // creating dataset
|
||||
│ ├── linear_warmup.py // linear learning rate
|
||||
|
@ -159,6 +272,11 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_
|
|||
│ ├──vgg.py // vgg architecture
|
||||
├── train.py // training script
|
||||
├── eval.py // evaluation script
|
||||
├── postprocess.py // postprocess script
|
||||
├── preprocess.py // preprocess script
|
||||
├── mindspore_hub_conf.py // mindspore_hub_conf script
|
||||
├── cifar10_config.yaml // Configurations for cifar10
|
||||
├── imagenet2012_config.yaml // Configurations for imagenet2012
|
||||
```
|
||||
|
||||
### [Script Parameters](#contents)
|
||||
|
@ -166,17 +284,18 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_
|
|||
#### Training
|
||||
|
||||
```bash
|
||||
usage: train.py [--device_target TARGET][--data_path DATA_PATH]
|
||||
usage: train.py [--config_path YAML_CONFIG_PATH]
|
||||
[--device_target TARGET][--data_dir DATA_PATH]
|
||||
[--dataset DATASET_TYPE][--is_distributed VALUE]
|
||||
[--device_id DEVICE_ID][--pre_trained PRE_TRAINED]
|
||||
[--pre_trained PRE_TRAINED]
|
||||
[--ckpt_path CHECKPOINT_PATH][--ckpt_interval INTERVAL_STEP]
|
||||
|
||||
parameters/options:
|
||||
--config_path the storage path of YAML_CONFIG_FILE
|
||||
--device_target the training backend type, Ascend or GPU, default is Ascend.
|
||||
--dataset the dataset type, cifar10 or imagenet2012.
|
||||
--is_distributed the way of training, whether to do distributed training, value can be 0 or 1.
|
||||
--data_path the storage path of dataset
|
||||
--device_id the device which used to train model.
|
||||
--data_dir the storage path of dataset
|
||||
--pre_trained the pretrained checkpoint file path.
|
||||
--ckpt_path the path to save checkpoint.
|
||||
--ckpt_interval the epoch interval for saving checkpoint.
|
||||
|
@ -186,76 +305,76 @@ parameters/options:
|
|||
#### Evaluation
|
||||
|
||||
```bash
|
||||
usage: eval.py [--device_target TARGET][--data_path DATA_PATH]
|
||||
usage: eval.py [--config_path YAML_CONFIG_PATH]
|
||||
[--device_target TARGET][--data_dir DATA_PATH]
|
||||
[--dataset DATASET_TYPE][--pre_trained PRE_TRAINED]
|
||||
[--device_id DEVICE_ID]
|
||||
|
||||
parameters/options:
|
||||
--config_path the storage path of YAML_CONFIG_FILE
|
||||
--device_target the evaluation backend type, Ascend or GPU, default is Ascend.
|
||||
--dataset the dataset type, cifar10 or imagenet2012.
|
||||
--data_path the storage path of dataset.
|
||||
--device_id the device which used to evaluate model.
|
||||
--data_dir the storage path of dataset.
|
||||
--pre_trained the checkpoint file path used to evaluate model.
|
||||
```
|
||||
|
||||
### [Parameter configuration](#contents)
|
||||
|
||||
Parameters for both training and evaluation can be set in config.py.
|
||||
Parameters for both training and evaluation can be set in cifar10_config.yaml/imagenet2012_config.yaml.
|
||||
|
||||
- config for vgg16, CIFAR-10 dataset
|
||||
|
||||
```bash
|
||||
"num_classes": 10, # dataset class num
|
||||
"lr": 0.01, # learning rate
|
||||
"lr_init": 0.01, # initial learning rate
|
||||
"lr_max": 0.1, # max learning rate
|
||||
"lr_epochs": '30,60,90,120', # lr changing based epochs
|
||||
"lr_scheduler": "step", # learning rate mode
|
||||
"warmup_epochs": 5, # number of warmup epoch
|
||||
"batch_size": 64, # batch size of input tensor
|
||||
"max_epoch": 70, # only valid for taining, which is always 1 for inference
|
||||
"momentum": 0.9, # momentum
|
||||
"weight_decay": 5e-4, # weight decay
|
||||
"loss_scale": 1.0, # loss scale
|
||||
"label_smooth": 0, # label smooth
|
||||
"label_smooth_factor": 0, # label smooth factor
|
||||
"buffer_size": 10, # shuffle buffer size
|
||||
"image_size": '224,224', # image size
|
||||
"pad_mode": 'same', # pad mode for conv2d
|
||||
"padding": 0, # padding value for conv2d
|
||||
"has_bias": False, # whether has bias in conv2d
|
||||
"batch_norm": True, # whether has batch_norm in conv2d
|
||||
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
|
||||
"initialize_mode": "XavierUniform", # conv2d init mode
|
||||
"has_dropout": True # whether using Dropout layer
|
||||
num_classes: 10 # dataset class num
|
||||
lr: 0.01 # learning rate
|
||||
lr_init: 0.01 # initial learning rate
|
||||
lr_max: 0.1 # max learning rate
|
||||
lr_epochs: '30,60,90,120' # lr changing based epochs
|
||||
lr_scheduler: "step" # learning rate mode
|
||||
warmup_epochs: 5 # number of warmup epoch
|
||||
batch_size: 64 # batch size of input tensor
|
||||
max_epoch: 70 # only valid for training, which is always 1 for inference
|
||||
momentum: 0.9 # momentum
|
||||
weight_decay: 0.0005 # weight decay
|
||||
loss_scale: 1.0 # loss scale
|
||||
label_smooth: 0 # label smooth
|
||||
label_smooth_factor: 0 # label smooth factor
|
||||
buffer_size: 10 # shuffle buffer size
|
||||
image_size: '224,224' # image size
|
||||
pad_mode: 'same' # pad mode for conv2d
|
||||
padding: 0 # padding value for conv2d
|
||||
has_bias: False # whether has bias in conv2d
|
||||
batch_norm: True # whether has batch_norm in conv2d
|
||||
keep_checkpoint_max: 10 # only keep the last keep_checkpoint_max checkpoint
|
||||
initialize_mode: "XavierUniform" # conv2d init mode
|
||||
has_dropout: True # whether using Dropout layer
|
||||
```
|
||||
|
||||
- config for vgg16, ImageNet2012 dataset
|
||||
|
||||
```bash
|
||||
"num_classes": 1000, # dataset class num
|
||||
"lr": 0.01, # learning rate
|
||||
"lr_init": 0.01, # initial learning rate
|
||||
"lr_max": 0.1, # max learning rate
|
||||
"lr_epochs": '30,60,90,120', # lr changing based epochs
|
||||
"lr_scheduler": "cosine_annealing", # learning rate mode
|
||||
"warmup_epochs": 0, # number of warmup epoch
|
||||
"batch_size": 32, # batch size of input tensor
|
||||
"max_epoch": 150, # only valid for taining, which is always 1 for inference
|
||||
"momentum": 0.9, # momentum
|
||||
"weight_decay": 1e-4, # weight decay
|
||||
"loss_scale": 1024, # loss scale
|
||||
"label_smooth": 1, # label smooth
|
||||
"label_smooth_factor": 0.1, # label smooth factor
|
||||
"buffer_size": 10, # shuffle buffer size
|
||||
"image_size": '224,224', # image size
|
||||
"pad_mode": 'pad', # pad mode for conv2d
|
||||
"padding": 1, # padding value for conv2d
|
||||
"has_bias": True, # whether has bias in conv2d
|
||||
"batch_norm": False, # whether has batch_norm in conv2d
|
||||
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
|
||||
"initialize_mode": "KaimingNormal", # conv2d init mode
|
||||
"has_dropout": True # whether using Dropout layer
|
||||
num_classes: 1000 # dataset class num
|
||||
lr: 0.01 # learning rate
|
||||
lr_init: 0.01 # initial learning rate
|
||||
lr_max: 0.1 # max learning rate
|
||||
lr_epochs: '30,60,90,120' # lr changing based epochs
|
||||
lr_scheduler: "cosine_annealing" # learning rate mode
|
||||
warmup_epochs: 0 # number of warmup epoch
|
||||
batch_size: 32 # batch size of input tensor
|
||||
max_epoch: 150 # only valid for training, which is always 1 for inference
|
||||
momentum: 0.9 # momentum
|
||||
weight_decay: 0.0001 # weight decay
|
||||
loss_scale: 1024 # loss scale
|
||||
label_smooth: 1 # label smooth
|
||||
label_smooth_factor: 0.1 # label smooth factor
|
||||
buffer_size: 10 # shuffle buffer size
|
||||
image_size: '224,224' # image size
|
||||
pad_mode: 'pad' # pad mode for conv2d
|
||||
padding: 1 # padding value for conv2d
|
||||
has_bias: True # whether has bias in conv2d
|
||||
batch_norm: False # whether has batch_norm in conv2d
|
||||
keep_checkpoint_max: 10 # only keep the last keep_checkpoint_max checkpoint
|
||||
initialize_mode: "KaimingNormal" # conv2d init mode
|
||||
has_dropout: True # whether using Dropout layer
|
||||
```
|
||||
|
||||
### [Training Process](#contents)
|
||||
|
@ -267,7 +386,7 @@ Parameters for both training and evaluation can be set in config.py.
|
|||
- Training using single device(1p), using CIFAR-10 dataset by default
|
||||
|
||||
```bash
|
||||
python train.py --data_path=your_data_path --device_id=6 > out.train.log 2>&1 &
|
||||
python train.py --config_path=/dir_to_code/cifar10_config.yaml --data_dir=your_data_path > out.train.log 2>&1 &
|
||||
```
|
||||
|
||||
The python command above will run in the background, you can view the results through the file `out.train.log`.
|
||||
|
@ -312,7 +431,7 @@ train_parallel1/log:epcoh: 2 step: 97, loss is 1.7133579
|
|||
- Training using single device(1p)
|
||||
|
||||
```bash
|
||||
python train.py --device_target="GPU" --dataset="imagenet2012" --is_distributed=0 --data_path=$DATA_PATH > output.train.log 2>&1 &
|
||||
python train.py --config_path=/dir_to_code/imagenet2012_config.yaml --device_target="GPU" --dataset="imagenet2012" --is_distributed=0 --data_dir=$DATA_PATH > output.train.log 2>&1 &
|
||||
```
|
||||
|
||||
- Distributed Training
|
||||
|
@ -330,10 +449,10 @@ bash scripts/run_distribute_train_gpu.sh /path/ImageNet2012/train"
|
|||
|
||||
```bash
|
||||
# when using cifar10 dataset
|
||||
python eval.py --data_path=your_data_path --dataset="cifar10" --device_target="Ascend" --pre_trained=./*-70-781.ckpt > output.eval.log 2>&1 &
|
||||
python eval.py --config_path=/dir_to_code/cifar10_config.yaml --data_dir=your_data_path --dataset="cifar10" --device_target="Ascend" --pre_trained=./*-70-781.ckpt > output.eval.log 2>&1 &
|
||||
|
||||
# when using imagenet2012 dataset
|
||||
python eval.py --data_path=your_data_path --dataset="imagenet2012" --device_target="GPU" --pre_trained=./*-150-5004.ckpt > output.eval.log 2>&1 &
|
||||
python eval.py --config_path=/dir_to_code/imagenet2012_config.yaml --data_dir=your_data_path --dataset="imagenet2012" --device_target="GPU" --pre_trained=./*-150-5004.ckpt > output.eval.log 2>&1 &
|
||||
```
|
||||
|
||||
- The above python command will run in the background, you can view the results through the file `output.eval.log`. You will get the accuracy as follows:
|
||||
|
@ -353,7 +472,7 @@ after allreduce eval: top5_correct=45582, tot=50000, acc=91.16%
|
|||
### [Export MindIR](#contents)
|
||||
|
||||
```shell
|
||||
python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
|
||||
python export.py --config_path [YAML_CONFIG_PATH] --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
|
||||
```
|
||||
|
||||
The ckpt_file parameter is required,
|
||||
|
|
|
@ -109,13 +109,13 @@ VGG 16网络主要由几个基本模块(包括卷积层和池化层)和三
|
|||
|
||||
```bash
|
||||
# 训练示例
|
||||
python train.py --data_path=[DATA_PATH] --device_id=[DEVICE_ID] --dataset=[DATASET_TYPE] > output.train.log 2>&1 &
|
||||
python train.py --config_path=[YAML_CONFIG_PATH] --data_dir=[DATA_PATH] --dataset=[DATASET_TYPE] > output.train.log 2>&1 &
|
||||
|
||||
# 分布式训练示例
|
||||
sh run_distribute_train.sh [RANL_TABLE_JSON] [DATA_PATH] --dataset=[DATASET_TYPE]
|
||||
sh scripts/run_distribute_train.sh [RANK_TABLE_JSON] [DATA_PATH] --dataset=[DATASET_TYPE]
|
||||
|
||||
# 评估示例
|
||||
python eval.py --data_path=[DATA_PATH] --pre_trained=[PRE_TRAINED] --dataset=[DATASET_TYPE] > output.eval.log 2>&1 &
|
||||
python eval.py --config_path=[YAML_CONFIG_PATH] --data_dir=[DATA_PATH] --pre_trained=[PRE_TRAINED] --dataset=[DATASET_TYPE] > output.eval.log 2>&1 &
|
||||
```
|
||||
|
||||
分布式训练需要提前创建JSON格式的HCCL配置文件。
|
||||
|
@ -126,13 +126,118 @@ python eval.py --data_path=[DATA_PATH] --pre_trained=[PRE_TRAINED] --dataset=[D
|
|||
|
||||
```bash
|
||||
# 训练示例
|
||||
python train.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_TYPE] --data_path=[DATA_PATH] > output.train.log 2>&1 &
|
||||
python train.py --config_path=[YAML_CONFIG_PATH] --device_target="GPU" --dataset=[DATASET_TYPE] --data_dir=[DATA_PATH] > output.train.log 2>&1 &
|
||||
|
||||
# 分布式训练示例
|
||||
sh run_distribute_train_gpu.sh [DATA_PATH] --dataset=[DATASET_TYPE]
|
||||
sh scripts/run_distribute_train_gpu.sh [DATA_PATH] --dataset=[DATASET_TYPE]
|
||||
|
||||
# 评估示例
|
||||
python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_TYPE] --data_path=[DATA_PATH] --pre_trained=[PRE_TRAINED] > output.eval.log 2>&1 &
|
||||
python eval.py --config_path=[YAML_CONFIG_PATH] --device_target="GPU" --dataset=[DATASET_TYPE] --data_dir=[DATA_PATH] --pre_trained=[PRE_TRAINED] > output.eval.log 2>&1 &
|
||||
```
|
||||
|
||||
- 在 ModelArts 进行训练 (如果你想在modelarts上运行,可以参考以下文档 [modelarts](https://support.huaweicloud.com/modelarts/))
|
||||
|
||||
```bash
|
||||
# 在 ModelArts 上使用 单卡训练 cifar10 数据集
|
||||
# (1) 在网页上设置 "config_path=/path_to_code/cifar10_config.yaml"
|
||||
# (2) 执行a或者b
|
||||
# a. 在 cifar10_config.yaml 文件中设置 "enable_modelarts=True"
|
||||
# 在 cifar10_config.yaml 文件中设置 "data_dir='/cache/data/cifar10'"
|
||||
# 在 cifar10_config.yaml 文件中设置 "is_distributed=0"
|
||||
# 在 cifar10_config.yaml 文件中设置 "dataset='cifar10'"
|
||||
# 在 cifar10_config.yaml 文件中设置 其他参数
|
||||
# b. 在网页上设置 "enable_modelarts=True"
|
||||
# 在网页上设置 "data_dir=/cache/data/cifar10"
|
||||
# 在网页上设置 "is_distributed=0"
|
||||
# 在网页上设置 "dataset=cifar10"
|
||||
# 在网页上设置 其他参数
|
||||
# (3) 上传你的压缩数据集到 S3 桶上 (你也可以上传原始的数据集,但那可能会很慢。)
|
||||
# (4) 在网页上设置你的代码路径为 "/path/vgg16"
|
||||
# (5) 在网页上设置启动文件为 "train.py"
|
||||
# (6) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等
|
||||
# (7) 创建训练作业
|
||||
#
|
||||
# 在 ModelArts 上使用8卡训练 cifar10 数据集
|
||||
# (1) 在网页上设置 "config_path=/path_to_code/cifar10_config.yaml"
|
||||
# (2) 执行a或者b
|
||||
# a. 在 cifar10_config.yaml 文件中设置 "enable_modelarts=True"
|
||||
# 在 cifar10_config.yaml 文件中设置 "data_dir='/cache/data/cifar10'"
|
||||
# 在 cifar10_config.yaml 文件中设置 "is_distributed=1"
|
||||
# 在 cifar10_config.yaml 文件中设置 "dataset='cifar10'"
|
||||
# 在 cifar10_config.yaml 文件中设置 其他参数
|
||||
# b. 在网页上设置 "enable_modelarts=True"
|
||||
# 在网页上设置 "data_dir=/cache/data/cifar10"
|
||||
# 在网页上设置 "is_distributed=1"
|
||||
# 在网页上设置 "dataset=cifar10"
|
||||
# 在网页上设置 其他参数
|
||||
# (3) 上传你的压缩数据集到 S3 桶上 (你也可以上传原始的数据集,但那可能会很慢。)
|
||||
# (4) 在网页上设置你的代码路径为 "/path/vgg16"
|
||||
# (5) 在网页上设置启动文件为 "train.py"
|
||||
# (6) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等
|
||||
# (7) 创建训练作业
|
||||
#
|
||||
# 在 ModelArts 上使用8卡训练 ImageNet 数据集
|
||||
# (1) 在网页上设置 "config_path=/path_to_code/imagenet2012_config.yaml"
|
||||
# (2) 执行a或者b
|
||||
# a. 在 imagenet2012_config.yaml 文件中设置 "enable_modelarts=True"
|
||||
# 在 imagenet2012_config.yaml 文件中设置 "data_dir='/cache/data/ImageNet/train'"
|
||||
# 在 imagenet2012_config.yaml 文件中设置 "is_distributed=1"
|
||||
# 在 imagenet2012_config.yaml 文件中设置 "dataset='imagenet2012'"
|
||||
# 在 imagenet2012_config.yaml 文件中设置 其他参数
|
||||
# b. 在网页上设置 "enable_modelarts=True"
|
||||
# 在网页上设置 "data_dir=/cache/data/ImageNet/train"
|
||||
# 在网页上设置 "is_distributed=1"
|
||||
# 在网页上设置 "dataset=imagenet2012"
|
||||
# 在网页上设置 其他参数
|
||||
# (3) 上传你的压缩数据集到 S3 桶上 (你也可以上传原始的数据集,但那可能会很慢。)
|
||||
# (4) 在网页上设置你的代码路径为 "/path/vgg16"
|
||||
# (5) 在网页上设置启动文件为 "train.py"
|
||||
# (6) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等
|
||||
# (7) 创建训练作业
|
||||
#
|
||||
# 在 ModelArts 上使用 单卡验证 Cifar10 数据集
|
||||
# (1) 在网页上设置 "config_path=/path_to_code/cifar10_config.yaml"
|
||||
# (2) 执行a或者b
|
||||
# a. 在 cifar10_config.yaml 文件中设置 "enable_modelarts=True"
|
||||
# 在 cifar10_config.yaml 文件中设置 "data_dir='/cache/data/cifar10'"
|
||||
# 在 cifar10_config.yaml 文件中设置 "dataset='cifar10'"
|
||||
# 在 cifar10_config.yaml 文件中设置 "checkpoint_url='s3://dir_to_your_trained_model/'"
|
||||
# 在 cifar10_config.yaml 文件中设置 "pre_trained='/cache/checkpoint_path/model.ckpt'"
|
||||
# 在 cifar10_config.yaml 文件中设置 其他参数
|
||||
# b. 在网页上设置 "enable_modelarts=True"
|
||||
# 在网页上设置 "data_dir=/cache/data/cifar10"
|
||||
# 在网页上设置 "dataset=cifar10"
|
||||
# 在网页上设置 "checkpoint_url=s3://dir_to_your_trained_model/"
|
||||
# 在网页上设置 "pre_trained=/cache/checkpoint_path/model.ckpt"
|
||||
# 在网页上设置 其他参数
|
||||
# (3) 上传你的预训练模型到 S3 桶上
|
||||
# (4) 上传你的压缩数据集到 S3 桶上 (你也可以上传原始的数据集,但那可能会很慢。)
|
||||
# (5) 在网页上设置你的代码路径为 "/path/vgg16"
|
||||
# (6) 在网页上设置启动文件为 "eval.py"
|
||||
# (7) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等
|
||||
# (8) 创建训练作业
|
||||
#
|
||||
# 在 ModelArts 上使用 单卡验证 ImageNet 数据集
|
||||
# (1) 在网页上设置 "config_path=/path_to_code/imagenet2012_config.yaml"
|
||||
# (2) 执行a或者b
|
||||
# a. 在 imagenet2012_config.yaml 文件中设置 "enable_modelarts=True"
|
||||
# 在 imagenet2012_config.yaml 文件中设置 "data_dir='/cache/data/ImageNet/validation_preprocess'"
|
||||
# 在 imagenet2012_config.yaml 文件中设置 "dataset='imagenet2012'"
|
||||
# 在 imagenet2012_config.yaml 文件中设置 "checkpoint_url='s3://dir_to_your_trained_model/'"
|
||||
# 在 imagenet2012_config.yaml 文件中设置 "pre_trained='/cache/checkpoint_path/model.ckpt'"
|
||||
# 在 imagenet2012_config.yaml 文件中设置 其他参数
|
||||
# b. 在网页上设置 "enable_modelarts=True"
|
||||
# 在网页上设置 "data_dir=/cache/data/ImageNet/validation_preprocess"
|
||||
# 在网页上设置 "dataset=imagenet2012"
|
||||
# 在网页上设置 "checkpoint_url=s3://dir_to_your_trained_model/"
|
||||
# 在网页上设置 "pre_trained=/cache/checkpoint_path/model.ckpt"
|
||||
# 在网页上设置 其他参数
|
||||
# (3) 上传你的预训练模型到 S3 桶上
|
||||
# (4) 上传你的压缩数据集到 S3 桶上 (你也可以上传原始的数据集,但那可能会很慢。)
|
||||
# (5) 在网页上设置你的代码路径为 "/path/vgg16"
|
||||
# (6) 在网页上设置启动文件为 "eval.py"
|
||||
# (7) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等
|
||||
# (8) 创建训练作业
|
||||
```
|
||||
|
||||
## 脚本说明
|
||||
|
@ -143,17 +248,25 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_
|
|||
├── model_zoo
|
||||
├── README.md // 所有模型相关说明
|
||||
├── vgg16
|
||||
├── README.md // GoogLeNet相关说明
|
||||
├── README.md // VGG 相关说明
|
||||
├── README_CN.md // VGG 相关中文说明
|
||||
├── model_utils
|
||||
├── __init__.py // 初始化文件
|
||||
├── config.py // 参数配置
|
||||
├── device_adapter.py // ModelArts的设备适配器
|
||||
├── local_adapter.py // 本地适配器
|
||||
└── moxing_adapter.py // ModelArts的模型适配器
|
||||
├── scripts
|
||||
│ ├── run_distribute_train.sh // Ascend分布式训练shell脚本
|
||||
│ ├── run_distribute_train_gpu.sh // GPU分布式训练shell脚本
|
||||
│ ├── run_distribute_train.sh // Ascend 分布式训练shell脚本
|
||||
│ ├── run_distribute_train_gpu.sh // GPU 分布式训练shell脚本
|
||||
│ ├── run_eval.sh // Ascend 验证shell脚本
|
||||
│ ├── run_infer_310.sh // Ascend310 推理shell脚本
|
||||
├── src
|
||||
│ ├── utils
|
||||
│ │ ├── logging.py // 日志格式设置
|
||||
│ │ ├── sampler.py // 为数据集创建采样器
|
||||
│ │ ├── util.py // 工具函数
|
||||
│ │ ├── var_init.py // 网络参数init方法
|
||||
│ ├── config.py // 参数配置
|
||||
│ ├── crossentropy.py // 损失计算
|
||||
│ ├── dataset.py // 创建数据集
|
||||
│ ├── linear_warmup.py // 线性学习率
|
||||
|
@ -162,6 +275,11 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_
|
|||
│ ├──vgg.py // VGG架构
|
||||
├── train.py // 训练脚本
|
||||
├── eval.py // 评估脚本
|
||||
├── postprocess.py // 后处理脚本
|
||||
├── preprocess.py // 预处理脚本
|
||||
├── mindspore_hub_conf.py // mindspore hub 脚本
|
||||
├── cifar10_config.yaml // cifar10 配置文件
|
||||
├── imagenet2012_config.yaml // imagenet2012 配置文件
|
||||
```
|
||||
|
||||
### 脚本参数
|
||||
|
@ -169,17 +287,18 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_
|
|||
#### 训练
|
||||
|
||||
```bash
|
||||
用法:train.py [--device_target TARGET][--data_path DATA_PATH]
|
||||
[--dataset DATASET_TYPE][--is_distributed VALUE]
|
||||
[--device_id DEVICE_ID][--pre_trained PRE_TRAINED]
|
||||
[--ckpt_path CHECKPOINT_PATH][--ckpt_interval INTERVAL_STEP]
|
||||
用法:train.py [--config_path YAML_CONFIG_PATH]
|
||||
[--device_target TARGET][--data_dir DATA_PATH]
|
||||
[--dataset DATASET_TYPE][--is_distributed VALUE]
|
||||
[--pre_trained PRE_TRAINED]
|
||||
[--ckpt_path CHECKPOINT_PATH][--ckpt_interval INTERVAL_STEP]
|
||||
|
||||
选项:
|
||||
--config_path yaml配置文件路径
|
||||
--device_target 训练后端类型,Ascend或GPU,默认为Ascend。
|
||||
--dataset 数据集类型,cifar10或imagenet2012。
|
||||
--is_distributed 训练方式,是否为分布式训练,值可以是0或1。
|
||||
--data_path 数据集存储路径
|
||||
--device_id 用于训练模型的设备。
|
||||
--data_dir 数据集存储路径
|
||||
--pre_trained 预训练检查点文件路径。
|
||||
--ckpt_path 存放检查点的路径。
|
||||
--ckpt_interval 保存检查点的轮次间隔。
|
||||
|
@ -189,76 +308,76 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_
|
|||
#### 评估
|
||||
|
||||
```bash
|
||||
用法:eval.py [--device_target TARGET][--data_path DATA_PATH]
|
||||
[--dataset DATASET_TYPE][--pre_trained PRE_TRAINED]
|
||||
[--device_id DEVICE_ID]
|
||||
用法:eval.py [--config_path YAML_CONFIG_PATH]
|
||||
[--device_target TARGET][--data_dir DATA_PATH]
|
||||
[--dataset DATASET_TYPE][--pre_trained PRE_TRAINED]
|
||||
|
||||
选项:
|
||||
--config_path yaml配置文件路径
|
||||
--device_target 评估后端类型,Ascend或GPU,默认为Ascend。
|
||||
--dataset 数据集类型,cifar10或imagenet2012。
|
||||
--data_path 数据集存储路径。
|
||||
--device_id 用于评估模型的设备。
|
||||
--data_dir 数据集存储路径。
|
||||
--pre_trained 用于评估模型的检查点文件路径。
|
||||
```
|
||||
|
||||
### 参数配置
|
||||
|
||||
在config.py中可以同时配置训练参数和评估参数。
|
||||
在 cifar10_config.yaml/imagenet2012_config.yaml 中可以同时配置训练参数和评估参数。
|
||||
|
||||
- 配置VGG16,CIFAR-10数据集
|
||||
|
||||
```bash
|
||||
"num_classes": 10, # 数据集类数
|
||||
"lr": 0.01, # 学习率
|
||||
"lr_init": 0.01, # 初始学习率
|
||||
"lr_max": 0.1, # 最大学习率
|
||||
"lr_epochs": '30,60,90,120', # 基于变化lr的轮次
|
||||
"lr_scheduler": "step", # 学习率模式
|
||||
"warmup_epochs": 5, # 热身轮次数
|
||||
"batch_size": 64, # 输入张量批次大小
|
||||
"max_epoch": 70, # 只对训练有效,推理固定值为1
|
||||
"momentum": 0.9, # 动量
|
||||
"weight_decay": 5e-4, # 权重衰减
|
||||
"loss_scale": 1.0, # 损失放大
|
||||
"label_smooth": 0, # 标签平滑
|
||||
"label_smooth_factor": 0, # 标签平滑因子
|
||||
"buffer_size": 10, # 混洗缓冲区大小
|
||||
"image_size": '224,224', # 图像大小
|
||||
"pad_mode": 'same', # conv2d的填充方式
|
||||
"padding": 0, # conv2d的填充值
|
||||
"has_bias": False, # conv2d是否有偏差
|
||||
"batch_norm": True, # 在conv2d中是否有batch_norm
|
||||
"keep_checkpoint_max": 10, # 只保留最后一个keep_checkpoint_max检查点
|
||||
"initialize_mode": "XavierUniform", # conv2d init模式
|
||||
"has_dropout": True # 是否使用Dropout层
|
||||
num_classes: 10 # 数据集类数
|
||||
lr: 0.01 # 学习率
|
||||
lr_init: 0.01 # 初始学习率
|
||||
lr_max: 0.1 # 最大学习率
|
||||
lr_epochs: '30,60,90,120' # 基于变化lr的轮次
|
||||
lr_scheduler: "step" # 学习率模式
|
||||
warmup_epochs: 5 # 热身轮次数
|
||||
batch_size: 64 # 输入张量批次大小
|
||||
max_epoch: 70 # 只对训练有效,推理固定值为1
|
||||
momentum: 0.9 # 动量
|
||||
weight_decay: 5e-4 # 权重衰减
|
||||
loss_scale: 1.0 # 损失放大
|
||||
label_smooth: 0 # 标签平滑
|
||||
label_smooth_factor: 0 # 标签平滑因子
|
||||
buffer_size: 10 # 混洗缓冲区大小
|
||||
image_size: '224,224' # 图像大小
|
||||
pad_mode: 'same' # conv2d的填充方式
|
||||
padding: 0 # conv2d的填充值
|
||||
has_bias: False # conv2d是否有偏差
|
||||
batch_norm: True # 在conv2d中是否有batch_norm
|
||||
keep_checkpoint_max: 10 # 只保留最后一个keep_checkpoint_max检查点
|
||||
initialize_mode: "XavierUniform" # conv2d init模式
|
||||
has_dropout: True # 是否使用Dropout层
|
||||
```
|
||||
|
||||
- VGG16配置,ImageNet2012数据集
|
||||
|
||||
```bash
|
||||
"num_classes": 1000, # 数据集类数
|
||||
"lr": 0.01, # 学习率
|
||||
"lr_init": 0.01, # 初始学习率
|
||||
"lr_max": 0.1, # 最大学习率
|
||||
"lr_epochs": '30,60,90,120', # 基于变化lr的轮次
|
||||
"lr_scheduler": "cosine_annealing", # 学习率模式
|
||||
"warmup_epochs": 0, # 热身轮次数
|
||||
"batch_size": 32, # 输入张量的批次大小
|
||||
"max_epoch": 150, # 只对训练有效,推理固定值为1
|
||||
"momentum": 0.9, # 动量
|
||||
"weight_decay": 1e-4, # 权重衰减
|
||||
"loss_scale": 1024, # 损失放大
|
||||
"label_smooth": 1, # 标签平滑
|
||||
"label_smooth_factor": 0.1, # 标签平滑因子
|
||||
"buffer_size": 10, # 混洗缓冲区大小
|
||||
"image_size": '224,224', # 图像大小
|
||||
"pad_mode": 'pad', # conv2d的填充方式
|
||||
"padding": 1, # conv2d的填充值
|
||||
"has_bias": True, # conv2d是否有偏差
|
||||
"batch_norm": False, # 在conv2d中是否有batch_norm
|
||||
"keep_checkpoint_max": 10, # 只保留最后一个keep_checkpoint_max检查点
|
||||
"initialize_mode": "KaimingNormal", # conv2d init模式
|
||||
"has_dropout": True # 是否使用Dropout层
|
||||
num_classes: 1000 # 数据集类数
|
||||
lr: 0.01 # 学习率
|
||||
lr_init: 0.01 # 初始学习率
|
||||
lr_max: 0.1 # 最大学习率
|
||||
lr_epochs: '30,60,90,120' # 基于变化lr的轮次
|
||||
lr_scheduler: "cosine_annealing" # 学习率模式
|
||||
warmup_epochs: 0 # 热身轮次数
|
||||
batch_size: 32 # 输入张量的批次大小
|
||||
max_epoch: 150 # 只对训练有效,推理固定值为1
|
||||
momentum: 0.9 # 动量
|
||||
weight_decay: 1e-4 # 权重衰减
|
||||
loss_scale: 1024 # 损失放大
|
||||
label_smooth: 1 # 标签平滑
|
||||
label_smooth_factor: 0.1 # 标签平滑因子
|
||||
buffer_size: 10 # 混洗缓冲区大小
|
||||
image_size: '224,224' # 图像大小
|
||||
pad_mode: 'pad' # conv2d的填充方式
|
||||
padding: 1 # conv2d的填充值
|
||||
has_bias: True # conv2d是否有偏差
|
||||
batch_norm: False # 在conv2d中是否有batch_norm
|
||||
keep_checkpoint_max: 10 # 只保留最后一个keep_checkpoint_max检查点
|
||||
initialize_mode: "KaimingNormal" # conv2d init模式
|
||||
has_dropout: True # 是否使用Dropout层
|
||||
```
|
||||
|
||||
### 训练过程
|
||||
|
@ -270,7 +389,7 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_
|
|||
- 使用单设备(1p)训练,默认使用CIFAR-10数据集
|
||||
|
||||
```bash
|
||||
python train.py --data_path=your_data_path --device_id=6 > out.train.log 2>&1 &
|
||||
python train.py --config_path=/dir_to_code/cifar10_config.yaml --data_dir=your_data_path > out.train.log 2>&1 &
|
||||
```
|
||||
|
||||
上述python命令在后台运行,可通过`out.train.log`文件查看结果。
|
||||
|
@ -289,7 +408,7 @@ epcoh: 2 step: 781, loss is 1.827582
|
|||
- 分布式训练
|
||||
|
||||
```bash
|
||||
sh run_distribute_train.sh rank_table.json your_data_path
|
||||
sh scripts/run_distribute_train.sh rank_table.json your_data_path
|
||||
```
|
||||
|
||||
上述shell脚本会在后台进行分布式训练,可通过`train_parallel[X]/log`文件查看结果。
|
||||
|
@ -316,7 +435,7 @@ train_parallel1/log:epcoh: 2 step: 97, loss is 1.7133579
|
|||
- 单设备训练(1p)
|
||||
|
||||
```bash
|
||||
python train.py --device_target="GPU" --dataset="imagenet2012" --is_distributed=0 --data_path=$DATA_PATH > output.train.log 2>&1 &
|
||||
python train.py --config_path=/dir_to_code/imagenet2012_config.yaml --device_target="GPU" --dataset="imagenet2012" --is_distributed=0 --data_dir=$DATA_PATH > output.train.log 2>&1 &
|
||||
```
|
||||
|
||||
- 分布式训练
|
||||
|
@ -334,10 +453,10 @@ bash scripts/run_distribute_train_gpu.sh /path/ImageNet2012/train"
|
|||
|
||||
```bash
|
||||
# 使用CIFAR-10数据集
|
||||
python eval.py --data_path=your_data_path --dataset="cifar10" --device_target="Ascend" --pre_trained=./*-70-781.ckpt > output.eval.log 2>&1 &
|
||||
python eval.py --config_path=/dir_to_code/cifar10_config.yaml --data_dir=your_data_path --dataset="cifar10" --device_target="Ascend" --pre_trained=./*-70-781.ckpt > output.eval.log 2>&1 &
|
||||
|
||||
# 使用ImageNet2012数据集
|
||||
python eval.py --data_path=your_data_path --dataset="imagenet2012" --device_target="GPU" --pre_trained=./*-150-5004.ckpt > output.eval.log 2>&1 &
|
||||
python eval.py --config_path=/dir_to_code/imagenet2012_config.yaml --data_dir=your_data_path --dataset="imagenet2012" --device_target="GPU" --pre_trained=./*-150-5004.ckpt > output.eval.log 2>&1 &
|
||||
```
|
||||
|
||||
- 上述python命令在后台运行,可通过`output.eval.log`文件查看结果。准确率如下:
|
||||
|
@ -357,7 +476,7 @@ after allreduce eval: top5_correct=45582, tot=50000, acc=91.16%
|
|||
### [导出MindIR](#contents)
|
||||
|
||||
```shell
|
||||
python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
|
||||
python export.py --config_path [YAML_CONFIG_PATH] --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
|
||||
```
|
||||
|
||||
参数ckpt_file为必填项,
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
|
||||
enable_modelarts: False
|
||||
# Url for modelarts
|
||||
data_url: ""
|
||||
train_url: ""
|
||||
checkpoint_url: ""
|
||||
# Path for local
|
||||
data_path: "/cache/data"
|
||||
output_path: "/cache/train"
|
||||
load_path: "/cache/checkpoint_path"
|
||||
device_target: "Ascend"
|
||||
need_modelarts_dataset_unzip: True
|
||||
modelarts_dataset_unzip_name: "cifar10"
|
||||
|
||||
# ==============================================================================
|
||||
# options
|
||||
num_classes: 10
|
||||
lr: 0.01
|
||||
lr_init: 0.01
|
||||
lr_max: 0.1
|
||||
lr_epochs: '30,60,90,120'
|
||||
lr_scheduler: "step"
|
||||
warmup_epochs: 5
|
||||
batch_size: 64
|
||||
max_epoch: 70
|
||||
momentum: 0.9
|
||||
weight_decay: 0.0005 # 5e-4
|
||||
loss_scale: 1.0
|
||||
label_smooth: 0
|
||||
label_smooth_factor: 0
|
||||
buffer_size: 10
|
||||
image_size: '224,224'
|
||||
pad_mode: 'same'
|
||||
padding: 0
|
||||
has_bias: False
|
||||
batch_norm: True
|
||||
keep_checkpoint_max: 10
|
||||
initialize_mode: "XavierUniform"
|
||||
has_dropout: False
|
||||
|
||||
# train options
|
||||
dataset: "cifar10"
|
||||
data_dir: ""
|
||||
pre_trained: ""
|
||||
lr_gamma: 0.1
|
||||
eta_min: 0.0
|
||||
T_max: 90
|
||||
log_interval: 100
|
||||
ckpt_path: "outputs/"
|
||||
ckpt_interval: 5
|
||||
is_save_on_master: 1
|
||||
is_distributed: 0
|
||||
|
||||
# eval options
|
||||
per_batch_size: 32
|
||||
graph_ckpt: 1
|
||||
log_path: "outputs/"
|
||||
|
||||
# postprocess options
|
||||
result_dir: ""
|
||||
label_dir: ""
|
||||
dataset_name: "cifar10"
|
||||
|
||||
# preprocess options
|
||||
result_path: "./preprocess_Result/"
|
||||
|
||||
# export options
|
||||
ckpt_file: ""
|
||||
file_name: "vgg16"
|
||||
file_format: "AIR"
|
||||
|
||||
|
||||
---
|
||||
|
||||
# Help description for each configuration
|
||||
device_target: "device where the code will be implemented."
|
||||
dataset: "choices in ['cifar10', 'imagenet2012']"
|
||||
data_dir: "data dir"
|
||||
pre_trained: "model_path, local pretrained model to load"
|
||||
lr_gamma: "decrease lr by a factor of exponential lr_scheduler"
|
||||
eta_min: "eta_min in cosine_annealing scheduler"
|
||||
T_max: "T-max in cosine_annealing scheduler"
|
||||
log_interval: "logging interval"
|
||||
ckpt_path: "checkpoint save location"
|
||||
ckpt_interval: "ckpt_interval"
|
||||
is_save_on_master: "save ckpt on master or all rank"
|
||||
is_distributed: "if multi device"
|
||||
|
||||
# eval options
|
||||
per_batch_size: "batch size for per npu"
|
||||
graph_ckpt: "graph ckpt or feed ckpt"
|
||||
log_path: "path to save log"
|
||||
|
||||
# postprocess options
|
||||
result_dir: "result files path."
|
||||
label_dir: "image file path."
|
||||
dataset_name: "choices in ['cifar10', 'imagenet2012']"
|
||||
|
||||
# preprocess options
|
||||
result_path: "result path"
|
||||
|
||||
# export options
|
||||
ckpt_file: "vgg16 ckpt file."
|
||||
file_name: "vgg16 output file name."
|
||||
file_format: "file format, choices in ['AIR', 'ONNX', 'MINDIR']"
|
|
@ -15,13 +15,13 @@
|
|||
"""Eval"""
|
||||
import os
|
||||
import time
|
||||
import argparse
|
||||
import datetime
|
||||
import glob
|
||||
import numpy as np
|
||||
import mindspore.nn as nn
|
||||
|
||||
from mindspore import Tensor, context
|
||||
from mindspore.communication.management import get_rank, get_group_size
|
||||
from mindspore.nn.optim.momentum import Momentum
|
||||
from mindspore.train.model import Model
|
||||
from mindspore.train.serialization import load_checkpoint, load_param_into_net
|
||||
|
@ -34,6 +34,9 @@ from src.vgg import vgg16
|
|||
from src.dataset import vgg_create_dataset
|
||||
from src.dataset import classification_dataset
|
||||
|
||||
from model_utils.moxing_adapter import config
|
||||
from model_utils.moxing_adapter import moxing_wrapper
|
||||
from model_utils.device_adapter import get_device_id, get_rank_id, get_device_num
|
||||
|
||||
class ParameterReduce(nn.Cell):
|
||||
"""ParameterReduce"""
|
||||
|
@ -49,51 +52,6 @@ class ParameterReduce(nn.Cell):
|
|||
return ret
|
||||
|
||||
|
||||
def parse_args(cloud_args=None):
    """
    Build the evaluation options.

    Command line arguments are parsed first, then overlaid with ``cloud_args``
    through ``merge_args``, and finally completed with the network and dataset
    settings of the config object matching the selected dataset.

    Args:
        cloud_args: Optional dictionary of overrides passed on to ``merge_args``.

    Returns:
        The argparse namespace, with ``image_size`` converted to a list of ints.
    """
    cli = argparse.ArgumentParser('mindspore classification test')
    cli.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
                     help='device where the code will be implemented. (Default: Ascend)')
    # dataset related
    cli.add_argument('--dataset', type=str, choices=["cifar10", "imagenet2012"], default="cifar10")
    cli.add_argument('--data_path', type=str, default='', help='eval data dir')
    cli.add_argument('--per_batch_size', default=32, type=int, help='batch size for per npu')
    # network related
    cli.add_argument('--graph_ckpt', type=int, default=1, help='graph ckpt or feed ckpt')
    cli.add_argument('--pre_trained', default='', type=str, help='fully path of pretrained model to load. '
                     'If it is a direction, it will test all ckpt')

    # logging related
    cli.add_argument('--log_path', type=str, default='outputs/', help='path to save log')
    cli.add_argument('--rank', type=int, default=0, help='local rank of distributed')
    cli.add_argument('--group_size', type=int, default=1, help='world size of distributed')

    opts = merge_args(cli.parse_args(), cloud_args)

    # Pick the config object that matches the selected dataset.
    if opts.dataset == "cifar10":
        from src.config import cifar_cfg as cfg
    else:
        from src.config import imagenet_cfg as cfg

    # The config's batch size replaces whatever --per_batch_size was given.
    opts.per_batch_size = cfg.batch_size
    # The remaining settings are copied over under the same attribute name.
    same_name_fields = (
        'image_size', 'num_classes', 'momentum', 'weight_decay', 'buffer_size', 'pad_mode',
        'padding', 'has_bias', 'batch_norm', 'initialize_mode', 'has_dropout',
    )
    for field in same_name_fields:
        setattr(opts, field, getattr(cfg, field))

    # "H,W" string -> [H, W]
    opts.image_size = [int(dim) for dim in opts.image_size.split(',')]

    return opts
|
||||
|
||||
|
||||
def get_top5_acc(top5_arg, gt_class):
|
||||
sub_count = 0
|
||||
for top5, gt in zip(top5_arg, gt_class):
|
||||
|
@ -102,66 +60,122 @@ def get_top5_acc(top5_arg, gt_class):
|
|||
return sub_count
|
||||
|
||||
|
||||
def merge_args(args, cloud_args):
    """
    Overlay values from ``cloud_args`` onto the matching attributes of ``args``.

    Only keys that already exist on ``args`` and carry a truthy value are
    applied. Each applied value is cast to the type of the value it replaces,
    unless the existing value is ``None``.

    Args:
        args: Namespace-like object whose attributes are updated in place.
        cloud_args: Dictionary of overrides; anything that is not a dict is ignored.

    Returns:
        The same ``args`` object.
    """
    current = vars(args)
    if not isinstance(cloud_args, dict):
        return args
    for name, override in cloud_args.items():
        if name not in current or not override:
            continue
        existing = current[name]
        if existing is not None:
            # Keep the attribute's type stable (e.g. "5" -> 5 for an int option).
            override = type(existing)(override)
        current[name] = override
    return args
|
||||
def modelarts_pre_process():
    '''
    ModelArts pre-process function.

    When ``config.need_modelarts_dataset_unzip`` is set, the dataset archive
    ``<config.data_path>/<config.modelarts_dataset_unzip_name>.zip`` is extracted
    into ``config.data_path`` by the devices whose id is a multiple of
    ``min(device_num, 8)``, while every device waits for a lock file to appear.
    In all cases ``config.log_path`` is re-rooted under ``config.output_path``.

    Raises:
        ValueError: If ``config.device_target`` is neither "GPU" nor "Ascend"
            while unzipping is requested.
    '''
    def unzip(zip_file, save_dir):
        """Extract ``zip_file`` into ``save_dir`` with progress output, unless already extracted."""
        import zipfile
        s_time = time.time()
        if os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)):
            print("Zip has been extracted.")
            return
        if not zipfile.is_zipfile(zip_file):
            print("This is not zip.")
            return

        # The context manager closes the archive handle once extraction is done
        # (the handle was previously left open).
        with zipfile.ZipFile(zip_file, 'r') as fz:
            members = fz.namelist()
            data_num = len(members)
            print("Extract Start...")
            print("unzip file num: {}".format(data_num))
            # Report progress roughly once per percent for large archives.
            data_print = int(data_num / 100) if data_num > 100 else 1
            for i, member in enumerate(members):
                if i % data_print == 0:
                    print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
                fz.extract(member, save_dir)
        elapsed = time.time() - s_time
        print("cost time: {}min:{}s.".format(int(elapsed / 60),
                                             int(int(elapsed) % 60)))
        print("Extract Done.")

    if config.need_modelarts_dataset_unzip:
        zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip")
        save_dir_1 = os.path.join(config.data_path)

        sync_lock = "/tmp/unzip_sync.lock"

        if config.device_target == "GPU":
            device_id = get_rank()
            device_num = get_group_size()
        elif config.device_target == "Ascend":
            device_id = get_device_id()
            device_num = get_device_num()
        else:
            raise ValueError("Not support device_target.")

        # Each server contains 8 devices at most.
        if device_id % min(device_num, 8) == 0 and not os.path.exists(sync_lock):
            print("Zip file path: ", zip_file_1)
            print("Unzip file save dir: ", save_dir_1)
            unzip(zip_file_1, save_dir_1)
            print("===Finish extract data synchronization===")
            try:
                os.mknod(sync_lock)
            except IOError:
                # Best effort, kept as in the original.
                # NOTE(review): if the lock file cannot be created by anyone,
                # the wait loop below never terminates - confirm this is acceptable.
                pass

        # Every device blocks here until the lock file exists.
        while not os.path.exists(sync_lock):
            time.sleep(1)

        print("Device: {}, Finish sync unzip data from {} to {}.".format(device_id, zip_file_1, save_dir_1))

    config.log_path = os.path.join(config.output_path, config.log_path)
|
||||
|
||||
|
||||
def test(cloud_args=None):
|
||||
"""test"""
|
||||
args = parse_args(cloud_args)
|
||||
_enable_graph_kernel = args.device_target == "GPU"
|
||||
@moxing_wrapper(pre_process=modelarts_pre_process)
|
||||
def run_eval():
|
||||
"""run eval"""
|
||||
config.per_batch_size = config.batch_size
|
||||
config.image_size = list(map(int, config.image_size.split(',')))
|
||||
config.rank = get_rank_id()
|
||||
config.group_size = get_device_num()
|
||||
|
||||
|
||||
_enable_graph_kernel = config.device_target == "GPU"
|
||||
context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=_enable_graph_kernel,
|
||||
enable_auto_mixed_precision=True, device_target=args.device_target, save_graphs=False)
|
||||
if os.getenv('DEVICE_ID', "not_set").isdigit() and args.device_target == "Ascend":
|
||||
enable_auto_mixed_precision=True, device_target=config.device_target, save_graphs=False)
|
||||
if os.getenv('DEVICE_ID', "not_set").isdigit() and config.device_target == "Ascend":
|
||||
context.set_context(device_id=int(os.getenv('DEVICE_ID')))
|
||||
|
||||
args.outputs_dir = os.path.join(args.log_path,
|
||||
datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
|
||||
config.outputs_dir = os.path.join(config.log_path,
|
||||
datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
|
||||
|
||||
args.logger = get_logger(args.outputs_dir, args.rank)
|
||||
args.logger.save_args(args)
|
||||
config.logger = get_logger(config.outputs_dir, config.rank)
|
||||
config.logger.save_args(config)
|
||||
|
||||
if args.dataset == "cifar10":
|
||||
net = vgg16(num_classes=args.num_classes, args=args)
|
||||
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, args.momentum,
|
||||
weight_decay=args.weight_decay)
|
||||
if config.dataset == "cifar10":
|
||||
net = vgg16(num_classes=config.num_classes, args=config)
|
||||
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, config.momentum,
|
||||
weight_decay=config.weight_decay)
|
||||
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
|
||||
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
|
||||
|
||||
param_dict = load_checkpoint(args.pre_trained)
|
||||
param_dict = load_checkpoint(config.pre_trained)
|
||||
load_param_into_net(net, param_dict)
|
||||
net.set_train(False)
|
||||
dataset = vgg_create_dataset(args.data_path, args.image_size, args.per_batch_size, training=False)
|
||||
dataset = vgg_create_dataset(config.data_dir, config.image_size, config.per_batch_size, training=False)
|
||||
res = model.eval(dataset)
|
||||
print("result: ", res)
|
||||
else:
|
||||
# network
|
||||
args.logger.important_info('start create network')
|
||||
if os.path.isdir(args.pre_trained):
|
||||
models = list(glob.glob(os.path.join(args.pre_trained, '*.ckpt')))
|
||||
config.logger.important_info('start create network')
|
||||
if os.path.isdir(config.pre_trained):
|
||||
models = list(glob.glob(os.path.join(config.pre_trained, '*.ckpt')))
|
||||
print(models)
|
||||
if args.graph_ckpt:
|
||||
if config.graph_ckpt:
|
||||
f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].split('_')[0])
|
||||
else:
|
||||
f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('_')[-1])
|
||||
args.models = sorted(models, key=f)
|
||||
config.models = sorted(models, key=f)
|
||||
else:
|
||||
args.models = [args.pre_trained,]
|
||||
config.models = [config.pre_trained,]
|
||||
|
||||
for model in args.models:
|
||||
dataset = classification_dataset(args.data_path, args.image_size, args.per_batch_size, mode='eval')
|
||||
for model in config.models:
|
||||
dataset = classification_dataset(config.data_dir, config.image_size, config.per_batch_size, mode='eval')
|
||||
eval_dataloader = dataset.create_tuple_iterator(output_numpy=True, num_epochs=1)
|
||||
network = vgg16(args.num_classes, args, phase="test")
|
||||
network = vgg16(config.num_classes, config, phase="test")
|
||||
|
||||
# pre_trained
|
||||
load_param_into_net(network, load_checkpoint(model))
|
||||
|
@ -184,30 +198,30 @@ def test(cloud_args=None):
|
|||
t1_correct = np.equal(top1_output, gt_classes).sum()
|
||||
top1_correct += t1_correct
|
||||
top5_correct += get_top5_acc(top5_output, gt_classes)
|
||||
img_tot += args.per_batch_size
|
||||
img_tot += config.per_batch_size
|
||||
|
||||
if args.rank == 0 and it == 0:
|
||||
if config.rank == 0 and it == 0:
|
||||
t_end = time.time()
|
||||
it = 1
|
||||
if args.rank == 0:
|
||||
if config.rank == 0:
|
||||
time_used = time.time() - t_end
|
||||
fps = (img_tot - args.per_batch_size) * args.group_size / time_used
|
||||
args.logger.info('Inference Performance: {:.2f} img/sec'.format(fps))
|
||||
fps = (img_tot - config.per_batch_size) * config.group_size / time_used
|
||||
config.logger.info('Inference Performance: {:.2f} img/sec'.format(fps))
|
||||
results = [[top1_correct], [top5_correct], [img_tot]]
|
||||
args.logger.info('before results={}'.format(results))
|
||||
config.logger.info('before results=%s', results)
|
||||
results = np.array(results)
|
||||
|
||||
args.logger.info('after results={}'.format(results))
|
||||
config.logger.info('after results=%s', results)
|
||||
top1_correct = results[0, 0]
|
||||
top5_correct = results[1, 0]
|
||||
img_tot = results[2, 0]
|
||||
acc1 = 100.0 * top1_correct / img_tot
|
||||
acc5 = 100.0 * top5_correct / img_tot
|
||||
args.logger.info('after allreduce eval: top1_correct={}, tot={},'
|
||||
'acc={:.2f}%(TOP1)'.format(top1_correct, img_tot, acc1))
|
||||
args.logger.info('after allreduce eval: top5_correct={}, tot={},'
|
||||
'acc={:.2f}%(TOP5)'.format(top5_correct, img_tot, acc5))
|
||||
config.logger.info('after allreduce eval: top1_correct={}, tot={},'
|
||||
'acc={:.2f}%(TOP1)'.format(top1_correct, img_tot, acc1))
|
||||
config.logger.info('after allreduce eval: top5_correct={}, tot={},'
|
||||
'acc={:.2f}%(TOP5)'.format(top5_correct, img_tot, acc5))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test()
|
||||
run_eval()
|
||||
|
|
|
@ -13,7 +13,6 @@
|
|||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""export checkpoint file into models"""
|
||||
import argparse
|
||||
import numpy as np
|
||||
|
||||
from mindspore import Tensor, context
|
||||
|
@ -22,42 +21,29 @@ from mindspore.train.serialization import load_checkpoint, export
|
|||
|
||||
from src.vgg import vgg16
|
||||
|
||||
parser = argparse.ArgumentParser(description='VGG16 export')
|
||||
parser.add_argument("--device_id", type=int, default=0, help="Device id")
|
||||
parser.add_argument('--dataset', type=str, choices=["cifar10", "imagenet2012"], default="cifar10", help='ckpt file')
|
||||
parser.add_argument('--ckpt_file', type=str, required=True, help='vgg16 ckpt file.')
|
||||
parser.add_argument('--file_name', type=str, default='vgg16', help='vgg16 output file name.')
|
||||
parser.add_argument('--file_format', type=str, choices=["AIR", "ONNX", "MINDIR"], default='AIR', help='file format')
|
||||
parser.add_argument("--device_target", type=str, choices=["Ascend", "GPU", "CPU"], default="Ascend",
|
||||
help="device target")
|
||||
args = parser.parse_args()
|
||||
from model_utils.moxing_adapter import config
|
||||
from model_utils.device_adapter import get_device_id
|
||||
|
||||
if args.dataset == "cifar10":
|
||||
from src.config import cifar_cfg as cfg
|
||||
else:
|
||||
from src.config import imagenet_cfg as cfg
|
||||
|
||||
args.num_classes = cfg.num_classes
|
||||
args.pad_mode = cfg.pad_mode
|
||||
args.padding = cfg.padding
|
||||
args.has_bias = cfg.has_bias
|
||||
args.initialize_mode = cfg.initialize_mode
|
||||
args.batch_norm = cfg.batch_norm
|
||||
args.has_dropout = cfg.has_dropout
|
||||
args.image_size = list(map(int, cfg.image_size.split(',')))
|
||||
def run_export():
|
||||
config.image_size = list(map(int, config.image_size.split(',')))
|
||||
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
|
||||
if args.device_target == "Ascend":
|
||||
context.set_context(device_id=args.device_id)
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
|
||||
if config.device_target == "Ascend":
|
||||
config.device_id = get_device_id()
|
||||
context.set_context(device_id=config.device_id)
|
||||
|
||||
if __name__ == '__main__':
|
||||
if args.dataset == "cifar10":
|
||||
net = vgg16(num_classes=args.num_classes, args=args)
|
||||
if config.dataset == "cifar10":
|
||||
net = vgg16(num_classes=config.num_classes, args=config)
|
||||
else:
|
||||
net = vgg16(args.num_classes, args, phase="test")
|
||||
net = vgg16(config.num_classes, config, phase="test")
|
||||
|
||||
load_checkpoint(args.ckpt_file, net=net)
|
||||
load_checkpoint(config.ckpt_file, net=net)
|
||||
net.set_train(False)
|
||||
|
||||
input_data = Tensor(np.zeros([cfg.batch_size, 3, args.image_size[0], args.image_size[1]]), mstype.float32)
|
||||
export(net, input_data, file_name=args.file_name, file_format=args.file_format)
|
||||
input_data = Tensor(np.zeros([config.batch_size, 3, config.image_size[0], config.image_size[1]]), mstype.float32)
|
||||
export(net, input_data, file_name=config.file_name, file_format=config.file_format)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
run_export()
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
|
||||
enable_modelarts: False
|
||||
# Url for modelarts
|
||||
data_url: ""
|
||||
train_url: ""
|
||||
checkpoint_url: ""
|
||||
# Path for local
|
||||
data_path: "/cache/data"
|
||||
output_path: "/cache/train"
|
||||
load_path: "/cache/checkpoint_path"
|
||||
device_target: "Ascend"
|
||||
need_modelarts_dataset_unzip: True
|
||||
modelarts_dataset_unzip_name: "ImageNet"
|
||||
|
||||
# ==============================================================================
|
||||
# options
|
||||
num_classes: 1000
|
||||
lr: 0.04
|
||||
lr_init: 0.01
|
||||
lr_max: 0.1
|
||||
lr_epochs: '30,60,90,120'
|
||||
lr_scheduler: 'cosine_annealing'
|
||||
warmup_epochs: 0
|
||||
batch_size: 64
|
||||
max_epoch: 90
|
||||
momentum: 0.9
|
||||
weight_decay: 0.0001 # 1e-4
|
||||
loss_scale: 1024
|
||||
label_smooth: 1
|
||||
label_smooth_factor: 0.1
|
||||
buffer_size: 10
|
||||
image_size: '224,224'
|
||||
pad_mode: 'pad'
|
||||
padding: 1
|
||||
has_bias: False
|
||||
batch_norm: False
|
||||
keep_checkpoint_max: 10
|
||||
initialize_mode: "KaimingNormal"
|
||||
has_dropout: True
|
||||
|
||||
# train option
|
||||
dataset: "imagenet2012"
|
||||
data_dir: ""
|
||||
pre_trained: ""
|
||||
lr_gamma: 0.1
|
||||
eta_min: 0.0
|
||||
T_max: 90
|
||||
log_interval: 100
|
||||
ckpt_path: "outputs/"
|
||||
ckpt_interval: 5
|
||||
is_save_on_master: 1
|
||||
is_distributed: 0
|
||||
|
||||
# eval options
|
||||
per_batch_size: 32
|
||||
graph_ckpt: 1
|
||||
log_path: "outputs/"
|
||||
|
||||
# postprocess options
|
||||
result_dir: ""
|
||||
label_dir: ""
|
||||
dataset_name: "imagenet2012"
|
||||
|
||||
# preprocess options
|
||||
result_path: "./preprocess_Result/"
|
||||
|
||||
# export options
|
||||
ckpt_file: ""
|
||||
file_name: "vgg16"
|
||||
file_format: "AIR"
|
||||
|
||||
---
|
||||
|
||||
# Help description for each configuration
|
||||
device_target: "device where the code will be implemented."
|
||||
dataset: "choices in ['cifar10', 'imagenet2012']"
|
||||
data_dir: "data dir"
|
||||
pre_trained: "model_path, local pretrained model to load"
|
||||
lr_gamma: "decrease lr by a factor of exponential lr_scheduler"
|
||||
eta_min: "eta_min in cosine_annealing scheduler"
|
||||
T_max: "T-max in cosine_annealing scheduler"
|
||||
log_interval: "logging interval"
|
||||
ckpt_path: "checkpoint save location"
|
||||
ckpt_interval: "ckpt_interval"
|
||||
is_save_on_master: "save ckpt on master or all rank"
|
||||
is_distributed: "if multi device"
|
||||
|
||||
# eval options
|
||||
per_batch_size: "batch size for per npu"
|
||||
graph_ckpt: "graph ckpt or feed ckpt"
|
||||
log_path: "path to save log"
|
||||
|
||||
# postprocess options
|
||||
result_dir: "result files path."
|
||||
label_dir: "image file path."
|
||||
dataset_name: "choices in ['cifar10', 'imagenet2012']"
|
||||
|
||||
# preprocess options
|
||||
result_path: "result path"
|
||||
|
||||
# export options
|
||||
ckpt_file: "vgg16 ckpt file."
|
||||
file_name: "vgg16 output file name."
|
||||
file_format: "file format, choices in ['AIR', 'ONNX', 'MINDIR']"
|
|
@ -14,7 +14,7 @@
|
|||
# ============================================================================
|
||||
"""hub config."""
|
||||
from src.vgg import vgg16 as VGG16
|
||||
|
||||
from model_utils.moxing_adapter import config
|
||||
|
||||
def vgg16(*args, **kwargs):
|
||||
return VGG16(*args, **kwargs)
|
||||
|
@ -22,5 +22,5 @@ def vgg16(*args, **kwargs):
|
|||
|
||||
def create_network(name, *args, **kwargs):
|
||||
if name == "vgg16":
|
||||
return vgg16(*args, **kwargs)
|
||||
return vgg16(args=config, *args, **kwargs)
|
||||
raise NotImplementedError(f"{name} is not implemented in the repo")
|
||||
|
|
|
@ -0,0 +1,136 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
"""Parse arguments"""
|
||||
|
||||
import os
|
||||
import ast
|
||||
import argparse
|
||||
from pprint import pformat
|
||||
import yaml
|
||||
|
||||
class Config:
    """
    Configuration namespace exposing dictionary entries as attributes.

    Nested dictionaries are wrapped recursively, including dictionaries that
    appear as elements of a list or tuple. Tuples are stored as lists.
    """
    def __init__(self, cfg_dict):
        def wrap(value):
            # Only dictionaries become nested Config objects.
            return Config(value) if isinstance(value, dict) else value

        for key, value in cfg_dict.items():
            if isinstance(value, (list, tuple)):
                value = [wrap(element) for element in value]
            else:
                value = wrap(value)
            setattr(self, key, value)

    def __str__(self):
        return pformat(self.__dict__)

    def __repr__(self):
        return self.__str__()
|
||||
|
||||
|
||||
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
    """
    Parse command line arguments, using the default yaml config as the option schema.

    Every scalar entry of ``cfg`` becomes a ``--<name>`` option whose type and
    default come from the yaml value; list and dict entries are not exposed.

    Args:
        parser: Parent parser.
        cfg: Base configuration.
        helper: Helper description.
        choices: Allowed values per option name.
        cfg_path: Path to the default yaml config.

    Returns:
        The parsed argparse namespace.
    """
    cli_parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                         parents=[parser])
    descriptions = helper if helper is not None else {}
    allowed = choices if choices is not None else {}
    for name, default in cfg.items():
        if isinstance(default, (list, dict)):
            continue
        # bool("False") is True, so boolean options are evaluated as Python literals.
        value_type = ast.literal_eval if isinstance(default, bool) else type(default)
        cli_parser.add_argument("--" + name, type=value_type, default=default,
                                choices=allowed.get(name),
                                help=descriptions.get(name, "Please reference to {}".format(cfg_path)))
    return cli_parser.parse_args()
|
||||
|
||||
|
||||
def parse_yaml(yaml_path):
    """
    Parse the yaml config file.

    The file may hold one to three yaml documents, in this order: the configuration,
    the help description per key, and the allowed choices per key. Documents that
    are absent are returned as empty dictionaries.

    Args:
        yaml_path: Path to the yaml config.

    Returns:
        Tuple ``(cfg, cfg_helper, cfg_choices)``.

    Raises:
        ValueError: If the yaml cannot be parsed, holds no document, or holds more
            than three documents.
    """
    with open(yaml_path, 'r') as fin:
        try:
            cfgs = list(yaml.load_all(fin.read(), Loader=yaml.FullLoader))
        except yaml.YAMLError as err:
            # Only yaml errors are translated; the cause is kept for debugging.
            raise ValueError("Failed to parse yaml") from err
    if not cfgs:
        raise ValueError("Failed to parse yaml: no document found in {}".format(yaml_path))
    if len(cfgs) > 3:
        raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")
    # Pad the missing trailing documents with fresh, independent empty dictionaries.
    padding = [{} for _ in range(3 - len(cfgs))]
    cfg, cfg_helper, cfg_choices = cfgs + padding
    # NOTE(review): looks like leftover debug output; kept so stdout stays unchanged.
    print(cfg_helper)
    return cfg, cfg_helper, cfg_choices
|
||||
|
||||
|
||||
def merge(args, cfg):
    """
    Merge the base config from yaml file and command line arguments.

    Every attribute of ``args`` is written into ``cfg`` under the same name,
    overwriting an existing entry. ``cfg`` is modified in place.

    Args:
        args: Command line arguments.
        cfg: Base configuration.

    Returns:
        The same ``cfg`` object, after the update.
    """
    for name, value in vars(args).items():
        cfg[name] = value
    return cfg
|
||||
|
||||
|
||||
def get_config():
    """
    Get Config according to the yaml file and cli arguments.

    ``--config_path`` is read first (defaulting to ``../cifar10_config.yaml`` relative to
    this module), the yaml at that path supplies the defaults, and the remaining command
    line options then override those defaults.
    """
    base_parser = argparse.ArgumentParser(description="default name", add_help=False)
    module_dir = os.path.dirname(os.path.abspath(__file__))
    default_yaml = os.path.join(module_dir, "../cifar10_config.yaml")
    base_parser.add_argument("--config_path", type=str, default=default_yaml,
                             help="Config file path")
    # Only --config_path is known at this point; the other options are parsed later.
    known_args, _ = base_parser.parse_known_args()
    yaml_path = known_args.config_path
    default, helper, choices = parse_yaml(yaml_path)
    cli_args = parse_cli_to_yaml(parser=base_parser, cfg=default, helper=helper, choices=choices,
                                 cfg_path=yaml_path)
    return Config(merge(cli_args, default))
|
||||
|
||||
# --------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
def get_config_static(config_path="../cifar10_config.yaml"):
    """
    Get Config from the yaml file only; command line arguments are not read.

    A ``config_path`` that does not start with "/" is resolved relative to the
    directory containing this module.
    """
    resolved_path = config_path
    if not resolved_path.startswith("/"):
        module_dir = os.path.dirname(os.path.abspath(__file__))
        resolved_path = os.path.join(module_dir, resolved_path)
    # Only the first yaml document (the configuration itself) is used here.
    cfg_dict, _helper, _choices = parse_yaml(resolved_path)
    return Config(cfg_dict)
|
|
@ -0,0 +1,27 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
"""Device adapter for ModelArts"""
|
||||
|
||||
from .moxing_adapter import config
|
||||
|
||||
if config.enable_modelarts:
|
||||
from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
|
||||
else:
|
||||
from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
|
||||
|
||||
__all__ = [
|
||||
"get_device_id", "get_device_num", "get_rank_id", "get_job_id"
|
||||
]
|
|
@ -0,0 +1,36 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
"""Local adapter"""
|
||||
|
||||
import os
|
||||
|
||||
def get_device_id():
    """Return the ``DEVICE_ID`` environment variable as an int (0 when unset)."""
    return int(os.getenv('DEVICE_ID', '0'))
|
||||
|
||||
|
||||
def get_device_num():
    """Return the ``RANK_SIZE`` environment variable as an int (1 when unset)."""
    return int(os.getenv('RANK_SIZE', '1'))
|
||||
|
||||
|
||||
def get_rank_id():
    """Return the ``RANK_ID`` environment variable as an int (0 when unset)."""
    return int(os.getenv('RANK_ID', '0'))
|
||||
|
||||
|
||||
def get_job_id():
    """Return the fixed job identifier used by the local adapter."""
    job_id = "Local Job"
    return job_id
|
|
@ -0,0 +1,118 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
"""Moxing adapter for ModelArts"""
|
||||
|
||||
import os
|
||||
import functools
|
||||
from mindspore import context
|
||||
from .config import get_config
|
||||
|
||||
config = get_config()
|
||||
|
||||
_global_sync_count = 0
|
||||
|
||||
def get_device_id():
    """Return the ``DEVICE_ID`` environment variable as an int (0 when unset)."""
    return int(os.getenv('DEVICE_ID', '0'))
|
||||
|
||||
|
||||
def get_device_num():
    """Return the ``RANK_SIZE`` environment variable as an int (1 when unset)."""
    return int(os.getenv('RANK_SIZE', '1'))
|
||||
|
||||
|
||||
def get_rank_id():
    """Return the ``RANK_ID`` environment variable as an int (0 when unset)."""
    return int(os.getenv('RANK_ID', '0'))
|
||||
|
||||
|
||||
def get_job_id():
    """
    Return the ``JOB_ID`` environment variable, or "default" when it is unset or empty.
    """
    # os.getenv returns None for an unset variable; the previous `!= ""` test let that
    # None through instead of falling back to "default".
    return os.getenv('JOB_ID') or "default"
|
||||
|
||||
def sync_data(from_path, to_path):
    """
    Download data from remote obs to local directory if the first url is remote url and the second one is local path
    Upload data from local directory to remote obs in contrast.

    The copy is done with ``mox.file.copy_parallel`` by the process whose device id is a
    multiple of ``min(device_num, 8)``. Every process then waits until a lock file under
    ``/tmp`` exists before returning.

    Args:
        from_path: Source passed to ``mox.file.copy_parallel``.
        to_path: Destination passed to ``mox.file.copy_parallel``.
    """
    import moxing as mox
    import time
    global _global_sync_count
    # One lock file per call: the counter suffix distinguishes successive calls in a process.
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices at most.
    # The copy is skipped when the lock file already exists.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            # Create the lock file that releases the waiting processes below.
            os.mknod(sync_lock)
        except IOError:
            # NOTE(review): if creation fails and the file does not exist, the loop
            # below never ends -- confirm this best-effort handling is intended.
            pass
        print("===save flag===")

    # Poll once per second until the lock file exists.
    while True:
        if os.path.exists(sync_lock):
            break
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))
|
||||
|
||||
|
||||
def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.

    Builds a decorator. When ``config.enable_modelarts`` is truthy, the decorated function
    is preceded by ``sync_data`` downloads for ``config.data_url``, ``config.checkpoint_url``
    and ``config.train_url`` (each only when set), and followed by an upload of
    ``config.output_path`` to ``config.train_url``. Otherwise the function is simply called.

    Args:
        pre_process: Optional zero-argument callable run after the downloads and before
            the decorated function, only when ``config.enable_modelarts`` is truthy.
        post_process: Optional zero-argument callable run after the decorated function
            and before the upload, only when ``config.enable_modelarts`` is truthy.

    Returns:
        A decorator. The decorated function returns whatever the wrapped function returns.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                if not os.path.exists(config.output_path):
                    # exist_ok: another device process may create the directory between
                    # the check above and this call.
                    os.makedirs(config.output_path, exist_ok=True)

                if pre_process:
                    pre_process()

            # Run the main function, keeping its result so the decorator does not drop it.
            result = run_func(*args, **kwargs)

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
            return result
        return wrapped_func
    return wrapper
|
|
@ -15,39 +15,31 @@
|
|||
"""postprocess for 310 inference"""
|
||||
import os
|
||||
import json
|
||||
import argparse
|
||||
import numpy as np
|
||||
from mindspore.nn import Top1CategoricalAccuracy, Top5CategoricalAccuracy
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(description="postprocess")
|
||||
parser.add_argument("--result_dir", type=str, required=True, help="result files path.")
|
||||
parser.add_argument("--label_dir", type=str, required=True, help="image file path.")
|
||||
parser.add_argument('--dataset_name', type=str, choices=["cifar10", "imagenet2012"], default="cifar10")
|
||||
args = parser.parse_args()
|
||||
from model_utils.moxing_adapter import config
|
||||
|
||||
if __name__ == '__main__':
|
||||
top1_acc = Top1CategoricalAccuracy()
|
||||
rst_path = args.result_dir
|
||||
if args.dataset_name == "cifar10":
|
||||
from src.config import cifar_cfg as cfg
|
||||
labels = np.load(args.label_dir, allow_pickle=True)
|
||||
rst_path = config.result_dir
|
||||
if config.dataset_name == "cifar10":
|
||||
labels = np.load(config.label_dir, allow_pickle=True)
|
||||
for idx, label in enumerate(labels):
|
||||
f_name = os.path.join(rst_path, "VGG16_data_bs" + str(cfg.batch_size) + "_" + str(idx) + "_0.bin")
|
||||
f_name = os.path.join(rst_path, "VGG16_data_bs" + str(config.batch_size) + "_" + str(idx) + "_0.bin")
|
||||
pred = np.fromfile(f_name, np.float32)
|
||||
pred = pred.reshape(cfg.batch_size, int(pred.shape[0] / cfg.batch_size))
|
||||
pred = pred.reshape(config.batch_size, int(pred.shape[0] / config.batch_size))
|
||||
top1_acc.update(pred, labels[idx])
|
||||
print("acc: ", top1_acc.eval())
|
||||
else:
|
||||
from src.config import imagenet_cfg as cfg
|
||||
top5_acc = Top5CategoricalAccuracy()
|
||||
file_list = os.listdir(rst_path)
|
||||
with open(args.label_dir, "r") as label:
|
||||
with open(config.label_dir, "r") as label:
|
||||
labels = json.load(label)
|
||||
for f in file_list:
|
||||
label = f.split("_0.bin")[0] + ".JPEG"
|
||||
pred = np.fromfile(os.path.join(rst_path, f), np.float32)
|
||||
pred = pred.reshape(cfg.batch_size, int(pred.shape[0] / cfg.batch_size))
|
||||
pred = pred.reshape(config.batch_size, int(pred.shape[0] / config.batch_size))
|
||||
top1_acc.update(pred, [labels[label],])
|
||||
top5_acc.update(pred, [labels[label],])
|
||||
print("Top1 acc: ", top1_acc.eval())
|
||||
|
|
|
@ -14,11 +14,13 @@
|
|||
# ============================================================================
|
||||
"""preprocess"""
|
||||
import os
|
||||
import argparse
|
||||
import json
|
||||
import numpy as np
|
||||
from src.dataset import vgg_create_dataset
|
||||
|
||||
from model_utils.moxing_adapter import config
|
||||
|
||||
|
||||
def create_label(result_path, dir_path):
|
||||
print("[WARNING] Create imagenet label. Currently only use for Imagenet2012!")
|
||||
dirs = os.listdir(dir_path)
|
||||
|
@ -41,33 +43,22 @@ def create_label(result_path, dir_path):
|
|||
|
||||
print("[INFO] Completed! Total {} data.".format(total))
|
||||
|
||||
parser = argparse.ArgumentParser('preprocess')
|
||||
parser.add_argument('--dataset', type=str, choices=["cifar10", "imagenet2012"], default="cifar10")
|
||||
parser.add_argument('--data_path', type=str, default='', help='eval data dir')
|
||||
parser.add_argument('--result_path', type=str, default='./preprocess_Result/', help='result path')
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.dataset == "cifar10":
|
||||
from src.config import cifar_cfg as cfg
|
||||
else:
|
||||
from src.config import imagenet_cfg as cfg
|
||||
|
||||
args.per_batch_size = cfg.batch_size
|
||||
args.image_size = list(map(int, cfg.image_size.split(',')))
|
||||
config.per_batch_size = config.batch_size
|
||||
config.image_size = list(map(int, config.image_size.split(',')))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if args.dataset == "cifar10":
|
||||
dataset = vgg_create_dataset(args.data_path, args.image_size, args.per_batch_size, training=False)
|
||||
img_path = os.path.join(args.result_path, "00_data")
|
||||
if config.dataset == "cifar10":
|
||||
dataset = vgg_create_dataset(config.data_dir, config.image_size, config.per_batch_size, training=False)
|
||||
img_path = os.path.join(config.result_path, "00_data")
|
||||
os.makedirs(img_path)
|
||||
label_list = []
|
||||
for idx, data in enumerate(dataset.create_dict_iterator(output_numpy=True)):
|
||||
file_name = "VGG16_data_bs" + str(args.per_batch_size) + "_" + str(idx) + ".bin"
|
||||
file_name = "VGG16_data_bs" + str(config.per_batch_size) + "_" + str(idx) + ".bin"
|
||||
file_path = os.path.join(img_path, file_name)
|
||||
data["image"].tofile(file_path)
|
||||
label_list.append(data["label"])
|
||||
np.save(os.path.join(args.result_path, "cifar10_label_ids.npy"), label_list)
|
||||
np.save(os.path.join(config.result_path, "cifar10_label_ids.npy"), label_list)
|
||||
print("=" * 20, "export bin files finished", "=" * 20)
|
||||
else:
|
||||
create_label(args.result_path, args.data_path)
|
||||
create_label(config.result_path, config.data_dir)
|
||||
|
|
|
@ -32,6 +32,13 @@ then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
# Print $1 unchanged when it is absolute; otherwise resolve it against $PWD.
# Uses `case` instead of ${1:0:1} so it also works when the script is run with a
# POSIX sh, and quotes the path so names containing spaces are not word-split.
get_real_path(){
  case "$1" in
    /*) echo "$1" ;;
    *) echo "$(realpath -m "$PWD/$1")" ;;
  esac
}
|
||||
|
||||
dataset_type='cifar10'
|
||||
if [ $# == 3 ]
|
||||
|
@ -43,6 +50,8 @@ then
|
|||
fi
|
||||
dataset_type=$3
|
||||
fi
|
||||
config_path=$(get_real_path "./${dataset_type}_config.yaml")
|
||||
echo "config path is : ${config_path}"
|
||||
|
||||
export DEVICE_NUM=8
|
||||
export RANK_SIZE=8
|
||||
|
@ -68,10 +77,12 @@ do
|
|||
rm -rf ./train_parallel$DEVICE_ID
|
||||
mkdir ./train_parallel$DEVICE_ID
|
||||
cp $src_dir/*.py ./train_parallel$DEVICE_ID
|
||||
cp $src_dir/*.yaml ./train_parallel$DEVICE_ID
|
||||
cp -r $src_dir/src ./train_parallel$DEVICE_ID
|
||||
cp -r $src_dir/model_utils ./train_parallel$DEVICE_ID
|
||||
cd ./train_parallel$DEVICE_ID || exit
|
||||
echo "start training for rank $RANK_ID, device $DEVICE_ID, $dataset_type"
|
||||
env > env.log
|
||||
taskset -c $cmdopt python train.py --data_path=$2 --device_target="Ascend" --device_id=$DEVICE_ID --is_distributed=1 --dataset=$dataset_type &> log &
|
||||
taskset -c $cmdopt python train.py --config_path=$config_path --data_dir=$2 --device_target="Ascend" --is_distributed=1 --dataset=$dataset_type &> log &
|
||||
cd ..
|
||||
done
|
||||
|
|
|
@ -22,9 +22,19 @@ echo "==========================================================================
|
|||
|
||||
DATA_PATH=$1
|
||||
|
||||
# Print $1 unchanged when it is absolute; otherwise resolve it against $PWD.
# Uses `case` instead of ${1:0:1} so it also works when the script is run with a
# POSIX sh, and quotes the path so names containing spaces are not word-split.
get_real_path(){
  case "$1" in
    /*) echo "$1" ;;
    *) echo "$(realpath -m "$PWD/$1")" ;;
  esac
}
|
||||
config_path=$(get_real_path "./imagenet2012_config.yaml")
|
||||
|
||||
mpirun -n 8 --output-filename log_output --merge-stderr-to-stdout \
|
||||
python train.py \
|
||||
--config_path=$config_path \
|
||||
--device_target="GPU" \
|
||||
--dataset="imagenet2012" \
|
||||
--is_distributed=1 \
|
||||
--data_path=$DATA_PATH > output.train.log 2>&1 &
|
||||
--data_dir=$DATA_PATH > output.train.log 2>&1 &
|
||||
|
|
|
@ -25,8 +25,20 @@ DATASET_TYPE=$2
|
|||
DEVICE_TYPE=$3
|
||||
CHECKPOINT_PATH=$4
|
||||
|
||||
# Print $1 unchanged when it is absolute; otherwise resolve it against $PWD.
# Uses `case` instead of ${1:0:1} so it also works when the script is run with a
# POSIX sh, and quotes the path so names containing spaces are not word-split.
get_real_path(){
  case "$1" in
    /*) echo "$1" ;;
    *) echo "$(realpath -m "$PWD/$1")" ;;
  esac
}
|
||||
|
||||
config_path=$(get_real_path "./${DATASET_TYPE}_config.yaml")
|
||||
echo "config path is : ${config_path}"
|
||||
|
||||
python eval.py \
|
||||
--data_path=$DATA_PATH \
|
||||
--config_path=$config_path \
|
||||
--data_dir=$DATA_PATH \
|
||||
--dataset=$DATASET_TYPE \
|
||||
--device_target=$DEVICE_TYPE \
|
||||
--pre_trained=$CHECKPOINT_PATH > output.eval.log 2>&1 &
|
||||
|
|
|
@ -36,6 +36,8 @@ else
|
|||
echo "DATASET_NAME can choose from ['cifar10', 'imagenet2012']"
|
||||
exit 1
|
||||
fi
|
||||
config_path=$(get_real_path "../${dataset_name}_config.yaml")
|
||||
echo "config path is : ${config_path}"
|
||||
|
||||
dataset_path=$(get_real_path $3)
|
||||
|
||||
|
@ -77,7 +79,7 @@ function preprocess_data()
|
|||
rm -rf ./preprocess_Result
|
||||
fi
|
||||
mkdir preprocess_Result
|
||||
python3.7 ../preprocess.py --dataset=$dataset_name --data_path=$dataset_path --result_path=./preprocess_Result/
|
||||
python3.7 ../preprocess.py --config_path=$config_path --dataset=$dataset_name --data_dir=$dataset_path --result_path=./preprocess_Result/
|
||||
}
|
||||
|
||||
function compile_app()
|
||||
|
@ -108,9 +110,9 @@ function infer()
|
|||
function cal_acc()
|
||||
{
|
||||
if [ "$dataset_name" == "cifar10" ]; then
|
||||
python3.7 ../postprocess.py --result_dir=./result_Files --label_dir=./preprocess_Result/cifar10_label_ids.npy --dataset_name=$dataset_name &> acc.log
|
||||
python3.7 ../postprocess.py --config_path=$config_path --result_dir=./result_Files --label_dir=./preprocess_Result/cifar10_label_ids.npy --dataset_name=$dataset_name &> acc.log
|
||||
else
|
||||
python3.7 ../postprocess.py --result_dir=./result_Files --label_dir=./preprocess_Result/imagenet_label.json --dataset_name=$dataset_name &> acc.log
|
||||
python3.7 ../postprocess.py --config_path=$config_path --result_dir=./result_Files --label_dir=./preprocess_Result/imagenet_label.json --dataset_name=$dataset_name &> acc.log
|
||||
fi
|
||||
}
|
||||
|
||||
|
|
|
@ -1,72 +0,0 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
network config setting, will be used in train.py and eval.py
|
||||
"""
|
||||
from easydict import EasyDict as edict
|
||||
|
||||
# config for vgg16, cifar10
|
||||
cifar_cfg = edict({
|
||||
"num_classes": 10,
|
||||
"lr": 0.01,
|
||||
"lr_init": 0.01,
|
||||
"lr_max": 0.1,
|
||||
"lr_epochs": '30,60,90,120',
|
||||
"lr_scheduler": "step",
|
||||
"warmup_epochs": 5,
|
||||
"batch_size": 64,
|
||||
"max_epoch": 70,
|
||||
"momentum": 0.9,
|
||||
"weight_decay": 5e-4,
|
||||
"loss_scale": 1.0,
|
||||
"label_smooth": 0,
|
||||
"label_smooth_factor": 0,
|
||||
"buffer_size": 10,
|
||||
"image_size": '224,224',
|
||||
"pad_mode": 'same',
|
||||
"padding": 0,
|
||||
"has_bias": False,
|
||||
"batch_norm": True,
|
||||
"keep_checkpoint_max": 10,
|
||||
"initialize_mode": "XavierUniform",
|
||||
"has_dropout": False
|
||||
})
|
||||
|
||||
# config for vgg16, imagenet2012
|
||||
imagenet_cfg = edict({
|
||||
"num_classes": 1000,
|
||||
"lr": 0.04,
|
||||
"lr_init": 0.01,
|
||||
"lr_max": 0.1,
|
||||
"lr_epochs": '30,60,90,120',
|
||||
"lr_scheduler": 'cosine_annealing',
|
||||
"warmup_epochs": 0,
|
||||
"batch_size": 64,
|
||||
"max_epoch": 90,
|
||||
"momentum": 0.9,
|
||||
"weight_decay": 1e-4,
|
||||
"loss_scale": 1024,
|
||||
"label_smooth": 1,
|
||||
"label_smooth_factor": 0.1,
|
||||
"buffer_size": 10,
|
||||
"image_size": '224,224',
|
||||
"pad_mode": 'pad',
|
||||
"padding": 1,
|
||||
"has_bias": False,
|
||||
"batch_norm": False,
|
||||
"keep_checkpoint_max": 10,
|
||||
"initialize_mode": "KaimingNormal",
|
||||
"has_dropout": True
|
||||
})
|
|
@ -142,9 +142,5 @@ def vgg16(num_classes=1000, args=None, phase="train", **kwargs):
|
|||
Examples:
|
||||
>>> vgg16(num_classes=1000, args=args, **kwargs)
|
||||
"""
|
||||
|
||||
if args is None:
|
||||
from .config import cifar_cfg
|
||||
args = cifar_cfg
|
||||
net = Vgg(cfg['16'], num_classes=num_classes, args=args, batch_norm=args.batch_norm, phase=phase, **kwargs)
|
||||
return net
|
||||
|
|
|
@ -15,9 +15,9 @@
|
|||
"""
|
||||
#################train vgg16 example on cifar10########################
|
||||
"""
|
||||
import argparse
|
||||
import datetime
|
||||
import os
|
||||
import time
|
||||
|
||||
import mindspore.nn as nn
|
||||
from mindspore import Tensor
|
||||
|
@ -41,197 +41,193 @@ from src.utils.logging import get_logger
|
|||
from src.utils.util import get_param_groups
|
||||
from src.vgg import vgg16
|
||||
|
||||
from model_utils.moxing_adapter import config
|
||||
from model_utils.moxing_adapter import moxing_wrapper
|
||||
from model_utils.device_adapter import get_device_id, get_rank_id, get_device_num
|
||||
|
||||
set_seed(1)
|
||||
|
||||
def modelarts_pre_process():
|
||||
'''modelarts pre process function.'''
|
||||
def unzip(zip_file, save_dir):
|
||||
import zipfile
|
||||
s_time = time.time()
|
||||
if not os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)):
|
||||
zip_isexist = zipfile.is_zipfile(zip_file)
|
||||
if zip_isexist:
|
||||
fz = zipfile.ZipFile(zip_file, 'r')
|
||||
data_num = len(fz.namelist())
|
||||
print("Extract Start...")
|
||||
print("unzip file num: {}".format(data_num))
|
||||
data_print = int(data_num / 100) if data_num > 100 else 1
|
||||
i = 0
|
||||
for file in fz.namelist():
|
||||
if i % data_print == 0:
|
||||
print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
|
||||
i += 1
|
||||
fz.extract(file, save_dir)
|
||||
print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
|
||||
int(int(time.time() - s_time) % 60)))
|
||||
print("Extract Done.")
|
||||
else:
|
||||
print("This is not zip.")
|
||||
else:
|
||||
print("Zip has been extracted.")
|
||||
|
||||
def parse_args(cloud_args=None):
|
||||
"""parameters"""
|
||||
parser = argparse.ArgumentParser('mindspore classification training')
|
||||
parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
|
||||
help='device where the code will be implemented. (Default: Ascend)')
|
||||
parser.add_argument('--device_id', type=int, default=1, help='device id of GPU or Ascend. (Default: None)')
|
||||
if config.need_modelarts_dataset_unzip:
|
||||
zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip")
|
||||
save_dir_1 = os.path.join(config.data_path)
|
||||
|
||||
# dataset related
|
||||
parser.add_argument('--dataset', type=str, choices=["cifar10", "imagenet2012"], default="cifar10")
|
||||
parser.add_argument('--data_path', type=str, default='', help='train data dir')
|
||||
sync_lock = "/tmp/unzip_sync.lock"
|
||||
|
||||
# network related
|
||||
parser.add_argument('--pre_trained', default='', type=str, help='model_path, local pretrained model to load')
|
||||
parser.add_argument('--lr_gamma', type=float, default=0.1,
|
||||
help='decrease lr by a factor of exponential lr_scheduler')
|
||||
parser.add_argument('--eta_min', type=float, default=0., help='eta_min in cosine_annealing scheduler')
|
||||
parser.add_argument('--T_max', type=int, default=90, help='T-max in cosine_annealing scheduler')
|
||||
# Each server contains 8 devices at most.
|
||||
if config.device_target == "GPU":
|
||||
device_id = get_rank()
|
||||
device_num = get_group_size()
|
||||
elif config.device_target == "Ascend":
|
||||
device_id = get_device_id()
|
||||
device_num = get_device_num()
|
||||
else:
|
||||
raise ValueError("Not support device_target.")
|
||||
|
||||
# logging and checkpoint related
|
||||
parser.add_argument('--log_interval', type=int, default=100, help='logging interval')
|
||||
parser.add_argument('--ckpt_path', type=str, default='outputs/', help='checkpoint save location')
|
||||
parser.add_argument('--ckpt_interval', type=int, default=5, help='ckpt_interval')
|
||||
parser.add_argument('--is_save_on_master', type=int, default=1, help='save ckpt on master or all rank')
|
||||
if device_id % min(device_num, 8) == 0 and not os.path.exists(sync_lock):
|
||||
print("Zip file path: ", zip_file_1)
|
||||
print("Unzip file save dir: ", save_dir_1)
|
||||
unzip(zip_file_1, save_dir_1)
|
||||
print("===Finish extract data synchronization===")
|
||||
try:
|
||||
os.mknod(sync_lock)
|
||||
except IOError:
|
||||
pass
|
||||
|
||||
# distributed related
|
||||
parser.add_argument('--is_distributed', type=int, default=0, help='if multi device')
|
||||
parser.add_argument('--rank', type=int, default=0, help='local rank of distributed')
|
||||
parser.add_argument('--group_size', type=int, default=1, help='world size of distributed')
|
||||
args_opt = parser.parse_args()
|
||||
args_opt = merge_args(args_opt, cloud_args)
|
||||
while True:
|
||||
if os.path.exists(sync_lock):
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
if args_opt.dataset == "cifar10":
|
||||
from src.config import cifar_cfg as cfg
|
||||
else:
|
||||
from src.config import imagenet_cfg as cfg
|
||||
print("Device: {}, Finish sync unzip data from {} to {}.".format(device_id, zip_file_1, save_dir_1))
|
||||
|
||||
args_opt.label_smooth = cfg.label_smooth
|
||||
args_opt.label_smooth_factor = cfg.label_smooth_factor
|
||||
args_opt.lr_scheduler = cfg.lr_scheduler
|
||||
args_opt.loss_scale = cfg.loss_scale
|
||||
args_opt.max_epoch = cfg.max_epoch
|
||||
args_opt.warmup_epochs = cfg.warmup_epochs
|
||||
args_opt.lr = cfg.lr
|
||||
args_opt.lr_init = cfg.lr_init
|
||||
args_opt.lr_max = cfg.lr_max
|
||||
args_opt.momentum = cfg.momentum
|
||||
args_opt.weight_decay = cfg.weight_decay
|
||||
args_opt.per_batch_size = cfg.batch_size
|
||||
args_opt.num_classes = cfg.num_classes
|
||||
args_opt.buffer_size = cfg.buffer_size
|
||||
args_opt.ckpt_save_max = cfg.keep_checkpoint_max
|
||||
args_opt.pad_mode = cfg.pad_mode
|
||||
args_opt.padding = cfg.padding
|
||||
args_opt.has_bias = cfg.has_bias
|
||||
args_opt.batch_norm = cfg.batch_norm
|
||||
args_opt.initialize_mode = cfg.initialize_mode
|
||||
args_opt.has_dropout = cfg.has_dropout
|
||||
|
||||
args_opt.lr_epochs = list(map(int, cfg.lr_epochs.split(',')))
|
||||
args_opt.image_size = list(map(int, cfg.image_size.split(',')))
|
||||
|
||||
return args_opt
|
||||
config.ckpt_path = os.path.join(config.output_path, config.ckpt_path)
|
||||
|
||||
|
||||
def merge_args(args_opt, cloud_args):
|
||||
"""dictionary"""
|
||||
args_dict = vars(args_opt)
|
||||
if isinstance(cloud_args, dict):
|
||||
for key_arg in cloud_args.keys():
|
||||
val = cloud_args[key_arg]
|
||||
if key_arg in args_dict and val:
|
||||
arg_type = type(args_dict[key_arg])
|
||||
if arg_type is not None:
|
||||
val = arg_type(val)
|
||||
args_dict[key_arg] = val
|
||||
return args_opt
|
||||
@moxing_wrapper(pre_process=modelarts_pre_process)
|
||||
def run_train():
|
||||
'''run train'''
|
||||
config.lr_epochs = list(map(int, config.lr_epochs.split(',')))
|
||||
config.image_size = list(map(int, config.image_size.split(',')))
|
||||
config.per_batch_size = config.batch_size
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_args()
|
||||
|
||||
_enable_graph_kernel = args.device_target == "GPU"
|
||||
_enable_graph_kernel = config.device_target == "GPU"
|
||||
context.set_context(mode=context.GRAPH_MODE,
|
||||
enable_graph_kernel=_enable_graph_kernel, device_target=args.device_target)
|
||||
device_num = int(os.environ.get("DEVICE_NUM", 1))
|
||||
if args.is_distributed:
|
||||
if args.device_target == "Ascend":
|
||||
enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target)
|
||||
config.rank = get_rank_id()
|
||||
config.device_id = get_device_id()
|
||||
config.group_size = get_device_num()
|
||||
|
||||
if config.is_distributed:
|
||||
if config.device_target == "Ascend":
|
||||
init()
|
||||
context.set_context(device_id=args.device_id)
|
||||
elif args.device_target == "GPU":
|
||||
context.set_context(device_id=config.device_id)
|
||||
elif config.device_target == "GPU":
|
||||
init()
|
||||
|
||||
args.rank = get_rank()
|
||||
args.group_size = get_group_size()
|
||||
device_num = args.group_size
|
||||
device_num = config.group_size
|
||||
context.reset_auto_parallel_context()
|
||||
context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
|
||||
gradients_mean=True, all_reduce_fusion_config=[2, 18])
|
||||
else:
|
||||
if args.device_target == "Ascend":
|
||||
context.set_context(device_id=args.device_id)
|
||||
if config.device_target == "Ascend":
|
||||
context.set_context(device_id=config.device_id)
|
||||
|
||||
# select for master rank save ckpt or all rank save, compatible for model parallel
|
||||
args.rank_save_ckpt_flag = 0
|
||||
if args.is_save_on_master:
|
||||
if args.rank == 0:
|
||||
args.rank_save_ckpt_flag = 1
|
||||
config.rank_save_ckpt_flag = 0
|
||||
if config.is_save_on_master:
|
||||
if config.rank == 0:
|
||||
config.rank_save_ckpt_flag = 1
|
||||
else:
|
||||
args.rank_save_ckpt_flag = 1
|
||||
config.rank_save_ckpt_flag = 1
|
||||
|
||||
# logger
|
||||
args.outputs_dir = os.path.join(args.ckpt_path,
|
||||
datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
|
||||
args.logger = get_logger(args.outputs_dir, args.rank)
|
||||
config.outputs_dir = os.path.join(config.ckpt_path,
|
||||
datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
|
||||
config.logger = get_logger(config.outputs_dir, config.rank)
|
||||
|
||||
if args.dataset == "cifar10":
|
||||
dataset = vgg_create_dataset(args.data_path, args.image_size, args.per_batch_size, args.rank, args.group_size)
|
||||
if config.dataset == "cifar10":
|
||||
dataset = vgg_create_dataset(config.data_dir, config.image_size, config.per_batch_size,
|
||||
config.rank, config.group_size)
|
||||
else:
|
||||
dataset = classification_dataset(args.data_path, args.image_size, args.per_batch_size,
|
||||
args.rank, args.group_size)
|
||||
dataset = classification_dataset(config.data_dir, config.image_size, config.per_batch_size,
|
||||
config.rank, config.group_size)
|
||||
|
||||
batch_num = dataset.get_dataset_size()
|
||||
args.steps_per_epoch = dataset.get_dataset_size()
|
||||
args.logger.save_args(args)
|
||||
config.steps_per_epoch = dataset.get_dataset_size()
|
||||
config.logger.save_args(config)
|
||||
|
||||
# network
|
||||
args.logger.important_info('start create network')
|
||||
config.logger.important_info('start create network')
|
||||
|
||||
# get network and init
|
||||
network = vgg16(args.num_classes, args)
|
||||
network = vgg16(config.num_classes, config)
|
||||
|
||||
# pre_trained
|
||||
if args.pre_trained:
|
||||
load_param_into_net(network, load_checkpoint(args.pre_trained))
|
||||
if config.pre_trained:
|
||||
load_param_into_net(network, load_checkpoint(config.pre_trained))
|
||||
|
||||
# lr scheduler
|
||||
if args.lr_scheduler == 'exponential':
|
||||
lr = warmup_step_lr(args.lr,
|
||||
args.lr_epochs,
|
||||
args.steps_per_epoch,
|
||||
args.warmup_epochs,
|
||||
args.max_epoch,
|
||||
gamma=args.lr_gamma,
|
||||
if config.lr_scheduler == 'exponential':
|
||||
lr = warmup_step_lr(config.lr,
|
||||
config.lr_epochs,
|
||||
config.steps_per_epoch,
|
||||
config.warmup_epochs,
|
||||
config.max_epoch,
|
||||
gamma=config.lr_gamma,
|
||||
)
|
||||
elif args.lr_scheduler == 'cosine_annealing':
|
||||
lr = warmup_cosine_annealing_lr(args.lr,
|
||||
args.steps_per_epoch,
|
||||
args.warmup_epochs,
|
||||
args.max_epoch,
|
||||
args.T_max,
|
||||
args.eta_min)
|
||||
elif args.lr_scheduler == 'step':
|
||||
lr = lr_steps(0, lr_init=args.lr_init, lr_max=args.lr_max, warmup_epochs=args.warmup_epochs,
|
||||
total_epochs=args.max_epoch, steps_per_epoch=batch_num)
|
||||
elif config.lr_scheduler == 'cosine_annealing':
|
||||
lr = warmup_cosine_annealing_lr(config.lr,
|
||||
config.steps_per_epoch,
|
||||
config.warmup_epochs,
|
||||
config.max_epoch,
|
||||
config.T_max,
|
||||
config.eta_min)
|
||||
elif config.lr_scheduler == 'step':
|
||||
lr = lr_steps(0, lr_init=config.lr_init, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs,
|
||||
total_epochs=config.max_epoch, steps_per_epoch=batch_num)
|
||||
else:
|
||||
raise NotImplementedError(args.lr_scheduler)
|
||||
raise NotImplementedError(config.lr_scheduler)
|
||||
|
||||
# optimizer
|
||||
opt = Momentum(params=get_param_groups(network),
|
||||
learning_rate=Tensor(lr),
|
||||
momentum=args.momentum,
|
||||
weight_decay=args.weight_decay,
|
||||
loss_scale=args.loss_scale)
|
||||
momentum=config.momentum,
|
||||
weight_decay=config.weight_decay,
|
||||
loss_scale=config.loss_scale)
|
||||
|
||||
if args.dataset == "cifar10":
|
||||
if config.dataset == "cifar10":
|
||||
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
|
||||
model = Model(network, loss_fn=loss, optimizer=opt, metrics={'acc'},
|
||||
amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=None)
|
||||
else:
|
||||
if not args.label_smooth:
|
||||
args.label_smooth_factor = 0.0
|
||||
loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)
|
||||
if not config.label_smooth:
|
||||
config.label_smooth_factor = 0.0
|
||||
loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.num_classes)
|
||||
|
||||
loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
|
||||
loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
|
||||
model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager, amp_level="O2")
|
||||
|
||||
# define callbacks
|
||||
time_cb = TimeMonitor(data_size=batch_num)
|
||||
loss_cb = LossMonitor(per_print_times=batch_num)
|
||||
callbacks = [time_cb, loss_cb]
|
||||
if args.rank_save_ckpt_flag:
|
||||
ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
|
||||
keep_checkpoint_max=args.ckpt_save_max)
|
||||
save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/')
|
||||
if config.rank_save_ckpt_flag:
|
||||
ckpt_config = CheckpointConfig(save_checkpoint_steps=config.ckpt_interval * config.steps_per_epoch,
|
||||
keep_checkpoint_max=config.keep_checkpoint_max)
|
||||
save_ckpt_path = os.path.join(config.outputs_dir, 'ckpt_' + str(config.rank) + '/')
|
||||
ckpt_cb = ModelCheckpoint(config=ckpt_config,
|
||||
directory=save_ckpt_path,
|
||||
prefix='{}'.format(args.rank))
|
||||
prefix='{}'.format(config.rank))
|
||||
callbacks.append(ckpt_cb)
|
||||
|
||||
model.train(args.max_epoch, dataset, callbacks=callbacks)
|
||||
model.train(config.max_epoch, dataset, callbacks=callbacks)
|
||||
|
||||
if __name__ == '__main__':
|
||||
run_train()
|
||||
|
|
|
@ -18,9 +18,10 @@ import pytest
|
|||
|
||||
from mindspore import Tensor
|
||||
from model_zoo.official.cv.vgg16.src.vgg import vgg16
|
||||
from model_zoo.official.cv.vgg16.src.config import cifar_cfg as cfg
|
||||
from model_zoo.official.cv.vgg16.model_utils.config import get_config_static
|
||||
from ..ut_filter import non_graph_engine
|
||||
|
||||
cfg = get_config_static()
|
||||
|
||||
@non_graph_engine
|
||||
def test_vgg16():
|
||||
|
|
Loading…
Reference in New Issue