!18954 modify model_zoo resnet network for cloud

Merge pull request !18954 from lilei/modify_model_zoo_resnet50
This commit is contained in:
i-robot 2021-07-02 08:46:52 +00:00 committed by Gitee
commit bb43be1fb4
38 changed files with 1655 additions and 844 deletions

View File

@ -101,27 +101,26 @@ After installing MindSpore via the official website, you can start training and
```bash
# distributed training
Usage: bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training
Usage: bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH]
[PRETRAINED_CKPT_PATH](optional)
Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# run evaluation example
Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```
- Running on GPU
```bash
# distributed training example
bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training example
bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# infer example
bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
# gpu benchmark example
bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](optional) [DEVICE_NUM](optional) [SAVE_CKPT](optional) [SAVE_PATH](optional)
@ -131,10 +130,41 @@ bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](o
```bash
# standalone training example
python train.py --net=[resnet50|resnet101] --dataset=[cifar10|imagenet2012] --device_target=CPU --dataset_path=[DATASET_PATH] --pre_trained=[CHECKPOINT_PATH](optional)
python train.py --device_target=CPU --data_path=[DATASET_PATH] --config_path [CONFIG_PATH] --pre_trained=[CHECKPOINT_PATH](optional)
# infer example
python eval.py --net=[resnet50|resnet101] --dataset=[cifar10|imagenet2012] --dataset_path=[DATASET_PATH] --checkpoint_path=[CHECKPOINT_PATH] --device_target=CPU
python eval.py --data_path=[DATASET_PATH] --checkpoint_file_path=[CHECKPOINT_PATH] --config_path [CONFIG_PATH] --device_target=CPU
```
If you want to run on ModelArts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/). You can start training and evaluation as follows:
```python
# run distributed training on modelarts example
# (1) Perform a or b.
# a. Set "enable_modelarts=True" on yaml file.
# Set other parameters on yaml file you need.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add other parameters on the website UI interface.
# (2) Set the config directory to "config_path=/The path of config in S3/"
# (3) Set the code directory to "/path/resnet" on the website UI interface.
# (4) Set the startup file to "train.py" on the website UI interface.
# (5) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (6) Create your job.
# run evaluation on modelarts example
# (1) Copy or upload your trained model to S3 bucket.
# (2) Perform a or b.
# a. Set "enable_modelarts=True" on yaml file.
# Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on yaml file.
# Set "checkpoint_url=/The path of checkpoint in S3/" on yaml file.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
# Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
# (3) Set the config directory to "config_path=/The path of config in S3/"
# (4) Set the code directory to "/path/resnet" on the website UI interface.
# (5) Set the startup file to "eval.py" on the website UI interface.
# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (7) Create your job.
```
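For context, the `moxing_adapter` pattern referenced in these steps wraps the training entry point so that data is pulled from OBS/S3 into the local cache before the job runs and outputs are pushed back afterwards. Below is a minimal sketch of that idea, assuming the standard ModelArts `moxing` package; the exact signatures in `src/model_utils/moxing_adapter.py` may differ.

```python
import functools

from src.model_utils.config import config  # yaml-backed config object


def moxing_wrapper(pre_process=None):
    """Hedged sketch: sync OBS data in, run the wrapped entry point, sync outputs out."""
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            if config.enable_modelarts:
                import moxing as mox  # only available inside ModelArts jobs
                if config.data_url:
                    # download the dataset from OBS into the local cache
                    mox.file.copy_parallel(config.data_url, config.data_path)
                if config.checkpoint_url:
                    # download a pretrained/eval checkpoint the same way
                    mox.file.copy_parallel(config.checkpoint_url, config.load_path)
                if pre_process:
                    pre_process()
            run_func(*args, **kwargs)
            if config.enable_modelarts and config.train_url:
                import moxing as mox
                # upload training outputs back to OBS
                mox.file.copy_parallel(config.output_path, config.train_url)
        return wrapped_func
    return wrapper
```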
# [Script Description](#contents)
@ -158,13 +188,26 @@ python eval.py --net=[resnet50|resnet101] --dataset=[cifar10|imagenet2012] --dat
|── run_eval_gpu_resnet_benckmark.sh # launch gpu benchmark eval for resnet50 with imagenet2012
└── cache_util.sh # a collection of helper functions to manage cache
├── src
├── config.py # parameter configuration
├── dataset.py # data preprocessing
├─ eval_callback.py # evaluation callback while training
├── CrossEntropySmooth.py # loss definition for ImageNet2012 dataset
├── lr_generator.py # generate learning rate for each step
├── resnet.py # resnet backbone, including resnet50 and resnet101 and se-resnet50
└── resnet_gpu_benchmark.py # resnet50 for GPU benchmark
├── model_utils
├──config.py # parameter configuration
├──device_adapter.py # device adapter
├──local_adapter.py # local adapter
├──moxing_adapter.py # moxing adapter
├── resnet18_cifar10_config.yaml # parameter configuration
├── resnet18_imagenet2012_config.yaml # parameter configuration
├── resnet34_imagenet2012_config.yaml # parameter configuration
├── resnet50_cifar10_config.yaml # parameter configuration
├── resnet50_imagenet2012_Ascend_config.yaml # parameter configuration
├── resnet50_imagenet2012_config.yaml # parameter configuration
├── resnet50_imagenet2012_GPU_config.yaml # parameter configuration
├── resnet101_imagenet2012_config.yaml # parameter configuration
├── se-resnet50_imagenet2012_config.yaml # parameter configuration
├── export.py # export model for inference
├── mindspore_hub_conf.py # mindspore hub interface
├── eval.py # eval net
@ -174,7 +217,7 @@ python eval.py --net=[resnet50|resnet101] --dataset=[cifar10|imagenet2012] --dat
## [Script Parameters](#contents)
Parameters for both training and evaluation can be set in config.py.
Parameters for both training and evaluation can be set in the config file.
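For reference, the yaml-driven `src/model_utils/config.py` added by this PR boils down to loading the chosen yaml file and exposing its keys as attributes. A minimal sketch, assuming PyYAML; the real module also merges command-line overrides key by key:

```python
import argparse

import yaml  # assumption: PyYAML is available


class Config:
    """Illustrative dot-access wrapper around the parsed yaml dictionary."""

    def __init__(self, cfg_dict):
        for key, value in cfg_dict.items():
            setattr(self, key, value)


def get_config():
    parser = argparse.ArgumentParser(description="resnet config")
    parser.add_argument("--config_path", type=str,
                        default="resnet50_cifar10_config.yaml")
    args, _ = parser.parse_known_args()
    with open(args.config_path) as f:
        # each yaml file carries two documents separated by `---`:
        # the parameter values, then the per-key help text
        values, _help_text = list(yaml.safe_load_all(f))
    return Config(values)


config = get_config()
# e.g. config.batch_size, config.lr_decay_mode, config.checkpoint_file_path
```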
- Config for ResNet18 and ResNet50, CIFAR-10 dataset
@ -189,7 +232,6 @@ Parameters for both training and evaluation can be set in config.py.
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_epochs": 5, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last step
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint
"warmup_epochs": 5, # number of warmup epoch
"lr_decay_mode": "poly" # decay mode can be selected in steps, ploy and default
"lr_init": 0.01, # initial learning rate
@ -210,7 +252,6 @@ Parameters for both training and evaluation can be set in config.py.
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_epochs": 5, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path
"warmup_epochs": 0, # number of warmup epoch
"lr_decay_mode": "Linear", # decay mode for generating learning rate
"use_label_smooth": True, # label smooth
@ -233,7 +274,6 @@ Parameters for both training and evaluation can be set in config.py.
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_epochs": 5, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch
"keep_checkpoint_max": 1, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path
"warmup_epochs": 0, # number of warmup epoch
"optimizer": 'Momentum', # optimizer
"use_label_smooth": True, # label smooth
@ -256,7 +296,6 @@ Parameters for both training and evaluation can be set in config.py.
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_epochs": 5, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path
"warmup_epochs": 0, # number of warmup epoch
"lr_decay_mode": "cosine" # decay mode for generating learning rate
"use_label_smooth": True, # label_smooth
@ -278,7 +317,6 @@ Parameters for both training and evaluation can be set in config.py.
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_epochs": 4, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path
"warmup_epochs": 3, # number of warmup epoch
"lr_decay_mode": "cosine" # decay mode for generating learning rate
"use_label_smooth": True, # label_smooth
@ -296,15 +334,13 @@ Parameters for both training and evaluation can be set in config.py.
```bash
# distributed training
Usage: bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training
Usage: bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH]
[PRETRAINED_CKPT_PATH](optional)
Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# run evaluation example
Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```
For distributed training, a hccl configuration file with JSON format needs to be created in advance.
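The rank table describes the devices participating in the job. The following sketch shows its general shape, generated with Python; every address and id is a placeholder, so a real cluster needs values produced by the hccl tooling:

```python
import json

# Illustrative single-server, 8-device rank table; every address and id here
# is a placeholder, not a value to copy into a real cluster.
rank_table = {
    "version": "1.0",
    "server_count": "1",
    "server_list": [{
        "server_id": "10.0.0.1",
        "device": [{"device_id": str(i),
                    "device_ip": "192.168.100.%d" % (101 + i),
                    "rank_id": str(i)} for i in range(8)],
        "host_nic_ip": "reserve",
    }],
    "status": "completed",
}

with open("rank_table_8pcs.json", "w") as f:
    json.dump(rank_table, f, indent=4)
```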
@ -319,13 +355,14 @@ If you want to change device_id for standalone training, you can set environment
```bash
# distributed training example
bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training example
bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# infer example
bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
[CONFIG_PATH]
# gpu benchmark training example
bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](optional) [DEVICE_NUM](optional) [SAVE_CKPT](optional) [SAVE_PATH](optional)
@ -343,29 +380,29 @@ Please follow the instructions in the link [GPU-Multi-Host](https://www.mindspor
- Parameter server training Ascend example
```bash
bash run_parameter_server_train.sh [resnet18|resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_parameter_server_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
```
- Parameter server training GPU example
```bash
bash run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_parameter_server_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
```
#### Evaluation while training
```bash
# evaluation with distributed training Ascend example:
bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
# evaluation with standalone training Ascend example:
bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_standalone_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
# evaluation with distributed training GPU example:
bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
# evaluation with standalone training GPU example:
bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
```
`RUN_EVAL` and `EVAL_DATASET_PATH` are optional arguments, setting `RUN_EVAL`=True allows you to do evaluation while training. When `RUN_EVAL` is set, `EVAL_DATASET_PATH` must also be set.
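Under the hood this is handled by `src/eval_callback.py` (see the script tree above). A condensed sketch of the pattern, with simplified arguments and an assumed `top_1_accuracy` metric key:

```python
from mindspore.train.callback import Callback


class EvalCallBack(Callback):
    """Condensed sketch: run model.eval every eval_interval epochs once
    eval_start_epoch is reached, remembering the best accuracy seen."""

    def __init__(self, model, eval_dataset, eval_start_epoch=40, eval_interval=1):
        super(EvalCallBack, self).__init__()
        self.model = model
        self.eval_dataset = eval_dataset
        self.eval_start_epoch = eval_start_epoch
        self.eval_interval = eval_interval
        self.best_acc = 0.0

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()
        cur_epoch = cb_params.cur_epoch_num
        if cur_epoch >= self.eval_start_epoch and \
                (cur_epoch - self.eval_start_epoch) % self.eval_interval == 0:
            res = self.model.eval(self.eval_dataset)
            acc = res["top_1_accuracy"]  # assumed metric key
            if acc > self.best_acc:
                self.best_acc = acc  # a save_best_ckpt hook would save here
            print("epoch: {}, top_1_accuracy: {}, best: {}".format(
                cur_epoch, acc, self.best_acc))
```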
@ -480,12 +517,12 @@ epoch: [0/1] step: [100/5004], loss is 6.814013Epoch time: 3437.154 ms, fps: 148
```bash
# evaluation
Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
Usage: bash run_eval.sh [DATASET_PATH] [CONFIG_PATH] [CHECKPOINT_PATH]
```
```bash
# evaluation example
bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt
bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt --config_path /.yaml
```
> Checkpoints can be produced during the training process.
@ -493,7 +530,7 @@ bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/tra
#### Running on GPU
```bash
bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```
### Result
@ -547,13 +584,39 @@ result: {'top_5_accuracy': 0.9342589628681178, 'top_1_accuracy': 0.7680657810499
### [Export MindIR](#contents)
Export MindIR on local
```shell
python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] --config_path [CONFIG_PATH]
```
The ckpt_file parameter is required.
The checkpoint_file_path parameter is required.
`EXPORT_FORMAT` should be in ["AIR", "MINDIR"].
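Stripped of the config plumbing, the export step reduces to the standard MindSpore export call, as in this minimal sketch for a ResNet50/CIFAR-10 setup; the checkpoint path and the input shape are placeholders taken from typical config values:

```python
import numpy as np
from mindspore import Tensor, context, export, load_checkpoint, load_param_into_net

from src.resnet import resnet50 as resnet

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")

net = resnet(class_num=10)                     # class_num from the chosen yaml config
param_dict = load_checkpoint("resnet50.ckpt")  # placeholder checkpoint path
load_param_into_net(net, param_dict)

# NCHW dummy input matching the batch_size/height/width config values
input_arr = Tensor(np.zeros([1, 3, 224, 224], np.float32))
export(net, input_arr, file_name="resnet", file_format="MINDIR")
```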
Export on ModelArts (if you want to run on ModelArts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/)); you can start as follows:
```python
# Export on ModelArts
# (1) Perform a or b.
# a. Set "enable_modelarts=True" on default_config.yaml file.
# Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on default_config.yaml file.
# Set "checkpoint_url='s3://dir_to_trained_ckpt/'" on default_config.yaml file.
# Set "file_name='./resnet'" on default_config.yaml file.
# Set "file_format='AIR'" on default_config.yaml file.
# Set other parameters on default_config.yaml file you need.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
# Add "checkpoint_url='s3://dir_to_trained_ckpt/'" on the website UI interface.
# Add "file_name='./resnet'" on the website UI interface.
# Add "file_format='AIR'" on the website UI interface.
# Add other parameters on the website UI interface.
# (2) Set the config_path="/path/yaml file" on the website UI interface.
# (3) Set the code directory to "/path/resnet" on the website UI interface.
# (4) Set the startup file to "export.py" on the website UI interface.
# (5) Set the "Output file path" and "Job log path" to your path on the website UI interface.
# (6) Create your job.
```
### Infer on Ascend310
Before performing inference, the MindIR file must be exported by the `export.py` script. We only provide an example of inference using the MINDIR model.

View File

@ -104,27 +104,60 @@ The overall network architecture of ResNet is as follows
```text
# distributed training
Usage: bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training
Usage: bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH]
[PRETRAINED_CKPT_PATH](optional)
Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# run evaluation example
Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```
- Running on GPU
```text
# distributed training example
bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training example
bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# infer example
bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```
If you want to train the model on ModelArts, refer to the official ModelArts documentation (https://support.huaweicloud.com/modelarts/). You can start training and inference as follows:
```python
# run distributed training on ModelArts example
# (1) Choose a or b.
#       a. Set "enable_modelarts=True" in the yaml file.
#          Set the other parameters the network needs in the yaml file.
#       b. Add "enable_modelarts=True" on the ModelArts web UI.
#          Set the other parameters the network needs on the ModelArts web UI.
# (2) Set the path of the config file "config_path=/The path of config in S3/" on the ModelArts web UI.
# (3) Set the code directory to "/path/resnet" on the ModelArts web UI.
# (4) Set the startup file to "train.py" on the ModelArts web UI.
# (5) Set the "Dataset path", "Output file path" and "Job log path" on the ModelArts web UI.
# (6) Start training.

# run evaluation on ModelArts example
# (1) Upload the trained model to the corresponding location in the S3 bucket.
# (2) Choose a or b.
#       a. Set "enable_modelarts=True" in the yaml file.
#          Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" in the yaml file.
#          Set "checkpoint_url=/The path of checkpoint in S3/" in the yaml file.
#       b. Add "enable_modelarts=True" on the ModelArts web UI.
#          Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the ModelArts web UI.
#          Add "checkpoint_url=/The path of checkpoint in S3/" on the ModelArts web UI.
# (3) Set the path of the config file "config_path=/The path of config in S3/" on the ModelArts web UI.
# (4) Set the code directory to "/path/resnet" on the ModelArts web UI.
# (5) Set the startup file to "eval.py" on the ModelArts web UI.
# (6) Set the "Dataset path", "Output file path" and "Job log path" on the ModelArts web UI.
# (7) Start inference.
```
# Script Description
@ -146,19 +179,33 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
├── run_standalone_train_gpu.sh        # launch GPU standalone training (single card)
└── cache_util.sh                      # helper functions to manage the single-node cache
├── src
├── config.py                          # parameter configuration
├── dataset.py                         # data preprocessing
├─ eval_callback.py                    # evaluation callback while training
├── CrossEntropySmooth.py              # loss definition for the ImageNet2012 dataset
├── lr_generator.py                    # generate learning rate for each step
└── resnet.py                          # ResNet backbone, including ResNet50, ResNet101 and SE-ResNet50
├── model_utils
├── config.py                          # parameter configuration
├── device_adapter.py                  # device adapter
├── local_adapter.py                   # local adapter
└── moxing_adapter.py                  # ModelArts (moxing) adapter
├── resnet18_cifar10_config.yaml       # parameter configuration
├── resnet18_imagenet2012_config.yaml  # parameter configuration
├── resnet34_imagenet2012_config.yaml  # parameter configuration
├── resnet50_cifar10_config.yaml       # parameter configuration
├── resnet50_imagenet2012_Ascend_config.yaml # parameter configuration
├── resnet50_imagenet2012_config.yaml  # parameter configuration
├── resnet50_imagenet2012_GPU_config.yaml # parameter configuration
├── resnet101_imagenet2012_config.yaml # parameter configuration
├── se-resnet50_imagenet2012_config.yaml # parameter configuration
├── eval.py                            # eval net
└── train.py                           # train net
```
## Script Parameters
Parameters for both training and evaluation can be set in config.py.
Parameters for both training and evaluation can be set in the config file.
- Config for ResNet18 and ResNet50, CIFAR-10 dataset
@ -173,7 +220,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
"save_checkpoint":True, # 是否保存检查点
"save_checkpoint_epochs":5, # 两个检查点之间的周期间隔;默认情况下,最后一个检查点将在最后一步完成后保存
"keep_checkpoint_max":10, # 只保留最后一个keep_checkpoint_max检查点
"save_checkpoint_path":"./", # 检查点保存路径
"warmup_epochs":5, # 热身周期数
"lr_decay_mode":"poly” # 衰减模式可为步骤、策略和默认
"lr_init":0.01, # 初始学习率
@ -194,7 +240,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
"save_checkpoint":True, # 是否保存检查点
"save_checkpoint_epochs":5, # 两个检查点之间的周期间隔;默认情况下,最后一个检查点将在最后一个周期完成后保存
"keep_checkpoint_max":10, # 只保存最后一个keep_checkpoint_max检查点
"save_checkpoint_path":"./", # 检查点相对于执行路径的保存路径
"warmup_epochs":0, # 热身周期数
"lr_decay_mode":"Linear", # 用于生成学习率的衰减模式
"use_label_smooth":True, # 标签平滑
@ -217,7 +262,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
"save_checkpoint":True, # 是否保存检查点
"save_checkpoint_epochs":5, # 两个检查点之间的周期间隔;默认情况下,最后一个检查点将在最后一个周期完成后保存
"keep_checkpoint_max":1, # 只保存最后一个keep_checkpoint_max检查点
"save_checkpoint_path":"./", # 检查点相对于执行路径的保存路径
"warmup_epochs":0, # 热身周期数
"optimizer":"Momentum", # 优化器
"use_label_smooth":True, # 标签平滑
@ -240,7 +284,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
"save_checkpoint":True, # 是否保存检查点
"save_checkpoint_epochs":5, # 两个检查点之间的周期间隔;默认情况下,最后一个检查点将在最后一个周期完成后保存
"keep_checkpoint_max":10, # 只保存最后一个keep_checkpoint_max检查点
"save_checkpoint_path":"./", # 检查点相对于执行路径的保存路径
"warmup_epochs":0, # 热身周期数
"lr_decay_mode":"cosine” # 用于生成学习率的衰减模式
"use_label_smooth":True, # 标签平滑
@ -262,7 +305,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
"save_checkpoint":True, # 是否保存检查点
"save_checkpoint_epochs":4, # 两个检查点之间的周期间隔;默认情况下,最后一个检查点将在最后一个周期完成后保存
"keep_checkpoint_max":10, # 只保存最后一个keep_checkpoint_max检查点
"save_checkpoint_path":"./", # checkpoint相对于执行路径的保存路径
"warmup_epochs":3, # 热身周期数
"lr_decay_mode":"cosine” # 用于生成学习率的衰减模式
"use_label_smooth":True, # 标签平滑
@ -280,14 +322,13 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
```text
# distributed training
Usage: bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training
Usage: bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH]
[PRETRAINED_CKPT_PATH](optional)
Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# run evaluation example
Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```
@ -303,13 +344,13 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
```text
# distributed training example
bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training example
bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
# infer example
bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```
#### Running parameter server mode training
@ -317,29 +358,29 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
- Parameter server training Ascend example
```text
bash run_parameter_server_train.sh [resnet18|resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_parameter_server_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
```
- Parameter server training GPU example
```text
bash run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_parameter_server_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
```
#### Evaluation while training
```bash
# evaluation with distributed training Ascend example:
bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
# evaluation with standalone training Ascend example:
bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_standalone_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
# evaluation with distributed training GPU example:
bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
# evaluation with standalone training GPU example:
bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
```
Evaluation while training requires setting `RUN_EVAL` to True, and `EVAL_DATASET_PATH` must also be set. In addition, when `RUN_EVAL` is True, parameters such as `save_best_ckpt`, `eval_start_epoch` and `eval_interval` can be set for the python script.
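Wiring this together amounts to passing an evaluation callback to `model.train`; a hedged usage sketch, using an `EvalCallBack` like the one sketched earlier in this document, with `net`, `loss`, `opt` and the datasets assumed to be built as in `train.py`:

```python
from mindspore.train.model import Model

# net, loss, opt, train_dataset and eval_dataset assumed built as in train.py
model = Model(net, loss_fn=loss, optimizer=opt,
              metrics={"top_1_accuracy", "top_5_accuracy"})

eval_cb = EvalCallBack(model, eval_dataset,
                       eval_start_epoch=config.eval_start_epoch,
                       eval_interval=config.eval_interval)

# evaluation now runs automatically at the configured epoch interval
model.train(config.epoch_size, train_dataset, callbacks=[eval_cb])
```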
@ -446,12 +487,12 @@ epoch:5 step:5004, loss is 3.3501816
```bash
# evaluation
Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```
```bash
# evaluation example
bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt
bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt --config_path /*.yaml
```
> Checkpoints can be produced during training.
@ -459,7 +500,7 @@ bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/tra
#### Running on GPU
```bash
bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```
### Result
@ -513,13 +554,37 @@ result:{'top_5_accuracy':0.9342589628681178, 'top_1_accuracy':0.768065781049936}
### [Export MindIR](#contents)

Export the MindIR model locally
```shell
python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] --config_path [CONFIG_PATH]
```
The ckpt_file parameter is required.
The checkpoint_file_path parameter is required.
`EXPORT_FORMAT` must be chosen from ["AIR", "MINDIR"].
Export MindIR on ModelArts
```python
# (1) Upload the trained model to the corresponding location in the S3 bucket.
# (2) Choose a or b.
#       a. Set "enable_modelarts=True" in the yaml file.
#          Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" in the yaml file.
#          Set "checkpoint_url=/The path of checkpoint in S3/" in the yaml file.
#          Set "file_name='./resnet'" in the yaml file.
#          Set "file_format='AIR'" in the yaml file.
#       b. Add "enable_modelarts=True" on the ModelArts web UI.
#          Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the ModelArts web UI.
#          Add "checkpoint_url=/The path of checkpoint in S3/" on the ModelArts web UI.
#          Add "file_name='./resnet'" on the ModelArts web UI.
#          Add "file_format='AIR'" on the ModelArts web UI.
# (3) Set the path of the network config file "config_path=/The path of config in S3/".
# (4) Set the code directory to "/path/resnet" on the ModelArts web UI.
# (5) Set the startup file to "export.py" on the ModelArts web UI.
# (6) Set the "Output file path" and "Job log path" on the ModelArts web UI.
# (7) Start exporting the MindIR.
```
### Inference on Ascend 310

Before performing inference, the MindIR file must be exported via the `export.py` script. The following shows an example of performing inference using the MindIR model.

View File

@ -14,51 +14,39 @@
# ============================================================================
"""train resnet."""
import os
import argparse
from mindspore import context
from mindspore.common import set_seed
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.CrossEntropySmooth import CrossEntropySmooth
parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--net', type=str, default=None, help='Resnet Model, either resnet18,resnet34'
'resnet50 or resnet101')
parser.add_argument('--dataset', type=str, default=None, help='Dataset, either cifar10 or imagenet2012')
parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--device_target', type=str, default='Ascend', choices=("Ascend", "GPU", "CPU"),
help="Device target, support Ascend, GPU and CPU.")
args_opt = parser.parse_args()
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
set_seed(1)
if args_opt.net in ("resnet18", "resnet34", "resnet50"):
if args_opt.net == "resnet18":
if config.net_name in ("resnet18", "resnet34", "resnet50"):
if config.net_name == "resnet18":
from src.resnet import resnet18 as resnet
if args_opt.net == "resnet34":
if config.net_name == "resnet34":
from src.resnet import resnet34 as resnet
if args_opt.net == "resnet50":
if config.net_name == "resnet50":
from src.resnet import resnet50 as resnet
if args_opt.dataset == "cifar10":
from src.config import config1 as config
if config.dataset == "cifar10":
from src.dataset import create_dataset1 as create_dataset
else:
from src.config import config2 as config
from src.dataset import create_dataset2 as create_dataset
elif args_opt.net == "resnet101":
elif config.net_name == "resnet101":
from src.resnet import resnet101 as resnet
from src.config import config3 as config
from src.dataset import create_dataset3 as create_dataset
else:
from src.resnet import se_resnet50 as resnet
from src.config import config4 as config
from src.dataset import create_dataset4 as create_dataset
if __name__ == '__main__':
target = args_opt.device_target
@moxing_wrapper()
def eval_net():
"""eval net"""
target = config.device_target
# init context
context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
@ -67,20 +55,19 @@ if __name__ == '__main__':
context.set_context(device_id=device_id)
# create dataset
dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size,
dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size,
target=target)
step_size = dataset.get_dataset_size()
# define net
net = resnet(class_num=config.class_num)
# load checkpoint
param_dict = load_checkpoint(args_opt.checkpoint_path)
param_dict = load_checkpoint(config.checkpoint_file_path)
load_param_into_net(net, param_dict)
net.set_train(False)
# define loss, model
if args_opt.dataset == "imagenet2012":
if config.dataset == "imagenet2012":
if not config.use_label_smooth:
config.label_smooth_factor = 0.0
loss = CrossEntropySmooth(sparse=True, reduction='mean',
@ -93,4 +80,7 @@ if __name__ == '__main__':
# eval model
res = model.eval(dataset)
print("result:", res, "ckpt=", args_opt.checkpoint_path)
print("result:", res, "ckpt=", config.checkpoint_file_path)
if __name__ == '__main__':
eval_net()

View File

@ -16,67 +16,50 @@
##############export checkpoint file into air and onnx models#################
python export.py
"""
import argparse
import os
import numpy as np
from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
parser = argparse.ArgumentParser(description='resnet export')
parser.add_argument('--network_dataset', type=str, default='resnet50_cifar10', choices=['resnet18_cifar10',
'resnet18_imagenet2012',
'resnet34_imagenet2012',
'resnet50_cifar10',
'resnet50_imagenet2012',
'resnet101_imagenet2012',
"se-resnet50_imagenet2012"],
help='network and dataset name.')
parser.add_argument("--device_id", type=int, default=0, help="Device id")
parser.add_argument("--batch_size", type=int, default=1, help="batch size")
parser.add_argument("--ckpt_file", type=str, required=True, help="Checkpoint file path.")
parser.add_argument("--file_name", type=str, default="resnet", help="output file name.")
parser.add_argument('--width', type=int, default=224, help='input width')
parser.add_argument('--height', type=int, default=224, help='input height')
parser.add_argument("--file_format", type=str, choices=["AIR", "ONNX", "MINDIR"], default="AIR", help="file format")
parser.add_argument("--device_target", type=str, default="Ascend",
choices=["Ascend", "GPU", "CPU"], help="device target(default: Ascend)")
args = parser.parse_args()
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
if config.device_target == "Ascend":
context.set_context(device_id=config.device_id)
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
if args.device_target == "Ascend":
context.set_context(device_id=args.device_id)
def modelarts_pre_process():
'''modelarts pre process function.'''
config.file_name = os.path.join(config.output_path, config.file_name)
if __name__ == '__main__':
if args.network_dataset == 'resnet18_cifar10':
from src.config import config1 as config
@moxing_wrapper(pre_process=modelarts_pre_process)
def run_export():
"""run export."""
if config.network_dataset == 'resnet18_cifar10':
from src.resnet import resnet18 as resnet
elif args.network_dataset == 'resnet18_imagenet2012':
from src.config import config2 as config
elif config.network_dataset == 'resnet18_imagenet2012':
from src.resnet import resnet18 as resnet
elif args.network_dataset == 'resnet34_imagenet2012':
from src.config import config2 as config
elif config.network_dataset == 'resnet34_imagenet2012':
from src.resnet import resnet34 as resnet
elif args.network_dataset == 'resnet50_cifar10':
from src.config import config1 as config
elif config.network_dataset == 'resnet50_cifar10':
from src.resnet import resnet50 as resnet
elif args.network_dataset == 'resnet50_imagenet2012':
from src.config import config2 as config
elif config.network_dataset == 'resnet50_imagenet2012':
from src.resnet import resnet50 as resnet
elif args.network_dataset == 'resnet101_imagenet2012':
from src.config import config3 as config
elif config.network_dataset == 'resnet101_imagenet2012':
from src.resnet import resnet101 as resnet
elif args.network_dataset == 'se-resnet50_imagenet2012':
from src.config import config4 as config
elif config.network_dataset == 'se-resnet50_imagenet2012':
from src.resnet import se_resnet50 as resnet
else:
raise ValueError("network and dataset are not supported.")
net = resnet(config.class_num)
assert args.ckpt_file is not None, "checkpoint_path is None."
assert config.checkpoint_file_path is not None, "checkpoint_path is None."
param_dict = load_checkpoint(args.ckpt_file)
param_dict = load_checkpoint(config.checkpoint_file_path)
load_param_into_net(net, param_dict)
input_arr = Tensor(np.zeros([args.batch_size, 3, args.height, args.width], np.float32))
export(net, input_arr, file_name=args.file_name, file_format=args.file_format)
input_arr = Tensor(np.zeros([config.batch_size, 3, config.height, config.width], np.float32))
export(net, input_arr, file_name=config.file_name, file_format=config.file_format)
if __name__ == '__main__':
run_export()

View File

@ -13,8 +13,7 @@
# limitations under the License.
# ============================================================================
"""train resnet."""
import argparse
import ast
import os
import time
import numpy as np
from mindspore import context
@ -34,25 +33,11 @@ import mindspore.dataset.vision.c_transforms as C
from src.resnet_gpu_benchmark import resnet50 as resnet
from src.CrossEntropySmooth import CrossEntropySmooth
from src.momentum import Momentum as MomentumWeightDecay
parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--batch_size', type=str, default="256", help='Batch_size: default 256.')
parser.add_argument('--epoch_size', type=str, default="2", help='Epoch_size: default 2')
parser.add_argument('--print_per_steps', type=str, default="20", help='Print loss and time per steps: default 20')
parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, help='Run distribute')
parser.add_argument('--save_ckpt', type=ast.literal_eval, default=False, help='Save ckpt or not: default False')
parser.add_argument('--eval', type=ast.literal_eval, default=False, help='Eval ckpt : default False')
parser.add_argument('--dataset_path', type=str, default=None, help='Imagenet dataset path')
parser.add_argument('--ckpt_path', type=str, default="./", help='The path to save ckpt if save_ckpt is True;\
Or the ckpt model file when eval is True')
parser.add_argument('--mode', type=str, default="GRAPH", choices=["GRAPH", "PYNATIVE"], help='Execute mode')
parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16", \
help='Compute data type fp32 or fp16: default fp16')
args_opt = parser.parse_args()
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
set_seed(1)
class MyTimeMonitor(Callback):
def __init__(self, batch_size, sink_size, dataset_size, mode):
super(MyTimeMonitor, self).__init__()
@ -95,7 +80,7 @@ class MyTimeMonitor(Callback):
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU", dtype="fp16",
device_num=1):
if args_opt.mode == "GRAPH":
if config.mode_name == "GRAPH":
ds_num_parallel_worker = 4
map_num_parallel_worker = 8
batch_num_parallel_worker = None
@ -116,7 +101,7 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="
# define map operations
normalize_op = C.Normalize(mean=mean, std=std)
if dtype == "fp16":
if args_opt.eval:
if config.eval:
x_dtype = "float32"
else:
x_dtype = "float16"
@ -161,25 +146,26 @@ def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per
return lr_each_step
@moxing_wrapper()
def train():
# set args
dev = "GPU"
epoch_size = int(args_opt.epoch_size)
total_batch = int(args_opt.batch_size)
print_per_steps = int(args_opt.print_per_steps)
compute_type = str(args_opt.dtype).lower()
ckpt_save_dir = str(args_opt.ckpt_path)
save_ckpt = bool(args_opt.save_ckpt)
epoch_size = int(config.epoch_size)
total_batch = int(config.batch_size)
print_per_steps = int(config.print_per_steps)
compute_type = str(config.dtype).lower()
save_ckpt = bool(config.save_ckpt)
device_num = 1
# init context
if args_opt.mode == "GRAPH":
if config.mode_name == "GRAPH":
mode = context.GRAPH_MODE
all_reduce_fusion_config = [85, 160]
else:
mode = context.PYNATIVE_MODE
all_reduce_fusion_config = [30, 90, 160]
context.set_context(mode=mode, device_target=dev, save_graphs=False)
if args_opt.run_distribute:
ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path)
if config.run_distribute:
init()
device_num = get_group_size()
context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
@ -187,7 +173,7 @@ def train():
ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank()) + "/"
# create dataset
dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=1,
dataset = create_dataset(dataset_path=config.data_path, do_train=True, repeat_num=1,
batch_size=total_batch, target=dev, dtype=compute_type, device_num=device_num)
step_size = dataset.get_dataset_size()
if (print_per_steps > step_size or print_per_steps < 1):
@ -251,21 +237,21 @@ def train():
else:
model.train(epoch_size, dataset, callbacks=cb)
@moxing_wrapper()
def eval_():
# set args
dev = "GPU"
compute_type = str(args_opt.dtype).lower()
ckpt_dir = str(args_opt.ckpt_path)
total_batch = int(args_opt.batch_size)
compute_type = str(config.dtype).lower()
ckpt_dir = str(config.checkpoint_file_path)
total_batch = int(config.batch_size)
# init context
if args_opt.mode == "GRAPH":
if config.mode_name == "GRAPH":
mode = context.GRAPH_MODE
else:
mode = context.PYNATIVE_MODE
context.set_context(mode=mode, device_target=dev, save_graphs=False)
# create dataset
dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, repeat_num=1,
dataset = create_dataset(dataset_path=config.data_path, do_train=False, repeat_num=1,
batch_size=total_batch, target=dev, dtype=compute_type)
# define net
net = resnet(class_num=1001, dtype=compute_type)
@ -284,7 +270,7 @@ def eval_():
if __name__ == '__main__':
if not args_opt.eval:
if not config.eval:
train()
else:
eval_()

View File

@ -14,43 +14,27 @@
# ============================================================================
"""train resnet."""
import os
import argparse
import numpy as np
from mindspore import Tensor
from mindspore import context
from mindspore.common import set_seed
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--net', type=str, default=None, help='Resnet Model, either resnet18, '
'resnet50 or resnet101')
parser.add_argument('--dataset', type=str, default=None, help='Dataset, imagenet2012')
parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--device_target', type=str, default='Ascend', choices=("Ascend", "GPU", "CPU"),
help="Device target, support Ascend, GPU and CPU.")
args_opt = parser.parse_args()
set_seed(1)
if args_opt.dataset != "imagenet2012":
if config.dataset != "imagenet2012":
raise ValueError("Currently only support imagenet2012 dataset format")
if args_opt.net in ("resnet18", "resnet50"):
if args_opt.net == "resnet18":
if config.net_name in ("resnet18", "resnet50"):
if config.net_name == "resnet18":
from src.resnet import resnet18 as resnet
if args_opt.net == "resnet50":
if config.net_name == "resnet50":
from src.resnet import resnet50 as resnet
from src.config import config2 as config
from src.dataset_infer import create_dataset
elif args_opt.net == "resnet101":
elif config.net_name == "resnet101":
from src.resnet import resnet101 as resnet
from src.config import config3 as config
from src.dataset_infer import create_dataset2 as create_dataset
else:
from src.resnet import se_resnet50 as resnet
from src.config import config4 as config
from src.dataset_infer import create_dataset3 as create_dataset
@ -67,9 +51,9 @@ def show_predict_info(label_list, prediction_list, filename_list, predict_ng):
"label is {}".format(filename, predict_index, label_index))
return predict_ng, label_index
if __name__ == '__main__':
target = args_opt.device_target
@moxing_wrapper()
def infer_net():
target = config.device_target
# init context
context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
@ -78,7 +62,7 @@ if __name__ == '__main__':
context.set_context(device_id=device_id)
# create dataset
dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size,
dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size,
target=target)
step_size = dataset.get_dataset_size()
@ -86,7 +70,7 @@ if __name__ == '__main__':
net = resnet(class_num=config.class_num)
# load checkpoint
param_dict = load_checkpoint(args_opt.checkpoint_path)
param_dict = load_checkpoint(config.checkpoint_file_path)
load_param_into_net(net, param_dict)
net.set_train(False)
@ -95,7 +79,7 @@ if __name__ == '__main__':
total_sample = step_size * config.batch_size
only_file = 0
data_loader = dataset.create_dict_iterator(output_numpy=True, num_epochs=1)
for i, data in enumerate(data_loader):
for _, data in enumerate(data_loader):
images = data["image"]
label = data["label"]
file_name = data["filename"]
@ -109,3 +93,6 @@ if __name__ == '__main__':
print(f"total {total_sample} data, top1 acc is {(total_sample - len(predict_negative)) * 1.0 / total_sample}")
else:
print("infer completed")
if __name__ == '__main__':
infer_net()

View File

@ -17,7 +17,6 @@ import os
import json
import argparse
import numpy as np
from src.config import config2 as config
batch_size = 1
parser = argparse.ArgumentParser(description="resnet inference")
@ -63,14 +62,14 @@ def cal_acc_imagenet(result_path, label_path):
files = os.listdir(result_path)
with open(label_path, "r") as label:
labels = json.load(label)
result_shape = (1, 1001)
top1 = 0
top5 = 0
total_data = len(files)
for file in files:
img_ids_name = file.split('_0.')[0]
data_path = os.path.join(result_path, img_ids_name + "_0.bin")
result = np.fromfile(data_path, dtype=np.float32).reshape(batch_size, config.class_num)
result = np.fromfile(data_path, dtype=np.float32).reshape(result_shape)
for batch in range(batch_size):
predict = np.argsort(-result[batch], axis=-1)
if labels[img_ids_name+".JPEG"] == predict[0]:

View File

@ -0,0 +1,75 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''
# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 1001
batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 120
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 5
keep_checkpoint_max: 10
warmup_epochs: 0
lr_decay_mode: "cosine"
use_label_smooth: True
label_smooth_factor: 0.1
lr: 0.1
net_name: "resnet101"
dataset: "imagenet2012"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet101"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet101_imagenet2012"
---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."

View File

@ -0,0 +1,75 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''
# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 10
batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 90
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 5
keep_checkpoint_max: 10
warmup_epochs: 5
lr_decay_mode: "poly"
lr_init: 0.01
lr_end: 0.00001
lr_max: 0.1
net_name: "resnet18"
dataset: "cifar10"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet18"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet18_cifar10"
---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."

View File

@ -0,0 +1,77 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''
# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 1001
batch_size: 256
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 90
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 5
keep_checkpoint_max: 10
warmup_epochs: 0
lr_decay_mode: "linear"
use_label_smooth: True
label_smooth_factor: 0.1
lr_init: 0
lr_max: 0.8
lr_end: 0.0
net_name: "resnet18"
dataset: "imagenet2012"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet18"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet18_imagenet2012"
---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."

View File

@ -0,0 +1,77 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''
# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 1001
batch_size: 256
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 90
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 5
keep_checkpoint_max: 10
warmup_epochs: 0
lr_decay_mode: "linear"
use_label_smooth: True
label_smooth_factor: 0.1
lr_init: 0
lr_max: 0.8
lr_end: 0.0
net_name: "resnet34"
dataset: "imagenet2012"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet34"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet34_imagenet2012"
---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."

View File

@ -0,0 +1,75 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''
# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 10
batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 90
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 5
keep_checkpoint_max: 10
warmup_epochs: 5
lr_decay_mode: "poly"
lr_init: 0.01
lr_end: 0.00001
lr_max: 0.1
net_name: "resnet50"
dataset: "cifar10"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet50"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet50_cifar10"
---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."

View File

@ -0,0 +1,78 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''
# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 1001
batch_size: 32
loss_scale: 128
momentum: 0.9
weight_decay: 0.0005
epoch_size: 45
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 2
keep_checkpoint_max: 15
use_label_smooth: True
label_smooth_factor: 0.1
lr_init: 0.05803
lr_decay: 4.04839
lr_end_epoch: 53
damping_init: 0.02714
damping_decay: 0.50036
frequency: 834
net_name: "resnet50"
dataset: "imagenet2012"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet50"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"
---
# Help description for each configuration
enable_modelarts: 'Whether to train on ModelArts, default: False'
data_url: 'Dataset URL on OBS'
checkpoint_url: 'Checkpoint URL on OBS'
data_path: 'Local dataset path'
output_path: 'Local training output path'
load_path: 'Local path where the OBS checkpoint is downloaded'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether to enable profiling during training, default: False'
num_classes: 'Number of dataset classes'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total number of training epochs."
checkpoint_path: "Directory in which checkpoints are saved."
checkpoint_file_path: "Path of the checkpoint file to load."

View File

@ -0,0 +1,78 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# URLs for ModelArts
data_url: ""
train_url: ""
checkpoint_url: ""
# Local paths
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''
# ==============================================================================
# Training options
optimizer: 'Thor'
infer_label: ""
class_num: 1001
batch_size: 32
loss_scale: 128
momentum: 0.9
weight_decay: 0.0005
epoch_size: 40
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 1
keep_checkpoint_max: 15
use_label_smooth: True
label_smooth_factor: 0.1
lr_init: 0.05672
lr_decay: 4.9687
lr_end_epoch: 50
damping_init: 0.02345
damping_decay: 0.5467
frequency: 834
net_name: "resnet50"
dataset: "imagenet2012"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet50"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"
---
# Help description for each configuration
enable_modelarts: 'Whether to train on ModelArts, default: False'
data_url: 'Dataset URL on OBS'
checkpoint_url: 'Checkpoint URL on OBS'
data_path: 'Local dataset path'
output_path: 'Local training output path'
load_path: 'Local path where the OBS checkpoint is downloaded'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether to enable profiling during training, default: False'
num_classes: 'Number of dataset classes'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total number of training epochs."
checkpoint_path: "Directory in which checkpoints are saved."
checkpoint_file_path: "Path of the checkpoint file to load."

View File

@ -0,0 +1,77 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# URLs for ModelArts
data_url: ""
train_url: ""
checkpoint_url: ""
# Local paths
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''
# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 1001
batch_size: 256
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 90
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 5
keep_checkpoint_max: 10
warmup_epochs: 0
lr_decay_mode: "linear"
use_label_smooth: True
label_smooth_factor: 0.1
lr_init: 0
lr_max: 0.8
lr_end: 0.0
net_name: "resnet50"
dataset: "imagenet2012"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet50"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"
---
# Help description for each configuration
enable_modelarts: 'Whether to train on ModelArts, default: False'
data_url: 'Dataset URL on OBS'
checkpoint_url: 'Checkpoint URL on OBS'
data_path: 'Local dataset path'
output_path: 'Local training output path'
load_path: 'Local path where the OBS checkpoint is downloaded'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether to enable profiling during training, default: False'
num_classes: 'Number of dataset classes'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total number of training epochs."
checkpoint_path: "Directory in which checkpoints are saved."
checkpoint_file_path: "Path of the checkpoint file to load."

View File

@ -0,0 +1,52 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# URLs for ModelArts
data_url: ""
train_url: ""
checkpoint_url: ""
# Local paths
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''
# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
batch_size: 256
epoch_size: 2
print_per_steps: 20
eval: False
save_ckpt: False
mode_name: "GRAPH"
dtype: "fp16"
# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet50_cifar10"
---
# Help description for each configuration
enable_modelarts: 'Whether to train on ModelArts, default: False'
data_url: 'Dataset URL on OBS'
checkpoint_url: 'Checkpoint URL on OBS'
data_path: 'Local dataset path'
output_path: 'Local training output path'
load_path: 'Local path where the OBS checkpoint is downloaded'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether to enable profiling during training, default: False'
num_classes: 'Number of dataset classes'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total number of training epochs."
checkpoint_path: "Directory in which checkpoints are saved."
checkpoint_file_path: "Path of the checkpoint file to load."

View File

@ -17,37 +17,13 @@
CURPATH="$(dirname "$0")"
. ${CURPATH}/cache_util.sh
if [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ]
if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
then
echo "Usage: bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo " bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo " bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
exit 1
fi
if [ $1 != "resnet18" ] && [ $1 != "resnet34" ] && [ $1 != "resnet50" ] && [ $1 != "resnet101" ] && [ $1 != "se-resnet50" ]
then
echo "error: the selected net is neither resnet50 nor resnet101 and se-resnet50"
exit 1
fi
if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi
if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: training resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi
if [ $1 == "se-resnet50" ] && [ $2 == "cifar10" ]
then
echo "error: evaluating se-resnet50 with cifar10 dataset is unsupported now!"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
@ -56,18 +32,19 @@ get_real_path(){
fi
}
PATH1=$(get_real_path $3)
PATH2=$(get_real_path $4)
PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
CONFIG_FILE=$3
if [ $# == 5 ]
if [ $# == 4 ]
then
PATH3=$(get_real_path $5)
PATH3=$(get_real_path $4)
fi
if [ $# == 6 ]
if [ $# == 5 ]
then
RUN_EVAL=$5
EVAL_DATASET_PATH=$(get_real_path $6)
RUN_EVAL=$4
EVAL_DATASET_PATH=$(get_real_path $5)
fi
if [ ! -f $PATH1 ]
@ -82,7 +59,7 @@ then
exit 1
fi
if [ $# == 5 ] && [ ! -f $PATH3 ]
if [ $# == 4 ] && [ ! -f $PATH3 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file"
exit 1
@ -123,24 +100,28 @@ do
mkdir ./train_parallel$i
cp ../*.py ./train_parallel$i
cp *.sh ./train_parallel$i
cp -r ../*.yaml ./train_parallel$i
cp -r ../src ./train_parallel$i
cd ./train_parallel$i || exit
echo "start training for rank $RANK_ID, device $DEVICE_ID"
env > env.log
if [ $# == 4 ]
if [ $# == 3 ]
then
taskset -c $cmdopt python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
taskset -c $cmdopt python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 \
--config_path=$CONFIG_FILE --output_path './output' &> log &
fi
if [ $# == 5 ]
if [ $# == 4 ]
then
taskset -c $cmdopt python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 &> log &
taskset -c $cmdopt python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 --pre_trained=$PATH3 \
--config_path=$CONFIG_FILE --output_path './output' &> log &
fi
if [ $# == 6 ]
if [ $# == 5 ]
then
taskset -c $cmdopt python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 \
--run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID &> log &
taskset -c $cmdopt python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 \
--run_eval=$RUN_EVAL --eval_data_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID \
--config_path=$CONFIG_FILE --output_path './output' &> log &
if [ "x${RUN_EVAL}" == "xTrue" ]
then
echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\""

View File

@ -17,32 +17,13 @@
CURPATH="$(dirname "$0")"
. ${CURPATH}/cache_util.sh
if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ]
then
echo "Usage: bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo " bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
echo "Usage: bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo " bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
exit 1
fi
if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
then
echo "error: the selected net is neither resnet50 nor resnet101"
exit 1
fi
if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi
if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: training resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
@ -51,17 +32,18 @@ get_real_path(){
fi
}
PATH1=$(get_real_path $3)
PATH1=$(get_real_path $1)
CONFIG_FILE=$2
if [ $# == 4 ]
if [ $# == 3 ]
then
PATH2=$(get_real_path $4)
PATH2=$(get_real_path $3)
fi
if [ $# == 5 ]
if [ $# == 4 ]
then
RUN_EVAL=$4
EVAL_DATASET_PATH=$(get_real_path $5)
RUN_EVAL=$3
EVAL_DATASET_PATH=$(get_real_path $4)
fi
@ -71,7 +53,7 @@ then
exit 1
fi
if [ $# == 5 ] && [ ! -f $PATH2 ]
if [ $# == 4 ] && [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
exit 1
@ -97,29 +79,30 @@ rm -rf ./train_parallel
mkdir ./train_parallel
cp ../*.py ./train_parallel
cp *.sh ./train_parallel
cp -r ../*.yaml ./train_parallel
cp -r ../src ./train_parallel
cd ./train_parallel || exit
if [ $# == 2 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --config_path=$CONFIG_FILE --run_distribute=True --device_num=$DEVICE_NUM \
--device_target="GPU" --data_path=$PATH1 --output_path './output' &> log &
fi
if [ $# == 3 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log &
fi
if [ $# == 4 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
python train.py --config_path=$CONFIG_FILE --run_distribute=True --device_num=$DEVICE_NUM \
--device_target="GPU" --data_path=$PATH1 --pre_trained=$PATH2 --output_path './output' &> log &
fi
if [ $# == 5 ]
if [ $# == 4 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --run_eval=$RUN_EVAL \
--eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID &> log &
python train.py --config_path=$CONFIG_FILE --run_distribute=True --device_num=$DEVICE_NUM \
--device_target="GPU" --data_path=$PATH1 --run_eval=$RUN_EVAL --eval_data_path=$EVAL_DATASET_PATH \
--enable_cache=True --cache_session_id=$CACHE_SESSION_ID --output_path './output' &> log &
if [ "x${RUN_EVAL}" == "xTrue" ]
then
echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\""

View File

@ -14,33 +14,9 @@
# limitations under the License.
# ============================================================================
if [ $# != 4 ]
if [ $# != 3 ]
then
echo "Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]"
exit 1
fi
if [ $1 != "resnet18" ] && [ $1 != "resnet34" ] && [ $1 != "resnet50" ] && [ $1 != "resnet101" ] && [ $1 != "se-resnet50" ]
then
echo "error: the selected net is neither resnet50 nor resnet101 nor se-resnet50"
exit 1
fi
if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi
if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: evaluating resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi
if [ $1 == "se-resnet50" ] && [ $2 == "cifar10" ]
then
echo "error: evaluating se-resnet50 with cifar10 dataset is unsupported now!"
echo "Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]"
exit 1
fi
@ -52,8 +28,9 @@ get_real_path(){
fi
}
PATH1=$(get_real_path $3)
PATH2=$(get_real_path $4)
PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
CONFIG_FILE=$3
if [ ! -d $PATH1 ]
@ -81,9 +58,10 @@ fi
mkdir ./eval
cp ../*.py ./eval
cp *.sh ./eval
cp -r ../*.yaml ./eval
cp -r ../src ./eval
cd ./eval || exit
env > env.log
echo "start evaluation for device $DEVICE_ID"
python eval.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --checkpoint_path=$PATH2 &> log &
python eval.py --data_path=$PATH1 --checkpoint_file_path=$PATH2 --config_path=$CONFIG_FILE &> log &
cd ..

View File

@ -14,31 +14,12 @@
# limitations under the License.
# ============================================================================
if [ $# != 4 ]
if [ $# != 3 ]
then
echo "Usage: bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]"
echo "Usage: bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]"
exit 1
fi
if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
then
echo "error: the selected net is neither resnet50 nor resnet101"
exit 1
fi
if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi
if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: evaluating resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
@ -47,8 +28,9 @@ get_real_path(){
fi
}
PATH1=$(get_real_path $3)
PATH2=$(get_real_path $4)
PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
CONFIG_FILE=$3
if [ ! -d $PATH1 ]
@ -76,9 +58,11 @@ fi
mkdir ./eval
cp ../*.py ./eval
cp *.sh ./eval
cp -r ../*.yaml ./eval
cp -r ../src ./eval
cd ./eval || exit
env > env.log
echo "start evaluation for device $DEVICE_ID"
python eval.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log &
python eval.py --data_path=$PATH1 --checkpoint_file_path=$PATH2 --device_target="GPU" \
--config_path=$CONFIG_FILE &> log &
cd ..

View File

@ -14,11 +14,11 @@
# limitations under the License.
# ============================================================================
if [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ]
then
echo "Usage: bash run_eval_gpu_resnet_benchmark.sh [DATASET_PATH] [CKPT_PATH] [BATCH_SIZE](optional) \
echo "Usage: bash run_eval_gpu_resnet_benchmark.sh [DATASET_PATH] [CKPT_PATH] [CONFIG_PATH] [BATCH_SIZE](optional) \
[DTYPE](optional)"
echo "Example: sh run_eval_gpu_resnet_benchmark.sh /path/imagenet/train /path/ckpt 256 FP16"
echo "Example: sh run_eval_gpu_resnet_benchmark.sh /path/imagenet/train /path/ckpt /*.yaml 256 FP16"
exit 1
fi
@ -33,19 +33,19 @@ get_real_path(){
DATAPATH=$(get_real_path $1)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
if [ $# == 2 ]
then
python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH --eval=True --ckpt_path=$2
fi
if [ $# == 3 ]
then
python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH --eval=True --ckpt_path=$2 \
--batch_size=$3
python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --eval=True --checkpoint_file_path=$2 --config_path=$3
fi
if [ $# == 4 ]
then
python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH --eval=True --ckpt_path=$2 \
--batch_size=$3 --dtype=$4
python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --eval=True --checkpoint_file_path=$2 \
--config_path=$3 --batch_size=$4
fi
if [ $# == 5 ]
then
python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --eval=True --checkpoint_file_path=$2 \
--config_path=$3 --batch_size=$4 --dtype=$5
fi

View File

@ -14,11 +14,11 @@
# limitations under the License.
# ============================================================================
if [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ]
then
echo "Usage: bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](optional)\
echo "Usage: bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [CONFIG_PATH] [BATCH_SIZE](optional) [DTYPE](optional)\
[DEVICE_NUM](optional) [SAVE_CKPT](optional) [SAVE_PATH](optional)"
echo "Example: sh run_gpu_resnet_benchmark.sh /path/imagenet/train 256 FP16 8 true /path/ckpt"
echo "Example: sh run_gpu_resnet_benchmark.sh /path/imagenet/train /*yaml 256 FP16 8 true /path/ckpt"
exit 1
fi
@ -33,35 +33,35 @@ get_real_path(){
DATAPATH=$(get_real_path $1)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
if [ $# == 1 ]
then
python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH
fi
if [ $# == 2 ]
then
python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH --batch_size=$2
python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --config_path=$2
fi
if [ $# == 3 ]
then
python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH --batch_size=$2 --dtype=$3
python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --config_path=$2 --batch_size=$3
fi
if [ $# == 4 ]
then
mpirun --allow-run-as-root -n $4 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \
--dataset_path=$DATAPATH --batch_size=$2 --dtype=$3
python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --config_path=$2 --batch_size=$3 --dtype=$4
fi
if [ $# == 5 ]
then
mpirun --allow-run-as-root -n $4 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \
--dataset_path=$DATAPATH --batch_size=$2 --dtype=$3 --save_ckpt=$5
mpirun --allow-run-as-root -n $5 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \
--data_path=$DATAPATH --config_path=$2 --batch_size=$3 --dtype=$4
fi
if [ $# == 6 ]
then
mpirun --allow-run-as-root -n $4 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \
--dataset_path=$DATAPATH --batch_size=$2 --dtype=$3 --save_ckpt=$5 --ckpt_path=$6
mpirun --allow-run-as-root -n $5 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \
--data_path=$DATAPATH --config_path=$2 --batch_size=$3 --dtype=$4 --save_ckpt=$6
fi
if [ $# == 7 ]
then
mpirun --allow-run-as-root -n $5 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \
--data_path=$DATAPATH --config_path=$2 --batch_size=$3 --dtype=$4 --save_ckpt=$6 --checkpoint_file_path=$7
fi

View File

@ -14,25 +14,12 @@
# limitations under the License.
# ============================================================================
if [ $# != 4 ]
if [ $# != 3 ]
then
echo "Usage: bash run_eval.sh [resnet18|resnet50|resnet101|se-resnet50] [imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]"
echo "Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]"
exit 1
fi
if [ $1 != "resnet18" ] && [ $1 != "resnet50" ] && [ $1 != "resnet101" ] && [ $1 != "se-resnet50" ]
then
echo "error: the selected net is neither resnet50 nor resnet101 nor se-resnet50"
exit 1
fi
if [ $2 != "imagenet2012" ]
then
echo "error: only support imagenet2012"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
@ -41,8 +28,9 @@ get_real_path(){
fi
}
PATH1=$(get_real_path $3)
PATH2=$(get_real_path $4)
PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
CONFIG_FILE=$3
if [ ! -d $PATH1 ]
@ -68,11 +56,12 @@ then
rm -rf ./infer
fi
mkdir ./infer
cp ../*.yaml ./infer
cp ../*.py ./infer
cp *.sh ./infer
cp -r ../src ./infer
cd ./infer || exit
env > env.log
echo "start evaluation for device $DEVICE_ID"
python infer.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --checkpoint_path=$PATH2 &> log &
python infer.py --data_path=$PATH1 --checkpoint_file_path=$PATH2 --config_path=$CONFIG_FILE &> log &
cd ..

View File

@ -14,31 +14,12 @@
# limitations under the License.
# ============================================================================
if [ $# != 4 ] && [ $# != 5 ]
if [ $# != 3 ] && [ $# != 4 ]
then
echo "Usage: bash run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
fi
if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
then
echo "error: the selected net is neither resnet50 nor resnet101"
exit 1
fi
if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi
if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: training resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
@ -47,12 +28,13 @@ get_real_path(){
fi
}
PATH1=$(get_real_path $3)
PATH2=$(get_real_path $4)
PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
CONFIG_FILE=$3
if [ $# == 5 ]
if [ $# == 4 ]
then
PATH3=$(get_real_path $5)
PATH3=$(get_real_path $4)
fi
if [ ! -f $PATH1 ]
@ -67,7 +49,7 @@ then
exit 1
fi
if [ $# == 5 ] && [ ! -f $PATH3 ]
if [ $# == 4 ] && [ ! -f $PATH3 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file"
exit 1
@ -89,19 +71,22 @@ export DEVICE_ID=0
export RANK_ID=0
rm -rf ./sched
mkdir ./sched
cp ../*.yaml ./sched
cp ../*.py ./sched
cp *.sh ./sched
cp -r ../src ./sched
cd ./sched || exit
echo "start scheduler"
if [ $# == 4 ]
if [ $# == 3 ]
then
python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True &> sched.log &
python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True \
--config_path=$CONFIG_FILE --output_path './output' &> sched.log &
fi
if [ $# == 5 ]
if [ $# == 4 ]
then
python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> sched.log &
python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 \
--config_path=$CONFIG_FILE --output_path './output' &> sched.log &
fi
cd ..
@ -112,19 +97,22 @@ do
export RANK_ID=$i
rm -rf ./server_$i
mkdir ./server_$i
cp ../*.yaml ./server_$i
cp ../*.py ./server_$i
cp *.sh ./server_$i
cp -r ../src ./server_$i
cd ./server_$i || exit
echo "start server"
if [ $# == 4 ]
if [ $# == 3 ]
then
python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True &> server_$i.log &
python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True \
--config_path=$CONFIG_FILE --output_path './output' &> server_$i.log &
fi
if [ $# == 5 ]
if [ $# == 4 ]
then
python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> server_$i.log &
python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 \
--config_path=$CONFIG_FILE --output_path './output' &> server_$i.log &
fi
cd ..
@ -137,20 +125,23 @@ do
export RANK_ID=$i
rm -rf ./worker_$i
mkdir ./worker_$i
cp ../*.yaml ./worker_$i
cp ../*.py ./worker_$i
cp *.sh ./worker_$i
cp -r ../src ./worker_$i
cd ./worker_$i || exit
echo "start training for worker rank $RANK_ID, device $DEVICE_ID"
env > env.log
if [ $# == 4 ]
if [ $# == 3 ]
then
python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --parameter_server=True &> worker_$i.log &
python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 --parameter_server=True \
--config_path=$CONFIG_FILE --output_path './output' &> worker_$i.log &
fi
if [ $# == 5 ]
if [ $# == 4 ]
then
python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> worker_$i.log &
python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 \
--config_path=$CONFIG_FILE --output_path './output' &> worker_$i.log &
fi
cd ..

View File

@ -14,31 +14,12 @@
# limitations under the License.
# ============================================================================
if [ $# != 3 ] && [ $# != 4 ]
if [ $# != 2 ] && [ $# != 3 ]
then
echo "Usage: bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo "Usage: bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
fi
if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
then
echo "error: the selected net is neither resnet50 nor resnet101"
exit 1
fi
if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi
if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: training resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
@ -47,11 +28,11 @@ get_real_path(){
fi
}
PATH1=$(get_real_path $3)
if [ $# == 4 ]
PATH1=$(get_real_path $1)
CONFIG_FILE=$2
if [ $# == 3 ]
then
PATH2=$(get_real_path $4)
PATH2=$(get_real_path $3)
fi
@ -61,7 +42,7 @@ then
exit 1
fi
if [ $# == 5 ] && [ ! -f $PATH2 ]
if [ $# == 4 ] && [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
exit 1
@ -79,22 +60,23 @@ export MS_SCHED_PORT=8081
export MS_ROLE=MS_SCHED
rm -rf ./sched
mkdir ./sched
cp ../*.yaml ./sched
cp ../*.py ./sched
cp *.sh ./sched
cp -r ../src ./sched
cd ./sched || exit
if [ $# == 2 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \
--data_path=$PATH1 --parameter_server=True --config_path=$CONFIG_FILE --output_path './output' &> sched.log &
fi
if [ $# == 3 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> sched.log &
fi
if [ $# == 4 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> sched.log &
python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \
--data_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 --config_path=$CONFIG_FILE --output_path './output' &> sched.log &
fi
cd ..
@ -103,22 +85,24 @@ for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ./server_$i
mkdir ./server_$i
cp ../*.yaml ./server_$i
cp ../*.py ./server_$i
cp *.sh ./server_$i
cp -r ../src ./server_$i
cd ./server_$i || exit
if [ $# == 2 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \
--data_path=$PATH1 --parameter_server=True --config_path=$CONFIG_FILE --output_path './output' &> server_$i.log &
fi
if [ $# == 3 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> server_$i.log &
fi
if [ $# == 4 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> server_$i.log &
python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \
--data_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 \
--config_path=$CONFIG_FILE --output_path './output' &> server_$i.log &
fi
cd ..
done
@ -126,21 +110,23 @@ done
export MS_ROLE=MS_WORKER
rm -rf ./worker
mkdir ./worker
cp ../*.yaml ./worker
cp ../*.py ./worker
cp *.sh ./worker
cp -r ../src ./worker
cd ./worker || exit
if [ $# == 2 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \
--data_path=$PATH1 --parameter_server=True --config_path=$CONFIG_FILE --output_path './output' &> worker.log &
fi
if [ $# == 3 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> worker.log &
fi
if [ $# == 4 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> worker.log &
python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU"\
--data_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 \
--config_path=$CONFIG_FILE --output_path './output' &> worker.log &
fi
cd ..

View File

@ -17,34 +17,10 @@
CURPATH="$(dirname "$0")"
. ${CURPATH}/cache_util.sh
if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ]
then
echo "Usage: bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo " bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
exit 1
fi
if [ $1 != "resnet18" ] && [ $1 != "resnet34" ] && [ $1 != "resnet50" ] && [ $1 != "resnet101" ] && [ $1 != "se-resnet50" ]
then
echo "error: the selected net is neither resnet50 nor resnet101 and se-resnet50"
exit 1
fi
if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi
if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: training resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi
if [ $1 == "se-resnet50" ] && [ $2 == "cifar10" ]
then
echo "error: evaluating se-resnet50 with cifar10 dataset is unsupported now!"
echo "Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo "bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
exit 1
fi
@ -56,17 +32,17 @@ get_real_path(){
fi
}
PATH1=$(get_real_path $3)
PATH1=$(get_real_path $1)
CONFIG_FILE=$2
if [ $# == 3 ]
then
PATH2=$(get_real_path $3)
fi
if [ $# == 4 ]
then
PATH2=$(get_real_path $4)
fi
if [ $# == 5 ]
then
RUN_EVAL=$4
EVAL_DATASET_PATH=$(get_real_path $5)
RUN_EVAL=$3
EVAL_DATASET_PATH=$(get_real_path $4)
fi
if [ ! -d $PATH1 ]
@ -75,7 +51,7 @@ then
exit 1
fi
if [ $# == 4 ] && [ ! -f $PATH2 ]
if [ $# == 3 ] && [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
exit 1
@ -103,26 +79,28 @@ then
rm -rf ./train
fi
mkdir ./train
cp ../*.yaml ./train
cp ../*.py ./train
cp *.sh ./train
cp -r ../src ./train
cd ./train || exit
echo "start training for device $DEVICE_ID"
env > env.log
if [ $# == 2 ]
then
python train.py --data_path=$PATH1 --config_path=$CONFIG_FILE --output_path './output' &> log &
fi
if [ $# == 3 ]
then
python train.py --net=$1 --dataset=$2 --dataset_path=$PATH1 &> log &
python train.py --data_path=$PATH1 --pre_trained=$PATH2 --config_path=$CONFIG_FILE --output_path './output' &> log &
fi
if [ $# == 4 ]
then
python train.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
fi
if [ $# == 5 ]
then
python train.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --run_eval=$RUN_EVAL \
--eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID &> log &
python train.py --data_path=$PATH1 --run_eval=$RUN_EVAL --eval_data_path=$EVAL_DATASET_PATH \
--enable_cache=True --cache_session_id=$CACHE_SESSION_ID \
--config_path=$CONFIG_FILE --output_path './output' &> log &
if [ "x${RUN_EVAL}" == "xTrue" ]
then
echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\""

View File

@ -17,32 +17,13 @@
CURPATH="$(dirname "$0")"
. ${CURPATH}/cache_util.sh
if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ]
then
echo "Usage: bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo " bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
echo "Usage: bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo " bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
exit 1
fi
if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
then
echo "error: the selected net is neither resnet50 nor resnet101"
exit 1
fi
if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi
if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: training resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
@ -51,17 +32,18 @@ get_real_path(){
fi
}
PATH1=$(get_real_path $3)
PATH1=$(get_real_path $1)
CONFIG_FILE=$2
if [ $# == 3 ]
then
PATH2=$(get_real_path $3)
fi
if [ $# == 4 ]
then
PATH2=$(get_real_path $4)
fi
if [ $# == 5 ]
then
RUN_EVAL=$4
EVAL_DATASET_PATH=$(get_real_path $5)
RUN_EVAL=$3
EVAL_DATASET_PATH=$(get_real_path $4)
fi
if [ ! -d $PATH1 ]
@ -70,7 +52,7 @@ then
exit 1
fi
if [ $# == 4 ] && [ ! -f $PATH2 ]
if [ $# == 3 ] && [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
exit 1
@ -100,26 +82,30 @@ then
rm -rf ./train
fi
mkdir ./train
cp ../*.yaml ./train
cp ../*.py ./train
cp *.sh ./train
cp -r ../src ./train
cd ./train || exit
echo "start training for device $DEVICE_ID"
env > env.log
if [ $# == 2 ]
then
python train.py --device_target="GPU" --data_path=$PATH1 \
--config_path=$CONFIG_FILE --output_path './output' &> log &
fi
if [ $# == 3 ]
then
python train.py --net=$1 --dataset=$2 --device_target="GPU" --dataset_path=$PATH1 &> log &
python train.py --device_target="GPU" --data_path=$PATH1 --pre_trained=$PATH2 \
--config_path=$CONFIG_FILE --output_path './output' &> log &
fi
if [ $# == 4 ]
then
python train.py --net=$1 --dataset=$2 --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
fi
if [ $# == 5 ]
then
python train.py --net=$1 --dataset=$2 --device_target="GPU" --dataset_path=$PATH1 --run_eval=$RUN_EVAL \
--eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID &> log &
python train.py --device_target="GPU" --data_path=$PATH1 --run_eval=$RUN_EVAL \
--eval_data_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID \
--config_path=$CONFIG_FILE --output_path './output' &> log &
if [ "x${RUN_EVAL}" == "xTrue" ]
then
echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\""

View File

@ -0,0 +1,78 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# URLs for ModelArts
data_url: ""
train_url: ""
checkpoint_url: ""
# Local paths
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''
# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 1001
batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 28
train_epoch_size: 24
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 4
keep_checkpoint_max: 10
warmup_epochs: 3
lr_decay_mode: "cosine"
use_label_smooth: True
label_smooth_factor: 0.1
lr_init: 0
lr_end: 0.0001
lr_max: 0.3
net_name: "se-resnet50"
dataset: "imagenet2012"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
# Export options
device_id: 0
width: 224
height: 224
file_name: "se-resnet50"
file_format: "AIR"
ckpt_file: ""
network_dataset: "se-resnet50_imagenet2012"
---
# Help description for each configuration
enable_modelarts: 'Whether to train on ModelArts, default: False'
data_url: 'Dataset URL on OBS'
checkpoint_url: 'Checkpoint URL on OBS'
data_path: 'Local dataset path'
output_path: 'Local training output path'
load_path: 'Local path where the OBS checkpoint is downloaded'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether to enable profiling during training, default: False'
num_classes: 'Number of dataset classes'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total number of training epochs."
checkpoint_path: "Directory in which checkpoints are saved."
checkpoint_file_path: "Path of the checkpoint file to load."
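Each of these yaml files also carries an "Export options" block (`device_id`, `width`, `height`, `file_name`, `file_format`, `ckpt_file`). A hedged sketch of how an export script typically consumes those keys; the actual export.py is not among the hunks shown here, so everything below is illustrative and follows only the yaml key names:

```python
# Hedged sketch: consuming the "Export options" block of the se-resnet50 yaml.
# Not the committed export.py; it only illustrates how the keys fit together.
import numpy as np
from mindspore import Tensor, export, load_checkpoint, load_param_into_net
from src.model_utils.config import config
from src.resnet import se_resnet50

net = se_resnet50(class_num=config.class_num)
load_param_into_net(net, load_checkpoint(config.ckpt_file))
# The dummy input shape follows the configured export height/width.
inputs = Tensor(np.zeros([config.batch_size, 3, config.height, config.width], np.float32))
export(net, inputs, file_name=config.file_name, file_format=config.file_format)
```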

View File

@ -1,155 +0,0 @@
# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
network config setting, will be used in train.py and eval.py
"""
from easydict import EasyDict as ed
# config optimizer for resnet50, imagenet2012. Momentum is default, Thor is optional.
# infer_label is a directory and label mapping table. such as 'infer_label': {"directory0": 0, "directory1": 1, ...}
cfg = ed({
'optimizer': 'Momentum',
'infer_label': {}
})
# config for resnet50, cifar10
config1 = ed({
"class_num": 10,
"batch_size": 32,
"loss_scale": 1024,
"momentum": 0.9,
"weight_decay": 1e-4,
"epoch_size": 90,
"pretrain_epoch_size": 0,
"save_checkpoint": True,
"save_checkpoint_epochs": 5,
"keep_checkpoint_max": 10,
"save_checkpoint_path": "./",
"warmup_epochs": 5,
"lr_decay_mode": "poly",
"lr_init": 0.01,
"lr_end": 0.00001,
"lr_max": 0.1
})
# config for resnet50, imagenet2012
config2 = ed({
"class_num": 1001,
"batch_size": 256,
"loss_scale": 1024,
"momentum": 0.9,
"weight_decay": 1e-4,
"epoch_size": 90,
"pretrain_epoch_size": 0,
"save_checkpoint": True,
"save_checkpoint_epochs": 5,
"keep_checkpoint_max": 10,
"save_checkpoint_path": "./",
"warmup_epochs": 0,
"lr_decay_mode": "linear",
"use_label_smooth": True,
"label_smooth_factor": 0.1,
"lr_init": 0,
"lr_max": 0.8,
"lr_end": 0.0
})
# config for resnet101, imagenet2012
config3 = ed({
"class_num": 1001,
"batch_size": 32,
"loss_scale": 1024,
"momentum": 0.9,
"weight_decay": 1e-4,
"epoch_size": 120,
"pretrain_epoch_size": 0,
"save_checkpoint": True,
"save_checkpoint_epochs": 5,
"keep_checkpoint_max": 10,
"save_checkpoint_path": "./",
"warmup_epochs": 0,
"lr_decay_mode": "cosine",
"use_label_smooth": True,
"label_smooth_factor": 0.1,
"lr": 0.1
})
# config for se-resnet50, imagenet2012
config4 = ed({
"class_num": 1001,
"batch_size": 32,
"loss_scale": 1024,
"momentum": 0.9,
"weight_decay": 1e-4,
"epoch_size": 28,
"train_epoch_size": 24,
"pretrain_epoch_size": 0,
"save_checkpoint": True,
"save_checkpoint_epochs": 4,
"keep_checkpoint_max": 10,
"save_checkpoint_path": "./",
"warmup_epochs": 3,
"lr_decay_mode": "cosine",
"use_label_smooth": True,
"label_smooth_factor": 0.1,
"lr_init": 0.0,
"lr_max": 0.3,
"lr_end": 0.0001
})
# config for resnet50, imagenet2012, Ascend 910
config_thor_Ascend = ed({
"class_num": 1001,
"batch_size": 32,
"loss_scale": 128,
"momentum": 0.9,
"weight_decay": 5e-4,
"epoch_size": 45,
"pretrain_epoch_size": 0,
"save_checkpoint": True,
"save_checkpoint_epochs": 2,
"keep_checkpoint_max": 15,
"save_checkpoint_path": "./",
"use_label_smooth": True,
"label_smooth_factor": 0.1,
"lr_init": 0.05803,
"lr_decay": 4.04839,
"lr_end_epoch": 53,
"damping_init": 0.02714,
"damping_decay": 0.50036,
"frequency": 834,
})
# config for resnet50, imagenet2012, GPU
config_thor_gpu = ed({
"class_num": 1001,
"batch_size": 32,
"loss_scale": 128,
"momentum": 0.9,
"weight_decay": 5e-4,
"epoch_size": 40,
"pretrain_epoch_size": 0,
"save_checkpoint": True,
"save_checkpoint_epochs": 1,
"keep_checkpoint_max": 15,
"save_checkpoint_path": "./",
"use_label_smooth": True,
"label_smooth_factor": 0.1,
"lr_init": 0.05672,
"lr_decay": 4.9687,
"lr_end_epoch": 50,
"damping_init": 0.02345,
"damping_decay": 0.5467,
"frequency": 834,
})

View File

@ -21,6 +21,8 @@ import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size
from src.model_utils.config import config
from src.model_utils.device_adapter import get_device_num, get_rank_id
def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False,
@ -407,11 +409,19 @@ def _get_rank_info():
"""
rank_size = int(os.environ.get("RANK_SIZE", 1))
if rank_size > 1:
rank_size = get_group_size()
rank_id = get_rank()
if config.device_target == "Ascend":
if rank_size > 1:
rank_size = get_device_num()
rank_id = get_rank_id()
else:
rank_size = 1
rank_id = 0
else:
rank_size = 1
rank_id = 0
if rank_size > 1:
rank_size = get_group_size()
rank_id = get_rank()
else:
rank_size = 1
rank_id = 0
return rank_size, rank_id
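The rank info returned here is what the `create_dataset*` helpers in this file use to shard data across devices. A hedged usage sketch (the wrapping function is illustrative; `num_shards`/`shard_id` are standard MindSpore dataset arguments):

```python
# Hedged sketch: sharding cifar10 with the rank info above. create_dataset1
# in this file does the same thing, with the full transform pipeline added.
import mindspore.dataset as ds

def build_sharded_cifar10(dataset_path, do_train=True):
    rank_size, rank_id = _get_rank_info()
    if rank_size == 1:
        # Single device: read the whole dataset.
        return ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=do_train)
    # Multiple devices: each rank reads a disjoint 1/rank_size slice.
    return ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=do_train,
                             num_shards=rank_size, shard_id=rank_id)
```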

View File

@ -22,7 +22,7 @@ import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size
from src.config import cfg
from src.model_utils.config import config
class ImgDataset:
@ -39,7 +39,7 @@ class ImgDataset:
self.data = []
self.dir_label_dict = {}
self.img_format = (".bmp", ".png", ".jpg", ".jpeg")
self.dir_label = cfg.infer_label
self.dir_label = config.infer_label
dataset_list = sorted(os.listdir(dataset_path))
file_exist = dir_exist = False
for index, data_name in enumerate(dataset_list):

View File

@ -0,0 +1,125 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Parse arguments"""
import os
import ast
import argparse
from pprint import pprint, pformat
import yaml
_config_path = "./resnet50_cifar10_config.yaml"
class Config:
"""
Configuration namespace. Convert dictionary to members.
"""
def __init__(self, cfg_dict):
for k, v in cfg_dict.items():
if isinstance(v, (list, tuple)):
setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v])
else:
setattr(self, k, Config(v) if isinstance(v, dict) else v)
def __str__(self):
return pformat(self.__dict__)
def __repr__(self):
return self.__str__()
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="resnet50_cifar10_config.yaml"):
"""
Parse command line arguments to the configuration according to the default yaml.
Args:
parser: Parent parser.
cfg: Base configuration.
helper: Helper description.
choices: Allowed values per option.
cfg_path: Path to the default yaml config.
"""
parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
parents=[parser])
helper = {} if helper is None else helper
choices = {} if choices is None else choices
for item in cfg:
if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict):
help_description = helper[item] if item in helper else "Please refer to {}".format(cfg_path)
choice = choices[item] if item in choices else None
if isinstance(cfg[item], bool):
parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice,
help=help_description)
else:
parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice,
help=help_description)
args = parser.parse_args()
return args
def parse_yaml(yaml_path):
"""
Parse the yaml config file.
Args:
yaml_path: Path to the yaml config.
"""
with open(yaml_path, 'r') as fin:
try:
cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader)
cfgs = [x for x in cfgs]
if len(cfgs) == 1:
cfg_helper = {}
cfg = cfgs[0]
elif len(cfgs) == 2:
cfg, cfg_helper = cfgs
else:
raise ValueError("At most 2 docs (config and help description for help) are supported in config yaml")
print(cfg_helper)
except yaml.YAMLError as err:
raise ValueError("Failed to parse yaml") from err
return cfg, cfg_helper
def merge(args, cfg):
"""
Merge the base config from yaml file and command line arguments.
Args:
args: Command line arguments.
cfg: Base configuration.
"""
args_var = vars(args)
for item in args_var:
cfg[item] = args_var[item]
return cfg
def get_config():
"""
Get Config according to the yaml file and cli arguments.
"""
parser = argparse.ArgumentParser(description="default name", add_help=False)
current_dir = os.path.dirname(os.path.abspath(__file__))
parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, \
"../resnet50_cifar10_config.yaml"), help="Config file path")
path_args, _ = parser.parse_known_args()
default, helper = parse_yaml(path_args.config_path)
pprint(default)
args = parse_cli_to_yaml(parser, default, helper, path_args.config_path)
final_config = merge(args, default)
return Config(final_config)
config = get_config()
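Since `parse_cli_to_yaml` turns every scalar yaml key into an argparse flag, any default can be overridden from the command line; a short usage sketch (the override values are illustrative):

```python
# Hedged usage sketch: every scalar key in the chosen yaml becomes a CLI flag,
#   python train.py --config_path=resnet50_cifar10_config.yaml --batch_size=64
# Inside a script, the merged result is plain attribute access:
from src.model_utils.config import config

print(config.batch_size)  # 64 if overridden as above, otherwise the yaml default
print(config.net_name)    # "resnet50" for the cifar10 config
```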

View File

@ -0,0 +1,27 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Device adapter for ModelArts"""
from src.model_utils.config import config
if config.enable_modelarts:
from src.model_utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
from src.model_utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
__all__ = [
"get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]
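Because both adapters expose the same four functions, call sites never branch on the runtime; an illustrative call site:

```python
# Illustrative call site: identical whether running locally or on ModelArts.
# The adapter is chosen once, at import time, via config.enable_modelarts.
from src.model_utils.device_adapter import get_device_id, get_device_num

print("running on device {} of {}".format(get_device_id(), get_device_num()))
```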

View File

@ -0,0 +1,36 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Local adapter"""
import os
def get_device_id():
device_id = os.getenv('DEVICE_ID', '0')
return int(device_id)
def get_device_num():
device_num = os.getenv('RANK_SIZE', '1')
return int(device_num)
def get_rank_id():
global_rank_id = os.getenv('RANK_ID', '0')
return int(global_rank_id)
def get_job_id():
return "Local Job"

View File

@ -0,0 +1,115 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Moxing adapter for ModelArts"""
import os
import functools
from mindspore import context
from src.model_utils.config import config
_global_sync_count = 0
def get_device_id():
device_id = os.getenv('DEVICE_ID', '0')
return int(device_id)
def get_device_num():
device_num = os.getenv('RANK_SIZE', '1')
return int(device_num)
def get_rank_id():
global_rank_id = os.getenv('RANK_ID', '0')
return int(global_rank_id)
def get_job_id():
job_id = os.getenv('JOB_ID')
job_id = job_id if job_id else "default"  # also covers unset JOB_ID (None)
return job_id
def sync_data(from_path, to_path):
"""
Download data from a remote OBS url to a local directory when called as (obs_url, local_path),
or upload a local directory to remote OBS when the direction is reversed.
"""
import moxing as mox
import time
global _global_sync_count
sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
_global_sync_count += 1
# Each server contains at most 8 devices.
if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
print("from path: ", from_path)
print("to path: ", to_path)
mox.file.copy_parallel(from_path, to_path)
print("===finish data synchronization===")
try:
os.mknod(sync_lock)
except IOError:
pass
print("===save flag===")
while True:
if os.path.exists(sync_lock):
break
time.sleep(1)
print("Finish sync data from {} to {}.".format(from_path, to_path))
def moxing_wrapper(pre_process=None, post_process=None):
"""
Moxing wrapper to download dataset and upload outputs.
"""
def wrapper(run_func):
@functools.wraps(run_func)
def wrapped_func(*args, **kwargs):
# Download data from data_url
if config.enable_modelarts:
if config.data_url:
sync_data(config.data_url, config.data_path)
print("Dataset downloaded: ", os.listdir(config.data_path))
if config.checkpoint_url:
sync_data(config.checkpoint_url, config.load_path)
print("Preload downloaded: ", os.listdir(config.load_path))
if config.train_url:
sync_data(config.train_url, config.output_path)
print("Workspace downloaded: ", os.listdir(config.output_path))
context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
config.device_num = get_device_num()
config.device_id = get_device_id()
if not os.path.exists(config.output_path):
os.makedirs(config.output_path)
if pre_process:
pre_process()
run_func(*args, **kwargs)
# Upload data to train_url
if config.enable_modelarts:
if post_process:
post_process()
if config.train_url:
print("Start to copy output directory")
sync_data(config.output_path, config.train_url)
return wrapped_func
return wrapper
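A hedged sketch of how a training entry point applies this wrapper; the body and the `set_parameters` hook name are placeholders, shown only to make the decorator shape concrete:

```python
# Hedged sketch: decorating the training entry point with moxing_wrapper.
# The hook and body are placeholders; only the decorator usage is the point.
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.config import config

def set_parameters():
    """Placeholder pre-processing hook, e.g. context/parallel setup."""

@moxing_wrapper(pre_process=set_parameters)
def train_net():
    """Placeholder training body: build the dataset/model, call model.train()."""
    print("training", config.net_name)

if __name__ == '__main__':
    train_net()
```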

View File

@ -14,8 +14,6 @@
# ============================================================================
"""train resnet."""
import os
import argparse
import ast
from mindspore import context
from mindspore import Tensor
from mindspore.nn.optim import Momentum, thor
@ -26,7 +24,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMoni
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.communication.management import init
from mindspore.common import set_seed
from mindspore.parallel import set_algo_parameters
import mindspore.nn as nn
@ -34,72 +32,35 @@ import mindspore.common.initializer as weight_init
import mindspore.log as logger
from src.lr_generator import get_lr, warmup_cosine_annealing_lr
from src.CrossEntropySmooth import CrossEntropySmooth
from src.config import cfg
from src.eval_callback import EvalCallBack
from src.metric import DistAccuracy, ClassifyCorrectCell
parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--net', type=str, default=None, help='Resnet Model, resnet18, resnet34, resnet50 or resnet101')
parser.add_argument('--dataset', type=str, default=None, help='Dataset, either cifar10 or imagenet2012')
parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, help='Run distribute')
parser.add_argument('--device_num', type=int, default=1, help='Device num.')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--device_target', type=str, default='Ascend', choices=("Ascend", "GPU", "CPU"),
help="Device target, support Ascend, GPU and CPU.")
parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
parser.add_argument('--parameter_server', type=ast.literal_eval, default=False, help='Run parameter server train')
parser.add_argument("--filter_weight", type=ast.literal_eval, default=False,
help="Filter head weight parameters, default is False.")
parser.add_argument("--run_eval", type=ast.literal_eval, default=False,
help="Run evaluation when training, default is False.")
parser.add_argument('--eval_dataset_path', type=str, default=None, help='Evaluation dataset path when run_eval is True')
parser.add_argument("--save_best_ckpt", type=ast.literal_eval, default=True,
help="Save best checkpoint when run_eval is True, default is True.")
parser.add_argument("--eval_start_epoch", type=int, default=40,
help="Evaluation start epoch when run_eval is True, default is 40.")
parser.add_argument("--eval_interval", type=int, default=1,
help="Evaluation interval when run_eval is True, default is 1.")
parser.add_argument('--enable_cache', type=ast.literal_eval, default=False,
help='Caching the eval dataset in memory to speedup evaluation, default is False.')
parser.add_argument('--cache_session_id', type=str, default="", help='The session id for cache service.')
parser.add_argument('--mode', type=str, default='GRAPH', choices=('GRAPH', 'PYNATIVE'),
help="Graph mode or PyNative mode, default is Graph mode")
args_opt = parser.parse_args()
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.device_adapter import get_rank_id, get_device_num
set_seed(1)
if args_opt.net in ("resnet18", "resnet34", "resnet50"):
if args_opt.net == "resnet18":
if config.net_name in ("resnet18", "resnet34", "resnet50"):
if config.net_name == "resnet18":
from src.resnet import resnet18 as resnet
if args_opt.net == "resnet34":
if config.net_name == "resnet34":
from src.resnet import resnet34 as resnet
if args_opt.net == "resnet50":
if config.net_name == "resnet50":
from src.resnet import resnet50 as resnet
if args_opt.dataset == "cifar10":
from src.config import config1 as config
if config.dataset == "cifar10":
from src.dataset import create_dataset1 as create_dataset
else:
from src.config import config2 as config
if args_opt.mode == "GRAPH":
if config.mode_name == "GRAPH":
from src.dataset import create_dataset2 as create_dataset
else:
from src.dataset import create_dataset_pynative as create_dataset
elif args_opt.net == "resnet101":
elif config.net_name == "resnet101":
from src.resnet import resnet101 as resnet
from src.config import config3 as config
from src.dataset import create_dataset3 as create_dataset
else:
from src.resnet import se_resnet50 as resnet
from src.config import config4 as config
from src.dataset import create_dataset4 as create_dataset
if cfg.optimizer == "Thor":
if args_opt.device_target == "Ascend":
from src.config import config_thor_Ascend as config
else:
from src.config import config_thor_gpu as config
def filter_checkpoint_parameter_by_list(origin_dict, param_filter):
"""remove useless parameters according to filter_list"""
@@ -122,56 +83,46 @@ def set_graph_kernel_context(run_platform, net_name):
context.set_context(enable_graph_kernel=True)
context.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D")
if __name__ == '__main__':
target = args_opt.device_target
def set_parameter():
"""set_parameter"""
target = config.device_target
if target == "CPU":
args_opt.run_distribute = False
ckpt_save_dir = config.save_checkpoint_path
config.run_distribute = False
# init context
if args_opt.mode == 'GRAPH':
if config.mode_name == 'GRAPH':
context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
set_graph_kernel_context(target, args_opt.net)
set_graph_kernel_context(target, config.net_name)
else:
context.set_context(mode=context.PYNATIVE_MODE, device_target=target, save_graphs=False)
if args_opt.parameter_server:
if config.parameter_server:
context.set_ps_context(enable_ps=True)
if args_opt.run_distribute:
if config.run_distribute:
if target == "Ascend":
device_id = int(os.getenv('DEVICE_ID'))
context.set_context(device_id=device_id, enable_auto_mixed_precision=True)
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
context.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True)
set_algo_parameters(elementwise_op_strategy_follow=True)
if args_opt.net == "resnet50" or args_opt.net == "se-resnet50":
if config.net_name == "resnet50" or config.net_name == "se-resnet50":
context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
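# Fuse the gradient AllReduce operators into groups split at layer indices 85 and 160.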
elif args_opt.net == "resnet101":
elif config.net_name == "resnet101":
context.set_auto_parallel_context(all_reduce_fusion_config=[80, 210, 313])
init()
# GPU target
else:
init()
context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
context.set_auto_parallel_context(device_num=get_device_num(),
parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True)
if args_opt.net == "resnet50":
if config.net_name == "resnet50":
context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
# create dataset
dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=1,
batch_size=config.batch_size, target=target, distribute=args_opt.run_distribute)
step_size = dataset.get_dataset_size()
# define net
net = resnet(class_num=config.class_num)
if args_opt.parameter_server:
net.set_param_ps()
# init weight
if args_opt.pre_trained:
param_dict = load_checkpoint(args_opt.pre_trained)
if args_opt.filter_weight:
def init_weight(net):
"""init_weight"""
if config.pre_trained:
param_dict = load_checkpoint(config.pre_trained)
if config.filter_weight:
filter_list = [x.name for x in net.end_point.get_parameters()]
filter_checkpoint_parameter_by_list(param_dict, filter_list)
load_param_into_net(net, param_dict)
@@ -186,20 +137,60 @@ if __name__ == '__main__':
cell.weight.shape,
cell.weight.dtype))
# init lr
if cfg.optimizer == "Thor":
def init_lr(step_size):
"""init lr"""
if config.optimizer == "Thor":
from src.lr_generator import get_thor_lr
lr = get_thor_lr(0, config.lr_init, config.lr_decay, config.lr_end_epoch, step_size, decay_epochs=39)
else:
if args_opt.net in ("resnet18", "resnet34", "resnet50", "se-resnet50"):
if config.net_name in ("resnet18", "resnet34", "resnet50", "se-resnet50"):
lr = get_lr(lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max,
warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size, steps_per_epoch=step_size,
lr_decay_mode=config.lr_decay_mode)
else:
lr = warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size,
config.pretrain_epoch_size * step_size)
lr = Tensor(lr)
return lr
def init_loss_scale():
"""Build the loss function: CrossEntropySmooth for imagenet2012, plain SoftmaxCrossEntropyWithLogits otherwise."""
if config.dataset == "imagenet2012":
if not config.use_label_smooth:
config.label_smooth_factor = 0.0
loss = CrossEntropySmooth(sparse=True, reduction="mean",
smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
else:
loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
return loss
def run_eval(target, model, ckpt_save_dir, cb):
"""run_eval"""
if config.run_eval:
if config.eval_dataset_path is None or (not os.path.isdir(config.eval_dataset_path)):
raise ValueError("{} is not an existing path.".format(config.eval_dataset_path))
eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False,
batch_size=config.batch_size, target=target, enable_cache=config.enable_cache,
cache_session_id=config.cache_session_id)
eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"}
eval_cb = EvalCallBack(apply_eval, eval_param_dict, interval=config.eval_interval,
eval_start_epoch=config.eval_start_epoch, save_best_ckpt=config.save_best_ckpt,
ckpt_directory=ckpt_save_dir, besk_ckpt_name="best_acc.ckpt",
metrics_name="acc")
cb += [eval_cb]
@moxing_wrapper()
def train_net():
"""train net"""
target = config.device_target
set_parameter()
dataset = create_dataset(dataset_path=config.data_path, do_train=True, repeat_num=1,
batch_size=config.batch_size, target=target,
distribute=config.run_distribute)
step_size = dataset.get_dataset_size()
net = resnet(class_num=config.class_num)
if config.parameter_server:
net.set_param_ps()
init_weight(net=net)
lr = Tensor(init_lr(step_size=step_size))
# define opt
decayed_params = []
no_decayed_params = []
@@ -213,27 +204,21 @@ if __name__ == '__main__':
{'params': no_decayed_params},
{'order_params': net.trainable_params()}]
opt = Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale)
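# Only the parameters in decayed_params receive weight decay; 'order_params' preserves the network's parameter order for the optimizer.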
if args_opt.dataset == "imagenet2012":
if not config.use_label_smooth:
config.label_smooth_factor = 0.0
loss = CrossEntropySmooth(sparse=True, reduction="mean",
smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
else:
loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
loss = init_loss_scale()
loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
dist_eval_network = ClassifyCorrectCell(net) if args_opt.run_distribute else None
dist_eval_network = ClassifyCorrectCell(net) if config.run_distribute else None
metrics = {"acc"}
if args_opt.run_distribute:
metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=args_opt.device_num)}
if (args_opt.net not in ("resnet18", "resnet34", "resnet50", "resnet101", "se-resnet50")) or \
args_opt.parameter_server or target == "CPU":
if config.run_distribute:
metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=config.device_num)}
if (config.net_name not in ("resnet18", "resnet34", "resnet50", "resnet101", "se-resnet50")) or \
config.parameter_server or target == "CPU":
# fp32 training
model = Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network)
else:
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics,
amp_level="O2", keep_batchnorm_fp32=False, eval_network=dist_eval_network)
if cfg.optimizer == "Thor" and args_opt.dataset == "imagenet2012":
if config.optimizer == "Thor" and config.dataset == "imagenet2012":
from src.lr_generator import get_thor_damping
damping = get_thor_damping(0, config.damping_init, config.damping_decay, 70, step_size)
split_indices = [26, 53]
@@ -242,36 +227,30 @@ if __name__ == '__main__':
model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
loss_scale_manager=loss_scale, metrics={'acc'},
amp_level="O2", keep_batchnorm_fp32=False)
args_opt.run_eval = False
config.run_eval = False
logger.warning("Thor optimizer not support evaluation while training.")
# define callbacks
time_cb = TimeMonitor(data_size=step_size)
loss_cb = LossMonitor()
cb = [time_cb, loss_cb]
ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path)
ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank_id()) + "/"
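# i.e. one checkpoint directory per rank: "<output_path>/<checkpoint_path>ckpt_<rank>/"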
if config.save_checkpoint:
config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
keep_checkpoint_max=config.keep_checkpoint_max)
ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir, config=config_ck)
cb += [ckpt_cb]
if args_opt.run_eval:
if args_opt.eval_dataset_path is None or (not os.path.isdir(args_opt.eval_dataset_path)):
raise ValueError("{} is not an existing path.".format(args_opt.eval_dataset_path))
eval_dataset = create_dataset(dataset_path=args_opt.eval_dataset_path, do_train=False,
batch_size=config.batch_size, target=target, enable_cache=args_opt.enable_cache,
cache_session_id=args_opt.cache_session_id)
eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"}
eval_cb = EvalCallBack(apply_eval, eval_param_dict, interval=args_opt.eval_interval,
eval_start_epoch=args_opt.eval_start_epoch, save_best_ckpt=args_opt.save_best_ckpt,
ckpt_directory=ckpt_save_dir, besk_ckpt_name="best_acc.ckpt",
metrics_name="acc")
cb += [eval_cb]
run_eval(target, model, ckpt_save_dir, cb)
# train model
if args_opt.net == "se-resnet50":
if config.net_name == "se-resnet50":
config.epoch_size = config.train_epoch_size
dataset_sink_mode = (not args_opt.parameter_server) and target != "CPU"
dataset_sink_mode = (not config.parameter_server) and target != "CPU"
model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb,
sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode)
if args_opt.run_eval and args_opt.enable_cache:
if config.run_eval and config.enable_cache:
print("Remember to shut down the cache server via \"cache_admin --stop\"")
if __name__ == '__main__':
train_net()
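For reference, the argparse flags deleted above now arrive through `src.model_utils.config.config`, parsed from the YAML file given by `--config_path`. Below is a rough sketch of the keys this `train.py` dereferences; the key names are taken from the diff, but every value is an illustrative placeholder, not the shipped yaml contents:

```python
# Keys train.py reads from the parsed yaml config in this diff; the values
# below are illustrative defaults only, not the contents of any shipped file.
from types import SimpleNamespace

config = SimpleNamespace(
    net_name="resnet50",          # resnet18 / resnet34 / resnet50 / resnet101 / se-resnet50
    dataset="cifar10",            # cifar10 or imagenet2012
    mode_name="GRAPH",            # GRAPH or PYNATIVE
    device_target="Ascend",       # Ascend / GPU / CPU
    run_distribute=False,
    parameter_server=False,
    enable_modelarts=False,
    data_path="/cache/data",      # filled by sync_data when enable_modelarts is True
    output_path="/cache/train",   # checkpoints and saved graphs are written here
    checkpoint_path="ckpt/",
    pre_trained="",               # optional checkpoint to resume from
    run_eval=False,
)
```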

View File

@@ -147,8 +147,8 @@ def test(cloud_args=None):
# network
config.logger.important_info('start create network')
if os.path.isdir(config.pretrained):
models = list(glob.glob(os.path.join(config.pretrained, '*.ckpt')))
if os.path.isdir(config.checkpoint_file_path):
models = list(glob.glob(os.path.join(config.checkpoint_file_path, '*.ckpt')))
print(models)
if config.graph_ckpt:
f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].split('_')[0])

View File

@@ -33,8 +33,9 @@ def test_resnet50_cifar10_ascend():
new_list = ["total_epochs=10", "10"]
utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))
dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin")
exec_network_shell = "cd resnet/scripts; bash run_distribute_train.sh resnet50 cifar10 {} {}"\
.format(utils.rank_table_path, dataset_path)
config_path = os.path.join(cur_model_path, "resnet50_cifar10_config.yaml")
exec_network_shell = "cd resnet/scripts; bash run_distribute_train.sh {} {} {}"\
.format(utils.rank_table_path, dataset_path, config_path)
os.system(exec_network_shell)
cmd = "ps -ef | grep python | grep train.py | grep -v grep"
ret = utils.process_check(100, cmd)
@@ -63,7 +64,9 @@ def test_resnet50_cifar10_gpu():
new_list = ["total_epochs=10", "10"]
utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))
dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin")
exec_network_shell = "cd resnet/scripts; sh run_distribute_train_gpu.sh resnet50 cifar10 {}".format(dataset_path)
config_path = os.path.join(cur_model_path, "resnet50_cifar10_config.yaml")
exec_network_shell = "cd resnet/scripts; sh run_distribute_train_gpu.sh {} {}" \
.format(dataset_path, config_path)
logger.warning("cmd [{}] is running...".format(exec_network_shell))
os.system(exec_network_shell)
cmd = "ps -ef | grep python | grep train.py | grep -v grep"