From 48a2b44fbb9648e403c4f96cf9a60d1858779b8b Mon Sep 17 00:00:00 2001 From: lilei Date: Sun, 27 Jun 2021 10:55:33 +0800 Subject: [PATCH] modify model_zoo resnet network for clould --- model_zoo/official/cv/resnet/README.md | 133 +++++++++--- model_zoo/official/cv/resnet/README_CN.md | 131 ++++++++--- model_zoo/official/cv/resnet/eval.py | 48 ++--- model_zoo/official/cv/resnet/export.py | 69 +++--- .../cv/resnet/gpu_resnet_benchmark.py | 58 ++--- model_zoo/official/cv/resnet/infer.py | 45 ++-- model_zoo/official/cv/resnet/postprocess.py | 5 +- .../resnet/resnet101_imagenet2012_config.yaml | 75 +++++++ .../cv/resnet/resnet18_cifar10_config.yaml | 75 +++++++ .../resnet/resnet18_imagenet2012_config.yaml | 77 +++++++ .../resnet/resnet34_imagenet2012_config.yaml | 77 +++++++ .../cv/resnet/resnet50_cifar10_config.yaml | 75 +++++++ .../resnet50_imagenet2012_Ascend_config.yaml | 78 +++++++ .../resnet50_imagenet2012_GPU_config.yaml | 78 +++++++ .../resnet/resnet50_imagenet2012_config.yaml | 77 +++++++ .../cv/resnet/resnet_benchmark_GPU.yaml | 52 +++++ .../cv/resnet/scripts/run_distribute_train.sh | 65 ++---- .../scripts/run_distribute_train_gpu.sh | 67 +++--- .../official/cv/resnet/scripts/run_eval.sh | 36 +--- .../cv/resnet/scripts/run_eval_gpu.sh | 32 +-- .../scripts/run_eval_gpu_resnet_benckmark.sh | 24 +-- .../scripts/run_gpu_resnet_benchmark.sh | 32 +-- .../official/cv/resnet/scripts/run_infer.sh | 25 +-- .../scripts/run_parameter_server_train.sh | 67 +++--- .../scripts/run_parameter_server_train_gpu.sh | 92 ++++---- .../cv/resnet/scripts/run_standalone_train.sh | 66 ++---- .../scripts/run_standalone_train_gpu.sh | 64 +++--- .../se-resnet50_imagenet2012_config.yaml | 78 +++++++ model_zoo/official/cv/resnet/src/config.py | 155 ------------- model_zoo/official/cv/resnet/src/dataset.py | 20 +- .../official/cv/resnet/src/dataset_infer.py | 4 +- .../cv/resnet/src/model_utils/config.py | 125 +++++++++++ .../resnet/src/model_utils/device_adapter.py | 27 +++ 
.../resnet/src/model_utils/local_adapter.py | 36 ++++ .../resnet/src/model_utils/moxing_adapter.py | 115 ++++++++++ model_zoo/official/cv/resnet/train.py | 203 ++++++++---------- model_zoo/official/cv/resnext/eval.py | 4 +- .../resnet50/test_resnet50_cifar10.py | 9 +- 38 files changed, 1655 insertions(+), 844 deletions(-) create mode 100644 model_zoo/official/cv/resnet/resnet101_imagenet2012_config.yaml create mode 100644 model_zoo/official/cv/resnet/resnet18_cifar10_config.yaml create mode 100644 model_zoo/official/cv/resnet/resnet18_imagenet2012_config.yaml create mode 100644 model_zoo/official/cv/resnet/resnet34_imagenet2012_config.yaml create mode 100644 model_zoo/official/cv/resnet/resnet50_cifar10_config.yaml create mode 100644 model_zoo/official/cv/resnet/resnet50_imagenet2012_Ascend_config.yaml create mode 100644 model_zoo/official/cv/resnet/resnet50_imagenet2012_GPU_config.yaml create mode 100644 model_zoo/official/cv/resnet/resnet50_imagenet2012_config.yaml create mode 100644 model_zoo/official/cv/resnet/resnet_benchmark_GPU.yaml create mode 100644 model_zoo/official/cv/resnet/se-resnet50_imagenet2012_config.yaml delete mode 100755 model_zoo/official/cv/resnet/src/config.py create mode 100644 model_zoo/official/cv/resnet/src/model_utils/config.py create mode 100644 model_zoo/official/cv/resnet/src/model_utils/device_adapter.py create mode 100644 model_zoo/official/cv/resnet/src/model_utils/local_adapter.py create mode 100644 model_zoo/official/cv/resnet/src/model_utils/moxing_adapter.py diff --git a/model_zoo/official/cv/resnet/README.md b/model_zoo/official/cv/resnet/README.md index 384d4741501..aeb0849d4ff 100644 --- a/model_zoo/official/cv/resnet/README.md +++ b/model_zoo/official/cv/resnet/README.md @@ -101,27 +101,26 @@ After installing MindSpore via the official website, you can start training and ```bash # distributed training -Usage: bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] 
[RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional) # standalone training -Usage: bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] -[PRETRAINED_CKPT_PATH](optional) +Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional) # run evaluation example -Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] +Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] ``` - Running on GPU ```bash # distributed training example -bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional) # standalone training example -bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional) # infer example -bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] +bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] # gpu benchmark example bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](optional) [DEVICE_NUM](optional) [SAVE_CKPT](optional) [SAVE_PATH](optional) @@ -131,10 +130,41 @@ bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](o ```bash # standalone training example -python train.py --net=[resnet50|resnet101] --dataset=[cifar10|imagenet2012] --device_target=CPU --dataset_path=[DATASET_PATH] --pre_trained=[CHECKPOINT_PATH](optional) +python train.py --device_target=CPU --data_path=[DATASET_PATH] --config_path [CONFIG_PATH] 
--pre_trained=[CHECKPOINT_PATH](optional) # infer example -python eval.py --net=[resnet50|resnet101] --dataset=[cifar10|imagenet2012] --dataset_path=[DATASET_PATH] --checkpoint_path=[CHECKPOINT_PATH] --device_target=CPU +python eval.py --data_path=[DATASET_PATH] --checkpoint_file_path=[CHECKPOINT_PATH] --config_path [CONFIG_PATH] --device_target=CPU +``` + +If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start training and evaluation as follows: + +```python +# run distributed training on modelarts example +# (1) First, Perform a or b. +# a. Set "enable_modelarts=True" on yaml file. +# Set other parameters on yaml file you need. +# b. Add "enable_modelarts=True" on the website UI interface. +# Add other parameters on the website UI interface. +# (2) Set the config directory to "config_path=/The path of config in S3/" +# (3) Set the code directory to "/path/resnet" on the website UI interface. +# (4) Set the startup file to "train.py" on the website UI interface. +# (5) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface. +# (6) Create your job. + +# run evaluation on modelarts example +# (1) Copy or upload your trained model to S3 bucket. +# (2) Perform a or b. +# a. Set "enable_modelarts=True" on yaml file. +# Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on yaml file. +# Set "checkpoint_url=/The path of checkpoint in S3/" on yaml file. +# b. Add "enable_modelarts=True" on the website UI interface. +# Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface. +# Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface. +# (3) Set the config directory to "config_path=/The path of config in S3/" +# (4) Set the code directory to "/path/resnet" on the website UI interface. +# (5) Set the startup file to "eval.py" on the website UI interface. 
+# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface. +# (7) Create your job. ``` # [Script Description](#contents) @@ -158,13 +188,26 @@ python eval.py --net=[resnet50|resnet101] --dataset=[cifar10|imagenet2012] --dat |── run_eval_gpu_resnet_benckmark.sh # launch gpu benchmark eval for resnet50 with imagenet2012 └── cache_util.sh # a collection of helper functions to manage cache ├── src - ├── config.py # parameter configuration ├── dataset.py # data preprocessing ├─ eval_callback.py # evaluation callback while training ├── CrossEntropySmooth.py # loss definition for ImageNet2012 dataset ├── lr_generator.py # generate learning rate for each step ├── resnet.py # resnet backbone, including resnet50 and resnet101 and se-resnet50 └── resnet_gpu_benchmark.py # resnet50 for GPU benchmark + ├── model_utils + ├──config.py # parameter configuration + ├──device_adapter.py # device adapter + ├──local_adapter.py # local adapter + ├──moxing_adapter.py # moxing adapter + ├── resnet18_cifar10_config.yaml # parameter configuration + ├── resnet18_imagenet2012_config.yaml # parameter configuration + ├── resnet34_imagenet2012_config.yaml # parameter configuration + ├── resnet50_cifar10_config.yaml # parameter configuration + ├── resnet50_imagenet2012_Ascend_config.yaml # parameter configuration + ├── resnet50_imagenet2012_config.yaml # parameter configuration + ├── resnet50_imagenet2012_GPU_config.yaml # parameter configuration + ├── resnet101_imagenet2012_config.yaml # parameter configuration + ├── se-resnet50_imagenet2012_config.yaml # parameter configuration ├── export.py # export model for inference ├── mindspore_hub_conf.py # mindspore hub interface ├── eval.py # eval net @@ -174,7 +217,7 @@ python eval.py --net=[resnet50|resnet101] --dataset=[cifar10|imagenet2012] --dat ## [Script Parameters](#contents) -Parameters for both training and evaluation can be set in config.py. 
+Parameters for both training and evaluation can be set in config file. - Config for ResNet18 and ResNet50, CIFAR-10 dataset @@ -189,7 +232,6 @@ Parameters for both training and evaluation can be set in config.py. "save_checkpoint": True, # whether save checkpoint or not "save_checkpoint_epochs": 5, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last step "keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint -"save_checkpoint_path": "./", # path to save checkpoint "warmup_epochs": 5, # number of warmup epoch "lr_decay_mode": "poly" # decay mode can be selected in steps, ploy and default "lr_init": 0.01, # initial learning rate @@ -210,7 +252,6 @@ Parameters for both training and evaluation can be set in config.py. "save_checkpoint": True, # whether save checkpoint or not "save_checkpoint_epochs": 5, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch "keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint -"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path "warmup_epochs": 0, # number of warmup epoch "lr_decay_mode": "Linear", # decay mode for generating learning rate "use_label_smooth": True, # label smooth @@ -233,7 +274,6 @@ Parameters for both training and evaluation can be set in config.py. "save_checkpoint": True, # whether save checkpoint or not "save_checkpoint_epochs": 5, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch "keep_checkpoint_max": 1, # only keep the last keep_checkpoint_max checkpoint -"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path "warmup_epochs": 0, # number of warmup epoch "optimizer": 'Momentum', # optimizer "use_label_smooth": True, # label smooth @@ -256,7 +296,6 @@ Parameters for both training and evaluation can be set in config.py. 
"save_checkpoint": True, # whether save checkpoint or not "save_checkpoint_epochs": 5, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch "keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint -"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path "warmup_epochs": 0, # number of warmup epoch "lr_decay_mode": "cosine" # decay mode for generating learning rate "use_label_smooth": True, # label_smooth @@ -278,7 +317,6 @@ Parameters for both training and evaluation can be set in config.py. "save_checkpoint": True, # whether save checkpoint or not "save_checkpoint_epochs": 4, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch "keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint -"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path "warmup_epochs": 3, # number of warmup epoch "lr_decay_mode": "cosine" # decay mode for generating learning rate "use_label_smooth": True, # label_smooth @@ -296,15 +334,13 @@ Parameters for both training and evaluation can be set in config.py. 
```bash # distributed training -Usage: bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional) # standalone training -Usage: bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] -[PRETRAINED_CKPT_PATH](optional) +Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional) # run evaluation example -Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] - +Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] ``` For distributed training, a hccl configuration file with JSON format needs to be created in advance. @@ -319,13 +355,14 @@ If you want to change device_id for standalone training, you can set environment ```bash # distributed training example -bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional) # standalone training example -bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional) # infer example -bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] +bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] +[CONFIG_PATH] # gpu benchmark training example bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](optional) [DEVICE_NUM](optional) [SAVE_CKPT](optional) [SAVE_PATH](optional) @@ -343,29 +380,29 @@ Please follow the instructions in the link 
[GPU-Multi-Host](https://www.mindspor - Parameter server training Ascend example ```bash -bash run_parameter_server_train.sh [resnet18|resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +bash run_parameter_server_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional) ``` - Parameter server training GPU example ```bash -bash run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +bash run_parameter_server_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional) ``` #### Evaluation while training ```bash # evaluation with distributed training Ascend example: -bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) +bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) # evaluation with standalone training Ascend example: -bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) +bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) # evaluation with distributed training GPU example: -bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) +bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) # evaluation with standalone training GPU example: -bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) +bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) 
[EVAL_DATASET_PATH](optional) ``` `RUN_EVAL` and `EVAL_DATASET_PATH` are optional arguments, setting `RUN_EVAL`=True allows you to do evaluation while training. When `RUN_EVAL` is set, `EVAL_DATASET_PATH` must also be set. @@ -480,12 +517,12 @@ epoch: [0/1] step: [100/5004], loss is 6.814013Epoch time: 3437.154 ms, fps: 148 ```bash # evaluation -Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] +Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] ``` ```bash # evaluation example -bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt +bash run_eval.sh ~/cifar10-10-verify-bin ~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt /path/to/resnet50_cifar10_config.yaml ``` > checkpoint can be produced in training process. @@ -493,7 +530,7 @@ bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/tra #### Running on GPU ```bash -bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] +bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] ``` ### Result @@ -547,13 +584,39 @@ result: {'top_5_accuracy': 0.9342589628681178, 'top_1_accuracy': 0.7680657810499 ### [Export MindIR](#contents) +Export MindIR on local + ```shell -python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] +python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] --config_path [CONFIG_PATH] ``` -The ckpt_file parameter is required, +The checkpoint_file_path parameter is required, `EXPORT_FORMAT` should be in ["AIR", "MINDIR"] +Export on ModelArts (If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start as follows) + +```python +# Export on ModelArts +# (1) Perform a or b. +# a. 
Set "enable_modelarts=True" on default_config.yaml file. +# Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on default_config.yaml file. +# Set "checkpoint_url='s3://dir_to_trained_ckpt/'" on default_config.yaml file. +# Set "file_name='./resnet'" on default_config.yaml file. +# Set "file_format='AIR'" on default_config.yaml file. +# Set other parameters on default_config.yaml file you need. +# b. Add "enable_modelarts=True" on the website UI interface. +# Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface. +# Add "checkpoint_url='s3://dir_to_trained_ckpt/'" on the website UI interface. +# Add "file_name='./resnet'" on the website UI interface. +# Add "file_format='AIR'" on the website UI interface. +# Add other parameters on the website UI interface. +# (2) Set the config_path="/path/yaml file" on the website UI interface. +# (3) Set the code directory to "/path/resnet" on the website UI interface. +# (4) Set the startup file to "export.py" on the website UI interface. +# (5) Set the "Output file path" and "Job log path" to your path on the website UI interface. +# (6) Create your job. +``` + ### Infer on Ascend310 Before performing inference, the mindir file must bu exported by `export.py` script. We only provide an example of inference using MINDIR model. 
diff --git a/model_zoo/official/cv/resnet/README_CN.md b/model_zoo/official/cv/resnet/README_CN.md index 3888e32b314..e2793cda223 100755 --- a/model_zoo/official/cv/resnet/README_CN.md +++ b/model_zoo/official/cv/resnet/README_CN.md @@ -104,27 +104,60 @@ ResNet的总体网络架构如下: ```text # 分布式训练 -用法:bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](可选) +用法:bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](可选) # 单机训练 -用法:bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] -[PRETRAINED_CKPT_PATH](可选) +用法:bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](可选) # 运行评估示例 -用法:bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] +用法:bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] ``` - GPU处理器环境运行 ```text # 分布式训练示例 -bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](可选) +bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](可选) # 单机训练示例 -bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](可选) +bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](可选) # 推理示例 -bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] +bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] +``` + +如果要在modelarts上进行模型的训练,可以参考modelarts的官方指导文档(https://support.huaweicloud.com/modelarts/) +开始进行模型的训练和推理,具体操作如下: + +```python +# 在modelarts上使用分布式训练的示例: +# (1) 选址a或者b其中一种方式。 +# a. 设置 "enable_modelarts=True" 。 +# 在yaml文件上设置网络所需的参数。 +# b. 
增加 "enable_modelarts=True" 参数在modelarts的界面上。 +# 在modelarts的界面上设置网络所需的参数。 +# (2) 在modelarts的界面上设置配置文件的路径"config_path=/The path of config in S3/" +# (3) 在modelarts的界面上设置代码的路径 "/path/resnet"。 +# (4) 在modelarts的界面上设置模型的启动文件 "train.py" 。 +# (5) 在modelarts的界面上设置模型的数据路径 "Dataset path" , +# 模型的输出路径"Output file path" 和模型的日志路径 "Job log path" 。 +# (6) 开始模型的训练。 + +# 在modelarts上使用模型推理的示例 +# (1) 把训练好的模型放到桶的对应位置。 +# (2) 选择a或者b其中一种方式。 +# a. 设置 "enable_modelarts=True" +# 设置 "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" 在 yaml 文件. +# 设置 "checkpoint_url=/The path of checkpoint in S3/" 在 yaml 文件. +# b. 增加 "enable_modelarts=True" 参数在modelarts的界面上。 +# 增加 "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" 参数在modelarts的界面上。 +# 增加 "checkpoint_url=/The path of checkpoint in S3/" 参数在modelarts的界面上。 +# (3) 在modelarts的界面上设置配置文件的路径"config_path=/The path of config in S3/" +# (4) 在modelarts的界面上设置代码的路径 "/path/resnet"。 +# (5) 在modelarts的界面上设置模型的启动文件 "eval.py" 。 +# (6) 在modelarts的界面上设置模型的数据路径 "Dataset path" , +# 模型的输出路径"Output file path" 和模型的日志路径 "Job log path" 。 +# (7) 开始模型的推理。 ``` # 脚本说明 @@ -146,19 +179,33 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] ├── run_standalone_train_gpu.sh # 启动GPU单机训练(单卡) └── cache_util.sh # 使用单节点緩存的帮助函数 ├── src - ├── config.py # 参数配置 ├── dataset.py # 数据预处理 - ├─ eval_callback.py # 训练时推理回调函数 + ├── eval_callback.py # 训练时推理回调函数 ├── CrossEntropySmooth.py # ImageNet2012数据集的损失定义 ├── lr_generator.py # 生成每个步骤的学习率 └── resnet.py # ResNet骨干网络,包括ResNet50、ResNet101和SE-ResNet50 + ├── model_utils + ├── config.py # 参数配置 + ├── device_adapter.py # 设备配置 + ├── local_adapter.py # 本地设备配置 + └── moxing_adapter.py # modelarts设备配置 + ├── resnet18_cifar10_config.yaml # 参数配置 + ├── resnet18_imagenet2012_config.yaml # 参数配置 + ├── resnet34_imagenet2012_config.yaml # 参数配置 + ├── resnet50_cifar10_config.yaml # 参数配置 + ├── resnet50_imagenet2012_Ascend_config.yaml # 参数配置 + ├── resnet50_imagenet2012_config.yaml # 参数配置 + ├── 
resnet50_imagenet2012_GPU_config.yaml # 参数配置 + ├── resnet101_imagenet2012_config.yaml # 参数配置 + ├── se-resnet50_imagenet2012_config.yaml # 参数配置 + ├── eval.py # 评估网络 ├── eval.py # 评估网络 └── train.py # 训练网络 ``` ## 脚本参数 -在config.py中可以同时配置训练参数和评估参数。 +在配置文件中可以同时配置训练参数和评估参数。 - 配置ResNet18、ResNet50和CIFAR-10数据集。 @@ -173,7 +220,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] "save_checkpoint":True, # 是否保存检查点 "save_checkpoint_epochs":5, # 两个检查点之间的周期间隔;默认情况下,最后一个检查点将在最后一步完成后保存 "keep_checkpoint_max":10, # 只保留最后一个keep_checkpoint_max检查点 -"save_checkpoint_path":"./", # 检查点保存路径 "warmup_epochs":5, # 热身周期数 "lr_decay_mode":"poly” # 衰减模式可为步骤、策略和默认 "lr_init":0.01, # 初始学习率 @@ -194,7 +240,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] "save_checkpoint":True, # 是否保存检查点 "save_checkpoint_epochs":5, # 两个检查点之间的周期间隔;默认情况下,最后一个检查点将在最后一个周期完成后保存 "keep_checkpoint_max":10, # 只保存最后一个keep_checkpoint_max检查点 -"save_checkpoint_path":"./", # 检查点相对于执行路径的保存路径 "warmup_epochs":0, # 热身周期数 "lr_decay_mode":"Linear", # 用于生成学习率的衰减模式 "use_label_smooth":True, # 标签平滑 @@ -217,7 +262,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] "save_checkpoint":True, # 是否保存检查点 "save_checkpoint_epochs":5, # 两个检查点之间的周期间隔;默认情况下,最后一个检查点将在最后一个周期完成后保存 "keep_checkpoint_max":1, # 只保存最后一个keep_checkpoint_max检查点 -"save_checkpoint_path":"./", # 检查点相对于执行路径的保存路径 "warmup_epochs":0, # 热身周期数 "optimizer":"Momentum", # 优化器 "use_label_smooth":True, # 标签平滑 @@ -240,7 +284,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] "save_checkpoint":True, # 是否保存检查点 "save_checkpoint_epochs":5, # 两个检查点之间的周期间隔;默认情况下,最后一个检查点将在最后一个周期完成后保存 "keep_checkpoint_max":10, # 只保存最后一个keep_checkpoint_max检查点 -"save_checkpoint_path":"./", # 检查点相对于执行路径的保存路径 "warmup_epochs":0, # 热身周期数 "lr_decay_mode":"cosine” # 用于生成学习率的衰减模式 "use_label_smooth":True, # 标签平滑 @@ -262,7 +305,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] 
[DATASET_PATH] "save_checkpoint":True, # 是否保存检查点 "save_checkpoint_epochs":4, # 两个检查点之间的周期间隔;默认情况下,最后一个检查点将在最后一个周期完成后保存 "keep_checkpoint_max":10, # 只保存最后一个keep_checkpoint_max检查点 -"save_checkpoint_path":"./", # checkpoint相对于执行路径的保存路径 "warmup_epochs":3, # 热身周期数 "lr_decay_mode":"cosine” # 用于生成学习率的衰减模式 "use_label_smooth":True, # 标签平滑 @@ -280,14 +322,13 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] ```text # 分布式训练 -用法:bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](可选) +用法:bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](可选) # 单机训练 -用法:bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] -[PRETRAINED_CKPT_PATH](可选) +用法:bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](可选) # 运行评估示例 -用法:bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] +用法:bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] ``` @@ -303,13 +344,13 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] ```text # 分布式训练示例 -bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](可选) +bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](可选) # 单机训练示例 -bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](可选) +bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](可选) # 推理示例 -bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] +bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] ``` #### 运行参数服务器模式训练 @@ -317,29 +358,29 @@ bash run_eval_gpu.sh [resnet50|resnet101] 
[cifar10|imagenet2012] [DATASET_PATH] - Ascend参数服务器训练示例 ```text -bash run_parameter_server_train.sh [resnet18|resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](可选) +bash run_parameter_server_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](可选) ``` - GPU参数服务器训练示例 ```text -bash run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](可选) +bash run_parameter_server_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](可选) ``` #### 训练时推理 ```bash # Ascend 分布式训练时推理示例: -bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) +bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) # Ascend 单机训练时推理示例: -bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) +bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) # GPU 分布式训练时推理示例: -bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) +bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) # GPU 单机训练时推理示例: -bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) +bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) ``` 训练时推理需要在设置`RUN_EVAL`为True,与此同时还需要设置`EVAL_DATASET_PATH`。此外,当设置`RUN_EVAL`为True时还可为python脚本设置`save_best_ckpt`, `eval_start_epoch`, `eval_interval`等参数。 @@ -446,12 +487,12 @@ epoch:5 step:5004, loss is 3.3501816 
```bash # 评估 -Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] +Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] ``` ```bash # 评估示例 -bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt +bash run_eval.sh ~/cifar10-10-verify-bin ~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt /path/to/resnet50_cifar10_config.yaml ``` > 训练过程中可以生成检查点。 @@ -459,7 +500,7 @@ bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/tra #### GPU处理器环境运行 ```bash -bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] +bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH] ``` ### 结果 @@ -513,13 +554,37 @@ result:{'top_5_accuracy':0.9342589628681178, 'top_1_accuracy':0.768065781049936} ### [导出MindIR](#contents) +导出mindir模型 + ```shell -python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] +python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] --config_path [CONFIG_PATH] ``` -参数ckpt_file为必填项, +参数checkpoint_file_path为必填项, `EXPORT_FORMAT` 必须在 ["AIR", "MINDIR"]中选择。 +ModelArts导出mindir + +```python +# (1) 把训练好的模型放到桶的对应位置。 +# (2) 选择a或者b其中一种方式。 +# a. 设置 "enable_modelarts=True" +# 设置 "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" 在 yaml 文件。 +# 设置 "checkpoint_url=/The path of checkpoint in S3/" 在 yaml 文件。 +# 设置 "file_name='./resnet'"参数在yaml文件。 +# 设置 "file_format='AIR'" 参数在yaml文件。 +# b. 
增加 "enable_modelarts=True" 参数在modearts的界面上。 +# 增加 "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" 参数在modearts的界面上。 +# 增加 "checkpoint_url=/The path of checkpoint in S3/" 参数在modearts的界面上。 +# 设置 "file_name='./resnet'"参数在modearts的界面上。 +# 设置 "file_format='AIR'" 参数在modearts的界面上。 +# (3) 设置网络配置文件的路径 "config_path=/The path of config in S3/" +# (4) 在modelarts的界面上设置代码的路径 "/path/resnet"。 +# (5) 在modelarts的界面上设置模型的启动文件 "export.py" 。 +# 模型的输出路径"Output file path" 和模型的日志路径 "Job log path" 。 +# (6) 开始导出mindir。 +``` + ### 在Ascend310执行推理 在执行推理前,mindir文件必须通过`export.py`脚本导出。以下展示了使用minir模型执行推理的示例。 diff --git a/model_zoo/official/cv/resnet/eval.py b/model_zoo/official/cv/resnet/eval.py index 8f2fd8e9d35..d1b8e2bfae2 100755 --- a/model_zoo/official/cv/resnet/eval.py +++ b/model_zoo/official/cv/resnet/eval.py @@ -14,51 +14,39 @@ # ============================================================================ """train resnet.""" import os -import argparse from mindspore import context from mindspore.common import set_seed from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.CrossEntropySmooth import CrossEntropySmooth - -parser = argparse.ArgumentParser(description='Image classification') -parser.add_argument('--net', type=str, default=None, help='Resnet Model, either resnet18,resnet34' - 'resnet50 or resnet101') -parser.add_argument('--dataset', type=str, default=None, help='Dataset, either cifar10 or imagenet2012') - -parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path') -parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') -parser.add_argument('--device_target', type=str, default='Ascend', choices=("Ascend", "GPU", "CPU"), - help="Device target, support Ascend, GPU and CPU.") -args_opt = parser.parse_args() +from src.model_utils.config import config +from 
src.model_utils.moxing_adapter import moxing_wrapper set_seed(1) -if args_opt.net in ("resnet18", "resnet34", "resnet50"): - if args_opt.net == "resnet18": +if config.net_name in ("resnet18", "resnet34", "resnet50"): + if config.net_name == "resnet18": from src.resnet import resnet18 as resnet - if args_opt.net == "resnet34": + if config.net_name == "resnet34": from src.resnet import resnet34 as resnet - if args_opt.net == "resnet50": + if config.net_name == "resnet50": from src.resnet import resnet50 as resnet - if args_opt.dataset == "cifar10": - from src.config import config1 as config + if config.dataset == "cifar10": from src.dataset import create_dataset1 as create_dataset else: - from src.config import config2 as config from src.dataset import create_dataset2 as create_dataset -elif args_opt.net == "resnet101": +elif config.net_name == "resnet101": from src.resnet import resnet101 as resnet - from src.config import config3 as config from src.dataset import create_dataset3 as create_dataset else: from src.resnet import se_resnet50 as resnet - from src.config import config4 as config from src.dataset import create_dataset4 as create_dataset -if __name__ == '__main__': - target = args_opt.device_target +@moxing_wrapper() +def eval_net(): + """eval net""" + target = config.device_target # init context context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False) @@ -67,20 +55,19 @@ if __name__ == '__main__': context.set_context(device_id=device_id) # create dataset - dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size, + dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size, target=target) - step_size = dataset.get_dataset_size() # define net net = resnet(class_num=config.class_num) # load checkpoint - param_dict = load_checkpoint(args_opt.checkpoint_path) + param_dict = load_checkpoint(config.checkpoint_file_path) load_param_into_net(net, 
param_dict) net.set_train(False) # define loss, model - if args_opt.dataset == "imagenet2012": + if config.dataset == "imagenet2012": if not config.use_label_smooth: config.label_smooth_factor = 0.0 loss = CrossEntropySmooth(sparse=True, reduction='mean', @@ -93,4 +80,7 @@ if __name__ == '__main__': # eval model res = model.eval(dataset) - print("result:", res, "ckpt=", args_opt.checkpoint_path) + print("result:", res, "ckpt=", config.checkpoint_file_path) + +if __name__ == '__main__': + eval_net() diff --git a/model_zoo/official/cv/resnet/export.py b/model_zoo/official/cv/resnet/export.py index 80272824567..d1000103757 100644 --- a/model_zoo/official/cv/resnet/export.py +++ b/model_zoo/official/cv/resnet/export.py @@ -16,67 +16,50 @@ ##############export checkpoint file into air and onnx models################# python export.py """ -import argparse +import os import numpy as np from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context +from src.model_utils.config import config +from src.model_utils.moxing_adapter import moxing_wrapper -parser = argparse.ArgumentParser(description='resnet export') -parser.add_argument('--network_dataset', type=str, default='resnet50_cifar10', choices=['resnet18_cifar10', - 'resnet18_imagenet2012', - 'resnet34_imagenet2012', - 'resnet50_cifar10', - 'resnet50_imagenet2012', - 'resnet101_imagenet2012', - "se-resnet50_imagenet2012"], - help='network and dataset name.') -parser.add_argument("--device_id", type=int, default=0, help="Device id") -parser.add_argument("--batch_size", type=int, default=1, help="batch size") -parser.add_argument("--ckpt_file", type=str, required=True, help="Checkpoint file path.") -parser.add_argument("--file_name", type=str, default="resnet", help="output file name.") -parser.add_argument('--width', type=int, default=224, help='input width') -parser.add_argument('--height', type=int, default=224, help='input height') -parser.add_argument("--file_format", type=str, choices=["AIR", 
"ONNX", "MINDIR"], default="AIR", help="file format") -parser.add_argument("--device_target", type=str, default="Ascend", - choices=["Ascend", "GPU", "CPU"], help="device target(default: Ascend)") -args = parser.parse_args() +context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +if config.device_target == "Ascend": + context.set_context(device_id=config.device_id) -context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) -if args.device_target == "Ascend": - context.set_context(device_id=args.device_id) +def modelarts_pre_process(): + '''modelarts pre process function.''' + config.file_name = os.path.join(config.output_path, config.file_name) -if __name__ == '__main__': - - if args.network_dataset == 'resnet18_cifar10': - from src.config import config1 as config +@moxing_wrapper(pre_process=modelarts_pre_process) +def run_export(): + """run export.""" + if config.network_dataset == 'resnet18_cifar10': from src.resnet import resnet18 as resnet - elif args.network_dataset == 'resnet18_imagenet2012': - from src.config import config2 as config + elif config.network_dataset == 'resnet18_imagenet2012': from src.resnet import resnet18 as resnet - elif args.network_dataset == 'resnet34_imagenet2012': - from src.config import config2 as config + elif config.network_dataset == 'resnet34_imagenet2012': from src.resnet import resnet34 as resnet - elif args.network_dataset == 'resnet50_cifar10': - from src.config import config1 as config + elif config.network_dataset == 'resnet50_cifar10': from src.resnet import resnet50 as resnet - elif args.network_dataset == 'resnet50_imagenet2012': - from src.config import config2 as config + elif config.network_dataset == 'resnet50_imagenet2012': from src.resnet import resnet50 as resnet - elif args.network_dataset == 'resnet101_imagenet2012': - from src.config import config3 as config + elif config.network_dataset == 'resnet101_imagenet2012': from src.resnet import resnet101 as resnet - 
elif args.network_dataset == 'se-resnet50_imagenet2012': - from src.config import config4 as config + elif config.network_dataset == 'se-resnet50_imagenet2012': from src.resnet import se_resnet50 as resnet else: raise ValueError("network and dataset is not support.") net = resnet(config.class_num) - assert args.ckpt_file is not None, "checkpoint_path is None." + assert config.checkpoint_file_path is not None, "checkpoint_path is None." - param_dict = load_checkpoint(args.ckpt_file) + param_dict = load_checkpoint(config.checkpoint_file_path) load_param_into_net(net, param_dict) - input_arr = Tensor(np.zeros([args.batch_size, 3, args.height, args.width], np.float32)) - export(net, input_arr, file_name=args.file_name, file_format=args.file_format) + input_arr = Tensor(np.zeros([config.batch_size, 3, config.height, config.width], np.float32)) + export(net, input_arr, file_name=config.file_name, file_format=config.file_format) + +if __name__ == '__main__': + run_export() diff --git a/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py b/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py index 78fe2707cb9..8e35a9c83ce 100644 --- a/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py +++ b/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py @@ -13,8 +13,7 @@ # limitations under the License. 
# ============================================================================ """train resnet.""" -import argparse -import ast +import os import time import numpy as np from mindspore import context @@ -34,25 +33,11 @@ import mindspore.dataset.vision.c_transforms as C from src.resnet_gpu_benchmark import resnet50 as resnet from src.CrossEntropySmooth import CrossEntropySmooth from src.momentum import Momentum as MomentumWeightDecay - -parser = argparse.ArgumentParser(description='Image classification') -parser.add_argument('--batch_size', type=str, default="256", help='Batch_size: default 256.') -parser.add_argument('--epoch_size', type=str, default="2", help='Epoch_size: default 2') -parser.add_argument('--print_per_steps', type=str, default="20", help='Print loss and time per steps: default 20') -parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, help='Run distribute') -parser.add_argument('--save_ckpt', type=ast.literal_eval, default=False, help='Save ckpt or not: default False') -parser.add_argument('--eval', type=ast.literal_eval, default=False, help='Eval ckpt : default False') -parser.add_argument('--dataset_path', type=str, default=None, help='Imagenet dataset path') -parser.add_argument('--ckpt_path', type=str, default="./", help='The path to save ckpt if save_ckpt is True;\ - Or the ckpt model file when eval is True') -parser.add_argument('--mode', type=str, default="GRAPH", choices=["GRAPH", "PYNATIVE"], help='Execute mode') -parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16", \ - help='Compute data type fp32 or fp16: default fp16') -args_opt = parser.parse_args() +from src.model_utils.config import config +from src.model_utils.moxing_adapter import moxing_wrapper set_seed(1) - class MyTimeMonitor(Callback): def __init__(self, batch_size, sink_size, dataset_size, mode): super(MyTimeMonitor, self).__init__() @@ -95,7 +80,7 @@ class MyTimeMonitor(Callback): def 
create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU", dtype="fp16", device_num=1): - if args_opt.mode == "GRAPH": + if config.mode_name == "GRAPH": ds_num_parallel_worker = 4 map_num_parallel_worker = 8 batch_num_parallel_worker = None @@ -116,7 +101,7 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" # define map operations normalize_op = C.Normalize(mean=mean, std=std) if dtype == "fp16": - if args_opt.eval: + if config.eval: x_dtype = "float32" else: x_dtype = "float16" @@ -161,25 +146,26 @@ def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per return lr_each_step +@moxing_wrapper() def train(): # set args dev = "GPU" - epoch_size = int(args_opt.epoch_size) - total_batch = int(args_opt.batch_size) - print_per_steps = int(args_opt.print_per_steps) - compute_type = str(args_opt.dtype).lower() - ckpt_save_dir = str(args_opt.ckpt_path) - save_ckpt = bool(args_opt.save_ckpt) + epoch_size = int(config.epoch_size) + total_batch = int(config.batch_size) + print_per_steps = int(config.print_per_steps) + compute_type = str(config.dtype).lower() + save_ckpt = bool(config.save_ckpt) device_num = 1 # init context - if args_opt.mode == "GRAPH": + if config.mode_name == "GRAPH": mode = context.GRAPH_MODE all_reduce_fusion_config = [85, 160] else: mode = context.PYNATIVE_MODE all_reduce_fusion_config = [30, 90, 160] context.set_context(mode=mode, device_target=dev, save_graphs=False) - if args_opt.run_distribute: + ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path) + if config.run_distribute: init() device_num = get_group_size() context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, @@ -187,7 +173,7 @@ def train(): ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank()) + "/" # create dataset - dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=1, + dataset = 
create_dataset(dataset_path=config.data_path, do_train=True, repeat_num=1, batch_size=total_batch, target=dev, dtype=compute_type, device_num=device_num) step_size = dataset.get_dataset_size() if (print_per_steps > step_size or print_per_steps < 1): @@ -251,21 +237,21 @@ def train(): else: model.train(epoch_size, dataset, callbacks=cb) - +@moxing_wrapper() def eval_(): # set args dev = "GPU" - compute_type = str(args_opt.dtype).lower() - ckpt_dir = str(args_opt.ckpt_path) - total_batch = int(args_opt.batch_size) + compute_type = str(config.dtype).lower() + ckpt_dir = str(config.checkpoint_file_path) + total_batch = int(config.batch_size) # init context - if args_opt.mode == "GRAPH": + if config.mode_name == "GRAPH": mode = context.GRAPH_MODE else: mode = context.PYNATIVE_MODE context.set_context(mode=mode, device_target=dev, save_graphs=False) # create dataset - dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, repeat_num=1, + dataset = create_dataset(dataset_path=config.data_path, do_train=False, repeat_num=1, batch_size=total_batch, target=dev, dtype=compute_type) # define net net = resnet(class_num=1001, dtype=compute_type) @@ -284,7 +270,7 @@ def eval_(): if __name__ == '__main__': - if not args_opt.eval: + if not config.eval: train() else: eval_() diff --git a/model_zoo/official/cv/resnet/infer.py b/model_zoo/official/cv/resnet/infer.py index fc8e649019d..0a7af5e33c9 100644 --- a/model_zoo/official/cv/resnet/infer.py +++ b/model_zoo/official/cv/resnet/infer.py @@ -14,43 +14,27 @@ # ============================================================================ """train resnet.""" import os -import argparse import numpy as np from mindspore import Tensor from mindspore import context -from mindspore.common import set_seed from mindspore.train.serialization import load_checkpoint, load_param_into_net +from src.model_utils.config import config +from src.model_utils.moxing_adapter import moxing_wrapper -parser = 
argparse.ArgumentParser(description='Image classification') -parser.add_argument('--net', type=str, default=None, help='Resnet Model, either resnet18, ' - 'resnet50 or resnet101') -parser.add_argument('--dataset', type=str, default=None, help='Dataset, imagenet2012') - -parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path') -parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') -parser.add_argument('--device_target', type=str, default='Ascend', choices=("Ascend", "GPU", "CPU"), - help="Device target, support Ascend, GPU and CPU.") -args_opt = parser.parse_args() - -set_seed(1) - -if args_opt.dataset != "imagenet2012": +if config.dataset != "imagenet2012": raise ValueError("Currently only support imagenet2012 dataset format") -if args_opt.net in ("resnet18", "resnet50"): - if args_opt.net == "resnet18": +if config.net_name in ("resnet18", "resnet50"): + if config.net_name == "resnet18": from src.resnet import resnet18 as resnet - if args_opt.net == "resnet50": + if config.net_name == "resnet50": from src.resnet import resnet50 as resnet - from src.config import config2 as config from src.dataset_infer import create_dataset -elif args_opt.net == "resnet101": +elif config.net_name == "resnet101": from src.resnet import resnet101 as resnet - from src.config import config3 as config from src.dataset_infer import create_dataset2 as create_dataset else: from src.resnet import se_resnet50 as resnet - from src.config import config4 as config from src.dataset_infer import create_dataset3 as create_dataset @@ -67,9 +51,9 @@ def show_predict_info(label_list, prediction_list, filename_list, predict_ng): "label is {}".format(filename, predict_index, label_index)) return predict_ng, label_index - -if __name__ == '__main__': - target = args_opt.device_target +@moxing_wrapper() +def infer_net(): + target = config.device_target # init context context.set_context(mode=context.GRAPH_MODE, device_target=target, 
save_graphs=False) @@ -78,7 +62,7 @@ if __name__ == '__main__': context.set_context(device_id=device_id) # create dataset - dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size, + dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size, target=target) step_size = dataset.get_dataset_size() @@ -86,7 +70,7 @@ if __name__ == '__main__': net = resnet(class_num=config.class_num) # load checkpoint - param_dict = load_checkpoint(args_opt.checkpoint_path) + param_dict = load_checkpoint(config.checkpoint_file_path) load_param_into_net(net, param_dict) net.set_train(False) @@ -95,7 +79,7 @@ if __name__ == '__main__': total_sample = step_size * config.batch_size only_file = 0 data_loader = dataset.create_dict_iterator(output_numpy=True, num_epochs=1) - for i, data in enumerate(data_loader): + for _, data in enumerate(data_loader): images = data["image"] label = data["label"] file_name = data["filename"] @@ -109,3 +93,6 @@ if __name__ == '__main__': print(f"total {total_sample} data, top1 acc is {(total_sample - len(predict_negative)) * 1.0 / total_sample}") else: print("infer completed") + +if __name__ == '__main__': + infer_net() diff --git a/model_zoo/official/cv/resnet/postprocess.py b/model_zoo/official/cv/resnet/postprocess.py index 5f91bcc81eb..e63ce6cd7a8 100644 --- a/model_zoo/official/cv/resnet/postprocess.py +++ b/model_zoo/official/cv/resnet/postprocess.py @@ -17,7 +17,6 @@ import os import json import argparse import numpy as np -from src.config import config2 as config batch_size = 1 parser = argparse.ArgumentParser(description="resnet inference") @@ -63,14 +62,14 @@ def cal_acc_imagenet(result_path, label_path): files = os.listdir(result_path) with open(label_path, "r") as label: labels = json.load(label) - + result_shape = (1, 1001) top1 = 0 top5 = 0 total_data = len(files) for file in files: img_ids_name = file.split('_0.')[0] data_path = os.path.join(result_path, 
img_ids_name + "_0.bin") - result = np.fromfile(data_path, dtype=np.float32).reshape(batch_size, config.class_num) + result = np.fromfile(data_path, dtype=np.float32).reshape(result_shape) for batch in range(batch_size): predict = np.argsort(-result[batch], axis=-1) if labels[img_ids_name+".JPEG"] == predict[0]: diff --git a/model_zoo/official/cv/resnet/resnet101_imagenet2012_config.yaml b/model_zoo/official/cv/resnet/resnet101_imagenet2012_config.yaml new file mode 100644 index 00000000000..d2727eb5bff --- /dev/null +++ b/model_zoo/official/cv/resnet/resnet101_imagenet2012_config.yaml @@ -0,0 +1,75 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +run_distribute: False +enable_profiling: False +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path/" +device_target: 'Ascend' +checkpoint_path: './checkpoint/' +checkpoint_file_path: '' + +# ============================================================================== +# Training options +optimizer: 'Momentum' +infer_label: "" +class_num: 1001 +batch_size: 32 +loss_scale: 1024 +momentum: 0.9 +weight_decay: 0.0001 +epoch_size: 120 +pretrain_epoch_size: 0 +save_checkpoint: True +save_checkpoint_epochs: 5 +keep_checkpoint_max: 10 +warmup_epochs: 0 +lr_decay_mode: "cosine" +use_label_smooth: True +label_smooth_factor: 0.1 +lr: 0.1 + +net_name: "resnet101" +dataset: "imagenet2012" +device_num: 1 +pre_trained: "" +run_eval: False +eval_dataset_path: "" +parameter_server: False +filter_weight: False +save_best_ckpt: True +eval_start_epoch: 40 +eval_interval: 1 +enable_cache: False +cache_session_id: "" +mode_name: "GRAPH" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "resnet101" +file_format: "AIR" +ckpt_file: "" +network_dataset: "resnet101_imagenet2012" + +--- +# Help description for each 
configuration +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +checkpoint_url: 'The location of checkpoint for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' +load_path: 'The location of checkpoint for obs' +device_target: 'Target device type, available: [Ascend, GPU, CPU]' +enable_profiling: 'Whether enable profiling while training, default: False' +num_classes: 'Class for dataset' +batch_size: "Batch size for training and evaluation" +epoch_size: "Total training epochs." +checkpoint_path: "The location of the checkpoint file." +checkpoint_file_path: "The location of the checkpoint file." diff --git a/model_zoo/official/cv/resnet/resnet18_cifar10_config.yaml b/model_zoo/official/cv/resnet/resnet18_cifar10_config.yaml new file mode 100644 index 00000000000..1b6f7ba9e65 --- /dev/null +++ b/model_zoo/official/cv/resnet/resnet18_cifar10_config.yaml @@ -0,0 +1,75 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +run_distribute: False +enable_profiling: False +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path/" +device_target: 'Ascend' +checkpoint_path: './checkpoint/' +checkpoint_file_path: '' + +# ============================================================================== +# Training options +optimizer: 'Momentum' +infer_label: "" +class_num: 10 +batch_size: 32 +loss_scale: 1024 +momentum: 0.9 +weight_decay: 0.0001 +epoch_size: 90 +pretrain_epoch_size: 0 +save_checkpoint: True +save_checkpoint_epochs: 5 +keep_checkpoint_max: 10 +warmup_epochs: 5 +lr_decay_mode: "poly" +lr_init: 0.01 +lr_end: 0.00001 +lr_max: 0.1 + +net_name: "resnet18" +dataset: "cifar10" +device_num: 1 +pre_trained: "" +run_eval: False +eval_dataset_path: "" +parameter_server: False 
+filter_weight: False +save_best_ckpt: True +eval_start_epoch: 40 +eval_interval: 1 +enable_cache: False +cache_session_id: "" +mode_name: "GRAPH" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "resnet18" +file_format: "AIR" +ckpt_file: "" +network_dataset: "resnet18_cifar10" + +--- +# Help description for each configuration +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +checkpoint_url: 'The location of checkpoint for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' +load_path: 'The location of checkpoint for obs' +device_target: 'Target device type, available: [Ascend, GPU, CPU]' +enable_profiling: 'Whether enable profiling while training, default: False' +num_classes: 'Class for dataset' +batch_size: "Batch size for training and evaluation" +epoch_size: "Total training epochs." +checkpoint_path: "The location of the checkpoint file." +checkpoint_file_path: "The location of the checkpoint file." 
diff --git a/model_zoo/official/cv/resnet/resnet18_imagenet2012_config.yaml b/model_zoo/official/cv/resnet/resnet18_imagenet2012_config.yaml new file mode 100644 index 00000000000..b0834058f83 --- /dev/null +++ b/model_zoo/official/cv/resnet/resnet18_imagenet2012_config.yaml @@ -0,0 +1,77 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +run_distribute: False +enable_profiling: False +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path/" +device_target: 'Ascend' +checkpoint_path: './checkpoint/' +checkpoint_file_path: '' + +# ============================================================================== +# Training options +optimizer: 'Momentum' +infer_label: "" +class_num: 1001 +batch_size: 256 +loss_scale: 1024 +momentum: 0.9 +weight_decay: 0.0001 +epoch_size: 90 +pretrain_epoch_size: 0 +save_checkpoint: True +save_checkpoint_epochs: 5 +keep_checkpoint_max: 10 +warmup_epochs: 0 +lr_decay_mode: "linear" +use_label_smooth: True +label_smooth_factor: 0.1 +lr_init: 0 +lr_max: 0.8 +lr_end: 0.0 + +net_name: "resnet18" +dataset: "imagenet2012" +device_num: 1 +pre_trained: "" +run_eval: False +eval_dataset_path: "" +parameter_server: False +filter_weight: False +save_best_ckpt: True +eval_start_epoch: 40 +eval_interval: 1 +enable_cache: False +cache_session_id: "" +mode_name: "GRAPH" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "resnet18" +file_format: "AIR" +ckpt_file: "" +network_dataset: "resnet18_imagenet2012" + +--- +# Help description for each configuration +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +checkpoint_url: 'The location of checkpoint for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' +load_path: 'The location of checkpoint for 
obs' +device_target: 'Target device type, available: [Ascend, GPU, CPU]' +enable_profiling: 'Whether enable profiling while training, default: False' +num_classes: 'Class for dataset' +batch_size: "Batch size for training and evaluation" +epoch_size: "Total training epochs." +checkpoint_path: "The location of the checkpoint file." +checkpoint_file_path: "The location of the checkpoint file." diff --git a/model_zoo/official/cv/resnet/resnet34_imagenet2012_config.yaml b/model_zoo/official/cv/resnet/resnet34_imagenet2012_config.yaml new file mode 100644 index 00000000000..41e029b2fa1 --- /dev/null +++ b/model_zoo/official/cv/resnet/resnet34_imagenet2012_config.yaml @@ -0,0 +1,77 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +run_distribute: False +enable_profiling: False +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path/" +device_target: 'Ascend' +checkpoint_path: './checkpoint/' +checkpoint_file_path: '' + +# ============================================================================== +# Training options +optimizer: 'Momentum' +infer_label: "" +class_num: 1001 +batch_size: 256 +loss_scale: 1024 +momentum: 0.9 +weight_decay: 0.0001 +epoch_size: 90 +pretrain_epoch_size: 0 +save_checkpoint: True +save_checkpoint_epochs: 5 +keep_checkpoint_max: 10 +warmup_epochs: 0 +lr_decay_mode: "linear" +use_label_smooth: True +label_smooth_factor: 0.1 +lr_init: 0 +lr_max: 0.8 +lr_end: 0.0 + +net_name: "resnet34" +dataset: "imagenet2012" +device_num: 1 +pre_trained: "" +run_eval: False +eval_dataset_path: "" +parameter_server: False +filter_weight: False +save_best_ckpt: True +eval_start_epoch: 40 +eval_interval: 1 +enable_cache: False +cache_session_id: "" +mode_name: "GRAPH" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "resnet34" +file_format: 
"AIR" +ckpt_file: "" +network_dataset: "resnet34_imagenet2012" + +--- +# Help description for each configuration +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +checkpoint_url: 'The location of checkpoint for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' +load_path: 'The location of checkpoint for obs' +device_target: 'Target device type, available: [Ascend, GPU, CPU]' +enable_profiling: 'Whether enable profiling while training, default: False' +num_classes: 'Class for dataset' +batch_size: "Batch size for training and evaluation" +epoch_size: "Total training epochs." +checkpoint_path: "The location of the checkpoint file." +checkpoint_file_path: "The location of the checkpoint file." diff --git a/model_zoo/official/cv/resnet/resnet50_cifar10_config.yaml b/model_zoo/official/cv/resnet/resnet50_cifar10_config.yaml new file mode 100644 index 00000000000..f0a1e5cdbd6 --- /dev/null +++ b/model_zoo/official/cv/resnet/resnet50_cifar10_config.yaml @@ -0,0 +1,75 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +run_distribute: False +enable_profiling: False +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path/" +device_target: 'Ascend' +checkpoint_path: './checkpoint/' +checkpoint_file_path: '' + +# ============================================================================== +# Training options +optimizer: 'Momentum' +infer_label: "" +class_num: 10 +batch_size: 32 +loss_scale: 1024 +momentum: 0.9 +weight_decay: 0.0001 +epoch_size: 90 +pretrain_epoch_size: 0 +save_checkpoint: True +save_checkpoint_epochs: 5 +keep_checkpoint_max: 10 +warmup_epochs: 5 +lr_decay_mode: "poly" +lr_init: 0.01 +lr_end: 0.00001 +lr_max: 0.1 + +net_name: "resnet50" +dataset: "cifar10" +device_num: 
1 +pre_trained: "" +run_eval: False +eval_dataset_path: "" +parameter_server: False +filter_weight: False +save_best_ckpt: True +eval_start_epoch: 40 +eval_interval: 1 +enable_cache: False +cache_session_id: "" +mode_name: "GRAPH" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "resnet50" +file_format: "AIR" +ckpt_file: "" +network_dataset: "resnet50_cifar10" + +--- +# Help description for each configuration +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +checkpoint_url: 'The location of checkpoint for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' +load_path: 'The location of checkpoint for obs' +device_target: 'Target device type, available: [Ascend, GPU, CPU]' +enable_profiling: 'Whether enable profiling while training, default: False' +num_classes: 'Class for dataset' +batch_size: "Batch size for training and evaluation" +epoch_size: "Total training epochs." +checkpoint_path: "The location of the checkpoint file." +checkpoint_file_path: "The location of the checkpoint file." 
diff --git a/model_zoo/official/cv/resnet/resnet50_imagenet2012_Ascend_config.yaml b/model_zoo/official/cv/resnet/resnet50_imagenet2012_Ascend_config.yaml new file mode 100644 index 00000000000..b9a0037e978 --- /dev/null +++ b/model_zoo/official/cv/resnet/resnet50_imagenet2012_Ascend_config.yaml @@ -0,0 +1,78 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +run_distribute: False +enable_profiling: False +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path/" +device_target: 'Ascend' +checkpoint_path: './checkpoint/' +checkpoint_file_path: '' + +# ============================================================================== +# Training options +optimizer: 'Momentum' +infer_label: "" +class_num: 1001 +batch_size: 32 +loss_scale: 128 +momentum: 0.9 +weight_decay: 0.0005 +epoch_size: 45 +pretrain_epoch_size: 0 +save_checkpoint: True +save_checkpoint_epochs: 2 +keep_checkpoint_max: 15 +use_label_smooth: True +label_smooth_factor: 0.1 +lr_init: 0.05803 +lr_decay: 4.04839 +lr_end_epoch: 53 +damping_init: 0.02714 +damping_decay: 0.50036 +frequency: 834 + +net_name: "resnet50" +dataset: "imagenet2012" +device_num: 1 +pre_trained: "" +run_eval: False +eval_dataset_path: "" +parameter_server: False +filter_weight: False +save_best_ckpt: True +eval_start_epoch: 40 +eval_interval: 1 +enable_cache: False +cache_session_id: "" +mode_name: "GRAPH" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "resnet50" +file_format: "AIR" +ckpt_file: "" +network_dataset: "resnet50_imagenet2012" + +--- +# Help description for each configuration +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +checkpoint_url: 'The location of checkpoint for obs' +data_path: 'Dataset path for local' +output_path: 'Training output 
path for local' +load_path: 'Checkpoint path for local' +device_target: 'Target device type, available: [Ascend, GPU, CPU]' +enable_profiling: 'Whether enable profiling while training, default: False' +class_num: 'Class for dataset' +batch_size: "Batch size for training and evaluation" +epoch_size: "Total training epochs." +checkpoint_path: "The location of the checkpoint file." +checkpoint_file_path: "The location of the checkpoint file." diff --git a/model_zoo/official/cv/resnet/resnet50_imagenet2012_GPU_config.yaml b/model_zoo/official/cv/resnet/resnet50_imagenet2012_GPU_config.yaml new file mode 100644 index 00000000000..4fe6e61905a --- /dev/null +++ b/model_zoo/official/cv/resnet/resnet50_imagenet2012_GPU_config.yaml @@ -0,0 +1,78 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +run_distribute: False +enable_profiling: False +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path/" +device_target: 'Ascend' +checkpoint_path: './checkpoint/' +checkpoint_file_path: '' + +# ============================================================================== +# Training options +optimizer: 'Thor' +infer_label: "" +class_num: 1001 +batch_size: 32 +loss_scale: 128 +momentum: 0.9 +weight_decay: 0.0005 +epoch_size: 40 +pretrain_epoch_size: 0 +save_checkpoint: True +save_checkpoint_epochs: 1 +keep_checkpoint_max: 15 +use_label_smooth: True +label_smooth_factor: 0.1 +lr_init: 0.05672 +lr_decay: 4.9687 +lr_end_epoch: 50 +damping_init: 0.02345 +damping_decay: 0.5467 +frequency: 834 + +net_name: "resnet50" +dataset: "imagenet2012" +device_num: 1 +pre_trained: "" +run_eval: False +eval_dataset_path: "" +parameter_server: False +filter_weight: False +save_best_ckpt: True +eval_start_epoch: 40 +eval_interval: 1 +enable_cache: False +cache_session_id: "" +mode_name:
"GRAPH" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "resnet50" +file_format: "AIR" +ckpt_file: "" +network_dataset: "resnet50_imagenet2012" + +--- +# Help description for each configuration +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +checkpoint_url: 'The location of checkpoint for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' +load_path: 'Checkpoint path for local' +device_target: 'Target device type, available: [Ascend, GPU, CPU]' +enable_profiling: 'Whether enable profiling while training, default: False' +class_num: 'Class for dataset' +batch_size: "Batch size for training and evaluation" +epoch_size: "Total training epochs." +checkpoint_path: "The location of the checkpoint file." +checkpoint_file_path: "The location of the checkpoint file." diff --git a/model_zoo/official/cv/resnet/resnet50_imagenet2012_config.yaml b/model_zoo/official/cv/resnet/resnet50_imagenet2012_config.yaml new file mode 100644 index 00000000000..b61f43c9caf --- /dev/null +++ b/model_zoo/official/cv/resnet/resnet50_imagenet2012_config.yaml @@ -0,0 +1,77 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +run_distribute: False +enable_profiling: False +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path/" +device_target: 'Ascend' +checkpoint_path: './checkpoint/' +checkpoint_file_path: '' + +# ============================================================================== +# Training options +optimizer: 'Momentum' +infer_label: "" +class_num: 1001 +batch_size: 256 +loss_scale: 1024 +momentum: 0.9 +weight_decay: 0.0001 +epoch_size: 90 +pretrain_epoch_size: 0 +save_checkpoint: True +save_checkpoint_epochs: 5 +keep_checkpoint_max: 10 +warmup_epochs: 0
+lr_decay_mode: "linear" +use_label_smooth: True +label_smooth_factor: 0.1 +lr_init: 0 +lr_max: 0.8 +lr_end: 0.0 + +net_name: "resnet50" +dataset: "imagenet2012" +device_num: 1 +pre_trained: "" +run_eval: False +eval_dataset_path: "" +parameter_server: False +filter_weight: False +save_best_ckpt: True +eval_start_epoch: 40 +eval_interval: 1 +enable_cache: False +cache_session_id: "" +mode_name: "GRAPH" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "resnet50" +file_format: "AIR" +ckpt_file: "" +network_dataset: "resnet50_imagenet2012" + +--- +# Help description for each configuration +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +checkpoint_url: 'The location of checkpoint for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' +load_path: 'Checkpoint path for local' +device_target: 'Target device type, available: [Ascend, GPU, CPU]' +enable_profiling: 'Whether enable profiling while training, default: False' +class_num: 'Class for dataset' +batch_size: "Batch size for training and evaluation" +epoch_size: "Total training epochs." +checkpoint_path: "The location of the checkpoint file." +checkpoint_file_path: "The location of the checkpoint file."
diff --git a/model_zoo/official/cv/resnet/resnet_benchmark_GPU.yaml b/model_zoo/official/cv/resnet/resnet_benchmark_GPU.yaml new file mode 100644 index 00000000000..157adf58cc4 --- /dev/null +++ b/model_zoo/official/cv/resnet/resnet_benchmark_GPU.yaml @@ -0,0 +1,52 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +run_distribute: False +enable_profiling: False +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path/" +device_target: 'Ascend' +checkpoint_path: './checkpoint/' +checkpoint_file_path: '' + +# ============================================================================== +# Training options +optimizer: 'Momentum' +infer_label: "" +batch_size: 256 +epoch_size: 2 +print_per_steps: 20 +eval: False +save_ckpt: False +mode_name: "GRAPH" +dtype: "fp16" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "resnet" +file_format: "AIR" +ckpt_file: "" +network_dataset: "resnet50_cifar10" + +--- +# Help description for each configuration +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +checkpoint_url: 'The location of checkpoint for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' +load_path: 'The location of checkpoint for obs' +device_target: 'Target device type, available: [Ascend, GPU, CPU]' +enable_profiling: 'Whether enable profiling while training, default: False' +num_classes: 'Class for dataset' +batch_size: "Batch size for training and evaluation" +epoch_size: "Total training epochs." +checkpoint_path: "The location of the checkpoint file." +checkpoint_file_path: "The location of the checkpoint file." 
diff --git a/model_zoo/official/cv/resnet/scripts/run_distribute_train.sh b/model_zoo/official/cv/resnet/scripts/run_distribute_train.sh index 66a7ac411df..017c3cffcd3 100755 --- a/model_zoo/official/cv/resnet/scripts/run_distribute_train.sh +++ b/model_zoo/official/cv/resnet/scripts/run_distribute_train.sh @@ -17,37 +17,13 @@ CURPATH="$(dirname "$0")" . ${CURPATH}/cache_util.sh -if [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ] +if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] then - echo "Usage: bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" - echo " bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" + echo " bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" exit 1 fi -if [ $1 != "resnet18" ] && [ $1 != "resnet34" ] && [ $1 != "resnet50" ] && [ $1 != "resnet101" ] && [ $1 != "se-resnet50" ] -then - echo "error: the selected net is neither resnet50 nor resnet101 and se-resnet50" -exit 1 -fi - -if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ] -then - echo "error: the selected dataset is neither cifar10 nor imagenet2012" -exit 1 -fi - -if [ $1 == "resnet101" ] && [ $2 == "cifar10" ] -then - echo "error: training resnet101 with cifar10 dataset is unsupported now!" -exit 1 -fi - -if [ $1 == "se-resnet50" ] && [ $2 == "cifar10" ] -then - echo "error: evaluating se-resnet50 with cifar10 dataset is unsupported now!" 
-exit 1 -fi - get_real_path(){ if [ "${1:0:1}" == "/" ]; then echo "$1" @@ -56,18 +32,19 @@ get_real_path(){ fi } -PATH1=$(get_real_path $3) -PATH2=$(get_real_path $4) +PATH1=$(get_real_path $1) +PATH2=$(get_real_path $2) +CONFIG_FILE=$3 -if [ $# == 5 ] +if [ $# == 4 ] then - PATH3=$(get_real_path $5) + PATH3=$(get_real_path $4) fi -if [ $# == 6 ] +if [ $# == 5 ] then - RUN_EVAL=$5 - EVAL_DATASET_PATH=$(get_real_path $6) + RUN_EVAL=$4 + EVAL_DATASET_PATH=$(get_real_path $5) fi if [ ! -f $PATH1 ] @@ -82,7 +59,7 @@ then exit 1 fi -if [ $# == 5 ] && [ ! -f $PATH3 ] +if [ $# == 4 ] && [ ! -f $PATH3 ] then echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file" exit 1 @@ -123,24 +100,28 @@ do mkdir ./train_parallel$i cp ../*.py ./train_parallel$i cp *.sh ./train_parallel$i + cp -r ../*.yaml ./train_parallel$i cp -r ../src ./train_parallel$i cd ./train_parallel$i || exit echo "start training for rank $RANK_ID, device $DEVICE_ID" env > env.log - if [ $# == 4 ] + if [ $# == 3 ] then - taskset -c $cmdopt python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log & + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 \ + --config_path=$CONFIG_FILE --output_path './output' &> log & fi - if [ $# == 5 ] + if [ $# == 4 ] then - taskset -c $cmdopt python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 &> log & + taskset -c $cmdopt python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 --pre_trained=$PATH3 \ + --config_path=$CONFIG_FILE --output_path './output' &> log & fi - if [ $# == 6 ] + if [ $# == 5 ] then - taskset -c $cmdopt python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 \ - --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID &> log & + taskset -c 
$cmdopt python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 \ + --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID \ + --config_path=$CONFIG_FILE --output_path './output' &> log & if [ "x${RUN_EVAL}" == "xTrue" ] then echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" diff --git a/model_zoo/official/cv/resnet/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/resnet/scripts/run_distribute_train_gpu.sh index 4d7217635bc..ebd382a6a57 100755 --- a/model_zoo/official/cv/resnet/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/cv/resnet/scripts/run_distribute_train_gpu.sh @@ -17,32 +17,13 @@ CURPATH="$(dirname "$0")" . ${CURPATH}/cache_util.sh -if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] +if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] then - echo "Usage: bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" - echo " bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" + echo "Usage: bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" + echo " bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" exit 1 fi -if [ $1 != "resnet50" ] && [ $1 != "resnet101" ] -then - echo "error: the selected net is neither resnet50 nor resnet101" - exit 1 -fi - -if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ] -then - echo "error: the selected dataset is neither cifar10 nor imagenet2012" - exit 1 -fi - -if [ $1 == "resnet101" ] && [ $2 == "cifar10" ] -then - echo "error: training resnet101 with cifar10 dataset is unsupported now!"
- exit 1 -fi - - get_real_path(){ if [ "${1:0:1}" == "/" ]; then echo "$1" @@ -51,17 +32,18 @@ get_real_path(){ fi } -PATH1=$(get_real_path $3) +PATH1=$(get_real_path $1) +CONFIG_FILE=$2 -if [ $# == 4 ] +if [ $# == 3 ] then - PATH2=$(get_real_path $4) + PATH2=$(get_real_path $3) fi -if [ $# == 5 ] +if [ $# == 4 ] then - RUN_EVAL=$4 - EVAL_DATASET_PATH=$(get_real_path $5) + RUN_EVAL=$3 + EVAL_DATASET_PATH=$(get_real_path $4) fi @@ -71,7 +53,7 @@ then exit 1 fi -if [ $# == 5 ] && [ ! -f $PATH2 ] +if [ $# == 3 ] && [ ! -f $PATH2 ] then echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" exit 1 @@ -97,29 +79,30 @@ rm -rf ./train_parallel mkdir ./train_parallel cp ../*.py ./train_parallel cp *.sh ./train_parallel +cp -r ../*.yaml ./train_parallel cp -r ../src ./train_parallel cd ./train_parallel || exit +if [ $# == 2 ] +then + mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ + python train.py --config_path=$CONFIG_FILE --run_distribute=True --device_num=$DEVICE_NUM \ + --device_target="GPU" --data_path=$PATH1 --output_path './output' &> log & +fi + if [ $# == 3 ] then mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ - python train.py --net=$1 --dataset=$2 --run_distribute=True \ - --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & -fi - -if [ $# == 4 ] -then - mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ - python train.py --net=$1 --dataset=$2 --run_distribute=True \ - --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & + python train.py --config_path=$CONFIG_FILE --run_distribute=True --device_num=$DEVICE_NUM \ + --device_target="GPU" --data_path=$PATH1 --pre_trained=$PATH2 --output_path './output' &> log & fi -if [ $# == 5 ] +if [ $# == 4 ] then mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ -
python train.py --net=$1 --dataset=$2 --run_distribute=True \ - --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --run_eval=$RUN_EVAL \ - --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID &> log & + python train.py --config_path=$CONFIG_FILE --run_distribute=True --device_num=$DEVICE_NUM \ + --device_target="GPU" --data_path=$PATH1 --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH \ + --enable_cache=True --cache_session_id=$CACHE_SESSION_ID --output_path './output' &> log & if [ "x${RUN_EVAL}" == "xTrue" ] then echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" diff --git a/model_zoo/official/cv/resnet/scripts/run_eval.sh b/model_zoo/official/cv/resnet/scripts/run_eval.sh index 8759581e64e..85c75682c3b 100755 --- a/model_zoo/official/cv/resnet/scripts/run_eval.sh +++ b/model_zoo/official/cv/resnet/scripts/run_eval.sh @@ -14,33 +14,9 @@ # limitations under the License. # ============================================================================ -if [ $# != 4 ] +if [ $# != 3 ] then - echo "Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]" -exit 1 -fi - -if [ $1 != "resnet18" ] && [ $1 != "resnet34" ] && [ $1 != "resnet50" ] && [ $1 != "resnet101" ] && [ $1 != "se-resnet50" ] -then - echo "error: the selected net is neither resnet50 nor resnet101 nor se-resnet50" -exit 1 -fi - -if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ] -then - echo "error: the selected dataset is neither cifar10 nor imagenet2012" -exit 1 -fi - -if [ $1 == "resnet101" ] && [ $2 == "cifar10" ] -then - echo "error: evaluating resnet101 with cifar10 dataset is unsupported now!" -exit 1 -fi - -if [ $1 == "se-resnet50" ] && [ $2 == "cifar10" ] -then - echo "error: evaluating se-resnet50 with cifar10 dataset is unsupported now!"
+ echo "Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]" exit 1 fi @@ -52,8 +28,9 @@ get_real_path(){ fi } -PATH1=$(get_real_path $3) -PATH2=$(get_real_path $4) +PATH1=$(get_real_path $1) +PATH2=$(get_real_path $2) +CONFIG_FILE=$3 if [ ! -d $PATH1 ] @@ -81,9 +58,10 @@ fi mkdir ./eval cp ../*.py ./eval cp *.sh ./eval +cp -r ../*.yaml ./eval cp -r ../src ./eval cd ./eval || exit env > env.log echo "start evaluation for device $DEVICE_ID" -python eval.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --checkpoint_path=$PATH2 &> log & +python eval.py --data_path=$PATH1 --checkpoint_file_path=$PATH2 --config_path=$CONFIG_FILE &> log & cd .. diff --git a/model_zoo/official/cv/resnet/scripts/run_eval_gpu.sh b/model_zoo/official/cv/resnet/scripts/run_eval_gpu.sh index 0247eecbe5a..ed93cb09c08 100755 --- a/model_zoo/official/cv/resnet/scripts/run_eval_gpu.sh +++ b/model_zoo/official/cv/resnet/scripts/run_eval_gpu.sh @@ -14,31 +14,12 @@ # limitations under the License. # ============================================================================ -if [ $# != 4 ] +if [ $# != 3 ] then - echo "Usage: bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]" + echo "Usage: bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]" exit 1 fi -if [ $1 != "resnet50" ] && [ $1 != "resnet101" ] -then - echo "error: the selected net is neither resnet50 nor resnet101" -exit 1 -fi - -if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ] -then - echo "error: the selected dataset is neither cifar10 nor imagenet2012" -exit 1 -fi - -if [ $1 == "resnet101" ] && [ $2 == "cifar10" ] -then - echo "error: evaluating resnet101 with cifar10 dataset is unsupported now!" -exit 1 -fi - - get_real_path(){ if [ "${1:0:1}" == "/" ]; then echo "$1" @@ -47,8 +28,9 @@ get_real_path(){ fi } -PATH1=$(get_real_path $3) -PATH2=$(get_real_path $4) +PATH1=$(get_real_path $1) +PATH2=$(get_real_path $2) +CONFIG_FILE=$3 if [ ! 
-d $PATH1 ] @@ -76,9 +58,11 @@ fi mkdir ./eval cp ../*.py ./eval cp *.sh ./eval +cp -r ../*.yaml ./eval cp -r ../src ./eval cd ./eval || exit env > env.log echo "start evaluation for device $DEVICE_ID" -python eval.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log & +python eval.py --data_path=$PATH1 --checkpoint_file_path=$PATH2 --device_target="GPU" \ +--config_path=$CONFIG_FILE &> log & cd .. diff --git a/model_zoo/official/cv/resnet/scripts/run_eval_gpu_resnet_benckmark.sh b/model_zoo/official/cv/resnet/scripts/run_eval_gpu_resnet_benckmark.sh index ba9d14004d7..d120da91f07 100644 --- a/model_zoo/official/cv/resnet/scripts/run_eval_gpu_resnet_benckmark.sh +++ b/model_zoo/official/cv/resnet/scripts/run_eval_gpu_resnet_benckmark.sh @@ -14,11 +14,11 @@ # limitations under the License. # ============================================================================ -if [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] +if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ] then - echo "Usage: bash run_eval_gpu_resnet_benchmark.sh [DATASET_PATH] [CKPT_PATH] [BATCH_SIZE](optional) \ + echo "Usage: bash run_eval_gpu_resnet_benchmark.sh [DATASET_PATH] [CKPT_PATH] [CONFIG_PATH] [BATCH_SIZE](optional) \ [DTYPE](optional)" - echo "Example: sh run_eval_gpu_resnet_benchmark.sh /path/imagenet/train /path/ckpt 256 FP16" + echo "Example: sh run_eval_gpu_resnet_benchmark.sh /path/imagenet/train /path/ckpt /*.yaml 256 FP16" exit 1 fi @@ -33,19 +33,19 @@ get_real_path(){ DATAPATH=$(get_real_path $1) script_self=$(readlink -f "$0") self_path=$(dirname "${script_self}") -if [ $# == 2 ] -then - python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH --eval=True --ckpt_path=$2 -fi - if [ $# == 3 ] then - python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH --eval=True --ckpt_path=$2 \ - --batch_size=$3 + python ${self_path}/../gpu_resnet_benchmark.py 
--data_path=$DATAPATH --eval=True --checkpoint_file_path=$2 --config_path=$3 fi if [ $# == 4 ] then - python ${self_path}/../gpu_resnet_benchmark.py--dataset_path=$DATAPATH --eval=True --ckpt_path=$2 \ - --batch_size=$3 --dtype=$4 + python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --eval=True --checkpoint_file_path=$2 \ + --config_path=$3 --batch_size=$4 +fi + +if [ $# == 5 ] +then + python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --eval=True --checkpoint_file_path=$2 \ + --config_path=$3 --batch_size=$4 --dtype=$5 fi diff --git a/model_zoo/official/cv/resnet/scripts/run_gpu_resnet_benchmark.sh b/model_zoo/official/cv/resnet/scripts/run_gpu_resnet_benchmark.sh index 0989ddf9f95..65224562a2c 100644 --- a/model_zoo/official/cv/resnet/scripts/run_gpu_resnet_benchmark.sh +++ b/model_zoo/official/cv/resnet/scripts/run_gpu_resnet_benchmark.sh @@ -14,11 +14,11 @@ # limitations under the License. # ============================================================================ -if [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] +if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ] && [ $# != 7 ] then - echo "Usage: bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](optional)\ + echo "Usage: bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [CONFIG_PATH] [BATCH_SIZE](optional) [DTYPE](optional)\ [DEVICE_NUM](optional) [SAVE_CKPT](optional) [SAVE_PATH](optional)" - echo "Example: sh run_gpu_resnet_benchmark.sh /path/imagenet/train 256 FP16 8 true /path/ckpt" + echo "Example: sh run_gpu_resnet_benchmark.sh /path/imagenet/train /*yaml 256 FP16 8 true /path/ckpt" exit 1 fi @@ -33,35 +33,35 @@ get_real_path(){ DATAPATH=$(get_real_path $1) script_self=$(readlink -f "$0") self_path=$(dirname "${script_self}") -if [ $# == 1 ] -then - python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH -fi - if [ $# == 2 ] then - python ${self_path}/../gpu_resnet_benchmark.py
--dataset_path=$DATAPATH --batch_size=$2 + python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --config_path=$2 fi if [ $# == 3 ] then - python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH --batch_size=$2 --dtype=$3 + python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --config_path=$2 --batch_size=$3 fi if [ $# == 4 ] then - mpirun --allow-run-as-root -n $4 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \ - --dataset_path=$DATAPATH --batch_size=$2 --dtype=$3 + python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --config_path=$2 --batch_size=$3 --dtype=$4 fi if [ $# == 5 ] then - mpirun --allow-run-as-root -n $4 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \ - --dataset_path=$DATAPATH --batch_size=$2 --dtype=$3 --save_ckpt=$5 + mpirun --allow-run-as-root -n $5 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \ + --data_path=$DATAPATH --config_path=$2 --batch_size=$3 --dtype=$4 fi if [ $# == 6 ] then - mpirun --allow-run-as-root -n $4 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \ - --dataset_path=$DATAPATH --batch_size=$2 --dtype=$3 --save_ckpt=$5 --ckpt_path=$6 + mpirun --allow-run-as-root -n $5 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \ + --data_path=$DATAPATH --config_path=$2 --batch_size=$3 --dtype=$4 --save_ckpt=$6 +fi + +if [ $# == 7 ] +then + mpirun --allow-run-as-root -n $5 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \ + --data_path=$DATAPATH --config_path=$2 --batch_size=$3 --dtype=$4 --save_ckpt=$6 --checkpoint_file_path=$7 fi diff --git a/model_zoo/official/cv/resnet/scripts/run_infer.sh b/model_zoo/official/cv/resnet/scripts/run_infer.sh index daf9054bb0e..34ae0fadadc 100644 --- a/model_zoo/official/cv/resnet/scripts/run_infer.sh +++ b/model_zoo/official/cv/resnet/scripts/run_infer.sh @@ -14,25 +14,12 @@ # limitations under the License. 
# ============================================================================ -if [ $# != 4 ] +if [ $# != 3 ] then - echo "Usage: bash run_eval.sh [resnet18|resnet50|resnet101|se-resnet50] [imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]" + echo "Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]" exit 1 fi -if [ $1 != "resnet18" ] && [ $1 != "resnet50" ] && [ $1 != "resnet101" ] && [ $1 != "se-resnet50" ] -then - echo "error: the selected net is neither resnet50 nor resnet101 nor se-resnet50" -exit 1 -fi - -if [ $2 != "imagenet2012" ] -then - echo "error: only support imagenet2012" -exit 1 -fi - - get_real_path(){ if [ "${1:0:1}" == "/" ]; then echo "$1" @@ -41,8 +28,9 @@ get_real_path(){ fi } -PATH1=$(get_real_path $3) -PATH2=$(get_real_path $4) +PATH1=$(get_real_path $1) +PATH2=$(get_real_path $2) +CONFIG_FILE=$3 if [ ! -d $PATH1 ] @@ -68,11 +56,12 @@ then rm -rf ./infer fi mkdir ./infer +cp ../*.yaml ./infer cp ../*.py ./infer cp *.sh ./infer cp -r ../src ./infer cd ./infer || exit env > env.log echo "start evaluation for device $DEVICE_ID" -python infer.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --checkpoint_path=$PATH2 &> log & +python infer.py --data_path=$PATH1 --checkpoint_file_path=$PATH2 --config_path=$CONFIG_FILE &> log & cd .. diff --git a/model_zoo/official/cv/resnet/scripts/run_parameter_server_train.sh b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train.sh index 4d76251e850..e3dd2d6372a 100644 --- a/model_zoo/official/cv/resnet/scripts/run_parameter_server_train.sh +++ b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train.sh @@ -14,31 +14,12 @@ # limitations under the License. 
# ============================================================================ -if [ $# != 4 ] && [ $# != 5 ] +if [ $# != 3 ] && [ $# != 4 ] then - echo "Usage: bash run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" + echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" exit 1 fi -if [ $1 != "resnet50" ] && [ $1 != "resnet101" ] -then - echo "error: the selected net is neither resnet50 nor resnet101" - exit 1 -fi - -if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ] -then - echo "error: the selected dataset is neither cifar10 nor imagenet2012" - exit 1 -fi - -if [ $1 == "resnet101" ] && [ $2 == "cifar10" ] -then - echo "error: training resnet101 with cifar10 dataset is unsupported now!" - exit 1 -fi - - get_real_path(){ if [ "${1:0:1}" == "/" ]; then echo "$1" @@ -47,12 +28,13 @@ get_real_path(){ fi } -PATH1=$(get_real_path $3) -PATH2=$(get_real_path $4) +PATH1=$(get_real_path $1) +PATH2=$(get_real_path $2) +CONFIG_FILE=$3 -if [ $# == 5 ] +if [ $# == 4 ] then - PATH3=$(get_real_path $5) + PATH3=$(get_real_path $4) fi if [ ! -f $PATH1 ] @@ -67,7 +49,7 @@ then exit 1 fi -if [ $# == 5 ] && [ ! -f $PATH3 ] +if [ $# == 4 ] && [ ! 
-f $PATH3 ] then echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file" exit 1 @@ -89,19 +71,22 @@ export DEVICE_ID=0 export RANK_ID=0 rm -rf ./sched mkdir ./sched +cp ../*.yaml ./sched cp ../*.py ./sched cp *.sh ./sched cp -r ../src ./sched cd ./sched || exit echo "start scheduler" -if [ $# == 4 ] +if [ $# == 3 ] then - python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True &> sched.log & + python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True \ + --config_path=$CONFIG_FILE --output_path './output' &> sched.log & fi -if [ $# == 5 ] +if [ $# == 4 ] then - python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> sched.log & + python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 \ + --config_path=$CONFIG_FILE --output_path './output' &> sched.log & fi cd .. 
@@ -112,19 +97,22 @@ do export RANK_ID=$i rm -rf ./server_$i mkdir ./server_$i + cp ../*.yaml ./server_$i cp ../*.py ./server_$i cp *.sh ./server_$i cp -r ../src ./server_$i cd ./server_$i || exit echo "start server" - if [ $# == 4 ] + if [ $# == 3 ] then - python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True &> server_$i.log & + python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True \ + --config_path=$CONFIG_FILE --output_path './output' &> server_$i.log & fi - if [ $# == 5 ] + if [ $# == 4 ] then - python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> server_$i.log & + python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 \ + --config_path=$CONFIG_FILE --output_path './output' &> server_$i.log & fi cd .. @@ -137,20 +125,23 @@ do export RANK_ID=$i rm -rf ./worker_$i mkdir ./worker_$i + cp ../*.yaml ./worker_$i cp ../*.py ./worker_$i cp *.sh ./worker_$i cp -r ../src ./worker_$i cd ./worker_$i || exit echo "start training for worker rank $RANK_ID, device $DEVICE_ID" env > env.log - if [ $# == 4 ] + if [ $# == 3 ] then - python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --parameter_server=True &> worker_$i.log & + python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 --parameter_server=True \ + --config_path=$CONFIG_FILE --output_path './output' &> worker_$i.log & fi - if [ $# == 5 ] + if [ $# == 4 ] then - python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> worker_$i.log & + python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 \ + 
--config_path=$CONFIG_FILE --output_path './output' &> worker_$i.log & fi cd .. diff --git a/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh index ca3a654ab7c..ba83f209644 100755 --- a/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh +++ b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh @@ -14,31 +14,12 @@ # limitations under the License. # ============================================================================ -if [ $# != 3 ] && [ $# != 4 ] +if [ $# != 2 ] && [ $# != 3 ] then - echo "Usage: bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" + echo "Usage: bash run_parameter_server_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" exit 1 fi -if [ $1 != "resnet50" ] && [ $1 != "resnet101" ] -then - echo "error: the selected net is neither resnet50 nor resnet101" - exit 1 -fi - -if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ] -then - echo "error: the selected dataset is neither cifar10 nor imagenet2012" - exit 1 -fi - -if [ $1 == "resnet101" ] && [ $2 == "cifar10" ] -then - echo "error: training resnet101 with cifar10 dataset is unsupported now!" - exit 1 -fi - - get_real_path(){ if [ "${1:0:1}" == "/" ]; then echo "$1" @@ -47,11 +28,11 @@ get_real_path(){ fi } -PATH1=$(get_real_path $3) - -if [ $# == 4 ] +PATH1=$(get_real_path $1) +CONFIG_FILE=$2 +if [ $# == 3 ] then - PATH2=$(get_real_path $4) + PATH2=$(get_real_path $3) fi @@ -61,7 +42,7 @@ then exit 1 fi -if [ $# == 5 ] && [ ! -f $PATH2 ] +if [ $# == 3 ] && [ !
-f $PATH2 ] then echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" exit 1 @@ -79,22 +60,23 @@ export MS_SCHED_PORT=8081 export MS_ROLE=MS_SCHED rm -rf ./sched mkdir ./sched +cp ../*.yaml ./sched cp ../*.py ./sched cp *.sh ./sched cp -r ../src ./sched cd ./sched || exit +if [ $# == 2 ] +then + mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \ + python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \ + --data_path=$PATH1 --parameter_server=True --config_path=$CONFIG_FILE --output_path './output' &> sched.log & +fi + if [ $# == 3 ] then mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \ - python train.py --net=$1 --dataset=$2 --run_distribute=True \ - --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> sched.log & -fi - -if [ $# == 4 ] -then - mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \ - python train.py --net=$1 --dataset=$2 --run_distribute=True \ - --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> sched.log & + python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \ + --data_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 --config_path=$CONFIG_FILE --output_path './output' &> sched.log & fi cd .. 
@@ -103,22 +85,24 @@ for((i=0;i<$MS_SERVER_NUM;i++)); do rm -rf ./server_$i mkdir ./server_$i + cp ../*.yaml ./server_$i cp ../*.py ./server_$i cp *.sh ./server_$i cp -r ../src ./server_$i cd ./server_$i || exit + if [ $# == 2 ] + then + mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \ + python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \ + --data_path=$PATH1 --parameter_server=True --config_path=$CONFIG_FILE --output_path './output' &> server_$i.log & + fi + if [ $# == 3 ] then mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \ - python train.py --net=$1 --dataset=$2 --run_distribute=True \ - --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> server_$i.log & - fi - - if [ $# == 4 ] - then - mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \ - python train.py --net=$1 --dataset=$2 --run_distribute=True \ - --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> server_$i.log & + python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \ + --data_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 \ + --config_path=$CONFIG_FILE --output_path './output' &> server_$i.log & fi cd .. 
done @@ -126,21 +110,23 @@ done export MS_ROLE=MS_WORKER rm -rf ./worker mkdir ./worker +cp ../*.yaml ./worker cp ../*.py ./worker cp *.sh ./worker cp -r ../src ./worker cd ./worker || exit +if [ $# == 2 ] +then + mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ + python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \ + --data_path=$PATH1 --parameter_server=True --config_path=$CONFIG_FILE --output_path './output' &> worker.log & +fi + if [ $# == 3 ] then mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ - python train.py --net=$1 --dataset=$2 --run_distribute=True \ - --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> worker.log & -fi - -if [ $# == 4 ] -then - mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ - python train.py --net=$1 --dataset=$2 --run_distribute=True \ - --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> worker.log & + python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU"\ + --data_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 \ + --config_path=$CONFIG_FILE --output_path './output' &> worker.log & fi cd .. diff --git a/model_zoo/official/cv/resnet/scripts/run_standalone_train.sh b/model_zoo/official/cv/resnet/scripts/run_standalone_train.sh index 51fec969bbe..7e8e24f1575 100755 --- a/model_zoo/official/cv/resnet/scripts/run_standalone_train.sh +++ b/model_zoo/official/cv/resnet/scripts/run_standalone_train.sh @@ -17,34 +17,10 @@ CURPATH="$(dirname "$0")" . 
${CURPATH}/cache_util.sh -if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] +if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] then - echo "Usage: bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" - echo " bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" -exit 1 -fi - -if [ $1 != "resnet18" ] && [ $1 != "resnet34" ] && [ $1 != "resnet50" ] && [ $1 != "resnet101" ] && [ $1 != "se-resnet50" ] -then - echo "error: the selected net is neither resnet50 nor resnet101 and se-resnet50" -exit 1 -fi - -if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ] -then - echo "error: the selected dataset is neither cifar10 nor imagenet2012" -exit 1 -fi - -if [ $1 == "resnet101" ] && [ $2 == "cifar10" ] -then - echo "error: training resnet101 with cifar10 dataset is unsupported now!" -exit 1 -fi - -if [ $1 == "se-resnet50" ] && [ $2 == "cifar10" ] -then - echo "error: evaluating se-resnet50 with cifar10 dataset is unsupported now!" + echo "Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" + echo "bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" exit 1 fi @@ -56,17 +32,17 @@ get_real_path(){ fi } -PATH1=$(get_real_path $3) +PATH1=$(get_real_path $1) +CONFIG_FILE=$2 +if [ $# == 3 ] +then + PATH2=$(get_real_path $3) +fi if [ $# == 4 ] then - PATH2=$(get_real_path $4) -fi - -if [ $# == 5 ] -then - RUN_EVAL=$4 - EVAL_DATASET_PATH=$(get_real_path $5) + RUN_EVAL=$3 + EVAL_DATASET_PATH=$(get_real_path $4) fi if [ ! -d $PATH1 ] @@ -75,7 +51,7 @@ then exit 1 fi -if [ $# == 4 ] && [ ! -f $PATH2 ] +if [ $# == 3 ] && [ !
-f $PATH2 ] then echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" exit 1 @@ -103,26 +79,28 @@ then rm -rf ./train fi mkdir ./train +cp ../*.yaml ./train cp ../*.py ./train cp *.sh ./train cp -r ../src ./train cd ./train || exit echo "start training for device $DEVICE_ID" env > env.log +if [ $# == 2 ] +then + python train.py --data_path=$PATH1 --config_path=$CONFIG_FILE --output_path './output' &> log & +fi + if [ $# == 3 ] then - python train.py --net=$1 --dataset=$2 --dataset_path=$PATH1 &> log & + python train.py --data_path=$PATH1 --pre_trained=$PATH2 --config_path=$CONFIG_FILE --output_path './output' &> log & fi if [ $# == 4 ] then - python train.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & -fi - -if [ $# == 5 ] -then - python train.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --run_eval=$RUN_EVAL \ - --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID &> log & + python train.py --data_path=$PATH1 --run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH \ + --enable_cache=True --cache_session_id=$CACHE_SESSION_ID \ + --config_path=$CONFIG_FILE --output_path './output' &> log & if [ "x${RUN_EVAL}" == "xTrue" ] then echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" diff --git a/model_zoo/official/cv/resnet/scripts/run_standalone_train_gpu.sh b/model_zoo/official/cv/resnet/scripts/run_standalone_train_gpu.sh index 19433be36cb..1a38f7ba459 100755 --- a/model_zoo/official/cv/resnet/scripts/run_standalone_train_gpu.sh +++ b/model_zoo/official/cv/resnet/scripts/run_standalone_train_gpu.sh @@ -17,32 +17,13 @@ CURPATH="$(dirname "$0")" .
${CURPATH}/cache_util.sh -if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] +if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] then - echo "Usage: bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" - echo " bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" + echo "Usage: bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" + echo " bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)" exit 1 fi -if [ $1 != "resnet50" ] && [ $1 != "resnet101" ] -then - echo "error: the selected net is neither resnet50 nor resnet101" -exit 1 -fi - -if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ] -then - echo "error: the selected dataset is neither cifar10 nor imagenet2012" -exit 1 -fi - -if [ $1 == "resnet101" ] && [ $2 == "cifar10" ] -then - echo "error: training resnet101 with cifar10 dataset is unsupported now!" -exit 1 -fi - - get_real_path(){ if [ "${1:0:1}" == "/" ]; then echo "$1" @@ -51,17 +32,18 @@ get_real_path(){ fi } -PATH1=$(get_real_path $3) +PATH1=$(get_real_path $1) +CONFIG_FILE=$2 + +if [ $# == 3 ] +then + PATH2=$(get_real_path $3) +fi if [ $# == 4 ] then - PATH2=$(get_real_path $4) -fi - -if [ $# == 5 ] -then - RUN_EVAL=$4 - EVAL_DATASET_PATH=$(get_real_path $5) + RUN_EVAL=$3 + EVAL_DATASET_PATH=$(get_real_path $4) fi if [ ! -d $PATH1 ] @@ -70,7 +52,7 @@ then exit 1 fi -if [ $# == 4 ] && [ ! -f $PATH2 ] +if [ $# == 3 ] && [ ! 
-f $PATH2 ] then echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" exit 1 @@ -100,26 +82,30 @@ then rm -rf ./train fi mkdir ./train +cp ../*.yaml ./train cp ../*.py ./train cp *.sh ./train cp -r ../src ./train cd ./train || exit echo "start training for device $DEVICE_ID" env > env.log +if [ $# == 2 ] +then + python train.py --device_target="GPU" --data_path=$PATH1 \ + --config_path=$CONFIG_FILE --output_path './output' &> log & +fi + if [ $# == 3 ] then - python train.py --net=$1 --dataset=$2 --device_target="GPU" --dataset_path=$PATH1 &> log & + python train.py --device_target="GPU" --data_path=$PATH1 --pre_trained=$PATH2 \ + --config_path=$CONFIG_FILE --output_path './output' &> log & fi if [ $# == 4 ] then - python train.py --net=$1 --dataset=$2 --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & -fi - -if [ $# == 5 ] -then - python train.py --net=$1 --dataset=$2 --device_target="GPU" --dataset_path=$PATH1 --run_eval=$RUN_EVAL \ - --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID &> log & + python train.py --device_target="GPU" --data_path=$PATH1 --run_eval=$RUN_EVAL \ + --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID \ + --config_path=$CONFIG_FILE --output_path './output' &> log & if [ "x${RUN_EVAL}" == "xTrue" ] then echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\"" diff --git a/model_zoo/official/cv/resnet/se-resnet50_imagenet2012_config.yaml b/model_zoo/official/cv/resnet/se-resnet50_imagenet2012_config.yaml new file mode 100644 index 00000000000..a6a45f06f53 --- /dev/null +++ b/model_zoo/official/cv/resnet/se-resnet50_imagenet2012_config.yaml @@ -0,0 +1,78 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local
+run_distribute: False +enable_profiling: False +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path/" +device_target: 'Ascend' +checkpoint_path: './checkpoint/' +checkpoint_file_path: '' + +# ============================================================================== +# Training options +optimizer: 'Momentum' +infer_label: "" +class_num: 1001 +batch_size: 32 +loss_scale: 1024 +momentum: 0.9 +weight_decay: 0.0001 +epoch_size: 28 +train_epoch_size: 24 +pretrain_epoch_size: 0 +save_checkpoint: True +save_checkpoint_epochs: 4 +keep_checkpoint_max: 10 +warmup_epochs: 3 +lr_decay_mode: "cosine" +use_label_smooth: True +label_smooth_factor: 0.1 +lr_init: 0 +lr_end: 0.0001 +lr_max: 0.3 + +net_name: "se-resnet50" +dataset: "imagenet2012" +device_num: 1 +pre_trained: "" +run_eval: False +eval_dataset_path: "" +parameter_server: False +filter_weight: False +save_best_ckpt: True +eval_start_epoch: 40 +eval_interval: 1 +enable_cache: False +cache_session_id: "" +mode_name: "GRAPH" + +# Export options +device_id: 0 +width: 224 +height: 224 +file_name: "se-resnet50" +file_format: "AIR" +ckpt_file: "" +network_dataset: "se-resnet50_imagenet2012" + +--- +# Help description for each configuration +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +checkpoint_url: 'The location of checkpoint for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' +load_path: 'Local directory the checkpoint from checkpoint_url is downloaded to' +device_target: 'Target device type, available: [Ascend, GPU, CPU]' +enable_profiling: 'Whether enable profiling while training, default: False' +class_num: 'Number of classes in the dataset' +batch_size: "Batch size for training and evaluation" +epoch_size: "Total training epochs." +checkpoint_path: "The location of the checkpoint file." +checkpoint_file_path: "The location of the checkpoint file."
diff --git a/model_zoo/official/cv/resnet/src/config.py b/model_zoo/official/cv/resnet/src/config.py deleted file mode 100755 index 3868774ec36..00000000000 --- a/model_zoo/official/cv/resnet/src/config.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright 2020-2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -network config setting, will be used in train.py and eval.py -""" -from easydict import EasyDict as ed -# config optimizer for resnet50, imagenet2012. Momentum is default, Thor is optional. -# infer_label is a directory and label mapping table. 
such as 'infer_label': {"directory0": 0, "directory1": 1, ...} -cfg = ed({ - 'optimizer': 'Momentum', - 'infer_label': {} - }) - -# config for resent50, cifar10 -config1 = ed({ - "class_num": 10, - "batch_size": 32, - "loss_scale": 1024, - "momentum": 0.9, - "weight_decay": 1e-4, - "epoch_size": 90, - "pretrain_epoch_size": 0, - "save_checkpoint": True, - "save_checkpoint_epochs": 5, - "keep_checkpoint_max": 10, - "save_checkpoint_path": "./", - "warmup_epochs": 5, - "lr_decay_mode": "poly", - "lr_init": 0.01, - "lr_end": 0.00001, - "lr_max": 0.1 -}) - -# config for resnet50, imagenet2012 -config2 = ed({ - "class_num": 1001, - "batch_size": 256, - "loss_scale": 1024, - "momentum": 0.9, - "weight_decay": 1e-4, - "epoch_size": 90, - "pretrain_epoch_size": 0, - "save_checkpoint": True, - "save_checkpoint_epochs": 5, - "keep_checkpoint_max": 10, - "save_checkpoint_path": "./", - "warmup_epochs": 0, - "lr_decay_mode": "linear", - "use_label_smooth": True, - "label_smooth_factor": 0.1, - "lr_init": 0, - "lr_max": 0.8, - "lr_end": 0.0 -}) - -# config for resent101, imagenet2012 -config3 = ed({ - "class_num": 1001, - "batch_size": 32, - "loss_scale": 1024, - "momentum": 0.9, - "weight_decay": 1e-4, - "epoch_size": 120, - "pretrain_epoch_size": 0, - "save_checkpoint": True, - "save_checkpoint_epochs": 5, - "keep_checkpoint_max": 10, - "save_checkpoint_path": "./", - "warmup_epochs": 0, - "lr_decay_mode": "cosine", - "use_label_smooth": True, - "label_smooth_factor": 0.1, - "lr": 0.1 -}) - -# config for se-resnet50, imagenet2012 -config4 = ed({ - "class_num": 1001, - "batch_size": 32, - "loss_scale": 1024, - "momentum": 0.9, - "weight_decay": 1e-4, - "epoch_size": 28, - "train_epoch_size": 24, - "pretrain_epoch_size": 0, - "save_checkpoint": True, - "save_checkpoint_epochs": 4, - "keep_checkpoint_max": 10, - "save_checkpoint_path": "./", - "warmup_epochs": 3, - "lr_decay_mode": "cosine", - "use_label_smooth": True, - "label_smooth_factor": 0.1, - "lr_init": 0.0, - "lr_max": 
0.3, - "lr_end": 0.0001 -}) - -# config for resnet50, imagenet2012, Ascend 910 -config_thor_Ascend = ed({ - "class_num": 1001, - "batch_size": 32, - "loss_scale": 128, - "momentum": 0.9, - "weight_decay": 5e-4, - "epoch_size": 45, - "pretrain_epoch_size": 0, - "save_checkpoint": True, - "save_checkpoint_epochs": 2, - "keep_checkpoint_max": 15, - "save_checkpoint_path": "./", - "use_label_smooth": True, - "label_smooth_factor": 0.1, - "lr_init": 0.05803, - "lr_decay": 4.04839, - "lr_end_epoch": 53, - "damping_init": 0.02714, - "damping_decay": 0.50036, - "frequency": 834, -}) - -# config for resnet50, imagenet2012, GPU -config_thor_gpu = ed({ - "class_num": 1001, - "batch_size": 32, - "loss_scale": 128, - "momentum": 0.9, - "weight_decay": 5e-4, - "epoch_size": 40, - "pretrain_epoch_size": 0, - "save_checkpoint": True, - "save_checkpoint_epochs": 1, - "keep_checkpoint_max": 15, - "save_checkpoint_path": "./", - "use_label_smooth": True, - "label_smooth_factor": 0.1, - "lr_init": 0.05672, - "lr_decay": 4.9687, - "lr_end_epoch": 50, - "damping_init": 0.02345, - "damping_decay": 0.5467, - "frequency": 834, -}) diff --git a/model_zoo/official/cv/resnet/src/dataset.py b/model_zoo/official/cv/resnet/src/dataset.py index 98b4fa7d376..23d11517614 100755 --- a/model_zoo/official/cv/resnet/src/dataset.py +++ b/model_zoo/official/cv/resnet/src/dataset.py @@ -21,6 +21,8 @@ import mindspore.dataset as ds import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.transforms.c_transforms as C2 from mindspore.communication.management import init, get_rank, get_group_size +from src.model_utils.config import config +from src.model_utils.device_adapter import get_device_num, get_rank_id def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False, @@ -406,11 +408,19 @@ def _get_rank_info(): """ rank_size = int(os.environ.get("RANK_SIZE", 1)) - if rank_size > 1: - rank_size = get_group_size() - rank_id = get_rank() + if 
config.device_target == "Ascend": + if rank_size > 1: + rank_size = get_device_num() + rank_id = get_rank_id() + else: + rank_size = 1 + rank_id = 0 else: - rank_size = 1 - rank_id = 0 + if rank_size > 1: + rank_size = get_group_size() + rank_id = get_rank() + else: + rank_size = 1 + rank_id = 0 return rank_size, rank_id diff --git a/model_zoo/official/cv/resnet/src/dataset_infer.py b/model_zoo/official/cv/resnet/src/dataset_infer.py index 556f7673849..21777ec2bcb 100644 --- a/model_zoo/official/cv/resnet/src/dataset_infer.py +++ b/model_zoo/official/cv/resnet/src/dataset_infer.py @@ -22,7 +22,7 @@ import mindspore.dataset as ds import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.transforms.c_transforms as C2 from mindspore.communication.management import init, get_rank, get_group_size -from src.config import cfg +from src.model_utils.config import config class ImgDataset: @@ -39,7 +39,7 @@ class ImgDataset: self.data = [] self.dir_label_dict = {} self.img_format = (".bmp", ".png", ".jpg", ".jpeg") - self.dir_label = cfg.infer_label + self.dir_label = config.infer_label dataset_list = sorted(os.listdir(dataset_path)) file_exist = dir_exist = False for index, data_name in enumerate(dataset_list): diff --git a/model_zoo/official/cv/resnet/src/model_utils/config.py b/model_zoo/official/cv/resnet/src/model_utils/config.py new file mode 100644 index 00000000000..e6faaaefbc5 --- /dev/null +++ b/model_zoo/official/cv/resnet/src/model_utils/config.py @@ -0,0 +1,125 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Parse arguments""" + +import os +import ast +import argparse +from pprint import pprint, pformat +import yaml + +_config_path = "./resnet50_cifar10_config.yaml" + +class Config: + """ + Configuration namespace. Convert dictionary to members. + """ + def __init__(self, cfg_dict): + for k, v in cfg_dict.items(): + if isinstance(v, (list, tuple)): + setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v]) + else: + setattr(self, k, Config(v) if isinstance(v, dict) else v) + + def __str__(self): + return pformat(self.__dict__) + + def __repr__(self): + return self.__str__() + + +def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="resnet50_cifar10_config.yaml"): + """ + Parse command line arguments to the configuration according to the default yaml. + + Args: + parser: Parent parser. + cfg: Base configuration. + helper: Helper description. + cfg_path: Path to the default yaml config. 
+ """ + parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]", + parents=[parser]) + helper = {} if helper is None else helper + choices = {} if choices is None else choices + for item in cfg: + if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict): + help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path) + choice = choices[item] if item in choices else None + if isinstance(cfg[item], bool): + parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice, + help=help_description) + else: + parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice, + help=help_description) + args = parser.parse_args() + return args + + +def parse_yaml(yaml_path): + """ + Parse the yaml config file. + + Args: + yaml_path: Path to the yaml config. + """ + with open(yaml_path, 'r') as fin: + try: + cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader) + cfgs = [x for x in cfgs] + if len(cfgs) == 1: + cfg_helper = {} + cfg = cfgs[0] + elif len(cfgs) == 2: + cfg, cfg_helper = cfgs + else: + raise ValueError("At most 2 docs (config and help description for help) are supported in config yaml") + print(cfg_helper) + except: + raise ValueError("Failed to parse yaml") + return cfg, cfg_helper + + +def merge(args, cfg): + """ + Merge the base config from yaml file and command line arguments. + + Args: + args: Command line arguments. + cfg: Base configuration. + """ + args_var = vars(args) + for item in args_var: + cfg[item] = args_var[item] + return cfg + + +def get_config(): + """ + Get Config according to the yaml file and cli arguments. 
+ """ + parser = argparse.ArgumentParser(description="default name", add_help=False) + current_dir = os.path.dirname(os.path.abspath(__file__)) + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, \ + "../../resnet50_cifar10_config.yaml"), help="Config file path") + path_args, _ = parser.parse_known_args() + default, helper = parse_yaml(path_args.config_path) + pprint(default) + args = parse_cli_to_yaml(parser, default, helper, cfg_path=path_args.config_path) + final_config = merge(args, default) + return Config(final_config) + +config = get_config() diff --git a/model_zoo/official/cv/resnet/src/model_utils/device_adapter.py b/model_zoo/official/cv/resnet/src/model_utils/device_adapter.py new file mode 100644 index 00000000000..9c3d21d5e47 --- /dev/null +++ b/model_zoo/official/cv/resnet/src/model_utils/device_adapter.py @@ -0,0 +1,27 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# ============================================================================ + +"""Device adapter for ModelArts""" + +from src.model_utils.config import config + +if config.enable_modelarts: + from src.model_utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id +else: + from src.model_utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id + +__all__ = [ + "get_device_id", "get_device_num", "get_rank_id", "get_job_id" +] diff --git a/model_zoo/official/cv/resnet/src/model_utils/local_adapter.py b/model_zoo/official/cv/resnet/src/model_utils/local_adapter.py new file mode 100644 index 00000000000..769fa6dc78e --- /dev/null +++ b/model_zoo/official/cv/resnet/src/model_utils/local_adapter.py @@ -0,0 +1,36 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Local adapter""" + +import os + +def get_device_id(): + device_id = os.getenv('DEVICE_ID', '0') + return int(device_id) + + +def get_device_num(): + device_num = os.getenv('RANK_SIZE', '1') + return int(device_num) + + +def get_rank_id(): + global_rank_id = os.getenv('RANK_ID', '0') + return int(global_rank_id) + + +def get_job_id(): + return "Local Job" diff --git a/model_zoo/official/cv/resnet/src/model_utils/moxing_adapter.py b/model_zoo/official/cv/resnet/src/model_utils/moxing_adapter.py new file mode 100644 index 00000000000..aabd5ac6cf1 --- /dev/null +++ b/model_zoo/official/cv/resnet/src/model_utils/moxing_adapter.py @@ -0,0 +1,115 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Moxing adapter for ModelArts""" + +import os +import functools +from mindspore import context +from src.model_utils.config import config + +_global_sync_count = 0 + +def get_device_id(): + device_id = os.getenv('DEVICE_ID', '0') + return int(device_id) + + +def get_device_num(): + device_num = os.getenv('RANK_SIZE', '1') + return int(device_num) + + +def get_rank_id(): + global_rank_id = os.getenv('RANK_ID', '0') + return int(global_rank_id) + + +def get_job_id(): + job_id = os.getenv('JOB_ID') + job_id = job_id if job_id else "default" + return job_id + +def sync_data(from_path, to_path): + """ + Download data from remote obs to local directory if the first url is remote url and the second one is local path + Upload data from local directory to remote obs in contrast. + """ + import moxing as mox + import time + global _global_sync_count + sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count) + _global_sync_count += 1 + + # Each server contains 8 devices at most. + if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock): + print("from path: ", from_path) + print("to path: ", to_path) + mox.file.copy_parallel(from_path, to_path) + print("===finish data synchronization===") + try: + os.mknod(sync_lock) + except IOError: + pass + print("===save flag===") + + while True: + if os.path.exists(sync_lock): + break + time.sleep(1) + + print("Finish sync data from {} to {}.".format(from_path, to_path)) + + +def moxing_wrapper(pre_process=None, post_process=None): + """ + Moxing wrapper to download dataset and upload outputs.
+ """ + def wrapper(run_func): + @functools.wraps(run_func) + def wrapped_func(*args, **kwargs): + # Download data from data_url + if config.enable_modelarts: + if config.data_url: + sync_data(config.data_url, config.data_path) + print("Dataset downloaded: ", os.listdir(config.data_path)) + if config.checkpoint_url: + sync_data(config.checkpoint_url, config.load_path) + print("Preload downloaded: ", os.listdir(config.load_path)) + if config.train_url: + sync_data(config.train_url, config.output_path) + print("Workspace downloaded: ", os.listdir(config.output_path)) + + context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + config.device_num = get_device_num() + config.device_id = get_device_id() + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) + + if pre_process: + pre_process() + + run_func(*args, **kwargs) + + # Upload data to train_url + if config.enable_modelarts: + if post_process: + post_process() + + if config.train_url: + print("Start to copy output directory") + sync_data(config.output_path, config.train_url) + return wrapped_func + return wrapper diff --git a/model_zoo/official/cv/resnet/train.py b/model_zoo/official/cv/resnet/train.py index 3935149efb8..75518c09263 100755 --- a/model_zoo/official/cv/resnet/train.py +++ b/model_zoo/official/cv/resnet/train.py @@ -14,8 +14,6 @@ # ============================================================================ """train resnet.""" import os -import argparse -import ast from mindspore import context from mindspore import Tensor from mindspore.nn.optim import Momentum, thor @@ -26,7 +24,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMoni from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from mindspore.train.loss_scale_manager import FixedLossScaleManager from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore.communication.management import init, get_rank, 
get_group_size +from mindspore.communication.management import init from mindspore.common import set_seed from mindspore.parallel import set_algo_parameters import mindspore.nn as nn @@ -34,72 +32,35 @@ import mindspore.common.initializer as weight_init import mindspore.log as logger from src.lr_generator import get_lr, warmup_cosine_annealing_lr from src.CrossEntropySmooth import CrossEntropySmooth -from src.config import cfg from src.eval_callback import EvalCallBack from src.metric import DistAccuracy, ClassifyCorrectCell - -parser = argparse.ArgumentParser(description='Image classification') -parser.add_argument('--net', type=str, default=None, help='Resnet Model, resnet18, resnet34, resnet50 or resnet101') -parser.add_argument('--dataset', type=str, default=None, help='Dataset, either cifar10 or imagenet2012') -parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, help='Run distribute') -parser.add_argument('--device_num', type=int, default=1, help='Device num.') - -parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') -parser.add_argument('--device_target', type=str, default='Ascend', choices=("Ascend", "GPU", "CPU"), - help="Device target, support Ascend, GPU and CPU.") -parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path') -parser.add_argument('--parameter_server', type=ast.literal_eval, default=False, help='Run parameter server train') -parser.add_argument("--filter_weight", type=ast.literal_eval, default=False, - help="Filter head weight parameters, default is False.") -parser.add_argument("--run_eval", type=ast.literal_eval, default=False, - help="Run evaluation when training, default is False.") -parser.add_argument('--eval_dataset_path', type=str, default=None, help='Evaluation dataset path when run_eval is True') -parser.add_argument("--save_best_ckpt", type=ast.literal_eval, default=True, - help="Save best checkpoint when run_eval is True, default is 
True.") -parser.add_argument("--eval_start_epoch", type=int, default=40, - help="Evaluation start epoch when run_eval is True, default is 40.") -parser.add_argument("--eval_interval", type=int, default=1, - help="Evaluation interval when run_eval is True, default is 1.") -parser.add_argument('--enable_cache', type=ast.literal_eval, default=False, - help='Caching the eval dataset in memory to speedup evaluation, default is False.') -parser.add_argument('--cache_session_id', type=str, default="", help='The session id for cache service.') -parser.add_argument('--mode', type=str, default='GRAPH', choices=('GRAPH', 'PYNATIVE'), - help="Graph mode or PyNative mode, default is Graph mode") -args_opt = parser.parse_args() +from src.model_utils.config import config +from src.model_utils.moxing_adapter import moxing_wrapper +from src.model_utils.device_adapter import get_rank_id, get_device_num set_seed(1) -if args_opt.net in ("resnet18", "resnet34", "resnet50"): - if args_opt.net == "resnet18": +if config.net_name in ("resnet18", "resnet34", "resnet50"): + if config.net_name == "resnet18": from src.resnet import resnet18 as resnet - if args_opt.net == "resnet34": + if config.net_name == "resnet34": from src.resnet import resnet34 as resnet - if args_opt.net == "resnet50": + if config.net_name == "resnet50": from src.resnet import resnet50 as resnet - if args_opt.dataset == "cifar10": - from src.config import config1 as config + if config.dataset == "cifar10": from src.dataset import create_dataset1 as create_dataset else: - from src.config import config2 as config - if args_opt.mode == "GRAPH": + if config.mode_name == "GRAPH": from src.dataset import create_dataset2 as create_dataset else: from src.dataset import create_dataset_pynative as create_dataset -elif args_opt.net == "resnet101": +elif config.net_name == "resnet101": from src.resnet import resnet101 as resnet - from src.config import config3 as config from src.dataset import create_dataset3 as create_dataset else: 
from src.resnet import se_resnet50 as resnet - from src.config import config4 as config from src.dataset import create_dataset4 as create_dataset -if cfg.optimizer == "Thor": - if args_opt.device_target == "Ascend": - from src.config import config_thor_Ascend as config - else: - from src.config import config_thor_gpu as config - def filter_checkpoint_parameter_by_list(origin_dict, param_filter): """remove useless parameters according to filter_list""" @@ -122,56 +83,46 @@ def set_graph_kernel_context(run_platform, net_name): context.set_context(enable_graph_kernel=True) context.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D") -if __name__ == '__main__': - target = args_opt.device_target +def set_parameter(): + """set_parameter""" + target = config.device_target if target == "CPU": - args_opt.run_distribute = False - - ckpt_save_dir = config.save_checkpoint_path + config.run_distribute = False # init context - if args_opt.mode == 'GRAPH': + if config.mode_name == 'GRAPH': context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False) - set_graph_kernel_context(target, args_opt.net) + set_graph_kernel_context(target, config.net_name) else: context.set_context(mode=context.PYNATIVE_MODE, device_target=target, save_graphs=False) - if args_opt.parameter_server: + if config.parameter_server: context.set_ps_context(enable_ps=True) - if args_opt.run_distribute: + if config.run_distribute: if target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) context.set_context(device_id=device_id, enable_auto_mixed_precision=True) - context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + context.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) set_algo_parameters(elementwise_op_strategy_follow=True) - if args_opt.net == "resnet50" or args_opt.net == "se-resnet50": + if config.net_name == "resnet50" 
or config.net_name == "se-resnet50": context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160]) - elif args_opt.net == "resnet101": + elif config.net_name == "resnet101": context.set_auto_parallel_context(all_reduce_fusion_config=[80, 210, 313]) init() # GPU target else: init() - context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, + context.set_auto_parallel_context(device_num=get_device_num(), + parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) - if args_opt.net == "resnet50": + if config.net_name == "resnet50": context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160]) - ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/" - # create dataset - dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=1, - batch_size=config.batch_size, target=target, distribute=args_opt.run_distribute) - step_size = dataset.get_dataset_size() - - # define net - net = resnet(class_num=config.class_num) - if args_opt.parameter_server: - net.set_param_ps() - - # init weight - if args_opt.pre_trained: - param_dict = load_checkpoint(args_opt.pre_trained) - if args_opt.filter_weight: +def init_weight(net): + """init_weight""" + if config.pre_trained: + param_dict = load_checkpoint(config.pre_trained) + if config.filter_weight: filter_list = [x.name for x in net.end_point.get_parameters()] filter_checkpoint_parameter_by_list(param_dict, filter_list) load_param_into_net(net, param_dict) @@ -186,20 +137,60 @@ if __name__ == '__main__': cell.weight.shape, cell.weight.dtype)) - # init lr - if cfg.optimizer == "Thor": +def init_lr(step_size): + """init lr""" + if config.optimizer == "Thor": from src.lr_generator import get_thor_lr lr = get_thor_lr(0, config.lr_init, config.lr_decay, config.lr_end_epoch, step_size, decay_epochs=39) else: - if args_opt.net in ("resnet18", "resnet34", "resnet50", "se-resnet50"): + if config.net_name in ("resnet18", 
"resnet34", "resnet50", "se-resnet50"): lr = get_lr(lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size, steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode) else: lr = warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size, config.pretrain_epoch_size * step_size) - lr = Tensor(lr) + return lr +def init_loss_scale(): + if config.dataset == "imagenet2012": + if not config.use_label_smooth: + config.label_smooth_factor = 0.0 + loss = CrossEntropySmooth(sparse=True, reduction="mean", + smooth_factor=config.label_smooth_factor, num_classes=config.class_num) + else: + loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + return loss + +def run_eval(target, model, ckpt_save_dir, cb): + """run_eval""" + if config.run_eval: + if config.eval_dataset_path is None or (not os.path.isdir(config.eval_dataset_path)): + raise ValueError("{} is not a existing path.".format(config.eval_dataset_path)) + eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, + batch_size=config.batch_size, target=target, enable_cache=config.enable_cache, + cache_session_id=config.cache_session_id) + eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} + eval_cb = EvalCallBack(apply_eval, eval_param_dict, interval=config.eval_interval, + eval_start_epoch=config.eval_start_epoch, save_best_ckpt=config.save_best_ckpt, + ckpt_directory=ckpt_save_dir, besk_ckpt_name="best_acc.ckpt", + metrics_name="acc") + cb += [eval_cb] + +@moxing_wrapper() +def train_net(): + """train net""" + target = config.device_target + set_parameter() + dataset = create_dataset(dataset_path=config.data_path, do_train=True, repeat_num=1, + batch_size=config.batch_size, target=target, + distribute=config.run_distribute) + step_size = dataset.get_dataset_size() + net = resnet(class_num=config.class_num) + if config.parameter_server: + 
net.set_param_ps() + init_weight(net=net) + lr = Tensor(init_lr(step_size=step_size)) # define opt decayed_params = [] no_decayed_params = [] @@ -213,27 +204,21 @@ if __name__ == '__main__': {'params': no_decayed_params}, {'order_params': net.trainable_params()}] opt = Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale) - if args_opt.dataset == "imagenet2012": - if not config.use_label_smooth: - config.label_smooth_factor = 0.0 - loss = CrossEntropySmooth(sparse=True, reduction="mean", - smooth_factor=config.label_smooth_factor, num_classes=config.class_num) - else: - loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + loss = init_loss_scale() loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) - dist_eval_network = ClassifyCorrectCell(net) if args_opt.run_distribute else None + dist_eval_network = ClassifyCorrectCell(net) if config.run_distribute else None metrics = {"acc"} - if args_opt.run_distribute: - metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=args_opt.device_num)} - if (args_opt.net not in ("resnet18", "resnet34", "resnet50", "resnet101", "se-resnet50")) or \ - args_opt.parameter_server or target == "CPU": + if config.run_distribute: + metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=config.device_num)} + if (config.net_name not in ("resnet18", "resnet34", "resnet50", "resnet101", "se-resnet50")) or \ + config.parameter_server or target == "CPU": ## fp32 training model = Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network) else: model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics, amp_level="O2", keep_batchnorm_fp32=False, eval_network=dist_eval_network) - if cfg.optimizer == "Thor" and args_opt.dataset == "imagenet2012": + if config.optimizer == "Thor" and config.dataset == "imagenet2012": from src.lr_generator import get_thor_damping damping = 
get_thor_damping(0, config.damping_init, config.damping_decay, 70, step_size) split_indices = [26, 53] @@ -242,36 +227,30 @@ if __name__ == '__main__': model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, amp_level="O2", keep_batchnorm_fp32=False) - args_opt.run_eval = False + config.run_eval = False logger.warning("Thor optimizer not support evaluation while training.") # define callbacks time_cb = TimeMonitor(data_size=step_size) loss_cb = LossMonitor() cb = [time_cb, loss_cb] + ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path) + ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank_id()) + "/" if config.save_checkpoint: config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size, keep_checkpoint_max=config.keep_checkpoint_max) ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir, config=config_ck) cb += [ckpt_cb] - if args_opt.run_eval: - if args_opt.eval_dataset_path is None or (not os.path.isdir(args_opt.eval_dataset_path)): - raise ValueError("{} is not a existing path.".format(args_opt.eval_dataset_path)) - eval_dataset = create_dataset(dataset_path=args_opt.eval_dataset_path, do_train=False, - batch_size=config.batch_size, target=target, enable_cache=args_opt.enable_cache, - cache_session_id=args_opt.cache_session_id) - eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} - eval_cb = EvalCallBack(apply_eval, eval_param_dict, interval=args_opt.eval_interval, - eval_start_epoch=args_opt.eval_start_epoch, save_best_ckpt=args_opt.save_best_ckpt, - ckpt_directory=ckpt_save_dir, besk_ckpt_name="best_acc.ckpt", - metrics_name="acc") - cb += [eval_cb] + run_eval(target, model, ckpt_save_dir, cb) # train model - if args_opt.net == "se-resnet50": + if config.net_name == "se-resnet50": config.epoch_size = config.train_epoch_size - dataset_sink_mode = (not 
args_opt.parameter_server) and target != "CPU" + dataset_sink_mode = (not config.parameter_server) and target != "CPU" model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb, sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) - if args_opt.run_eval and args_opt.enable_cache: + if config.run_eval and config.enable_cache: print("Remember to shut down the cache server via \"cache_admin --stop\"") + +if __name__ == '__main__': + train_net() diff --git a/model_zoo/official/cv/resnext/eval.py b/model_zoo/official/cv/resnext/eval.py index c1af08a7a47..4d96a4fc142 100644 --- a/model_zoo/official/cv/resnext/eval.py +++ b/model_zoo/official/cv/resnext/eval.py @@ -147,8 +147,8 @@ def test(cloud_args=None): # network config.logger.important_info('start create network') - if os.path.isdir(config.pretrained): - models = list(glob.glob(os.path.join(config.pretrained, '*.ckpt'))) + if os.path.isdir(config.checkpoint_file_path): + models = list(glob.glob(os.path.join(config.checkpoint_file_path, '*.ckpt'))) print(models) if config.graph_ckpt: f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].split('_')[0]) diff --git a/tests/st/model_zoo_tests/resnet50/test_resnet50_cifar10.py b/tests/st/model_zoo_tests/resnet50/test_resnet50_cifar10.py index d636d0c2eaf..2372b075b55 100644 --- a/tests/st/model_zoo_tests/resnet50/test_resnet50_cifar10.py +++ b/tests/st/model_zoo_tests/resnet50/test_resnet50_cifar10.py @@ -33,8 +33,9 @@ def test_resnet50_cifar10_ascend(): new_list = ["total_epochs=10", "10"] utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py")) dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin") - exec_network_shell = "cd resnet/scripts; bash run_distribute_train.sh resnet50 cifar10 {} {}"\ - .format(utils.rank_table_path, dataset_path) + config_path = os.path.join(cur_model_path, "resnet50_cifar10_config.yaml") + exec_network_shell = "cd resnet/scripts; 
bash run_distribute_train.sh {} {} {}"\ + .format(utils.rank_table_path, dataset_path, config_path) os.system(exec_network_shell) cmd = "ps -ef | grep python | grep train.py | grep -v grep" ret = utils.process_check(100, cmd) @@ -63,7 +64,9 @@ def test_resnet50_cifar10_gpu(): new_list = ["total_epochs=10", "10"] utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py")) dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin") - exec_network_shell = "cd resnet/scripts; sh run_distribute_train_gpu.sh resnet50 cifar10 {}".format(dataset_path) + config_path = os.path.join(cur_model_path, "resnet50_cifar10_config.yaml") + exec_network_shell = "cd resnet/scripts; sh run_distribute_train_gpu.sh {} {}" \ + .format(dataset_path, config_path) logger.warning("cmd [{}] is running...".format(exec_network_shell)) os.system(exec_network_shell) cmd = "ps -ef | grep python | grep train.py | grep -v grep"