forked from mindspore-Ecosystem/mindspore
!18954 modify model_zoo resnet network for cloud
Merge pull request !18954 from lilei/modify_model_zoo_resnet50
Commit bb43be1fb4
@@ -101,27 +101,26 @@ After installing MindSpore via the official website, you can start training and

```bash
# distributed training
Usage: bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# standalone training
Usage: bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# run evaluation example
Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```

- Running on GPU

```bash
# distributed training example
bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# standalone training example
bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# infer example
bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]

# gpu benchmark example
bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](optional) [DEVICE_NUM](optional) [SAVE_CKPT](optional) [SAVE_PATH](optional)
@@ -131,10 +130,41 @@ bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](o

```bash
# standalone training example
python train.py --net=[resnet50|resnet101] --dataset=[cifar10|imagenet2012] --device_target=CPU --dataset_path=[DATASET_PATH] --pre_trained=[CHECKPOINT_PATH](optional)
python train.py --device_target=CPU --data_path=[DATASET_PATH] --config_path [CONFIG_PATH] --pre_trained=[CHECKPOINT_PATH](optional)

# infer example
python eval.py --net=[resnet50|resnet101] --dataset=[cifar10|imagenet2012] --dataset_path=[DATASET_PATH] --checkpoint_path=[CHECKPOINT_PATH] --device_target=CPU
python eval.py --data_path=[DATASET_PATH] --checkpoint_file_path=[CHECKPOINT_PATH] --config_path [CONFIG_PATH] --device_target=CPU
```

If you want to run on ModelArts, please check the official documentation of [ModelArts](https://support.huaweicloud.com/modelarts/); you can start training and evaluation as follows:

```python
# run distributed training on ModelArts example
# (1) Perform a or b.
# a. Set "enable_modelarts=True" in the yaml file.
#    Set the other parameters you need in the yaml file.
# b. Add "enable_modelarts=True" on the website UI interface.
#    Add the other parameters you need on the website UI interface.
# (2) Set the config directory to "config_path=/The path of config in S3/"
# (3) Set the code directory to "/path/resnet" on the website UI interface.
# (4) Set the startup file to "train.py" on the website UI interface.
# (5) Set the "Dataset path", "Output file path" and "Job log path" to your paths on the website UI interface.
# (6) Create your job.

# run evaluation on ModelArts example
# (1) Copy or upload your trained model to the S3 bucket.
# (2) Perform a or b.
# a. Set "enable_modelarts=True" in the yaml file.
#    Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" in the yaml file.
#    Set "checkpoint_url=/The path of checkpoint in S3/" in the yaml file.
# b. Add "enable_modelarts=True" on the website UI interface.
#    Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
#    Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
# (3) Set the config directory to "config_path=/The path of config in S3/"
# (4) Set the code directory to "/path/resnet" on the website UI interface.
# (5) Set the startup file to "eval.py" on the website UI interface.
# (6) Set the "Dataset path", "Output file path" and "Job log path" to your paths on the website UI interface.
# (7) Create your job.
```

# [Script Description](#contents)
@@ -158,13 +188,26 @@ python eval.py --net=[resnet50|resnet101] --dataset=[cifar10|imagenet2012] --dat
├── run_eval_gpu_resnet_benckmark.sh # launch gpu benchmark eval for resnet50 with imagenet2012
└── cache_util.sh # a collection of helper functions to manage cache
├── src
├── config.py # parameter configuration
├── dataset.py # data preprocessing
├── eval_callback.py # evaluation callback while training
├── CrossEntropySmooth.py # loss definition for ImageNet2012 dataset
├── lr_generator.py # generate learning rate for each step
├── resnet.py # resnet backbone, including resnet50 and resnet101 and se-resnet50
└── resnet_gpu_benchmark.py # resnet50 for GPU benchmark
├── model_utils
├── config.py # parameter configuration
├── device_adapter.py # device adapter
├── local_adapter.py # local adapter
├── moxing_adapter.py # moxing adapter
├── resnet18_cifar10_config.yaml # parameter configuration
├── resnet18_imagenet2012_config.yaml # parameter configuration
├── resnet34_imagenet2012_config.yaml # parameter configuration
├── resnet50_cifar10_config.yaml # parameter configuration
├── resnet50_imagenet2012_Ascend_config.yaml # parameter configuration
├── resnet50_imagenet2012_config.yaml # parameter configuration
├── resnet50_imagenet2012_GPU_config.yaml # parameter configuration
├── resnet101_imagenet2012_config.yaml # parameter configuration
├── se-resnet50_imagenet2012_config.yaml # parameter configuration
├── export.py # export model for inference
├── mindspore_hub_conf.py # mindspore hub interface
├── eval.py # eval net
@@ -174,7 +217,7 @@ python eval.py --net=[resnet50|resnet101] --dataset=[cifar10|imagenet2012] --dat

## [Script Parameters](#contents)

Parameters for both training and evaluation can be set in config.py.
Parameters for both training and evaluation can be set in the config file.
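This change replaces the hard-coded presets in `src/config.py` with per-network YAML files read by `src/model_utils/config.py`. As a rough illustration of the mechanism, a minimal loader could look like the sketch below; the `Config` class and parsing details are illustrative assumptions, not the repo's exact code:

```python
# Minimal sketch of a YAML-backed config loader (assumed behavior of
# src/model_utils/config.py; names are illustrative, not the actual API).
import argparse
import yaml  # assumption: PyYAML is available


class Config:
    """Wrap a dict so entries are reachable as attributes, e.g. config.batch_size."""
    def __init__(self, entries):
        for name, value in entries.items():
            setattr(self, name, Config(value) if isinstance(value, dict) else value)


def get_config():
    # --config_path selects one of the per-network YAML files,
    # e.g. resnet50_imagenet2012_config.yaml.
    parser = argparse.ArgumentParser(description="resnet config")
    parser.add_argument("--config_path", type=str, required=True)
    path = parser.parse_known_args()[0].config_path
    with open(path) as f:
        # The repo's YAML files keep the values in the first document;
        # the second document (after ---) holds the help text.
        entries = list(yaml.safe_load_all(f))[0]
    return Config(entries)
```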

- Config for ResNet18 and ResNet50, CIFAR-10 dataset
@@ -189,7 +232,6 @@ Parameters for both training and evaluation can be set in config.py.
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_epochs": 5, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last step
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint
"warmup_epochs": 5, # number of warmup epoch
"lr_decay_mode": "poly" # decay mode can be selected in steps, poly and default
"lr_init": 0.01, # initial learning rate
@@ -210,7 +252,6 @@ Parameters for both training and evaluation can be set in config.py.
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_epochs": 5, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path
"warmup_epochs": 0, # number of warmup epoch
"lr_decay_mode": "Linear", # decay mode for generating learning rate
"use_label_smooth": True, # label smooth
@@ -233,7 +274,6 @@ Parameters for both training and evaluation can be set in config.py.
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_epochs": 5, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch
"keep_checkpoint_max": 1, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path
"warmup_epochs": 0, # number of warmup epoch
"optimizer": 'Momentum', # optimizer
"use_label_smooth": True, # label smooth
@@ -256,7 +296,6 @@ Parameters for both training and evaluation can be set in config.py.
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_epochs": 5, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path
"warmup_epochs": 0, # number of warmup epoch
"lr_decay_mode": "cosine" # decay mode for generating learning rate
"use_label_smooth": True, # label_smooth
@@ -278,7 +317,6 @@ Parameters for both training and evaluation can be set in config.py.
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_epochs": 4, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path
"warmup_epochs": 3, # number of warmup epoch
"lr_decay_mode": "cosine" # decay mode for generating learning rate
"use_label_smooth": True, # label_smooth
@@ -296,15 +334,13 @@ Parameters for both training and evaluation can be set in config.py.

```bash
# distributed training
Usage: bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# standalone training
Usage: bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# run evaluation example
Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]

Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```

For distributed training, an HCCL configuration file in JSON format needs to be created in advance.
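For reference, a minimal single-server rank table can be generated with a short script like the one below. The field names follow the commonly used Ascend rank-table v1.0 layout and are an assumption here; verify them against the MindSpore documentation for your driver version before use:

```python
# Hedged sketch: write a minimal single-server Ascend rank table.
# Field names follow the commonly documented v1.0 format (assumption).
import json


def write_rank_table(device_ips, server_ip="10.0.0.1", path="rank_table_8pcs.json"):
    table = {
        "version": "1.0",
        "server_count": "1",
        "server_list": [{
            "server_id": server_ip,
            "device": [
                # one entry per NPU: logical id, NIC ip, global rank
                {"device_id": str(i), "device_ip": ip, "rank_id": str(i)}
                for i, ip in enumerate(device_ips)
            ],
            "host_nic_ip": "reserve",
        }],
        "status": "completed",
    }
    with open(path, "w") as f:
        json.dump(table, f, indent=4)
```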
@@ -319,13 +355,14 @@ If you want to change device_id for standalone training, you can set environment

```bash
# distributed training example
bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# standalone training example
bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# infer example
bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]

# gpu benchmark training example
bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](optional) [DEVICE_NUM](optional) [SAVE_CKPT](optional) [SAVE_PATH](optional)
@@ -343,29 +380,29 @@ Please follow the instructions in the link [GPU-Multi-Host](https://www.mindspor
- Parameter server training Ascend example

```bash
bash run_parameter_server_train.sh [resnet18|resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_parameter_server_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
```

- Parameter server training GPU example

```bash
bash run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_parameter_server_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
```

#### Evaluation while training

```bash
# evaluation with distributed training Ascend example:
bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)

# evaluation with standalone training Ascend example:
bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_standalone_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)

# evaluation with distributed training GPU example:
bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)

# evaluation with standalone training GPU example:
bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
```

`RUN_EVAL` and `EVAL_DATASET_PATH` are optional arguments; setting `RUN_EVAL`=True enables evaluation while training. When `RUN_EVAL` is set to True, `EVAL_DATASET_PATH` must also be set; the callback mechanism behind this is sketched below.
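The `RUN_EVAL` path is driven by the evaluation callback in `src/eval_callback.py`. The following is a simplified sketch of such a callback, assuming the metric key `top_1_accuracy` and the parameter names listed in the YAML files (`save_best_ckpt`, `eval_start_epoch`, `eval_interval`); the actual implementation may differ:

```python
# Hedged sketch of an evaluate-while-training callback, in the spirit of
# src/eval_callback.py (the real implementation may differ in details).
from mindspore import save_checkpoint
from mindspore.train.callback import Callback


class EvalCallBack(Callback):
    def __init__(self, model, eval_dataset, eval_start_epoch=40, eval_interval=1,
                 save_best_ckpt=True, best_ckpt_path="./best.ckpt"):
        super(EvalCallBack, self).__init__()
        self.model = model
        self.eval_dataset = eval_dataset
        self.eval_start_epoch = eval_start_epoch
        self.eval_interval = eval_interval
        self.save_best_ckpt = save_best_ckpt
        self.best_ckpt_path = best_ckpt_path
        self.best_acc = 0.0

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()
        cur_epoch = cb_params.cur_epoch_num
        if cur_epoch >= self.eval_start_epoch and \
                (cur_epoch - self.eval_start_epoch) % self.eval_interval == 0:
            # run evaluation on the held-out dataset and track the best result
            acc = self.model.eval(self.eval_dataset)["top_1_accuracy"]
            print("epoch: %d, top_1_accuracy: %f" % (cur_epoch, acc))
            if self.save_best_ckpt and acc > self.best_acc:
                self.best_acc = acc
                save_checkpoint(cb_params.train_network, self.best_ckpt_path)
```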
@@ -480,12 +517,12 @@ epoch: [0/1] step: [100/5004], loss is 6.814013Epoch time: 3437.154 ms, fps: 148

```bash
# evaluation
Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
Usage: bash run_eval.sh [DATASET_PATH] [CONFIG_PATH] [CHECKPOINT_PATH]
```

```bash
# evaluation example
bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt
bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt --config_path /.yaml
```

> Checkpoints are produced during the training process.
@@ -493,7 +530,7 @@ bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/tra
#### Running on GPU

```bash
bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```

### Result
|
|||
|
||||
### [Export MindIR](#contents)
|
||||
|
||||
Export MindIR on local
|
||||
|
||||
```shell
|
||||
python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
|
||||
python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] --config_path [CONFIG_PATH]
|
||||
```
|
||||
|
||||
The ckpt_file parameter is required,
|
||||
The checkpoint_file_path parameter is required,
|
||||
`EXPORT_FORMAT` should be in ["AIR", "MINDIR"]
|
||||
|
||||
Export on ModelArts (If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start as follows)
|
||||
|
||||
```python
|
||||
# Export on ModelArts
|
||||
# (1) Perform a or b.
|
||||
# a. Set "enable_modelarts=True" on default_config.yaml file.
|
||||
# Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on default_config.yaml file.
|
||||
# Set "checkpoint_url='s3://dir_to_trained_ckpt/'" on default_config.yaml file.
|
||||
# Set "file_name='./resnet'" on default_config.yaml file.
|
||||
# Set "file_format='AIR'" on default_config.yaml file.
|
||||
# Set other parameters on default_config.yaml file you need.
|
||||
# b. Add "enable_modelarts=True" on the website UI interface.
|
||||
# Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
|
||||
# Add "checkpoint_url='s3://dir_to_trained_ckpt/'" on the website UI interface.
|
||||
# Add "file_name='./resnet'" on the website UI interface.
|
||||
# Add "file_format='AIR'" on the website UI interface.
|
||||
# Add other parameters on the website UI interface.
|
||||
# (2) Set the config_path="/path/yaml file" on the website UI interface.
|
||||
# (3) Set the code directory to "/path/resnet" on the website UI interface.
|
||||
# (4) Set the startup file to "export.py" on the website UI interface.
|
||||
# (5) Set the "Output file path" and "Job log path" to your path on the website UI interface.
|
||||
# (6) Create your job.
|
||||
```
|
||||
|
||||
### Infer on Ascend310
|
||||
|
||||
Before performing inference, the mindir file must bu exported by `export.py` script. We only provide an example of inference using MINDIR model.
|
||||
|
|
|
@@ -104,27 +104,60 @@ The overall network architecture of ResNet is as follows:

```text
# distributed training
Usage: bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# standalone training
Usage: bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# run evaluation example
Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```

- Running on GPU

```text
# distributed training example
bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# standalone training example
bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# infer example
bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```

If you want to train the model on ModelArts, refer to the official ModelArts documentation (https://support.huaweicloud.com/modelarts/) and proceed with training and inference as follows:

```python
# Example of distributed training on ModelArts:
# (1) Choose either a or b.
# a. Set "enable_modelarts=True".
#    Set the parameters the network needs in the yaml file.
# b. Add "enable_modelarts=True" on the ModelArts UI.
#    Set the parameters the network needs on the ModelArts UI.
# (2) Set the config file path "config_path=/The path of config in S3/" on the ModelArts UI.
# (3) Set the code directory "/path/resnet" on the ModelArts UI.
# (4) Set the startup file "train.py" on the ModelArts UI.
# (5) Set the "Dataset path",
#     "Output file path" and "Job log path" on the ModelArts UI.
# (6) Start training.

# Example of model inference on ModelArts:
# (1) Upload the trained model to the corresponding location in the S3 bucket.
# (2) Choose either a or b.
# a. Set "enable_modelarts=True".
#    Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" in the yaml file.
#    Set "checkpoint_url=/The path of checkpoint in S3/" in the yaml file.
# b. Add "enable_modelarts=True" on the ModelArts UI.
#    Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the ModelArts UI.
#    Add "checkpoint_url=/The path of checkpoint in S3/" on the ModelArts UI.
# (3) Set the config file path "config_path=/The path of config in S3/" on the ModelArts UI.
# (4) Set the code directory "/path/resnet" on the ModelArts UI.
# (5) Set the startup file "eval.py" on the ModelArts UI.
# (6) Set the "Dataset path",
#     "Output file path" and "Job log path" on the ModelArts UI.
# (7) Start inference.
```

# Script Description
@@ -146,19 +179,33 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
├── run_standalone_train_gpu.sh # launch standalone GPU training (single device)
└── cache_util.sh # helper functions for the single-node cache
├── src
├── config.py # parameter configuration
├── dataset.py # data preprocessing
├── eval_callback.py # evaluation callback while training
├── eval_callback.py # evaluation callback while training
├── CrossEntropySmooth.py # loss definition for the ImageNet2012 dataset
├── lr_generator.py # generate the learning rate for each step
└── resnet.py # ResNet backbone, including ResNet50, ResNet101 and SE-ResNet50
├── model_utils
├── config.py # parameter configuration
├── device_adapter.py # device adapter
├── local_adapter.py # local adapter
└── moxing_adapter.py # ModelArts (moxing) adapter
├── resnet18_cifar10_config.yaml # parameter configuration
├── resnet18_imagenet2012_config.yaml # parameter configuration
├── resnet34_imagenet2012_config.yaml # parameter configuration
├── resnet50_cifar10_config.yaml # parameter configuration
├── resnet50_imagenet2012_Ascend_config.yaml # parameter configuration
├── resnet50_imagenet2012_config.yaml # parameter configuration
├── resnet50_imagenet2012_GPU_config.yaml # parameter configuration
├── resnet101_imagenet2012_config.yaml # parameter configuration
├── se-resnet50_imagenet2012_config.yaml # parameter configuration
├── eval.py # eval network
├── eval.py # eval network
└── train.py # train network
```

## Script Parameters

Training and evaluation parameters can both be configured in config.py.
Training and evaluation parameters can both be configured in the config file.

- Config for ResNet18 and ResNet50, CIFAR-10 dataset
@@ -173,7 +220,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
"save_checkpoint": True, # whether to save checkpoints
"save_checkpoint_epochs": 5, # epoch interval between two checkpoints; by default the last checkpoint is saved after the last step
"keep_checkpoint_max": 10, # keep only the last keep_checkpoint_max checkpoints
"save_checkpoint_path": "./", # path to save checkpoints
"warmup_epochs": 5, # number of warmup epochs
"lr_decay_mode": "poly" # decay mode: steps, poly or default
"lr_init": 0.01, # initial learning rate
@@ -194,7 +240,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
"save_checkpoint": True, # whether to save checkpoints
"save_checkpoint_epochs": 5, # epoch interval between two checkpoints; by default the last checkpoint is saved after the last epoch
"keep_checkpoint_max": 10, # keep only the last keep_checkpoint_max checkpoints
"save_checkpoint_path": "./", # path to save checkpoints, relative to the execution path
"warmup_epochs": 0, # number of warmup epochs
"lr_decay_mode": "Linear", # decay mode for generating the learning rate
"use_label_smooth": True, # label smoothing
@@ -217,7 +262,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
"save_checkpoint": True, # whether to save checkpoints
"save_checkpoint_epochs": 5, # epoch interval between two checkpoints; by default the last checkpoint is saved after the last epoch
"keep_checkpoint_max": 1, # keep only the last keep_checkpoint_max checkpoints
"save_checkpoint_path": "./", # path to save checkpoints, relative to the execution path
"warmup_epochs": 0, # number of warmup epochs
"optimizer": "Momentum", # optimizer
"use_label_smooth": True, # label smoothing
@@ -240,7 +284,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
"save_checkpoint": True, # whether to save checkpoints
"save_checkpoint_epochs": 5, # epoch interval between two checkpoints; by default the last checkpoint is saved after the last epoch
"keep_checkpoint_max": 10, # keep only the last keep_checkpoint_max checkpoints
"save_checkpoint_path": "./", # path to save checkpoints, relative to the execution path
"warmup_epochs": 0, # number of warmup epochs
"lr_decay_mode": "cosine" # decay mode for generating the learning rate
"use_label_smooth": True, # label smoothing
@@ -262,7 +305,6 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
"save_checkpoint": True, # whether to save checkpoints
"save_checkpoint_epochs": 4, # epoch interval between two checkpoints; by default the last checkpoint is saved after the last epoch
"keep_checkpoint_max": 10, # keep only the last keep_checkpoint_max checkpoints
"save_checkpoint_path": "./", # path to save checkpoints, relative to the execution path
"warmup_epochs": 3, # number of warmup epochs
"lr_decay_mode": "cosine" # decay mode for generating the learning rate
"use_label_smooth": True, # label smoothing
@@ -280,14 +322,13 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]

```text
# distributed training
Usage: bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# standalone training
Usage: bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# run evaluation example
Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]

```
@@ -303,13 +344,13 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]

```text
# distributed training example
bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# standalone training example
bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)

# infer example
bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```

#### Running parameter server mode training
@@ -317,29 +358,29 @@ bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH]
- Parameter server training Ascend example

```text
bash run_parameter_server_train.sh [resnet18|resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_parameter_server_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
```

- Parameter server training GPU example

```text
bash run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_parameter_server_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
```

#### Evaluation while training

```bash
# Ascend evaluation-while-distributed-training example:
bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)

# Ascend evaluation-while-standalone-training example:
bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_standalone_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)

# GPU evaluation-while-distributed-training example:
bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_distribute_train_gpu.sh [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)

# GPU evaluation-while-standalone-training example:
bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
bash run_standalone_train_gpu.sh [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
```

Evaluation while training requires setting `RUN_EVAL` to True, and `EVAL_DATASET_PATH` must be set at the same time. In addition, when `RUN_EVAL` is True, parameters such as `save_best_ckpt`, `eval_start_epoch` and `eval_interval` can also be set for the Python scripts.
@@ -446,12 +487,12 @@ epoch:5 step:5004, loss is 3.3501816

```bash
# evaluation
Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```

```bash
# evaluation example
bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt
bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt --config_path /*.yaml
```

> Checkpoints can be produced during training.
@@ -459,7 +500,7 @@ bash run_eval.sh resnet50 cifar10 ~/cifar10-10-verify-bin ~/resnet50_cifar10/tra
#### Running on GPU

```bash
bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]
bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]
```

### Result
|
|||
|
||||
### [导出MindIR](#contents)
|
||||
|
||||
导出mindir模型
|
||||
|
||||
```shell
|
||||
python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
|
||||
python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] --config_path [CONFIG_PATH]
|
||||
```
|
||||
|
||||
参数ckpt_file为必填项,
|
||||
参数checkpoint_file_path为必填项,
|
||||
`EXPORT_FORMAT` 必须在 ["AIR", "MINDIR"]中选择。
|
||||
|
||||
ModelArts导出mindir
|
||||
|
||||
```python
|
||||
# (1) 把训练好的模型地方到桶的对应位置。
|
||||
# (2) 选址a或者b其中一种方式。
|
||||
# a. 设置 "enable_modelarts=True"
|
||||
# 设置 "checkpoint_file_path='/cache/checkpoint_path/model.ckpt" 在 yaml 文件。
|
||||
# 设置 "checkpoint_url=/The path of checkpoint in S3/" 在 yaml 文件。
|
||||
# 设置 "file_name='./resnet'"参数在yaml文件。
|
||||
# 设置 "file_format='AIR'" 参数在yaml文件。
|
||||
# b. 增加 "enable_modelarts=True" 参数在modearts的界面上。
|
||||
# 增加 "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" 参数在modearts的界面上。
|
||||
# 增加 "checkpoint_url=/The path of checkpoint in S3/" 参数在modearts的界面上。
|
||||
# 设置 "file_name='./resnet'"参数在modearts的界面上。
|
||||
# 设置 "file_format='AIR'" 参数在modearts的界面上。
|
||||
# (3) 设置网络配置文件的路径 "config_path=/The path of config in S3/"
|
||||
# (4) 在modelarts的界面上设置代码的路径 "/path/resnet"。
|
||||
# (5) 在modelarts的界面上设置模型的启动文件 "export.py" 。
|
||||
# 模型的输出路径"Output file path" 和模型的日志路径 "Job log path" 。
|
||||
# (6) 开始导出mindir。
|
||||
```
|
||||
|
||||
### 在Ascend310执行推理
|
||||
|
||||
在执行推理前,mindir文件必须通过`export.py`脚本导出。以下展示了使用minir模型执行推理的示例。
|
||||
|
|
|
@@ -14,51 +14,39 @@
# ============================================================================
"""train resnet."""
import os
import argparse
from mindspore import context
from mindspore.common import set_seed
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.CrossEntropySmooth import CrossEntropySmooth

parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--net', type=str, default=None, help='Resnet Model, either resnet18, resnet34, resnet50 or resnet101')
parser.add_argument('--dataset', type=str, default=None, help='Dataset, either cifar10 or imagenet2012')

parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--device_target', type=str, default='Ascend', choices=("Ascend", "GPU", "CPU"),
                    help="Device target, support Ascend, GPU and CPU.")
args_opt = parser.parse_args()
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper

set_seed(1)

if args_opt.net in ("resnet18", "resnet34", "resnet50"):
    if args_opt.net == "resnet18":
if config.net_name in ("resnet18", "resnet34", "resnet50"):
    if config.net_name == "resnet18":
        from src.resnet import resnet18 as resnet
    if args_opt.net == "resnet34":
    if config.net_name == "resnet34":
        from src.resnet import resnet34 as resnet
    if args_opt.net == "resnet50":
    if config.net_name == "resnet50":
        from src.resnet import resnet50 as resnet
    if args_opt.dataset == "cifar10":
        from src.config import config1 as config
    if config.dataset == "cifar10":
        from src.dataset import create_dataset1 as create_dataset
    else:
        from src.config import config2 as config
        from src.dataset import create_dataset2 as create_dataset
elif args_opt.net == "resnet101":
elif config.net_name == "resnet101":
    from src.resnet import resnet101 as resnet
    from src.config import config3 as config
    from src.dataset import create_dataset3 as create_dataset
else:
    from src.resnet import se_resnet50 as resnet
    from src.config import config4 as config
    from src.dataset import create_dataset4 as create_dataset

if __name__ == '__main__':
    target = args_opt.device_target
@moxing_wrapper()
def eval_net():
    """eval net"""
    target = config.device_target

    # init context
    context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
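The new entry point is decorated with `moxing_wrapper` from `src/model_utils/moxing_adapter.py`. The following is a hedged sketch of what such a wrapper typically does on ModelArts, syncing inputs from OBS before the wrapped function runs and outputs back afterwards; the real adapter may differ in details:

```python
# Hedged sketch of a moxing-style wrapper (assumed behavior of
# src/model_utils/moxing_adapter.py; not the exact implementation).
import functools

from src.model_utils.config import config


def moxing_wrapper(pre_process=None):
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped(*args, **kwargs):
            if config.enable_modelarts:
                import moxing as mox  # ModelArts OBS file-transfer library
                # Sync dataset and checkpoint from OBS to the local cache.
                mox.file.copy_parallel(config.data_url, config.data_path)
                if config.checkpoint_url:
                    mox.file.copy_parallel(config.checkpoint_url, config.load_path)
            if pre_process:
                pre_process()
            result = run_func(*args, **kwargs)
            if config.enable_modelarts:
                # Sync training outputs back to OBS.
                mox.file.copy_parallel(config.output_path, config.train_url)
            return result
        return wrapped
    return wrapper
```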
@@ -67,20 +55,19 @@ if __name__ == '__main__':
        context.set_context(device_id=device_id)

    # create dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size,
    dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size,
                             target=target)
    step_size = dataset.get_dataset_size()

    # define net
    net = resnet(class_num=config.class_num)

    # load checkpoint
    param_dict = load_checkpoint(args_opt.checkpoint_path)
    param_dict = load_checkpoint(config.checkpoint_file_path)
    load_param_into_net(net, param_dict)
    net.set_train(False)

    # define loss, model
    if args_opt.dataset == "imagenet2012":
    if config.dataset == "imagenet2012":
        if not config.use_label_smooth:
            config.label_smooth_factor = 0.0
        loss = CrossEntropySmooth(sparse=True, reduction='mean',
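The imported `CrossEntropySmooth` applies label smoothing on top of softmax cross-entropy. A sketch of the idea follows; it is close in spirit to `src/CrossEntropySmooth.py` but not guaranteed to match it line for line:

```python
# Hedged sketch of a label-smoothing cross-entropy loss
# (illustrative; see src/CrossEntropySmooth.py for the real code).
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import Tensor
from mindspore.common import dtype as mstype


class CrossEntropySmooth(nn.Cell):
    def __init__(self, sparse=True, reduction='mean', smooth_factor=0.0, num_classes=1000):
        super(CrossEntropySmooth, self).__init__()
        self.onehot = ops.OneHot()
        self.sparse = sparse
        self.num_classes = num_classes
        # true class gets 1 - smooth_factor; the rest share smooth_factor
        self.on_value = Tensor(1.0 - smooth_factor, mstype.float32)
        self.off_value = Tensor(smooth_factor / (num_classes - 1), mstype.float32)
        self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction)

    def construct(self, logit, label):
        if self.sparse:
            label = self.onehot(label, self.num_classes, self.on_value, self.off_value)
        return self.ce(logit, label)
```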
@@ -93,4 +80,7 @@ if __name__ == '__main__':

    # eval model
    res = model.eval(dataset)
    print("result:", res, "ckpt=", args_opt.checkpoint_path)
    print("result:", res, "ckpt=", config.checkpoint_file_path)

if __name__ == '__main__':
    eval_net()
@@ -16,67 +16,50 @@
##############export checkpoint file into air and onnx models#################
python export.py
"""
import argparse
import os
import numpy as np

from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper

parser = argparse.ArgumentParser(description='resnet export')
parser.add_argument('--network_dataset', type=str, default='resnet50_cifar10', choices=['resnet18_cifar10',
                                                                                        'resnet18_imagenet2012',
                                                                                        'resnet34_imagenet2012',
                                                                                        'resnet50_cifar10',
                                                                                        'resnet50_imagenet2012',
                                                                                        'resnet101_imagenet2012',
                                                                                        "se-resnet50_imagenet2012"],
                    help='network and dataset name.')
parser.add_argument("--device_id", type=int, default=0, help="Device id")
parser.add_argument("--batch_size", type=int, default=1, help="batch size")
parser.add_argument("--ckpt_file", type=str, required=True, help="Checkpoint file path.")
parser.add_argument("--file_name", type=str, default="resnet", help="output file name.")
parser.add_argument('--width', type=int, default=224, help='input width')
parser.add_argument('--height', type=int, default=224, help='input height')
parser.add_argument("--file_format", type=str, choices=["AIR", "ONNX", "MINDIR"], default="AIR", help="file format")
parser.add_argument("--device_target", type=str, default="Ascend",
                    choices=["Ascend", "GPU", "CPU"], help="device target(default: Ascend)")
args = parser.parse_args()
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
if config.device_target == "Ascend":
    context.set_context(device_id=config.device_id)

context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
if args.device_target == "Ascend":
    context.set_context(device_id=args.device_id)
def modelarts_pre_process():
    '''modelarts pre process function.'''
    config.file_name = os.path.join(config.output_path, config.file_name)

if __name__ == '__main__':

    if args.network_dataset == 'resnet18_cifar10':
        from src.config import config1 as config
@moxing_wrapper(pre_process=modelarts_pre_process)
def run_export():
    """run export."""
    if config.network_dataset == 'resnet18_cifar10':
        from src.resnet import resnet18 as resnet
    elif args.network_dataset == 'resnet18_imagenet2012':
        from src.config import config2 as config
    elif config.network_dataset == 'resnet18_imagenet2012':
        from src.resnet import resnet18 as resnet
    elif args.network_dataset == 'resnet34_imagenet2012':
        from src.config import config2 as config
    elif config.network_dataset == 'resnet34_imagenet2012':
        from src.resnet import resnet34 as resnet
    elif args.network_dataset == 'resnet50_cifar10':
        from src.config import config1 as config
    elif config.network_dataset == 'resnet50_cifar10':
        from src.resnet import resnet50 as resnet
    elif args.network_dataset == 'resnet50_imagenet2012':
        from src.config import config2 as config
    elif config.network_dataset == 'resnet50_imagenet2012':
        from src.resnet import resnet50 as resnet
    elif args.network_dataset == 'resnet101_imagenet2012':
        from src.config import config3 as config
    elif config.network_dataset == 'resnet101_imagenet2012':
        from src.resnet import resnet101 as resnet
    elif args.network_dataset == 'se-resnet50_imagenet2012':
        from src.config import config4 as config
    elif config.network_dataset == 'se-resnet50_imagenet2012':
        from src.resnet import se_resnet50 as resnet
    else:
        raise ValueError("network and dataset is not support.")

    net = resnet(config.class_num)

    assert args.ckpt_file is not None, "checkpoint_path is None."
    assert config.checkpoint_file_path is not None, "checkpoint_path is None."

    param_dict = load_checkpoint(args.ckpt_file)
    param_dict = load_checkpoint(config.checkpoint_file_path)
    load_param_into_net(net, param_dict)

    input_arr = Tensor(np.zeros([args.batch_size, 3, args.height, args.width], np.float32))
    export(net, input_arr, file_name=args.file_name, file_format=args.file_format)
    input_arr = Tensor(np.zeros([config.batch_size, 3, config.height, config.width], np.float32))
    export(net, input_arr, file_name=config.file_name, file_format=config.file_format)

if __name__ == '__main__':
    run_export()
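As a quick sanity check of the refactored export flow, the call chain reduces to roughly the following standalone sketch. The paths are placeholders, and the supported entry point remains `export.py` with `--config_path` as documented above:

```python
# Minimal standalone sketch of the export flow (placeholder paths;
# use export.py --config_path ... for the supported workflow).
import numpy as np
from mindspore import Tensor, context, export, load_checkpoint, load_param_into_net

from src.resnet import resnet50

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
net = resnet50(class_num=1001)
load_param_into_net(net, load_checkpoint("resnet50.ckpt"))  # placeholder checkpoint
dummy_input = Tensor(np.zeros([1, 3, 224, 224], np.float32))  # NCHW input shape
export(net, dummy_input, file_name="resnet50", file_format="MINDIR")
```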
@@ -13,8 +13,7 @@
# limitations under the License.
# ============================================================================
"""train resnet."""
import argparse
import ast
import os
import time
import numpy as np
from mindspore import context
@@ -34,25 +33,11 @@ import mindspore.dataset.vision.c_transforms as C
from src.resnet_gpu_benchmark import resnet50 as resnet
from src.CrossEntropySmooth import CrossEntropySmooth
from src.momentum import Momentum as MomentumWeightDecay

parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--batch_size', type=str, default="256", help='Batch_size: default 256.')
parser.add_argument('--epoch_size', type=str, default="2", help='Epoch_size: default 2')
parser.add_argument('--print_per_steps', type=str, default="20", help='Print loss and time per steps: default 20')
parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, help='Run distribute')
parser.add_argument('--save_ckpt', type=ast.literal_eval, default=False, help='Save ckpt or not: default False')
parser.add_argument('--eval', type=ast.literal_eval, default=False, help='Eval ckpt : default False')
parser.add_argument('--dataset_path', type=str, default=None, help='Imagenet dataset path')
parser.add_argument('--ckpt_path', type=str, default="./", help='The path to save ckpt if save_ckpt is True;\
        Or the ckpt model file when eval is True')
parser.add_argument('--mode', type=str, default="GRAPH", choices=["GRAPH", "PYNATIVE"], help='Execute mode')
parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16", \
        help='Compute data type fp32 or fp16: default fp16')
args_opt = parser.parse_args()
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper

set_seed(1)


class MyTimeMonitor(Callback):
    def __init__(self, batch_size, sink_size, dataset_size, mode):
        super(MyTimeMonitor, self).__init__()
@@ -95,7 +80,7 @@ class MyTimeMonitor(Callback):

def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU", dtype="fp16",
                   device_num=1):
    if args_opt.mode == "GRAPH":
    if config.mode_name == "GRAPH":
        ds_num_parallel_worker = 4
        map_num_parallel_worker = 8
        batch_num_parallel_worker = None
@@ -116,7 +101,7 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="
    # define map operations
    normalize_op = C.Normalize(mean=mean, std=std)
    if dtype == "fp16":
        if args_opt.eval:
        if config.eval:
            x_dtype = "float32"
        else:
            x_dtype = "float16"
@@ -161,25 +146,26 @@ def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per
    return lr_each_step


@moxing_wrapper()
def train():
    # set args
    dev = "GPU"
    epoch_size = int(args_opt.epoch_size)
    total_batch = int(args_opt.batch_size)
    print_per_steps = int(args_opt.print_per_steps)
    compute_type = str(args_opt.dtype).lower()
    ckpt_save_dir = str(args_opt.ckpt_path)
    save_ckpt = bool(args_opt.save_ckpt)
    epoch_size = int(config.epoch_size)
    total_batch = int(config.batch_size)
    print_per_steps = int(config.print_per_steps)
    compute_type = str(config.dtype).lower()
    save_ckpt = bool(config.save_ckpt)
    device_num = 1
    # init context
    if args_opt.mode == "GRAPH":
    if config.mode_name == "GRAPH":
        mode = context.GRAPH_MODE
        all_reduce_fusion_config = [85, 160]
    else:
        mode = context.PYNATIVE_MODE
        all_reduce_fusion_config = [30, 90, 160]
    context.set_context(mode=mode, device_target=dev, save_graphs=False)
    if args_opt.run_distribute:
    ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path)
    if config.run_distribute:
        init()
        device_num = get_group_size()
        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
@@ -251,21 +237,21 @@ def train():
    else:
        model.train(epoch_size, dataset, callbacks=cb)


@moxing_wrapper()
def eval_():
    # set args
    dev = "GPU"
    compute_type = str(args_opt.dtype).lower()
    ckpt_dir = str(args_opt.ckpt_path)
    total_batch = int(args_opt.batch_size)
    compute_type = str(config.dtype).lower()
    ckpt_dir = str(config.checkpoint_file_path)
    total_batch = int(config.batch_size)
    # init context
    if args_opt.mode == "GRAPH":
    if config.mode_name == "GRAPH":
        mode = context.GRAPH_MODE
    else:
        mode = context.PYNATIVE_MODE
    context.set_context(mode=mode, device_target=dev, save_graphs=False)
    # create dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, repeat_num=1,
    dataset = create_dataset(dataset_path=config.data_path, do_train=False, repeat_num=1,
                             batch_size=total_batch, target=dev, dtype=compute_type)
    # define net
    net = resnet(class_num=1001, dtype=compute_type)
@@ -284,7 +270,7 @@ def eval_():


if __name__ == '__main__':
    if not args_opt.eval:
    if not config.eval:
        train()
    else:
        eval_()
@@ -14,43 +14,27 @@
# ============================================================================
"""train resnet."""
import os
import argparse
import numpy as np
from mindspore import Tensor
from mindspore import context
from mindspore.common import set_seed
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper

parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--net', type=str, default=None, help='Resnet Model, either resnet18, resnet50 or resnet101')
parser.add_argument('--dataset', type=str, default=None, help='Dataset, imagenet2012')

parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--device_target', type=str, default='Ascend', choices=("Ascend", "GPU", "CPU"),
                    help="Device target, support Ascend, GPU and CPU.")
args_opt = parser.parse_args()

set_seed(1)

if args_opt.dataset != "imagenet2012":
if config.dataset != "imagenet2012":
    raise ValueError("Currently only support imagenet2012 dataset format")
if args_opt.net in ("resnet18", "resnet50"):
    if args_opt.net == "resnet18":
if config.net_name in ("resnet18", "resnet50"):
    if config.net_name == "resnet18":
        from src.resnet import resnet18 as resnet
    if args_opt.net == "resnet50":
    if config.net_name == "resnet50":
        from src.resnet import resnet50 as resnet
    from src.config import config2 as config
    from src.dataset_infer import create_dataset

elif args_opt.net == "resnet101":
elif config.net_name == "resnet101":
    from src.resnet import resnet101 as resnet
    from src.config import config3 as config
    from src.dataset_infer import create_dataset2 as create_dataset
else:
    from src.resnet import se_resnet50 as resnet
    from src.config import config4 as config
    from src.dataset_infer import create_dataset3 as create_dataset
@@ -67,9 +51,9 @@ def show_predict_info(label_list, prediction_list, filename_list, predict_ng):
                  "label is {}".format(filename, predict_index, label_index))
    return predict_ng, label_index


if __name__ == '__main__':
    target = args_opt.device_target
@moxing_wrapper()
def infer_net():
    target = config.device_target

    # init context
    context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
@@ -78,7 +62,7 @@ if __name__ == '__main__':
        context.set_context(device_id=device_id)

    # create dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size,
    dataset = create_dataset(dataset_path=config.data_path, do_train=False, batch_size=config.batch_size,
                             target=target)
    step_size = dataset.get_dataset_size()
@@ -86,7 +70,7 @@ if __name__ == '__main__':
    net = resnet(class_num=config.class_num)

    # load checkpoint
    param_dict = load_checkpoint(args_opt.checkpoint_path)
    param_dict = load_checkpoint(config.checkpoint_file_path)
    load_param_into_net(net, param_dict)
    net.set_train(False)
@@ -95,7 +79,7 @@ if __name__ == '__main__':
    total_sample = step_size * config.batch_size
    only_file = 0
    data_loader = dataset.create_dict_iterator(output_numpy=True, num_epochs=1)
    for i, data in enumerate(data_loader):
    for _, data in enumerate(data_loader):
        images = data["image"]
        label = data["label"]
        file_name = data["filename"]
@@ -109,3 +93,6 @@ if __name__ == '__main__':
        print(f"total {total_sample} data, top1 acc is {(total_sample - len(predict_negative)) * 1.0 / total_sample}")
    else:
        print("infer completed")

if __name__ == '__main__':
    infer_net()
@@ -17,7 +17,6 @@ import os
import json
import argparse
import numpy as np
from src.config import config2 as config

batch_size = 1
parser = argparse.ArgumentParser(description="resnet inference")
@@ -63,14 +62,14 @@ def cal_acc_imagenet(result_path, label_path):
    files = os.listdir(result_path)
    with open(label_path, "r") as label:
        labels = json.load(label)

    result_shape = (1, 1001)
    top1 = 0
    top5 = 0
    total_data = len(files)
    for file in files:
        img_ids_name = file.split('_0.')[0]
        data_path = os.path.join(result_path, img_ids_name + "_0.bin")
        result = np.fromfile(data_path, dtype=np.float32).reshape(batch_size, config.class_num)
        result = np.fromfile(data_path, dtype=np.float32).reshape(result_shape)
        for batch in range(batch_size):
            predict = np.argsort(-result[batch], axis=-1)
            if labels[img_ids_name+".JPEG"] == predict[0]:
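The remainder of `cal_acc_imagenet` (elided by the hunk) accumulates top-1 and top-5 hits from `predict`. As a standalone reference, top-k membership over per-image logits can be computed as in this illustrative sketch, which is not the file's exact code:

```python
# Illustrative top-k accuracy helper (not the exact code of the file above).
import numpy as np


def topk_hit(logits, label, k):
    """True if `label` is among the k highest-scoring classes."""
    return label in np.argsort(-logits, axis=-1)[:k]


# usage sketch, mirroring the loop in cal_acc_imagenet:
#   top1 += topk_hit(result[batch], labels[img_ids_name + ".JPEG"], 1)
#   top5 += topk_hit(result[batch], labels[img_ids_name + ".JPEG"], 5)
#   top1_accuracy = top1 / total_data
```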
@@ -0,0 +1,75 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''

# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 1001
batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 120
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 5
keep_checkpoint_max: 10
warmup_epochs: 0
lr_decay_mode: "cosine"
use_label_smooth: True
label_smooth_factor: 0.1
lr: 0.1

net_name: "resnet101"
dataset: "imagenet2012"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"

# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet101"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet101_imagenet2012"

---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
@@ -0,0 +1,75 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''

# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 10
batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 90
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 5
keep_checkpoint_max: 10
warmup_epochs: 5
lr_decay_mode: "poly"
lr_init: 0.01
lr_end: 0.00001
lr_max: 0.1

net_name: "resnet18"
dataset: "cifar10"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"

# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet18"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet18_cifar10"

---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
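The configs in this change differ mainly in their schedule keys: `lr_decay_mode` ("poly", "linear", or "cosine"), `warmup_epochs`, and the `lr_init`/`lr_max`/`lr_end` triple. The sketch below illustrates what per-step schedules of those three shapes typically look like; the repository generates its own LR arrays, so treat the exact formulas here (especially the quadratic "poly" decay) as assumptions for illustration.

```python
# Illustrative per-step LR schedules for the modes named in these configs
# ("poly", "linear", "cosine") with linear warmup. The exact formulas live
# in the repo's LR generator; this sketch only mirrors the config semantics.
import math

def build_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs,
             steps_per_epoch, decay_mode):
    total_steps = total_epochs * steps_per_epoch
    warmup_steps = warmup_epochs * steps_per_epoch
    lrs = []
    for step in range(total_steps):
        if step < warmup_steps:
            # linear warmup from lr_init up to lr_max
            lr = lr_init + (lr_max - lr_init) * (step + 1) / warmup_steps
        else:
            frac = (step - warmup_steps) / max(1, total_steps - warmup_steps)
            if decay_mode == "linear":
                lr = lr_max - (lr_max - lr_end) * frac
            elif decay_mode == "cosine":
                lr = lr_end + 0.5 * (lr_max - lr_end) * (1 + math.cos(math.pi * frac))
            else:  # "poly": a quadratic-style polynomial decay toward lr_end
                lr = (lr_max - lr_end) * (1 - frac) ** 2 + lr_end
        lrs.append(lr)
    return lrs

# e.g. the resnet18/cifar10 settings above (50000 images / batch 32 ~ 1562 steps):
schedule = build_lr(0.01, 0.00001, 0.1, warmup_epochs=5,
                    total_epochs=90, steps_per_epoch=1562, decay_mode="poly")
```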
@@ -0,0 +1,77 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''

# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 1001
batch_size: 256
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 90
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 5
keep_checkpoint_max: 10
warmup_epochs: 0
lr_decay_mode: "linear"
use_label_smooth: True
label_smooth_factor: 0.1
lr_init: 0
lr_max: 0.8
lr_end: 0.0

net_name: "resnet18"
dataset: "imagenet2012"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"

# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet18"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet18_imagenet2012"

---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
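The ImageNet configs also turn on label smoothing (`use_label_smooth: True`, `label_smooth_factor: 0.1`, `class_num: 1001`). One common formulation of smoothed targets, shown here with numpy as an illustration of what those keys control:

```python
# Sketch: build label-smoothed one-hot targets, as controlled by the
# use_label_smooth / label_smooth_factor / class_num keys above.
import numpy as np

def smooth_labels(labels, class_num, smooth_factor):
    """labels: int array of shape (batch,). Returns (batch, class_num) floats."""
    on_value = 1.0 - smooth_factor
    off_value = smooth_factor / (class_num - 1)
    targets = np.full((labels.shape[0], class_num), off_value, dtype=np.float32)
    targets[np.arange(labels.shape[0]), labels] = on_value
    return targets

# With class_num=1001 and smooth_factor=0.1, the true class gets 0.9 and
# every other class gets 0.1/1000 = 1e-4.
t = smooth_labels(np.array([5, 42]), class_num=1001, smooth_factor=0.1)
```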
@@ -0,0 +1,77 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''

# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 1001
batch_size: 256
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 90
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 5
keep_checkpoint_max: 10
warmup_epochs: 0
lr_decay_mode: "linear"
use_label_smooth: True
label_smooth_factor: 0.1
lr_init: 0
lr_max: 0.8
lr_end: 0.0

net_name: "resnet34"
dataset: "imagenet2012"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"

# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet34"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet34_imagenet2012"

---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
@@ -0,0 +1,75 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''

# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 10
batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 90
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 5
keep_checkpoint_max: 10
warmup_epochs: 5
lr_decay_mode: "poly"
lr_init: 0.01
lr_end: 0.00001
lr_max: 0.1

net_name: "resnet50"
dataset: "cifar10"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"

# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet50"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet50_cifar10"

---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
@@ -0,0 +1,78 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''

# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 1001
batch_size: 32
loss_scale: 128
momentum: 0.9
weight_decay: 0.0005
epoch_size: 45
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 2
keep_checkpoint_max: 15
use_label_smooth: True
label_smooth_factor: 0.1
lr_init: 0.05803
lr_decay: 4.04839
lr_end_epoch: 53
damping_init: 0.02714
damping_decay: 0.50036
frequency: 834

net_name: "resnet50"
dataset: "imagenet2012"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"

# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet50"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"

---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
@@ -0,0 +1,78 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''

# ==============================================================================
# Training options
optimizer: 'Thor'
infer_label: ""
class_num: 1001
batch_size: 32
loss_scale: 128
momentum: 0.9
weight_decay: 0.0005
epoch_size: 40
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 1
keep_checkpoint_max: 15
use_label_smooth: True
label_smooth_factor: 0.1
lr_init: 0.05672
lr_decay: 4.9687
lr_end_epoch: 50
damping_init: 0.02345
damping_decay: 0.5467
frequency: 834

net_name: "resnet50"
dataset: "imagenet2012"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"

# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet50"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"

---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
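The two configs above carry the keys of the second-order THOR optimizer. What sets them apart from the Momentum configs is the damping schedule (`damping_init`, `damping_decay`) and `frequency`: the interval, in steps, at which THOR refreshes its second-order statistics while cheaper preconditioned updates run in between. A hedged illustration of that cadence follows; the helper names are placeholders, not the optimizer's real API.

```python
# Illustration of the update cadence implied by the `frequency` key: THOR
# recomputes its (expensive) second-order statistics every `frequency` steps
# and reuses the cached values in between. All helper names are placeholders,
# not the real optimizer API.
def train_with_thor_cadence(total_steps, frequency, run_step,
                            refresh_curvature, apply_preconditioned_update):
    for step in range(total_steps):
        grads = run_step(step)              # forward/backward for one batch
        if step % frequency == 0:
            refresh_curvature(grads)        # expensive: rebuild curvature info
        apply_preconditioned_update(grads)  # cheap: use cached curvature

# e.g. frequency=834, as in both Thor configs above: the curvature estimate
# is refreshed every 834 training steps.
```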
@@ -0,0 +1,77 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''

# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 1001
batch_size: 256
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 90
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 5
keep_checkpoint_max: 10
warmup_epochs: 0
lr_decay_mode: "linear"
use_label_smooth: True
label_smooth_factor: 0.1
lr_init: 0
lr_max: 0.8
lr_end: 0.0

net_name: "resnet50"
dataset: "imagenet2012"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"

# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet50"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet50_imagenet2012"

---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
@@ -0,0 +1,52 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''

# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
batch_size: 256
epoch_size: 2
print_per_steps: 20
eval: False
save_ckpt: False
mode_name: "GRAPH"
dtype: "fp16"

# Export options
device_id: 0
width: 224
height: 224
file_name: "resnet"
file_format: "AIR"
ckpt_file: ""
network_dataset: "resnet50_cifar10"

---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
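The benchmark config runs only a couple of epochs (`epoch_size: 2`) and logs every `print_per_steps: 20` batches; together with `batch_size: 256` that is enough to derive throughput. A small sketch of the imgs/sec arithmetic (the actual timing and printing lives in the benchmark script):

```python
# Sketch: throughput arithmetic implied by batch_size / print_per_steps.
import time

def timed_steps(step_fn, num_steps, batch_size=256, print_per_steps=20):
    start = time.time()
    for step in range(1, num_steps + 1):
        step_fn()
        if step % print_per_steps == 0:
            elapsed = time.time() - start
            imgs_per_sec = batch_size * print_per_steps / elapsed
            print(f"step {step}: {imgs_per_sec:.1f} imgs/sec")
            start = time.time()

# timed_steps(lambda: None, num_steps=100)  # swap the lambda for a real train step
```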
@@ -17,37 +17,13 @@
CURPATH="$(dirname "$0")"
. ${CURPATH}/cache_util.sh

if [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ]
if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
then
echo "Usage: bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo " bash run_distribute_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo " bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
exit 1
fi

if [ $1 != "resnet18" ] && [ $1 != "resnet34" ] && [ $1 != "resnet50" ] && [ $1 != "resnet101" ] && [ $1 != "se-resnet50" ]
then
echo "error: the selected net is neither resnet50 nor resnet101 and se-resnet50"
exit 1
fi

if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi

if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: training resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi

if [ $1 == "se-resnet50" ] && [ $2 == "cifar10" ]
then
echo "error: evaluating se-resnet50 with cifar10 dataset is unsupported now!"
exit 1
fi

get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"

@@ -56,18 +32,19 @@ get_real_path(){
fi
}

PATH1=$(get_real_path $3)
PATH2=$(get_real_path $4)
PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
CONFIG_FILE=$3

if [ $# == 5 ]
if [ $# == 4 ]
then
PATH3=$(get_real_path $5)
PATH3=$(get_real_path $4)
fi

if [ $# == 6 ]
if [ $# == 5 ]
then
RUN_EVAL=$5
EVAL_DATASET_PATH=$(get_real_path $6)
RUN_EVAL=$4
EVAL_DATASET_PATH=$(get_real_path $5)
fi

if [ ! -f $PATH1 ]

@@ -82,7 +59,7 @@ then
exit 1
fi

if [ $# == 5 ] && [ ! -f $PATH3 ]
if [ $# == 4 ] && [ ! -f $PATH3 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file"
exit 1

@@ -123,24 +100,28 @@ do
mkdir ./train_parallel$i
cp ../*.py ./train_parallel$i
cp *.sh ./train_parallel$i
cp -r ../*.yaml ./train_parallel$i
cp -r ../src ./train_parallel$i
cd ./train_parallel$i || exit
echo "start training for rank $RANK_ID, device $DEVICE_ID"
env > env.log
if [ $# == 4 ]
if [ $# == 3 ]
then
taskset -c $cmdopt python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
taskset -c $cmdopt python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 \
--config_path=$CONFIG_FILE --output_path './output' &> log &
fi

if [ $# == 5 ]
if [ $# == 4 ]
then
taskset -c $cmdopt python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 &> log &
taskset -c $cmdopt python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 --pre_trained=$PATH3 \
--config_path=$CONFIG_FILE --output_path './output' &> log &
fi

if [ $# == 6 ]
if [ $# == 5 ]
then
taskset -c $cmdopt python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 \
--run_eval=$RUN_EVAL --eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID &> log &
taskset -c $cmdopt python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 \
--run_eval=$RUN_EVAL --eval_data_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID \
--config_path=$CONFIG_FILE --output_path './output' &> log &
if [ "x${RUN_EVAL}" == "xTrue" ]
then
echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\""
@@ -17,32 +17,13 @@
CURPATH="$(dirname "$0")"
. ${CURPATH}/cache_util.sh

if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ]
then
echo "Usage: bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo " bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
echo "Usage: bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo " bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
exit 1
fi

if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
then
echo "error: the selected net is neither resnet50 nor resnet101"
exit 1
fi

if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi

if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: training resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi

get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"

@@ -51,17 +32,18 @@ get_real_path(){
fi
}

PATH1=$(get_real_path $3)
PATH1=$(get_real_path $1)
CONFIG_FILE=$2

if [ $# == 4 ]
if [ $# == 3 ]
then
PATH2=$(get_real_path $4)
PATH2=$(get_real_path $3)
fi

if [ $# == 5 ]
if [ $# == 4 ]
then
RUN_EVAL=$4
EVAL_DATASET_PATH=$(get_real_path $5)
RUN_EVAL=$3
EVAL_DATASET_PATH=$(get_real_path $4)
fi

@@ -71,7 +53,7 @@ then
exit 1
fi

if [ $# == 5 ] && [ ! -f $PATH2 ]
if [ $# == 4 ] && [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
exit 1

@@ -97,29 +79,30 @@ rm -rf ./train_parallel
mkdir ./train_parallel
cp ../*.py ./train_parallel
cp *.sh ./train_parallel
cp -r ../*.yaml ./train_parallel
cp -r ../src ./train_parallel
cd ./train_parallel || exit

if [ $# == 2 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --config_path=$CONFIG_FILE --run_distribute=True --device_num=$DEVICE_NUM \
--device_target="GPU" --data_path=$PATH1 --output_path './output' &> log &
fi

if [ $# == 3 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log &
fi

if [ $# == 4 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
python train.py --config_path=$CONFIG_FILE --run_distribute=True --device_num=$DEVICE_NUM \
--device_target="GPU" --data_path=$PATH1 --pre_trained=$PATH2 --output_path './output' &> log &
fi

if [ $# == 5 ]
if [ $# == 4 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --run_eval=$RUN_EVAL \
--eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID &> log &
python train.py --config_path=$CONFIG_FILE --run_distribute=True --device_num=$DEVICE_NUM \
--device_target="GPU" --data_path=$PATH1 --run_eval=$RUN_EVAL --eval_data_path=$EVAL_DATASET_PATH \
--enable_cache=True --cache_session_id=$CACHE_SESSION_ID --output_path './output' &> log &
if [ "x${RUN_EVAL}" == "xTrue" ]
then
echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\""
@@ -14,33 +14,9 @@
# limitations under the License.
# ============================================================================

if [ $# != 4 ]
if [ $# != 3 ]
then
echo "Usage: bash run_eval.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]"
exit 1
fi

if [ $1 != "resnet18" ] && [ $1 != "resnet34" ] && [ $1 != "resnet50" ] && [ $1 != "resnet101" ] && [ $1 != "se-resnet50" ]
then
echo "error: the selected net is neither resnet50 nor resnet101 nor se-resnet50"
exit 1
fi

if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi

if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: evaluating resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi

if [ $1 == "se-resnet50" ] && [ $2 == "cifar10" ]
then
echo "error: evaluating se-resnet50 with cifar10 dataset is unsupported now!"
echo "Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]"
exit 1
fi

@@ -52,8 +28,9 @@ get_real_path(){
fi
}

PATH1=$(get_real_path $3)
PATH2=$(get_real_path $4)
PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
CONFIG_FILE=$3

if [ ! -d $PATH1 ]

@@ -81,9 +58,10 @@ fi
mkdir ./eval
cp ../*.py ./eval
cp *.sh ./eval
cp -r ../*.yaml ./eval
cp -r ../src ./eval
cd ./eval || exit
env > env.log
echo "start evaluation for device $DEVICE_ID"
python eval.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --checkpoint_path=$PATH2 &> log &
python eval.py --data_path=$PATH1 --checkpoint_file_path=$PATH2 --config_path=$CONFIG_FILE &> log &
cd ..
@@ -14,31 +14,12 @@
# limitations under the License.
# ============================================================================

if [ $# != 4 ]
if [ $# != 3 ]
then
echo "Usage: bash run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]"
echo "Usage: bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]"
exit 1
fi

if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
then
echo "error: the selected net is neither resnet50 nor resnet101"
exit 1
fi

if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi

if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: evaluating resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi

get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"

@@ -47,8 +28,9 @@ get_real_path(){
fi
}

PATH1=$(get_real_path $3)
PATH2=$(get_real_path $4)
PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
CONFIG_FILE=$3

if [ ! -d $PATH1 ]

@@ -76,9 +58,11 @@ fi
mkdir ./eval
cp ../*.py ./eval
cp *.sh ./eval
cp -r ../*.yaml ./eval
cp -r ../src ./eval
cd ./eval || exit
env > env.log
echo "start evaluation for device $DEVICE_ID"
python eval.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log &
python eval.py --data_path=$PATH1 --checkpoint_file_path=$PATH2 --device_target="GPU" \
--config_path=$CONFIG_FILE &> log &
cd ..
@@ -14,11 +14,11 @@
# limitations under the License.
# ============================================================================

if [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ]
then
echo "Usage: bash run_eval_gpu_resnet_benchmark.sh [DATASET_PATH] [CKPT_PATH] [BATCH_SIZE](optional) \
echo "Usage: bash run_eval_gpu_resnet_benchmark.sh [DATASET_PATH] [CKPT_PATH] [CONFIG_PATH] [BATCH_SIZE](optional) \
[DTYPE](optional)"
echo "Example: sh run_eval_gpu_resnet_benchmark.sh /path/imagenet/train /path/ckpt 256 FP16"
echo "Example: sh run_eval_gpu_resnet_benchmark.sh /path/imagenet/train /path/ckpt /*.yaml 256 FP16"
exit 1
fi

@@ -33,19 +33,19 @@ get_real_path(){
DATAPATH=$(get_real_path $1)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
if [ $# == 2 ]
then
python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH --eval=True --ckpt_path=$2
fi

if [ $# == 3 ]
then
python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH --eval=True --ckpt_path=$2 \
--batch_size=$3
python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --eval=True --checkpoint_file_path=$2 --config_path=$3
fi

if [ $# == 4 ]
then
python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH --eval=True --ckpt_path=$2 \
--batch_size=$3 --dtype=$4
python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --eval=True --checkpoint_file_path=$2 \
--config_path=$3 --batch_size=$4
fi

if [ $# == 5 ]
then
python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --eval=True --checkpoint_file_path=$2 \
--config_path=$3 --batch_size=$4 --dtype=$5
fi
@@ -14,11 +14,11 @@
# limitations under the License.
# ============================================================================

if [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] && [ $# != 6 ]
then
echo "Usage: bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](optional)\
echo "Usage: bash run_gpu_resnet_benchmark.sh [DATASET_PATH] [CONFIG_PATH] [BATCH_SIZE](optional) [DTYPE](optional)\
[DEVICE_NUM](optional) [SAVE_CKPT](optional) [SAVE_PATH](optional)"
echo "Example: sh run_gpu_resnet_benchmark.sh /path/imagenet/train 256 FP16 8 true /path/ckpt"
echo "Example: sh run_gpu_resnet_benchmark.sh /path/imagenet/train /*yaml 256 FP16 8 true /path/ckpt"
exit 1
fi

@@ -33,35 +33,35 @@ get_real_path(){
DATAPATH=$(get_real_path $1)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
if [ $# == 1 ]
then
python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH
fi

if [ $# == 2 ]
then
python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH --batch_size=$2
python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --config_path=$2
fi

if [ $# == 3 ]
then
python ${self_path}/../gpu_resnet_benchmark.py --dataset_path=$DATAPATH --batch_size=$2 --dtype=$3
python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --config_path=$2 --batch_size=$3
fi

if [ $# == 4 ]
then
mpirun --allow-run-as-root -n $4 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \
--dataset_path=$DATAPATH --batch_size=$2 --dtype=$3
python ${self_path}/../gpu_resnet_benchmark.py --data_path=$DATAPATH --config_path=$2 --batch_size=$3 --dtype=$4
fi

if [ $# == 5 ]
then
mpirun --allow-run-as-root -n $4 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \
--dataset_path=$DATAPATH --batch_size=$2 --dtype=$3 --save_ckpt=$5
mpirun --allow-run-as-root -n $5 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \
--data_path=$DATAPATH --config_path=$2 --batch_size=$3 --dtype=$4
fi

if [ $# == 6 ]
then
mpirun --allow-run-as-root -n $4 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \
--dataset_path=$DATAPATH --batch_size=$2 --dtype=$3 --save_ckpt=$5 --ckpt_path=$6
mpirun --allow-run-as-root -n $5 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \
--data_path=$DATAPATH --config_path=$2 --batch_size=$3 --dtype=$4 --save_ckpt=$6
fi

if [ $# == 7 ]
then
mpirun --allow-run-as-root -n $5 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \
--data_path=$DATAPATH --config_path=$2 --batch_size=$3 --dtype=$4 --save_ckpt=$6 --checkpoint_file_path=$7
fi
@@ -14,25 +14,12 @@
# limitations under the License.
# ============================================================================

if [ $# != 4 ]
if [ $# != 3 ]
then
echo "Usage: bash run_eval.sh [resnet18|resnet50|resnet101|se-resnet50] [imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH]"
echo "Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] [CONFIG_PATH]"
exit 1
fi

if [ $1 != "resnet18" ] && [ $1 != "resnet50" ] && [ $1 != "resnet101" ] && [ $1 != "se-resnet50" ]
then
echo "error: the selected net is neither resnet50 nor resnet101 nor se-resnet50"
exit 1
fi

if [ $2 != "imagenet2012" ]
then
echo "error: only support imagenet2012"
exit 1
fi

get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"

@@ -41,8 +28,9 @@ get_real_path(){
fi
}

PATH1=$(get_real_path $3)
PATH2=$(get_real_path $4)
PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
CONFIG_FILE=$3

if [ ! -d $PATH1 ]

@@ -68,11 +56,12 @@ then
rm -rf ./infer
fi
mkdir ./infer
cp ../*.yaml ./infer
cp ../*.py ./infer
cp *.sh ./infer
cp -r ../src ./infer
cd ./infer || exit
env > env.log
echo "start evaluation for device $DEVICE_ID"
python infer.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --checkpoint_path=$PATH2 &> log &
python infer.py --data_path=$PATH1 --checkpoint_file_path=$PATH2 --config_path=$CONFIG_FILE &> log &
cd ..
@@ -14,31 +14,12 @@
# limitations under the License.
# ============================================================================

if [ $# != 4 ] && [ $# != 5 ]
if [ $# != 3 ] && [ $# != 4 ]
then
echo "Usage: bash run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
fi

if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
then
echo "error: the selected net is neither resnet50 nor resnet101"
exit 1
fi

if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi

if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: training resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi

get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"

@@ -47,12 +28,13 @@ get_real_path(){
fi
}

PATH1=$(get_real_path $3)
PATH2=$(get_real_path $4)
PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
CONFIG_FILE=$3

if [ $# == 5 ]
if [ $# == 4 ]
then
PATH3=$(get_real_path $5)
PATH3=$(get_real_path $4)
fi

if [ ! -f $PATH1 ]

@@ -67,7 +49,7 @@ then
exit 1
fi

if [ $# == 5 ] && [ ! -f $PATH3 ]
if [ $# == 4 ] && [ ! -f $PATH3 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file"
exit 1

@@ -89,19 +71,22 @@ export DEVICE_ID=0
export RANK_ID=0
rm -rf ./sched
mkdir ./sched
cp ../*.yaml ./sched
cp ../*.py ./sched
cp *.sh ./sched
cp -r ../src ./sched
cd ./sched || exit
echo "start scheduler"
if [ $# == 4 ]
if [ $# == 3 ]
then
python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True &> sched.log &
python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True \
--config_path=$CONFIG_FILE --output_path './output' &> sched.log &
fi

if [ $# == 5 ]
if [ $# == 4 ]
then
python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> sched.log &
python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 \
--config_path=$CONFIG_FILE --output_path './output' &> sched.log &
fi
cd ..

@@ -112,19 +97,22 @@ do
export RANK_ID=$i
rm -rf ./server_$i
mkdir ./server_$i
cp ../*.yaml ./server_$i
cp ../*.py ./server_$i
cp *.sh ./server_$i
cp -r ../src ./server_$i
cd ./server_$i || exit
echo "start server"
if [ $# == 4 ]
if [ $# == 3 ]
then
python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True &> server_$i.log &
python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True \
--config_path=$CONFIG_FILE --output_path './output' &> server_$i.log &
fi

if [ $# == 5 ]
if [ $# == 4 ]
then
python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> server_$i.log &
python train.py --run_distribute=True --device_num=1 --data_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 \
--config_path=$CONFIG_FILE --output_path './output' &> server_$i.log &
fi

cd ..

@@ -137,20 +125,23 @@ do
export RANK_ID=$i
rm -rf ./worker_$i
mkdir ./worker_$i
cp ../*.yaml ./worker_$i
cp ../*.py ./worker_$i
cp *.sh ./worker_$i
cp -r ../src ./worker_$i
cd ./worker_$i || exit
echo "start training for worker rank $RANK_ID, device $DEVICE_ID"
env > env.log
if [ $# == 4 ]
if [ $# == 3 ]
then
python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --parameter_server=True &> worker_$i.log &
python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 --parameter_server=True \
--config_path=$CONFIG_FILE --output_path './output' &> worker_$i.log &
fi

if [ $# == 5 ]
if [ $# == 4 ]
then
python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> worker_$i.log &
python train.py --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 \
--config_path=$CONFIG_FILE --output_path './output' &> worker_$i.log &
fi

cd ..
@@ -14,31 +14,12 @@
# limitations under the License.
# ============================================================================

if [ $# != 3 ] && [ $# != 4 ]
if [ $# != 2 ] && [ $# != 3 ]
then
echo "Usage: bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo "Usage: bash run_distribute_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
fi

if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
then
echo "error: the selected net is neither resnet50 nor resnet101"
exit 1
fi

if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi

if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: training resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi

get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"

@@ -47,11 +28,11 @@ get_real_path(){
fi
}

PATH1=$(get_real_path $3)

if [ $# == 4 ]
PATH1=$(get_real_path $1)
CONFIG_FILE=$2
if [ $# == 3 ]
then
PATH2=$(get_real_path $4)
PATH2=$(get_real_path $3)
fi

@@ -61,7 +42,7 @@ then
exit 1
fi

if [ $# == 5 ] && [ ! -f $PATH2 ]
if [ $# == 4 ] && [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
exit 1

@@ -79,22 +60,23 @@ export MS_SCHED_PORT=8081
export MS_ROLE=MS_SCHED
rm -rf ./sched
mkdir ./sched
cp ../*.yaml ./sched
cp ../*.py ./sched
cp *.sh ./sched
cp -r ../src ./sched
cd ./sched || exit
if [ $# == 2 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \
--data_path=$PATH1 --parameter_server=True --config_path=$CONFIG_FILE --output_path './output' &> sched.log &
fi

if [ $# == 3 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> sched.log &
fi

if [ $# == 4 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> sched.log &
python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \
--data_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 --config_path=$CONFIG_FILE --output_path './output' &> sched.log &
fi
cd ..

@@ -103,22 +85,24 @@ for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ./server_$i
mkdir ./server_$i
cp ../*.yaml ./server_$i
cp ../*.py ./server_$i
cp *.sh ./server_$i
cp -r ../src ./server_$i
cd ./server_$i || exit
if [ $# == 2 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \
--data_path=$PATH1 --parameter_server=True --config_path=$CONFIG_FILE --output_path './output' &> server_$i.log &
fi

if [ $# == 3 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> server_$i.log &
fi

if [ $# == 4 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> server_$i.log &
python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \
--data_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 \
--config_path=$CONFIG_FILE --output_path './output' &> server_$i.log &
fi
cd ..
done

@@ -126,21 +110,23 @@ done
export MS_ROLE=MS_WORKER
rm -rf ./worker
mkdir ./worker
cp ../*.yaml ./worker
cp ../*.py ./worker
cp *.sh ./worker
cp -r ../src ./worker
cd ./worker || exit
if [ $# == 2 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \
--data_path=$PATH1 --parameter_server=True --config_path=$CONFIG_FILE --output_path './output' &> worker.log &
fi

if [ $# == 3 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> worker.log &
fi

if [ $# == 4 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> worker.log &
python train.py --run_distribute=True --device_num=$DEVICE_NUM --device_target="GPU" \
--data_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 \
--config_path=$CONFIG_FILE --output_path './output' &> worker.log &
fi
cd ..
@@ -17,34 +17,10 @@
CURPATH="$(dirname "$0")"
. ${CURPATH}/cache_util.sh

if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ]
then
echo "Usage: bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo " bash run_standalone_train.sh [resnet18|resnet34|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
exit 1
fi

if [ $1 != "resnet18" ] && [ $1 != "resnet34" ] && [ $1 != "resnet50" ] && [ $1 != "resnet101" ] && [ $1 != "se-resnet50" ]
then
echo "error: the selected net is neither resnet50 nor resnet101 and se-resnet50"
exit 1
fi

if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi

if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: training resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi

if [ $1 == "se-resnet50" ] && [ $2 == "cifar10" ]
then
echo "error: evaluating se-resnet50 with cifar10 dataset is unsupported now!"
echo "Usage: bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo "bash run_standalone_train.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
exit 1
fi

@@ -56,17 +32,17 @@ get_real_path(){
fi
}

PATH1=$(get_real_path $3)
PATH1=$(get_real_path $1)
CONFIG_FILE=$2
if [ $# == 3 ]
then
PATH2=$(get_real_path $3)
fi

if [ $# == 4 ]
then
PATH2=$(get_real_path $4)
fi

if [ $# == 5 ]
then
RUN_EVAL=$4
EVAL_DATASET_PATH=$(get_real_path $5)
RUN_EVAL=$3
EVAL_DATASET_PATH=$(get_real_path $4)
fi

if [ ! -d $PATH1 ]

@@ -75,7 +51,7 @@ then
exit 1
fi

if [ $# == 4 ] && [ ! -f $PATH2 ]
if [ $# == 3 ] && [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
exit 1

@@ -103,26 +79,28 @@ then
rm -rf ./train
fi
mkdir ./train
cp ../*.yaml ./train
cp ../*.py ./train
cp *.sh ./train
cp -r ../src ./train
cd ./train || exit
echo "start training for device $DEVICE_ID"
env > env.log
if [ $# == 2 ]
then
python train.py --data_path=$PATH1 --config_path=$CONFIG_FILE --output_path './output' &> log &
fi

if [ $# == 3 ]
then
python train.py --net=$1 --dataset=$2 --dataset_path=$PATH1 &> log &
python train.py --data_path=$PATH1 --pre_trained=$PATH2 --config_path=$CONFIG_FILE --output_path './output' &> log &
fi

if [ $# == 4 ]
then
python train.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
fi

if [ $# == 5 ]
then
python train.py --net=$1 --dataset=$2 --dataset_path=$PATH1 --run_eval=$RUN_EVAL \
--eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID &> log &
python train.py --data_path=$PATH1 --run_eval=$RUN_EVAL --eval_data_path=$EVAL_DATASET_PATH \
--enable_cache=True --cache_session_id=$CACHE_SESSION_ID \
--config_path=$CONFIG_FILE --output_path './output' &> log &
if [ "x${RUN_EVAL}" == "xTrue" ]
then
echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\""
@ -17,32 +17,13 @@
|
|||
CURPATH="$(dirname "$0")"
|
||||
. ${CURPATH}/cache_util.sh
|
||||
|
||||
if [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
|
||||
if [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ]
|
||||
then
|
||||
echo "Usage: bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
|
||||
echo " bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
|
||||
echo "Usage: bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
|
||||
echo " bash run_standalone_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
|
||||
then
|
||||
echo "error: the selected net is neither resnet50 nor resnet101"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
|
||||
then
|
||||
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
|
||||
then
|
||||
echo "error: training resnet101 with cifar10 dataset is unsupported now!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
get_real_path(){
|
||||
if [ "${1:0:1}" == "/" ]; then
|
||||
echo "$1"
|
||||
|
@ -51,17 +32,18 @@ get_real_path(){
|
|||
fi
|
||||
}
|
||||
|
||||
PATH1=$(get_real_path $3)
|
||||
PATH1=$(get_real_path $1)
|
||||
CONFIG_FILE=$2
|
||||
|
||||
if [ $# == 3 ]
|
||||
then
|
||||
PATH2=$(get_real_path $3)
|
||||
fi
|
||||
|
||||
if [ $# == 4 ]
|
||||
then
|
||||
PATH2=$(get_real_path $4)
|
||||
fi
|
||||
|
||||
if [ $# == 5 ]
|
||||
then
|
||||
RUN_EVAL=$4
|
||||
EVAL_DATASET_PATH=$(get_real_path $5)
|
||||
RUN_EVAL=$3
|
||||
EVAL_DATASET_PATH=$(get_real_path $4)
|
||||
fi
|
||||
|
||||
if [ ! -d $PATH1 ]
|
||||
|
@ -70,7 +52,7 @@ then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
if [ $# == 4 ] && [ ! -f $PATH2 ]
|
||||
if [ $# == 3 ] && [ ! -f $PATH2 ]
|
||||
then
|
||||
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
|
||||
exit 1
|
||||
|
@ -100,26 +82,30 @@ then
rm -rf ./train
fi
mkdir ./train
cp ../*.yaml ./train
cp ../*.py ./train
cp *.sh ./train
cp -r ../src ./train
cd ./train || exit
echo "start training for device $DEVICE_ID"
env > env.log
if [ $# == 2 ]
then
python train.py --device_target="GPU" --data_path=$PATH1 \
--config_path=$CONFIG_FILE --output_path './output' &> log &
fi

if [ $# == 3 ]
then
python train.py --net=$1 --dataset=$2 --device_target="GPU" --dataset_path=$PATH1 &> log &
python train.py --device_target="GPU" --data_path=$PATH1 --pre_trained=$PATH2 \
--config_path=$CONFIG_FILE --output_path './output' &> log &
fi

if [ $# == 4 ]
then
python train.py --net=$1 --dataset=$2 --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
fi

if [ $# == 5 ]
then
python train.py --net=$1 --dataset=$2 --device_target="GPU" --dataset_path=$PATH1 --run_eval=$RUN_EVAL \
--eval_dataset_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID &> log &
python train.py --device_target="GPU" --data_path=$PATH1 --run_eval=$RUN_EVAL \
--eval_data_path=$EVAL_DATASET_PATH --enable_cache=True --cache_session_id=$CACHE_SESSION_ID \
--config_path=$CONFIG_FILE --output_path './output' &> log &
if [ "x${RUN_EVAL}" == "xTrue" ]
then
echo -e "\nWhen training run is done, remember to shut down the cache server via \"cache_admin --stop\""
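With the network and dataset now selected by the yaml file, a cache-backed evaluation run needs only the paths and the config; a hypothetical invocation (all paths and the config file name are placeholders) would be:

```python
# hypothetical launch of standalone GPU training with train-time evaluation and
# dataset cache; every path and file name below is a placeholder
# (1) Start the cache server first: "cache_admin --start" (the session id is presumably handled by the sourced cache_util.sh).
# (2) bash run_standalone_train_gpu.sh /data/imagenet/train ../config/resnet50_imagenet2012_config.yaml True /data/imagenet/val
# (3) When training is done, stop the server: "cache_admin --stop".
```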
@ -0,0 +1,78 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''

# ==============================================================================
# Training options
optimizer: 'Momentum'
infer_label: ""
class_num: 1001
batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 28
train_epoch_size: 24
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 4
keep_checkpoint_max: 10
warmup_epochs: 3
lr_decay_mode: "cosine"
use_label_smooth: True
label_smooth_factor: 0.1
lr_init: 0
lr_end: 0.0001
lr_max: 0.3

net_name: "se-resnet50"
dataset: "imagenet2012"
device_num: 1
pre_trained: ""
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"

# Export options
device_id: 0
width: 224
height: 224
file_name: "se-resnet50"
file_format: "AIR"
ckpt_file: ""
network_dataset: "se-resnet50_imagenet2012"

---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether to enable profiling while training, default: False'
class_num: 'Number of classes for the dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The directory to save checkpoint files."
checkpoint_file_path: "The location of the checkpoint file."
@ -1,155 +0,0 @@
# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
network config setting, will be used in train.py and eval.py
"""
from easydict import EasyDict as ed
# config optimizer for resnet50, imagenet2012. Momentum is default, Thor is optional.
# infer_label is a directory and label mapping table. such as 'infer_label': {"directory0": 0, "directory1": 1, ...}
cfg = ed({
    'optimizer': 'Momentum',
    'infer_label': {}
})

# config for resnet50, cifar10
config1 = ed({
    "class_num": 10,
    "batch_size": 32,
    "loss_scale": 1024,
    "momentum": 0.9,
    "weight_decay": 1e-4,
    "epoch_size": 90,
    "pretrain_epoch_size": 0,
    "save_checkpoint": True,
    "save_checkpoint_epochs": 5,
    "keep_checkpoint_max": 10,
    "save_checkpoint_path": "./",
    "warmup_epochs": 5,
    "lr_decay_mode": "poly",
    "lr_init": 0.01,
    "lr_end": 0.00001,
    "lr_max": 0.1
})

# config for resnet50, imagenet2012
config2 = ed({
    "class_num": 1001,
    "batch_size": 256,
    "loss_scale": 1024,
    "momentum": 0.9,
    "weight_decay": 1e-4,
    "epoch_size": 90,
    "pretrain_epoch_size": 0,
    "save_checkpoint": True,
    "save_checkpoint_epochs": 5,
    "keep_checkpoint_max": 10,
    "save_checkpoint_path": "./",
    "warmup_epochs": 0,
    "lr_decay_mode": "linear",
    "use_label_smooth": True,
    "label_smooth_factor": 0.1,
    "lr_init": 0,
    "lr_max": 0.8,
    "lr_end": 0.0
})

# config for resnet101, imagenet2012
config3 = ed({
    "class_num": 1001,
    "batch_size": 32,
    "loss_scale": 1024,
    "momentum": 0.9,
    "weight_decay": 1e-4,
    "epoch_size": 120,
    "pretrain_epoch_size": 0,
    "save_checkpoint": True,
    "save_checkpoint_epochs": 5,
    "keep_checkpoint_max": 10,
    "save_checkpoint_path": "./",
    "warmup_epochs": 0,
    "lr_decay_mode": "cosine",
    "use_label_smooth": True,
    "label_smooth_factor": 0.1,
    "lr": 0.1
})

# config for se-resnet50, imagenet2012
config4 = ed({
    "class_num": 1001,
    "batch_size": 32,
    "loss_scale": 1024,
    "momentum": 0.9,
    "weight_decay": 1e-4,
    "epoch_size": 28,
    "train_epoch_size": 24,
    "pretrain_epoch_size": 0,
    "save_checkpoint": True,
    "save_checkpoint_epochs": 4,
    "keep_checkpoint_max": 10,
    "save_checkpoint_path": "./",
    "warmup_epochs": 3,
    "lr_decay_mode": "cosine",
    "use_label_smooth": True,
    "label_smooth_factor": 0.1,
    "lr_init": 0.0,
    "lr_max": 0.3,
    "lr_end": 0.0001
})

# config for resnet50, imagenet2012, Ascend 910
config_thor_Ascend = ed({
    "class_num": 1001,
    "batch_size": 32,
    "loss_scale": 128,
    "momentum": 0.9,
    "weight_decay": 5e-4,
    "epoch_size": 45,
    "pretrain_epoch_size": 0,
    "save_checkpoint": True,
    "save_checkpoint_epochs": 2,
    "keep_checkpoint_max": 15,
    "save_checkpoint_path": "./",
    "use_label_smooth": True,
    "label_smooth_factor": 0.1,
    "lr_init": 0.05803,
    "lr_decay": 4.04839,
    "lr_end_epoch": 53,
    "damping_init": 0.02714,
    "damping_decay": 0.50036,
    "frequency": 834,
})

# config for resnet50, imagenet2012, GPU
config_thor_gpu = ed({
    "class_num": 1001,
    "batch_size": 32,
    "loss_scale": 128,
    "momentum": 0.9,
    "weight_decay": 5e-4,
    "epoch_size": 40,
    "pretrain_epoch_size": 0,
    "save_checkpoint": True,
    "save_checkpoint_epochs": 1,
    "keep_checkpoint_max": 15,
    "save_checkpoint_path": "./",
    "use_label_smooth": True,
    "label_smooth_factor": 0.1,
    "lr_init": 0.05672,
    "lr_decay": 4.9687,
    "lr_end_epoch": 50,
    "damping_init": 0.02345,
    "damping_decay": 0.5467,
    "frequency": 834,
})
@ -21,6 +21,8 @@ import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size
from src.model_utils.config import config
from src.model_utils.device_adapter import get_device_num, get_rank_id


def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False,
@ -407,11 +409,19 @@ def _get_rank_info():
    """
    rank_size = int(os.environ.get("RANK_SIZE", 1))

    if rank_size > 1:
        rank_size = get_group_size()
        rank_id = get_rank()
    if config.device_target == "Ascend":
        if rank_size > 1:
            rank_size = get_device_num()
            rank_id = get_rank_id()
        else:
            rank_size = 1
            rank_id = 0
    else:
        rank_size = 1
        rank_id = 0
        if rank_size > 1:
            rank_size = get_group_size()
            rank_id = get_rank()
        else:
            rank_size = 1
            rank_id = 0

    return rank_size, rank_id
@ -22,7 +22,7 @@ import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size
from src.config import cfg
from src.model_utils.config import config


class ImgDataset:
@ -39,7 +39,7 @@ class ImgDataset:
        self.data = []
        self.dir_label_dict = {}
        self.img_format = (".bmp", ".png", ".jpg", ".jpeg")
        self.dir_label = cfg.infer_label
        self.dir_label = config.infer_label
        dataset_list = sorted(os.listdir(dataset_path))
        file_exist = dir_exist = False
        for index, data_name in enumerate(dataset_list):
@ -0,0 +1,125 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Parse arguments"""

import os
import ast
import argparse
from pprint import pprint, pformat
import yaml

_config_path = "./resnet50_cifar10_config.yaml"


class Config:
    """
    Configuration namespace. Convert dictionary to members.
    """
    def __init__(self, cfg_dict):
        for k, v in cfg_dict.items():
            if isinstance(v, (list, tuple)):
                setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v])
            else:
                setattr(self, k, Config(v) if isinstance(v, dict) else v)

    def __str__(self):
        return pformat(self.__dict__)

    def __repr__(self):
        return self.__str__()


def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="resnet50_cifar10_config.yaml"):
    """
    Parse command line arguments to the configuration according to the default yaml.

    Args:
        parser: Parent parser.
        cfg: Base configuration.
        helper: Helper description.
        cfg_path: Path to the default yaml config.
    """
    parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                     parents=[parser])
    helper = {} if helper is None else helper
    choices = {} if choices is None else choices
    for item in cfg:
        if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict):
            help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path)
            choice = choices[item] if item in choices else None
            if isinstance(cfg[item], bool):
                parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice,
                                    help=help_description)
            else:
                parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice,
                                    help=help_description)
    args = parser.parse_args()
    return args


def parse_yaml(yaml_path):
    """
    Parse the yaml config file.

    Args:
        yaml_path: Path to the yaml config.
    """
    with open(yaml_path, 'r') as fin:
        try:
            cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader)
            cfgs = [x for x in cfgs]
            if len(cfgs) == 1:
                cfg_helper = {}
                cfg = cfgs[0]
            elif len(cfgs) == 2:
                cfg, cfg_helper = cfgs
            else:
                raise ValueError("At most 2 docs (config and help description) are supported in the config yaml")
            print(cfg_helper)
        except Exception:
            raise ValueError("Failed to parse yaml")
    return cfg, cfg_helper

def merge(args, cfg):
    """
    Merge the base config from yaml file and command line arguments.

    Args:
        args: Command line arguments.
        cfg: Base configuration.
    """
    args_var = vars(args)
    for item in args_var:
        cfg[item] = args_var[item]
    return cfg


def get_config():
    """
    Get Config according to the yaml file and cli arguments.
    """
    parser = argparse.ArgumentParser(description="default name", add_help=False)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, \
        "../resnet50_cifar10_config.yaml"), help="Config file path")
    path_args, _ = parser.parse_known_args()
    default, helper = parse_yaml(path_args.config_path)
    pprint(default)
    args = parse_cli_to_yaml(parser, default, helper, cfg_path=path_args.config_path)
    final_config = merge(args, default)
    return Config(final_config)

config = get_config()
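Every scalar key in the yaml becomes a `--flag`, so any value can be overridden per run without editing the file; a hypothetical override looks like:

```python
# hypothetical usage from a script that imports this module, launched as e.g.
#   python train.py --config_path=./resnet50_cifar10_config.yaml --batch_size=64 --run_eval=True
from src.model_utils.config import config

print(config.batch_size)     # 64 if overridden on the command line, else the yaml default
print(config.lr_decay_mode)  # plain attribute access on the Config namespace
```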
@ -0,0 +1,27 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Device adapter for ModelArts"""

from src.model_utils.config import config

if config.enable_modelarts:
    from src.model_utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
    from src.model_utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id

__all__ = [
    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]
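Callers never branch on the backend themselves; they import the unified names and get the ModelArts or local variant, chosen once at import time. Illustrative use (output depends on the environment):

```python
# illustrative only: one interface, two interchangeable implementations
from src.model_utils.device_adapter import get_device_id, get_device_num

print(get_device_id(), get_device_num())  # e.g. "0 1" on a single local device
```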
@ -0,0 +1,36 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Local adapter"""

import os

def get_device_id():
    device_id = os.getenv('DEVICE_ID', '0')
    return int(device_id)


def get_device_num():
    device_num = os.getenv('RANK_SIZE', '1')
    return int(device_num)


def get_rank_id():
    global_rank_id = os.getenv('RANK_ID', '0')
    return int(global_rank_id)


def get_job_id():
    return "Local Job"
@ -0,0 +1,115 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Moxing adapter for ModelArts"""

import os
import functools
from mindspore import context
from src.model_utils.config import config

_global_sync_count = 0

def get_device_id():
    device_id = os.getenv('DEVICE_ID', '0')
    return int(device_id)


def get_device_num():
    device_num = os.getenv('RANK_SIZE', '1')
    return int(device_num)


def get_rank_id():
    global_rank_id = os.getenv('RANK_ID', '0')
    return int(global_rank_id)


def get_job_id():
    job_id = os.getenv('JOB_ID')
    job_id = job_id if job_id else "default"
    return job_id

def sync_data(from_path, to_path):
    """
    Download data from remote OBS to a local directory when the first path is a remote url
    and the second is local; upload from the local directory to remote OBS when reversed.
    """
    import moxing as mox
    import time
    global _global_sync_count
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices at most.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            os.mknod(sync_lock)
        except IOError:
            pass
        print("===save flag===")

    while True:
        if os.path.exists(sync_lock):
            break
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))
def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                if not os.path.exists(config.output_path):
                    os.makedirs(config.output_path)

            if pre_process:
                pre_process()

            run_func(*args, **kwargs)

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
        return wrapped_func
    return wrapper
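A minimal sketch of the decorator in use (train.py below applies it the same way); the function name and hooks here are illustrative:

```python
# illustrative use of moxing_wrapper: inputs land in config.data_path before the
# body runs, and config.output_path is uploaded to config.train_url afterwards
from src.model_utils.moxing_adapter import moxing_wrapper

@moxing_wrapper(pre_process=None, post_process=None)
def main():
    ...  # training code reads config.data_path and writes under config.output_path

main()
```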
@ -14,8 +14,6 @@
# ============================================================================
"""train resnet."""
import os
import argparse
import ast
from mindspore import context
from mindspore import Tensor
from mindspore.nn.optim import Momentum, thor
@ -26,7 +24,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMoni
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.communication.management import init
from mindspore.common import set_seed
from mindspore.parallel import set_algo_parameters
import mindspore.nn as nn
@ -34,72 +32,35 @@ import mindspore.common.initializer as weight_init
import mindspore.log as logger
from src.lr_generator import get_lr, warmup_cosine_annealing_lr
from src.CrossEntropySmooth import CrossEntropySmooth
from src.config import cfg
from src.eval_callback import EvalCallBack
from src.metric import DistAccuracy, ClassifyCorrectCell

parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--net', type=str, default=None, help='Resnet Model, resnet18, resnet34, resnet50 or resnet101')
parser.add_argument('--dataset', type=str, default=None, help='Dataset, either cifar10 or imagenet2012')
parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, help='Run distribute')
parser.add_argument('--device_num', type=int, default=1, help='Device num.')

parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--device_target', type=str, default='Ascend', choices=("Ascend", "GPU", "CPU"),
                    help="Device target, support Ascend, GPU and CPU.")
parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
parser.add_argument('--parameter_server', type=ast.literal_eval, default=False, help='Run parameter server train')
parser.add_argument("--filter_weight", type=ast.literal_eval, default=False,
                    help="Filter head weight parameters, default is False.")
parser.add_argument("--run_eval", type=ast.literal_eval, default=False,
                    help="Run evaluation when training, default is False.")
parser.add_argument('--eval_dataset_path', type=str, default=None, help='Evaluation dataset path when run_eval is True')
parser.add_argument("--save_best_ckpt", type=ast.literal_eval, default=True,
                    help="Save best checkpoint when run_eval is True, default is True.")
parser.add_argument("--eval_start_epoch", type=int, default=40,
                    help="Evaluation start epoch when run_eval is True, default is 40.")
parser.add_argument("--eval_interval", type=int, default=1,
                    help="Evaluation interval when run_eval is True, default is 1.")
parser.add_argument('--enable_cache', type=ast.literal_eval, default=False,
                    help='Caching the eval dataset in memory to speedup evaluation, default is False.')
parser.add_argument('--cache_session_id', type=str, default="", help='The session id for cache service.')
parser.add_argument('--mode', type=str, default='GRAPH', choices=('GRAPH', 'PYNATIVE'),
                    help="Graph mode or PyNative mode, default is Graph mode")
args_opt = parser.parse_args()
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.device_adapter import get_rank_id, get_device_num

set_seed(1)

if args_opt.net in ("resnet18", "resnet34", "resnet50"):
    if args_opt.net == "resnet18":
if config.net_name in ("resnet18", "resnet34", "resnet50"):
    if config.net_name == "resnet18":
        from src.resnet import resnet18 as resnet
    if args_opt.net == "resnet34":
    if config.net_name == "resnet34":
        from src.resnet import resnet34 as resnet
    if args_opt.net == "resnet50":
    if config.net_name == "resnet50":
        from src.resnet import resnet50 as resnet
    if args_opt.dataset == "cifar10":
        from src.config import config1 as config
    if config.dataset == "cifar10":
        from src.dataset import create_dataset1 as create_dataset
    else:
        from src.config import config2 as config
        if args_opt.mode == "GRAPH":
        if config.mode_name == "GRAPH":
            from src.dataset import create_dataset2 as create_dataset
        else:
            from src.dataset import create_dataset_pynative as create_dataset
elif args_opt.net == "resnet101":
elif config.net_name == "resnet101":
    from src.resnet import resnet101 as resnet
    from src.config import config3 as config
    from src.dataset import create_dataset3 as create_dataset
else:
    from src.resnet import se_resnet50 as resnet
    from src.config import config4 as config
    from src.dataset import create_dataset4 as create_dataset

if cfg.optimizer == "Thor":
    if args_opt.device_target == "Ascend":
        from src.config import config_thor_Ascend as config
    else:
        from src.config import config_thor_gpu as config


def filter_checkpoint_parameter_by_list(origin_dict, param_filter):
    """remove useless parameters according to filter_list"""
@ -122,56 +83,46 @@ def set_graph_kernel_context(run_platform, net_name):
        context.set_context(enable_graph_kernel=True)
        context.set_context(graph_kernel_flags="--enable_parallel_fusion --enable_expand_ops=Conv2D")

if __name__ == '__main__':
    target = args_opt.device_target
def set_parameter():
    """set_parameter"""
    target = config.device_target
    if target == "CPU":
        args_opt.run_distribute = False

    ckpt_save_dir = config.save_checkpoint_path
        config.run_distribute = False

    # init context
    if args_opt.mode == 'GRAPH':
    if config.mode_name == 'GRAPH':
        context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
        set_graph_kernel_context(target, args_opt.net)
        set_graph_kernel_context(target, config.net_name)
    else:
        context.set_context(mode=context.PYNATIVE_MODE, device_target=target, save_graphs=False)
    if args_opt.parameter_server:
    if config.parameter_server:
        context.set_ps_context(enable_ps=True)
    if args_opt.run_distribute:
    if config.run_distribute:
        if target == "Ascend":
            device_id = int(os.getenv('DEVICE_ID'))
            context.set_context(device_id=device_id, enable_auto_mixed_precision=True)
            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
            context.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                              gradients_mean=True)
            set_algo_parameters(elementwise_op_strategy_follow=True)
            if args_opt.net == "resnet50" or args_opt.net == "se-resnet50":
            if config.net_name == "resnet50" or config.net_name == "se-resnet50":
                context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
            elif args_opt.net == "resnet101":
            elif config.net_name == "resnet101":
                context.set_auto_parallel_context(all_reduce_fusion_config=[80, 210, 313])
            init()
        # GPU target
        else:
            init()
            context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
            context.set_auto_parallel_context(device_num=get_device_num(),
                                              parallel_mode=ParallelMode.DATA_PARALLEL,
                                              gradients_mean=True)
            if args_opt.net == "resnet50":
            if config.net_name == "resnet50":
                context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
            ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"

    # create dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=1,
                             batch_size=config.batch_size, target=target, distribute=args_opt.run_distribute)
    step_size = dataset.get_dataset_size()

    # define net
    net = resnet(class_num=config.class_num)
    if args_opt.parameter_server:
        net.set_param_ps()

    # init weight
    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        if args_opt.filter_weight:
def init_weight(net):
    """init_weight"""
    if config.pre_trained:
        param_dict = load_checkpoint(config.pre_trained)
        if config.filter_weight:
            filter_list = [x.name for x in net.end_point.get_parameters()]
            filter_checkpoint_parameter_by_list(param_dict, filter_list)
        load_param_into_net(net, param_dict)
@ -186,20 +137,60 @@ if __name__ == '__main__':
                                                  cell.weight.shape,
                                                  cell.weight.dtype))

    # init lr
    if cfg.optimizer == "Thor":
def init_lr(step_size):
    """init lr"""
    if config.optimizer == "Thor":
        from src.lr_generator import get_thor_lr
        lr = get_thor_lr(0, config.lr_init, config.lr_decay, config.lr_end_epoch, step_size, decay_epochs=39)
    else:
        if args_opt.net in ("resnet18", "resnet34", "resnet50", "se-resnet50"):
        if config.net_name in ("resnet18", "resnet34", "resnet50", "se-resnet50"):
            lr = get_lr(lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max,
                        warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size, steps_per_epoch=step_size,
                        lr_decay_mode=config.lr_decay_mode)
        else:
            lr = warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size,
                                            config.pretrain_epoch_size * step_size)
    lr = Tensor(lr)
    return lr

def init_loss_scale():
    if config.dataset == "imagenet2012":
        if not config.use_label_smooth:
            config.label_smooth_factor = 0.0
        loss = CrossEntropySmooth(sparse=True, reduction="mean",
                                  smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
    else:
        loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    return loss

def run_eval(target, model, ckpt_save_dir, cb):
    """run_eval"""
    if config.run_eval:
        if config.eval_dataset_path is None or (not os.path.isdir(config.eval_dataset_path)):
            raise ValueError("{} is not an existing path.".format(config.eval_dataset_path))
        eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False,
                                      batch_size=config.batch_size, target=target, enable_cache=config.enable_cache,
                                      cache_session_id=config.cache_session_id)
        eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"}
        eval_cb = EvalCallBack(apply_eval, eval_param_dict, interval=config.eval_interval,
                               eval_start_epoch=config.eval_start_epoch, save_best_ckpt=config.save_best_ckpt,
                               ckpt_directory=ckpt_save_dir, besk_ckpt_name="best_acc.ckpt",
                               metrics_name="acc")
        cb += [eval_cb]

@moxing_wrapper()
def train_net():
    """train net"""
    target = config.device_target
    set_parameter()
    dataset = create_dataset(dataset_path=config.data_path, do_train=True, repeat_num=1,
                             batch_size=config.batch_size, target=target,
                             distribute=config.run_distribute)
    step_size = dataset.get_dataset_size()
    net = resnet(class_num=config.class_num)
    if config.parameter_server:
        net.set_param_ps()
    init_weight(net=net)
    lr = Tensor(init_lr(step_size=step_size))
    # define opt
    decayed_params = []
    no_decayed_params = []
@ -213,27 +204,21 @@ if __name__ == '__main__':
                    {'params': no_decayed_params},
                    {'order_params': net.trainable_params()}]
    opt = Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale)
    if args_opt.dataset == "imagenet2012":
        if not config.use_label_smooth:
            config.label_smooth_factor = 0.0
        loss = CrossEntropySmooth(sparse=True, reduction="mean",
                                  smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
    else:
        loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    loss = init_loss_scale()
    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    dist_eval_network = ClassifyCorrectCell(net) if args_opt.run_distribute else None
    dist_eval_network = ClassifyCorrectCell(net) if config.run_distribute else None
    metrics = {"acc"}
    if args_opt.run_distribute:
        metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=args_opt.device_num)}
    if (args_opt.net not in ("resnet18", "resnet34", "resnet50", "resnet101", "se-resnet50")) or \
            args_opt.parameter_server or target == "CPU":
    if config.run_distribute:
        metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=config.device_num)}
    if (config.net_name not in ("resnet18", "resnet34", "resnet50", "resnet101", "se-resnet50")) or \
            config.parameter_server or target == "CPU":
        ## fp32 training
        model = Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network)
    else:
        model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics,
                      amp_level="O2", keep_batchnorm_fp32=False, eval_network=dist_eval_network)

    if cfg.optimizer == "Thor" and args_opt.dataset == "imagenet2012":
    if config.optimizer == "Thor" and config.dataset == "imagenet2012":
        from src.lr_generator import get_thor_damping
        damping = get_thor_damping(0, config.damping_init, config.damping_decay, 70, step_size)
        split_indices = [26, 53]
@ -242,36 +227,30 @@ if __name__ == '__main__':
        model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
                                                          loss_scale_manager=loss_scale, metrics={'acc'},
                                                          amp_level="O2", keep_batchnorm_fp32=False)
        args_opt.run_eval = False
        config.run_eval = False
        logger.warning("Thor optimizer does not support evaluation while training.")

    # define callbacks
    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossMonitor()
    cb = [time_cb, loss_cb]
    ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path)
    ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank_id()) + "/"
    if config.save_checkpoint:
        config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                                     keep_checkpoint_max=config.keep_checkpoint_max)
        ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir, config=config_ck)
        cb += [ckpt_cb]
    if args_opt.run_eval:
        if args_opt.eval_dataset_path is None or (not os.path.isdir(args_opt.eval_dataset_path)):
            raise ValueError("{} is not a existing path.".format(args_opt.eval_dataset_path))
        eval_dataset = create_dataset(dataset_path=args_opt.eval_dataset_path, do_train=False,
                                      batch_size=config.batch_size, target=target, enable_cache=args_opt.enable_cache,
                                      cache_session_id=args_opt.cache_session_id)
        eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"}
        eval_cb = EvalCallBack(apply_eval, eval_param_dict, interval=args_opt.eval_interval,
                               eval_start_epoch=args_opt.eval_start_epoch, save_best_ckpt=args_opt.save_best_ckpt,
                               ckpt_directory=ckpt_save_dir, besk_ckpt_name="best_acc.ckpt",
                               metrics_name="acc")
        cb += [eval_cb]
    run_eval(target, model, ckpt_save_dir, cb)
    # train model
    if args_opt.net == "se-resnet50":
    if config.net_name == "se-resnet50":
        config.epoch_size = config.train_epoch_size
    dataset_sink_mode = (not args_opt.parameter_server) and target != "CPU"
    dataset_sink_mode = (not config.parameter_server) and target != "CPU"
    model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb,
                sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode)

    if args_opt.run_eval and args_opt.enable_cache:
    if config.run_eval and config.enable_cache:
        print("Remember to shut down the cache server via \"cache_admin --stop\"")

if __name__ == '__main__':
    train_net()
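With train.py fully config-driven, the old `--net`/`--dataset` flags disappear; a hypothetical before/after of the same launch (paths are placeholders):

```python
# hypothetical migration of an old launch command to the config-driven one
#   before: python train.py --net=resnet50 --dataset=cifar10 --dataset_path=/data/cifar10
#   after:  python train.py --config_path=./resnet50_cifar10_config.yaml --data_path=/data/cifar10
```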
@ -147,8 +147,8 @@ def test(cloud_args=None):
    # network
    config.logger.important_info('start create network')
    if os.path.isdir(config.pretrained):
        models = list(glob.glob(os.path.join(config.pretrained, '*.ckpt')))
    if os.path.isdir(config.checkpoint_file_path):
        models = list(glob.glob(os.path.join(config.checkpoint_file_path, '*.ckpt')))
    print(models)
    if config.graph_ckpt:
        f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].split('_')[0])
@ -33,8 +33,9 @@ def test_resnet50_cifar10_ascend():
    new_list = ["total_epochs=10", "10"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))
    dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin")
    exec_network_shell = "cd resnet/scripts; bash run_distribute_train.sh resnet50 cifar10 {} {}"\
        .format(utils.rank_table_path, dataset_path)
    config_path = os.path.join(cur_model_path, "resnet50_cifar10_config.yaml")
    exec_network_shell = "cd resnet/scripts; bash run_distribute_train.sh {} {} {}"\
        .format(utils.rank_table_path, dataset_path, config_path)
    os.system(exec_network_shell)
    cmd = "ps -ef | grep python | grep train.py | grep -v grep"
    ret = utils.process_check(100, cmd)
@ -63,7 +64,9 @@ def test_resnet50_cifar10_gpu():
    new_list = ["total_epochs=10", "10"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))
    dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin")
    exec_network_shell = "cd resnet/scripts; sh run_distribute_train_gpu.sh resnet50 cifar10 {}".format(dataset_path)
    config_path = os.path.join(cur_model_path, "resnet50_cifar10_config.yaml")
    exec_network_shell = "cd resnet/scripts; sh run_distribute_train_gpu.sh {} {}" \
        .format(dataset_path, config_path)
    logger.warning("cmd [{}] is running...".format(exec_network_shell))
    os.system(exec_network_shell)
    cmd = "ps -ef | grep python | grep train.py | grep -v grep"