diff --git a/model_zoo/official/cv/tinydarknet/README.md b/model_zoo/official/cv/tinydarknet/README.md
index b87f4a9181a..c34f41447c9 100644
--- a/model_zoo/official/cv/tinydarknet/README.md
+++ b/model_zoo/official/cv/tinydarknet/README.md
@@ -60,8 +60,8 @@ Dataset used can refer to [paper](
+- running on GPU with default parameters
+
+ ```python
+ # GPU standalone training example
+ python train.py \
+ --config_path=./config/imagenet_config_gpu.yaml \
+ --dataset_name=imagenet --train_data_dir=../dataset/imagenet_original/train --device_target=GPU
+ OR
+ cd scripts
+ bash run_standalone_train_gpu.sh [DEVICE_ID] [TRAIN_DATA_DIR] [cifar10 | imagenet]
+
+ # GPU distribute training example
+ export RANK_SIZE=8
+ mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
+ python train.py \
+ --config_path=./config/imagenet_config_gpu.yaml \
+ --dataset_name=imagenet \
+ --train_data_dir=../dataset/imagenet_original/train \
+ --device_target=GPU
+ OR
+ bash scripts/run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR] [cifar10 | imagenet]
+
+ # GPU evaluation example
+ python eval.py --device_target=GPU --val_data_dir=../dataset/imagenet_original/val --dataset_name=imagenet --config_path=./config/imagenet_config_gpu.yaml \
+ --checkpoint_path=$PATH2
+ OR
+ bash scripts/run_eval_gpu.sh [VAL_DATA_DIR] [cifar10|imagenet] [checkpoint_path]
+ ```
+
- Running on ModelArts If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start training as follows.
@@ -155,12 +184,20 @@ For more details, please refer the specify script.
├── README.md // descriptions about Tiny-Darknet in English
├── README_CN.md // descriptions about Tiny-Darknet in Chinese
├── ascend310_infer // application for 310 inference
+ ├── config
+ ├── imagenet_config.yaml // imagenet parameter configuration
+ ├── imagenet_config_gpu.yaml // imagenet parameter configuration for GPU
+ ├── cifar10_config.yaml // cifar10 parameter configuration
+ ├── cifar10_config_gpu.yaml // cifar10 parameter configuration for GPU
├── scripts
├── run_standalone_train.sh // shell script for single on Ascend
+ ├── run_standalone_train_gpu.sh // shell script for single on GPU
├── run_distribute_train.sh // shell script for distributed on Ascend
+ ├── run_distribute_train_gpu.sh // shell script for distributed on GPU
├── run_train_cpu.sh // shell script for distributed on CPU
├── run_eval.sh // shell script for evaluation on Ascend
├── run_eval_cpu.sh // shell script for evaluation on CPU
+ ├── run_eval_gpu.sh // shell script for evaluation on GPU
├── run_infer_310.sh // shell script for inference on Ascend310
├── src
├── lr_scheduler //learning rate scheduler
@@ -179,8 +216,6 @@ For more details, please refer the specify script.
├── train.py // training script
├── eval.py // evaluation script
├── export.py // export checkpoint file into air/onnx
- ├── imagenet_config.yaml // imagenet parameter configuration
- ├── cifar10_config.yaml // cifar10 parameter configuration
├── mindspore_hub_conf.py // hub config
├── postprocess.py // postprocess script
@@ -252,6 +287,29 @@ For more configuration details, please refer the script `imagenet_config.yaml`.
The model checkpoint file will be saved in the current folder.
+- running on GPU:
+
+ ```python
+ cd scripts
+ bash run_standalone_train_gpu.sh [DEVICE_ID] [TRAIN_DATA_DIR] [cifar10|imagenet]
+ ```
+
+ The command above will run in the background; you can view the results through the file train_single_gpu/train.log.
+
+ After training, you'll get some checkpoint files under the script folder by default. The loss value will be achieved as follows:
+
+
+ ```python
+ # grep "loss is " train.log
+ epoch: 498 step: 1251, loss is 2.7798953
+ Epoch time: 130690.544, per step time: 104.469
+ epoch: 499 step: 1251, loss is 2.9261637
+ Epoch time: 130511.081, per step time: 104.325
+ epoch: 500 step: 1251, loss is 2.69412
+ Epoch time: 127067.548, per step time: 101.573
+ ...
+ ```
+
- running on CPU
 ```python
@@ -279,6 +337,25 @@ For more configuration details, please refer the script `imagenet_config.yaml`.
 ...
 ```
+- running on GPU:
+
+ ```python
+ bash scripts/run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR] [cifar10|imagenet]
+ ```
+
+ The above shell script will run distribute training in the background. You can view the results through the file distribute_train_gpu/log.txt. The loss value will be achieved as follows:
+
+ ```python
+ # grep "result: " distribute_train_gpu/log.txt
+ epoch: 498 step: 1251, loss is 2.7825122
+ epoch time: 200066.210 ms, per step time: 159.925 ms
+ epoch: 499 step: 1251, loss is 2.799798
+ epoch time: 199098.258 ms, per step time: 159.151 ms
+ epoch: 500 step: 1251, loss is 2.8718748
+ epoch time: 197784.661 ms, per step time: 158.101 ms
+ ...
+ ```
+
## [Evaluation Process](#contents)
### [Evaluation](#contents)
@@ -307,6 +384,28 @@ For more configuration details, please refer the script `imagenet_config.yaml`.
 accuracy: {'top_1_accuracy': 0.5871979166666667, 'top_5_accuracy': 0.8175280448717949}
 ```
+- evaluation on Imagenet dataset when running on GPU:
+
+ Before running the command below, please check the checkpoint path used for evaluation. Please set the checkpoint path to be the absolute full path, e.g., "/username/tinydarknet/train_tinydarknet.ckpt".
+
+ ```python
+ bash scripts/run_eval_gpu.sh [VAL_DATA_DIR] [cifar10|imagenet] [checkpoint_path]
+ ```
+
+ The above command will run in the background. You can view the results through the file "eval.log". The accuracy of the test dataset will be as follows:
+
+ ```python
+ # grep "accuracy: " eval.log
+ accuracy: {'top_1_accuracy': 0.5896033653846153, 'top_5_accuracy': 0.8176482371794872}
+ ```
+
+ Note that for evaluation after distributed training, please set the checkpoint_path to be the last saved checkpoint file. The accuracy of the test dataset will be as follows:
+
+ ```python
+ # grep "accuracy: " eval.log
+ accuracy: {'top_1_accuracy': 0.5896033653846153, 'top_5_accuracy': 0.8176482371794872}
+ ```
+
- evaluation on cifar-10 dataset when running on CPU:
 Before running the command below, please check the checkpoint path used for evaluation. Please set the checkpoint path to be the absolute full path, e.g., "/username/tinydaeknet/train_tinydarknet.ckpt".
@@ -389,34 +488,33 @@ Inference result is saved in current path, you can find result like this in acc.
### [Training Performance](#contents)
-| Parameters | Ascend |
-| -------------------------- | ----------------------------------------------------------- |
-| Model Version | V1 |
-| Resource | Ascend 910; CPU 2.60GHz, 56cores; Memory 314G; OS Euler2.8 |
-| Uploaded Date | 2020/12/22 |
-| MindSpore Version | 1.1.0 |
-| Dataset | 1200k images |
-| Training Parameters | epoch=500, steps=1251, batch_size=128, lr=0.1 |
-| Optimizer | Momentum |
-| Loss Function | Softmax Cross Entropy |
-| Speed | 8 pc: 104 ms/step |
-| Total Time | 8 pc: 17.8 hours |
-| Parameters(M) | 4.0M |
-| Scripts | [Tiny-Darknet Scripts](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/tinydarknet) |
+| Parameters | Ascend | GPU |
+| -------------------------- | ------------------------------------------------------------| ----------------------------------------------------|
+| Model Version | V1 | V1 |
+| Resource | Ascend 910; CPU 2.60GHz, 56cores; Memory 314G; OS Euler2.8 | PCIE V100-32G |
+| Uploaded Date | 2020/12/22 | 2021/07/15 |
+| MindSpore Version | 1.1.0 | 1.3.0 |
+| Dataset | 1200k images | 1200k images |
+| Training Parameters | epoch=500, steps=1251, batch_size=128, lr=0.1 | epoch=500, steps=1251, batch_size=128, lr=0.005 |
+| Optimizer | Momentum | Momentum |
+| Loss Function | Softmax Cross Entropy | Softmax Cross Entropy |
+| Speed | 8pc: 104 ms/step | 8pc: 255 ms/step |
+| Total Time | 8pc: 17.8 hours | 8pc: 46.9 hours |
+| Parameters(M) | 4.0M | 4.0M |
+| Scripts | [Tiny-Darknet scripts](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/tinydarknet) | [Tiny-Darknet scripts](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/tinydarknet) |
### [Evaluation Performance](#contents)
-| Parameters | Ascend |
-| ------------------- | --------------------------- |
-| Model Version | V1 |
-| Resource | Ascend 910; OS Euler2.8 |
-| Uploaded Date | 2020/12/22 |
-| MindSpore Version | 1.1.0 |
-| Dataset | 200k images |
-| batch_size | 128 |
-| Outputs | probability |
-| Accuracy | 8 pc Top-1: 58.7%; Top-5: 81.7% |
-| Model for inference | 11.6M (.ckpt file) |
+| Parameters | Ascend | GPU |
+| ------------------- | ----------------------------------| ----------------------------------|
+| Model Version | V1 | V1 |
+| Resource | Ascend 910; OS Euler2.8 | PCIE V100-32G |
+| Uploaded Date | 2020/12/22 | 2021/07/15 |
+| MindSpore Version | 1.1.0 | 1.3.0 |
+| Dataset | 200k images | 200k images |
+| batch_size | 128 | 128 |
+| Outputs | probability | probability |
+| Accuracy | 8pcs Top-1: 58.7%; Top-5: 81.7% | 8pcs Top-1: 58.9%; Top-5: 81.7% |
+| Model for inference | 11.6M (.ckpt file) | 10.06M (.ckpt file) |
### [Inference Performance](#contents)
diff --git a/model_zoo/official/cv/tinydarknet/README_CN.md b/model_zoo/official/cv/tinydarknet/README_CN.md
index 3943f32c1f9..9514a3a5744 100644
--- a/model_zoo/official/cv/tinydarknet/README_CN.md
+++ b/model_zoo/official/cv/tinydarknet/README_CN.md
@@ -68,8 +68,8 @@ Tiny-DarkNet是Joseph Chet Redmon等人提出的一个16层的针对于经典的
# [环境要求](#目录)
-- 硬件(Ascend/CPU)
- - 请准备具有Ascend/CPU处理器的硬件环境.
+- 硬件(Ascend/CPU/GPU)
+ - 请准备具有Ascend/CPU/GPU处理器的硬件环境.
- 框架
- [MindSpore](https://www.mindspore.cn/install)
- 更多的信息请访问以下链接:
@@ -101,6 +101,35 @@ Tiny-DarkNet是Joseph Chet Redmon等人提出的一个16层的针对于经典的
+- running on GPU with default parameters
+
+ ```python
+ # GPU单卡训练示例
+ python train.py \
+ --config_path=./config/imagenet_config_gpu.yaml \
+ --dataset_name=imagenet --train_data_dir=../dataset/imagenet_original/train --device_target=GPU
+ OR
+ cd scripts
+ bash run_standalone_train_gpu.sh [DEVICE_ID] [TRAIN_DATA_DIR] [cifar10 | imagenet]
+
+ # GPU多卡训练示例
+ export RANK_SIZE=8
+ mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
+ python train.py \
+ --config_path=./config/imagenet_config_gpu.yaml \
+ --dataset_name=imagenet \
+ --train_data_dir=../dataset/imagenet_original/train \
+ --device_target=GPU
+ OR
+ bash scripts/run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR] [cifar10 | imagenet]
+
+ # GPU评估示例
+ python eval.py --device_target=GPU --val_data_dir=../dataset/imagenet_original/val --dataset_name=imagenet --config_path=./config/imagenet_config_gpu.yaml \
+ --checkpoint_path=$PATH2
+ OR
+ bash scripts/run_eval_gpu.sh [VAL_DATA_DIR] [cifar10|imagenet] [checkpoint_path]
+ ```
+
- 在ModelArts上运行 如果你想在modelarts上运行,可以参考以下文档 [modelarts](https://support.huaweicloud.com/modelarts/)
@@ -162,12 +191,20 @@ Tiny-DarkNet是Joseph Chet Redmon等人提出的一个16层的针对于经典的
├── README.md // Tiny-Darknet英文说明
├── README_CN.md // Tiny-Darknet中文说明
├── ascend310_infer // 用于310推理
+ ├── config
+ ├── imagenet_config.yaml // imagenet参数配置
+ ├── imagenet_config_gpu.yaml // imagenet参数配置
+ ├── cifar10_config.yaml // cifar10参数配置
+ ├── cifar10_config_gpu.yaml // cifar10参数配置
├── scripts
├── run_standalone_train.sh // Ascend单卡训练shell脚本
+ ├── run_standalone_train_gpu.sh // GPU单卡训练shell脚本
├── run_distribute_train.sh // Ascend分布式训练shell脚本
+ ├── run_distribute_train_gpu.sh // GPU分布式训练shell脚本
├── run_train_cpu.sh // CPU训练shell脚本
├── run_eval.sh // Ascend评估shell脚本
├── run_eval_cpu.sh // CPU评估shell脚本
+ ├── run_eval_gpu.sh // GPU评估shell脚本
└── run_infer_310.sh // Ascend310推理shell脚本
├── src
├── lr_scheduler // 学习率策略
@@ -186,8 +223,6 @@ Tiny-DarkNet是Joseph Chet Redmon等人提出的一个16层的针对于经典的
├── train.py // 训练脚本
├── eval.py // 评估脚本
├── export.py // 导出checkpoint文件
- ├── imagenet_config.yaml // imagenet参数配置
- ├── cifar10_config.yaml // cifar10参数配置
├── mindspore_hub_conf.py // hub配置文件
└── postprocess.py // 310推理后处理脚本
@@ -259,6 +294,29 @@ Tiny-DarkNet是Joseph Chet Redmon等人提出的一个16层的针对于经典的
模型checkpoint文件将会保存在当前文件夹下.
+- 在GPU资源上运行:
+
+ ```python
+ cd scripts
+ bash run_standalone_train_gpu.sh [DEVICE_ID] [TRAIN_DATA_DIR] [cifar10|imagenet]
+ ```
+
+ 上述的命令将运行在后台中,可以通过 `train_single_gpu/train.log` 文件查看运行结果.
+
+ 训练完成后,默认情况下,可在script文件夹下得到一些checkpoint文件. 训练的损失值将以如下的形式展示:
+
+
+ ```python
+ # grep "loss is " train.log
+ epoch: 498 step: 1251, loss is 2.7798953
+ Epoch time: 130690.544, per step time: 104.469
+ epoch: 499 step: 1251, loss is 2.9261637
+ Epoch time: 130511.081, per step time: 104.325
+ epoch: 500 step: 1251, loss is 2.69412
+ Epoch time: 127067.548, per step time: 101.573
+ ...
+ ```
+
- 在CPU资源上运行:
 ```python
@@ -273,16 +331,35 @@ Tiny-DarkNet是Joseph Chet Redmon等人提出的一个16层的针对于经典的
 bash scripts/run_distribute_train.sh [RANK_TABLE_FILE]
 ```
- 上述的脚本命令将在后台中进行分布式训练,可以通过`train_parallel[X]/log`文件查看运行结果. 训练的损失值将以如下的形式展示:
+ 上述的脚本命令将在后台中进行分布式训练,可以通过`train_parallel[X]/log`文件查看运行结果.
训练的损失值将以如下的形式展示:
 ```python
- # grep "result: " train_parallel*/log
- epoch: 498 step: 1251, loss is 2.7798953
- Epoch time: 130690.544, per step time: 104.469
- epoch: 499 step: 1251, loss is 2.9261637
- Epoch time: 130511.081, per step time: 104.325
- epoch: 500 step: 1251, loss is 2.69412
- Epoch time: 127067.548, per step time: 101.573
+ # grep "result: " train_parallel*/log
+ epoch: 498 step: 1251, loss is 2.7825122
+ epoch time: 200066.210 ms, per step time: 159.925 ms
+ epoch: 499 step: 1251, loss is 2.799798
+ epoch time: 199098.258 ms, per step time: 159.151 ms
+ epoch: 500 step: 1251, loss is 2.8718748
+ epoch time: 197784.661 ms, per step time: 158.101 ms
+ ...
+ ```
+
+- 在GPU资源上运行:
+
+ ```python
+ bash scripts/run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR] [cifar10|imagenet]
+ ```
+
+ 上述的脚本命令将在后台中进行分布式训练,可以通过`distribute_train_gpu/log.txt`文件查看运行结果. 训练的损失值将以如下的形式展示:
+
+ ```python
+ # grep "result: " distribute_train_gpu/log.txt
+ epoch: 498 step: 1251, loss is 2.7825122
+ epoch time: 200066.210 ms, per step time: 159.925 ms
+ epoch: 499 step: 1251, loss is 2.799798
+ epoch time: 199098.258 ms, per step time: 159.151 ms
+ epoch: 500 step: 1251, loss is 2.8718748
+ epoch time: 197784.661 ms, per step time: 158.101 ms
 ...
 ```
@@ -314,12 +391,34 @@ Tiny-DarkNet是Joseph Chet Redmon等人提出的一个16层的针对于经典的
 accuracy: {'top_1_accuracy': 0.5871979166666667, 'top_5_accuracy': 0.8175280448717949}
 ```
+- 在GPU资源上进行评估:
+
+ 在运行如下命令前,请确认用于评估的checkpoint文件的路径.checkpoint文件须包含在tinydarknet文件夹内.请将checkpoint路径设置为相对于 eval.py文件 的路径,例如:"./ckpts/train_tinydarknet.ckpt"(ckpts 与 eval.py 同级).
+
+ ```python
+ bash scripts/run_eval_gpu.sh [VAL_DATA_DIR] [cifar10|imagenet] [checkpoint_path]
+ ```
+
+ 上述的命令将运行在后台中,可以通过"eval.log"文件查看结果. 测试数据集的准确率将如下面所列:
+
+ ```python
+ # grep "accuracy: " eval.log
+ accuracy: {'top_1_accuracy': 0.5896033653846153, 'top_5_accuracy': 0.8176482371794872}
+ ```
+
+ 请注意在并行训练后,测试请将checkpoint_path设置为最后保存的checkpoint文件的路径,准确率将如下面所列:
+
+ ```python
+ # grep "accuracy: " eval.log
+ accuracy: {'top_1_accuracy': 0.5896033653846153, 'top_5_accuracy': 0.8176482371794872}
+ ```
+
- 在CPU资源上进行评估
 在运行如下命令前,请确认用于评估的checkpoint文件的路径.checkpoint文件须包含在tinydarknet文件夹内.请将checkpoint路径设置为相对于 eval.py文件 的路径,例如:"./ckpts/train_tinydarknet.ckpt"(ckpts 与 eval.py 同级).
 ```python
- bash scripts/run_eval.sh [VAL_DATA_DIR] [imagenet|cifar10] [CHECKPOINT_PATH]
+ bash scripts/run_eval_cpu.sh [VAL_DATA_DIR] [imagenet|cifar10] [CHECKPOINT_PATH]
 ```
 可以通过"eval.log"文件查看结果.
测试数据集的准确率将如下面所列: @@ -395,34 +494,36 @@ bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [LABEL_PATH] [DVPP] [DEVICE_ID] ### [训练性能](#目录) -| 参数 | Ascend | -| -------------------------- | ----------------------------------------------------------- | -| 模型版本 | V1 | -| 资源 | Ascend 910;CPU 2.60GHz,56cores;内存 314G;系统 Euler2.8 | -| 上传日期 | 2020/12/22 | -| MindSpore版本 | 1.1.0 | -| 数据集 | 1200k张图片 | -| 训练参数 | epoch=500, steps=1251, batch_size=128, lr=0.1 | -| 优化器 | Momentum | -| 损失函数 | Softmax Cross Entropy | -| 速度 | 8卡: 104 ms/step | -| 总时间 | 8卡: 17.8小时 | -| 参数(M) | 4.0 | -| 脚本 | [Tiny-Darknet脚本](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/tinydarknet) | +#### Tinydarknet on ImageNet 2012 + +| 参数 | Ascend | GPU | +| -------------------------- | ------------------------------------------------------------| ----------------------------------------------------| +| 模型版本 | V1 | V1 | +| 资源 | Ascend 910;CPU 2.60GHz,56cores;内存 314G;系统 Euler2.8 | PCIE V100-32G | +| 上传日期 | 2020/12/22 | 2021/07/15 | +| MindSpore版本 | 1.1.0 | 1.3.0 | +| 数据集 | 1200k张图片 | 1200k张图片 | +| 训练参数 | epoch=500, steps=1251, batch_size=128, lr=0.1 | epoch=500, steps=1251, batch_size = 128, lr=0.005 | +| 优化器 | Momentum | Momentum | +| 损失函数 | Softmax Cross Entropy | Softmax Cross Entropy | +| 速度 | 8卡: 104 ms/step | 8卡: 255 ms/step | +| 总时间 | 8卡: 17.8小时 | 8卡: 46.9小时 | +| 参数(M) | 4.0; | 4.0; | +| 脚本 | [Tiny-Darknet脚本](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/tinydarknet) ### [评估性能](#目录) -| 参数 | Ascend | -| ------------------- | --------------------------- | -| 模型版本 | V1 | -| 资源 | Ascend 910;系统 Euler2.8 | -| 上传日期 | 2020/12/22 | -| MindSpore版本 | 1.1.0 | -| 数据集 | 200k张图片 | -| batch_size | 128 | -| 输出 | 分类概率 | -| 准确率 | 8卡 Top-1: 58.7%; Top-5: 81.7% | -| 推理模型 | 11.6M (.ckpt文件) | +| 参数 | Ascend | GPU | +| ------------------- | ----------------------------------| ----------------------------------| +| 模型版本 | V1 | V1 | +| 资源 | Ascend 910;系统 Euler2.8 | NV SMX2 V100-32G | +| 上传日期 | 2020/12/22 | 2021/7/15 | +| MindSpore版本 | 1.1.0 | 1.3.0 | +| 数据集 | 200k张图片 | 200k张图片 | +| batch_size | 128 | 128 | +| 输出 | 分类概率 | 分类概率 | +| 准确率 | 8卡 Top-1: 58.7%; Top-5: 81.7% | 8卡 Top-1: 58.9%; Top-5: 81.7% | +| 推理模型 | 11.6M (.ckpt文件) | 10.06M (.ckpt文件) | ### [推理性能](#目录) diff --git a/model_zoo/official/cv/tinydarknet/cifar10_config.yaml b/model_zoo/official/cv/tinydarknet/config/cifar10_config.yaml similarity index 100% rename from model_zoo/official/cv/tinydarknet/cifar10_config.yaml rename to model_zoo/official/cv/tinydarknet/config/cifar10_config.yaml diff --git a/model_zoo/official/cv/tinydarknet/config/cifar10_config_gpu.yaml b/model_zoo/official/cv/tinydarknet/config/cifar10_config_gpu.yaml new file mode 100644 index 00000000000..30c87b9763a --- /dev/null +++ b/model_zoo/official/cv/tinydarknet/config/cifar10_config_gpu.yaml @@ -0,0 +1,57 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path" +device_target: "GPU" +enable_profiling: False + +modelarts_dataset_unzip_name: '' +# ============================================================================== +#train-eval-export related +dataset_name : cifar10 +ckpt_save_dir: checkpoints +pre_trained: False +device_id: 0 +num_classes: 10 +lr_init: 0.1 +batch_size: 32 +epoch_size: 120 +momentum: 0.9 +weight_decay: 
0.0001 +image_height: 227 +image_width: 227 +train_data_dir: './data/cifar10_train/' +val_data_dir: './data/cifar10_val/' +keep_checkpoint_max: 1 +checkpoint_path: './scripts/train_parallel4/ckpt_4/train_tinydarknet_imagenet-300_1251.ckpt' +onnx_filename: 'tinydarknet.onnx' +air_filename: 'tinydarknet.air' +# optimizer and lr related +lr_scheduler: 'exponential' +lr_epochs: [70, 140, 210, 280] +lr_gamma: 0.1 +eta_min: 0.0 +T_max: 150 +warmup_epochs: 0 +# loss related +is_dynamic_loss_scale: False +loss_scale: 1024 +label_smooth_factor: 0.1 +use_label_smooth: True + +--- + +# Help description for each configuration +enable_modelarts: "Whether training on modelarts, default: False" +data_url: "Url for modelarts" +train_url: "Url for modelarts" +data_path: "The location of the input data." +output_path: "The location of the output file." +device_target: "Running platform, choose from Ascend, GPU or CPU, and default is Ascend." +enable_profiling: 'Whether enable profiling while training, default: False' diff --git a/model_zoo/official/cv/tinydarknet/imagenet_config.yaml b/model_zoo/official/cv/tinydarknet/config/imagenet_config.yaml similarity index 100% rename from model_zoo/official/cv/tinydarknet/imagenet_config.yaml rename to model_zoo/official/cv/tinydarknet/config/imagenet_config.yaml diff --git a/model_zoo/official/cv/tinydarknet/config/imagenet_config_gpu.yaml b/model_zoo/official/cv/tinydarknet/config/imagenet_config_gpu.yaml new file mode 100644 index 00000000000..0444227409e --- /dev/null +++ b/model_zoo/official/cv/tinydarknet/config/imagenet_config_gpu.yaml @@ -0,0 +1,61 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path" +device_target: "GPU" +enable_profiling: False + +modelarts_dataset_unzip_name: '' +# ============================================================================== +#train-eval-export related +dataset_name: imagenet +ckpt_save_dir: checkpoints +pre_trained: False +device_id: 0 +num_classes: 1000 +lr_init: 0.1 +batch_size: 128 +epoch_size: 500 +momentum: 0.9 +weight_decay: 0.0001 +image_height: 224 +image_width: 224 +train_data_dir: './dataset/imagenet_original/train/' +val_data_dir: './dataset/imagenet_original/val/' +keep_checkpoint_max: 1 +checkpoint_path: './scripts/train_parallel4/ckpt_4/train_tinydarknet_imagenet-300_1251.ckpt' +file_name: 'tinydarknet' +file_format: 'MINDIR' +# optimizer and lr related +lr_scheduler: 'exponential' +lr_epochs: [70, 140, 210, 280] +lr_gamma: 0.3 +eta_min: 0.0 +T_max: 150 +warmup_epochs: 0 +# loss related +is_dynamic_loss_scale: False +loss_scale: 1024 +label_smooth_factor: 0.1 +use_label_smooth: True +#310infer postprocess +result_path: '' +label_file: '' + +--- + +# Help description for each configuration +enable_modelarts: "Whether training on modelarts, default: False" +data_url: "Url for modelarts" +train_url: "Url for modelarts" +data_path: "The location of the input data." +output_path: "The location of the output file." +device_target: "Running platform, choose from Ascend, GPU or CPU, and default is Ascend." 
+enable_profiling: 'Whether enable profiling while training, default: False' +file_format: '["MINDIR", "AIR"]' diff --git a/model_zoo/official/cv/tinydarknet/scripts/run_distribute_train.sh b/model_zoo/official/cv/tinydarknet/scripts/run_distribute_train.sh index f0d775bee35..28ffe26789d 100644 --- a/model_zoo/official/cv/tinydarknet/scripts/run_distribute_train.sh +++ b/model_zoo/official/cv/tinydarknet/scripts/run_distribute_train.sh @@ -57,7 +57,7 @@ do mkdir ./train_parallel$i cp -r ../src ./train_parallel$i cp ../train.py ./train_parallel$i - cp ../*.yaml ./train_parallel$i + cp -r ../config ./train_parallel$i echo "start training for rank $RANK_ID, device $DEVICE_ID, $dataset_type" cd ./train_parallel$i || exit env > env.log diff --git a/model_zoo/official/cv/tinydarknet/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/tinydarknet/scripts/run_distribute_train_gpu.sh new file mode 100644 index 00000000000..bb31d5bbd80 --- /dev/null +++ b/model_zoo/official/cv/tinydarknet/scripts/run_distribute_train_gpu.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 3 ]; then + echo "Usage: sh run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR] [cifar10|imagenet]" + exit 1 +fi + +get_real_path() { + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +dataset_type='imagenet' +if [ $# == 3 ] +then + if [ $3 != "cifar10" ] && [ $3 != "imagenet" ] + then + echo "error: the selected dataset is neither cifar10 nor imagenet" + exit 1 + fi + dataset_type=$3 +fi + +export RANK_SIZE=$1 +PROJECT_DIR=$(cd ./"`dirname $0`" || exit; pwd) +TRAIN_DATA_DIR=$(get_real_path $2) + +if [ ! -d $TRAIN_DATA_DIR ]; then + echo "error: TRAIN_DATA_DIR=$TRAIN_DATA_DIR is not a directory" + exit 1 +fi + +if [ -d "distribute_train_gpu" ]; then + rm -rf ./distribute_train_gpu +fi + +mkdir ./distribute_train_gpu +cp ./*.py ./distribute_train_gpu +cp -r ./config ./distribute_train_gpu +cp -r ./src ./distribute_train_gpu +cd ./distribute_train_gpu || exit + +if [ $3 == 'imagenet' ]; then + CONFIG_FILE="$PROJECT_DIR/../config/imagenet_config_gpu.yaml" +elif [ $3 == 'cifar10' ]; then + CONFIG_FILE="$PROJECT_DIR/../config/cifar10_config_gpu.yaml" +else + echo "error: the selected dataset is neither cifar10 nor imagenet" +exit 1 +fi + +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ +nohup python train.py \ + --config_path=$CONFIG_FILE \ + --dataset_name=$dataset_type \ + --train_data_dir=$TRAIN_DATA_DIR \ + --device_target=GPU > log.txt 2>&1 & +cd .. 
\ No newline at end of file
diff --git a/model_zoo/official/cv/tinydarknet/scripts/run_eval.sh b/model_zoo/official/cv/tinydarknet/scripts/run_eval.sh
index 4033e7b5f47..1c5b4fde06c 100644
--- a/model_zoo/official/cv/tinydarknet/scripts/run_eval.sh
+++ b/model_zoo/official/cv/tinydarknet/scripts/run_eval.sh
@@ -22,7 +22,7 @@ rm -rf ./eval
 mkdir ./eval
 cp -r ../src ./eval
 cp ../eval.py ./eval
-cp ../*.yaml ./eval
+cp -r ../config ./eval
 cd ./eval || exit
 env >env.log
 python ./eval.py > ./eval.log 2>&1 &
diff --git a/model_zoo/official/cv/tinydarknet/scripts/run_eval_cpu.sh b/model_zoo/official/cv/tinydarknet/scripts/run_eval_cpu.sh
index d79bdd31d2e..6a13b776e79 100644
--- a/model_zoo/official/cv/tinydarknet/scripts/run_eval_cpu.sh
+++ b/model_zoo/official/cv/tinydarknet/scripts/run_eval_cpu.sh
@@ -43,9 +43,9 @@ fi
 BASE_PATH=$(dirname "$(dirname "$(readlink -f $0)")")
 if [ $2 == 'imagenet' ]; then
- CONFIG_FILE="${BASE_PATH}/imagenet_config.yaml"
+ CONFIG_FILE="${BASE_PATH}/config/imagenet_config.yaml"
 elif [ $2 == 'cifar10' ]; then
- CONFIG_FILE="${BASE_PATH}/cifar10_config.yaml"
+ CONFIG_FILE="${BASE_PATH}/config/cifar10_config.yaml"
 else
 echo "error: the selected dataset is neither cifar10 nor imagenet"
 exit 1
@@ -55,7 +55,7 @@ rm -rf ./eval
 mkdir ./eval
 cp -r ./src ./eval
 cp ./eval.py ./eval
-cp ./*.yaml ./eval
+cp -r ./config ./eval
 env >env.log
 echo "start evaluation for device CPU"
 cd ./eval || exit
diff --git a/model_zoo/official/cv/tinydarknet/scripts/run_eval_gpu.sh b/model_zoo/official/cv/tinydarknet/scripts/run_eval_gpu.sh
new file mode 100644
index 00000000000..1075aa1e31e
--- /dev/null
+++ b/model_zoo/official/cv/tinydarknet/scripts/run_eval_gpu.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+if [ $# != 3 ]
+then
+ echo "Usage: bash scripts/run_eval_gpu.sh [VAL_DATA_DIR] [cifar10|imagenet] [checkpoint_path]"
+exit 1
+fi
+
+get_real_path(){
+ if [ "${1:0:1}" == "/" ]; then
+ echo "$1"
+ else
+ echo "$(realpath -m $PWD/$1)"
+ fi
+}
+
+PATH1=$(get_real_path $1)
+if [ ! -d $PATH1 ]
+then
+ echo "error: VAL_DATA_DIR=$PATH1 is not a directory"
+exit 1
+fi
+
+PATH2=$(get_real_path $3)
+if [ !
-f $PATH2 ]
+then
+ echo "error: CHECKPOINT_PATH=$PATH2 is not a file"
+exit 1
+fi
+
+BASE_PATH=$(dirname "$(dirname "$(readlink -f $0)")")
+if [ $2 == 'imagenet' ]; then
+ CONFIG_FILE="${BASE_PATH}/config/imagenet_config_gpu.yaml"
+elif [ $2 == 'cifar10' ]; then
+ CONFIG_FILE="${BASE_PATH}/config/cifar10_config_gpu.yaml"
+else
+ echo "error: the selected dataset is neither cifar10 nor imagenet"
+exit 1
+fi
+
+rm -rf ./eval
+mkdir ./eval
+cp -r ./src ./eval
+cp ./eval.py ./eval
+cp -r ./config ./eval
+env >env.log
+echo "start evaluation for device GPU"
+cd ./eval || exit
+python ./eval.py --device_target=GPU --val_data_dir=$PATH1 --dataset_name=$2 --config_path=$CONFIG_FILE \
+--checkpoint_path=$PATH2 > ./eval.log 2>&1 &
+cd ..
diff --git a/model_zoo/official/cv/tinydarknet/scripts/run_standalone_train.sh b/model_zoo/official/cv/tinydarknet/scripts/run_standalone_train.sh
index 784f04528f3..91b2e04c47b 100644
--- a/model_zoo/official/cv/tinydarknet/scripts/run_standalone_train.sh
+++ b/model_zoo/official/cv/tinydarknet/scripts/run_standalone_train.sh
@@ -55,7 +55,7 @@ rm -rf ./train_single
 mkdir ./train_single
 cp -r ../src ./train_single
 cp ../train.py ./train_single
-cp ../*.yaml ./train_single
+cp -r ../config ./train_single
 echo "start training for rank $RANK_ID, device $DEVICE_ID, $dataset_type"
 cd ./train_single || exit
 python ./train.py --dataset_name=$dataset_type --train_data_dir=$train_data_dir> ./train.log 2>&1 &
diff --git a/model_zoo/official/cv/tinydarknet/scripts/run_standalone_train_gpu.sh b/model_zoo/official/cv/tinydarknet/scripts/run_standalone_train_gpu.sh
new file mode 100644
index 00000000000..513ce25ab6a
--- /dev/null
+++ b/model_zoo/official/cv/tinydarknet/scripts/run_standalone_train_gpu.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+echo "$1 $2 $3"
+
+if [ $# != 2 ] && [ $# != 3 ]
+then
+ echo "Usage: bash run_standalone_train_gpu.sh [DEVICE_ID] [TRAIN_DATA_DIR] [cifar10|imagenet]"
+exit 1
+fi
+
+expr $1 + 6 &>/dev/null
+if [ $? != 0 ]
+then
+ echo "error:DEVICE_ID=$1 is not an integer"
+exit 1
+fi
+
+if [ !
-d $2 ]
+then
+ echo "error:TRAIN_DATA_DIR=$2 is not a folder"
+exit 1
+fi
+train_data_dir=$2
+PROJECT_DIR=$(cd ./"`dirname $0`" || exit; pwd)
+CONFIG_FILE="$PROJECT_DIR/../config/imagenet_config_gpu.yaml"
+dataset_type='imagenet'
+if [ $# == 3 ]
+then
+ if [ $3 != "cifar10" ] && [ $3 != "imagenet" ]
+ then
+ echo "error: the selected dataset is neither cifar10 nor imagenet"
+ exit 1
+ fi
+ dataset_type=$3
+fi
+
+if [ $dataset_type == 'imagenet' ]; then
+ CONFIG_FILE="$PROJECT_DIR/../config/imagenet_config_gpu.yaml"
+elif [ $dataset_type == 'cifar10' ]; then
+ CONFIG_FILE="$PROJECT_DIR/../config/cifar10_config_gpu.yaml"
+else
+ echo "error: the selected dataset is neither cifar10 nor imagenet"
+exit 1
+fi
+
+export DEVICE_ID=$1
+export RANK_ID=0
+export DEVICE_NUM=1
+export RANK_SIZE=1
+rm -rf ./train_single_gpu
+mkdir ./train_single_gpu
+cp -r ../src ./train_single_gpu
+cp ../train.py ./train_single_gpu
+cp -r ../config ./train_single_gpu
+echo "start training for rank $RANK_ID, device $DEVICE_ID, $dataset_type"
+cd ./train_single_gpu || exit
+python ./train.py --config_path=$CONFIG_FILE \
+--dataset_name=$dataset_type --train_data_dir=$train_data_dir --device_target=GPU> ./train.log 2>&1 &
+
diff --git a/model_zoo/official/cv/tinydarknet/scripts/run_train_cpu.sh b/model_zoo/official/cv/tinydarknet/scripts/run_train_cpu.sh
index 7a5276c46e6..601bca325cd 100644
--- a/model_zoo/official/cv/tinydarknet/scripts/run_train_cpu.sh
+++ b/model_zoo/official/cv/tinydarknet/scripts/run_train_cpu.sh
@@ -49,7 +49,7 @@ rm -rf ./train_cpu
 mkdir ./train_cpu
 cp ./train.py ./train_cpu
 cp -r ./src ./train_cpu
-cp ./*.yaml ./train_cpu
+cp -r ./config ./train_cpu
 echo "start training for device CPU"
 cd ./train_cpu || exit
 env > env.log
diff --git a/model_zoo/official/cv/tinydarknet/src/dataset.py b/model_zoo/official/cv/tinydarknet/src/dataset.py
index 8e255117753..2c62eeec868 100644
--- a/model_zoo/official/cv/tinydarknet/src/dataset.py
+++ b/model_zoo/official/cv/tinydarknet/src/dataset.py
@@ -40,14 +40,10 @@ def create_dataset_cifar(dataset_path,
 Returns:
 dataset
 """
- if target == "Ascend":
- device_num, rank_id = _get_rank_info()
- elif target == "CPU":
+ if target == "CPU":
 device_num = 1
 else:
- init()
- rank_id = get_rank()
- device_num = get_group_size()
+ device_num, rank_id = _get_rank_info()
 if device_num == 1:
 data_set = ds.Cifar10Dataset(dataset_path,
@@ -165,7 +161,8 @@ def _get_rank_info():
 rank_size = int(os.environ.get("RANK_SIZE", 1))
 if rank_size > 1:
- from mindspore.communication.management import get_rank, get_group_size
+ from mindspore.communication.management import init, get_rank, get_group_size
+ init()
 rank_size = get_group_size()
 rank_id = get_rank()
 else:
diff --git a/model_zoo/official/cv/tinydarknet/src/model_utils/config.py b/model_zoo/official/cv/tinydarknet/src/model_utils/config.py
index b23658303c5..c6eb2958de6 100644
--- a/model_zoo/official/cv/tinydarknet/src/model_utils/config.py
+++ b/model_zoo/official/cv/tinydarknet/src/model_utils/config.py
@@ -117,7 +117,7 @@ def get_config():
 """
 parser = argparse.ArgumentParser(description="default name", add_help=False)
 current_dir = os.path.dirname(os.path.abspath(__file__))
- parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../../{}".format(_config)),
+ parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../../config/{}".format(_config)),
 help="Config file path")
 path_args, _ = parser.parse_known_args()
 default, helper, choices = parse_yaml(path_args.config_path)
diff --git
a/model_zoo/official/cv/tinydarknet/train.py b/model_zoo/official/cv/tinydarknet/train.py index 4397c210eaa..23238716a54 100644 --- a/model_zoo/official/cv/tinydarknet/train.py +++ b/model_zoo/official/cv/tinydarknet/train.py @@ -21,7 +21,7 @@ import time from mindspore import Tensor from mindspore import context -from mindspore.communication.management import init +from mindspore.communication.management import init, get_rank from mindspore.nn.optim.momentum import Momentum from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager @@ -36,7 +36,7 @@ from src.tinydarknet import TinyDarkNet from src.CrossEntropySmooth import CrossEntropySmooth from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper -from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id +from src.model_utils.device_adapter import get_device_id, get_device_num set_seed(1) @@ -132,11 +132,11 @@ def run_train(): else: context.set_context(device_id=get_device_id()) if device_num > 1: + init() context.reset_auto_parallel_context() context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) - init() - rank = get_rank_id() + rank = get_rank() if config.dataset_name == "imagenet": dataset = create_dataset_imagenet(config.train_data_dir, 1) @@ -204,10 +204,12 @@ def run_train(): if device_target == "CPU": model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}, loss_scale_manager=loss_scale_manager) - else: + elif device_target == "Ascend": model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}, amp_level="O3", loss_scale_manager=loss_scale_manager) - + elif device_target == "GPU": + model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}, + amp_level="O2", loss_scale_manager=loss_scale_manager) config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 50, keep_checkpoint_max=config.keep_checkpoint_max) time_cb = TimeMonitor(data_size=batch_num) ckpt_save_dir = os.path.join(config.ckpt_save_dir, str(rank))
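
As a quick reference for the last hunk, here is a minimal sketch of the backend-dependent mixed-precision setup that train.py now performs. The `build_model` helper is illustrative only and not part of the patch; it assumes the MindSpore 1.x `Model` API already used in this file.

```python
# Illustrative sketch, not part of the patch: the amp_level dispatch introduced in
# train.py, factored into a helper. Assumes MindSpore 1.x, where Model accepts
# amp_level and a loss_scale_manager keyword, as used in the hunk above.
from mindspore.train.model import Model


def build_model(net, loss, opt, device_target, loss_scale_manager):
    """Build a Model with the mixed-precision level used for each backend."""
    if device_target == "CPU":
        # CPU training runs without mixed precision.
        return Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
                     loss_scale_manager=loss_scale_manager)
    # Ascend keeps the aggressive "O3" level; GPU uses the milder "O2" level.
    amp_level = "O3" if device_target == "Ascend" else "O2"
    return Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
                 amp_level=amp_level, loss_scale_manager=loss_scale_manager)
```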