!16736 modify yolov3_resnet18 for cloud

From: @zhanghuiyao
Reviewed-by: @c_34,@wuxuejian
Signed-off-by: @c_34
mindspore-ci-bot 2021-05-25 10:08:10 +08:00 committed by Gitee
commit 7f3b8e1adb
13 changed files with 804 additions and 104 deletions

View File: yolov3_resnet18/README.md

@@ -79,7 +79,7 @@ Dataset used: [COCO2017](<http://images.cocodataset.org/>)
After installing MindSpore via the official website, you can start training and evaluation on Ascend as follows:
- running on Ascend
- Running on Ascend
```shell script
# run standalone training example
@@ -92,16 +92,138 @@ After installing MindSpore via the official website, you can start training and
sh run_eval.sh [DEVICE_ID] [CKPT_PATH] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH]
```
- Running on [ModelArts](https://support.huaweicloud.com/modelarts/)
```bash
# Train 8p with Ascend
# (1) Perform a or b.
# a. Set "enable_modelarts=True" on default_config.yaml file.
# Set "distribute=True" on default_config.yaml file.
# Set "need_modelarts_dataset_unzip=True" on default_config.yaml file.
# Set "modelarts_dataset_unzip_name='coco'" on default_config.yaml file.
# Set "lr=0.005" on default_config.yaml file.
# Set "mindrecord_dir='/cache/data/coco/Mindrecord_train'" on default_config.yaml file.
# Set "image_dir='/cache/data'" on default_config.yaml file.
# Set "anno_path='/cache/data/coco/train_Person+Face-coco-20190118.txt'" on default_config.yaml file.
# Set "epoch_size=160" on default_config.yaml file.
# (optional) Set "pre_trained_epoch_size=YOUR_SIZE" on default_config.yaml file.
# (optional) Set "checkpoint_url='s3://dir_to_your_pretrained/'" on default_config.yaml file.
# (optional) Set "pre_trained=/cache/checkpoint_path/model.ckpt" on default_config.yaml file.
# Set other parameters you need on default_config.yaml file.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "need_modelarts_dataset_unzip=True" on the website UI interface.
# Add "modelarts_dataset_unzip_name='coco'" on the website UI interface.
# Add "distribute=True" on the website UI interface.
# Add "lr=0.005" on the website UI interface.
# Add "mindrecord_dir=/cache/data/coco/Mindrecord_train" on the website UI interface.
# Add "image_dir=/cache/data" on the website UI interface.
# Add "anno_path=/cache/data/coco/train_Person+Face-coco-20190118.txt" on the website UI interface.
# Add "epoch_size=160" on the website UI interface.
# (optional) Add "pre_trained_epoch_size=YOUR_SIZE" on the website UI interface.
# (optional) Add "checkpoint_url='s3://dir_to_your_pretrained/'" on the website UI interface.
# (optional) Add "pre_trained=/cache/checkpoint_path/model.ckpt" on the website UI interface.
# Add other parameters on the website UI interface.
# (2) Prepare model code
# (3) Upload or copy your pretrained model to S3 bucket if you want to fine-tune.
# (4) Perform a or b. (suggested option a)
# a. First, run "train.py" like the following to create MindRecord dataset locally from coco2017.
# "python train.py --only_create_dataset=True --mindrecord_dir=$MINDRECORD_DIR --image_dir=$IMAGE_DIR --anno_path=$ANNO_PATH"
# Second, zip MindRecord dataset to one zip file.
# Finally, upload your zip dataset to S3 bucket. (You could also upload the unzipped MindRecord dataset, but it can be very slow.)
# b. Upload the original coco dataset to S3 bucket.
# (Dataset conversion occurs during the training process and costs a lot of time; it happens every time you train.)
# (5) Set the code directory to "/path/yolov3_resnet18" on the website UI interface.
# (6) Set the startup file to "train.py" on the website UI interface.
# (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (8) Create your job.
#
# Train 1p with Ascend
# (1) Perform a or b.
# a. Set "enable_modelarts=True" on default_config.yaml file.
# Set "need_modelarts_dataset_unzip=True" on default_config.yaml file.
# Set "modelarts_dataset_unzip_name='coco'" on default_config.yaml file.
# Set "mindrecord_dir='/cache/data/coco/Mindrecord_train'" on default_config.yaml file.
# Set "image_dir='/cache/data'" on default_config.yaml file.
# Set "anno_path='/cache/data/coco/train_Person+Face-coco-20190118.txt'" on default_config.yaml file.
# Set "epoch_size=160" on default_config.yaml file.
# (optional) Set "pre_trained_epoch_size=YOUR_SIZE" on default_config.yaml file.
# (optional) Set "checkpoint_url='s3://dir_to_your_pretrained/'" on default_config.yaml file.
# (optional) Set "pre_trained=/cache/checkpoint_path/model.ckpt" on default_config.yaml file.
# Set other parameters you need on default_config.yaml file.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "need_modelarts_dataset_unzip=True" on the website UI interface.
# Add "modelarts_dataset_unzip_name='coco'" on the website UI interface.
# Add "mindrecord_dir='/cache/data/coco/Mindrecord_train'" on the website UI interface.
# Add "image_dir='/cache/data'" on the website UI interface.
# Add "anno_path='/cache/data/coco/train_Person+Face-coco-20190118.txt'" on the website UI interface.
# Add "epoch_size=160" on the website UI interface.
# (optional) Add "pre_trained_epoch_size=YOUR_SIZE" on the website UI interface.
# (optional) Add "checkpoint_url='s3://dir_to_your_pretrained/'" on the website UI interface.
# (optional) Add "pre_trained=/cache/checkpoint_path/model.ckpt" on the website UI interface.
# Add other parameters on the website UI interface.
# (2) Prepare model code
# (3) Upload or copy your pretrained model to S3 bucket if you want to fine-tune.
# (4) Perform a or b. (suggested option a)
# a. First, run "train.py" like the following to create MindRecord dataset locally from coco2017.
# "python train.py --only_create_dataset=True --mindrecord_dir=$MINDRECORD_DIR --image_dir=$IMAGE_DIR --anno_path=$ANNO_PATH"
# Second, zip MindRecord dataset to one zip file.
# Finally, upload your zip dataset to S3 bucket. (You could also upload the unzipped MindRecord dataset, but it can be very slow.)
# b. Upload the original coco dataset to S3 bucket.
# (Dataset conversion occurs during the training process and costs a lot of time; it happens every time you train.)
# (5) Set the code directory to "/path/yolov3_resnet18" on the website UI interface.
# (6) Set the startup file to "train.py" on the website UI interface.
# (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (8) Create your job.
#
# Eval 1p with Ascend
# (1) Perform a or b.
# a. Set "enable_modelarts=True" on default_config.yaml file.
# Set "need_modelarts_dataset_unzip=True" on default_config.yaml file.
# Set "modelarts_dataset_unzip_name='coco'" on default_config.yaml file.
# Set "checkpoint_url='s3://dir_to_your_trained_model/'" on base_config.yaml file.
# Set "ckpt_path='/cache/checkpoint_path/yolov3-160_156.ckpt'" on default_config.yaml file.
# Set "eval_mindrecord_dir='/cache/data/coco/Mindrecord_eval'" on default_config.yaml file.
# Set "image_dir='/cache/data'" on default_config.yaml file.
# Set "anno_path='/cache/data/coco/test_Person+Face-coco-20190118.txt'" on default_config.yaml file.
# Set other parameters you need on default_config.yaml file.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "need_modelarts_dataset_unzip=True" on the website UI interface.
# Add "modelarts_dataset_unzip_name='coco'" on the website UI interface.
# Add "checkpoint_url='s3://dir_to_your_trained_model/'" on the website UI interface.
# Add "ckpt_path='/cache/checkpoint_path/yolov3-160_156.ckpt'" on the website UI interface.
# Add "eval_mindrecord_dir='/cache/data/coco/Mindrecord_eval'" on the website UI interface.
# Add "image_dir='/cache/data'" on the website UI interface.
# Add "anno_path='/cache/data/coco/test_Person+Face-coco-20190118.txt'" on the website UI interface.
# Add other parameters on the website UI interface.
# (2) Prepare model code
# (3) Upload or copy your trained model to S3 bucket.
# (4) Perform a or b. (suggested option a)
# a. First, run "eval.py" like the following to create MindRecord dataset locally from coco2017.
# "python eval.py --only_create_dataset=True --eval_mindrecord_dir=$EVAL_MINDRECORD_DIR --image_dir=$EVAL_IMAGE_DIR --anno_path=$EVAL_ANNO_PATH"
# Second, zip MindRecord dataset to one zip file.
# Finally, upload your zip dataset to S3 bucket. (You could also upload the unzipped MindRecord dataset, but it can be very slow.)
# b. Upload the original coco dataset to S3 bucket.
# (Dataset conversion occurs during the training process and costs a lot of time; it happens every time you train.)
# (5) Set the code directory to "/path/yolov3_resnet18" on the website UI interface.
# (6) Set the startup file to "eval.py" on the website UI interface.
# (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (8) Create your job.
```
# [Script Description](#contents)
## [Script and Sample Code](#contents)
```python
```text
└── cv
├── README.md // descriptions about all the models
├── mindspore_hub_conf.md // config for mindspore hub
└── yolov3_resnet18
├── README.md // descriptions about yolov3_resnet18
├── README_CN.md // descriptions about yolov3_resnet18 in Chinese
├── model_utils
├── __init__.py // init file
├── config.py // Parse arguments
├── device_adapter.py // Device adapter for ModelArts
├── local_adapter.py // Local adapter
└── moxing_adapter.py // Moxing adapter for ModelArts
├── scripts
├── run_distribute_train.sh // shell script for distributed on Ascend
├── run_standalone_train.sh // shell script for standalone on Ascend
@@ -109,10 +231,14 @@ After installing MindSpore via the official website, you can start training and
├── src
├── dataset.py // creating dataset
├── yolov3.py // yolov3 architecture
├── config.py // parameter configuration
├── config.py // default arguments for network architecture
└── utils.py // util function
├── train.py // training script
└── eval.py // evaluation script
├── default_config.yaml // configurations
├── eval.py // evaluation script
├── export.py // export script
├── mindspore_hub_conf.py // hub config
├── postprocess.py // postprocess script
└── train.py // train script
```
## [Script Parameters](#contents)
@@ -238,11 +364,11 @@ Inference result is saved in current path, you can find result in acc.log file.
| Parameters | Ascend |
| -------------------------- | ----------------------------------------------------------- |
| Model Version | YOLOv3_Resnet18 V1 |
| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 |
| uploaded Date | 09/15/2020 (month/day/year) |
| MindSpore Version | 1.0.0 |
| Dataset | COCO2017 |
| Training Parameters | epoch = 150, batch_size = 32, lr = 0.001 |
| Training Parameters | epoch = 160, batch_size = 32, lr = 0.005 |
| Optimizer | Adam |
| Loss Function | Sigmoid Cross Entropy |
| outputs | probability |
@@ -256,7 +382,7 @@ Inference result is saved in current path, you can find result in acc.log file.
| Parameters | Ascend |
| ------------------- | ----------------------------------------------- |
| Model Version | YOLOv3_Resnet18 V1 |
| Resource | Ascend 910; OS Euler2.8 |
| Uploaded Date | 09/15/2020 (month/day/year) |
| MindSpore Version | 1.0.0 |
| Dataset | COCO2017 |

View File: yolov3_resnet18/README_CN.md

@@ -95,6 +95,121 @@ The overall YOLOv3 network architecture is as follows
sh run_eval.sh [DEVICE_ID] [CKPT_PATH] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH]
```
- Training on ModelArts (if you want to run on ModelArts, refer to the following document: [modelarts](https://support.huaweicloud.com/modelarts/))
```bash
# Train 8p with Ascend on ModelArts
# (1) Perform a or b.
# a. Set "enable_modelarts=True" on default_config.yaml file.
# Set "distribute=True" on default_config.yaml file.
# Set "need_modelarts_dataset_unzip=True" on default_config.yaml file.
# Set "modelarts_dataset_unzip_name='coco'" on default_config.yaml file.
# Set "lr=0.005" on default_config.yaml file.
# Set "mindrecord_dir='/cache/data/coco/Mindrecord_train'" on default_config.yaml file.
# Set "image_dir='/cache/data'" on default_config.yaml file.
# Set "anno_path='/cache/data/coco/train_Person+Face-coco-20190118.txt'" on default_config.yaml file.
# Set "epoch_size=160" on default_config.yaml file.
# (optional) Set "pre_trained_epoch_size=YOUR_SIZE" on default_config.yaml file.
# (optional) Set "checkpoint_url='s3://dir_to_your_pretrained/'" on default_config.yaml file.
# (optional) Set "pre_trained=/cache/checkpoint_path/model.ckpt" on default_config.yaml file.
# Set other parameters you need on default_config.yaml file.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "need_modelarts_dataset_unzip=True" on the website UI interface.
# Add "modelarts_dataset_unzip_name='coco'" on the website UI interface.
# Add "distribute=True" on the website UI interface.
# Add "lr=0.005" on the website UI interface.
# Add "mindrecord_dir=/cache/data/coco/Mindrecord_train" on the website UI interface.
# Add "image_dir=/cache/data" on the website UI interface.
# Add "anno_path=/cache/data/coco/train_Person+Face-coco-20190118.txt" on the website UI interface.
# Add "epoch_size=160" on the website UI interface.
# (optional) Add "pre_trained_epoch_size=YOUR_SIZE" on the website UI interface.
# (optional) Add "checkpoint_url='s3://dir_to_your_pretrained/'" on the website UI interface.
# (optional) Add "pre_trained=/cache/checkpoint_path/model.ckpt" on the website UI interface.
# Add other parameters on the website UI interface.
# (2) Prepare model code
# (3) Upload or copy your pretrained model to the S3 bucket if you want to fine-tune.
# (4) Perform a or b. (suggested option a)
# a. First, run "train.py" locally as follows to create a MindRecord dataset from coco2017.
# "python train.py --only_create_dataset=True --mindrecord_dir=$MINDRECORD_DIR --image_dir=$IMAGE_DIR --anno_path=$ANNO_PATH"
# Second, zip the MindRecord dataset into one ".zip" file.
# Finally, upload your zip dataset to the S3 bucket. (You could also upload the uncompressed dataset, but that may be very slow.)
# b. Upload the original coco dataset to the S3 bucket.
# (Dataset conversion occurs during the training process and costs a lot of time; it is redone every time you train.)
# (5) Set the code directory to "/path/yolov3_resnet18" on the website UI interface.
# (6) Set the startup file to "train.py" on the website UI interface.
# (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (8) Create your job.
#
# Train 1p with Ascend on ModelArts
# (1) Perform a or b.
# a. Set "enable_modelarts=True" on default_config.yaml file.
# Set "need_modelarts_dataset_unzip=True" on default_config.yaml file.
# Set "modelarts_dataset_unzip_name='coco'" on default_config.yaml file.
# Set "mindrecord_dir='/cache/data/coco/Mindrecord_train'" on default_config.yaml file.
# Set "image_dir='/cache/data'" on default_config.yaml file.
# Set "anno_path='/cache/data/coco/train_Person+Face-coco-20190118.txt'" on default_config.yaml file.
# Set "epoch_size=160" on default_config.yaml file.
# (optional) Set "pre_trained_epoch_size=YOUR_SIZE" on default_config.yaml file.
# (optional) Set "checkpoint_url='s3://dir_to_your_pretrained/'" on default_config.yaml file.
# (optional) Set "pre_trained=/cache/checkpoint_path/model.ckpt" on default_config.yaml file.
# Set other parameters you need on default_config.yaml file.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "need_modelarts_dataset_unzip=True" on the website UI interface.
# Add "modelarts_dataset_unzip_name='coco'" on the website UI interface.
# Add "mindrecord_dir='/cache/data/coco/Mindrecord_train'" on the website UI interface.
# Add "image_dir='/cache/data'" on the website UI interface.
# Add "anno_path='/cache/data/coco/train_Person+Face-coco-20190118.txt'" on the website UI interface.
# Add "epoch_size=160" on the website UI interface.
# (optional) Add "pre_trained_epoch_size=YOUR_SIZE" on the website UI interface.
# (optional) Add "checkpoint_url='s3://dir_to_your_pretrained/'" on the website UI interface.
# (optional) Add "pre_trained=/cache/checkpoint_path/model.ckpt" on the website UI interface.
# Add other parameters on the website UI interface.
# (2) Prepare model code
# (3) Upload or copy your pretrained model to the S3 bucket if you want to fine-tune.
# (4) Perform a or b. (suggested option a)
# a. First, run "train.py" locally as follows to create a MindRecord dataset from coco2017.
# "python train.py --only_create_dataset=True --mindrecord_dir=$MINDRECORD_DIR --image_dir=$IMAGE_DIR --anno_path=$ANNO_PATH"
# Second, zip the MindRecord dataset into one ".zip" file.
# Finally, upload your zip dataset to the S3 bucket. (You could also upload the uncompressed dataset, but that may be very slow.)
# b. Upload the original coco dataset to the S3 bucket.
# (Dataset conversion occurs during the training process and costs a lot of time; it is redone every time you train.)
# (5) Set the code directory to "/path/yolov3_resnet18" on the website UI interface.
# (6) Set the startup file to "train.py" on the website UI interface.
# (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (8) Create your job.
#
# Eval 1p with Ascend on ModelArts
# (1) Perform a or b.
# a. Set "enable_modelarts=True" on default_config.yaml file.
# Set "need_modelarts_dataset_unzip=True" on default_config.yaml file.
# Set "modelarts_dataset_unzip_name='coco'" on default_config.yaml file.
# Set "checkpoint_url='s3://dir_to_your_trained_model/'" on default_config.yaml file.
# Set "ckpt_path='/cache/checkpoint_path/yolov3-160_156.ckpt'" on default_config.yaml file.
# Set "eval_mindrecord_dir='/cache/data/coco/Mindrecord_eval'" on default_config.yaml file.
# Set "image_dir='/cache/data'" on default_config.yaml file.
# Set "anno_path='/cache/data/coco/test_Person+Face-coco-20190118.txt'" on default_config.yaml file.
# Set other parameters you need on default_config.yaml file.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "need_modelarts_dataset_unzip=True" on the website UI interface.
# Add "modelarts_dataset_unzip_name='coco'" on the website UI interface.
# Add "checkpoint_url='s3://dir_to_your_trained_model/'" on the website UI interface.
# Add "ckpt_path='/cache/checkpoint_path/yolov3-160_156.ckpt'" on the website UI interface.
# Add "eval_mindrecord_dir='/cache/data/coco/Mindrecord_eval'" on the website UI interface.
# Add "image_dir='/cache/data'" on the website UI interface.
# Add "anno_path='/cache/data/coco/test_Person+Face-coco-20190118.txt'" on the website UI interface.
# Add other parameters on the website UI interface.
# (2) Prepare model code
# (3) Upload or copy your trained model to the S3 bucket.
# (4) Perform a or b. (suggested option a)
# a. First, run "eval.py" locally as follows to create a MindRecord dataset from coco2017.
# "python eval.py --only_create_dataset=True --eval_mindrecord_dir=$EVAL_MINDRECORD_DIR --image_dir=$EVAL_IMAGE_DIR --anno_path=$EVAL_ANNO_PATH"
# Second, zip the MindRecord dataset into one ".zip" file.
# Finally, upload your zip dataset to the S3 bucket. (You could also upload the uncompressed dataset, but that may be very slow.)
# b. Upload the original coco dataset to the S3 bucket.
# (Dataset conversion occurs during the training process and costs a lot of time; it is redone every time you train.)
# (5) Set the code directory to "/path/yolov3_resnet18" on the website UI interface.
# (6) Set the startup file to "eval.py" on the website UI interface.
# (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (8) Create your job.
```
# Script Description
## Script and Sample Code
@@ -102,9 +217,16 @@ The overall YOLOv3 network architecture is as follows
```text
└── cv
├── README.md // descriptions about all the models
├── README_CN.md // descriptions about all the models in Chinese
├── mindspore_hub_conf.md // config for mindspore hub
└── yolov3_resnet18
├── README.md // descriptions about yolov3_resnet18
├── model_utils
├── __init__.py // init file
├── config.py // Parse arguments
├── device_adapter.py // Device adapter for ModelArts
├── local_adapter.py // Local adapter
└── moxing_adapter.py // Moxing adapter for ModelArts
├── scripts
├── run_distribute_train.sh // shell script for distributed on Ascend
├── run_standalone_train.sh // shell script for standalone on Ascend
@@ -112,10 +234,14 @@ The overall YOLOv3 network architecture is as follows
├── src
├── dataset.py // creating dataset
├── yolov3.py // yolov3 architecture
├── config.py // parameter configuration
├── config.py // default arguments for network architecture
└── utils.py // util function
├── train.py // training script
└── eval.py // evaluation script
├── default_config.yaml // configurations
├── eval.py // evaluation script
├── export.py // export script
├── mindspore_hub_conf.py // hub config
├── postprocess.py // postprocess script
└── train.py // training script
```
## Script Parameters

View File: yolov3_resnet18/default_config.yaml

@@ -0,0 +1,56 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
device_target: "Ascend"
need_modelarts_dataset_unzip: True
modelarts_dataset_unzip_name: "coco"
# ==============================================================================
# Train options
only_create_dataset: False
distribute: False
lr: 0.001
mode: "sink"
epoch_size: 50
batch_size: 32
pre_trained: ""
pre_trained_epoch_size: 0
save_checkpoint_epochs: 5
save_checkpoint_dir: "./"
loss_scale: 1024
mindrecord_dir: "./Mindrecord_train"
image_dir: ""
anno_path: ""
# Eval options
eval_mindrecord_dir: "./Mindrecord_eval"
ckpt_path: ""
---
# Help description for each configuration
# Train options
only_create_dataset: "If set it true, only create Mindrecord."
distribute: "Run distribute"
lr: "Learning rate"
mode: "Run sink mode or not"
epoch_size: "Epoch size"
batch_size: "Batch size"
pre_trained: "Pretrained checkpoint file path"
pre_trained_epoch_size: "Pretrained epoch size"
save_checkpoint_epochs: "Save checkpoint epochs"
loss_scale: "Loss scale"
mindrecord_dir: "Mindrecord directory. If the mindrecord_dir is empty, it wil generate mindrecord file by image_dir and anno_path. Note if mindrecord_dir isn't empty, it will use mindrecord_dir rather than image_dir and anno_path."
image_dir: "Dataset directory, the absolute image path is joined by the image_dir and the relative path in anno_path"
anno_path: "Annotation path."
# Eval options
eval_mindrecord_dir: "Mindrecord directory for eval."
ckpt_path: "Checkpoint path."

View File: yolov3_resnet18/eval.py

@@ -15,7 +15,6 @@
"""Evaluation for yolov3-resnet18"""
import os
import argparse
import time
from mindspore import context, Tensor
from mindspore.train.serialization import load_checkpoint, load_param_into_net
@@ -24,6 +23,10 @@ from src.dataset import create_yolo_dataset, data_to_mindrecord_byte_image
from src.config import ConfigYOLOV3ResNet18
from src.utils import metrics
from model_utils.config import config as default_config
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.device_adapter import get_device_id, get_device_num
def yolo_eval(dataset_path, ckpt_path):
"""Yolov3 evaluation."""
@@ -66,40 +69,85 @@ def yolo_eval(dataset_path, ckpt_path):
for i in range(config.num_classes):
print("class {} precision is {:.2f}%, recall is {:.2f}%".format(i, precisions[i] * 100, recalls[i] * 100))
def modelarts_pre_process():
'''modelarts pre process function.'''
def unzip(zip_file, save_dir):
import zipfile
s_time = time.time()
if not os.path.exists(os.path.join(save_dir, default_config.modelarts_dataset_unzip_name)):
zip_isexist = zipfile.is_zipfile(zip_file)
if zip_isexist:
fz = zipfile.ZipFile(zip_file, 'r')
data_num = len(fz.namelist())
print("Extract Start...")
print("unzip file num: {}".format(data_num))
data_print = int(data_num / 100) if data_num > 100 else 1
i = 0
for file in fz.namelist():
if i % data_print == 0:
print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
i += 1
fz.extract(file, save_dir)
print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
int(int(time.time() - s_time) % 60)))
print("Extract Done.")
else:
print("This is not zip.")
else:
print("Zip has been extracted.")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Yolov3 evaluation')
parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
parser.add_argument("--mindrecord_dir", type=str, default="./Mindrecord_eval",
help="Mindrecord directory. If the mindrecord_dir is empty, it wil generate mindrecord file by"
"image_dir and anno_path. Note if mindrecord_dir isn't empty, it will use mindrecord_dir "
"rather than image_dir and anno_path. Default is ./Mindrecord_eval")
parser.add_argument("--image_dir", type=str, default="", help="Dataset directory, "
"the absolute image path is joined by the image_dir "
"and the relative path in anno_path.")
parser.add_argument("--anno_path", type=str, default="", help="Annotation path.")
parser.add_argument("--ckpt_path", type=str, required=True, help="Checkpoint path.")
args_opt = parser.parse_args()
if default_config.need_modelarts_dataset_unzip:
zip_file_1 = os.path.join(default_config.data_path, default_config.modelarts_dataset_unzip_name + ".zip")
save_dir_1 = os.path.join(default_config.data_path)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
sync_lock = "/tmp/unzip_sync.lock"
# It will generate mindrecord file in args_opt.mindrecord_dir,
# Each server contains at most 8 devices.
if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
print("Zip file path: ", zip_file_1)
print("Unzip file save dir: ", save_dir_1)
unzip(zip_file_1, save_dir_1)
print("===Finish extract data synchronization===")
try:
os.mknod(sync_lock)
except IOError:
pass
while True:
if os.path.exists(sync_lock):
break
time.sleep(1)
print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))
@moxing_wrapper(pre_process=modelarts_pre_process)
def run_eval():
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=get_device_id())
# It will generate mindrecord file in default_config.eval_mindrecord_dir,
# and the file name is yolo.mindrecord0, 1, ... file_num.
if not os.path.isdir(args_opt.mindrecord_dir):
os.makedirs(args_opt.mindrecord_dir)
if not os.path.isdir(default_config.eval_mindrecord_dir):
os.makedirs(default_config.eval_mindrecord_dir)
yolo_prefix = "yolo.mindrecord"
mindrecord_file = os.path.join(args_opt.mindrecord_dir, yolo_prefix + "0")
mindrecord_file = os.path.join(default_config.eval_mindrecord_dir, yolo_prefix + "0")
if not os.path.exists(mindrecord_file):
if os.path.isdir(args_opt.image_dir) and os.path.exists(args_opt.anno_path):
if os.path.isdir(default_config.image_dir) and os.path.exists(default_config.anno_path):
print("Create Mindrecord")
data_to_mindrecord_byte_image(args_opt.image_dir,
args_opt.anno_path,
args_opt.mindrecord_dir,
data_to_mindrecord_byte_image(default_config.image_dir,
default_config.anno_path,
default_config.eval_mindrecord_dir,
prefix=yolo_prefix,
file_num=8)
print("Create Mindrecord Done, at {}".format(args_opt.mindrecord_dir))
print("Create Mindrecord Done, at {}".format(default_config.eval_mindrecord_dir))
else:
print("image_dir or anno_path not exits")
print("Start Eval!")
yolo_eval(mindrecord_file, args_opt.ckpt_path)
if not default_config.only_create_dataset:
print("Start Eval!")
yolo_eval(mindrecord_file, default_config.ckpt_path)
if __name__ == '__main__':
run_eval()

View File: yolov3_resnet18/model_utils/config.py

@@ -0,0 +1,126 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Parse arguments"""
import os
import ast
import argparse
from pprint import pformat
import yaml
class Config:
"""
Configuration namespace. Convert dictionary to members.
"""
def __init__(self, cfg_dict):
for k, v in cfg_dict.items():
if isinstance(v, (list, tuple)):
setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v])
else:
setattr(self, k, Config(v) if isinstance(v, dict) else v)
def __str__(self):
return pformat(self.__dict__)
def __repr__(self):
return self.__str__()
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
"""
Parse command line arguments to the configuration according to the default yaml.
Args:
parser: Parent parser.
cfg: Base configuration.
helper: Helper description.
choices: Choice options.
cfg_path: Path to the default yaml config.
"""
parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
parents=[parser])
helper = {} if helper is None else helper
choices = {} if choices is None else choices
for item in cfg:
if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict):
help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path)
choice = choices[item] if item in choices else None
if isinstance(cfg[item], bool):
parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice,
help=help_description)
else:
parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice,
help=help_description)
args = parser.parse_args()
return args
def parse_yaml(yaml_path):
"""
Parse the yaml config file.
Args:
yaml_path: Path to the yaml config.
"""
with open(yaml_path, 'r') as fin:
try:
cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader)
cfgs = [x for x in cfgs]
if len(cfgs) == 1:
cfg_helper = {}
cfg = cfgs[0]
cfg_choices = {}
elif len(cfgs) == 2:
cfg, cfg_helper = cfgs
cfg_choices = {}
elif len(cfgs) == 3:
cfg, cfg_helper, cfg_choices = cfgs
else:
raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")
print(cfg_helper)
except:
raise ValueError("Failed to parse yaml")
return cfg, cfg_helper, cfg_choices
def merge(args, cfg):
"""
Merge the base config from yaml file and command line arguments.
Args:
args: Command line arguments.
cfg: Base configuration.
"""
args_var = vars(args)
for item in args_var:
cfg[item] = args_var[item]
return cfg
def get_config():
"""
Get Config according to the yaml file and cli arguments.
"""
parser = argparse.ArgumentParser(description="default name", add_help=False)
current_dir = os.path.dirname(os.path.abspath(__file__))
parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../default_config.yaml"),
help="Config file path")
path_args, _ = parser.parse_known_args()
default, helper, choices = parse_yaml(path_args.config_path)
args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path)
final_config = merge(args, default)
return Config(final_config)
config = get_config()

View File: yolov3_resnet18/model_utils/device_adapter.py

@@ -0,0 +1,27 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Device adapter for ModelArts"""
from .config import config
if config.enable_modelarts:
from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
__all__ = [
"get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]

View File: yolov3_resnet18/model_utils/local_adapter.py

@@ -0,0 +1,36 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Local adapter"""
import os
def get_device_id():
device_id = os.getenv('DEVICE_ID', '0')
return int(device_id)
def get_device_num():
device_num = os.getenv('RANK_SIZE', '1')
return int(device_num)
def get_rank_id():
global_rank_id = os.getenv('RANK_ID', '0')
return int(global_rank_id)
def get_job_id():
return "Local Job"

View File: yolov3_resnet18/model_utils/moxing_adapter.py

@@ -0,0 +1,116 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Moxing adapter for ModelArts"""
import os
import functools
from mindspore import context
from .config import config
_global_sync_count = 0
def get_device_id():
device_id = os.getenv('DEVICE_ID', '0')
return int(device_id)
def get_device_num():
device_num = os.getenv('RANK_SIZE', '1')
return int(device_num)
def get_rank_id():
global_rank_id = os.getenv('RANK_ID', '0')
return int(global_rank_id)
def get_job_id():
    # JOB_ID may be unset (None) or empty; fall back to "default" in both cases.
    job_id = os.getenv('JOB_ID') or "default"
    return job_id
def sync_data(from_path, to_path):
"""
Download data from remote OBS to a local directory if the first URL is a remote URL and the second one is a local path.
Otherwise, upload data from the local directory to remote OBS.
"""
import moxing as mox
import time
global _global_sync_count
sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
_global_sync_count += 1
# Each server contains at most 8 devices.
if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
print("from path: ", from_path)
print("to path: ", to_path)
mox.file.copy_parallel(from_path, to_path)
print("===finish data synchronization===")
try:
os.mknod(sync_lock)
except IOError:
pass
print("===save flag===")
while True:
if os.path.exists(sync_lock):
break
time.sleep(1)
print("Finish sync data from {} to {}.".format(from_path, to_path))
def moxing_wrapper(pre_process=None, post_process=None):
"""
Moxing wrapper to download dataset and upload outputs.
"""
def wrapper(run_func):
@functools.wraps(run_func)
def wrapped_func(*args, **kwargs):
# Download data from data_url
if config.enable_modelarts:
if config.data_url:
sync_data(config.data_url, config.data_path)
print("Dataset downloaded: ", os.listdir(config.data_path))
if config.checkpoint_url:
sync_data(config.checkpoint_url, config.load_path)
print("Preload downloaded: ", os.listdir(config.load_path))
if config.train_url:
sync_data(config.train_url, config.output_path)
print("Workspace downloaded: ", os.listdir(config.output_path))
context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
config.device_num = get_device_num()
config.device_id = get_device_id()
if not os.path.exists(config.output_path):
os.makedirs(config.output_path)
if pre_process:
pre_process()
# Run the main function
run_func(*args, **kwargs)
# Upload data to train_url
if config.enable_modelarts:
if post_process:
post_process()
if config.train_url:
print("Start to copy output directory")
sync_data(config.output_path, config.train_url)
return wrapped_func
return wrapper

View File: yolov3_resnet18/scripts/run_distribute_train.sh

@@ -17,7 +17,7 @@
echo "======================================================================================================================================================="
echo "Please run the script as: "
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
echo "For example: sh run_distribute_train.sh 8 150 /data/Mindrecord_train /data /data/train.txt /data/hccl.json /opt/yolov3-150.ckpt(optional) 100(optional)"
echo "For example: sh run_distribute_train.sh 8 160 /data/Mindrecord_train /data /data/train.txt /data/hccl.json /opt/yolov3-150.ckpt(optional) 100(optional)"
echo "It is better to use absolute path."
echo "The learning rate is 0.005 as default, if you want other lr, please change the value in this script."
echo "======================================================================================================================================================="
@@ -63,7 +63,9 @@ do
rm -rf LOG$i
mkdir ./LOG$i
cp *.py ./LOG$i
cp *.yaml ./LOG$i
cp -r ./src ./LOG$i
cp -r ./model_utils ./LOG$i
cd ./LOG$i || exit
export RANK_ID=$i
echo "start training for rank $i, device $DEVICE_ID"
@@ -74,8 +76,6 @@ do
taskset -c $cmdopt python train.py \
--distribute=True \
--lr=0.005 \
--device_num=$RANK_SIZE \
--device_id=$DEVICE_ID \
--mindrecord_dir=$MINDRECORD_DIR \
--image_dir=$IMAGE_DIR \
--epoch_size=$EPOCH_SIZE \
@@ -87,8 +87,6 @@ do
taskset -c $cmdopt python train.py \
--distribute=True \
--lr=0.005 \
--device_num=$RANK_SIZE \
--device_id=$DEVICE_ID \
--mindrecord_dir=$MINDRECORD_DIR \
--image_dir=$IMAGE_DIR \
--epoch_size=$EPOCH_SIZE \

View File: yolov3_resnet18/scripts/run_eval.sh

@@ -23,4 +23,8 @@ echo "==========================================================================
BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
cd $BASE_PATH/../ || exit
python eval.py --device_id=$1 --ckpt_path=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5
export RANK_SIZE=1
export DEVICE_ID=$1
export RANK_ID=$1
python eval.py --ckpt_path=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5

View File: yolov3_resnet18/scripts/run_standalone_train.sh

@@ -17,7 +17,7 @@
echo "========================================================================================================================================="
echo "Please run the script as: "
echo "sh run_standalone_train.sh DEVICE_ID EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
echo "for example: sh run_standalone_train.sh 0 50 ./Mindrecord_train ./dataset ./dataset/train.txt /opt/yolov3-50.ckpt(optional) 30(optional)"
echo "for example: sh run_standalone_train.sh 0 60 ./Mindrecord_train ./dataset ./dataset/train.txt /opt/yolov3-50.ckpt(optional) 30(optional)"
echo "========================================================================================================================================="
if [ $# != 5 ] && [ $# != 7 ]
@@ -30,12 +30,16 @@ fi
BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
cd $BASE_PATH/../ || exit
export RANK_SIZE=1
export DEVICE_ID=$1
export RANK_ID=$1
if [ $# == 5 ]
then
python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5
python train.py --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5
fi
if [ $# == 7 ]
then
python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5 --pre_trained=$6 --pre_trained_epoch_size=$7
python train.py --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5 --pre_trained=$6 --pre_trained_epoch_size=$7
fi

View File: yolov3_resnet18/train.py

@@ -23,8 +23,7 @@ Note if mindrecord_dir isn't empty, it will use mindrecord_dir rather than image
"""
import os
import argparse
import ast
import time
import numpy as np
import mindspore.nn as nn
from mindspore import context, Tensor
@@ -40,6 +39,10 @@ from src.yolov3 import yolov3_resnet18, YoloWithLossCell, TrainingWrapper
from src.dataset import create_yolo_dataset, data_to_mindrecord_byte_image
from src.config import ConfigYOLOV3ResNet18
from model_utils.config import config as default_config
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.device_adapter import get_device_id, get_rank_id, get_device_num
set_seed(1)
def get_lr(learning_rate, start_step, global_step, decay_step, decay_rate, steps=False):
@@ -63,71 +66,99 @@ def init_net_param(network, init_value='ones'):
p.set_data(initializer(init_value, p.data.shape, p.data.dtype))
def main():
parser = argparse.ArgumentParser(description="YOLOv3 train")
parser.add_argument("--only_create_dataset", type=ast.literal_eval, default=False,
help="If set it true, only create Mindrecord, default is False.")
parser.add_argument("--distribute", type=ast.literal_eval, default=False, help="Run distribute, default is False.")
parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
parser.add_argument("--lr", type=float, default=0.001, help="Learning rate, default is 0.001.")
parser.add_argument("--mode", type=str, default="sink", help="Run sink mode or not, default is sink")
parser.add_argument("--epoch_size", type=int, default=50, help="Epoch size, default is 50")
parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained checkpoint file path")
parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size")
parser.add_argument("--save_checkpoint_epochs", type=int, default=5, help="Save checkpoint epochs, default is 5.")
parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
parser.add_argument("--mindrecord_dir", type=str, default="./Mindrecord_train",
help="Mindrecord directory. If the mindrecord_dir is empty, it wil generate mindrecord file by "
"image_dir and anno_path. Note if mindrecord_dir isn't empty, it will use mindrecord_dir "
"rather than image_dir and anno_path. Default is ./Mindrecord_train")
parser.add_argument("--image_dir", type=str, default="", help="Dataset directory, "
"the absolute image path is joined by the image_dir "
"and the relative path in anno_path")
parser.add_argument("--anno_path", type=str, default="", help="Annotation path.")
args_opt = parser.parse_args()
def modelarts_pre_process():
'''modelarts pre process function.'''
def unzip(zip_file, save_dir):
import zipfile
s_time = time.time()
if not os.path.exists(os.path.join(save_dir, default_config.modelarts_dataset_unzip_name)):
zip_isexist = zipfile.is_zipfile(zip_file)
if zip_isexist:
fz = zipfile.ZipFile(zip_file, 'r')
data_num = len(fz.namelist())
print("Extract Start...")
print("unzip file num: {}".format(data_num))
data_print = int(data_num / 100) if data_num > 100 else 1
i = 0
for file in fz.namelist():
if i % data_print == 0:
print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
i += 1
fz.extract(file, save_dir)
print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
int(int(time.time() - s_time) % 60)))
print("Extract Done.")
else:
print("This is not zip.")
else:
print("Zip has been extracted.")
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
if args_opt.distribute:
device_num = args_opt.device_num
if default_config.need_modelarts_dataset_unzip:
zip_file_1 = os.path.join(default_config.data_path, default_config.modelarts_dataset_unzip_name + ".zip")
save_dir_1 = os.path.join(default_config.data_path)
sync_lock = "/tmp/unzip_sync.lock"
# Each server contains at most 8 devices.
if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
print("Zip file path: ", zip_file_1)
print("Unzip file save dir: ", save_dir_1)
unzip(zip_file_1, save_dir_1)
print("===Finish extract data synchronization===")
try:
os.mknod(sync_lock)
except IOError:
pass
while True:
if os.path.exists(sync_lock):
break
time.sleep(1)
print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))
default_config.save_checkpoint_dir = os.path.join(default_config.output_path, default_config.save_checkpoint_dir)
@moxing_wrapper(pre_process=modelarts_pre_process)
def run_train():
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=get_device_id())
rank = get_rank_id()
device_num = get_device_num()
if default_config.distribute:
context.reset_auto_parallel_context()
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
device_num=device_num)
init()
rank = args_opt.device_id % device_num
else:
rank = 0
device_num = 1
print("Start create dataset!")
# It will generate mindrecord file in args_opt.mindrecord_dir,
# It will generate mindrecord file in default_config.mindrecord_dir,
# and the file name is yolo.mindrecord0, 1, ... file_num.
if not os.path.isdir(args_opt.mindrecord_dir):
os.makedirs(args_opt.mindrecord_dir)
if not os.path.isdir(default_config.mindrecord_dir):
os.makedirs(default_config.mindrecord_dir)
prefix = "yolo.mindrecord"
mindrecord_file = os.path.join(args_opt.mindrecord_dir, prefix + "0")
mindrecord_file = os.path.join(default_config.mindrecord_dir, prefix + "0")
if not os.path.exists(mindrecord_file):
if os.path.isdir(args_opt.image_dir) and os.path.exists(args_opt.anno_path):
if os.path.isdir(default_config.image_dir) and os.path.exists(default_config.anno_path):
print("Create Mindrecord.")
data_to_mindrecord_byte_image(args_opt.image_dir,
args_opt.anno_path,
args_opt.mindrecord_dir,
data_to_mindrecord_byte_image(default_config.image_dir,
default_config.anno_path,
default_config.mindrecord_dir,
prefix,
8)
print("Create Mindrecord Done, at {}".format(args_opt.mindrecord_dir))
print("Create Mindrecord Done, at {}".format(default_config.mindrecord_dir))
else:
raise ValueError('image_dir {} or anno_path {} does not exist'.format(\
args_opt.image_dir, args_opt.anno_path))
raise ValueError('image_dir {} or anno_path {} does not exist'.
format(default_config.image_dir, default_config.anno_path))
if not args_opt.only_create_dataset:
loss_scale = float(args_opt.loss_scale)
if not default_config.only_create_dataset:
loss_scale = float(default_config.loss_scale)
# When creating MindDataset, use the first mindrecord file, such as yolo.mindrecord0.
dataset = create_yolo_dataset(mindrecord_file,
batch_size=args_opt.batch_size, device_num=device_num, rank=rank)
batch_size=default_config.batch_size, device_num=device_num, rank=rank)
dataset_size = dataset.get_dataset_size()
print("Create dataset done!")
@@ -136,18 +167,20 @@ def main():
init_net_param(net, "XavierUniform")
# checkpoint
ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory='./ckpt_' + str(rank) + '/', config=ckpt_config)
ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * default_config.save_checkpoint_epochs)
save_ckpt_dir = os.path.join(default_config.save_checkpoint_dir, 'ckpt_' + str(rank) + '/')
ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory=save_ckpt_dir, config=ckpt_config)
if args_opt.pre_trained:
if args_opt.pre_trained_epoch_size <= 0:
if default_config.pre_trained:
if default_config.pre_trained_epoch_size <= 0:
raise KeyError("pre_trained_epoch_size must be greater than 0.")
param_dict = load_checkpoint(args_opt.pre_trained)
param_dict = load_checkpoint(default_config.pre_trained)
load_param_into_net(net, param_dict)
total_epoch_size = 60
if args_opt.distribute:
if default_config.distribute:
total_epoch_size = 160
lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=args_opt.pre_trained_epoch_size * dataset_size,
lr = Tensor(get_lr(learning_rate=default_config.lr,
start_step=default_config.pre_trained_epoch_size * dataset_size,
global_step=total_epoch_size * dataset_size,
decay_step=1000, decay_rate=0.95, steps=True))
opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
@@ -157,11 +190,11 @@ def main():
model = Model(net)
dataset_sink_mode = False
if args_opt.mode == "sink":
if default_config.mode == "sink":
print("In sink mode, one epoch return a loss.")
dataset_sink_mode = True
print("Start train YOLOv3, the first epoch will be slower because of the graph compilation.")
model.train(args_opt.epoch_size, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode)
model.train(default_config.epoch_size, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode)
if __name__ == '__main__':
main()
run_train()