diff --git a/model_zoo/official/cv/yolov3_resnet18/README.md b/model_zoo/official/cv/yolov3_resnet18/README.md index 17116d034ad..34824272e73 100644 --- a/model_zoo/official/cv/yolov3_resnet18/README.md +++ b/model_zoo/official/cv/yolov3_resnet18/README.md @@ -79,7 +79,7 @@ Dataset used: [COCO2017]() After installing MindSpore via the official website, you can start training and evaluation on Ascend as follows: -- running on Ascend +- Running on Ascend ```shell script #run standalone training example @@ -92,16 +92,138 @@ After installing MindSpore via the official website, you can start training and sh run_eval.sh [DEVICE_ID] [CKPT_PATH] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] ``` +- Running on [ModelArts](https://support.huaweicloud.com/modelarts/) + + ```bash + # Train 8p with Ascend + # (1) Perform a or b. + # a. Set "enable_modelarts=True" on default_config.yaml file. + # Set "distribute=True" on default_config.yaml file. + # Set "need_modelarts_dataset_unzip=True" on default_config.yaml file. + # Set "modelarts_dataset_unzip_name='coco'" on default_config.yaml file. + # Set "lr=0.005" on default_config.yaml file. + # Set "mindrecord_dir='/cache/data/coco/Mindrecord_train'" on default_config.yaml file. + # Set "image_dir='/cache/data'" on default_config.yaml file. + # Set "anno_path='/cache/data/coco/train_Person+Face-coco-20190118.txt'" on default_config.yaml file. + # Set "epoch_size=160" on default_config.yaml file. + # (optional)Set "pre_trained_epoch_size=YOUR_SIZE" on default_config.yaml file. + # (optional)Set "checkpoint_url='s3://dir_to_your_pretrained/'" on default_config.yaml file. + # (optional)Set "pre_trained=/cache/checkpoint_path/model.ckpt" on default_config.yaml file. + # Set other parameters on default_config.yaml file you need. + # b. Add "enable_modelarts=True" on the website UI interface. + # Add "need_modelarts_dataset_unzip=True" on the website UI interface. + # Add "modelarts_dataset_unzip_name='coco'" on the website UI interface. 
+ # Add "distribute=True" on the website UI interface. + # Add "lr=0.005" on the website UI interface. + # Add "mindrecord_dir=/cache/data/coco/Mindrecord_train" on the website UI interface. + # Add "image_dir=/cache/data" on the website UI interface. + # Add "anno_path=/cache/data/coco/train_Person+Face-coco-20190118.txt" on the website UI interface. + # Add "epoch_size=160" on the website UI interface. + # (optional)Add "pre_trained_epoch_size=YOUR_SIZE" on the website UI interface. + # (optional)Add "checkpoint_url='s3://dir_to_your_pretrained/'" on the website UI interface. + # (optional)Add "pre_trained=/cache/checkpoint_path/model.ckpt" on the website UI interface. + # Add other parameters on the website UI interface. + # (3) Upload or copy your pretrained model to S3 bucket if you want to finetune. + # (4) Perform a or b. (suggested option a) + # a. First, run "train.py" like the following to create MindRecord dataset locally from coco2017. + # "python train.py --only_create_dataset=True --mindrecord_dir=$MINDRECORD_DIR --image_dir=$IMAGE_DIR --anno_path=$ANNO_PATH" + # Second, zip MindRecord dataset to one zip file. + # Finally, Upload your zip dataset to S3 bucket.(you could also upload the origin mindrecord dataset, but it can be so slow.) + # b. Upload the original coco dataset to S3 bucket. + # (Data set conversion occurs during training process and costs a lot of time. it happens every time you train.) + # (5) Set the code directory to "/path/yolov3_resnet18" on the website UI interface. + # (6) Set the startup file to "train.py" on the website UI interface. + # (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface. + # (8) Create your job. + # + # Train 1p with Ascend + # (1) Perform a or b. + # a. Set "enable_modelarts=True" on default_config.yaml file. + # Set "need_modelarts_dataset_unzip=True" on default_config.yaml file. 
+ # Set "modelarts_dataset_unzip_name='coco'" on default_config.yaml file. + # Set "mindrecord_dir='/cache/data/coco/Mindrecord_train'" on default_config.yaml file. + # Set "image_dir='/cache/data'" on default_config.yaml file. + # Set "anno_path='/cache/data/coco/train_Person+Face-coco-20190118.txt'" on default_config.yaml file. + # Set "epoch_size=160" on default_config.yaml file. + # (optional)Set "pre_trained_epoch_size=YOUR_SIZE" on default_config.yaml file. + # (optional)Set "checkpoint_url='s3://dir_to_your_pretrained/'" on default_config.yaml file. + # (optional)Set "pre_trained=/cache/checkpoint_path/model.ckpt" on default_config.yaml file. + # Set other parameters on default_config.yaml file you need. + # b. Add "enable_modelarts=True" on the website UI interface. + # Add "need_modelarts_dataset_unzip=True" on the website UI interface. + # Add "modelarts_dataset_unzip_name='coco'" on the website UI interface. + # Add "mindrecord_dir='/cache/data/coco/Mindrecord_train'" on the website UI interface. + # Add "image_dir='/cache/data'" on the website UI interface. + # Add "anno_path='/cache/data/coco/train_Person+Face-coco-20190118.txt'" on the website UI interface. + # Add "epoch_size=160" on the website UI interface. + # (optional)Add "pre_trained_epoch_size=YOUR_SIZE" on the website UI interface. + # (optional)Add "checkpoint_url='s3://dir_to_your_pretrained/'" on the website UI interface. + # (optional)Add "pre_trained=/cache/checkpoint_path/model.ckpt" on the website UI interface. + # Add other parameters on the website UI interface. + # (3) Upload or copy your pretrained model to S3 bucket if you want to finetune. + # (4) Perform a or b. (suggested option a) + # a. First, run "train.py" like the following to create MindRecord dataset locally from coco2017. + # "python train.py --only_create_dataset=True --mindrecord_dir=$MINDRECORD_DIR --image_dir=$IMAGE_DIR --anno_path=$ANNO_PATH" + # Second, zip MindRecord dataset to one zip file. 
+ # Finally, Upload your zip dataset to S3 bucket.(you could also upload the origin mindrecord dataset, but it can be so slow.) + # b. Upload the original coco dataset to S3 bucket. + # (Data set conversion occurs during training process and costs a lot of time. it happens every time you train.) + # (5) Set the code directory to "/path/yolov3_resnet18" on the website UI interface. + # (6) Set the startup file to "train.py" on the website UI interface. + # (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface. + # (8) Create your job. + # + # Eval 1p with Ascend + # (1) Perform a or b. + # a. Set "enable_modelarts=True" on default_config.yaml file. + # Set "need_modelarts_dataset_unzip=True" on default_config.yaml file. + # Set "modelarts_dataset_unzip_name='coco'" on default_config.yaml file. + # Set "checkpoint_url='s3://dir_to_your_trained_model/'" on default_config.yaml file. + # Set "ckpt_path='/cache/checkpoint_path/yolov3-160_156.ckpt'" on default_config.yaml file. + # Set "eval_mindrecord_dir='/cache/data/coco/Mindrecord_eval'" on default_config.yaml file. + # Set "image_dir='/cache/data'" on default_config.yaml file. + # Set "anno_path='/cache/data/coco/test_Person+Face-coco-20190118.txt'" on default_config.yaml file. + # Set other parameters on default_config.yaml file you need. + # b. Add "enable_modelarts=True" on the website UI interface. + # Add "need_modelarts_dataset_unzip=True" on the website UI interface. + # Add "modelarts_dataset_unzip_name='coco'" on the website UI interface. + # Add "checkpoint_url='s3://dir_to_your_trained_model/'" on the website UI interface. + # Add "ckpt_path='/cache/checkpoint_path/yolov3-160_156.ckpt'" on the website UI interface. + # Add "eval_mindrecord_dir='/cache/data/coco/Mindrecord_eval'" on the website UI interface. + # Add "image_dir='/cache/data'" on the website UI interface. 
+ # Add "anno_path='/cache/data/coco/test_Person+Face-coco-20190118.txt'" on the website UI interface. + # Add other parameters on the website UI interface. + # (3) Upload or copy your trained model to S3 bucket. + # (4) Perform a or b. (suggested option a) + # a. First, run "eval.py" like the following to create MindRecord dataset locally from coco2017. + # "python eval.py --only_create_dataset=True --eval_mindrecord_dir=$EVAL_MINDRECORD_DIR --image_dir=$EVAL_IMAGE_DIR --anno_path=$EVAL_ANNO_PATH" + # Second, zip MindRecord dataset to one zip file. + # Finally, Upload your zip dataset to S3 bucket.(you could also upload the origin mindrecord dataset, but it can be so slow.) + # b. Upload the original coco dataset to S3 bucket. + # (Data set conversion occurs during training process and costs a lot of time. it happens every time you train.) + # (5) Set the code directory to "/path/yolov3_resnet18" on the website UI interface. + # (6) Set the startup file to "eval.py" on the website UI interface. + # (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface. + # (8) Create your job. 
+ ``` + # [Script Description](#contents) ## [Script and Sample Code](#contents) -```python +```text └── cv ├── README.md // descriptions about all the models ├── mindspore_hub_conf.md // config for mindspore hub └── yolov3_resnet18 ├── README.md // descriptions about yolov3_resnet18 + ├── README_CN.md // descriptions about yolov3_resnet18 with Chinese + ├── model_utils + ├── __init__.py // init file + ├── config.py // Parse arguments + ├── device_adapter.py // Device adapter for ModelArts + ├── local_adapter.py // Local adapter + └── moxing_adapter.py // Moxing adapter for ModelArts ├── scripts ├── run_distribute_train.sh // shell script for distributed on Ascend ├── run_standalone_train.sh // shell script for distributed on Ascend @@ -109,10 +231,14 @@ After installing MindSpore via the official website, you can start training and ├── src ├── dataset.py // creating dataset ├── yolov3.py // yolov3 architecture - ├── config.py // parameter configuration + ├── config.py // default arguments for network architecture └── utils.py // util function - ├── train.py // training script - └── eval.py // evaluation script + ├── default_config.yaml // configurations + ├── eval.py // evaluation script + ├── export.py // export script + ├── mindspore_hub_conf.py // hub config + ├── postprocess.py // postprocess script + └── train.py // train script ``` ## [Script Parameters](#contents) @@ -238,11 +364,11 @@ Inference result is saved in current path, you can find result in acc.log file. 
| Parameters | Ascend | | -------------------------- | ----------------------------------------------------------- | | Model Version | YOLOv3_Resnet18 V1 | -| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 | +| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 | | uploaded Date | 09/15/2020 (month/day/year) | | MindSpore Version | 1.0.0 | | Dataset | COCO2017 | -| Training Parameters | epoch = 150, batch_size = 32, lr = 0.001 | +| Training Parameters | epoch = 160, batch_size = 32, lr = 0.005 | | Optimizer | Adam | | Loss Function | Sigmoid Cross Entropy | | outputs | probability | @@ -256,7 +382,7 @@ Inference result is saved in current path, you can find result in acc.log file. | Parameters | Ascend | | ------------------- | ----------------------------------------------- | | Model Version | YOLOv3_Resnet18 V1 | -| Resource | Ascend 910; OS Euler2.8 | +| Resource | Ascend 910; OS Euler2.8 | | Uploaded Date | 09/15/2020 (month/day/year) | | MindSpore Version | 1.0.0 | | Dataset | COCO2017 | diff --git a/model_zoo/official/cv/yolov3_resnet18/README_CN.md b/model_zoo/official/cv/yolov3_resnet18/README_CN.md index 3af48ba4eaf..e22f6ed1900 100644 --- a/model_zoo/official/cv/yolov3_resnet18/README_CN.md +++ b/model_zoo/official/cv/yolov3_resnet18/README_CN.md @@ -95,6 +95,121 @@ YOLOv3整体网络架构如下: sh run_eval.sh [DEVICE_ID] [CKPT_PATH] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] ``` +- 在 ModelArts 进行训练 (如果你想在modelarts上运行,可以参考以下文档 [modelarts](https://support.huaweicloud.com/modelarts/)) + + ```bash + # 在 ModelArts 上使用8卡训练 + # (1) 执行a或者b + # a. 
在 default_config.yaml 文件中设置 "enable_modelarts=True" + # 在 default_config.yaml 文件中设置 "distribute=True" + # 在 default_config.yaml 文件中设置 "need_modelarts_dataset_unzip=True" + # 在 default_config.yaml 文件中设置 "modelarts_dataset_unzip_name='coco'" + # 在 default_config.yaml 文件中设置 "lr=0.005" + # 在 default_config.yaml 文件中设置 "mindrecord_dir='/cache/data/coco/Mindrecord_train'" + # 在 default_config.yaml 文件中设置 "image_dir='/cache/data'" + # 在 default_config.yaml 文件中设置 "anno_path='/cache/data/coco/train_Person+Face-coco-20190118.txt'" + # 在 default_config.yaml 文件中设置 "epoch_size=160" + # (可选)在 default_config.yaml 文件中设置 "pre_trained_epoch_size=YOUR_SIZE" + # (可选)在 default_config.yaml 文件中设置 "checkpoint_url='s3://dir_to_your_pretrained/'" + # (可选)在 default_config.yaml 文件中设置 "pre_trained=/cache/checkpoint_path/model.ckpt" + # 在 default_config.yaml 文件中设置 其他参数 + # b. 在网页上设置 "enable_modelarts=True" + # 在网页上设置 "need_modelarts_dataset_unzip=True" + # 在网页上设置 "modelarts_dataset_unzip_name='coco'" + # 在网页上设置 "distribute=True" + # 在网页上设置 "lr=0.005" + # 在网页上设置 "mindrecord_dir=/cache/data/coco/Mindrecord_train" + # 在网页上设置 "image_dir=/cache/data" + # 在网页上设置 "anno_path=/cache/data/coco/train_Person+Face-coco-20190118.txt" + # 在网页上设置 "epoch_size=160" + # (可选)在网页上设置 "pre_trained_epoch_size=YOUR_SIZE" + # (可选)在网页上设置 "checkpoint_url='s3://dir_to_your_pretrained/'" + # (可选)在网页上设置 "pre_trained=/cache/checkpoint_path/model.ckpt" + # 在网页上设置 其他参数 + # (3) 如果选择微调您的模型,请上传你的预训练模型到 S3 桶上 + # (4) 执行a或者b (推荐选择 a) + # a. 第一, 根据以下方式在本地运行 "train.py" 脚本来生成 MindRecord 格式的数据集。 + # "python train.py --only_create_dataset=True --mindrecord_dir=$MINDRECORD_DIR --image_dir=$IMAGE_DIR --anno_path=$ANNO_PATH" + # 第二, 将该数据集压缩为一个 ".zip" 文件。 + # 最后, 上传你的压缩数据集到 S3 桶上 (你也可以上传未压缩的数据集,但那可能会很慢。) + # b. 
上传原始 coco 数据集到 S3 桶上。 + # (数据集转换发生在训练过程中,需要花费较多的时间。每次训练的时候都会重新进行转换。) + # (5) 在网页上设置你的代码路径为 "/path/yolov3_resnet18" + # (6) 在网页上设置启动文件为 "train.py" + # (7) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等 + # (8) 创建训练作业 + # + # 在 ModelArts 上使用单卡训练 + # (1) 执行a或者b + # a. 在 default_config.yaml 文件中设置 "enable_modelarts=True" + # 在 default_config.yaml 文件中设置 "need_modelarts_dataset_unzip=True" + # 在 default_config.yaml 文件中设置 "modelarts_dataset_unzip_name='coco'" + # 在 default_config.yaml 文件中设置 "mindrecord_dir='/cache/data/coco/Mindrecord_train'" + # 在 default_config.yaml 文件中设置 "image_dir='/cache/data'" + # 在 default_config.yaml 文件中设置 "anno_path='/cache/data/coco/train_Person+Face-coco-20190118.txt'" + # 在 default_config.yaml 文件中设置 "epoch_size=160" + # (可选)在 default_config.yaml 文件中设置 "pre_trained_epoch_size=YOUR_SIZE" + # (可选)在 default_config.yaml 文件中设置 "checkpoint_url='s3://dir_to_your_pretrained/'" + # (可选)在 default_config.yaml 文件中设置 "pre_trained=/cache/checkpoint_path/model.ckpt" + # 在 default_config.yaml 文件中设置 其他参数 + # b. 在网页上设置 "enable_modelarts=True" + # 在网页上设置 "need_modelarts_dataset_unzip=True" + # 在网页上设置 "modelarts_dataset_unzip_name='coco'" + # 在网页上设置 "mindrecord_dir='/cache/data/coco/Mindrecord_train'" + # 在网页上设置 "image_dir='/cache/data'" + # 在网页上设置 "anno_path='/cache/data/coco/train_Person+Face-coco-20190118.txt'" + # 在网页上设置 "epoch_size=160" + # (可选)在网页上设置 "pre_trained_epoch_size=YOUR_SIZE" + # (可选)在网页上设置 "checkpoint_url='s3://dir_to_your_pretrained/'" + # (可选)在网页上设置 "pre_trained=/cache/checkpoint_path/model.ckpt" + # 在网页上设置 其他参数 + # (3) 如果选择微调您的模型,上传你的预训练模型到 S3 桶上 + # (4) 执行a或者b (推荐选择 a) + # a. 第一, 根据以下方式在本地运行 "train.py" 脚本来生成 MindRecord 格式的数据集。 + # "python train.py --only_create_dataset=True --mindrecord_dir=$MINDRECORD_DIR --image_dir=$IMAGE_DIR --anno_path=$ANNO_PATH" + # 第二, 将该数据集压缩为一个 ".zip" 文件。 + # 最后, 上传你的压缩数据集到 S3 桶上 (你也可以上传未压缩的数据集,但那可能会很慢。) + # b. 
上传原始 coco 数据集到 S3 桶上。 + # (数据集转换发生在训练过程中,需要花费较多的时间。每次训练的时候都会重新进行转换。) + # (5) 在网页上设置你的代码路径为 "/path/yolov3_resnet18" + # (6) 在网页上设置启动文件为 "train.py" + # (7) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等 + # (8) 创建训练作业 + # + # 在 ModelArts 上使用单卡验证 + # (1) 执行a或者b + # a. 在 default_config.yaml 文件中设置 "enable_modelarts=True" + # 在 default_config.yaml 文件中设置 "need_modelarts_dataset_unzip=True" + # 在 default_config.yaml 文件中设置 "modelarts_dataset_unzip_name='coco'" + # 在 default_config.yaml 文件中设置 "checkpoint_url='s3://dir_to_your_trained_model/'" + # 在 default_config.yaml 文件中设置 "ckpt_path='/cache/checkpoint_path/yolov3-160_156.ckpt'" + # 在 default_config.yaml 文件中设置 "eval_mindrecord_dir='/cache/data/coco/Mindrecord_eval'" + # 在 default_config.yaml 文件中设置 "image_dir='/cache/data'" + # 在 default_config.yaml 文件中设置 "anno_path='/cache/data/coco/test_Person+Face-coco-20190118.txt'" + # 在 default_config.yaml 文件中设置 其他参数 + # b. 在网页上设置 "enable_modelarts=True" + # 在网页上设置 "need_modelarts_dataset_unzip=True" + # 在网页上设置 "modelarts_dataset_unzip_name='coco'" + # 在网页上设置 "checkpoint_url='s3://dir_to_your_trained_model/'" + # 在网页上设置 "ckpt_path='/cache/checkpoint_path/yolov3-160_156.ckpt'" + # 在网页上设置 "eval_mindrecord_dir='/cache/data/coco/Mindrecord_eval'" + # 在网页上设置 "image_dir='/cache/data'" + # 在网页上设置 "anno_path='/cache/data/coco/test_Person+Face-coco-20190118.txt'" + # 在网页上设置 其他参数 + # (3) 上传你训练好的模型到 S3 桶上 + # (4) 执行a或者b (推荐选择 a) + # a. 第一, 根据以下方式在本地运行 "eval.py" 脚本来生成 MindRecord 格式的数据集。 + # "python eval.py --only_create_dataset=True --eval_mindrecord_dir=$EVAL_MINDRECORD_DIR --image_dir=$EVAL_IMAGE_DIR --anno_path=$EVAL_ANNO_PATH" + # 第二, 将该数据集压缩为一个 ".zip" 文件。 + # 最后, 上传你的压缩数据集到 S3 桶上 (你也可以上传未压缩的数据集,但那可能会很慢。) + # b. 
上传原始 coco 数据集到 S3 桶上。 + # (数据集转换发生在训练过程中,需要花费较多的时间。每次训练的时候都会重新进行转换。) + # (5) 在网页上设置你的代码路径为 "/path/yolov3_resnet18" + # (6) 在网页上设置启动文件为 "eval.py" + # (7) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等 + # (8) 创建验证作业 + ``` + # 脚本说明 ## 脚本及样例代码 @@ -102,9 +217,16 @@ YOLOv3整体网络架构如下: ```text └── cv ├── README.md // 所有模型相关说明 + ├── README_CN.md // 所有模型相关中文说明 ├── mindspore_hub_conf.md // Mindspore Hub配置 └── yolov3_resnet18 ├── README.md // yolov3_resnet18相关说明 + ├── model_utils + ├── __init__.py // 初始化文件 + ├── config.py // 参数配置 + ├── device_adapter.py // ModelArts的设备适配器 + ├── local_adapter.py // 本地适配器 + └── moxing_adapter.py // ModelArts的模型适配器 ├── scripts ├── run_distribute_train.sh // Ascend上分布式shell脚本 ├── run_standalone_train.sh // Ascend上分布式shell脚本 @@ -112,10 +234,14 @@ YOLOv3整体网络架构如下: ├── src ├── dataset.py // 创建数据集 ├── yolov3.py // yolov3架构 - ├── config.py // 参数配置 + ├── config.py // 网络结构的默认参数配置 └── utils.py // 工具函数 - ├── train.py // 训练脚本 - └── eval.py // 评估脚本 + ├── default_config.yaml // 参数配置 + ├── eval.py // 验证脚本 + ├── export.py // 导出脚本 + ├── mindspore_hub_conf.py // hub配置 + ├── postprocess.py // 后处理脚本 + └── train.py // 训练脚本 ``` ## 脚本参数 diff --git a/model_zoo/official/cv/yolov3_resnet18/default_config.yaml b/model_zoo/official/cv/yolov3_resnet18/default_config.yaml new file mode 100644 index 00000000000..c89d0d69f1e --- /dev/null +++ b/model_zoo/official/cv/yolov3_resnet18/default_config.yaml @@ -0,0 +1,56 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# Url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# Path for local +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path" +device_target: "Ascend" +need_modelarts_dataset_unzip: True +modelarts_dataset_unzip_name: "coco" + +# ============================================================================== +# Train options +only_create_dataset: False +distribute: False +lr: 0.001 +mode: "sink" 
+epoch_size: 50 +batch_size: 32 +pre_trained: "" +pre_trained_epoch_size: 0 +save_checkpoint_epochs: 5 +save_checkpoint_dir: "./" +loss_scale: 1024 +mindrecord_dir: "./Mindrecord_train" +image_dir: "" +anno_path: "" + +# Eval options +eval_mindrecord_dir: "./Mindrecord_eval" +ckpt_path: "" + +--- + +# Help description for each configuration +# Train options +only_create_dataset: "If set to true, only create Mindrecord." +distribute: "Run distribute" +lr: "Learning rate" +mode: "Run sink mode or not" +epoch_size: "Epoch size" +batch_size: "Batch size" +pre_trained: "Pretrained checkpoint file path" +pre_trained_epoch_size: "Pretrained epoch size" +save_checkpoint_epochs: "Save checkpoint epochs" +loss_scale: "Loss scale" +mindrecord_dir: "Mindrecord directory. If the mindrecord_dir is empty, it will generate mindrecord file by image_dir and anno_path. Note if mindrecord_dir isn't empty, it will use mindrecord_dir rather than image_dir and anno_path." +image_dir: "Dataset directory, the absolute image path is joined by the image_dir and the relative path in anno_path" +anno_path: "Annotation path." + +# Eval options +eval_mindrecord_dir: "Mindrecord directory for eval." +ckpt_path: "Checkpoint path." 
def modelarts_pre_process():
    """Unpack the dataset archive once per server before evaluation.

    When ``default_config.need_modelarts_dataset_unzip`` is set, the device
    whose id is a multiple of ``min(device_num, 8)`` extracts
    ``<data_path>/<modelarts_dataset_unzip_name>.zip`` into ``data_path`` and
    then creates the lock file ``/tmp/unzip_sync.lock``. Every device polls for
    that lock file before returning, so no device proceeds until the
    extraction attempt has finished.
    """
    def unzip(zip_file, save_dir):
        """Extract ``zip_file`` into ``save_dir`` with coarse progress output.

        Nothing is extracted when the target dataset directory already exists
        or when ``zip_file`` is not a valid zip archive.
        """
        import zipfile
        s_time = time.time()
        if os.path.exists(os.path.join(save_dir, default_config.modelarts_dataset_unzip_name)):
            print("Zip has been extracted.")
            return
        if not zipfile.is_zipfile(zip_file):
            print("This is not zip.")
            return

        # The context manager guarantees the archive handle is closed even if
        # extraction raises part-way through.
        with zipfile.ZipFile(zip_file, 'r') as fz:
            members = fz.namelist()
            data_num = len(members)
            print("Extract Start...")
            print("unzip file num: {}".format(data_num))
            # Report progress roughly once per percent for large archives.
            data_print = int(data_num / 100) if data_num > 100 else 1
            for i, file in enumerate(members):
                if i % data_print == 0:
                    print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
                fz.extract(file, save_dir)
        print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
                                             int(int(time.time() - s_time) % 60)))
        print("Extract Done.")

    if default_config.need_modelarts_dataset_unzip:
        zip_file_1 = os.path.join(default_config.data_path, default_config.modelarts_dataset_unzip_name + ".zip")
        save_dir_1 = os.path.join(default_config.data_path)

        sync_lock = "/tmp/unzip_sync.lock"

        # Each server contains 8 devices at most, so one device per server does the work.
        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
            print("Zip file path: ", zip_file_1)
            print("Unzip file save dir: ", save_dir_1)
            unzip(zip_file_1, save_dir_1)
            print("===Finish extract data synchronization===")
            try:
                os.mknod(sync_lock)
            except IOError:
                # Best effort: the lock may already have been created by another process.
                pass

        # All devices wait here until the lock file appears.
        while not os.path.exists(sync_lock):
            time.sleep(1)

        print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))


@moxing_wrapper(pre_process=modelarts_pre_process)
def run_eval():
    """Build the evaluation MindRecord if it is missing, then evaluate the checkpoint.

    All settings are read from ``default_config``. Evaluation itself is skipped
    when ``only_create_dataset`` is true.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=get_device_id())

    # It will generate mindrecord file in default_config.eval_mindrecord_dir,
    # and the file name is yolo.mindrecord0, 1, ... file_num.
    # exist_ok avoids a check-then-create race between processes.
    os.makedirs(default_config.eval_mindrecord_dir, exist_ok=True)

    yolo_prefix = "yolo.mindrecord"
    mindrecord_file = os.path.join(default_config.eval_mindrecord_dir, yolo_prefix + "0")
    if not os.path.exists(mindrecord_file):
        if os.path.isdir(default_config.image_dir) and os.path.exists(default_config.anno_path):
            print("Create Mindrecord")
            data_to_mindrecord_byte_image(default_config.image_dir,
                                          default_config.anno_path,
                                          default_config.eval_mindrecord_dir,
                                          prefix=yolo_prefix,
                                          file_num=8)
            print("Create Mindrecord Done, at {}".format(default_config.eval_mindrecord_dir))
        else:
            print("image_dir or anno_path not exits")

    if not default_config.only_create_dataset:
        print("Start Eval!")
        yolo_eval(mindrecord_file, default_config.ckpt_path)


if __name__ == '__main__':
    run_eval()
class Config:
    """
    Attribute-style view of a configuration dictionary.

    Nested dictionaries become nested ``Config`` objects; lists and tuples are
    stored as lists whose dictionary entries are converted the same way.
    """
    def __init__(self, cfg_dict):
        def wrap(node):
            # Only dictionaries are converted; every other value is stored as-is.
            return Config(node) if isinstance(node, dict) else node

        for key, value in cfg_dict.items():
            if isinstance(value, (list, tuple)):
                converted = [wrap(entry) for entry in value]
            else:
                converted = wrap(value)
            setattr(self, key, converted)

    def __str__(self):
        return pformat(self.__dict__)

    def __repr__(self):
        return self.__str__()


def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
    """
    Expose every scalar entry of the base configuration as a command line option.

    Args:
        parser: Parent parser whose arguments are inherited.
        cfg: Base configuration; list and dict entries are not exposed on the CLI.
        helper: Mapping from option name to help text.
        choices: Mapping from option name to the allowed values.
        cfg_path: Path to the default yaml config, quoted in the fallback help text.

    Returns:
        The namespace produced by parsing the process command line.
    """
    cli_parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                         parents=[parser])
    descriptions = helper if helper is not None else {}
    allowed = choices if choices is not None else {}
    fallback_help = "Please reference to {}".format(cfg_path)
    for name, default in cfg.items():
        if isinstance(default, (list, dict)):
            continue
        # bool("False") is truthy, so boolean options are parsed as Python literals.
        converter = ast.literal_eval if isinstance(default, bool) else type(default)
        cli_parser.add_argument("--" + name, type=converter, default=default,
                                choices=allowed.get(name),
                                help=descriptions.get(name, fallback_help))
    return cli_parser.parse_args()
def parse_yaml(yaml_path):
    """
    Parse the yaml config file.

    The file may contain one to three YAML documents, in this order: the
    configuration itself, help descriptions, and allowed choices.

    Args:
        yaml_path: Path to the yaml config.

    Returns:
        Tuple ``(cfg, cfg_helper, cfg_choices)``. Documents that are absent
        from the file are returned as empty dictionaries.

    Raises:
        ValueError: If the YAML is malformed or the file does not contain
            one to three documents.
    """
    with open(yaml_path, 'r') as fin:
        try:
            # NOTE(review): config_path can be supplied on the command line;
            # yaml.safe_load_all is the stricter choice if config files may
            # come from untrusted sources.
            cfgs = list(yaml.load_all(fin.read(), Loader=yaml.FullLoader))
        except yaml.YAMLError as err:
            # Only parser errors are translated; the cause stays attached for debugging.
            raise ValueError("Failed to parse yaml") from err

    if len(cfgs) == 1:
        cfg = cfgs[0]
        cfg_helper = {}
        cfg_choices = {}
    elif len(cfgs) == 2:
        cfg, cfg_helper = cfgs
        cfg_choices = {}
    elif len(cfgs) == 3:
        cfg, cfg_helper, cfg_choices = cfgs
    else:
        raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")
    return cfg, cfg_helper, cfg_choices


def merge(args, cfg):
    """
    Merge the base config from yaml file and command line arguments.

    Command line values overwrite entries of the same name. ``cfg`` is updated
    in place and also returned.

    Args:
        args: Command line arguments.
        cfg: Base configuration.

    Returns:
        The updated ``cfg`` dictionary.
    """
    cfg.update(vars(args))
    return cfg


def get_config():
    """
    Get Config according to the yaml file and cli arguments.

    ``--config_path`` is read first so the yaml file can define the remaining
    command line options; the parsed command line then overrides yaml values.

    Returns:
        A ``Config`` object holding the merged settings.
    """
    parser = argparse.ArgumentParser(description="default name", add_help=False)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../default_config.yaml"),
                        help="Config file path")
    # parse_known_args ignores the options that are only registered after the yaml is read.
    path_args, _ = parser.parse_known_args()
    default, helper, choices = parse_yaml(path_args.config_path)
    args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path)
    final_config = merge(args, default)
    return Config(final_config)

# Module-level singleton: the configuration is resolved once, at first import.
config = get_config()
from .config import config

# Pick the helper implementation once, at import time: the moxing adapter when
# the configuration enables ModelArts, the local adapter otherwise.
if config.enable_modelarts:
    from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
    from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id

# The same four names are re-exported whichever adapter was selected.
__all__ = [
    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]
def get_device_id():
    """Return the ``DEVICE_ID`` environment variable as an int (0 when it is unset)."""
    return int(os.environ.get('DEVICE_ID', '0'))


def get_device_num():
    """Return the ``RANK_SIZE`` environment variable as an int (1 when it is unset)."""
    return int(os.environ.get('RANK_SIZE', '1'))


def get_rank_id():
    """Return the ``RANK_ID`` environment variable as an int (0 when it is unset)."""
    return int(os.environ.get('RANK_ID', '0'))


def get_job_id():
    """Return the constant job name used by the local adapter."""
    return "Local Job"
# Number of sync_data calls made by this process; gives every call its own lock file.
_global_sync_count = 0


def get_device_id():
    """Return the ``DEVICE_ID`` environment variable as an int (0 when it is unset)."""
    device_id = os.getenv('DEVICE_ID', '0')
    return int(device_id)


def get_device_num():
    """Return the ``RANK_SIZE`` environment variable as an int (1 when it is unset)."""
    device_num = os.getenv('RANK_SIZE', '1')
    return int(device_num)


def get_rank_id():
    """Return the ``RANK_ID`` environment variable as an int (0 when it is unset)."""
    global_rank_id = os.getenv('RANK_ID', '0')
    return int(global_rank_id)


def get_job_id():
    """
    Return the job id taken from the ``JOB_ID`` environment variable.

    Returns:
        str, the value of ``JOB_ID``, or "default" when the variable is unset or empty.
    """
    # os.getenv yields None for an unset variable, which a plain `!= ""` test lets through;
    # fall back to "default" for both the unset and the empty case.
    job_id = os.getenv('JOB_ID', '')
    return job_id if job_id else "default"


def sync_data(from_path, to_path):
    """
    Copy a directory with moxing and block until the copy has been flagged as finished.

    Download data from remote obs to local directory if the first url is remote url and the second one is local path
    Upload data from local directory to remote obs in contrast.

    Args:
        from_path: Source path passed to ``mox.file.copy_parallel``.
        to_path: Destination path passed to ``mox.file.copy_parallel``.
    """
    import moxing as mox
    import time
    global _global_sync_count
    # One lock file per call, numbered by how many syncs this process has already started.
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices at most.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            os.mknod(sync_lock)
        except IOError:
            # NOTE(review): presumably tolerates the lock already existing; if the lock can never
            # be created, the wait loop below will not terminate - confirm this is acceptable.
            pass
        print("===save flag===")

    # Every device (including the one that copied) waits for the lock file to appear.
    while not os.path.exists(sync_lock):
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))


def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.

    Args:
        pre_process: Optional callable taking no arguments, run after the downloads and before
            the wrapped function when ``config.enable_modelarts`` is set.
        post_process: Optional callable taking no arguments, run after the wrapped function and
            before the upload when ``config.enable_modelarts`` is set.

    Returns:
        A decorator. The decorated function returns whatever the wrapped function returns.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                # exist_ok avoids a FileExistsError when several devices reach this line together.
                os.makedirs(config.output_path, exist_ok=True)

                if pre_process:
                    pre_process()

            # Run the main function and keep its result for the caller
            result = run_func(*args, **kwargs)

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
            return result
        return wrapped_func
    return wrapper
/opt/yolov3-150.ckpt(optional) 100(optional)" +echo "For example: sh run_distribute_train.sh 8 160 /data/Mindrecord_train /data /data/train.txt /data/hccl.json /opt/yolov3-150.ckpt(optional) 100(optional)" echo "It is better to use absolute path." echo "The learning rate is 0.005 as default, if you want other lr, please change the value in this script." echo "=======================================================================================================================================================" @@ -63,7 +63,9 @@ do rm -rf LOG$i mkdir ./LOG$i cp *.py ./LOG$i + cp *.yaml ./LOG$i cp -r ./src ./LOG$i + cp -r ./model_utils ./LOG$i cd ./LOG$i || exit export RANK_ID=$i echo "start training for rank $i, device $DEVICE_ID" @@ -74,8 +76,6 @@ do taskset -c $cmdopt python train.py \ --distribute=True \ --lr=0.005 \ - --device_num=$RANK_SIZE \ - --device_id=$DEVICE_ID \ --mindrecord_dir=$MINDRECORD_DIR \ --image_dir=$IMAGE_DIR \ --epoch_size=$EPOCH_SIZE \ @@ -87,8 +87,6 @@ do taskset -c $cmdopt python train.py \ --distribute=True \ --lr=0.005 \ - --device_num=$RANK_SIZE \ - --device_id=$DEVICE_ID \ --mindrecord_dir=$MINDRECORD_DIR \ --image_dir=$IMAGE_DIR \ --epoch_size=$EPOCH_SIZE \ diff --git a/model_zoo/official/cv/yolov3_resnet18/scripts/run_eval.sh b/model_zoo/official/cv/yolov3_resnet18/scripts/run_eval.sh index 8bb054289c0..e0ccd093497 100644 --- a/model_zoo/official/cv/yolov3_resnet18/scripts/run_eval.sh +++ b/model_zoo/official/cv/yolov3_resnet18/scripts/run_eval.sh @@ -23,4 +23,8 @@ echo "========================================================================== BASE_PATH=$(cd "`dirname $0`" || exit; pwd) cd $BASE_PATH/../ || exit -python eval.py --device_id=$1 --ckpt_path=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5 +export RANK_SIZE=1 +export DEVICE_ID=$1 +export RANK_ID=$1 + +python eval.py --ckpt_path=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5 diff --git a/model_zoo/official/cv/yolov3_resnet18/scripts/run_standalone_train.sh 
b/model_zoo/official/cv/yolov3_resnet18/scripts/run_standalone_train.sh index 5e3a234e257..c6aeb5db34a 100644 --- a/model_zoo/official/cv/yolov3_resnet18/scripts/run_standalone_train.sh +++ b/model_zoo/official/cv/yolov3_resnet18/scripts/run_standalone_train.sh @@ -17,7 +17,7 @@ echo "=========================================================================================================================================" echo "Please run the script as: " echo "sh run_standalone_train.sh DEVICE_ID EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE" -echo "for example: sh run_standalone_train.sh 0 50 ./Mindrecord_train ./dataset ./dataset/train.txt /opt/yolov3-50.ckpt(optional) 30(optional)" +echo "for example: sh run_standalone_train.sh 0 60 ./Mindrecord_train ./dataset ./dataset/train.txt /opt/yolov3-50.ckpt(optional) 30(optional)" echo "=========================================================================================================================================" if [ $# != 5 ] && [ $# != 7 ] @@ -30,12 +30,16 @@ fi BASE_PATH=$(cd "`dirname $0`" || exit; pwd) cd $BASE_PATH/../ || exit +export RANK_SIZE=1 +export DEVICE_ID=$1 +export RANK_ID=$1 + if [ $# == 5 ] then - python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5 + python train.py --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5 fi if [ $# == 7 ] then - python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5 --pre_trained=$6 --pre_trained_epoch_size=$7 + python train.py --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5 --pre_trained=$6 --pre_trained_epoch_size=$7 fi diff --git a/model_zoo/official/cv/yolov3_resnet18/train.py b/model_zoo/official/cv/yolov3_resnet18/train.py index b590aac08ae..4619a3e33c3 100644 --- a/model_zoo/official/cv/yolov3_resnet18/train.py +++ b/model_zoo/official/cv/yolov3_resnet18/train.py @@ -23,8 +23,7 @@ Note if mindrecord_dir 
isn't empty, it will use mindrecord_dir rather than image """ import os -import argparse -import ast +import time import numpy as np import mindspore.nn as nn from mindspore import context, Tensor @@ -40,6 +39,10 @@ from src.yolov3 import yolov3_resnet18, YoloWithLossCell, TrainingWrapper from src.dataset import create_yolo_dataset, data_to_mindrecord_byte_image from src.config import ConfigYOLOV3ResNet18 +from model_utils.config import config as default_config +from model_utils.moxing_adapter import moxing_wrapper +from model_utils.device_adapter import get_device_id, get_rank_id, get_device_num + set_seed(1) def get_lr(learning_rate, start_step, global_step, decay_step, decay_rate, steps=False): @@ -63,71 +66,99 @@ def init_net_param(network, init_value='ones'): p.set_data(initializer(init_value, p.data.shape, p.data.dtype)) -def main(): - parser = argparse.ArgumentParser(description="YOLOv3 train") - parser.add_argument("--only_create_dataset", type=ast.literal_eval, default=False, - help="If set it true, only create Mindrecord, default is False.") - parser.add_argument("--distribute", type=ast.literal_eval, default=False, help="Run distribute, default is False.") - parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.") - parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.") - parser.add_argument("--lr", type=float, default=0.001, help="Learning rate, default is 0.001.") - parser.add_argument("--mode", type=str, default="sink", help="Run sink mode or not, default is sink") - parser.add_argument("--epoch_size", type=int, default=50, help="Epoch size, default is 50") - parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.") - parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained checkpoint file path") - parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size") - 
def modelarts_pre_process():
    '''
    modelarts pre process function.

    When ``default_config.need_modelarts_dataset_unzip`` is set, extract
    ``<data_path>/<modelarts_dataset_unzip_name>.zip`` into ``data_path`` on one device and make
    every device wait on a lock file until that is done. Always rebase
    ``default_config.save_checkpoint_dir`` under ``default_config.output_path``.
    '''
    def unzip(zip_file, save_dir):
        """Extract zip_file into save_dir, printing progress, unless the target folder exists."""
        import zipfile
        s_time = time.time()
        if os.path.exists(os.path.join(save_dir, default_config.modelarts_dataset_unzip_name)):
            print("Zip has been extracted.")
            return
        if not zipfile.is_zipfile(zip_file):
            print("This is not zip.")
            return

        # The context manager closes the archive handle even if an extract call fails.
        with zipfile.ZipFile(zip_file, 'r') as fz:
            members = fz.namelist()
            data_num = len(members)
            print("Extract Start...")
            print("unzip file num: {}".format(data_num))
            # Report progress roughly once per percent; every file for small archives.
            data_print = max(data_num // 100, 1)
            for i, member in enumerate(members):
                if i % data_print == 0:
                    print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
                fz.extract(member, save_dir)
        elapsed = time.time() - s_time
        print("cost time: {}min:{}s.".format(int(elapsed / 60), int(int(elapsed) % 60)))
        print("Extract Done.")

    if default_config.need_modelarts_dataset_unzip:
        zip_file_1 = os.path.join(default_config.data_path, default_config.modelarts_dataset_unzip_name + ".zip")
        save_dir_1 = os.path.join(default_config.data_path)

        sync_lock = "/tmp/unzip_sync.lock"

        # Each server contains 8 devices at most.
        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
            print("Zip file path: ", zip_file_1)
            print("Unzip file save dir: ", save_dir_1)
            unzip(zip_file_1, save_dir_1)
            print("===Finish extract data synchronization===")
            try:
                os.mknod(sync_lock)
            except IOError:
                # NOTE(review): presumably tolerates the lock already existing - confirm.
                pass

        # Every device waits here until the lock file shows the extraction step has finished.
        while not os.path.exists(sync_lock):
            time.sleep(1)

        print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))

    default_config.save_checkpoint_dir = os.path.join(default_config.output_path, default_config.save_checkpoint_dir)
- if not os.path.isdir(args_opt.mindrecord_dir): - os.makedirs(args_opt.mindrecord_dir) + if not os.path.isdir(default_config.mindrecord_dir): + os.makedirs(default_config.mindrecord_dir) prefix = "yolo.mindrecord" - mindrecord_file = os.path.join(args_opt.mindrecord_dir, prefix + "0") + mindrecord_file = os.path.join(default_config.mindrecord_dir, prefix + "0") if not os.path.exists(mindrecord_file): - if os.path.isdir(args_opt.image_dir) and os.path.exists(args_opt.anno_path): + if os.path.isdir(default_config.image_dir) and os.path.exists(default_config.anno_path): print("Create Mindrecord.") - data_to_mindrecord_byte_image(args_opt.image_dir, - args_opt.anno_path, - args_opt.mindrecord_dir, + data_to_mindrecord_byte_image(default_config.image_dir, + default_config.anno_path, + default_config.mindrecord_dir, prefix, 8) - print("Create Mindrecord Done, at {}".format(args_opt.mindrecord_dir)) + print("Create Mindrecord Done, at {}".format(default_config.mindrecord_dir)) else: - raise ValueError('image_dir {} or anno_path {} does not exist'.format(\ - args_opt.image_dir, args_opt.anno_path)) + raise ValueError('image_dir {} or anno_path {} does not exist'. + format(default_config.image_dir, default_config.anno_path)) - if not args_opt.only_create_dataset: - loss_scale = float(args_opt.loss_scale) + if not default_config.only_create_dataset: + loss_scale = float(default_config.loss_scale) # When create MindDataset, using the fitst mindrecord file, such as yolo.mindrecord0. 
dataset = create_yolo_dataset(mindrecord_file, - batch_size=args_opt.batch_size, device_num=device_num, rank=rank) + batch_size=default_config.batch_size, device_num=device_num, rank=rank) dataset_size = dataset.get_dataset_size() print("Create dataset done!") @@ -136,18 +167,20 @@ def main(): init_net_param(net, "XavierUniform") # checkpoint - ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs) - ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory='./ckpt_' + str(rank) + '/', config=ckpt_config) + ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * default_config.save_checkpoint_epochs) + save_ckpt_dir = os.path.join(default_config.save_checkpoint_dir, 'ckpt_' + str(rank) + '/') + ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory=save_ckpt_dir, config=ckpt_config) - if args_opt.pre_trained: - if args_opt.pre_trained_epoch_size <= 0: + if default_config.pre_trained: + if default_config.pre_trained_epoch_size <= 0: raise KeyError("pre_trained_epoch_size must be greater than 0.") - param_dict = load_checkpoint(args_opt.pre_trained) + param_dict = load_checkpoint(default_config.pre_trained) load_param_into_net(net, param_dict) total_epoch_size = 60 - if args_opt.distribute: + if default_config.distribute: total_epoch_size = 160 - lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=args_opt.pre_trained_epoch_size * dataset_size, + lr = Tensor(get_lr(learning_rate=default_config.lr, + start_step=default_config.pre_trained_epoch_size * dataset_size, global_step=total_epoch_size * dataset_size, decay_step=1000, decay_rate=0.95, steps=True)) opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale) @@ -157,11 +190,11 @@ def main(): model = Model(net) dataset_sink_mode = False - if args_opt.mode == "sink": + if default_config.mode == "sink": print("In sink mode, one epoch return a loss.") dataset_sink_mode = True print("Start train YOLOv3, the first epoch 
will be slower because of the graph compilation.") - model.train(args_opt.epoch_size, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode) + model.train(default_config.epoch_size, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode) if __name__ == '__main__': - main() + run_train()