forked from mindspore-Ecosystem/mindspore
!16225 modify YOLOv3_darknet53 for clould
From: @zhanghuiyao Reviewed-by: @c_34,@oacjiewen Signed-off-by: @c_34
This commit is contained in:
commit
8fbe61a4c3
|
@ -101,41 +101,93 @@ Dataset used: [COCO2014](https://cocodataset.org/#download)
|
|||
python hccl_tools.py --device_num "[0,8)"
|
||||
```
|
||||
|
||||
```network
|
||||
# The parameter of training_shape define image shape for network, default is "".
|
||||
# The default "" means 10 different shapes are used as input shapes; a single fixed shape can also be set.
|
||||
# run training example(1p) by python command.
|
||||
python train.py \
|
||||
--data_dir=./dataset/coco2014 \
|
||||
--pretrained_backbone=darknet53_backbone.ckpt \
|
||||
--is_distributed=0 \
|
||||
--lr=0.001 \
|
||||
--loss_scale=1024 \
|
||||
--weight_decay=0.016 \
|
||||
--T_max=320 \
|
||||
--max_epoch=320 \
|
||||
--warmup_epochs=4 \
|
||||
--training_shape=416 \
|
||||
--lr_scheduler=cosine_annealing > log.txt 2>&1 &
|
||||
- Train on local
|
||||
|
||||
# standalone training example(1p) by shell script
|
||||
bash run_standalone_train.sh dataset/coco2014 darknet53_backbone.ckpt
|
||||
```network
|
||||
# The parameter of training_shape define image shape for network, default is "".
|
||||
# The default "" means 10 different shapes are used as input shapes; a single fixed shape can also be set.
|
||||
# run training example(1p) by python command.
|
||||
python train.py \
|
||||
--data_dir=./dataset/coco2014 \
|
||||
--pretrained_backbone=darknet53_backbone.ckpt \
|
||||
--is_distributed=0 \
|
||||
--lr=0.001 \
|
||||
--loss_scale=1024 \
|
||||
--weight_decay=0.016 \
|
||||
--T_max=320 \
|
||||
--max_epoch=320 \
|
||||
--warmup_epochs=4 \
|
||||
--training_shape=416 \
|
||||
--lr_scheduler=cosine_annealing > log.txt 2>&1 &
|
||||
|
||||
# For Ascend device, distributed training example(8p) by shell script
|
||||
bash run_distribute_train.sh dataset/coco2014 darknet53_backbone.ckpt rank_table_8p.json
|
||||
# standalone training example(1p) by shell script
|
||||
bash run_standalone_train.sh dataset/coco2014 darknet53_backbone.ckpt
|
||||
|
||||
# For GPU device, distributed training example(8p) by shell script
|
||||
bash run_distribute_train_gpu.sh dataset/coco2014 darknet53_backbone.ckpt
|
||||
# For Ascend device, distributed training example(8p) by shell script
|
||||
bash run_distribute_train.sh dataset/coco2014 darknet53_backbone.ckpt rank_table_8p.json
|
||||
|
||||
# run evaluation by python command
|
||||
python eval.py \
|
||||
--data_dir=./dataset/coco2014 \
|
||||
--pretrained=yolov3.ckpt \
|
||||
--testing_shape=416 > log.txt 2>&1 &
|
||||
# For GPU device, distributed training example(8p) by shell script
|
||||
bash run_distribute_train_gpu.sh dataset/coco2014 darknet53_backbone.ckpt
|
||||
|
||||
# run evaluation by shell script
|
||||
bash run_eval.sh dataset/coco2014/ checkpoint/0-319_102400.ckpt
|
||||
```
|
||||
# run evaluation by python command
|
||||
python eval.py \
|
||||
--data_dir=./dataset/coco2014 \
|
||||
--pretrained=yolov3.ckpt \
|
||||
--testing_shape=416 > log.txt 2>&1 &
|
||||
|
||||
# run evaluation by shell script
|
||||
bash run_eval.sh dataset/coco2014/ checkpoint/0-319_102400.ckpt
|
||||
```
|
||||
|
||||
- Train on [ModelArts](https://support.huaweicloud.com/modelarts/)
|
||||
|
||||
```python
|
||||
# Train 8p with Ascend
|
||||
# (1) Perform a or b.
|
||||
# a. Set "enable_modelarts=True" on base_config.yaml file.
|
||||
# Set "data_dir='/cache/data/coco2014/'" on base_config.yaml file.
|
||||
# Set "checkpoint_url='s3://dir_to_your_pretrain/'" on base_config.yaml file.
|
||||
# Set "pretrained_backbone='/cache/checkpoint_path/0-148_92000.ckpt'" on base_config.yaml file.
|
||||
# Set "weight_decay=0.016" on base_config.yaml file.
|
||||
# Set "warmup_epochs=4" on base_config.yaml file.
|
||||
# Set "lr_scheduler='cosine_annealing'" on base_config.yaml file.
|
||||
# Set other parameters on base_config.yaml file you need.
|
||||
# b. Add "enable_modelarts=True" on the website UI interface.
|
||||
# Add "data_dir=/cache/data/coco2014/" on the website UI interface.
|
||||
# Add "checkpoint_url=s3://dir_to_your_pretrain/" on the website UI interface.
|
||||
# Add "pretrained_backbone=/cache/checkpoint_path/0-148_92000.ckpt" on the website UI interface.
|
||||
# Add "weight_decay=0.016" on the website UI interface.
|
||||
# Add "warmup_epochs=4" on the website UI interface.
|
||||
# Add "lr_scheduler=cosine_annealing" on the website UI interface.
|
||||
# Add other parameters on the website UI interface.
|
||||
# (2) Upload or copy your pretrained model to S3 bucket.
|
||||
# (3) Upload a zip dataset to S3 bucket. (You could also upload the original dataset, but that can be very slow.)
|
||||
# (4) Set the code directory to "/path/yolov3_darknet53" on the website UI interface.
|
||||
# (5) Set the startup file to "train.py" on the website UI interface.
|
||||
# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
|
||||
# (7) Create your job.
|
||||
#
|
||||
# Eval with Ascend
|
||||
# (1) Perform a or b.
|
||||
# a. Set "enable_modelarts=True" on base_config.yaml file.
|
||||
# Set "data_dir='/cache/data/coco2014/'" on base_config.yaml file.
|
||||
# Set "checkpoint_url='s3://dir_to_your_trained_ckpt/'" on base_config.yaml file.
|
||||
# Set "pretrained='/cache/checkpoint_path/0-320_102400.ckpt'" on base_config.yaml file.
|
||||
# Set "testing_shape=416" on base_config.yaml file.
|
||||
# Set other parameters on base_config.yaml file you need.
|
||||
# b. Add "enable_modelarts=True" on the website UI interface.
|
||||
# Add "data_dir=/cache/data/coco2014/" on the website UI interface.
|
||||
# Add "checkpoint_url=s3://dir_to_your_trained_ckpt/" on the website UI interface.
|
||||
# Add "pretrained=/cache/checkpoint_path/0-320_102400.ckpt" on the website UI interface.
|
||||
# Add "testing_shape=416" on the website UI interface.
|
||||
# Add other parameters on the website UI interface.
|
||||
# (2) Upload or copy your trained model to S3 bucket.
|
||||
# (3) Upload a zip dataset to S3 bucket. (You could also upload the original dataset, but that can be very slow.)
|
||||
# (4) Set the code directory to "/path/yolov3_darknet53" on the website UI interface.
|
||||
# (5) Set the startup file to "eval.py" on the website UI interface.
|
||||
# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
|
||||
# (7) Create your job.
|
||||
```
|
||||
|
||||
## [Script Description](#contents)
|
||||
|
||||
|
|
|
@ -104,49 +104,91 @@ YOLOv3使用DarkNet53执行特征提取,这是YOLOv2中的Darknet-19和残差
|
|||
python hccl_tools.py --device_num "[0,8)"
|
||||
```
|
||||
|
||||
```python
|
||||
# training_shape参数定义网络图像形状,默认为""。
|
||||
# 意思是使用10种形状作为输入形状,或者可以设置某种形状。
|
||||
# 通过python命令执行训练示例(1卡)。
|
||||
python train.py \
|
||||
--data_dir=./dataset/coco2014 \
|
||||
--pretrained_backbone=darknet53_backbone.ckpt \
|
||||
--is_distributed=0 \
|
||||
--lr=0.1 \
|
||||
--T_max=320 \
|
||||
--max_epoch=320 \
|
||||
--warmup_epochs=4 \
|
||||
--training_shape=416 \
|
||||
--lr_scheduler=cosine_annealing > log.txt 2>&1 &
|
||||
```
|
||||
- 在本地进行训练
|
||||
|
||||
```shell script
|
||||
# shell脚本单机训练示例(1卡)
|
||||
bash run_standalone_train.sh dataset/coco2014 darknet53_backbone.ckpt
|
||||
```
|
||||
```shell
|
||||
# training_shape参数定义网络图像形状,默认为""。
|
||||
# 意思是使用10种形状作为输入形状,或者可以设置某种形状。
|
||||
# 通过python命令执行训练示例(1卡)。
|
||||
python train.py \
|
||||
--data_dir=./dataset/coco2014 \
|
||||
--pretrained_backbone=darknet53_backbone.ckpt \
|
||||
--is_distributed=0 \
|
||||
--lr=0.1 \
|
||||
--T_max=320 \
|
||||
--max_epoch=320 \
|
||||
--warmup_epochs=4 \
|
||||
--training_shape=416 \
|
||||
--lr_scheduler=cosine_annealing > log.txt 2>&1 &
|
||||
|
||||
```shell script
|
||||
# 对于Ascend设备,使用shell脚本分布式训练示例(8卡)
|
||||
bash run_distribute_train.sh dataset/coco2014 darknet53_backbone.ckpt rank_table_8p.json
|
||||
```
|
||||
# shell脚本单机训练示例(1卡)
|
||||
bash run_standalone_train.sh dataset/coco2014 darknet53_backbone.ckpt
|
||||
|
||||
```shell script
|
||||
# 对于GPU设备,使用shell脚本分布式训练示例(8卡)
|
||||
bash run_distribute_train_gpu.sh dataset/coco2014 darknet53_backbone.ckpt
|
||||
```
|
||||
# 对于Ascend设备,使用shell脚本分布式训练示例(8卡)
|
||||
bash run_distribute_train.sh dataset/coco2014 darknet53_backbone.ckpt rank_table_8p.json
|
||||
|
||||
```python
|
||||
# 使用python命令评估
|
||||
python eval.py \
|
||||
--data_dir=./dataset/coco2014 \
|
||||
--pretrained=yolov3.ckpt \
|
||||
--testing_shape=416 > log.txt 2>&1 &
|
||||
```
|
||||
# 对于GPU设备,使用shell脚本分布式训练示例(8卡)
|
||||
bash run_distribute_train_gpu.sh dataset/coco2014 darknet53_backbone.ckpt
|
||||
|
||||
```shell script
|
||||
# 通过shell脚本运行评估
|
||||
bash run_eval.sh dataset/coco2014/ checkpoint/0-319_102400.ckpt
|
||||
```
|
||||
# 使用python命令评估
|
||||
python eval.py \
|
||||
--data_dir=./dataset/coco2014 \
|
||||
--pretrained=yolov3.ckpt \
|
||||
--testing_shape=416 > log.txt 2>&1 &
|
||||
|
||||
# 通过shell脚本运行评估
|
||||
bash run_eval.sh dataset/coco2014/ checkpoint/0-319_102400.ckpt
|
||||
```
|
||||
|
||||
- 在 [ModelArts](https://support.huaweicloud.com/modelarts/) 上训练
|
||||
|
||||
```python
|
||||
# 在modelarts上进行8卡训练(Ascend)
|
||||
# (1) 执行a或者b
|
||||
# a. 在 base_config.yaml 文件中配置 "enable_modelarts=True"
|
||||
# 在 base_config.yaml 文件中配置 "data_dir='/cache/data/coco2014/'"
|
||||
# 在 base_config.yaml 文件中配置 "checkpoint_url='s3://dir_to_your_pretrain/'"
|
||||
# 在 base_config.yaml 文件中配置 "pretrained_backbone='/cache/checkpoint_path/0-148_92000.ckpt'"
|
||||
# 在 base_config.yaml 文件中配置 "weight_decay=0.016"
|
||||
# 在 base_config.yaml 文件中配置 "warmup_epochs=4"
|
||||
# 在 base_config.yaml 文件中配置 "lr_scheduler='cosine_annealing'"
|
||||
# 在 base_config.yaml 文件中配置 其他参数
|
||||
# b. 在网页上设置 "enable_modelarts=True"
|
||||
# 在网页上设置 "data_dir=/cache/data/coco2014/"
|
||||
# 在网页上设置 "checkpoint_url=s3://dir_to_your_pretrain/"
|
||||
# 在网页上设置 "pretrained_backbone=/cache/checkpoint_path/0-148_92000.ckpt"
|
||||
# 在网页上设置 "weight_decay=0.016"
|
||||
# 在网页上设置 "warmup_epochs=4"
|
||||
# 在网页上设置 "lr_scheduler=cosine_annealing"
|
||||
# 在网页上设置 其他参数
|
||||
# (2) 上传你的预训练模型到 S3 桶上
|
||||
# (3) 上传你的压缩数据集到 S3 桶上 (你也可以上传原始的数据集,但那可能会很慢。)
|
||||
# (4) 在网页上设置你的代码路径为 "/path/yolov3_darknet53"
|
||||
# (5) 在网页上设置启动文件为 "train.py"
|
||||
# (6) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等
|
||||
# (7) 创建训练作业
|
||||
#
|
||||
# 在modelarts上进行验证(Ascend)
|
||||
# (1) 执行a或者b
|
||||
# a. 在 base_config.yaml 文件中配置 "enable_modelarts=True"
|
||||
# 在 base_config.yaml 文件中配置 "data_dir='/cache/data/coco2014/'"
|
||||
# 在 base_config.yaml 文件中配置 "checkpoint_url='s3://dir_to_your_trained_ckpt/'"
|
||||
# 在 base_config.yaml 文件中配置 "pretrained='/cache/checkpoint_path/0-320_102400.ckpt'"
|
||||
# 在 base_config.yaml 文件中配置 "testing_shape=416"
|
||||
# 在 base_config.yaml 文件中配置 其他参数
|
||||
# b. 在网页上设置 "enable_modelarts=True"
|
||||
# 在网页上设置 "data_dir=/cache/data/coco2014/"
|
||||
# 在网页上设置 "checkpoint_url=s3://dir_to_your_trained_ckpt/"
|
||||
# 在网页上设置 "pretrained=/cache/checkpoint_path/0-320_102400.ckpt"
|
||||
# 在网页上设置 "testing_shape=416"
|
||||
# 在网页上设置 其他参数
|
||||
# (2) 上传你训练好的模型到 S3 桶上
|
||||
# (3) 上传你的压缩数据集到 S3 桶上 (你也可以上传原始的数据集,但那可能会很慢。)
|
||||
# (4) 在网页上设置你的代码路径为 "/path/yolov3_darknet53"
|
||||
# (5) 在网页上设置启动文件为 "eval.py"
|
||||
# (6) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等
|
||||
# (7) 创建训练作业
|
||||
```
|
||||
|
||||
# 脚本说明
|
||||
|
||||
|
|
|
@ -28,9 +28,9 @@
|
|||
#include "include/api/context.h"
|
||||
#include "include/api/types.h"
|
||||
#include "include/api/serialization.h"
|
||||
#include "include/dataset/vision_ascend.h"
|
||||
#include "include/dataset/execute.h"
|
||||
#include "include/dataset/vision.h"
|
||||
#include "include/minddata/dataset/include/vision_ascend.h"
|
||||
#include "include/minddata/dataset/include/execute.h"
|
||||
#include "include/minddata/dataset/include/vision.h"
|
||||
#include "inc/utils.h"
|
||||
|
||||
using mindspore::Context;
|
||||
|
|
|
@ -0,0 +1,168 @@
|
|||
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
|
||||
enable_modelarts: False
|
||||
# Url for modelarts
|
||||
data_url: ""
|
||||
train_url: ""
|
||||
checkpoint_url: ""
|
||||
# Path for local
|
||||
data_path: "/cache/data"
|
||||
output_path: "/cache/train"
|
||||
load_path: "/cache/checkpoint_path"
|
||||
device_target: "Ascend" # ['Ascend', 'GPU']
|
||||
need_modelarts_dataset_unzip: True
|
||||
modelarts_dataset_unzip_name: "coco2014"
|
||||
|
||||
# ==============================================================================
|
||||
# Training options
|
||||
|
||||
# dataset related
|
||||
data_dir: "/cache/data/coco2014/"
|
||||
per_batch_size: 32
|
||||
|
||||
# network related
|
||||
pretrained_backbone: "/cache/checkpoint_path/0-148_92000.ckpt"
|
||||
resume_yolov3: ""
|
||||
|
||||
# optimizer and lr related
|
||||
lr_scheduler: "exponential"
|
||||
lr: 0.001
|
||||
lr_epochs: "220,250"
|
||||
lr_gamma: 0.1
|
||||
eta_min: 0.0
|
||||
T_max: 320
|
||||
max_epoch: 320
|
||||
warmup_epochs: 0
|
||||
weight_decay: 0.0005
|
||||
momentum: 0.9
|
||||
|
||||
# loss related
|
||||
loss_scale: 1024
|
||||
label_smooth: 0
|
||||
label_smooth_factor: 0.1
|
||||
|
||||
# logging related
|
||||
log_interval: 100
|
||||
ckpt_path: "outputs/"
|
||||
ckpt_interval: -1
|
||||
is_save_on_master: 1
|
||||
|
||||
# distributed related
|
||||
is_distributed: 1
|
||||
rank: 0
|
||||
group_size: 1
|
||||
|
||||
# profiler init
|
||||
need_profiler: 0
|
||||
|
||||
# reset default config
|
||||
training_shape: ""
|
||||
|
||||
|
||||
# Eval option
|
||||
pretrained: ""
|
||||
log_path: "outputs/"
|
||||
nms_thresh: 0.5
|
||||
annFile: ""
|
||||
testing_shape: ""
|
||||
eval_ignore_threshold: 0.001
|
||||
|
||||
|
||||
# Export option
|
||||
device_id: 0
|
||||
batch_size: 1
|
||||
ckpt_file: ""
|
||||
file_name: "yolov3_darknet53"
|
||||
file_format: "AIR" # ["AIR", "ONNX", "MINDIR"]
|
||||
|
||||
|
||||
# Other default config
|
||||
hue: 0.1
|
||||
saturation: 1.5
|
||||
value: 1.5
|
||||
jitter: 0.3
|
||||
|
||||
resize_rate: 1
|
||||
multi_scale: [[320, 320],
|
||||
[352, 352],
|
||||
[384, 384],
|
||||
[416, 416],
|
||||
[448, 448],
|
||||
[480, 480],
|
||||
[512, 512],
|
||||
[544, 544],
|
||||
[576, 576],
|
||||
[608, 608]
|
||||
]
|
||||
|
||||
num_classes: 80
|
||||
out_channel: 255 #3 * (num_classes + 5)
|
||||
max_box: 50
|
||||
|
||||
backbone_input_shape: [32, 64, 128, 256, 512]
|
||||
backbone_shape: [64, 128, 256, 512, 1024]
|
||||
backbone_layers: [1, 2, 8, 8, 4]
|
||||
|
||||
# confidence under ignore_threshold means no object when training
|
||||
ignore_threshold: 0.7
|
||||
|
||||
# h->w
|
||||
anchor_scales: [[10, 13],
|
||||
[16, 30],
|
||||
[33, 23],
|
||||
[30, 61],
|
||||
[62, 45],
|
||||
[59, 119],
|
||||
[116, 90],
|
||||
[156, 198],
|
||||
[373, 326]]
|
||||
|
||||
# test_param
|
||||
test_img_shape: [416, 416]
|
||||
|
||||
---
|
||||
|
||||
# Help description for each configuration
|
||||
data_dir: "Train dataset directory."
|
||||
per_batch_size: "Batch size for Training."
|
||||
pretrained_backbone: "The ckpt file of DarkNet53."
|
||||
resume_yolov3: "The ckpt file of YOLOv3, which used to fine tune."
|
||||
|
||||
lr_scheduler: "Learning rate scheduler, options: exponential, cosine_annealing."
|
||||
lr: "Learning rate."
|
||||
lr_epochs: "Epoch of changing of lr changing, split with ',' ."
|
||||
lr_gamma: "Decrease lr by a factor of exponential lr_scheduler."
|
||||
eta_min: "Eta_min in cosine_annealing scheduler."
|
||||
T_max: "T-max in cosine_annealing scheduler."
|
||||
max_epoch: "Max epoch num to train the model."
|
||||
warmup_epochs: "Warmup epochs."
|
||||
weight_decay: "Weight decay factor."
|
||||
momentum: "Momentum."
|
||||
loss_scale: "Static loss scale."
|
||||
label_smooth: "Whether to use label smooth in CE."
|
||||
label_smooth_factor: "Smooth strength of original one-hot."
|
||||
log_interval: "Logging interval steps."
|
||||
ckpt_path: "Checkpoint save location."
|
||||
ckpt_interval: "Save checkpoint interval."
|
||||
is_save_on_master: "Save ckpt on master or all rank, 1 for master, 0 for all ranks."
|
||||
is_distributed: "Distribute train or not, 1 for yes, 0 for no."
|
||||
rank: "Local rank of distributed."
|
||||
group_size: "World size of device."
|
||||
need_profiler: "Whether use profiler. 0 for no, 1 for yes."
|
||||
training_shape: "Fix training shape."
|
||||
resize_rate: "Resize rate for multi-scale training."
|
||||
|
||||
# eval option
|
||||
pretrained: "model_path, local pretrained model to load."
|
||||
log_path: "checkpoint save location."
|
||||
nms_thresh: "threshold for NMS."
|
||||
annFile: "path to annotation."
|
||||
testing_shape: "shape for test."
|
||||
eval_ignore_threshold: "threshold to throw low quality boxes for eval."
|
||||
|
||||
# export option
|
||||
device_id: "Device id"
|
||||
batch_size: "batch size"
|
||||
ckpt_file: "Checkpoint file path."
|
||||
file_name: "output file name."
|
||||
file_format: "file format choices in ['AIR', 'ONNX', 'MINDIR']"
|
||||
device_target: "device target. choices in ['Ascend', 'GPU'] for train. choices in ['Ascend', 'GPU', 'CPU'] for export."
|
|
@ -14,7 +14,6 @@
|
|||
# ============================================================================
|
||||
"""YoloV3 eval."""
|
||||
import os
|
||||
import argparse
|
||||
import datetime
|
||||
import time
|
||||
import sys
|
||||
|
@ -31,7 +30,10 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net
|
|||
from src.yolo import YOLOV3DarkNet53
|
||||
from src.logger import get_logger
|
||||
from src.yolo_dataset import create_yolo_dataset
|
||||
from src.config import ConfigYOLOV3DarkNet53
|
||||
|
||||
from model_utils.config import config
|
||||
from model_utils.moxing_adapter import moxing_wrapper
|
||||
from model_utils.device_adapter import get_device_id, get_device_num
|
||||
|
||||
|
||||
class Redirct:
|
||||
|
@ -48,7 +50,7 @@ class Redirct:
|
|||
class DetectionEngine:
|
||||
"""Detection engine."""
|
||||
def __init__(self, args):
|
||||
self.ignore_threshold = args.ignore_threshold
|
||||
self.eval_ignore_threshold = args.eval_ignore_threshold
|
||||
self.labels = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
|
||||
'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat',
|
||||
'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
|
||||
|
@ -186,7 +188,7 @@ class DetectionEngine:
|
|||
flag[i, c] = True
|
||||
confidence = cls_emb[flag] * conf
|
||||
for x_lefti, y_lefti, wi, hi, confi, clsi in zip(x_top_left, y_top_left, w, h, confidence, cls_argmax):
|
||||
if confi < self.ignore_threshold:
|
||||
if confi < self.eval_ignore_threshold:
|
||||
continue
|
||||
if img_id not in self.results:
|
||||
self.results[img_id] = defaultdict(list)
|
||||
|
@ -199,68 +201,90 @@ class DetectionEngine:
|
|||
self.results[img_id][coco_clsi].append([x_lefti, y_lefti, wi, hi, confi])
|
||||
|
||||
|
||||
def parse_args():
|
||||
"""Parse arguments."""
|
||||
parser = argparse.ArgumentParser('mindspore coco testing')
|
||||
|
||||
# device related
|
||||
parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
|
||||
help='device where the code will be implemented. (Default: Ascend)')
|
||||
|
||||
# dataset related
|
||||
parser.add_argument('--data_dir', type=str, default='', help='train data dir')
|
||||
parser.add_argument('--per_batch_size', default=1, type=int, help='batch size for per gpu')
|
||||
|
||||
# network related
|
||||
parser.add_argument('--pretrained', default='', type=str, help='model_path, local pretrained model to load')
|
||||
|
||||
# logging related
|
||||
parser.add_argument('--log_path', type=str, default='outputs/', help='checkpoint save location')
|
||||
|
||||
# detect_related
|
||||
parser.add_argument('--nms_thresh', type=float, default=0.5, help='threshold for NMS')
|
||||
parser.add_argument('--annFile', type=str, default='', help='path to annotation')
|
||||
parser.add_argument('--testing_shape', type=str, default='', help='shape for test ')
|
||||
parser.add_argument('--ignore_threshold', type=float, default=0.001, help='threshold to throw low quality boxes')
|
||||
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
args.data_root = os.path.join(args.data_dir, 'val2014')
|
||||
args.annFile = os.path.join(args.data_dir, 'annotations/instances_val2014.json')
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def conver_testing_shape(args):
|
||||
"""Convert testing shape to list."""
|
||||
testing_shape = [int(args.testing_shape), int(args.testing_shape)]
|
||||
return testing_shape
|
||||
|
||||
|
||||
def test():
|
||||
def modelarts_pre_process():
    '''Unzip the ModelArts dataset once per server and make every device wait for it.

    Reads ``config.need_modelarts_dataset_unzip``, ``config.data_path`` and
    ``config.modelarts_dataset_unzip_name``. When unzipping is requested, the
    device whose id is a multiple of ``min(get_device_num(), 8)`` extracts
    ``<data_path>/<name>.zip`` into ``data_path`` and then creates a lock file;
    all devices poll for that lock file before returning.
    '''
    def unzip(zip_file, save_dir):
        """Extract ``zip_file`` into ``save_dir`` unless the target directory already exists."""
        import zipfile
        s_time = time.time()
        if os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)):
            print("Zip has been extracted.")
            return
        if not zipfile.is_zipfile(zip_file):
            print("This is not zip.")
            return
        # The context manager closes the archive handle even if extraction fails.
        with zipfile.ZipFile(zip_file, 'r') as fz:
            members = fz.namelist()
            data_num = len(members)
            print("Extract Start...")
            print("unzip file num: {}".format(data_num))
            # Report progress roughly once per percent of the member count.
            data_print = int(data_num / 100) if data_num > 100 else 1
            for i, member in enumerate(members):
                if i % data_print == 0:
                    print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
                fz.extract(member, save_dir)
        print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
                                             int(int(time.time() - s_time) % 60)))
        print("Extract Done.")

    if config.need_modelarts_dataset_unzip:
        zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip")
        save_dir_1 = os.path.join(config.data_path)

        sync_lock = "/tmp/unzip_sync.lock"

        # Each server contains 8 devices at most.
        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
            print("Zip file path: ", zip_file_1)
            print("Unzip file save dir: ", save_dir_1)
            unzip(zip_file_1, save_dir_1)
            print("===Finish extract data synchronization===")
            try:
                os.mknod(sync_lock)
            except IOError:
                # NOTE(review): if the lock cannot be created for a reason other than
                # "already exists", the wait loop below never terminates - confirm
                # this best-effort behaviour is intended.
                pass

        # Every device blocks here until the lock file signals that extraction is done.
        while True:
            if os.path.exists(sync_lock):
                break
            time.sleep(1)

        print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))
|
||||
|
||||
|
||||
@moxing_wrapper(pre_process=modelarts_pre_process)
|
||||
def run_test():
|
||||
"""The function of eval."""
|
||||
start_time = time.time()
|
||||
args = parse_args()
|
||||
config.data_root = os.path.join(config.data_dir, 'val2014')
|
||||
config.annFile = os.path.join(config.data_dir, 'annotations/instances_val2014.json')
|
||||
|
||||
devid = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False, device_id=devid)
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False, device_id=devid)
|
||||
|
||||
# logger
|
||||
args.outputs_dir = os.path.join(args.log_path,
|
||||
datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
|
||||
config.outputs_dir = os.path.join(config.log_path,
|
||||
datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
|
||||
rank_id = int(os.environ.get('RANK_ID')) if os.environ.get('RANK_ID') else 0
|
||||
args.logger = get_logger(args.outputs_dir, rank_id)
|
||||
config.logger = get_logger(config.outputs_dir, rank_id)
|
||||
|
||||
context.reset_auto_parallel_context()
|
||||
parallel_mode = ParallelMode.STAND_ALONE
|
||||
context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1)
|
||||
|
||||
args.logger.info('Creating Network....')
|
||||
config.logger.info('Creating Network....')
|
||||
network = YOLOV3DarkNet53(is_training=False)
|
||||
|
||||
args.logger.info(args.pretrained)
|
||||
if os.path.isfile(args.pretrained):
|
||||
param_dict = load_checkpoint(args.pretrained)
|
||||
config.logger.info(config.pretrained)
|
||||
if os.path.isfile(config.pretrained):
|
||||
param_dict = load_checkpoint(config.pretrained)
|
||||
param_dict_new = {}
|
||||
for key, values in param_dict.items():
|
||||
if key.startswith('moments.'):
|
||||
|
@ -270,32 +294,31 @@ def test():
|
|||
else:
|
||||
param_dict_new[key] = values
|
||||
load_param_into_net(network, param_dict_new)
|
||||
args.logger.info('load_model {} success'.format(args.pretrained))
|
||||
config.logger.info('load_model %s success', config.pretrained)
|
||||
else:
|
||||
args.logger.info('{} not exists or not a pre-trained file'.format(args.pretrained))
|
||||
assert FileNotFoundError('{} not exists or not a pre-trained file'.format(args.pretrained))
|
||||
config.logger.info('%s not exists or not a pre-trained file', config.pretrained)
|
||||
assert FileNotFoundError('{} not exists or not a pre-trained file'.format(config.pretrained))
|
||||
exit(1)
|
||||
|
||||
data_root = args.data_root
|
||||
ann_file = args.annFile
|
||||
data_root = config.data_root
|
||||
ann_file = config.annFile
|
||||
|
||||
config = ConfigYOLOV3DarkNet53()
|
||||
if args.testing_shape:
|
||||
config.test_img_shape = conver_testing_shape(args)
|
||||
if config.testing_shape:
|
||||
config.test_img_shape = conver_testing_shape(config)
|
||||
|
||||
ds, data_size = create_yolo_dataset(data_root, ann_file, is_training=False, batch_size=args.per_batch_size,
|
||||
ds, data_size = create_yolo_dataset(data_root, ann_file, is_training=False, batch_size=config.per_batch_size,
|
||||
max_epoch=1, device_num=1, rank=rank_id, shuffle=False,
|
||||
config=config)
|
||||
|
||||
args.logger.info('testing shape : {}'.format(config.test_img_shape))
|
||||
args.logger.info('totol {} images to eval'.format(data_size))
|
||||
config.logger.info('testing shape : %s', config.test_img_shape)
|
||||
config.logger.info('totol %d images to eval', data_size)
|
||||
|
||||
network.set_train(False)
|
||||
|
||||
# init detection engine
|
||||
detection = DetectionEngine(args)
|
||||
detection = DetectionEngine(config)
|
||||
|
||||
args.logger.info('Start inference....')
|
||||
config.logger.info('Start inference....')
|
||||
for i, data in enumerate(ds.create_dict_iterator(num_epochs=1)):
|
||||
image = data["image"]
|
||||
|
||||
|
@ -310,20 +333,21 @@ def test():
|
|||
image_id = image_id.asnumpy()
|
||||
image_shape = image_shape.asnumpy()
|
||||
|
||||
detection.detect([output_small, output_me, output_big], args.per_batch_size, image_shape, image_id)
|
||||
detection.detect([output_small, output_me, output_big], config.per_batch_size, image_shape, image_id)
|
||||
if i % 1000 == 0:
|
||||
args.logger.info('Processing... {:.2f}% '.format(i * args.per_batch_size / data_size * 100))
|
||||
config.logger.info('Processing... {:.2f}% '.format(i * config.per_batch_size / data_size * 100))
|
||||
|
||||
args.logger.info('Calculating mAP...')
|
||||
config.logger.info('Calculating mAP...')
|
||||
detection.do_nms_for_results()
|
||||
result_file_path = detection.write_result()
|
||||
args.logger.info('result file path: {}'.format(result_file_path))
|
||||
config.logger.info('result file path: %s', result_file_path)
|
||||
eval_result = detection.get_eval_result()
|
||||
|
||||
cost_time = time.time() - start_time
|
||||
args.logger.info('\n=============coco eval result=========\n' + eval_result)
|
||||
args.logger.info('testing cost time {:.2f}h'.format(cost_time / 3600.))
|
||||
eval_print_str = '\n=============coco eval result=========\n' + eval_result
|
||||
config.logger.info(eval_print_str)
|
||||
config.logger.info('testing cost time %.2f h', cost_time / 3600.)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test()
|
||||
run_test()
|
||||
|
|
|
@ -12,7 +12,6 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
import argparse
|
||||
import numpy as np
|
||||
|
||||
import mindspore as ms
|
||||
|
@ -20,32 +19,21 @@ from mindspore import context, Tensor
|
|||
from mindspore.train.serialization import export, load_checkpoint, load_param_into_net
|
||||
|
||||
from src.yolo import YOLOV3DarkNet53
|
||||
from src.config import ConfigYOLOV3DarkNet53
|
||||
from model_utils.config import config
|
||||
|
||||
parser = argparse.ArgumentParser(description="yolov3_darknet53 export")
|
||||
parser.add_argument("--device_id", type=int, default=0, help="Device id")
|
||||
parser.add_argument("--batch_size", type=int, default=1, help="batch size")
|
||||
parser.add_argument("--ckpt_file", type=str, required=True, help="Checkpoint file path.")
|
||||
parser.add_argument("--file_name", type=str, default="yolov3_darknet53", help="output file name.")
|
||||
parser.add_argument("--file_format", type=str, choices=["AIR", "ONNX", "MINDIR"], default="AIR", help="file format")
|
||||
parser.add_argument("--device_target", type=str, choices=["Ascend", "GPU", "CPU"], default="Ascend",
|
||||
help="device target")
|
||||
args = parser.parse_args()
|
||||
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
|
||||
if args.device_target == "Ascend":
|
||||
context.set_context(device_id=args.device_id)
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
|
||||
if config.device_target == "Ascend":
|
||||
context.set_context(device_id=config.device_id)
|
||||
|
||||
if __name__ == "__main__":
|
||||
network = YOLOV3DarkNet53(is_training=False)
|
||||
|
||||
param_dict = load_checkpoint(args.ckpt_file)
|
||||
param_dict = load_checkpoint(config.ckpt_file)
|
||||
load_param_into_net(network, param_dict)
|
||||
|
||||
config = ConfigYOLOV3DarkNet53()
|
||||
network.set_train(False)
|
||||
|
||||
shape = [args.batch_size, 3] + config.test_img_shape
|
||||
shape = [config.batch_size, 3] + config.test_img_shape
|
||||
input_data = Tensor(np.zeros(shape), ms.float32)
|
||||
|
||||
export(network, input_data, file_name=args.file_name, file_format=args.file_format)
|
||||
export(network, input_data, file_name=config.file_name, file_format=config.file_format)
|
||||
|
|
|
@ -0,0 +1,127 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
"""Parse arguments"""
|
||||
|
||||
import os
|
||||
import ast
|
||||
import argparse
|
||||
from pprint import pprint, pformat
|
||||
import yaml
|
||||
|
||||
class Config:
    """
    Configuration namespace. Convert dictionary to members.

    Every key of ``cfg_dict`` becomes an attribute. Dictionary values are
    wrapped in ``Config`` recursively; list and tuple values become lists whose
    elements are converted the same way, at any nesting depth.

    Args:
        cfg_dict: Mapping of option names to values.
    """
    def __init__(self, cfg_dict):
        for k, v in cfg_dict.items():
            setattr(self, k, self._convert(v))

    @staticmethod
    def _convert(value):
        """Return ``value`` with nested dicts wrapped in Config and sequences turned into lists."""
        if isinstance(value, dict):
            return Config(value)
        if isinstance(value, (list, tuple)):
            # Recurse so dicts inside lists-of-lists are wrapped as well.
            return [Config._convert(item) for item in value]
        return value

    def __str__(self):
        return pformat(self.__dict__)

    def __repr__(self):
        return self.__str__()
|
||||
|
||||
|
||||
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
    """
    Parse command line arguments to the configuration according to the default yaml.

    Every entry of cfg whose value is neither a list nor a dict is exposed as a
    ``--<key>`` option whose default is the yaml value.

    Args:
        parser: Parent parser.
        cfg: Base configuration.
        helper: Helper description, a mapping from option name to help text.
        choices: Allowed values, a mapping from option name to its choices.
        cfg_path: Path to the default yaml config.

    Returns:
        The namespace produced by parsing the process command line.
    """
    parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                     parents=[parser])
    helper = {} if helper is None else helper
    choices = {} if choices is None else choices
    for item, default in cfg.items():
        if isinstance(default, (list, dict)):
            # Containers cannot be expressed as a single command line option.
            continue
        if isinstance(default, bool):
            # bool("False") is True, so booleans are parsed as Python literals instead.
            arg_type = ast.literal_eval
        elif default is None:
            # type(None) cannot convert a string, which made every override of a
            # null default fail; accept the raw string in that case.
            arg_type = str
        else:
            arg_type = type(default)
        parser.add_argument("--" + item, type=arg_type, default=default, choices=choices.get(item),
                            help=helper.get(item, "Please reference to {}".format(cfg_path)))
    args = parser.parse_args()
    return args
|
||||
|
||||
|
||||
def parse_yaml(yaml_path):
    """
    Parse the yaml config file.

    The file may hold up to three yaml documents, in this order: the
    configuration itself, the help text per option, and the choices per option.
    Missing trailing documents default to empty dictionaries.

    Args:
        yaml_path: Path to the yaml config.

    Returns:
        Tuple (cfg, cfg_helper, cfg_choices).

    Raises:
        ValueError: If the file is not valid yaml, or holds no document or more than 3.
    """
    with open(yaml_path, 'r') as fin:
        try:
            # NOTE(review): FullLoader is kept as in the original; it should only be used on trusted files.
            cfgs = list(yaml.load_all(fin.read(), Loader=yaml.FullLoader))
        except yaml.YAMLError as err:
            # Only yaml errors are translated, and the cause is kept for debugging.
            raise ValueError("Failed to parse yaml") from err

    if not cfgs:
        raise ValueError("Failed to parse yaml: no document found in {}".format(yaml_path))
    if len(cfgs) > 3:
        raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")

    # Pad with empty documents so the three results can be unpacked uniformly.
    cfg, cfg_helper, cfg_choices = cfgs + [{}] * (3 - len(cfgs))
    print(cfg_helper)
    return cfg, cfg_helper, cfg_choices
|
||||
|
||||
|
||||
def merge(args, cfg):
    """
    Overlay the parsed command line arguments on the yaml configuration.

    The dictionary is updated in place: every attribute of args overwrites
    (or adds) the entry of the same name.

    Args:
        args: Command line arguments.
        cfg: Base configuration.

    Returns:
        The same cfg object, after the update.
    """
    for name, value in vars(args).items():
        cfg[name] = value
    return cfg
|
||||
|
||||
|
||||
def get_config():
    """
    Build the final Config from the yaml file and the command line.

    The yaml path is taken from ``--config_path`` (default: ``../default_config.yaml``
    relative to this file). The yaml values are printed, used as defaults of the
    command line options, and then overwritten by the parsed arguments.
    """
    base_parser = argparse.ArgumentParser(description="default name", add_help=False)
    here = os.path.dirname(os.path.abspath(__file__))
    default_yaml = os.path.join(here, "../default_config.yaml")
    base_parser.add_argument("--config_path", type=str, default=default_yaml,
                             help="Config file path")
    # Only --config_path is read at this stage; all other options are parsed below.
    known_args, _ = base_parser.parse_known_args()
    yaml_path = known_args.config_path
    default, helper, choices = parse_yaml(yaml_path)
    pprint(default)
    cli_args = parse_cli_to_yaml(parser=base_parser, cfg=default, helper=helper, choices=choices,
                                 cfg_path=yaml_path)
    return Config(merge(cli_args, default))
|
||||
|
||||
# Module-level configuration object, built once when this module is imported
# (reads the yaml file and the process command line through get_config()).
config = get_config()
|
|
@ -0,0 +1,27 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
"""Device adapter for ModelArts"""
|
||||
|
||||
from .config import config
|
||||
|
||||
if config.enable_modelarts:
|
||||
from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
|
||||
else:
|
||||
from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
|
||||
|
||||
__all__ = [
|
||||
"get_device_id", "get_device_num", "get_rank_id", "get_job_id"
|
||||
]
|
|
@ -0,0 +1,36 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
"""Local adapter"""
|
||||
|
||||
import os
|
||||
|
||||
def get_device_id():
    """Return the DEVICE_ID environment variable as an int (0 when it is not set)."""
    return int(os.getenv('DEVICE_ID', '0'))
|
||||
|
||||
|
||||
def get_device_num():
    """Return the RANK_SIZE environment variable as an int (1 when it is not set)."""
    return int(os.getenv('RANK_SIZE', '1'))
|
||||
|
||||
|
||||
def get_rank_id():
    """Return the RANK_ID environment variable as an int (0 when it is not set)."""
    return int(os.getenv('RANK_ID', '0'))
|
||||
|
||||
|
||||
def get_job_id():
    """Return the fixed job name used for local (non-ModelArts) runs."""
    local_job_name = "Local Job"
    return local_job_name
|
|
@ -0,0 +1,116 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
"""Moxing adapter for ModelArts"""
|
||||
|
||||
import os
|
||||
import functools
|
||||
from mindspore import context
|
||||
from .config import config
|
||||
|
||||
# Counts the sync_data() calls made in this process; sync_data() appends the
# current value to its lock file name and then increments it.
_global_sync_count = 0
|
||||
|
||||
def get_device_id():
    """Return the DEVICE_ID environment variable as an int (0 when it is not set)."""
    return int(os.getenv('DEVICE_ID', '0'))
|
||||
|
||||
|
||||
def get_device_num():
    """Return the RANK_SIZE environment variable as an int (1 when it is not set)."""
    return int(os.getenv('RANK_SIZE', '1'))
|
||||
|
||||
|
||||
def get_rank_id():
    """Return the RANK_ID environment variable as an int (0 when it is not set)."""
    return int(os.getenv('RANK_ID', '0'))
|
||||
|
||||
|
||||
def get_job_id():
    """
    Return the job id taken from the JOB_ID environment variable.

    Returns:
        The value of JOB_ID, or "default" when the variable is unset or empty.
    """
    job_id = os.getenv('JOB_ID')
    # os.getenv returns None for an unset variable; comparing only against ""
    # let that None leak to the caller, so test truthiness instead.
    return job_id if job_id else "default"
|
||||
|
||||
def sync_data(from_path, to_path):
    """
    Copy from_path to to_path with moxing and wait until the copy is flagged as done.

    Used as a download when from_path is a remote obs url and to_path a local
    directory, and as an upload the other way round. Only the device selected by
    the modulo test below performs the copy; every caller then waits for the
    lock file that marks completion.
    """
    import moxing as mox
    import time
    global _global_sync_count
    # A distinct lock file per call, numbered by the call counter.
    lock_file = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices at most.
    is_copier = get_device_id() % min(get_device_num(), 8) == 0
    if is_copier and not os.path.exists(lock_file):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            os.mknod(lock_file)
        except IOError:
            pass
        print("===save flag===")

    # Poll once per second until the lock file shows up.
    while not os.path.exists(lock_file):
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))
|
||||
|
||||
|
||||
def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.

    Args:
        pre_process: Optional callable without arguments, run after the downloads
            and before the wrapped function.
        post_process: Optional callable without arguments, run after the wrapped
            function and before the upload.

    Returns:
        A decorator. The decorated function returns whatever the wrapped function returns.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # NOTE(review): the nesting below was reconstructed from a flattened
            # listing - confirm that everything up to pre_process() belongs
            # under `if config.enable_modelarts`.
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                if not os.path.exists(config.output_path):
                    os.makedirs(config.output_path)

                if pre_process:
                    pre_process()

            # Run the main function and keep its result, so that decorating a
            # function does not silently turn its return value into None.
            result = run_func(*args, **kwargs)

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
            return result
        return wrapped_func
    return wrapper
|
|
@ -64,7 +64,9 @@ do
|
|||
rm -rf ./train_parallel$i
|
||||
mkdir ./train_parallel$i
|
||||
cp ../*.py ./train_parallel$i
|
||||
cp ../*.yaml ./train_parallel$i
|
||||
cp -r ../src ./train_parallel$i
|
||||
cp -r ../model_utils ./train_parallel$i
|
||||
cd ./train_parallel$i || exit
|
||||
echo "start training for rank $RANK_ID, device $DEVICE_ID"
|
||||
env > env.log
|
||||
|
|
|
@ -50,7 +50,9 @@ export DEVICE_NUM=8
|
|||
rm -rf ./train_parallel
|
||||
mkdir ./train_parallel
|
||||
cp ../*.py ./train_parallel
|
||||
cp ../*.yaml ./train_parallel
|
||||
cp -r ../src ./train_parallel
|
||||
cp -r ../model_utils ./train_parallel
|
||||
cd ./train_parallel || exit
|
||||
env > env.log
|
||||
mpirun --allow-run-as-root -n ${DEVICE_NUM} --output-filename log_output --merge-stderr-to-stdout \
|
||||
|
|
|
@ -55,7 +55,9 @@ then
|
|||
fi
|
||||
mkdir ./eval
|
||||
cp ../*.py ./eval
|
||||
cp ../*.yaml ./eval
|
||||
cp -r ../src ./eval
|
||||
cp -r ../model_utils ./eval
|
||||
cd ./eval || exit
|
||||
env > env.log
|
||||
echo "start inferring for device $DEVICE_ID"
|
||||
|
|
|
@ -55,7 +55,9 @@ then
|
|||
fi
|
||||
mkdir ./eval
|
||||
cp ../*.py ./eval
|
||||
cp ../*.yaml ./eval
|
||||
cp -r ../src ./eval
|
||||
cp -r ../model_utils ./eval
|
||||
cd ./eval || exit
|
||||
env > env.log
|
||||
echo "start inferring for device $DEVICE_ID"
|
||||
|
|
|
@ -56,7 +56,9 @@ then
|
|||
fi
|
||||
mkdir ./train
|
||||
cp ../*.py ./train
|
||||
cp ../*.yaml ./train
|
||||
cp -r ../src ./train
|
||||
cp -r ../model_utils ./train
|
||||
cd ./train || exit
|
||||
echo "start training for device $DEVICE_ID"
|
||||
env > env.log
|
||||
|
|
|
@ -56,7 +56,9 @@ then
|
|||
fi
|
||||
mkdir ./train
|
||||
cp ../*.py ./train
|
||||
cp ../*.yaml ./train
|
||||
cp -r ../src ./train
|
||||
cp -r ../model_utils ./train
|
||||
cd ./train || exit
|
||||
echo "start training for device $DEVICE_ID"
|
||||
env > env.log
|
||||
|
|
|
@ -1,68 +0,0 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Config parameters for Darknet based yolov3_darknet53 models."""
|
||||
|
||||
|
||||
class ConfigYOLOV3DarkNet53:
    """
    Hyper-parameters of the yolov3_darknet53 model, held as class attributes.

    Examples:
        ConfigYOLOV3DarkNet53()
    """
    # train_param
    # data augmentation related
    hue = 0.1
    saturation = 1.5
    value = 1.5
    jitter = 0.3

    resize_rate = 1
    # square shapes [320, 320], [352, 352], ..., [608, 608] in steps of 32
    multi_scale = [[side, side] for side in range(320, 609, 32)]

    num_classes = 80
    max_box = 50

    backbone_input_shape = [32, 64, 128, 256, 512]
    backbone_shape = [64, 128, 256, 512, 1024]
    backbone_layers = [1, 2, 8, 8, 4]

    # confidence under ignore_threshold means no object when training
    ignore_threshold = 0.7

    # h->w
    anchor_scales = [
        (10, 13), (16, 30), (33, 23),
        (30, 61), (62, 45), (59, 119),
        (116, 90), (156, 198), (373, 326),
    ]
    out_channel = 3 * (num_classes + 5)

    # test_param
    test_img_shape = [416, 416]
|
|
@ -29,7 +29,7 @@ class DistributedSampler:
|
|||
rank = 0
|
||||
self.dataset_size = dataset_size
|
||||
self.num_replicas = num_replicas
|
||||
self.rank = rank
|
||||
self.rank = rank if num_replicas > 1 else 0
|
||||
self.epoch = 0
|
||||
self.num_samples = int(math.ceil(dataset_size * 1.0 / self.num_replicas))
|
||||
self.total_size = self.num_samples * self.num_replicas
|
||||
|
|
|
@ -25,9 +25,8 @@ from mindspore.ops import functional as F
|
|||
from mindspore.ops import composite as C
|
||||
|
||||
from src.darknet import DarkNet, ResidualBlock
|
||||
from src.config import ConfigYOLOV3DarkNet53
|
||||
from src.loss import XYLoss, WHLoss, ConfidenceLoss, ClassLoss
|
||||
|
||||
from model_utils.config import config as default_config
|
||||
|
||||
def _conv_bn_relu(in_channel,
|
||||
out_channel,
|
||||
|
@ -164,17 +163,17 @@ class DetectionBlock(nn.Cell):
|
|||
|
||||
Args:
|
||||
scale: Character.
|
||||
config: ConfigYOLOV3DarkNet53, Configuration instance.
|
||||
config: Configuration.
|
||||
is_training: Bool, Whether train or not, default True.
|
||||
|
||||
Returns:
|
||||
Tuple, tuple of output tensor,(f1,f2,f3).
|
||||
|
||||
Examples:
|
||||
DetectionBlock(scale='l',stride=32)
|
||||
DetectionBlock(scale='l',stride=32,config=config)
|
||||
"""
|
||||
|
||||
def __init__(self, scale, config=ConfigYOLOV3DarkNet53(), is_training=True):
|
||||
def __init__(self, scale, config=None, is_training=True):
|
||||
super(DetectionBlock, self).__init__()
|
||||
self.config = config
|
||||
if scale == 's':
|
||||
|
@ -275,7 +274,7 @@ class YoloLossBlock(nn.Cell):
|
|||
"""
|
||||
Loss block cell of YOLOV3 network.
|
||||
"""
|
||||
def __init__(self, scale, config=ConfigYOLOV3DarkNet53()):
|
||||
def __init__(self, scale, config=None):
|
||||
super(YoloLossBlock, self).__init__()
|
||||
self.config = config
|
||||
if scale == 's':
|
||||
|
@ -362,9 +361,9 @@ class YOLOV3DarkNet53(nn.Cell):
|
|||
YOLOV3DarkNet53(True)
|
||||
"""
|
||||
|
||||
def __init__(self, is_training):
|
||||
def __init__(self, is_training, config=default_config):
|
||||
super(YOLOV3DarkNet53, self).__init__()
|
||||
self.config = ConfigYOLOV3DarkNet53()
|
||||
self.config = config
|
||||
self.tenser_to_array = P.TupleToArray()
|
||||
|
||||
# YOLOv3 network
|
||||
|
@ -376,9 +375,9 @@ class YOLOV3DarkNet53(nn.Cell):
|
|||
out_channel=self.config.out_channel)
|
||||
|
||||
# prediction on the default anchor boxes
|
||||
self.detect_1 = DetectionBlock('l', is_training=is_training)
|
||||
self.detect_2 = DetectionBlock('m', is_training=is_training)
|
||||
self.detect_3 = DetectionBlock('s', is_training=is_training)
|
||||
self.detect_1 = DetectionBlock('l', is_training=is_training, config=self.config)
|
||||
self.detect_2 = DetectionBlock('m', is_training=is_training, config=self.config)
|
||||
self.detect_3 = DetectionBlock('s', is_training=is_training, config=self.config)
|
||||
|
||||
def construct(self, x):
|
||||
input_shape = F.shape(x)[2:4]
|
||||
|
@ -393,10 +392,10 @@ class YOLOV3DarkNet53(nn.Cell):
|
|||
|
||||
class YoloWithLossCell(nn.Cell):
|
||||
"""YOLOV3 loss."""
|
||||
def __init__(self, network):
|
||||
def __init__(self, network, config=default_config):
|
||||
super(YoloWithLossCell, self).__init__()
|
||||
self.yolo_network = network
|
||||
self.config = ConfigYOLOV3DarkNet53()
|
||||
self.config = config
|
||||
self.tenser_to_array = P.TupleToArray()
|
||||
self.loss_big = YoloLossBlock('l', self.config)
|
||||
self.loss_me = YoloLossBlock('m', self.config)
|
||||
|
|
|
@ -155,12 +155,12 @@ def create_yolo_dataset(image_dir, anno_path, batch_size, max_epoch, device_num,
|
|||
|
||||
yolo_dataset = COCOYoloDataset(root=image_dir, ann_file=anno_path, filter_crowd_anno=filter_crowd,
|
||||
remove_images_without_annotations=remove_empty_anno, is_training=is_training)
|
||||
distributed_sampler = DistributedSampler(len(yolo_dataset), device_num, rank, shuffle=shuffle)
|
||||
hwc_to_chw = CV.HWC2CHW()
|
||||
|
||||
config.dataset_size = len(yolo_dataset)
|
||||
cores = multiprocessing.cpu_count()
|
||||
num_parallel_workers = int(cores / device_num)
|
||||
distributed_sampler = DistributedSampler(len(yolo_dataset), device_num, rank, shuffle=shuffle)
|
||||
if is_training:
|
||||
multi_scale_trans = MultiScaleTrans(config, device_num)
|
||||
dataset_column_names = ["image", "annotation", "bbox1", "bbox2", "bbox3",
|
||||
|
|
|
@ -15,7 +15,6 @@
|
|||
"""YoloV3 train."""
|
||||
import os
|
||||
import time
|
||||
import argparse
|
||||
import datetime
|
||||
|
||||
from mindspore.context import ParallelMode
|
||||
|
@ -36,9 +35,12 @@ from src.util import AverageMeter, get_param_groups
|
|||
from src.lr_scheduler import get_lr
|
||||
from src.yolo_dataset import create_yolo_dataset
|
||||
from src.initializer import default_recurisive_init, load_yolov3_params
|
||||
from src.config import ConfigYOLOV3DarkNet53
|
||||
from src.util import keep_loss_fp32
|
||||
|
||||
from model_utils.config import config
|
||||
from model_utils.moxing_adapter import moxing_wrapper
|
||||
from model_utils.device_adapter import get_device_id, get_device_num
|
||||
|
||||
set_seed(1)
|
||||
|
||||
class BuildTrainNetwork(nn.Cell):
|
||||
|
@ -53,79 +55,6 @@ class BuildTrainNetwork(nn.Cell):
|
|||
return loss
|
||||
|
||||
|
||||
def parse_args():
    """
    Parse train arguments.

    Unknown command line options are ignored. After parsing, T_max is raised to
    max_epoch for the cosine_annealing scheduler, lr_epochs is converted to a
    list of ints, and data_root / annFile are derived from data_dir.
    """
    arg_parser = argparse.ArgumentParser('mindspore coco training')
    add = arg_parser.add_argument

    # device related
    add('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
        help='device where the code will be implemented. (Default: Ascend)')

    # dataset related
    add('--data_dir', type=str, help='Train dataset directory.')
    add('--per_batch_size', default=32, type=int, help='Batch size for Training. Default: 32.')

    # network related
    add('--pretrained_backbone', default='', type=str,
        help='The ckpt file of DarkNet53. Default: "".')
    add('--resume_yolov3', default='', type=str,
        help='The ckpt file of YOLOv3, which used to fine tune. Default: ""')

    # optimizer and lr related
    add('--lr_scheduler', default='exponential', type=str,
        help='Learning rate scheduler, options: exponential, cosine_annealing. Default: exponential')
    add('--lr', default=0.001, type=float, help='Learning rate. Default: 0.001')
    add('--lr_epochs', type=str, default='220,250',
        help='Epoch of changing of lr changing, split with ",". Default: 220,250')
    add('--lr_gamma', type=float, default=0.1,
        help='Decrease lr by a factor of exponential lr_scheduler. Default: 0.1')
    add('--eta_min', type=float, default=0., help='Eta_min in cosine_annealing scheduler. Default: 0')
    add('--T_max', type=int, default=320, help='T-max in cosine_annealing scheduler. Default: 320')
    add('--max_epoch', type=int, default=320, help='Max epoch num to train the model. Default: 320')
    add('--warmup_epochs', default=0, type=float, help='Warmup epochs. Default: 0')
    add('--weight_decay', type=float, default=0.0005, help='Weight decay factor. Default: 0.0005')
    add('--momentum', type=float, default=0.9, help='Momentum. Default: 0.9')

    # loss related
    add('--loss_scale', type=int, default=1024, help='Static loss scale. Default: 1024')
    add('--label_smooth', type=int, default=0, help='Whether to use label smooth in CE. Default:0')
    add('--label_smooth_factor', type=float, default=0.1,
        help='Smooth strength of original one-hot. Default: 0.1')

    # logging related
    add('--log_interval', type=int, default=100, help='Logging interval steps. Default: 100')
    add('--ckpt_path', type=str, default='outputs/', help='Checkpoint save location. Default: outputs/')
    add('--ckpt_interval', type=int, default=None, help='Save checkpoint interval. Default: None')

    add('--is_save_on_master', type=int, default=1,
        help='Save ckpt on master or all rank, 1 for master, 0 for all ranks. Default: 1')

    # distributed related
    add('--is_distributed', type=int, default=1,
        help='Distribute train or not, 1 for yes, 0 for no. Default: 1')
    add('--rank', type=int, default=0, help='Local rank of distributed. Default: 0')
    add('--group_size', type=int, default=1, help='World size of device. Default: 1')

    # profiler init
    add('--need_profiler', type=int, default=0,
        help='Whether use profiler. 0 for no, 1 for yes. Default: 0')

    # reset default config
    add('--training_shape', type=str, default="", help='Fix training shape. Default: ""')
    add('--resize_rate', type=int, default=None,
        help='Resize rate for multi-scale training. Default: None')

    args, _ = arg_parser.parse_known_args()
    if args.lr_scheduler == 'cosine_annealing':
        # The cosine period must cover at least the whole training run.
        args.T_max = max(args.T_max, args.max_epoch)

    args.lr_epochs = [int(epoch) for epoch in args.lr_epochs.split(',')]
    dataset_dir = args.data_dir
    args.data_root = os.path.join(dataset_dir, 'train2014')
    args.annFile = os.path.join(dataset_dir, 'annotations/instances_train2014.json')

    return args
|
||||
|
||||
|
||||
def conver_training_shape(args):
    """Return the square shape ``[size, size]`` where size is ``int(args.training_shape)``."""
    side = int(args.training_shape)
    return [side, side]
|
||||
|
@ -151,6 +80,7 @@ def network_init(args):
|
|||
init("nccl")
|
||||
args.rank = get_rank()
|
||||
args.group_size = get_group_size()
|
||||
|
||||
# select for master rank save ckpt or all rank save, compatible for model parallel
|
||||
args.rank_save_ckpt_flag = 0
|
||||
if args.is_save_on_master:
|
||||
|
@ -175,47 +105,105 @@ def parallel_init(args):
|
|||
degree = get_group_size()
|
||||
context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree)
|
||||
|
||||
def train():
|
||||
|
||||
def modelarts_pre_process():
    '''
    modelarts pre process function.

    When config.need_modelarts_dataset_unzip is set, the device selected by the
    modulo test below extracts the dataset archive found under config.data_path,
    while every caller waits for the lock file that marks the extraction as done.
    Finally config.ckpt_path is re-rooted under config.output_path.
    '''
    def unzip(zip_file, save_dir):
        """Extract zip_file into save_dir unless the target directory already exists."""
        import zipfile
        s_time = time.time()
        if not os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)):
            zip_isexist = zipfile.is_zipfile(zip_file)
            if zip_isexist:
                # The context manager closes the archive handle even when an
                # extraction fails; the original never closed it.
                with zipfile.ZipFile(zip_file, 'r') as fz:
                    names = fz.namelist()
                    data_num = len(names)
                    print("Extract Start...")
                    print("unzip file num: {}".format(data_num))
                    # Report progress roughly once per percent of the archive.
                    data_print = int(data_num / 100) if data_num > 100 else 1
                    for i, file in enumerate(names):
                        if i % data_print == 0:
                            print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
                        fz.extract(file, save_dir)
                print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
                                                     int(int(time.time() - s_time) % 60)))
                print("Extract Done.")
            else:
                print("This is not zip.")
        else:
            print("Zip has been extracted.")

    if config.need_modelarts_dataset_unzip:
        zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip")
        save_dir_1 = os.path.join(config.data_path)

        sync_lock = "/tmp/unzip_sync.lock"

        # Each server contains 8 devices at most.
        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
            print("Zip file path: ", zip_file_1)
            print("Unzip file save dir: ", save_dir_1)
            unzip(zip_file_1, save_dir_1)
            print("===Finish extract data synchronization===")
            try:
                os.mknod(sync_lock)
            except IOError:
                pass

        # Poll once per second until the lock file shows up.
        while True:
            if os.path.exists(sync_lock):
                break
            time.sleep(1)

        print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))

    config.ckpt_path = os.path.join(config.output_path, config.ckpt_path)
|
||||
|
||||
|
||||
@moxing_wrapper(pre_process=modelarts_pre_process)
|
||||
def run_train():
|
||||
"""Train function."""
|
||||
args = parse_args()
|
||||
profiler = network_init(args)
|
||||
if config.lr_scheduler == 'cosine_annealing' and config.max_epoch > config.T_max:
|
||||
config.T_max = config.max_epoch
|
||||
config.lr_epochs = list(map(int, config.lr_epochs.split(',')))
|
||||
config.data_root = os.path.join(config.data_dir, 'train2014')
|
||||
config.annFile = os.path.join(config.data_dir, 'annotations/instances_train2014.json')
|
||||
|
||||
profiler = network_init(config)
|
||||
|
||||
loss_meter = AverageMeter('loss')
|
||||
parallel_init(args)
|
||||
parallel_init(config)
|
||||
|
||||
network = YOLOV3DarkNet53(is_training=True)
|
||||
# default is kaiming-normal
|
||||
default_recurisive_init(network)
|
||||
load_yolov3_params(args, network)
|
||||
load_yolov3_params(config, network)
|
||||
|
||||
network = YoloWithLossCell(network)
|
||||
args.logger.info('finish get network')
|
||||
config.logger.info('finish get network')
|
||||
|
||||
config = ConfigYOLOV3DarkNet53()
|
||||
config.label_smooth = args.label_smooth
|
||||
config.label_smooth_factor = args.label_smooth_factor
|
||||
config.label_smooth = config.label_smooth
|
||||
config.label_smooth_factor = config.label_smooth_factor
|
||||
|
||||
if args.training_shape:
|
||||
config.multi_scale = [conver_training_shape(args)]
|
||||
if args.resize_rate:
|
||||
config.resize_rate = args.resize_rate
|
||||
if config.training_shape:
|
||||
config.multi_scale = [conver_training_shape(config)]
|
||||
|
||||
ds, data_size = create_yolo_dataset(image_dir=args.data_root, anno_path=args.annFile, is_training=True,
|
||||
batch_size=args.per_batch_size, max_epoch=args.max_epoch,
|
||||
device_num=args.group_size, rank=args.rank, config=config)
|
||||
args.logger.info('Finish loading dataset')
|
||||
ds, data_size = create_yolo_dataset(image_dir=config.data_root, anno_path=config.annFile, is_training=True,
|
||||
batch_size=config.per_batch_size, max_epoch=config.max_epoch,
|
||||
device_num=config.group_size, rank=config.rank, config=config)
|
||||
config.logger.info('Finish loading dataset')
|
||||
|
||||
args.steps_per_epoch = int(data_size / args.per_batch_size / args.group_size)
|
||||
config.steps_per_epoch = int(data_size / config.per_batch_size / config.group_size)
|
||||
|
||||
if not args.ckpt_interval:
|
||||
args.ckpt_interval = args.steps_per_epoch
|
||||
if config.ckpt_interval <= 0:
|
||||
config.ckpt_interval = config.steps_per_epoch
|
||||
|
||||
lr = get_lr(args)
|
||||
lr = get_lr(config)
|
||||
opt = Momentum(params=get_param_groups(network),
|
||||
learning_rate=Tensor(lr),
|
||||
momentum=args.momentum,
|
||||
weight_decay=args.weight_decay,
|
||||
loss_scale=args.loss_scale)
|
||||
momentum=config.momentum,
|
||||
weight_decay=config.weight_decay,
|
||||
loss_scale=config.loss_scale)
|
||||
is_gpu = context.get_context("device_target") == "GPU"
|
||||
if is_gpu:
|
||||
loss_scale_value = 1.0
|
||||
|
@ -224,18 +212,18 @@ def train():
|
|||
level="O2", keep_batchnorm_fp32=False)
|
||||
keep_loss_fp32(network)
|
||||
else:
|
||||
network = TrainingWrapper(network, opt, sens=args.loss_scale)
|
||||
network = TrainingWrapper(network, opt, sens=config.loss_scale)
|
||||
network.set_train()
|
||||
|
||||
if args.rank_save_ckpt_flag:
|
||||
if config.rank_save_ckpt_flag:
|
||||
# checkpoint save
|
||||
ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
|
||||
ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
|
||||
ckpt_max_num = config.max_epoch * config.steps_per_epoch // config.ckpt_interval
|
||||
ckpt_config = CheckpointConfig(save_checkpoint_steps=config.ckpt_interval,
|
||||
keep_checkpoint_max=ckpt_max_num)
|
||||
save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/')
|
||||
save_ckpt_path = os.path.join(config.outputs_dir, 'ckpt_' + str(config.rank) + '/')
|
||||
ckpt_cb = ModelCheckpoint(config=ckpt_config,
|
||||
directory=save_ckpt_path,
|
||||
prefix='{}'.format(args.rank))
|
||||
prefix='{}'.format(config.rank))
|
||||
cb_params = _InternalCallbackParam()
|
||||
cb_params.train_network = network
|
||||
cb_params.epoch_num = ckpt_max_num
|
||||
|
@ -250,7 +238,7 @@ def train():
|
|||
for i, data in enumerate(data_loader):
|
||||
images = data["image"]
|
||||
input_shape = images.shape[2:4]
|
||||
args.logger.info('iter[{}], shape{}'.format(i, input_shape[0]))
|
||||
config.logger.info('iter[{}], shape{}'.format(i, input_shape[0]))
|
||||
|
||||
images = Tensor.from_numpy(images)
|
||||
|
||||
|
@ -265,34 +253,34 @@ def train():
|
|||
batch_gt_box2)
|
||||
loss_meter.update(loss.asnumpy())
|
||||
|
||||
if args.rank_save_ckpt_flag:
|
||||
if config.rank_save_ckpt_flag:
|
||||
# ckpt progress
|
||||
cb_params.cur_step_num = i + 1 # current step number
|
||||
cb_params.batch_num = i + 2
|
||||
ckpt_cb.step_end(run_context)
|
||||
|
||||
if i % args.log_interval == 0:
|
||||
if i % config.log_interval == 0:
|
||||
time_used = time.time() - t_end
|
||||
epoch = int(i / args.steps_per_epoch)
|
||||
per_step_time = time_used/args.log_interval
|
||||
fps = args.per_batch_size * (i - old_progress) * args.group_size / time_used
|
||||
if args.rank == 0:
|
||||
args.logger.info(
|
||||
epoch = int(i / config.steps_per_epoch)
|
||||
per_step_time = time_used/config.log_interval
|
||||
fps = config.per_batch_size * (i - old_progress) * config.group_size / time_used
|
||||
if config.rank == 0:
|
||||
config.logger.info(
|
||||
'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{},'
|
||||
' per_step_time:{}'.format(epoch, i, loss_meter, fps, lr[i], per_step_time))
|
||||
t_end = time.time()
|
||||
loss_meter.reset()
|
||||
old_progress = i
|
||||
|
||||
if (i + 1) % args.steps_per_epoch == 0 and args.rank_save_ckpt_flag:
|
||||
if (i + 1) % config.steps_per_epoch == 0 and config.rank_save_ckpt_flag:
|
||||
cb_params.cur_epoch_num += 1
|
||||
|
||||
if args.need_profiler:
|
||||
if config.need_profiler:
|
||||
if i == 10:
|
||||
profiler.analyse()
|
||||
break
|
||||
args.logger.info('==========end training===============')
|
||||
config.logger.info('==========end training===============')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
train()
|
||||
run_train()
|
||||
|
|
|
@ -226,11 +226,11 @@ def test_yolov3_darknet_8p():
|
|||
cur_model_path = os.path.join(cur_path, model_name)
|
||||
train_file = os.path.join(cur_model_path, "train.py")
|
||||
old_list = ["--lr_scheduler=cosine_annealing"]
|
||||
new_list = ["--lr_scheduler=cosine_annealing --training_shape=416"]
|
||||
new_list = ["--lr_scheduler=cosine_annealing --training_shape=416 --log_interval=10"]
|
||||
utils.exec_sed_command(old_list, new_list,
|
||||
os.path.join(cur_model_path, "scripts/run_distribute_train.sh"))
|
||||
old_list = ["default=100", "max_epoch=args.max_epoch"]
|
||||
new_list = ["default=10", "max_epoch=1"]
|
||||
old_list = ["max_epoch=config.max_epoch"]
|
||||
new_list = ["max_epoch=1"]
|
||||
utils.exec_sed_command(old_list, new_list, train_file)
|
||||
old_list = ["sampler=distributed_sampler"]
|
||||
new_list = ["sampler=distributed_sampler, num_samples=100*batch_size"]
|
||||
|
|
Loading…
Reference in New Issue