modify yolov3_darknet53 for modelarts

This commit is contained in:
zhanghuiyao 2021-05-11 14:43:42 +08:00
parent 18e9e6ce4c
commit bb1865658e
23 changed files with 870 additions and 359 deletions

View File

@ -101,41 +101,93 @@ Dataset used: [COCO2014](https://cocodataset.org/#download)
python hccl_tools.py --device_num "[0,8)"
```
```network
# The training_shape parameter defines the input image shape for the network; the default is "".
# The default means that 10 different shapes are used as input shapes; alternatively, a single fixed shape can be set.
# run training example(1p) by python command.
python train.py \
--data_dir=./dataset/coco2014 \
--pretrained_backbone=darknet53_backbone.ckpt \
--is_distributed=0 \
--lr=0.001 \
--loss_scale=1024 \
--weight_decay=0.016 \
--T_max=320 \
--max_epoch=320 \
--warmup_epochs=4 \
--training_shape=416 \
--lr_scheduler=cosine_annealing > log.txt 2>&1 &
- Train on local
# standalone training example(1p) by shell script
bash run_standalone_train.sh dataset/coco2014 darknet53_backbone.ckpt
```network
# The training_shape parameter defines the input image shape for the network; the default is "".
# The default means that 10 different shapes are used as input shapes; alternatively, a single fixed shape can be set.
# run training example(1p) by python command.
python train.py \
--data_dir=./dataset/coco2014 \
--pretrained_backbone=darknet53_backbone.ckpt \
--is_distributed=0 \
--lr=0.001 \
--loss_scale=1024 \
--weight_decay=0.016 \
--T_max=320 \
--max_epoch=320 \
--warmup_epochs=4 \
--training_shape=416 \
--lr_scheduler=cosine_annealing > log.txt 2>&1 &
# For Ascend device, distributed training example(8p) by shell script
bash run_distribute_train.sh dataset/coco2014 darknet53_backbone.ckpt rank_table_8p.json
# standalone training example(1p) by shell script
bash run_standalone_train.sh dataset/coco2014 darknet53_backbone.ckpt
# For GPU device, distributed training example(8p) by shell script
bash run_distribute_train_gpu.sh dataset/coco2014 darknet53_backbone.ckpt
# For Ascend device, distributed training example(8p) by shell script
bash run_distribute_train.sh dataset/coco2014 darknet53_backbone.ckpt rank_table_8p.json
# run evaluation by python command
python eval.py \
--data_dir=./dataset/coco2014 \
--pretrained=yolov3.ckpt \
--testing_shape=416 > log.txt 2>&1 &
# For GPU device, distributed training example(8p) by shell script
bash run_distribute_train_gpu.sh dataset/coco2014 darknet53_backbone.ckpt
# run evaluation by shell script
bash run_eval.sh dataset/coco2014/ checkpoint/0-319_102400.ckpt
```
# run evaluation by python command
python eval.py \
--data_dir=./dataset/coco2014 \
--pretrained=yolov3.ckpt \
--testing_shape=416 > log.txt 2>&1 &
# run evaluation by shell script
bash run_eval.sh dataset/coco2014/ checkpoint/0-319_102400.ckpt
```
- Train on [ModelArts](https://support.huaweicloud.com/modelarts/)
```python
# Train 8p with Ascend
# (1) Perform a or b.
# a. Set "enable_modelarts=True" on base_config.yaml file.
# Set "data_dir='/cache/data/coco2014/'" on base_config.yaml file.
# Set "checkpoint_url='s3://dir_to_your_pretrain/'" on base_config.yaml file.
# Set "pretrained_backbone='/cache/checkpoint_path/0-148_92000.ckpt'" on base_config.yaml file.
# Set "weight_decay=0.016" on base_config.yaml file.
# Set "warmup_epochs=4" on base_config.yaml file.
# Set "lr_scheduler='cosine_annealing'" on base_config.yaml file.
# Set other parameters on base_config.yaml file you need.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "data_dir=/cache/data/coco2014/" on the website UI interface.
# Add "checkpoint_url=s3://dir_to_your_pretrain/" on the website UI interface.
# Add "pretrained_backbone=/cache/checkpoint_path/0-148_92000.ckpt" on the website UI interface.
# Add "weight_decay=0.016" on the website UI interface.
# Add "warmup_epochs=4" on the website UI interface.
# Add "lr_scheduler=cosine_annealing" on the website UI interface.
# Add other parameters on the website UI interface.
# (2) Upload or copy your pretrained model to S3 bucket.
# (3) Upload a zip dataset to S3 bucket. (You could also upload the original dataset, but that can be very slow.)
# (4) Set the code directory to "/path/yolov3_darknet53" on the website UI interface.
# (5) Set the startup file to "train.py" on the website UI interface.
# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (7) Create your job.
#
# Eval with Ascend
# (1) Perform a or b.
# a. Set "enable_modelarts=True" on base_config.yaml file.
# Set "data_dir='/cache/data/coco2014/'" on base_config.yaml file.
# Set "checkpoint_url='s3://dir_to_your_trained_ckpt/'" on base_config.yaml file.
# Set "pretrained='/cache/checkpoint_path/0-320_102400.ckpt'" on base_config.yaml file.
# Set "testing_shape=416" on base_config.yaml file.
# Set other parameters on base_config.yaml file you need.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "data_dir=/cache/data/coco2014/" on the website UI interface.
# Add "checkpoint_url=s3://dir_to_your_trained_ckpt/" on the website UI interface.
# Add "pretrained=/cache/checkpoint_path/0-320_102400.ckpt" on the website UI interface.
# Add "testing_shape=416" on the website UI interface.
# Add other parameters on the website UI interface.
# (2) Upload or copy your trained model to S3 bucket.
# (3) Upload a zip dataset to S3 bucket. (You could also upload the original dataset, but that can be very slow.)
# (4) Set the code directory to "/path/yolov3_darknet53" on the website UI interface.
# (5) Set the startup file to "eval.py" on the website UI interface.
# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (7) Create your job.
```
## [Script Description](#contents)

View File

@ -104,49 +104,91 @@ YOLOv3使用DarkNet53执行特征提取这是YOLOv2中的Darknet-19和残差
python hccl_tools.py --device_num "[0,8)"
```
```python
# training_shape参数定义网络图像形状默认为""。
# 意思是使用10种形状作为输入形状或者可以设置某种形状。
# 通过python命令执行训练示例(1卡)。
python train.py \
--data_dir=./dataset/coco2014 \
--pretrained_backbone=darknet53_backbone.ckpt \
--is_distributed=0 \
--lr=0.1 \
--T_max=320 \
--max_epoch=320 \
--warmup_epochs=4 \
--training_shape=416 \
--lr_scheduler=cosine_annealing > log.txt 2>&1 &
```
- 在本地进行训练
```shell script
# shell脚本单机训练示例(1卡)
bash run_standalone_train.sh dataset/coco2014 darknet53_backbone.ckpt
```
```constet
# training_shape参数定义网络图像形状默认为""。
# 意思是使用10种形状作为输入形状或者可以设置某种形状。
# 通过python命令执行训练示例(1卡)。
python train.py \
--data_dir=./dataset/coco2014 \
--pretrained_backbone=darknet53_backbone.ckpt \
--is_distributed=0 \
--lr=0.1 \
--T_max=320 \
--max_epoch=320 \
--warmup_epochs=4 \
--training_shape=416 \
--lr_scheduler=cosine_annealing > log.txt 2>&1 &
```shell script
# 对于Ascend设备使用shell脚本分布式训练示例(8卡)
bash run_distribute_train.sh dataset/coco2014 darknet53_backbone.ckpt rank_table_8p.json
```
# shell脚本单机训练示例(1卡)
bash run_standalone_train.sh dataset/coco2014 darknet53_backbone.ckpt
```shell script
# 对于GPU设备使用shell脚本分布式训练示例(8卡)
bash run_distribute_train_gpu.sh dataset/coco2014 darknet53_backbone.ckpt
```
# 对于Ascend设备使用shell脚本分布式训练示例(8卡)
bash run_distribute_train.sh dataset/coco2014 darknet53_backbone.ckpt rank_table_8p.json
```python
# 使用python命令评估
python eval.py \
--data_dir=./dataset/coco2014 \
--pretrained=yolov3.ckpt \
--testing_shape=416 > log.txt 2>&1 &
```
# 对于GPU设备使用shell脚本分布式训练示例(8卡)
bash run_distribute_train_gpu.sh dataset/coco2014 darknet53_backbone.ckpt
```shell script
# 通过shell脚本运行评估
bash run_eval.sh dataset/coco2014/ checkpoint/0-319_102400.ckpt
```
# 使用python命令评估
python eval.py \
--data_dir=./dataset/coco2014 \
--pretrained=yolov3.ckpt \
--testing_shape=416 > log.txt 2>&1 &
# 通过shell脚本运行评估
bash run_eval.sh dataset/coco2014/ checkpoint/0-319_102400.ckpt
```
- 在 [ModelArts](https://support.huaweicloud.com/modelarts/) 上训练
```python
# 在modelarts上进行8卡训练Ascend
# (1) 执行a或者b
# a. 在 base_config.yaml 文件中配置 "enable_modelarts=True"
# 在 base_config.yaml 文件中配置 "data_dir='/cache/data/coco2014/'"
# 在 base_config.yaml 文件中配置 "checkpoint_url='s3://dir_to_your_pretrain/'"
# 在 base_config.yaml 文件中配置 "pretrained_backbone='/cache/checkpoint_path/0-148_92000.ckpt'"
# 在 base_config.yaml 文件中配置 "weight_decay=0.016"
# 在 base_config.yaml 文件中配置 "warmup_epochs=4"
# 在 base_config.yaml 文件中配置 "lr_scheduler='cosine_annealing'"
# 在 base_config.yaml 文件中配置 其他参数
# b. 在网页上设置 "enable_modelarts=True"
# 在网页上设置 "data_dir=/cache/data/coco2014/"
# 在网页上设置 "checkpoint_url=s3://dir_to_your_pretrain/"
# 在网页上设置 "pretrained_backbone=/cache/checkpoint_path/0-148_92000.ckpt"
# 在网页上设置 "weight_decay=0.016"
# 在网页上设置 "warmup_epochs=4"
# 在网页上设置 "lr_scheduler=cosine_annealing"
# 在网页上设置 其他参数
# (2) 上传你的预训练模型到 S3 桶上
# (3) 上传你的压缩数据集到 S3 桶上 (你也可以上传原始的数据集,但那可能会很慢。)
# (4) 在网页上设置你的代码路径为 "/path/yolov3_darknet53"
# (5) 在网页上设置启动文件为 "train.py"
# (6) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等
# (7) 创建训练作业
#
# 在modelarts上进行验证Ascend
# (1) 执行a或者b
# a. 在 base_config.yaml 文件中配置 "enable_modelarts=True"
# 在 base_config.yaml 文件中配置 "data_dir='/cache/data/coco2014/'"
# 在 base_config.yaml 文件中配置 "checkpoint_url='s3://dir_to_your_trained_ckpt/'"
# 在 base_config.yaml 文件中配置 "pretrained='/cache/checkpoint_path/0-320_102400.ckpt'"
# 在 base_config.yaml 文件中配置 "testing_shape=416"
# 在 base_config.yaml 文件中配置 其他参数
# b. 在网页上设置 "enable_modelarts=True"
# 在网页上设置 "data_dir=/cache/data/coco2014/"
# 在网页上设置 "checkpoint_url=s3://dir_to_your_trained_ckpt/"
# 在网页上设置 "pretrained=/cache/checkpoint_path/0-320_102400.ckpt"
# 在网页上设置 "testing_shape=416"
# 在网页上设置 其他参数
# (2) 上传你训练好的模型到 S3 桶上
# (3) 上传你的压缩数据集到 S3 桶上 (你也可以上传原始的数据集,但那可能会很慢。)
# (4) 在网页上设置你的代码路径为 "/path/yolov3_darknet53"
# (5) 在网页上设置启动文件为 "eval.py"
# (6) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等
# (7) 创建训练作业
```
# 脚本说明

View File

@ -28,9 +28,9 @@
#include "include/api/context.h"
#include "include/api/types.h"
#include "include/api/serialization.h"
#include "include/dataset/vision_ascend.h"
#include "include/dataset/execute.h"
#include "include/dataset/vision.h"
#include "include/minddata/dataset/include/vision_ascend.h"
#include "include/minddata/dataset/include/execute.h"
#include "include/minddata/dataset/include/vision.h"
#include "inc/utils.h"
using mindspore::Context;

View File

@ -0,0 +1,168 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
device_target: "Ascend" # ['Ascend', 'GPU']
need_modelarts_dataset_unzip: True
modelarts_dataset_unzip_name: "coco2014"
# ==============================================================================
# Training options
# dataset related
data_dir: "/cache/data/coco2014/"
per_batch_size: 32
# network related
pretrained_backbone: "/cache/checkpoint_path/0-148_92000.ckpt"
resume_yolov3: ""
# optimizer and lr related
lr_scheduler: "exponential"
lr: 0.001
lr_epochs: "220,250"
lr_gamma: 0.1
eta_min: 0.0
T_max: 320
max_epoch: 320
warmup_epochs: 0
weight_decay: 0.0005
momentum: 0.9
# loss related
loss_scale: 1024
label_smooth: 0
label_smooth_factor: 0.1
# logging related
log_interval: 100
ckpt_path: "outputs/"
ckpt_interval: -1
is_save_on_master: 1
# distributed related
is_distributed: 1
rank: 0
group_size: 1
# profiler init
need_profiler: 0
# reset default config
training_shape: ""
# Eval option
pretrained: ""
log_path: "outputs/"
nms_thresh: 0.5
annFile: ""
testing_shape: ""
eval_ignore_threshold: 0.001
# Export option
device_id: 0
batch_size: 1
ckpt_file: ""
file_name: "yolov3_darknet53"
file_format: "AIR" # ["AIR", "ONNX", "MINDIR"]
# Other default config
hue: 0.1
saturation: 1.5
value: 1.5
jitter: 0.3
resize_rate: 1
multi_scale: [[320, 320],
[352, 352],
[384, 384],
[416, 416],
[448, 448],
[480, 480],
[512, 512],
[544, 544],
[576, 576],
[608, 608]
]
num_classes: 80
out_channel: 255 #3 * (num_classes + 5)
max_box: 50
backbone_input_shape: [32, 64, 128, 256, 512]
backbone_shape: [64, 128, 256, 512, 1024]
backbone_layers: [1, 2, 8, 8, 4]
# confidence under ignore_threshold means no object when training
ignore_threshold: 0.7
# h->w
anchor_scales: [[10, 13],
[16, 30],
[33, 23],
[30, 61],
[62, 45],
[59, 119],
[116, 90],
[156, 198],
[373, 326]]
# test_param
test_img_shape: [416, 416]
---
# Help description for each configuration
data_dir: "Train dataset directory."
per_batch_size: "Batch size for Training."
pretrained_backbone: "The ckpt file of DarkNet53."
resume_yolov3: "The ckpt file of YOLOv3, which used to fine tune."
lr_scheduler: "Learning rate scheduler, options: exponential, cosine_annealing."
lr: "Learning rate."
lr_epochs: "Epochs at which the lr changes, split with ','."
lr_gamma: "Decrease lr by a factor of exponential lr_scheduler."
eta_min: "Eta_min in cosine_annealing scheduler."
T_max: "T-max in cosine_annealing scheduler."
max_epoch: "Max epoch num to train the model."
warmup_epochs: "Warmup epochs."
weight_decay: "Weight decay factor."
momentum: "Momentum."
loss_scale: "Static loss scale."
label_smooth: "Whether to use label smooth in CE."
label_smooth_factor: "Smooth strength of original one-hot."
log_interval: "Logging interval steps."
ckpt_path: "Checkpoint save location."
ckpt_interval: "Save checkpoint interval."
is_save_on_master: "Save ckpt on master or all rank, 1 for master, 0 for all ranks."
is_distributed: "Distribute train or not, 1 for yes, 0 for no."
rank: "Local rank of distributed."
group_size: "World size of device."
need_profiler: "Whether use profiler. 0 for no, 1 for yes."
training_shape: "Fix training shape."
resize_rate: "Resize rate for multi-scale training."
# eval option
pretrained: "model_path, local pretrained model to load."
log_path: "checkpoint save location."
nms_thresh: "threshold for NMS."
annFile: "path to annotation."
testing_shape: "shape for test."
eval_ignore_threshold: "threshold to throw low quality boxes for eval."
# export option
device_id: "Device id"
batch_size: "batch size"
ckpt_file: "Checkpoint file path."
file_name: "output file name."
file_format: "file format choices in ['AIR', 'ONNX', 'MINDIR']"
device_target: "device target. choices in ['Ascend', 'GPU'] for train. choices in ['Ascend', 'GPU', 'CPU'] for export."

View File

@ -14,7 +14,6 @@
# ============================================================================
"""YoloV3 eval."""
import os
import argparse
import datetime
import time
import sys
@ -31,7 +30,10 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.yolo import YOLOV3DarkNet53
from src.logger import get_logger
from src.yolo_dataset import create_yolo_dataset
from src.config import ConfigYOLOV3DarkNet53
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.device_adapter import get_device_id, get_device_num
class Redirct:
@ -48,7 +50,7 @@ class Redirct:
class DetectionEngine:
"""Detection engine."""
def __init__(self, args):
self.ignore_threshold = args.ignore_threshold
self.eval_ignore_threshold = args.eval_ignore_threshold
self.labels = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat',
'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
@ -186,7 +188,7 @@ class DetectionEngine:
flag[i, c] = True
confidence = cls_emb[flag] * conf
for x_lefti, y_lefti, wi, hi, confi, clsi in zip(x_top_left, y_top_left, w, h, confidence, cls_argmax):
if confi < self.ignore_threshold:
if confi < self.eval_ignore_threshold:
continue
if img_id not in self.results:
self.results[img_id] = defaultdict(list)
@ -199,68 +201,90 @@ class DetectionEngine:
self.results[img_id][coco_clsi].append([x_lefti, y_lefti, wi, hi, confi])
def parse_args():
    """Build the evaluation namespace from the command line.

    Unknown command-line options are ignored. After parsing, ``data_root`` and
    ``annFile`` are derived from ``data_dir``; any ``--annFile`` value given on
    the command line is overwritten.

    Returns:
        argparse.Namespace with the parsed options plus ``data_root``.
    """
    cli = argparse.ArgumentParser('mindspore coco testing')

    # device related
    cli.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
                     help='device where the code will be implemented. (Default: Ascend)')

    # dataset related
    cli.add_argument('--data_dir', type=str, default='', help='train data dir')
    cli.add_argument('--per_batch_size', default=1, type=int, help='batch size for per gpu')

    # network related
    cli.add_argument('--pretrained', default='', type=str, help='model_path, local pretrained model to load')

    # logging related
    cli.add_argument('--log_path', type=str, default='outputs/', help='checkpoint save location')

    # detect_related
    cli.add_argument('--nms_thresh', type=float, default=0.5, help='threshold for NMS')
    cli.add_argument('--annFile', type=str, default='', help='path to annotation')
    cli.add_argument('--testing_shape', type=str, default='', help='shape for test ')
    cli.add_argument('--ignore_threshold', type=float, default=0.001, help='threshold to throw low quality boxes')

    parsed, _unknown = cli.parse_known_args()

    # Image directory and annotation file are both located relative to data_dir.
    dataset_dir = parsed.data_dir
    parsed.data_root = os.path.join(dataset_dir, 'val2014')
    parsed.annFile = os.path.join(dataset_dir, 'annotations/instances_val2014.json')
    return parsed
def conver_testing_shape(args):
    """Return ``[size, size]`` built from the integer value of ``args.testing_shape``."""
    side = int(args.testing_shape)
    return [side, side]
def test():
def modelarts_pre_process():
    '''modelarts pre process function.

    When ``config.need_modelarts_dataset_unzip`` is true, the archive
    ``<config.data_path>/<config.modelarts_dataset_unzip_name>.zip`` is
    extracted into ``config.data_path`` by the process whose device id is a
    multiple of ``min(device_num, 8)``. Every process then waits until the
    lock file ``/tmp/unzip_sync.lock`` exists before returning.
    '''
    def unzip(zip_file, save_dir):
        """Extract ``zip_file`` into ``save_dir``, printing coarse progress."""
        import zipfile
        s_time = time.time()
        if os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)):
            print("Zip has been extracted.")
            return
        if not zipfile.is_zipfile(zip_file):
            print("This is not zip.")
            return

        # The context manager closes the archive even if an extraction fails;
        # previously the ZipFile handle was never closed.
        with zipfile.ZipFile(zip_file, 'r') as fz:
            members = fz.namelist()
            data_num = len(members)
            print("Extract Start...")
            print("unzip file num: {}".format(data_num))
            # Print progress about once per percent for large archives.
            data_print = int(data_num / 100) if data_num > 100 else 1
            for i, file in enumerate(members):
                if i % data_print == 0:
                    print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
                fz.extract(file, save_dir)
        print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
                                             int(int(time.time() - s_time) % 60)))
        print("Extract Done.")

    if config.need_modelarts_dataset_unzip:
        zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip")
        save_dir_1 = os.path.join(config.data_path)

        sync_lock = "/tmp/unzip_sync.lock"

        # Each server contains 8 devices at most.
        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
            print("Zip file path: ", zip_file_1)
            print("Unzip file save dir: ", save_dir_1)
            unzip(zip_file_1, save_dir_1)
            print("===Finish extract data synchronization===")
            try:
                os.mknod(sync_lock)
            except IOError:
                # NOTE(review): if the lock file cannot be created here, the
                # wait loop below only ends once some other process creates it.
                pass

        # Block until the lock file signals that extraction has finished.
        while True:
            if os.path.exists(sync_lock):
                break
            time.sleep(1)

        print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))
@moxing_wrapper(pre_process=modelarts_pre_process)
def run_test():
"""The function of eval."""
start_time = time.time()
args = parse_args()
config.data_root = os.path.join(config.data_dir, 'val2014')
config.annFile = os.path.join(config.data_dir, 'annotations/instances_val2014.json')
devid = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False, device_id=devid)
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False, device_id=devid)
# logger
args.outputs_dir = os.path.join(args.log_path,
datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
config.outputs_dir = os.path.join(config.log_path,
datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
rank_id = int(os.environ.get('RANK_ID')) if os.environ.get('RANK_ID') else 0
args.logger = get_logger(args.outputs_dir, rank_id)
config.logger = get_logger(config.outputs_dir, rank_id)
context.reset_auto_parallel_context()
parallel_mode = ParallelMode.STAND_ALONE
context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1)
args.logger.info('Creating Network....')
config.logger.info('Creating Network....')
network = YOLOV3DarkNet53(is_training=False)
args.logger.info(args.pretrained)
if os.path.isfile(args.pretrained):
param_dict = load_checkpoint(args.pretrained)
config.logger.info(config.pretrained)
if os.path.isfile(config.pretrained):
param_dict = load_checkpoint(config.pretrained)
param_dict_new = {}
for key, values in param_dict.items():
if key.startswith('moments.'):
@ -270,32 +294,31 @@ def test():
else:
param_dict_new[key] = values
load_param_into_net(network, param_dict_new)
args.logger.info('load_model {} success'.format(args.pretrained))
config.logger.info('load_model %s success', config.pretrained)
else:
args.logger.info('{} not exists or not a pre-trained file'.format(args.pretrained))
assert FileNotFoundError('{} not exists or not a pre-trained file'.format(args.pretrained))
config.logger.info('%s not exists or not a pre-trained file', config.pretrained)
assert FileNotFoundError('{} not exists or not a pre-trained file'.format(config.pretrained))
exit(1)
data_root = args.data_root
ann_file = args.annFile
data_root = config.data_root
ann_file = config.annFile
config = ConfigYOLOV3DarkNet53()
if args.testing_shape:
config.test_img_shape = conver_testing_shape(args)
if config.testing_shape:
config.test_img_shape = conver_testing_shape(config)
ds, data_size = create_yolo_dataset(data_root, ann_file, is_training=False, batch_size=args.per_batch_size,
ds, data_size = create_yolo_dataset(data_root, ann_file, is_training=False, batch_size=config.per_batch_size,
max_epoch=1, device_num=1, rank=rank_id, shuffle=False,
config=config)
args.logger.info('testing shape : {}'.format(config.test_img_shape))
args.logger.info('totol {} images to eval'.format(data_size))
config.logger.info('testing shape : %s', config.test_img_shape)
config.logger.info('totol %d images to eval', data_size)
network.set_train(False)
# init detection engine
detection = DetectionEngine(args)
detection = DetectionEngine(config)
args.logger.info('Start inference....')
config.logger.info('Start inference....')
for i, data in enumerate(ds.create_dict_iterator(num_epochs=1)):
image = data["image"]
@ -310,20 +333,21 @@ def test():
image_id = image_id.asnumpy()
image_shape = image_shape.asnumpy()
detection.detect([output_small, output_me, output_big], args.per_batch_size, image_shape, image_id)
detection.detect([output_small, output_me, output_big], config.per_batch_size, image_shape, image_id)
if i % 1000 == 0:
args.logger.info('Processing... {:.2f}% '.format(i * args.per_batch_size / data_size * 100))
config.logger.info('Processing... {:.2f}% '.format(i * config.per_batch_size / data_size * 100))
args.logger.info('Calculating mAP...')
config.logger.info('Calculating mAP...')
detection.do_nms_for_results()
result_file_path = detection.write_result()
args.logger.info('result file path: {}'.format(result_file_path))
config.logger.info('result file path: %s', result_file_path)
eval_result = detection.get_eval_result()
cost_time = time.time() - start_time
args.logger.info('\n=============coco eval result=========\n' + eval_result)
args.logger.info('testing cost time {:.2f}h'.format(cost_time / 3600.))
eval_print_str = '\n=============coco eval result=========\n' + eval_result
config.logger.info(eval_print_str)
config.logger.info('testing cost time %.2f h', cost_time / 3600.)
if __name__ == "__main__":
test()
run_test()

View File

@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import argparse
import numpy as np
import mindspore as ms
@ -20,32 +19,21 @@ from mindspore import context, Tensor
from mindspore.train.serialization import export, load_checkpoint, load_param_into_net
from src.yolo import YOLOV3DarkNet53
from src.config import ConfigYOLOV3DarkNet53
from model_utils.config import config
parser = argparse.ArgumentParser(description="yolov3_darknet53 export")
parser.add_argument("--device_id", type=int, default=0, help="Device id")
parser.add_argument("--batch_size", type=int, default=1, help="batch size")
parser.add_argument("--ckpt_file", type=str, required=True, help="Checkpoint file path.")
parser.add_argument("--file_name", type=str, default="yolov3_darknet53", help="output file name.")
parser.add_argument("--file_format", type=str, choices=["AIR", "ONNX", "MINDIR"], default="AIR", help="file format")
parser.add_argument("--device_target", type=str, choices=["Ascend", "GPU", "CPU"], default="Ascend",
help="device target")
args = parser.parse_args()
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
if args.device_target == "Ascend":
context.set_context(device_id=args.device_id)
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
if config.device_target == "Ascend":
context.set_context(device_id=config.device_id)
if __name__ == "__main__":
network = YOLOV3DarkNet53(is_training=False)
param_dict = load_checkpoint(args.ckpt_file)
param_dict = load_checkpoint(config.ckpt_file)
load_param_into_net(network, param_dict)
config = ConfigYOLOV3DarkNet53()
network.set_train(False)
shape = [args.batch_size, 3] + config.test_img_shape
shape = [config.batch_size, 3] + config.test_img_shape
input_data = Tensor(np.zeros(shape), ms.float32)
export(network, input_data, file_name=args.file_name, file_format=args.file_format)
export(network, input_data, file_name=config.file_name, file_format=config.file_format)

View File

@ -0,0 +1,127 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Parse arguments"""
import os
import ast
import argparse
from pprint import pprint, pformat
import yaml
class Config:
    """
    Attribute-style namespace built from a configuration dictionary.

    Each key becomes an attribute. Nested dictionaries are wrapped in ``Config``
    recursively; lists and tuples are stored as lists whose dictionary elements
    are wrapped in ``Config`` as well.
    """
    def __init__(self, cfg_dict):
        for key, value in cfg_dict.items():
            if isinstance(value, dict):
                attr = Config(value)
            elif isinstance(value, (list, tuple)):
                attr = [Config(item) if isinstance(item, dict) else item for item in value]
            else:
                attr = value
            setattr(self, key, attr)

    def __str__(self):
        # Pretty-printed view of the instance attributes.
        members = self.__dict__
        return pformat(members)

    def __repr__(self):
        return self.__str__()
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
    """
    Expose every scalar entry of the default yaml config as a command line option and parse the CLI.

    Entries whose default value is a list or a dict are not exposed.

    Args:
        parser: Parent parser.
        cfg: Base configuration.
        helper: Helper description.
        choices: Allowed values per option name.
        cfg_path: Path to the default yaml config.

    Returns:
        The namespace produced by parsing the command line.
    """
    parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                     parents=[parser])
    helper = helper if helper is not None else {}
    choices = choices if choices is not None else {}
    for name, default in cfg.items():
        if isinstance(default, (list, dict)):
            continue
        help_description = helper.get(name, "Please reference to {}".format(cfg_path))
        choice = choices.get(name)
        # Booleans go through literal_eval so that "True"/"False" text is evaluated;
        # every other value is converted with the type of its default.
        converter = ast.literal_eval if isinstance(default, bool) else type(default)
        parser.add_argument("--" + name, type=converter, default=default, choices=choice,
                            help=help_description)
    return parser.parse_args()
def parse_yaml(yaml_path):
    """
    Parse the yaml config file.

    The file may hold up to three yaml documents, in this order: the
    configuration, the help description of each option, and the allowed
    choices of each option. Missing trailing documents default to ``{}``.

    Args:
        yaml_path: Path to the yaml config.

    Returns:
        Tuple ``(cfg, cfg_helper, cfg_choices)``.

    Raises:
        ValueError: If the file is not valid yaml, holds no document, or holds
            more than three documents.
    """
    with open(yaml_path, 'r') as fin:
        try:
            # NOTE(review): FullLoader is used here; only load config files from trusted sources.
            cfgs = list(yaml.load_all(fin.read(), Loader=yaml.FullLoader))
        except yaml.YAMLError as err:
            # Keep the original error as the cause instead of discarding it.
            raise ValueError("Failed to parse yaml") from err

    if not cfgs:
        raise ValueError("Failed to parse yaml: no document found in {}".format(yaml_path))
    if len(cfgs) == 1:
        cfg = cfgs[0]
        cfg_helper = {}
        cfg_choices = {}
    elif len(cfgs) == 2:
        cfg, cfg_helper = cfgs
        cfg_choices = {}
    elif len(cfgs) == 3:
        cfg, cfg_helper, cfg_choices = cfgs
    else:
        raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")
    print(cfg_helper)
    return cfg, cfg_helper, cfg_choices
def merge(args, cfg):
    """
    Overlay the command line arguments onto the base config, in place.

    Args:
        args: Command line arguments.
        cfg: Base configuration.

    Returns:
        The same ``cfg`` object, with every attribute of ``args`` written into it.
    """
    for key, value in vars(args).items():
        cfg[key] = value
    return cfg
def get_config():
    """
    Build the final Config from the yaml file and the command line arguments.

    The command line is parsed twice: first only for ``--config_path``, then
    again with one option per scalar entry of the yaml config. Command line
    values take precedence over the yaml defaults.
    """
    base_parser = argparse.ArgumentParser(description="default name", add_help=False)
    module_dir = os.path.dirname(os.path.abspath(__file__))
    default_yaml = os.path.join(module_dir, "../default_config.yaml")
    base_parser.add_argument("--config_path", type=str, default=default_yaml,
                             help="Config file path")
    # First pass: only --config_path is known, everything else is ignored for now.
    known_args, _ = base_parser.parse_known_args()
    config_file = known_args.config_path
    default, helper, choices = parse_yaml(config_file)
    pprint(default)
    # Second pass: full parse against the options generated from the yaml file.
    cli_args = parse_cli_to_yaml(parser=base_parser, cfg=default, helper=helper, choices=choices,
                                 cfg_path=config_file)
    merged = merge(cli_args, default)
    return Config(merged)
# Module-level configuration object, built once at import time: importing this
# module therefore parses the command line and reads the yaml file via get_config().
config = get_config()

View File

@ -0,0 +1,27 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Device adapter for ModelArts"""
from .config import config
if config.enable_modelarts:
from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
__all__ = [
"get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]

View File

@ -0,0 +1,36 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Local adapter"""
import os
def get_device_id():
    """Return the DEVICE_ID environment variable as an int, 0 when it is not set."""
    return int(os.getenv('DEVICE_ID', '0'))
def get_device_num():
    """Return the RANK_SIZE environment variable as an int, 1 when it is not set."""
    return int(os.getenv('RANK_SIZE', '1'))
def get_rank_id():
    """Return the RANK_ID environment variable as an int, 0 when it is not set."""
    return int(os.getenv('RANK_ID', '0'))
def get_job_id():
    """Return the fixed job id string used by the local adapter."""
    job_id = "Local Job"
    return job_id

View File

@ -0,0 +1,116 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Moxing adapter for ModelArts"""
import os
import functools
from mindspore import context
from .config import config
_global_sync_count = 0
def get_device_id():
    """Return the ``DEVICE_ID`` environment variable as an int, defaulting to 0."""
    raw_value = os.environ.get('DEVICE_ID', '0')
    return int(raw_value)
def get_device_num():
    """Return the ``RANK_SIZE`` environment variable as an int, defaulting to 1."""
    raw_value = os.environ.get('RANK_SIZE', '1')
    return int(raw_value)
def get_rank_id():
    """Return the ``RANK_ID`` environment variable as an int, defaulting to 0."""
    raw_value = os.environ.get('RANK_ID', '0')
    return int(raw_value)
def get_job_id():
    """Return the job identifier from the ``JOB_ID`` environment variable.

    Returns:
        str, the value of ``JOB_ID``, or ``"default"`` when the variable is
        unset or empty.
    """
    job_id = os.getenv('JOB_ID')
    # os.getenv yields None for an unset variable, so test truthiness instead of
    # comparing with "" (which let None leak out to callers).
    return job_id if job_id else "default"
def sync_data(from_path, to_path):
    """
    Copy ``from_path`` to ``to_path`` with ``mox.file.copy_parallel`` and wait for completion.

    Only a device whose id is a multiple of ``min(device_num, 8)`` performs the copy;
    it then creates a lock file under /tmp. Every caller blocks until that lock file
    exists. Each call uses its own lock file, numbered by a module-level counter.
    """
    import time
    import moxing as mox
    global _global_sync_count

    lock_file = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices as most.
    is_copier = get_device_id() % min(get_device_num(), 8) == 0
    if is_copier and not os.path.exists(lock_file):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            os.mknod(lock_file)
        except IOError:
            pass
        print("===save flag===")

    # Poll once per second until the lock file shows up.
    while not os.path.exists(lock_file):
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))
def moxing_wrapper(pre_process=None, post_process=None):
    """
    Build a decorator that syncs data around a run function when ModelArts is enabled.

    Args:
        pre_process: Optional callable without arguments, invoked after the downloads
            and before the decorated function (only when ``config.enable_modelarts``).
        post_process: Optional callable without arguments, invoked after the decorated
            function and before the upload (only when ``config.enable_modelarts``).

    Returns:
        A decorator. The decorated function forwards its arguments to the original
        function and returns that function's result.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                # exist_ok avoids a FileExistsError when several devices reach this
                # line at the same time and one of them creates the directory first.
                os.makedirs(config.output_path, exist_ok=True)

                if pre_process:
                    pre_process()

            # Run the main function and keep its result for the caller
            result = run_func(*args, **kwargs)

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
            return result
        return wrapped_func
    return wrapper

View File

@ -64,7 +64,9 @@ do
rm -rf ./train_parallel$i
mkdir ./train_parallel$i
cp ../*.py ./train_parallel$i
cp ../*.yaml ./train_parallel$i
cp -r ../src ./train_parallel$i
cp -r ../model_utils ./train_parallel$i
cd ./train_parallel$i || exit
echo "start training for rank $RANK_ID, device $DEVICE_ID"
env > env.log

View File

@ -50,7 +50,9 @@ export DEVICE_NUM=8
rm -rf ./train_parallel
mkdir ./train_parallel
cp ../*.py ./train_parallel
cp ../*.yaml ./train_parallel
cp -r ../src ./train_parallel
cp -r ../model_utils ./train_parallel
cd ./train_parallel || exit
env > env.log
mpirun --allow-run-as-root -n ${DEVICE_NUM} --output-filename log_output --merge-stderr-to-stdout \

View File

@ -55,7 +55,9 @@ then
fi
mkdir ./eval
cp ../*.py ./eval
cp ../*.yaml ./eval
cp -r ../src ./eval
cp -r ../model_utils ./eval
cd ./eval || exit
env > env.log
echo "start inferring for device $DEVICE_ID"

View File

@ -55,7 +55,9 @@ then
fi
mkdir ./eval
cp ../*.py ./eval
cp ../*.yaml ./eval
cp -r ../src ./eval
cp -r ../model_utils ./eval
cd ./eval || exit
env > env.log
echo "start inferring for device $DEVICE_ID"

View File

@ -56,7 +56,9 @@ then
fi
mkdir ./train
cp ../*.py ./train
cp ../*.yaml ./train
cp -r ../src ./train
cp -r ../model_utils ./train
cd ./train || exit
echo "start training for device $DEVICE_ID"
env > env.log

View File

@ -56,7 +56,9 @@ then
fi
mkdir ./train
cp ../*.py ./train
cp ../*.yaml ./train
cp -r ../src ./train
cp -r ../model_utils ./train
cd ./train || exit
echo "start training for device $DEVICE_ID"
env > env.log

View File

@ -1,68 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Config parameters for Darknet based yolov3_darknet53 models."""
class ConfigYOLOV3DarkNet53:
    """
    Hyper-parameters of yolov3_darknet53, kept as class attributes.

    Examples:
        ConfigYOLOV3DarkNet53()
    """
    # ---- train_param ----
    # data augmentation related
    hue = 0.1
    saturation = 1.5
    value = 1.5
    jitter = 0.3

    resize_rate = 1
    # square candidate input shapes: 320, 352, ..., 608 (step 32)
    multi_scale = [[side, side] for side in range(320, 609, 32)]

    num_classes = 80
    max_box = 50

    backbone_input_shape = [32, 64, 128, 256, 512]
    backbone_shape = [64, 128, 256, 512, 1024]
    backbone_layers = [1, 2, 8, 8, 4]

    # confidence under ignore_threshold means no object when training
    ignore_threshold = 0.7

    # h->w
    anchor_scales = [
        (10, 13), (16, 30), (33, 23),
        (30, 61), (62, 45), (59, 119),
        (116, 90), (156, 198), (373, 326),
    ]
    out_channel = (num_classes + 5) * 3

    # ---- test_param ----
    test_img_shape = [416, 416]

View File

@ -29,7 +29,7 @@ class DistributedSampler:
rank = 0
self.dataset_size = dataset_size
self.num_replicas = num_replicas
self.rank = rank
self.rank = rank if num_replicas > 1 else 0
self.epoch = 0
self.num_samples = int(math.ceil(dataset_size * 1.0 / self.num_replicas))
self.total_size = self.num_samples * self.num_replicas

View File

@ -25,9 +25,8 @@ from mindspore.ops import functional as F
from mindspore.ops import composite as C
from src.darknet import DarkNet, ResidualBlock
from src.config import ConfigYOLOV3DarkNet53
from src.loss import XYLoss, WHLoss, ConfidenceLoss, ClassLoss
from model_utils.config import config as default_config
def _conv_bn_relu(in_channel,
out_channel,
@ -164,17 +163,17 @@ class DetectionBlock(nn.Cell):
Args:
scale: Character.
config: ConfigYOLOV3DarkNet53, Configuration instance.
config: Configuration.
is_training: Bool, Whether train or not, default True.
Returns:
Tuple, tuple of output tensor,(f1,f2,f3).
Examples:
DetectionBlock(scale='l',stride=32)
DetectionBlock(scale='l',stride=32,config=config)
"""
def __init__(self, scale, config=ConfigYOLOV3DarkNet53(), is_training=True):
def __init__(self, scale, config=None, is_training=True):
super(DetectionBlock, self).__init__()
self.config = config
if scale == 's':
@ -275,7 +274,7 @@ class YoloLossBlock(nn.Cell):
"""
Loss block cell of YOLOV3 network.
"""
def __init__(self, scale, config=ConfigYOLOV3DarkNet53()):
def __init__(self, scale, config=None):
super(YoloLossBlock, self).__init__()
self.config = config
if scale == 's':
@ -362,9 +361,9 @@ class YOLOV3DarkNet53(nn.Cell):
YOLOV3DarkNet53(True)
"""
def __init__(self, is_training):
def __init__(self, is_training, config=default_config):
super(YOLOV3DarkNet53, self).__init__()
self.config = ConfigYOLOV3DarkNet53()
self.config = config
self.tenser_to_array = P.TupleToArray()
# YOLOv3 network
@ -376,9 +375,9 @@ class YOLOV3DarkNet53(nn.Cell):
out_channel=self.config.out_channel)
# prediction on the default anchor boxes
self.detect_1 = DetectionBlock('l', is_training=is_training)
self.detect_2 = DetectionBlock('m', is_training=is_training)
self.detect_3 = DetectionBlock('s', is_training=is_training)
self.detect_1 = DetectionBlock('l', is_training=is_training, config=self.config)
self.detect_2 = DetectionBlock('m', is_training=is_training, config=self.config)
self.detect_3 = DetectionBlock('s', is_training=is_training, config=self.config)
def construct(self, x):
input_shape = F.shape(x)[2:4]
@ -393,10 +392,10 @@ class YOLOV3DarkNet53(nn.Cell):
class YoloWithLossCell(nn.Cell):
"""YOLOV3 loss."""
def __init__(self, network):
def __init__(self, network, config=default_config):
super(YoloWithLossCell, self).__init__()
self.yolo_network = network
self.config = ConfigYOLOV3DarkNet53()
self.config = config
self.tenser_to_array = P.TupleToArray()
self.loss_big = YoloLossBlock('l', self.config)
self.loss_me = YoloLossBlock('m', self.config)

View File

@ -155,12 +155,12 @@ def create_yolo_dataset(image_dir, anno_path, batch_size, max_epoch, device_num,
yolo_dataset = COCOYoloDataset(root=image_dir, ann_file=anno_path, filter_crowd_anno=filter_crowd,
remove_images_without_annotations=remove_empty_anno, is_training=is_training)
distributed_sampler = DistributedSampler(len(yolo_dataset), device_num, rank, shuffle=shuffle)
hwc_to_chw = CV.HWC2CHW()
config.dataset_size = len(yolo_dataset)
cores = multiprocessing.cpu_count()
num_parallel_workers = int(cores / device_num)
distributed_sampler = DistributedSampler(len(yolo_dataset), device_num, rank, shuffle=shuffle)
if is_training:
multi_scale_trans = MultiScaleTrans(config, device_num)
dataset_column_names = ["image", "annotation", "bbox1", "bbox2", "bbox3",

View File

@ -15,7 +15,6 @@
"""YoloV3 train."""
import os
import time
import argparse
import datetime
from mindspore.context import ParallelMode
@ -36,9 +35,12 @@ from src.util import AverageMeter, get_param_groups
from src.lr_scheduler import get_lr
from src.yolo_dataset import create_yolo_dataset
from src.initializer import default_recurisive_init, load_yolov3_params
from src.config import ConfigYOLOV3DarkNet53
from src.util import keep_loss_fp32
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.device_adapter import get_device_id, get_device_num
set_seed(1)
class BuildTrainNetwork(nn.Cell):
@ -53,79 +55,6 @@ class BuildTrainNetwork(nn.Cell):
return loss
def parse_args():
    """Parse the command-line options for COCO training.

    Unknown options are ignored. After parsing, ``T_max`` is raised to ``max_epoch``
    when the cosine_annealing scheduler is selected and ``max_epoch`` is larger,
    ``lr_epochs`` is converted to a list of ints, and ``data_root`` / ``annFile``
    are derived from ``data_dir``.

    Returns:
        argparse.Namespace holding the parsed and derived options.
    """
    cli = argparse.ArgumentParser('mindspore coco training')
    add = cli.add_argument

    # device related
    add('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
        help='device where the code will be implemented. (Default: Ascend)')

    # dataset related
    add('--data_dir', type=str, help='Train dataset directory.')
    add('--per_batch_size', default=32, type=int, help='Batch size for Training. Default: 32.')

    # network related
    add('--pretrained_backbone', default='', type=str,
        help='The ckpt file of DarkNet53. Default: "".')
    add('--resume_yolov3', default='', type=str,
        help='The ckpt file of YOLOv3, which used to fine tune. Default: ""')

    # optimizer and lr related
    add('--lr_scheduler', default='exponential', type=str,
        help='Learning rate scheduler, options: exponential, cosine_annealing. Default: exponential')
    add('--lr', default=0.001, type=float, help='Learning rate. Default: 0.001')
    add('--lr_epochs', type=str, default='220,250',
        help='Epoch of changing of lr changing, split with ",". Default: 220,250')
    add('--lr_gamma', type=float, default=0.1,
        help='Decrease lr by a factor of exponential lr_scheduler. Default: 0.1')
    add('--eta_min', type=float, default=0., help='Eta_min in cosine_annealing scheduler. Default: 0')
    add('--T_max', type=int, default=320, help='T-max in cosine_annealing scheduler. Default: 320')
    add('--max_epoch', type=int, default=320, help='Max epoch num to train the model. Default: 320')
    add('--warmup_epochs', default=0, type=float, help='Warmup epochs. Default: 0')
    add('--weight_decay', type=float, default=0.0005, help='Weight decay factor. Default: 0.0005')
    add('--momentum', type=float, default=0.9, help='Momentum. Default: 0.9')

    # loss related
    add('--loss_scale', type=int, default=1024, help='Static loss scale. Default: 1024')
    add('--label_smooth', type=int, default=0, help='Whether to use label smooth in CE. Default:0')
    add('--label_smooth_factor', type=float, default=0.1,
        help='Smooth strength of original one-hot. Default: 0.1')

    # logging related
    add('--log_interval', type=int, default=100, help='Logging interval steps. Default: 100')
    add('--ckpt_path', type=str, default='outputs/', help='Checkpoint save location. Default: outputs/')
    add('--ckpt_interval', type=int, default=None, help='Save checkpoint interval. Default: None')
    add('--is_save_on_master', type=int, default=1,
        help='Save ckpt on master or all rank, 1 for master, 0 for all ranks. Default: 1')

    # distributed related
    add('--is_distributed', type=int, default=1,
        help='Distribute train or not, 1 for yes, 0 for no. Default: 1')
    add('--rank', type=int, default=0, help='Local rank of distributed. Default: 0')
    add('--group_size', type=int, default=1, help='World size of device. Default: 1')

    # profiler init
    add('--need_profiler', type=int, default=0,
        help='Whether use profiler. 0 for no, 1 for yes. Default: 0')

    # reset default config
    add('--training_shape', type=str, default="", help='Fix training shape. Default: ""')
    add('--resize_rate', type=int, default=None,
        help='Resize rate for multi-scale training. Default: None')

    opts = cli.parse_known_args()[0]

    # The cosine schedule must span at least the whole training run.
    if opts.lr_scheduler == 'cosine_annealing':
        opts.T_max = max(opts.T_max, opts.max_epoch)

    opts.lr_epochs = [int(epoch) for epoch in opts.lr_epochs.split(',')]
    opts.data_root = os.path.join(opts.data_dir, 'train2014')
    opts.annFile = os.path.join(opts.data_dir, 'annotations/instances_train2014.json')
    return opts
def conver_training_shape(args):
    """Return ``[size, size]`` where ``size`` is ``args.training_shape`` converted to int."""
    side = int(args.training_shape)
    return [side, side]
@ -151,6 +80,7 @@ def network_init(args):
init("nccl")
args.rank = get_rank()
args.group_size = get_group_size()
# select for master rank save ckpt or all rank save, compatible for model parallel
args.rank_save_ckpt_flag = 0
if args.is_save_on_master:
@ -175,47 +105,105 @@ def parallel_init(args):
degree = get_group_size()
context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree)
def train():
def modelarts_pre_process():
    '''Prepare the dataset and checkpoint path before training on ModelArts.

    When ``config.need_modelarts_dataset_unzip`` is set, a device whose id is a
    multiple of ``min(device_num, 8)`` extracts
    ``<data_path>/<modelarts_dataset_unzip_name>.zip`` into ``config.data_path`` and
    then creates a lock file; every device waits until that lock file exists.
    Finally ``config.ckpt_path`` is joined onto ``config.output_path``.
    '''
    def unzip(zip_file, save_dir):
        """Extract ``zip_file`` into ``save_dir`` unless the unzipped directory already exists."""
        import zipfile
        s_time = time.time()
        if os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)):
            print("Zip has been extracted.")
            return
        if not zipfile.is_zipfile(zip_file):
            print("This is not zip.")
            return
        # The context manager closes the archive even if extraction raises.
        with zipfile.ZipFile(zip_file, 'r') as fz:
            members = fz.namelist()
            data_num = len(members)
            print("Extract Start...")
            print("unzip file num: {}".format(data_num))
            # Report progress roughly once per percent of the archive.
            data_print = int(data_num / 100) if data_num > 100 else 1
            for i, member in enumerate(members):
                if i % data_print == 0:
                    print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
                fz.extract(member, save_dir)
            print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
                                                 int(int(time.time() - s_time) % 60)))
            print("Extract Done.")

    if config.need_modelarts_dataset_unzip:
        zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip")
        save_dir_1 = os.path.join(config.data_path)

        sync_lock = "/tmp/unzip_sync.lock"

        # Each server contains 8 devices as most.
        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
            print("Zip file path: ", zip_file_1)
            print("Unzip file save dir: ", save_dir_1)
            unzip(zip_file_1, save_dir_1)
            print("===Finish extract data synchronization===")
            try:
                os.mknod(sync_lock)
            except IOError:
                pass

        # Wait until the lock file exists before continuing.
        while True:
            if os.path.exists(sync_lock):
                break
            time.sleep(1)

        print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))

    config.ckpt_path = os.path.join(config.output_path, config.ckpt_path)
@moxing_wrapper(pre_process=modelarts_pre_process)
def run_train():
"""Train function."""
args = parse_args()
profiler = network_init(args)
if config.lr_scheduler == 'cosine_annealing' and config.max_epoch > config.T_max:
config.T_max = config.max_epoch
config.lr_epochs = list(map(int, config.lr_epochs.split(',')))
config.data_root = os.path.join(config.data_dir, 'train2014')
config.annFile = os.path.join(config.data_dir, 'annotations/instances_train2014.json')
profiler = network_init(config)
loss_meter = AverageMeter('loss')
parallel_init(args)
parallel_init(config)
network = YOLOV3DarkNet53(is_training=True)
# default is kaiming-normal
default_recurisive_init(network)
load_yolov3_params(args, network)
load_yolov3_params(config, network)
network = YoloWithLossCell(network)
args.logger.info('finish get network')
config.logger.info('finish get network')
config = ConfigYOLOV3DarkNet53()
config.label_smooth = args.label_smooth
config.label_smooth_factor = args.label_smooth_factor
config.label_smooth = config.label_smooth
config.label_smooth_factor = config.label_smooth_factor
if args.training_shape:
config.multi_scale = [conver_training_shape(args)]
if args.resize_rate:
config.resize_rate = args.resize_rate
if config.training_shape:
config.multi_scale = [conver_training_shape(config)]
ds, data_size = create_yolo_dataset(image_dir=args.data_root, anno_path=args.annFile, is_training=True,
batch_size=args.per_batch_size, max_epoch=args.max_epoch,
device_num=args.group_size, rank=args.rank, config=config)
args.logger.info('Finish loading dataset')
ds, data_size = create_yolo_dataset(image_dir=config.data_root, anno_path=config.annFile, is_training=True,
batch_size=config.per_batch_size, max_epoch=config.max_epoch,
device_num=config.group_size, rank=config.rank, config=config)
config.logger.info('Finish loading dataset')
args.steps_per_epoch = int(data_size / args.per_batch_size / args.group_size)
config.steps_per_epoch = int(data_size / config.per_batch_size / config.group_size)
if not args.ckpt_interval:
args.ckpt_interval = args.steps_per_epoch
if config.ckpt_interval <= 0:
config.ckpt_interval = config.steps_per_epoch
lr = get_lr(args)
lr = get_lr(config)
opt = Momentum(params=get_param_groups(network),
learning_rate=Tensor(lr),
momentum=args.momentum,
weight_decay=args.weight_decay,
loss_scale=args.loss_scale)
momentum=config.momentum,
weight_decay=config.weight_decay,
loss_scale=config.loss_scale)
is_gpu = context.get_context("device_target") == "GPU"
if is_gpu:
loss_scale_value = 1.0
@ -224,18 +212,18 @@ def train():
level="O2", keep_batchnorm_fp32=False)
keep_loss_fp32(network)
else:
network = TrainingWrapper(network, opt, sens=args.loss_scale)
network = TrainingWrapper(network, opt, sens=config.loss_scale)
network.set_train()
if args.rank_save_ckpt_flag:
if config.rank_save_ckpt_flag:
# checkpoint save
ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
ckpt_max_num = config.max_epoch * config.steps_per_epoch // config.ckpt_interval
ckpt_config = CheckpointConfig(save_checkpoint_steps=config.ckpt_interval,
keep_checkpoint_max=ckpt_max_num)
save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/')
save_ckpt_path = os.path.join(config.outputs_dir, 'ckpt_' + str(config.rank) + '/')
ckpt_cb = ModelCheckpoint(config=ckpt_config,
directory=save_ckpt_path,
prefix='{}'.format(args.rank))
prefix='{}'.format(config.rank))
cb_params = _InternalCallbackParam()
cb_params.train_network = network
cb_params.epoch_num = ckpt_max_num
@ -250,7 +238,7 @@ def train():
for i, data in enumerate(data_loader):
images = data["image"]
input_shape = images.shape[2:4]
args.logger.info('iter[{}], shape{}'.format(i, input_shape[0]))
config.logger.info('iter[{}], shape{}'.format(i, input_shape[0]))
images = Tensor.from_numpy(images)
@ -265,34 +253,34 @@ def train():
batch_gt_box2)
loss_meter.update(loss.asnumpy())
if args.rank_save_ckpt_flag:
if config.rank_save_ckpt_flag:
# ckpt progress
cb_params.cur_step_num = i + 1 # current step number
cb_params.batch_num = i + 2
ckpt_cb.step_end(run_context)
if i % args.log_interval == 0:
if i % config.log_interval == 0:
time_used = time.time() - t_end
epoch = int(i / args.steps_per_epoch)
per_step_time = time_used/args.log_interval
fps = args.per_batch_size * (i - old_progress) * args.group_size / time_used
if args.rank == 0:
args.logger.info(
epoch = int(i / config.steps_per_epoch)
per_step_time = time_used/config.log_interval
fps = config.per_batch_size * (i - old_progress) * config.group_size / time_used
if config.rank == 0:
config.logger.info(
'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{},'
' per_step_time:{}'.format(epoch, i, loss_meter, fps, lr[i], per_step_time))
t_end = time.time()
loss_meter.reset()
old_progress = i
if (i + 1) % args.steps_per_epoch == 0 and args.rank_save_ckpt_flag:
if (i + 1) % config.steps_per_epoch == 0 and config.rank_save_ckpt_flag:
cb_params.cur_epoch_num += 1
if args.need_profiler:
if config.need_profiler:
if i == 10:
profiler.analyse()
break
args.logger.info('==========end training===============')
config.logger.info('==========end training===============')
if __name__ == "__main__":
train()
run_train()

View File

@ -226,11 +226,11 @@ def test_yolov3_darknet_8p():
cur_model_path = os.path.join(cur_path, model_name)
train_file = os.path.join(cur_model_path, "train.py")
old_list = ["--lr_scheduler=cosine_annealing"]
new_list = ["--lr_scheduler=cosine_annealing --training_shape=416"]
new_list = ["--lr_scheduler=cosine_annealing --training_shape=416 --log_interval=10"]
utils.exec_sed_command(old_list, new_list,
os.path.join(cur_model_path, "scripts/run_distribute_train.sh"))
old_list = ["default=100", "max_epoch=args.max_epoch"]
new_list = ["default=10", "max_epoch=1"]
old_list = ["max_epoch=config.max_epoch"]
new_list = ["max_epoch=1"]
utils.exec_sed_command(old_list, new_list, train_file)
old_list = ["sampler=distributed_sampler"]
new_list = ["sampler=distributed_sampler, num_samples=100*batch_size"]