modify FaceDetection net for cloud

This commit is contained in:
zhanghuiyao 2021-05-22 10:02:09 +08:00
parent 659fb1dbbb
commit b62a3f9116
19 changed files with 702 additions and 260 deletions

View File

@ -84,7 +84,7 @@ Dataset used: [COCO2014](https://cocodataset.org/#download)
- The pretrained backbone can be produced with src/convert_weight.py, which converts darknet53.conv.74 to a MindSpore ckpt.
```
python convert_weight.py --input_file ./darknet53.conv.74
python src/convert_weight.py --input_file ./darknet53.conv.74
```
darknet53.conv.74 can be obtained from [download](https://pjreddie.com/media/files/darknet53.conv.74).
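Since the conversion script now reads its options through model_utils/config.py, the output checkpoint path can also be overridden on the command line. A minimal invocation sketch, assuming the `input_file`/`output_file` options added to default_config.yaml in this commit:
```
python src/convert_weight.py --input_file ./darknet53.conv.74 --output_file ./backbone_darknet53.ckpt
```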

View File

@ -88,7 +88,7 @@ YOLOv3 uses DarkNet53 for feature extraction, a hybrid of Darknet-19 from YOLOv2 and residual
- Use the convert_weight.py script under the src directory to convert darknet53.conv.74 into MindSpore ckpt format.
```command
python convert_weight.py --input_file ./darknet53.conv.74
python src/convert_weight.py --input_file ./darknet53.conv.74
```
The darknet53.conv.74 file can be [downloaded](https://pjreddie.com/media/files/darknet53.conv.74) from the website.

View File

@ -75,6 +75,10 @@ file_name: "yolov3_darknet53"
file_format: "AIR" # ["AIR", "ONNX", "MINDIR"]
# convert weight option
input_file: "./darknet53.conv.74"
output_file: "./backbone_darknet53.ckpt"
# Other default config
hue: 0.1
saturation: 1.5
@ -165,4 +169,8 @@ batch_size: "batch size"
ckpt_file: "Checkpoint file path."
file_name: "output file name."
file_format: "file format choices in ['AIR', 'ONNX', 'MINDIR']"
device_target: "device target. choices in ['Ascend', 'GPU'] for train. choices in ['Ascend', 'GPU', 'CPU'] for export."
# convert weight option
input_file: "input file path."
output_file: "output file path."

View File

@ -14,12 +14,12 @@
# ============================================================================
"""Convert weight to mindspore ckpt."""
import os
import argparse
import numpy as np
from mindspore.train.serialization import save_checkpoint
from mindspore import Tensor
from src.yolo import YOLOV3DarkNet53
from model_utils.config import config
def load_weight(weights_file):
"""Loads pre-trained weights."""
@ -72,9 +72,4 @@ def convert(weights_file, output_file):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="yolov3 weight convert.")
parser.add_argument("--input_file", type=str, default="./darknet53.conv.74", help="input file path.")
parser.add_argument("--output_file", type=str, default="./backbone_darknet53.ckpt", help="output file path.")
args_opt = parser.parse_args()
convert(args_opt.input_file, args_opt.output_file)
convert(config.input_file, config.output_file)

View File

@ -83,10 +83,16 @@ We use about 13K images as training dataset and 3K as evaluating dataset in this
The entire code structure is as follows:
```python
```text
.
└─ Face Detection
├─ README.md
├─ model_utils
├─ __init__.py # init file
├─ config.py # Parse arguments
├─ device_adapter.py # Device adapter for ModelArts
├─ local_adapter.py # Local adapter
└─ moxing_adapter.py # Moxing adapter for ModelArts
├─ scripts
├─ run_standalone_train.sh # launch standalone training(1p) in ascend
├─ run_distribute_train.sh # launch distributed training(8p) in ascend
@ -98,7 +104,6 @@ The entire code structure is as following:
├─ yolo_loss.py # loss function
├─ yolo_postprocess.py # post process
└─ yolov3.py # network
├─ config.py # parameter configuration
├─ data_preprocess.py # preprocess
├─ logging.py # log function
├─ lrsche_factory.py # generate learning rate
@ -107,6 +112,7 @@ The entire code structure is as following:
├─ data_to_mindrecord_train.py # convert dataset to mindrecord for training
├─ data_to_mindrecord_train_append.py # add dataset to an existing mindrecord for training
└─ data_to_mindrecord_eval.py # convert dataset to mindrecord for evaluating
├─ default_config.yaml # default configurations
├─ train.py # training scripts
├─ eval.py # evaluation scripts
└─ export.py # export air model
@ -158,20 +164,84 @@ The entire code structure is as follows:
bash run_distribute_train.sh /home/train.mindrecord ./rank_table_8p.json /home/a.ckpt
```
*Distributed mode doesn't support running on CPU*. You will get the loss value of each step as follows in "./output/[TIME]/[TIME].log" or "./scripts/device0/train.log":
*Distributed mode doesn't support running on CPU*. You will get the loss value of each step as follows in "./scripts/device0/output/[TIME]/[TIME].log" or "./scripts/device0/train.log":
```python
rank[0], iter[0], loss[318555.8], overflow:False, loss_scale:1024.0, lr:6.24999984211172e-06, batch_images:(64, 3, 448, 768), batch_labels:(64, 200, 6)
rank[0], iter[1], loss[95394.28], overflow:True, loss_scale:1024.0, lr:6.24999984211172e-06, batch_images:(64, 3, 448, 768), batch_labels:(64, 200, 6)
rank[0], iter[2], loss[81332.92], overflow:True, loss_scale:512.0, lr:6.24999984211172e-06, batch_images:(64, 3, 448, 768), batch_labels:(64, 200, 6)
rank[0], iter[3], loss[27250.805], overflow:True, loss_scale:256.0, lr:6.24999984211172e-06, batch_images:(64, 3, 448, 768), batch_labels:(64, 200, 6)
...
rank[0], iter[62496], loss[2218.6282], overflow:False, loss_scale:256.0, lr:6.24999984211172e-06, batch_images:(64, 3, 448, 768), batch_labels:(64, 200, 6)
rank[0], iter[62497], loss[3788.5146], overflow:False, loss_scale:256.0, lr:6.24999984211172e-06, batch_images:(64, 3, 448, 768), batch_labels:(64, 200, 6)
rank[0], iter[62498], loss[3427.5479], overflow:False, loss_scale:256.0, lr:6.24999984211172e-06, batch_images:(64, 3, 448, 768), batch_labels:(64, 200, 6)
rank[0], iter[62499], loss[4294.194], overflow:False, loss_scale:256.0, lr:6.24999984211172e-06, batch_images:(64, 3, 448, 768), batch_labels:(64, 200, 6)
```
- Train on [ModelArts](https://support.huaweicloud.com/modelarts/)
```python
# Train 8p with Ascend
# (1) Perform a or b.
# a. Set "enable_modelarts=True" in default_config.yaml.
# Set "mindrecord_path='/cache/data/face_detect_dataset/mindrecord_train/data.mindrecord'" in default_config.yaml.
# (optional) Set "checkpoint_url='s3://dir_to_your_pretrain/'" in default_config.yaml.
# (optional) Set "pretrained='/cache/checkpoint_path/model.ckpt'" in default_config.yaml.
# Set any other parameters you need in default_config.yaml.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "mindrecord_path='/cache/data/face_detect_dataset/mindrecord_train/data.mindrecord'" on the website UI interface.
# (optional) Add "checkpoint_url='s3://dir_to_your_pretrain/'" on the website UI interface.
# (optional) Add "pretrained='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
# Add any other parameters on the website UI interface.
# (2) (optional) Upload or copy your pretrained model to the S3 bucket.
# (3) Upload a zip dataset to the S3 bucket. (You could also upload the original dataset, but that can be very slow.)
# (4) Set the code directory to "/path/FaceDetection" on the website UI interface.
# (5) Set the startup file to "train.py" on the website UI interface.
# (6) Set the "Dataset path", "Output file path" and "Job log path" to your own paths on the website UI interface.
# (7) Create your job.
#
# Train 1p with Ascend
# (1) Perform a or b.
# a. Set "enable_modelarts=True" in default_config.yaml.
# Set "run_platform='Ascend'" in default_config.yaml.
# Set "mindrecord_path='/cache/data/face_detect_dataset/mindrecord_train/data.mindrecord'" in default_config.yaml.
# (optional) Set "checkpoint_url='s3://dir_to_your_pretrain/'" in default_config.yaml.
# (optional) Set "pretrained='/cache/checkpoint_path/model.ckpt'" in default_config.yaml.
# Set any other parameters you need in default_config.yaml.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "run_platform='Ascend'" on the website UI interface.
# Add "mindrecord_path='/cache/data/face_detect_dataset/mindrecord_train/data.mindrecord'" on the website UI interface.
# (optional) Add "checkpoint_url='s3://dir_to_your_pretrain/'" on the website UI interface.
# (optional) Add "pretrained='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
# Add any other parameters on the website UI interface.
# (2) (optional) Upload or copy your pretrained model to the S3 bucket.
# (3) Upload a zip dataset to the S3 bucket. (You could also upload the original dataset, but that can be very slow.)
# (4) Set the code directory to "/path/FaceDetection" on the website UI interface.
# (5) Set the startup file to "train.py" on the website UI interface.
# (6) Set the "Dataset path", "Output file path" and "Job log path" to your own paths on the website UI interface.
# (7) Create your job.
#
# Eval 1p with Ascend
# (1) Perform a or b.
# a. Set "enable_modelarts=True" in default_config.yaml.
# Set "run_platform='Ascend'" in default_config.yaml.
# Set "mindrecord_path='/cache/data/face_detect_dataset/mindrecord_test/data.mindrecord'" in default_config.yaml.
# Set "checkpoint_url='s3://dir_to_your_pretrain/'" in default_config.yaml.
# Set "pretrained='/cache/checkpoint_path/model.ckpt'" in default_config.yaml.
# Set any other parameters you need in default_config.yaml.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "run_platform='Ascend'" on the website UI interface.
# Add "mindrecord_path='/cache/data/face_detect_dataset/mindrecord_test/data.mindrecord'" on the website UI interface.
# Add "checkpoint_url='s3://dir_to_your_pretrain/'" on the website UI interface.
# Add "pretrained='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
# Add any other parameters on the website UI interface.
# (2) Upload or copy your pretrained model to the S3 bucket.
# (3) Upload a zip dataset to the S3 bucket. (You could also upload the original dataset, but that can be very slow.)
# (4) Set the code directory to "/path/FaceDetection" on the website UI interface.
# (5) Set the startup file to "eval.py" on the website UI interface.
# (6) Set the "Dataset path", "Output file path" and "Job log path" to your own paths on the website UI interface.
# (7) Create your job.
```
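For reference, the ModelArts-related keys these steps refer to map onto default_config.yaml roughly as follows; a hedged sketch with illustrative values (the s3:// bucket path is a placeholder):
```yaml
enable_modelarts: True
need_modelarts_dataset_unzip: True
modelarts_dataset_unzip_name: "face_detect_dataset"
run_platform: "Ascend"
mindrecord_path: "/cache/data/face_detect_dataset/mindrecord_train/data.mindrecord"
checkpoint_url: "s3://dir_to_your_pretrain/"     # optional, remote pretrained model
pretrained: "/cache/checkpoint_path/model.ckpt"  # optional, local path after sync
```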
### Evaluation
@ -214,7 +284,7 @@ bash run_export.sh [PLATFORM] [BATCH_SIZE] [USE_DEVICE_ID] [PRETRAINED_BACKBONE]
| Parameters | Face Detection |
| -------------------------- | ----------------------------------------------------------- |
| Model Version | V1 |
| Resource | Ascend 910; CPU 2.60GHz, 192 cores; Memory 755G; OS Euler2.8 |
| Uploaded Date | 09/30/2020 (month/day/year) |
| MindSpore Version | 1.0.0 |
| Dataset | 13K images |
@ -231,7 +301,7 @@ bash run_export.sh [PLATFORM] [BATCH_SIZE] [USE_DEVICE_ID] [PRETRAINED_BACKBONE]
| Parameters | Face Detection |
| ------------------- | --------------------------- |
| Model Version | V1 |
| Resource | Ascend 910; OS Euler2.8 |
| Uploaded Date | 09/30/2020 (month/day/year) |
| MindSpore Version | 1.0.0 |
| Dataset | 3K images |

View File

@ -0,0 +1,69 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
need_modelarts_dataset_unzip: True
modelarts_dataset_unzip_name: "face_detect_dataset"
# ==============================================================================
# train options
run_platform: "Ascend" # choices in ("Ascend", "CPU")
mindrecord_path: ""
pretrained: ""
use_loss_scale: True
# default options
batch_size: 64
warmup_lr: 0.0004
lr_rates: [0.002, 0.004, 0.002, 0.0008, 0.0004, 0.0002, 0.00008, 0.00004, 0.000004]
lr_steps: [1000, 10000, 40000, 60000, 80000, 100000, 130000, 160000, 190000]
gamma: 0.5
weight_decay: 0.0005
momentum: 0.5
max_epoch: 2500
log_interval: 10
ckpt_path: "../../output"
ckpt_interval: 1000
result_path: "../../results"
input_shape: [768, 448]
jitter: 0.3
flip: 0.5
hue: 0.1
sat: 1.5
val: 1.5
num_classes: 1
anchors: [[3, 4],
[5, 6],
[7, 9],
[10, 13],
[15, 19],
[21, 26],
[28, 36],
[38, 49],
[54, 71],
[77, 102],
[122, 162],
[207, 268]]
anchors_mask: [[8, 9, 10, 11], [4, 5, 6, 7], [0, 1, 2, 3]]
conf_thresh: 0.1
nms_thresh: 0.45
---
# Help description for each configuration
# train options
run_platform: "run platform, support Ascend and CPU."
mindrecord_path: "dataset path, e.g. /home/data.mindrecord"
pretrained: "pretrained model to load"
local_rank: "current rank to support distributed"
use_loss_scale: "Whether use dynamic loss scale, default is True."
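Because model_utils/config.py registers every scalar key of this file as a command-line flag, the values above can also be overridden at launch time; list-valued keys such as `lr_steps` or `anchors` can only be changed in the yaml itself. A minimal sketch, assuming the defaults above:
```
python train.py --run_platform=Ascend --mindrecord_path=/home/data.mindrecord --pretrained=/home/a.ckpt --batch_size=64
```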

View File

@ -14,7 +14,7 @@
# ============================================================================
"""Face detection eval."""
import os
import argparse
import time
import matplotlib.pyplot as plt
from mindspore import context
@ -24,50 +24,104 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.common import dtype as mstype
import mindspore.dataset as de
from src.data_preprocess import SingleScaleTrans
from src.config import config
from src.FaceDetection.yolov3 import HwYolov3 as backbone_HwYolov3
from src.FaceDetection import voc_wrapper
from src.network_define import BuildTestNetwork, get_bounding_boxes, tensor_to_brambox, \
parse_gt_from_anno, parse_rets, calc_recall_precision_ap
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
plt.switch_backend('agg')
def parse_args():
'''parse_args'''
parser = argparse.ArgumentParser('Yolov3 Face Detection')
parser.add_argument("--run_platform", type=str, default="Ascend", choices=("Ascend", "CPU"),
help="run platform, support Ascend and CPU.")
parser.add_argument('--mindrecord_path', type=str, default='', help='dataset path, e.g. /home/data.mindrecord')
parser.add_argument('--pretrained', type=str, default='', help='pretrained model to load')
parser.add_argument('--local_rank', type=int, default=0, help='current rank to support distributed')
parser.add_argument('--world_size', type=int, default=1, help='current process number to support distributed')
def load_pretrain(net, cfg):
'''load pretrain model'''
if os.path.isfile(cfg.pretrained):
param_dict = load_checkpoint(cfg.pretrained)
param_dict_new = {}
for key, values in param_dict.items():
if key.startswith('moments.'):
continue
elif key.startswith('network.'):
param_dict_new[key[8:]] = values
else:
param_dict_new[key] = values
load_param_into_net(net, param_dict_new)
print('load model {} success'.format(cfg.pretrained))
else:
print('load model {} failed, please check the path of model, evaluating end'.format(cfg.pretrained))
exit(0)
arg, _ = parser.parse_known_args()
return net
return arg
def modelarts_pre_process():
'''modelarts pre process function.'''
def unzip(zip_file, save_dir):
import zipfile
s_time = time.time()
if not os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)):
zip_isexist = zipfile.is_zipfile(zip_file)
if zip_isexist:
fz = zipfile.ZipFile(zip_file, 'r')
data_num = len(fz.namelist())
print("Extract Start...")
print("unzip file num: {}".format(data_num))
data_print = int(data_num / 100) if data_num > 100 else 1
i = 0
for file in fz.namelist():
if i % data_print == 0:
print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
i += 1
fz.extract(file, save_dir)
print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
int(int(time.time() - s_time) % 60)))
print("Extract Done.")
else:
print("This is not zip.")
else:
print("Zip has been extracted.")
if config.need_modelarts_dataset_unzip:
zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip")
save_dir_1 = os.path.join(config.data_path)
sync_lock = "/tmp/unzip_sync.lock"
# Each server contains at most 8 devices.
if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
print("Zip file path: ", zip_file_1)
print("Unzip file save dir: ", save_dir_1)
unzip(zip_file_1, save_dir_1)
print("===Finish extract data synchronization===")
try:
os.mknod(sync_lock)
except IOError:
pass
while True:
if os.path.exists(sync_lock):
break
time.sleep(1)
print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))
config.result_path = os.path.join(config.output_path, "results")
if __name__ == "__main__":
args = parse_args()
devid = int(os.getenv('DEVICE_ID', '0')) if args.run_platform != 'CPU' else 0
context.set_context(mode=context.GRAPH_MODE, device_target=args.run_platform, save_graphs=False, device_id=devid)
@moxing_wrapper(pre_process=modelarts_pre_process)
def run_eval():
'''run eval'''
config.world_size = get_device_num()
config.local_rank = get_rank_id()
devid = get_device_id() if config.run_platform != 'CPU' else 0
context.set_context(mode=context.GRAPH_MODE, device_target=config.run_platform, save_graphs=False, device_id=devid)
print('=============yolov3 start evaluating==================')
# logger
args.batch_size = config.batch_size
args.input_shape = config.input_shape
args.result_path = config.result_path
args.conf_thresh = config.conf_thresh
args.nms_thresh = config.nms_thresh
context.set_auto_parallel_context(parallel_mode=ParallelMode.STAND_ALONE, device_num=args.world_size,
context.set_auto_parallel_context(parallel_mode=ParallelMode.STAND_ALONE, device_num=config.world_size,
gradients_mean=True)
mindrecord_path = args.mindrecord_path
print('Loading data from {}'.format(mindrecord_path))
num_classes = config.num_classes
if num_classes > 1:
@ -84,34 +138,18 @@ if __name__ == "__main__":
classes = {0: 'face'}
# dataloader
ds = de.MindDataset(mindrecord_path + "0", columns_list=["image", "annotation", "image_name", "image_size"])
print('Loading data from {}'.format(config.mindrecord_path))
ds = de.MindDataset(config.mindrecord_path + "0", columns_list=["image", "annotation", "image_name", "image_size"])
single_scale_trans = SingleScaleTrans(resize=args.input_shape)
ds = ds.batch(args.batch_size, per_batch_map=single_scale_trans,
single_scale_trans = SingleScaleTrans(resize=config.input_shape)
ds = ds.batch(config.batch_size, per_batch_map=single_scale_trans,
input_columns=["image", "annotation", "image_name", "image_size"], num_parallel_workers=8)
args.steps_per_epoch = ds.get_dataset_size()
config.steps_per_epoch = ds.get_dataset_size()
# backbone
network = backbone_HwYolov3(num_classes, num_anchors_list, args)
# load pretrain model
if os.path.isfile(args.pretrained):
param_dict = load_checkpoint(args.pretrained)
param_dict_new = {}
for key, values in param_dict.items():
if key.startswith('moments.'):
continue
elif key.startswith('network.'):
param_dict_new[key[8:]] = values
else:
param_dict_new[key] = values
load_param_into_net(network, param_dict_new)
print('load model {} success'.format(args.pretrained))
else:
print('load model {} failed, please check the path of model, evaluating end'.format(args.pretrained))
exit(0)
network = backbone_HwYolov3(num_classes, num_anchors_list, config)
network = load_pretrain(network, config)
ds = ds.repeat(1)
@ -119,30 +157,25 @@ if __name__ == "__main__":
img_size = {}
img_anno = {}
model_name = args.pretrained.split('/')[-1].replace('.ckpt', '')
result_path = os.path.join(args.result_path, model_name)
model_name = config.pretrained.split('/')[-1].replace('.ckpt', '')
result_path = os.path.join(config.result_path, model_name)
if os.path.exists(result_path):
pass
if not os.path.isdir(result_path):
os.makedirs(result_path, exist_ok=True)
# result file
ret_files_set = {
'face': os.path.join(result_path, 'comp4_det_test_face_rm5050.txt'),
}
ret_files_set = {'face': os.path.join(result_path, 'comp4_det_test_face_rm5050.txt')}
test_net = BuildTestNetwork(network, reduction_0, reduction_1, reduction_2, anchors, anchors_mask, num_classes,
args)
config)
print('conf_thresh:', args.conf_thresh)
print('conf_thresh:', config.conf_thresh)
eval_times = 0
for data in ds.create_tuple_iterator(output_numpy=True):
batch_images = data[0]
batch_labels = data[1]
batch_image_name = data[2]
batch_image_size = data[3]
batch_images, batch_labels, batch_image_name, batch_image_size = data[0:4]
eval_times += 1
img_tensor = Tensor(batch_images, mstype.float32)
@ -153,11 +186,11 @@ if __name__ == "__main__":
coords_0, cls_scores_0, coords_1, cls_scores_1, coords_2, cls_scores_2 = test_net(img_tensor)
boxes_0, boxes_1, boxes_2 = get_bounding_boxes(coords_0, cls_scores_0, coords_1, cls_scores_1, coords_2,
cls_scores_2, args.conf_thresh, args.input_shape,
cls_scores_2, config.conf_thresh, config.input_shape,
num_classes)
converted_boxes_0, converted_boxes_1, converted_boxes_2 = tensor_to_brambox(boxes_0, boxes_1, boxes_2,
args.input_shape, labels)
config.input_shape, labels)
tdets.append(converted_boxes_0)
tdets.append(converted_boxes_1)
@ -175,11 +208,11 @@ if __name__ == "__main__":
img_anno.update({batch_image_name[k].decode('UTF-8'): v for k, v in enumerate(batch_labels)})
print('eval times:', eval_times)
print('batch size: ', args.batch_size)
print('batch size: ', config.batch_size)
netw, neth = args.input_shape
netw, neth = config.input_shape
reorg_dets = voc_wrapper.reorg_detection(det, netw, neth, img_size)
voc_wrapper.gen_results(reorg_dets, result_path, img_size, args.nms_thresh)
voc_wrapper.gen_results(reorg_dets, result_path, img_size, config.nms_thresh)
# compute mAP
ground_truth = parse_gt_from_anno(img_anno, classes)
@ -208,3 +241,6 @@ if __name__ == "__main__":
plt.savefig(ap_save_path)
print('=============yolov3 evaluating finished==================')
if __name__ == "__main__":
run_eval()

View File

@ -14,7 +14,6 @@
# ============================================================================
"""Convert ckpt to air."""
import os
import argparse
import numpy as np
from mindspore import context
@ -22,22 +21,22 @@ from mindspore import Tensor
from mindspore.train.serialization import export, load_checkpoint, load_param_into_net
from src.FaceDetection.yolov3 import HwYolov3 as backbone_HwYolov3
from src.config import config
from model_utils.config import config
def save_air(args):
def save_air():
'''save air'''
print('============= yolov3 start save air ==================')
devid = int(os.getenv('DEVICE_ID', '0')) if args.run_platform != 'CPU' else 0
context.set_context(mode=context.GRAPH_MODE, device_target=args.run_platform, save_graphs=False, device_id=devid)
devid = int(os.getenv('DEVICE_ID', '0')) if config.run_platform != 'CPU' else 0
context.set_context(mode=context.GRAPH_MODE, device_target=config.run_platform, save_graphs=False, device_id=devid)
num_classes = config.num_classes
anchors_mask = config.anchors_mask
num_anchors_list = [len(x) for x in anchors_mask]
network = backbone_HwYolov3(num_classes, num_anchors_list, args)
network = backbone_HwYolov3(num_classes, num_anchors_list, config)
if os.path.isfile(args.pretrained):
param_dict = load_checkpoint(args.pretrained)
if os.path.isfile(config.pretrained):
param_dict = load_checkpoint(config.pretrained)
param_dict_new = {}
for key, values in param_dict.items():
if key.startswith('moments.'):
@ -47,23 +46,16 @@ def save_air(args):
else:
param_dict_new[key] = values
load_param_into_net(network, param_dict_new)
print('load model {} success'.format(args.pretrained))
print('load model {} success'.format(config.pretrained))
input_data = np.random.uniform(low=0, high=1.0, size=(args.batch_size, 3, 448, 768)).astype(np.float32)
input_data = np.random.uniform(low=0, high=1.0, size=(config.batch_size, 3, 448, 768)).astype(np.float32)
tensor_input_data = Tensor(input_data)
export(network, tensor_input_data,
file_name=args.pretrained.replace('.ckpt', '_' + str(args.batch_size) + 'b.air'), file_format='AIR')
file_name=config.pretrained.replace('.ckpt', '_' + str(config.batch_size) + 'b.air'), file_format='AIR')
print("export model success.")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Convert ckpt to air')
parser.add_argument("--run_platform", type=str, default="Ascend", choices=("Ascend", "CPU"),
help="run platform, support Ascend and CPU.")
parser.add_argument('--pretrained', type=str, default='', help='pretrained model to load')
parser.add_argument('--batch_size', type=int, default=8, help='batch size')
arg = parser.parse_args()
save_air(arg)
save_air()

View File

@ -0,0 +1,126 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Parse arguments"""
import os
import ast
import argparse
from pprint import pformat
import yaml
class Config:
"""
Configuration namespace. Convert dictionary to members.
"""
def __init__(self, cfg_dict):
for k, v in cfg_dict.items():
if isinstance(v, (list, tuple)):
setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v])
else:
setattr(self, k, Config(v) if isinstance(v, dict) else v)
def __str__(self):
return pformat(self.__dict__)
def __repr__(self):
return self.__str__()
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
"""
Parse command line arguments to the configuration according to the default yaml.
Args:
parser: Parent parser.
cfg: Base configuration.
helper: Helper description.
cfg_path: Path to the default yaml config.
"""
parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
parents=[parser])
helper = {} if helper is None else helper
choices = {} if choices is None else choices
for item in cfg:
if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict):
help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path)
choice = choices[item] if item in choices else None
if isinstance(cfg[item], bool):
parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice,
help=help_description)
else:
parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice,
help=help_description)
args = parser.parse_args()
return args
def parse_yaml(yaml_path):
"""
Parse the yaml config file.
Args:
yaml_path: Path to the yaml config.
"""
with open(yaml_path, 'r') as fin:
try:
cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader)
cfgs = [x for x in cfgs]
if len(cfgs) == 1:
cfg_helper = {}
cfg = cfgs[0]
cfg_choices = {}
elif len(cfgs) == 2:
cfg, cfg_helper = cfgs
cfg_choices = {}
elif len(cfgs) == 3:
cfg, cfg_helper, cfg_choices = cfgs
else:
raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")
print(cfg_helper)
except yaml.YAMLError:
raise ValueError("Failed to parse yaml")
return cfg, cfg_helper, cfg_choices
def merge(args, cfg):
"""
Merge the base config from yaml file and command line arguments.
Args:
args: Command line arguments.
cfg: Base configuration.
"""
args_var = vars(args)
for item in args_var:
cfg[item] = args_var[item]
return cfg
def get_config():
"""
Get Config according to the yaml file and cli arguments.
"""
parser = argparse.ArgumentParser(description="default name", add_help=False)
current_dir = os.path.dirname(os.path.abspath(__file__))
parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../default_config.yaml"),
help="Config file path")
path_args, _ = parser.parse_known_args()
default, helper, choices = parse_yaml(path_args.config_path)
args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path)
final_config = merge(args, default)
return Config(final_config)
config = get_config()
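Downstream scripts then just import the ready-built `config` object instead of defining their own argparse parsers; a minimal usage sketch, assuming the repository layout of this commit (default_config.yaml one level above model_utils):
```python
from model_utils.config import config

# Scalar values come from default_config.yaml or a --flag override.
print(config.batch_size)    # -> 64 by default
# List values stay plain Python lists.
print(config.anchors_mask)  # -> [[8, 9, 10, 11], [4, 5, 6, 7], [0, 1, 2, 3]]
```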

View File

@ -0,0 +1,27 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Device adapter for ModelArts"""
from .config import config
if config.enable_modelarts:
from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
__all__ = [
"get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]

View File

@ -0,0 +1,36 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Local adapter"""
import os
def get_device_id():
device_id = os.getenv('DEVICE_ID', '0')
return int(device_id)
def get_device_num():
device_num = os.getenv('RANK_SIZE', '1')
return int(device_num)
def get_rank_id():
global_rank_id = os.getenv('RANK_ID', '0')
return int(global_rank_id)
def get_job_id():
return "Local Job"

View File

@ -0,0 +1,116 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Moxing adapter for ModelArts"""
import os
import functools
from mindspore import context
from .config import config
_global_sync_count = 0
def get_device_id():
device_id = os.getenv('DEVICE_ID', '0')
return int(device_id)
def get_device_num():
device_num = os.getenv('RANK_SIZE', '1')
return int(device_num)
def get_rank_id():
global_rank_id = os.getenv('RANK_ID', '0')
return int(global_rank_id)
def get_job_id():
job_id = os.getenv('JOB_ID')
job_id = job_id if job_id else "default"
return job_id
def sync_data(from_path, to_path):
"""
Download data from remote OBS to a local directory if the first path is a remote URL and the second is a local path.
Upload data from a local directory to remote OBS in the reverse case.
"""
import moxing as mox
import time
global _global_sync_count
sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
_global_sync_count += 1
# Each server contains at most 8 devices.
if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
print("from path: ", from_path)
print("to path: ", to_path)
mox.file.copy_parallel(from_path, to_path)
print("===finish data synchronization===")
try:
os.mknod(sync_lock)
except IOError:
pass
print("===save flag===")
while True:
if os.path.exists(sync_lock):
break
time.sleep(1)
print("Finish sync data from {} to {}.".format(from_path, to_path))
def moxing_wrapper(pre_process=None, post_process=None):
"""
Moxing wrapper to download dataset and upload outputs.
"""
def wrapper(run_func):
@functools.wraps(run_func)
def wrapped_func(*args, **kwargs):
# Download data from data_url
if config.enable_modelarts:
if config.data_url:
sync_data(config.data_url, config.data_path)
print("Dataset downloaded: ", os.listdir(config.data_path))
if config.checkpoint_url:
sync_data(config.checkpoint_url, config.load_path)
print("Preload downloaded: ", os.listdir(config.load_path))
if config.train_url:
sync_data(config.train_url, config.output_path)
print("Workspace downloaded: ", os.listdir(config.output_path))
context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
config.device_num = get_device_num()
config.device_id = get_device_id()
if not os.path.exists(config.output_path):
os.makedirs(config.output_path)
if pre_process:
pre_process()
# Run the main function
run_func(*args, **kwargs)
# Upload data to train_url
if config.enable_modelarts:
if post_process:
post_process()
if config.train_url:
print("Start to copy output directory")
sync_data(config.output_path, config.train_url)
return wrapped_func
return wrapper
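The wrapper is consumed the same way eval.py and train.py in this commit use it; a minimal sketch, assuming the ModelArts keys (`enable_modelarts`, `data_url`, `train_url`) are set in the config:
```python
from model_utils.moxing_adapter import moxing_wrapper

def prepare():
    # Optional pre-processing, e.g. unzipping the dataset copied to config.data_path.
    pass

@moxing_wrapper(pre_process=prepare)
def main():
    # Training/evaluation body: inputs have already been synced from OBS to the
    # local cache, and anything written to config.output_path is uploaded after.
    pass

if __name__ == "__main__":
    main()
```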

View File

@ -60,10 +60,10 @@ echo $PRETRAINED_BACKBONE
echo 'start evaluating'
export RANK_ID=0
rm -rf ${current_exec_path}/device$USE_DEVICE_ID
rm -rf ${current_exec_path}/eval
echo 'start device '$USE_DEVICE_ID
mkdir ${current_exec_path}/device$USE_DEVICE_ID
cd ${current_exec_path}/device$USE_DEVICE_ID || exit
mkdir ${current_exec_path}/eval
cd ${current_exec_path}/eval || exit
dev=`expr $USE_DEVICE_ID + 0`
export DEVICE_ID=$dev
python ${dirname_path}/${SCRIPT_NAME} \

View File

@ -73,7 +73,6 @@ dev=`expr $USE_DEVICE_ID + 0`
export DEVICE_ID=$dev
python ${dirname_path}/${SCRIPT_NAME} \
--run_platform=$PLATFORM \
--world_size=1 \
--mindrecord_path=$MINDRECORD_FILE \
--pretrained=$PRETRAINED_BACKBONE > train.log 2>&1 &

View File

@ -1,58 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===========================================================================
"""Network config setting, will be used in train.py and eval.py"""
from easydict import EasyDict as ed
config = ed({
'batch_size': 64,
'warmup_lr': 0.0004,
'lr_rates': [0.002, 0.004, 0.002, 0.0008, 0.0004, 0.0002, 0.00008, 0.00004, 0.000004],
'lr_steps': [1000, 10000, 40000, 60000, 80000, 100000, 130000, 160000, 190000],
'gamma': 0.5,
'weight_decay': 0.0005,
'momentum': 0.5,
'max_epoch': 2500,
'log_interval': 10,
'ckpt_path': '../../output',
'ckpt_interval': 1000,
'result_path': '../../results',
'input_shape': [768, 448],
'jitter': 0.3,
'flip': 0.5,
'hue': 0.1,
'sat': 1.5,
'val': 1.5,
'num_classes': 1,
'anchors': [
[3, 4],
[5, 6],
[7, 9],
[10, 13],
[15, 19],
[21, 26],
[28, 36],
[38, 49],
[54, 71],
[77, 102],
[122, 162],
[207, 268],
],
'anchors_mask': [(8, 9, 10, 11), (4, 5, 6, 7), (0, 1, 2, 3)],
'conf_thresh': 0.1,
'nms_thresh': 0.45,
})

View File

@ -19,7 +19,7 @@ import mindspore.dataset.vision.py_transforms as P
import mindspore.dataset as de
from src.transforms import RandomCropLetterbox, RandomFlip, HSVShift, ResizeLetterbox
from src.config import config
from model_utils.config import config
class SingleScaleTrans:

View File

@ -14,16 +14,14 @@
# ============================================================================
"""Face detection train."""
import os
import ast
import time
import datetime
import argparse
import numpy as np
from mindspore import context
from mindspore.train.loss_scale_manager import DynamicLossScaleManager
from mindspore import Tensor
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.communication.management import init
from mindspore.context import ParallelMode
from mindspore.train.callback import ModelCheckpoint, RunContext
from mindspore.train.callback import _InternalCallbackParam, CheckpointConfig
@ -31,75 +29,104 @@ from mindspore.common import dtype as mstype
from src.logging import get_logger
from src.data_preprocess import create_dataset
from src.config import config
from src.network_define import define_network
def parse_args():
'''parse_args'''
parser = argparse.ArgumentParser('Yolov3 Face Detection')
parser.add_argument("--run_platform", type=str, default="Ascend", choices=("Ascend", "CPU"),
help="run platform, support Ascend and CPU.")
parser.add_argument('--mindrecord_path', type=str, default='', help='dataset path, e.g. /home/data.mindrecord')
parser.add_argument('--pretrained', type=str, default='', help='pretrained model to load')
parser.add_argument('--local_rank', type=int, default=0, help='current rank to support distributed')
parser.add_argument('--world_size', type=int, default=8, help='current process number to support distributed')
parser.add_argument("--use_loss_scale", type=ast.literal_eval, default=True,
help="Whether use dynamic loss scale, default is True.")
args, _ = parser.parse_known_args()
args.batch_size = config.batch_size
args.warmup_lr = config.warmup_lr
args.lr_rates = config.lr_rates
if args.run_platform == "CPU":
args.use_loss_scale = False
args.world_size = 1
args.local_rank = 0
if args.world_size != 8:
args.lr_steps = [i * 8 // args.world_size for i in config.lr_steps]
else:
args.lr_steps = config.lr_steps
args.gamma = config.gamma
args.weight_decay = config.weight_decay if args.world_size != 1 else 0.
args.momentum = config.momentum
args.max_epoch = config.max_epoch
args.log_interval = config.log_interval
args.ckpt_path = config.ckpt_path
args.ckpt_interval = config.ckpt_interval
args.outputs_dir = os.path.join(args.ckpt_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
print('args.outputs_dir', args.outputs_dir)
args.num_classes = config.num_classes
args.anchors = config.anchors
args.anchors_mask = config.anchors_mask
args.num_anchors_list = [len(x) for x in args.anchors_mask]
return args
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
def train(args):
def modelarts_pre_process():
'''modelarts pre process function.'''
def unzip(zip_file, save_dir):
import zipfile
s_time = time.time()
if not os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)):
zip_isexist = zipfile.is_zipfile(zip_file)
if zip_isexist:
fz = zipfile.ZipFile(zip_file, 'r')
data_num = len(fz.namelist())
print("Extract Start...")
print("unzip file num: {}".format(data_num))
data_print = int(data_num / 100) if data_num > 100 else 1
i = 0
for file in fz.namelist():
if i % data_print == 0:
print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
i += 1
fz.extract(file, save_dir)
print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
int(int(time.time() - s_time) % 60)))
print("Extract Done.")
else:
print("This is not zip.")
else:
print("Zip has been extracted.")
if config.need_modelarts_dataset_unzip:
zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip")
save_dir_1 = os.path.join(config.data_path)
sync_lock = "/tmp/unzip_sync.lock"
# Each server contains at most 8 devices.
if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
print("Zip file path: ", zip_file_1)
print("Unzip file save dir: ", save_dir_1)
unzip(zip_file_1, save_dir_1)
print("===Finish extract data synchronization===")
try:
os.mknod(sync_lock)
except IOError:
pass
while True:
if os.path.exists(sync_lock):
break
time.sleep(1)
print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))
config.ckpt_path = os.path.join(config.output_path, "output")
@moxing_wrapper(pre_process=modelarts_pre_process)
def run_train():
'''train'''
config.world_size = get_device_num()
config.local_rank = get_rank_id()
if config.run_platform == "CPU":
config.use_loss_scale = False
config.world_size = 1
config.local_rank = 0
if config.world_size != 8:
config.lr_steps = [i * 8 // config.world_size for i in config.lr_steps]
config.weight_decay = config.weight_decay if config.world_size != 1 else 0.
config.outputs_dir = os.path.join(config.ckpt_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
print('config.outputs_dir', config.outputs_dir)
config.num_anchors_list = [len(x) for x in config.anchors_mask]
print('=============yolov3 start training==================')
devid = int(os.getenv('DEVICE_ID', '0')) if args.run_platform != 'CPU' else 0
context.set_context(mode=context.GRAPH_MODE, device_target=args.run_platform, save_graphs=False, device_id=devid)
devid = int(os.getenv('DEVICE_ID', '0')) if config.run_platform != 'CPU' else 0
context.set_context(mode=context.GRAPH_MODE, device_target=config.run_platform, save_graphs=False, device_id=devid)
# init distributed
if args.world_size != 1:
if config.world_size != 1:
init()
args.local_rank = get_rank()
args.world_size = get_group_size()
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, device_num=args.world_size,
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, device_num=config.world_size,
gradients_mean=True)
args.logger = get_logger(args.outputs_dir, args.local_rank)
config.logger = get_logger(config.outputs_dir, config.local_rank)
# dataloader
ds = create_dataset(args)
ds = create_dataset(config)
args.logger.important_info('start create network')
config.logger.important_info('start create network')
create_network_start = time.time()
train_net = define_network(args)
train_net = define_network(config)
# checkpoint
ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
train_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval, keep_checkpoint_max=ckpt_max_num)
ckpt_cb = ModelCheckpoint(config=train_config, directory=args.outputs_dir, prefix='{}'.format(args.local_rank))
ckpt_max_num = config.max_epoch * config.steps_per_epoch // config.ckpt_interval
train_config = CheckpointConfig(save_checkpoint_steps=config.ckpt_interval, keep_checkpoint_max=ckpt_max_num)
ckpt_cb = ModelCheckpoint(config=train_config, directory=config.outputs_dir, prefix='{}'.format(config.local_rank))
cb_params = _InternalCallbackParam()
cb_params.train_network = train_net
cb_params.epoch_num = ckpt_max_num
@ -112,7 +139,7 @@ def train(args):
t_epoch = time.time()
old_progress = -1
i = 0
if args.use_loss_scale:
if config.use_loss_scale:
scale_manager = DynamicLossScaleManager(init_loss_scale=2 ** 10, scale_factor=2, scale_window=2000)
for data in ds.create_tuple_iterator(output_numpy=True):
batch_images = data[0]
@ -120,7 +147,7 @@ def train(args):
input_list = [Tensor(batch_images, mstype.float32)]
for idx in range(2, 26):
input_list.append(Tensor(data[idx], mstype.float32))
if args.use_loss_scale:
if config.use_loss_scale:
scaling_sens = Tensor(scale_manager.get_loss_scale(), dtype=mstype.float32)
loss0, overflow, _ = train_net(*input_list, scaling_sens)
overflow = np.all(overflow.asnumpy())
@ -128,50 +155,49 @@ def train(args):
scale_manager.update_loss_scale(overflow)
else:
scale_manager.update_loss_scale(False)
args.logger.info('rank[{}], iter[{}], loss[{}], overflow:{}, loss_scale:{}, lr:{}, batch_images:{}, '
'batch_labels:{}'.format(args.local_rank, i, loss0, overflow, scaling_sens, args.lr[i],
batch_images.shape, batch_labels.shape))
config.logger.info('rank[{:d}], iter[{}], loss[{}], overflow:{}, loss_scale:{}, lr:{}, batch_images:{}, '
'batch_labels:{}'.format(config.local_rank, i, loss0, overflow, scaling_sens,
config.lr[i], batch_images.shape, batch_labels.shape))
else:
loss0 = train_net(*input_list)
args.logger.info('rank[{}], iter[{}], loss[{}], lr:{}, batch_images:{}, '
'batch_labels:{}'.format(args.local_rank, i, loss0, args.lr[i],
batch_images.shape, batch_labels.shape))
config.logger.info('rank[{:d}], iter[{}], loss[{}], lr:{}, batch_images:{}, '
'batch_labels:{}'.format(config.local_rank, i, loss0,
config.lr[i], batch_images.shape, batch_labels.shape))
# save ckpt
cb_params.cur_step_num = i + 1 # current step number
cb_params.batch_num = i + 2
if args.local_rank == 0:
if config.local_rank == 0:
ckpt_cb.step_end(run_context)
# save Log
if i == 0:
time_for_graph_compile = time.time() - create_network_start
args.logger.important_info('Yolov3, graph compile time={:.2f}s'.format(time_for_graph_compile))
config.logger.important_info('Yolov3, graph compile time={:.2f}s'.format(time_for_graph_compile))
if i % args.steps_per_epoch == 0:
if i % config.steps_per_epoch == 0:
cb_params.cur_epoch_num += 1
if i % args.log_interval == 0 and args.local_rank == 0:
if i % config.log_interval == 0 and config.local_rank == 0:
time_used = time.time() - t_end
epoch = int(i / args.steps_per_epoch)
fps = args.batch_size * (i - old_progress) * args.world_size / time_used
args.logger.info('epoch[{}], iter[{}], loss:[{}], {:.2f} imgs/sec'.format(epoch, i, loss0, fps))
epoch = int(i / config.steps_per_epoch)
fps = config.batch_size * (i - old_progress) * config.world_size / time_used
config.logger.info('epoch[{}], iter[{}], loss:[{}], {:.2f} imgs/sec'.format(epoch, i, loss0, fps))
t_end = time.time()
old_progress = i
if i % args.steps_per_epoch == 0 and args.local_rank == 0:
if i % config.steps_per_epoch == 0 and config.local_rank == 0:
epoch_time_used = time.time() - t_epoch
epoch = int(i / args.steps_per_epoch)
fps = args.batch_size * args.world_size * args.steps_per_epoch / epoch_time_used
args.logger.info('=================================================')
args.logger.info('epoch time: epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i, fps))
args.logger.info('=================================================')
epoch = int(i / config.steps_per_epoch)
fps = config.batch_size * config.world_size * config.steps_per_epoch / epoch_time_used
config.logger.info('=================================================')
config.logger.info('epoch time: epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i, fps))
config.logger.info('=================================================')
t_epoch = time.time()
i = i + 1
args.logger.info('=============yolov3 training finished==================')
config.logger.info('=============yolov3 training finished==================')
if __name__ == "__main__":
arg = parse_args()
train(arg)
run_train()

View File

@ -28,9 +28,9 @@ def test_FaceDetection_WIDER():
model_name = "FaceDetection"
utils.copy_files(model_path, cur_path, model_name)
cur_model_path = os.path.join(cur_path, model_name)
old_list = ["'max_epoch': 2500,"]
new_list = ["'max_epoch': 1,"]
utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "src/config.py"))
old_list = ["max_epoch: 2500"]
new_list = ["max_epoch: 1"]
utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "default_config.yaml"))
dataset_path = os.path.join(utils.data_root, "widerface/mindrecord_train/data.mindrecord")
device_id = int(os.environ.get("DEVICE_ID", "0"))
model_train_command = "cd {}/scripts;sh run_standalone_train.sh Ascend {} {}"\