!21095 fix network issue

Merge pull request !21095 from JichenZhao/net_issue_fix
Authored by i-robot on 2021-08-02 08:45:40 +00:00, committed via Gitee.
commit 60f66910eb
11 changed files with 90 additions and 109 deletions

View File

@ -122,7 +122,7 @@ batch_size: 2
loss_scale: 256
momentum: 0.91
weight_decay: 0.00001
epoch_size: 20
epoch_size: 5
save_checkpoint: True
save_checkpoint_epochs: 1
keep_checkpoint_max: 20

View File

@ -125,7 +125,7 @@ weight_decay: 0.00001
epoch_size: 20
save_checkpoint: True
save_checkpoint_epochs: 1
keep_checkpoint_max: 20
keep_checkpoint_max: 5
save_checkpoint_path: "./"
# Number of threads used to process the dataset in parallel

View File

@ -378,7 +378,7 @@ YOLOv5 on 118K images (The annotation and data format must be the same as coco2017
| outputs | heatmaps |
| Loss | 53 |
| Speed | 1p 55 img/s 8p 440 img/s(shape=640) |
| Total time | 80h |
| Total time | 24h(8pcs) |
| Checkpoint for Fine tuning | 58M (.ckpt file) |
| Scripts | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/>|

View File

@ -45,22 +45,22 @@ YOLOv5作为先进的检测器它比所有可用的替代检测器更快FP
- 目录结构如下,由用户定义目录和文件的名称:
```shell
©À©¤©¤ dataset
©À©¤©¤ YOLOv5
©À©¤©¤ annotations
©¦ ©À©¤ train.json
©¦ ©¸©¤ val.json
©À©¤ images
©À©¤ train
©¦ ©¸©¤images
©¦ ©À©¤picture1.jpg
©¦ ©À©¤ ...
©¦ ©¸©¤picturen.jpg
©¸©¤ val
©¸©¤images
©À©¤picture1.jpg
©À©¤ ...
©¸©¤picturen.jpg
├── dataset
├── YOLOv5
├── annotations
│ ├─ train.json
│ └─ val.json
├─ images
├─ train
│ └─images
│ ├─picture1.jpg
│ ├─ ...
│ └─picturen.jpg
└─ val
└─images
├─picture1.jpg
├─ ...
└─picturen.jpg
```
建议用户使用MS COCO数据集来体验模型
@ -125,34 +125,34 @@ bash run_eval.sh dataset/xxx checkpoint/xxx.ckpt
## [脚本和示例代码](#目录)
```python
©¸©¤yolov5
©À©¤README.md
©À©¤mindspore_hub_conf.md # Mindspore Hub配置
©À©¤ascend310_infer # 用于310推理
©À©¤scripts
©À©¤run_standalone_train.sh # 在Ascend中启动单机训练1卡
©À©¤run_distribute_train.sh # 在Ascend中启动分布式训练8卡
©À©¤run_infer_310.sh # 在Ascend中启动310推理
©¸©¤run_eval.sh # 在Ascend中启动评估
©À©¤src
©À©¤__init__.py # Python初始化文件
©À©¤config.py # 参数配置
©À©¤yolov5_backbone.py # 网络骨干
©À©¤distributed_sampler.py # 数据集迭代器
©À©¤initializer.py # 参数初始化器
©À©¤logger.py # 日志函数
©À©¤loss.py # 损失函数
©À©¤lr_scheduler.py # 生成学习率
©À©¤transforms.py # 预处理数据
©À©¤util.py # 工具函数
©À©¤yolo.py # YOLOv5网络
©À©¤yolo_dataset.py # 为YOLOv5创建数据集
└─yolov5
├─README.md
├─mindspore_hub_conf.md # Mindspore Hub配置
├─ascend310_infer # 用于310推理
├─scripts
├─run_standalone_train.sh # 在Ascend中启动单机训练1卡
├─run_distribute_train.sh # 在Ascend中启动分布式训练8卡
├─run_infer_310.sh # 在Ascend中启动310推理
├─run_eval.sh # 在Ascend中启动评估
├─src
├─__init__.py # Python初始化文件
├─config.py # 参数配置
├─yolov5_backbone.py # 网络骨干
├─distributed_sampler.py # 数据集迭代器
├─initializer.py # 参数初始化器
├─logger.py # 日志函数
├─loss.py # 损失函数
├─lr_scheduler.py # 生成学习率
├─transforms.py # 预处理数据
├─util.py # 工具函数
├─yolo.py # YOLOv5网络
├─yolo_dataset.py # 为YOLOv5创建数据集
©À©¤eval.py # 评估验证结果
©À©¤export.py # 将MindSpore模型转换为AIR模型
©À©¤preprocess.py # 310推理前处理脚本
©À©¤postprocess.py # 310推理后处理脚本
©¸©¤train.py # 训练网络
├─eval.py # 评估验证结果
├─export.py # 将MindSpore模型转换为AIR模型
├─preprocess.py # 310推理前处理脚本
├─postprocess.py # 310推理后处理脚本
├─train.py # 训练网络
```
## [脚本参数](#目录)
@ -378,7 +378,7 @@ YOLOv5应用于118000张图像上标注和数据格式必须与COCO 2017相
|输出|heatmaps |
| 损失 | 53 |
|速度| 1卡55 img/s8卡440 img/sshape=640|
| 总时长 | 80小时 |
| 总时长 | 24小时(8卡) |
| 微调检查点 | 58M .ckpt文件 |
|脚本| <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/> |

View File

@ -55,13 +55,13 @@
```python
# 分布式训练运行示例
sh scripts/run_distribute_train.sh /path/dataset /path/rank_table
bash scripts/run_distribute_train.sh /path/dataset /path/rank_table
# 单机训练运行示例
sh scripts/run_standalone_train.sh /path/dataset
bash scripts/run_standalone_train.sh /path/dataset
# 运行评估示例
sh scripts/run_eval.sh /path/evalset /path/ckpt
bash scripts/run_eval.sh /path/evalset /path/ckpt
```
## 脚本说明
@ -108,7 +108,7 @@ train.py和val.py中主要参数如下
### 分布式训练
```shell
sh scripts/run_distribute_train.sh /path/dataset /path/rank_table
bash scripts/run_distribute_train.sh /path/dataset /path/rank_table
```
上述shell脚本将在后台运行分布训练。可以通过`device[X]/train.log`文件查看结果。
@ -134,7 +134,7 @@ epoch time: 1104929.793 ms, per step time: 97.162 ms
在运行以下命令之前请检查用于评估的检查点路径。请将检查点路径设置为绝对全路径例如“username/arcface/arcface-11372-1.ckpt”。
```bash
sh scripts/run_eval.sh /path/evalset /path/ckpt
bash scripts/run_eval.sh /path/evalset /path/ckpt
```
上述python命令将在后台运行您可以通过eval.log文件查看结果。测试数据集的准确性如下

View File

@ -27,13 +27,13 @@ get_real_path(){
echo "$(realpath -m $PWD/$1)"
fi
}
RANK_SIZE=8
DATA_PATH=$(get_real_path $1)
RANK_TABLE=$(get_real_path $2)
EXEC_PATH=$(pwd)
echo "$EXEC_PATH"
export RANK_TABLE_FILE=$RANK_TABLE
export RANK_SIZE=8
for((i=0;i<RANK_SIZE;i++))
do

View File

@ -89,7 +89,7 @@ HarDNet指的是Harmonic DenseNet: A low memory traffic network其突出的
# 运行分布式训练示例
python3 train.py > train.log 2>&1 & --dataset_path /path/dataset --pre_ckpt_path /path/pretrained_path --isModelArts False
OR
bash run_distribute_train.sh /path/dataset /path/pretrain_path 8
bash run_distribute_train.sh /path/dataset /path/pretrain_path /path/rank_table
# 运行评估示例
python3 eval.py > eval.log 2>&1 & --dataset_path /path/dataset --ckpt_path /path/ckpt
@ -242,7 +242,7 @@ HarDNet指的是Harmonic DenseNet: A low memory traffic network其突出的
```bash
python3 train.py > train.log 2>&1 & --dataset_path /path/dataset --pre_ckpt_path /path/pretrained_path --isModelArts False
OR
bash run_distribute_train.sh /path/dataset /path/pretrain_path 8
bash run_distribute_train.sh /path/dataset /path/pretrain_path /path/rank_table
```
上述shell脚本将在后台运行分布训练。您可以通过train_parallel[X]/log文件查看结果。采用以下方式达到损失值

View File

@ -16,40 +16,28 @@
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run_distribute_train.sh DATA_PATH pretrain_path RANK_SIZE"
echo "For example: bash run_distribute_train.sh /path/dataset /path/pretrain_path 8"
echo "bash run_distribute_train.sh DATA_PATH pretrain_path RANK_TABLE"
echo "For example: bash run_distribute_train.sh /path/dataset /path/pretrain_path /path/rank_table"
echo "It is better to use the absolute path."
echo "=============================================================================================================="
set -e
DATA_PATH=$1
PRETRAINED_PATH=$2
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
DATA_PATH=$(get_real_path $1)
PRETRAINED_PATH=$(get_real_path $2)
RANK_TABLE=$(get_real_path $3)
export DATA_PATH=${DATA_PATH}
RANK_SIZE=$3
export RANK_SIZE=8
export RANK_TABLE_FILE=$RANK_TABLE
EXEC_PATH=$(pwd)
echo "$EXEC_PATH"
test_dist_8pcs()
{
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_8pcs.json
export RANK_SIZE=8
}
test_dist_4pcs()
{
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_4pcs.json
export RANK_SIZE=4
}
test_dist_2pcs()
{
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_2pcs.json
export RANK_SIZE=2
}
test_dist_${RANK_SIZE}pcs
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
for((i=1;i<${RANK_SIZE};i++))
@ -82,7 +70,7 @@ export DEVICE_ID=0
export RANK_ID=0
echo "start training for device 0"
env > env0.log
nohup python3 -u train.py --dataset_path ${DATA_PATH} --isModelArts False --distribute True --pre_ckpt_path ${PRETRAINED_PATH} > train0.log 2>&1
nohup python3 -u train.py --dataset_path ${DATA_PATH} --isModelArts False --distribute True --pre_ckpt_path ${PRETRAINED_PATH} > train0.log 2>&1 &
if [ $? -eq 0 ];then
echo "training success"

View File

@ -82,13 +82,13 @@ simple_baselines的总体网络架构如下
```text
# 分布式训练
用法:sh run_distribute_train.sh --is_model_arts False --run_distribute True
用法:bash run_distribute_train.sh RANK_TABLE
# 单机训练
用法:sh run_standalone_train.sh --device_id 0 --is_model_arts False --run_distribute False
用法:bash run_standalone_train.sh DEVICE_ID
# 运行评估示例
用法sh run_eval.sh
用法:bash run_eval.sh
```
# 脚本说明
@ -183,13 +183,13 @@ config.TEST.NMS_THRE = 1.0 # nms阈值
```text
# 分布式训练
用法:sh run_distribute_train.sh --is_model_arts False --run_distribute True
用法:bash run_distribute_train.sh RANK_TABLE
# 单机训练
用法:sh run_standalone_train.sh --device_id 0 --is_model_arts False --run_distribute False
用法:bash run_standalone_train.sh DEVICE_ID
# 运行评估示例
用法sh run_eval.sh
用法:bash run_eval.sh
```
### 结果
@ -219,7 +219,7 @@ epoch:140 step:2340, loss is 0.0003393
```bash
# 评估
sh eval.sh
bash eval.sh
```
### 结果

View File

@ -16,31 +16,24 @@
echo "========================================================================"
echo "Please run the script as: "
echo "bash run.sh RANK_SIZE"
echo "For example: bash run_distribute.sh 8"
echo "bash run.sh RANK_TABLE"
echo "For example: bash run_distribute.sh RANK_TABLE"
echo "It is better to use the absolute path."
echo "========================================================================"
set -e
RANK_SIZE=$1
export RANK_SIZE
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
RANK_TABLE=$(get_real_path $1)
EXEC_PATH=$(pwd)
echo "$EXEC_PATH"
test_dist_8pcs()
{
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_8pcs.json
export RANK_SIZE=8
}
test_dist_2pcs()
{
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_2pcs.json
export RANK_SIZE=2
}
test_dist_${RANK_SIZE}pcs
export RANK_TABLE_FILE=$RANK_TABLE
export RANK_SIZE=8
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

View File

@ -217,7 +217,7 @@ def _build_training_pipeline(config: Seq2seqConfig,
scale_update_cell=scale_manager.get_update_cell()
)
net_with_grads.set_train(True)
model = Model(net_with_grads, amp_level="O2")
model = Model(net_with_grads)
loss_monitor = LossCallBack(config)
dataset_size = dataset.get_dataset_size()
time_cb = TimeMonitor(data_size=dataset_size)