forked from mindspore-Ecosystem/mindspore
!21095 fix network issue
Merge pull request !21095 from JichenZhao/net_issue_fix
Commit: 60f66910eb
@@ -122,7 +122,7 @@ batch_size: 2
loss_scale: 256
momentum: 0.91
weight_decay: 0.00001
epoch_size: 20
epoch_size: 5
save_checkpoint: True
save_checkpoint_epochs: 1
keep_checkpoint_max: 20

@@ -125,7 +125,7 @@ weight_decay: 0.00001
epoch_size: 20
save_checkpoint: True
save_checkpoint_epochs: 1
keep_checkpoint_max: 20
keep_checkpoint_max: 5
save_checkpoint_path: "./"

# Number of threads used to process the dataset in parallel

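For orientation, the checkpoint-related keys touched above are the ones that MindSpore's checkpoint callbacks consume. Below is a minimal sketch of that wiring; the `steps_per_epoch` value and the checkpoint prefix are illustrative assumptions, only `CheckpointConfig` and `ModelCheckpoint` come from MindSpore itself.

```python
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint

# Illustrative values mirroring the YAML keys edited above (not read from any config file).
epoch_size = 5
save_checkpoint_epochs = 1
keep_checkpoint_max = 5
steps_per_epoch = 100  # assumed size of the training dataset per epoch

# CheckpointConfig counts in steps, so epochs-between-saves is converted to steps here.
ckpt_config = CheckpointConfig(
    save_checkpoint_steps=save_checkpoint_epochs * steps_per_epoch,
    keep_checkpoint_max=keep_checkpoint_max,
)
ckpt_callback = ModelCheckpoint(prefix="net", directory="./", config=ckpt_config)
# The callback would then be passed as model.train(epoch_size, dataset, callbacks=[ckpt_callback]).
```
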
@@ -378,7 +378,7 @@ YOLOv5 on 118K images(The annotation and data format must be the same as coco201
| outputs | heatmaps |
| Loss | 53 |
| Speed | 1p 55 img/s 8p 440 img/s(shape=640) |
| Total time | 80h |
| Total time | 24h(8pcs) |
| Checkpoint for Fine tuning | 58M (.ckpt file) |
| Scripts | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/>|

@@ -45,22 +45,22 @@ As an advanced detector, YOLOv5 is faster than all available alternative detectors (FPS
- The directory structure is as follows; the directory and file names are defined by the user:

```shell
©À©¤©¤ dataset
©À©¤©¤ YOLOv5
©À©¤©¤ annotations
©¦ ©À©¤ train.json
©¦ ©¸©¤ val.json
©À©¤ images
©À©¤ train
©¦ ©¸©¤images
©¦ ©À©¤picture1.jpg
©¦ ©À©¤ ...
©¦ ©¸©¤picturen.jpg
©¸©¤ val
©¸©¤images
©À©¤picture1.jpg
©À©¤ ...
©¸©¤picturen.jpg
├── dataset
├── YOLOv5
├── annotations
│ ├─ train.json
│ └─ val.json
├─ images
├─ train
│ └─images
│ ├─picture1.jpg
│ ├─ ...
│ └─picturen.jpg
└─ val
└─images
├─picture1.jpg
├─ ...
└─picturen.jpg
```

Users are advised to use the MS COCO dataset to try out the model,

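The hunk above only replaces the mis-encoded box-drawing characters; the documented layout is unchanged. As a convenience, here is a small sketch that creates that skeleton on disk (the helper itself is not part of the repository; the paths follow the tree above):

```python
from pathlib import Path

# Create the documented skeleton: annotations plus per-split image folders.
root = Path("dataset") / "YOLOv5"
for sub in ("annotations", "images/train/images", "images/val/images"):
    (root / sub).mkdir(parents=True, exist_ok=True)

# train.json / val.json go under dataset/YOLOv5/annotations,
# and the .jpg files under images/train/images and images/val/images respectively.
```
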
@@ -125,34 +125,34 @@ bash run_eval.sh dataset/xxx checkpoint/xxx.ckpt
## [Script and Sample Code](#目录)

```python
©¸©¤yolov5
©À©¤README.md
©À©¤mindspore_hub_conf.md # MindSpore Hub configuration
©À©¤ascend310_infer # for Ascend 310 inference
©À©¤scripts
©À©¤run_standalone_train.sh # launch standalone training on Ascend (1 device)
©À©¤run_distribute_train.sh # launch distributed training on Ascend (8 devices)
©À©¤run_infer_310.sh # launch Ascend 310 inference
©¸©¤run_eval.sh # launch evaluation on Ascend
©À©¤src
©À©¤__init__.py # Python init file
©À©¤config.py # parameter configuration
©À©¤yolov5_backbone.py # network backbone
©À©¤distributed_sampler.py # dataset iterator
©À©¤initializer.py # parameter initializer
©À©¤logger.py # logging functions
©À©¤loss.py # loss function
©À©¤lr_scheduler.py # learning rate generator
©À©¤transforms.py # data preprocessing
©À©¤util.py # utility functions
©À©¤yolo.py # YOLOv5 network
©À©¤yolo_dataset.py # create dataset for YOLOv5
└─yolov5
├─README.md
├─mindspore_hub_conf.md # MindSpore Hub configuration
├─ascend310_infer # for Ascend 310 inference
├─scripts
├─run_standalone_train.sh # launch standalone training on Ascend (1 device)
├─run_distribute_train.sh # launch distributed training on Ascend (8 devices)
├─run_infer_310.sh # launch Ascend 310 inference
├─run_eval.sh # launch evaluation on Ascend
├─src
├─__init__.py # Python init file
├─config.py # parameter configuration
├─yolov5_backbone.py # network backbone
├─distributed_sampler.py # dataset iterator
├─initializer.py # parameter initializer
├─logger.py # logging functions
├─loss.py # loss function
├─lr_scheduler.py # learning rate generator
├─transforms.py # data preprocessing
├─util.py # utility functions
├─yolo.py # YOLOv5 network
├─yolo_dataset.py # create dataset for YOLOv5

©À©¤eval.py # evaluate validation results
©À©¤export.py # convert the MindSpore model to an AIR model
©À©¤preprocess.py # preprocessing script for Ascend 310 inference
©À©¤postprocess.py # postprocessing script for Ascend 310 inference
©¸©¤train.py # train the network
├─eval.py # evaluate validation results
├─export.py # convert the MindSpore model to an AIR model
├─preprocess.py # preprocessing script for Ascend 310 inference
├─postprocess.py # postprocessing script for Ascend 310 inference
├─train.py # train the network
```

## [Script Parameters](#目录)

@@ -378,7 +378,7 @@ YOLOv5 is applied to 118,000 images (the annotations and data format must be the same as COCO 2017
| Outputs | heatmaps |
| Loss | 53 |
| Speed | 1p: 55 img/s; 8p: 440 img/s (shape=640) |
| Total time | 80 h |
| Total time | 24 h (8 devices) |
| Checkpoint for fine-tuning | 58M (.ckpt file) |
| Scripts | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/> |

@@ -55,13 +55,13 @@

```python
# example of running distributed training
sh scripts/run_distribute_train.sh /path/dataset /path/rank_table
bash scripts/run_distribute_train.sh /path/dataset /path/rank_table

# example of running standalone training
sh scripts/run_standalone_train.sh /path/dataset
bash scripts/run_standalone_train.sh /path/dataset

# example of running evaluation
sh scripts/run_eval.sh /path/evalset /path/ckpt
bash scripts/run_eval.sh /path/evalset /path/ckpt
```

## Script Description

@@ -108,7 +108,7 @@ The main parameters in train.py and val.py are as follows:
### Distributed Training

```shell
sh scripts/run_distribute_train.sh /path/dataset /path/rank_table
bash scripts/run_distribute_train.sh /path/dataset /path/rank_table
```

The above shell script runs distributed training in the background. You can check the results in the `device[X]/train.log` file.

@@ -134,7 +134,7 @@ epoch time: 1104929.793 ms, per step time: 97.162 ms
Before running the following command, check the checkpoint path used for evaluation. Set the checkpoint path to an absolute full path, for example, "username/arcface/arcface-11372-1.ckpt".

```bash
sh scripts/run_eval.sh /path/evalset /path/ckpt
bash scripts/run_eval.sh /path/evalset /path/ckpt
```

The above python command runs in the background. You can check the results in the eval.log file. The accuracy on the test dataset is as follows:

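The note above asks for an absolute checkpoint path. Here is a minimal sketch of how that requirement is typically handled on the Python side; the helper function and its argument handling are assumptions, while `load_checkpoint` and `load_param_into_net` are standard MindSpore serialization APIs.

```python
import os
from mindspore.train.serialization import load_checkpoint, load_param_into_net

def load_eval_checkpoint(network, ckpt_path):
    """Resolve a possibly relative path and load the weights into the network."""
    ckpt_path = os.path.abspath(ckpt_path)  # e.g. /home/username/arcface/arcface-11372-1.ckpt
    if not os.path.isfile(ckpt_path):
        raise FileNotFoundError("checkpoint not found: " + ckpt_path)
    param_dict = load_checkpoint(ckpt_path)
    load_param_into_net(network, param_dict)
    return network
```
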
@@ -27,13 +27,13 @@ get_real_path(){
echo "$(realpath -m $PWD/$1)"
fi
}
RANK_SIZE=8
DATA_PATH=$(get_real_path $1)
RANK_TABLE=$(get_real_path $2)

EXEC_PATH=$(pwd)
echo "$EXEC_PATH"
export RANK_TABLE_FILE=$RANK_TABLE
export RANK_SIZE=8

for((i=0;i<RANK_SIZE;i++))
do

@@ -89,7 +89,7 @@ HarDNet refers to Harmonic DenseNet: A low memory traffic network; its outstanding
# example of running distributed training
python3 train.py > train.log 2>&1 & --dataset_path /path/dataset --pre_ckpt_path /path/pretrained_path --isModelArts False
OR
bash run_distribute_train.sh /path/dataset /path/pretrain_path 8
bash run_distribute_train.sh /path/dataset /path/pretrain_path /path/rank_table

# example of running evaluation
python3 eval.py > eval.log 2>&1 & --dataset_path /path/dataset --ckpt_path /path/ckpt

@@ -242,7 +242,7 @@ HarDNet refers to Harmonic DenseNet: A low memory traffic network; its outstanding
```bash
python3 train.py > train.log 2>&1 & --dataset_path /path/dataset --pre_ckpt_path /path/pretrained_path --isModelArts False
OR
bash run_distribute_train.sh /path/dataset /path/pretrain_path 8
bash run_distribute_train.sh /path/dataset /path/pretrain_path /path/rank_table
```

The above shell script runs distributed training in the background. You can check the results in the train_parallel[X]/log file. The loss value will be achieved as follows:

@@ -16,40 +16,28 @@

echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run_distribute_train.sh DATA_PATH pretrain_path RANK_SIZE"
echo "For example: bash run_distribute_train.sh /path/dataset /path/pretrain_path 8"
echo "bash run_distribute_train.sh DATA_PATH pretrain_path RANK_TABLE"
echo "For example: bash run_distribute_train.sh /path/dataset /path/pretrain_path /path/rank_table"
echo "It is better to use the absolute path."
echo "=============================================================================================================="
set -e
DATA_PATH=$1
PRETRAINED_PATH=$2
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
DATA_PATH=$(get_real_path $1)
PRETRAINED_PATH=$(get_real_path $2)
RANK_TABLE=$(get_real_path $3)
export DATA_PATH=${DATA_PATH}
RANK_SIZE=$3

export RANK_SIZE=8
export RANK_TABLE_FILE=$RANK_TABLE
EXEC_PATH=$(pwd)

echo "$EXEC_PATH"

test_dist_8pcs()
{
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_8pcs.json
export RANK_SIZE=8
}

test_dist_4pcs()
{
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_4pcs.json
export RANK_SIZE=4
}

test_dist_2pcs()
{
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_2pcs.json
export RANK_SIZE=2
}

test_dist_${RANK_SIZE}pcs

export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

for((i=1;i<${RANK_SIZE};i++))

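The rewritten script takes a rank table file instead of a bare device count. Assuming the usual HCCL rank table layout (a JSON object whose `server_list` entries each carry a `device` array), the device count can also be recovered from the file itself; this is a hedged sketch, not code from the repository:

```python
import json

def rank_size_from_table(rank_table_path):
    """Count the devices listed in an HCCL rank table (assumes the common v1.0 layout)."""
    with open(rank_table_path, "r") as f:
        table = json.load(f)
    # Each server entry lists its Ascend devices; the total is the RANK_SIZE to export.
    return sum(len(server.get("device", [])) for server in table.get("server_list", []))

# Example (path is illustrative):
# print(rank_size_from_table("/path/rank_table_8pcs.json"))  # -> 8
```
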
@@ -82,7 +70,7 @@ export DEVICE_ID=0
export RANK_ID=0
echo "start training for device 0"
env > env0.log
nohup python3 -u train.py --dataset_path ${DATA_PATH} --isModelArts False --distribute True --pre_ckpt_path ${PRETRAINED_PATH} > train0.log 2>&1
nohup python3 -u train.py --dataset_path ${DATA_PATH} --isModelArts False --distribute True --pre_ckpt_path ${PRETRAINED_PATH} > train0.log 2>&1 &

if [ $? -eq 0 ];then
echo "training success"

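The only functional change in this hunk is the trailing `&`, which detaches the training process so the launcher script can continue. A rough Python analogue of the two variants, using only the standard library (the command-line values are placeholders standing in for the script's variables):

```python
import subprocess

cmd = [
    "python3", "-u", "train.py",
    "--dataset_path", "/path/dataset",    # stands in for ${DATA_PATH}
    "--isModelArts", "False",
    "--distribute", "True",
    "--pre_ckpt_path", "/path/pretrain",  # stands in for ${PRETRAINED_PATH}
]

with open("train0.log", "w") as log:
    # Without the trailing "&" the launcher would block until training finishes,
    # which is what subprocess.run(cmd, stdout=log, stderr=subprocess.STDOUT) does.
    # With "&" the launcher returns immediately, as Popen does here:
    proc = subprocess.Popen(cmd, stdout=log, stderr=subprocess.STDOUT)
    print("launched training with pid", proc.pid)
```
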
@@ -82,13 +82,13 @@ The overall network architecture of simple_baselines is as follows:

```text
# distributed training
Usage: sh run_distribute_train.sh --is_model_arts False --run_distribute True
Usage: bash run_distribute_train.sh RANK_TABLE

# standalone training
Usage: sh run_standalone_train.sh --device_id 0 --is_model_arts False --run_distribute False
Usage: bash run_standalone_train.sh DEVICE_ID

# example of running evaluation
Usage: sh run_eval.sh
Usage: bash run_eval.sh
```

# Script Description

@@ -183,13 +183,13 @@ config.TEST.NMS_THRE = 1.0 # nms threshold

```text
# distributed training
Usage: sh run_distribute_train.sh --is_model_arts False --run_distribute True
Usage: bash run_distribute_train.sh RANK_TABLE

# standalone training
Usage: sh run_standalone_train.sh --device_id 0 --is_model_arts False --run_distribute False
Usage: bash run_standalone_train.sh DEVICE_ID

# example of running evaluation
Usage: sh run_eval.sh
Usage: bash run_eval.sh
```

### Results

@@ -219,7 +219,7 @@ epoch:140 step:2340, loss is 0.0003393

```bash
# evaluation
sh eval.sh
bash eval.sh
```

### Results

@@ -16,31 +16,24 @@

echo "========================================================================"
echo "Please run the script as: "
echo "bash run.sh RANK_SIZE"
echo "For example: bash run_distribute.sh 8"
echo "bash run.sh RANK_TABLE"
echo "For example: bash run_distribute.sh RANK_TABLE"
echo "It is better to use the absolute path."
echo "========================================================================"
set -e

RANK_SIZE=$1
export RANK_SIZE
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
RANK_TABLE=$(get_real_path $1)

EXEC_PATH=$(pwd)
echo "$EXEC_PATH"

test_dist_8pcs()
{
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_8pcs.json
export RANK_SIZE=8
}

test_dist_2pcs()
{
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_2pcs.json
export RANK_SIZE=2
}

test_dist_${RANK_SIZE}pcs
export RANK_TABLE_FILE=$RANK_TABLE
export RANK_SIZE=8

export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

@@ -217,7 +217,7 @@ def _build_training_pipeline(config: Seq2seqConfig,
scale_update_cell=scale_manager.get_update_cell()
)
net_with_grads.set_train(True)
model = Model(net_with_grads, amp_level="O2")
model = Model(net_with_grads)
loss_monitor = LossCallBack(config)
dataset_size = dataset.get_dataset_size()
time_cb = TimeMonitor(data_size=dataset_size)

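The change above drops `amp_level="O2"` when building `Model`, so training falls back to the default `"O0"` (no automatic mixed precision). A minimal sketch of the difference; the small network, loss, and optimizer below are placeholders, not the seq2seq training cell from this file:

```python
import mindspore.nn as nn
from mindspore import Model

net = nn.Dense(16, 4)  # placeholder network
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)

# Before the fix: O2 casts the network to float16 (batchnorm stays float32) with dynamic loss scaling.
model_o2 = Model(net, loss_fn=loss, optimizer=opt, amp_level="O2")

# After the fix: amp_level defaults to "O0", so the network runs in its original precision.
model_o0 = Model(net, loss_fn=loss, optimizer=opt)
```
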