diff --git a/model_zoo/official/cv/faster_rcnn/default_config_101.yaml b/model_zoo/official/cv/faster_rcnn/default_config_101.yaml
index b6a16195514..a357f3d3ce3 100644
--- a/model_zoo/official/cv/faster_rcnn/default_config_101.yaml
+++ b/model_zoo/official/cv/faster_rcnn/default_config_101.yaml
@@ -122,7 +122,7 @@ batch_size: 2
loss_scale: 256
momentum: 0.91
weight_decay: 0.00001
-epoch_size: 20
+epoch_size: 5
save_checkpoint: True
save_checkpoint_epochs: 1
keep_checkpoint_max: 20
diff --git a/model_zoo/official/cv/faster_rcnn/default_config_152.yaml b/model_zoo/official/cv/faster_rcnn/default_config_152.yaml
index d2755194040..896c0b02fc5 100644
--- a/model_zoo/official/cv/faster_rcnn/default_config_152.yaml
+++ b/model_zoo/official/cv/faster_rcnn/default_config_152.yaml
@@ -125,7 +125,7 @@ weight_decay: 0.00001
epoch_size: 20
save_checkpoint: True
save_checkpoint_epochs: 1
-keep_checkpoint_max: 20
+keep_checkpoint_max: 5
save_checkpoint_path: "./"
# Number of threads used to process the dataset in parallel
diff --git a/model_zoo/official/cv/yolov5/README.md b/model_zoo/official/cv/yolov5/README.md
index d666c644179..4203e2debf8 100644
--- a/model_zoo/official/cv/yolov5/README.md
+++ b/model_zoo/official/cv/yolov5/README.md
@@ -378,7 +378,7 @@ YOLOv5 on 118K images(The annotation and data format must be the same as coco201
| outputs | heatmaps |
| Loss | 53 |
| Speed | 1p 55 img/s 8p 440 img/s(shape=640) |
-| Total time | 80h |
+| Total time | 24h(8pcs) |
| Checkpoint for Fine tuning | 58M (.ckpt file) |
| Scripts | |
diff --git a/model_zoo/official/cv/yolov5/README_CN.md b/model_zoo/official/cv/yolov5/README_CN.md
index d1ac34047eb..c9d7a5861ec 100644
--- a/model_zoo/official/cv/yolov5/README_CN.md
+++ b/model_zoo/official/cv/yolov5/README_CN.md
@@ -45,22 +45,22 @@ YOLOv5作为先进的检测器,它比所有可用的替代检测器更快(FP
- 目录结构如下,由用户定义目录和文件的名称:
```shell
- ©À©¤©¤ dataset
- ©À©¤©¤ YOLOv5
- ©À©¤©¤ annotations
- ©¦ ©À©¤ train.json
- ©¦ ©¸©¤ val.json
- ©À©¤ images
- ©À©¤ train
- ©¦ ©¸©¤images
- ©¦ ©À©¤picture1.jpg
- ©¦ ©À©¤ ...
- ©¦ ©¸©¤picturen.jpg
- ©¸©¤ val
- ©¸©¤images
- ©À©¤picture1.jpg
- ©À©¤ ...
- ©¸©¤picturen.jpg
+ ├── dataset
+ ├── YOLOv5
+ ├── annotations
+ │ ├─ train.json
+ │ └─ val.json
+ ├─ images
+ ├─ train
+ │ └─images
+ │ ├─picture1.jpg
+ │ ├─ ...
+ │ └─picturen.jpg
+ └─ val
+ └─images
+ ├─picture1.jpg
+ ├─ ...
+ └─picturen.jpg
```
建议用户使用MS COCO数据集来体验模型,
@@ -125,34 +125,34 @@ bash run_eval.sh dataset/xxx checkpoint/xxx.ckpt
## [脚本和示例代码](#目录)
```python
-©¸©¤yolov5
- ©À©¤README.md
- ©À©¤mindspore_hub_conf.md # Mindspore Hub配置
- ©À©¤ascend310_infer # 用于310推理
- ©À©¤scripts
- ©À©¤run_standalone_train.sh # 在Ascend中启动单机训练(1卡)
- ©À©¤run_distribute_train.sh # 在Ascend中启动分布式训练(8卡)
- ©À©¤run_infer_310.sh # 在Ascend中启动310推理
- ©¸©¤run_eval.sh # 在Ascend中启动评估
- ©À©¤src
- ©À©¤__init__.py # Python初始化文件
- ©À©¤config.py # 参数配置
- ©À©¤yolov5_backbone.py # 网络骨干
- ©À©¤distributed_sampler.py # 数据集迭代器
- ©À©¤initializer.py # 参数初始化器
- ©À©¤logger.py # 日志函数
- ©À©¤loss.py # 损失函数
- ©À©¤lr_scheduler.py # 生成学习率
- ©À©¤transforms.py # 预处理数据
- ©À©¤util.py # 工具函数
- ©À©¤yolo.py # YOLOv5网络
- ©À©¤yolo_dataset.py # 为YOLOv5创建数据集
+└─yolov5
+ ├─README.md
+ ├─mindspore_hub_conf.md # Mindspore Hub配置
+ ├─ascend310_infer # 用于310推理
+ ├─scripts
+ ├─run_standalone_train.sh # 在Ascend中启动单机训练(1卡)
+ ├─run_distribute_train.sh # 在Ascend中启动分布式训练(8卡)
+ ├─run_infer_310.sh # 在Ascend中启动310推理
+ └─run_eval.sh # 在Ascend中启动评估
+ ├─src
+ ├─__init__.py # Python初始化文件
+ ├─config.py # 参数配置
+ ├─yolov5_backbone.py # 网络骨干
+ ├─distributed_sampler.py # 数据集迭代器
+ ├─initializer.py # 参数初始化器
+ ├─logger.py # 日志函数
+ ├─loss.py # 损失函数
+ ├─lr_scheduler.py # 生成学习率
+ ├─transforms.py # 预处理数据
+ ├─util.py # 工具函数
+ ├─yolo.py # YOLOv5网络
+ ├─yolo_dataset.py # 为YOLOv5创建数据集
- ©À©¤eval.py # 评估验证结果
- ©À©¤export.py # 将MindSpore模型转换为AIR模型
- ©À©¤preprocess.py # 310推理前处理脚本
- ©À©¤postprocess.py # 310推理后处理脚本
- ©¸©¤train.py # 训练网络
+ ├─eval.py # 评估验证结果
+ ├─export.py # 将MindSpore模型转换为AIR模型
+ ├─preprocess.py # 310推理前处理脚本
+ ├─postprocess.py # 310推理后处理脚本
+ └─train.py # 训练网络
```
## [脚本参数](#目录)
@@ -378,7 +378,7 @@ YOLOv5应用于118000张图像上(标注和数据格式必须与COCO 2017相
|输出|heatmaps |
| 损失 | 53 |
|速度| 1卡:55 img/s;8卡:440 img/s(shape=640)|
-| 总时长 | 80小时 |
+| 总时长 | 24小时(8卡) |
| 微调检查点 | 58M (.ckpt文件) |
|脚本| |
diff --git a/model_zoo/research/cv/arcface/README_CN.md b/model_zoo/research/cv/arcface/README_CN.md
index 25d07b67638..f08a44a5d21 100644
--- a/model_zoo/research/cv/arcface/README_CN.md
+++ b/model_zoo/research/cv/arcface/README_CN.md
@@ -55,13 +55,13 @@
```python
# 分布式训练运行示例
-sh scripts/run_distribute_train.sh /path/dataset /path/rank_table
+bash scripts/run_distribute_train.sh /path/dataset /path/rank_table
# 单机训练运行示例
-sh scripts/run_standalone_train.sh /path/dataset
+bash scripts/run_standalone_train.sh /path/dataset
# 运行评估示例
-sh scripts/run_eval.sh /path/evalset /path/ckpt
+bash scripts/run_eval.sh /path/evalset /path/ckpt
```
## 脚本说明
@@ -108,7 +108,7 @@ train.py和val.py中主要参数如下:
### 分布式训练
```shell
-sh scripts/run_distribute_train.sh /path/dataset /path/rank_table
+bash scripts/run_distribute_train.sh /path/dataset /path/rank_table
```
上述shell脚本将在后台运行分布训练。可以通过`device[X]/train.log`文件查看结果。
@@ -134,7 +134,7 @@ epoch time: 1104929.793 ms, per step time: 97.162 ms
在运行以下命令之前,请检查用于评估的检查点路径。请将检查点路径设置为绝对全路径,例如“username/arcface/arcface-11372-1.ckpt”。
```bash
- sh scripts/run_eval.sh /path/evalset /path/ckpt
+ bash scripts/run_eval.sh /path/evalset /path/ckpt
```
上述python命令将在后台运行,您可以通过eval.log文件查看结果。测试数据集的准确性如下:
diff --git a/model_zoo/research/cv/arcface/scripts/run_distribute_train.sh b/model_zoo/research/cv/arcface/scripts/run_distribute_train.sh
index 6c953ab1097..35989366537 100644
--- a/model_zoo/research/cv/arcface/scripts/run_distribute_train.sh
+++ b/model_zoo/research/cv/arcface/scripts/run_distribute_train.sh
@@ -27,13 +27,13 @@ get_real_path(){
echo "$(realpath -m $PWD/$1)"
fi
}
-RANK_SIZE=8
DATA_PATH=$(get_real_path $1)
RANK_TABLE=$(get_real_path $2)
EXEC_PATH=$(pwd)
echo "$EXEC_PATH"
export RANK_TABLE_FILE=$RANK_TABLE
+export RANK_SIZE=8
for((i=0;i train.log 2>&1 & --dataset_path /path/dataset --pre_ckpt_path /path/pretrained_path --isModelArts False
OR
- bash run_distribute_train.sh /path/dataset /path/pretrain_path 8
+ bash run_distribute_train.sh /path/dataset /path/pretrain_path /path/rank_table
# 运行评估示例
python3 eval.py > eval.log 2>&1 & --dataset_path /path/dataset --ckpt_path /path/ckpt
@@ -242,7 +242,7 @@ HarDNet指的是Harmonic DenseNet: A low memory traffic network,其突出的
```bash
python3 train.py > train.log 2>&1 & --dataset_path /path/dataset --pre_ckpt_path /path/pretrained_path --isModelArts False
OR
- bash run_distribute_train.sh /path/dataset /path/pretrain_path 8
+ bash run_distribute_train.sh /path/dataset /path/pretrain_path /path/rank_table
```
上述shell脚本将在后台运行分布训练。您可以通过train_parallel[X]/log文件查看结果。采用以下方式达到损失值:
diff --git a/model_zoo/research/cv/hardnet/scripts/run_distribute_train.sh b/model_zoo/research/cv/hardnet/scripts/run_distribute_train.sh
index a5476ca1787..994d50a457b 100644
--- a/model_zoo/research/cv/hardnet/scripts/run_distribute_train.sh
+++ b/model_zoo/research/cv/hardnet/scripts/run_distribute_train.sh
@@ -16,40 +16,28 @@
echo "=============================================================================================================="
echo "Please run the script as: "
-echo "bash run_distribute_train.sh DATA_PATH pretrain_path RANK_SIZE"
-echo "For example: bash run_distribute_train.sh /path/dataset /path/pretrain_path 8"
+echo "bash run_distribute_train.sh DATA_PATH pretrain_path RANK_TABLE"
+echo "For example: bash run_distribute_train.sh /path/dataset /path/pretrain_path /path/rank_table"
echo "It is better to use the absolute path."
echo "=============================================================================================================="
set -e
-DATA_PATH=$1
-PRETRAINED_PATH=$2
+get_real_path(){
+ if [ "${1:0:1}" == "/" ]; then
+ echo "$1"
+ else
+ echo "$(realpath -m $PWD/$1)"
+ fi
+}
+DATA_PATH=$(get_real_path $1)
+PRETRAINED_PATH=$(get_real_path $2)
+RANK_TABLE=$(get_real_path $3)
export DATA_PATH=${DATA_PATH}
-RANK_SIZE=$3
-
+export RANK_SIZE=8
+export RANK_TABLE_FILE=$RANK_TABLE
EXEC_PATH=$(pwd)
echo "$EXEC_PATH"
-test_dist_8pcs()
-{
- export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_8pcs.json
- export RANK_SIZE=8
-}
-
-test_dist_4pcs()
-{
- export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_4pcs.json
- export RANK_SIZE=4
-}
-
-test_dist_2pcs()
-{
- export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_2pcs.json
- export RANK_SIZE=2
-}
-
-test_dist_${RANK_SIZE}pcs
-
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
for((i=1;i<${RANK_SIZE};i++))
@@ -82,7 +70,7 @@ export DEVICE_ID=0
export RANK_ID=0
echo "start training for device 0"
env > env0.log
-nohup python3 -u train.py --dataset_path ${DATA_PATH} --isModelArts False --distribute True --pre_ckpt_path ${PRETRAINED_PATH} > train0.log 2>&1
+nohup python3 -u train.py --dataset_path ${DATA_PATH} --isModelArts False --distribute True --pre_ckpt_path ${PRETRAINED_PATH} > train0.log 2>&1 &
if [ $? -eq 0 ];then
echo "training success"
diff --git a/model_zoo/research/cv/simple_baselines/README.md b/model_zoo/research/cv/simple_baselines/README.md
index 23f562e2b9d..fe453b8027c 100644
--- a/model_zoo/research/cv/simple_baselines/README.md
+++ b/model_zoo/research/cv/simple_baselines/README.md
@@ -82,13 +82,13 @@ simple_baselines的总体网络架构如下:
```text
# 分布式训练
-用法:sh run_distribute_train.sh --is_model_arts False --run_distribute True
+用法:bash run_distribute_train.sh RANK_TABLE
# 单机训练
-用法:sh run_standalone_train.sh --device_id 0 --is_model_arts False --run_distribute False
+用法:bash run_standalone_train.sh DEVICE_ID
# 运行评估示例
-用法:sh run_eval.sh
+用法:bash run_eval.sh
```
# 脚本说明
@@ -183,13 +183,13 @@ config.TEST.NMS_THRE = 1.0 # nms阈值
```text
# 分布式训练
-用法:sh run_distribute_train.sh --is_model_arts False --run_distribute True
+用法:bash run_distribute_train.sh RANK_TABLE
# 单机训练
-用法:sh run_standalone_train.sh --device_id 0 --is_model_arts False --run_distribute False
+用法:bash run_standalone_train.sh DEVICE_ID
# 运行评估示例
-用法:sh run_eval.sh
+用法:bash run_eval.sh
```
### 结果
@@ -219,7 +219,7 @@ epoch:140 step:2340, loss is 0.0003393
```bash
# 评估
-sh eval.sh
+bash run_eval.sh
```
### 结果
diff --git a/model_zoo/research/cv/simple_baselines/scripts/run_distribute_train.sh b/model_zoo/research/cv/simple_baselines/scripts/run_distribute_train.sh
index b568b3d400b..a91edd71221 100644
--- a/model_zoo/research/cv/simple_baselines/scripts/run_distribute_train.sh
+++ b/model_zoo/research/cv/simple_baselines/scripts/run_distribute_train.sh
@@ -16,31 +16,24 @@
echo "========================================================================"
echo "Please run the script as: "
-echo "bash run.sh RANK_SIZE"
-echo "For example: bash run_distribute.sh 8"
+echo "bash run_distribute_train.sh RANK_TABLE"
+echo "For example: bash run_distribute_train.sh /path/rank_table"
echo "It is better to use the absolute path."
echo "========================================================================"
set -e
-
-RANK_SIZE=$1
-export RANK_SIZE
+get_real_path(){
+ if [ "${1:0:1}" == "/" ]; then
+ echo "$1"
+ else
+ echo "$(realpath -m $PWD/$1)"
+ fi
+}
+RANK_TABLE=$(get_real_path $1)
EXEC_PATH=$(pwd)
echo "$EXEC_PATH"
-
-test_dist_8pcs()
-{
- export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_8pcs.json
- export RANK_SIZE=8
-}
-
-test_dist_2pcs()
-{
- export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_2pcs.json
- export RANK_SIZE=2
-}
-
-test_dist_${RANK_SIZE}pcs
+export RANK_TABLE_FILE=$RANK_TABLE
+export RANK_SIZE=8
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
diff --git a/model_zoo/research/nlp/seq2seq/train.py b/model_zoo/research/nlp/seq2seq/train.py
index 529a01e5e19..1ac8199f3e4 100644
--- a/model_zoo/research/nlp/seq2seq/train.py
+++ b/model_zoo/research/nlp/seq2seq/train.py
@@ -217,7 +217,7 @@ def _build_training_pipeline(config: Seq2seqConfig,
scale_update_cell=scale_manager.get_update_cell()
)
net_with_grads.set_train(True)
- model = Model(net_with_grads, amp_level="O2")
+ model = Model(net_with_grads)
loss_monitor = LossCallBack(config)
dataset_size = dataset.get_dataset_size()
time_cb = TimeMonitor(data_size=dataset_size)