!21531 fix network issue(pynative running problems)

Merge pull request !21531 from JichenZhao/master
This commit is contained in:
i-robot 2021-08-10 08:24:29 +00:00 committed by Gitee
commit 06c2254210
6 changed files with 28 additions and 38 deletions

View File

@ -93,7 +93,7 @@
```python
# 分布式训练
用法:bash run_distribute_train.sh [DATASET_PATH] [RANK_SIZE]
用法:bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE]
# 单机训练
用法:bash run_standalone_train.sh [DATASET_PATH] [DEVICE_ID]
@ -155,7 +155,7 @@
```text
"class_num":1000, # 数据集类数
"batch_size":128, # 输入张量的批次大小
"batch_size":80, # 输入张量的批次大小
"loss_scale":1024, # 损失等级
"momentum":0.08, # 动量优化器
"weight_decay":0.0002, # 权重衰减
@ -203,7 +203,7 @@
```text
# 分布式训练
用法:bash run_distribute_train.sh [DATASET_PATH] [RANK_SIZE]
用法:bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE]
# 单机训练
用法:bash run_standalone_train.sh [DATASET_PATH] [DEVICE_ID]
@ -292,15 +292,15 @@ result:{'top_1 acc':0.802303685897436}
| 模型版本 | Glore_resnet200 |Glore_resnet200 |
| 资源 | Ascend 910CPU2.60GHz192核内存2048G |GPU-V100(SXM2) |
| 上传日期 | 2021-03-24 |2021-05-25 |
| MindSpore版本 | 1.1.1 |1.2.0 |
| MindSpore版本 | 1.3.0 |1.2.0 |
| 数据集 | ImageNet2012 | ImageNet2012 |
| 训练参数 | epoch=150, steps per epoch=1251, batch_size = 128 |epoch=150, steps per epoch=2502, batch_size = 64 |
| 训练参数 | epoch=150, steps per epoch=2001, batch_size = 80 |epoch=150, steps per epoch=2502, batch_size = 64 |
| 优化器 | NAG | NAG |
| 损失函数 | SoftmaxCrossEntropyExpand |SoftmaxCrossEntropyExpand |
| 输出 | 概率 |概率 |
| 损失 |0.7068262 |0.55614954 |
| 速度 | 630.343毫秒/步8卡 |912.211 毫秒/步8卡 |
| 总时长 | 33时45分钟 |94时08分 |
| 损失 |0.8068262 |0.55614954 |
| 速度 | 400.343毫秒/步8卡 |912.211 毫秒/步8卡 |
| 总时长 | 33时35分钟 |94时08分 |
| 参数(M) | 70.6 |70.6 |
| 微调检查点| 807.57M.ckpt文件 |808.28M.ckpt文件 |
| 脚本 | [链接](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/cv/glore_res200) |
@ -314,9 +314,9 @@ result:{'top_1 acc':0.802303685897436}
| 模型版本 | Glore_resnet200 | Glore_resnet200 |
| 资源 | Ascend 910 | GPU |
| 上传日期 | 2021-3-24 |2021-05-25 |
| MindSpore版本 | 1.1.1 |1.2.0 |
| 数据集 | 12万张图像 |12万张图像 |
| batch_size | 128 |64 |
| MindSpore版本 | 1.3.0 |1.2.0 |
| 数据集 | 120万张图像 |120万张图像 |
| batch_size | 80 |64 |
| 输出 | 概率 |概率 |
| 准确性 | 8卡: 80.23% |8卡80.603% |

View File

@ -17,35 +17,28 @@
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run_distribute_train.sh DATA_PATH RANK_SIZE"
echo "For example: bash run_distribute_train.sh /path/dataset 8"
echo "For example: bash run_distribute_train.sh /path/dataset /path/rank_table"
echo "It is better to use the absolute path."
echo "=============================================================================================================="
set -e
DATA_PATH=$1
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
DATA_PATH=$(get_real_path $1)
export DATA_PATH=${DATA_PATH}
RANK_SIZE=$2
EXEC_PATH=$(pwd)
RANK_TABLE=$(get_real_path $2)
export RANK_TABLE_FILE=${RANK_TABLE}
export RANK_SIZE=8
echo "$EXEC_PATH"
test_dist_8pcs()
{
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_8pcs.json
export RANK_SIZE=8
}
test_dist_2pcs()
{
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_2pcs.json
export RANK_SIZE=2
}
test_dist_${RANK_SIZE}pcs
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
for((i=1;i<${RANK_SIZE};i++))
for((i=1;i<8;i++))
do
rm -rf device$i
mkdir device$i
@ -75,7 +68,7 @@ export DEVICE_ID=0
export RANK_ID=0
echo "start training for device 0"
env > env0.log
python3 train.py --data_url $1 --isModelArts False --run_distribute True > train0.log 2>&1
python3 train.py --data_url $1 --isModelArts False --run_distribute True > train0.log 2>&1 &
if [ $? -eq 0 ];then
echo "training success"

View File

@ -18,7 +18,7 @@ network config setting, will be used in train.py
from easydict import EasyDict
config1 = EasyDict({
"class_num": 1000,
"batch_size": 128,
"batch_size": 80,
"loss_scale": 1024,
"momentum": 0.08,
"weight_decay": 0.0002,

View File

@ -30,6 +30,7 @@ from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.communication.management import init, get_group_size, get_rank
import mindspore.nn as nn
from mindspore.common import set_seed
import mindspore.common.initializer as weight_init
from src.lr_generator import get_lr
from src.config import config1, config2
@ -64,6 +65,7 @@ elif args_opt.device_target == "GPU":
random.seed(1)
np.random.seed(1)
de.config.set_seed(1)
set_seed(1)
if __name__ == '__main__':

View File

@ -296,7 +296,6 @@ class Seq2seqTrainOneStepWithLossScaleCell(nn.Cell):
dtype=mstype.float32), name="loss_scale")
self.add_flags(has_effect=True)
self.loss_scalar = P.ScalarSummary()
def construct(self,
source_eos_ids,
@ -368,5 +367,4 @@ class Seq2seqTrainOneStepWithLossScaleCell(nn.Cell):
if not overflow:
self.optimizer(grads)
self.loss_scalar("loss", loss)
return (loss, cond, scaling_sens)

View File

@ -229,7 +229,6 @@ class Adam(Optimizer):
self.one = Tensor(np.array([1.0]).astype(np.float32))
self.realdiv = P.RealDiv()
self.lr_scalar = P.ScalarSummary()
def construct(self, gradients):
"""Adam optimizer."""
@ -240,8 +239,6 @@ class Adam(Optimizer):
gradients = self.scale_grad(gradients)
lr = self.get_lr()
self.lr_scalar("learning_rate", lr)
beta1_power = self.beta1_power * self.beta1
self.beta1_power = beta1_power
beta2_power = self.beta2_power * self.beta2