!21531 fix network issue (PyNative running problems)
Merge pull request !21531 from JichenZhao/master
commit 06c2254210
@@ -93,7 +93,7 @@
 ```python
 # Distributed training
-Usage: bash run_distribute_train.sh [DATASET_PATH] [RANK_SIZE]
+Usage: bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE]

 # Standalone training
 Usage: bash run_standalone_train.sh [DATASET_PATH] [DEVICE_ID]
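The switch from a bare device count to [RANK_TABLE] reflects how distributed training on Ascend is configured: HCCL reads the cluster topology from the JSON file named by the RANK_TABLE_FILE environment variable, which the launch script now exports. A minimal sketch of the consuming side, assuming a GRAPH_MODE Ascend setup and an illustrative rank-table path (neither taken from this commit):

```python
# Sketch only: how a train script picks up the rank table exported by
# run_distribute_train.sh. The path below is hypothetical.
import os
from mindspore import context
from mindspore.communication.management import init, get_rank, get_group_size

os.environ.setdefault("RANK_TABLE_FILE", "/path/rank_table.json")  # hypothetical
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
init()  # initializes HCCL from RANK_TABLE_FILE and RANK_SIZE
print("rank %d of %d" % (get_rank(), get_group_size()))
```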
@@ -155,7 +155,7 @@
 ```text
 "class_num":1000,       # number of dataset classes
-"batch_size":128,       # batch size of the input tensor
+"batch_size":80,        # batch size of the input tensor
 "loss_scale":1024,      # fixed loss scale
 "momentum":0.08,        # momentum for the optimizer
 "weight_decay":0.0002,  # weight decay
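A note on `loss_scale`: train.py in this same diff imports FixedLossScaleManager, which is the usual consumer of a fixed scale like 1024. A hedged sketch of that wiring; the tiny network, loss, and optimizer here are placeholders, not code from this repository:

```python
# Sketch: feeding the config's fixed loss scale (1024) to Model training.
import mindspore.nn as nn
from mindspore.train.model import Model
from mindspore.train.loss_scale_manager import FixedLossScaleManager

net = nn.Dense(10, 2)  # placeholder network
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.08)
manager = FixedLossScaleManager(1024, drop_overflow_update=False)
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=manager)
# model.train(...) then scales the loss by 1024 and unscales the gradients.
```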
@@ -203,7 +203,7 @@
 ```text
 # Distributed training
-Usage: bash run_distribute_train.sh [DATASET_PATH] [RANK_SIZE]
+Usage: bash run_distribute_train.sh [DATASET_PATH] [RANK_TABLE]

 # Standalone training
 Usage: bash run_standalone_train.sh [DATASET_PATH] [DEVICE_ID]
@@ -292,15 +292,15 @@ result:{'top_1 acc':0.802303685897436}
 | Model version | Glore_resnet200 | Glore_resnet200 |
 | Resources | Ascend 910; CPU 2.60 GHz, 192 cores; memory 2048 GB | GPU V100 (SXM2) |
 | Upload date | 2021-03-24 | 2021-05-25 |
-| MindSpore version | 1.1.1 | 1.2.0 |
+| MindSpore version | 1.3.0 | 1.2.0 |
 | Dataset | ImageNet2012 | ImageNet2012 |
-| Training parameters | epoch=150, steps per epoch=1251, batch_size=128 | epoch=150, steps per epoch=2502, batch_size=64 |
+| Training parameters | epoch=150, steps per epoch=2001, batch_size=80 | epoch=150, steps per epoch=2502, batch_size=64 |
 | Optimizer | NAG | NAG |
 | Loss function | SoftmaxCrossEntropyExpand | SoftmaxCrossEntropyExpand |
 | Output | probability | probability |
-| Loss | 0.7068262 | 0.55614954 |
-| Speed | 630.343 ms/step (8 devices) | 912.211 ms/step (8 devices) |
-| Total time | 33 h 45 min | 94 h 08 min |
+| Loss | 0.8068262 | 0.55614954 |
+| Speed | 400.343 ms/step (8 devices) | 912.211 ms/step (8 devices) |
+| Total time | 33 h 35 min | 94 h 08 min |
 | Parameters (M) | 70.6 | 70.6 |
 | Fine-tuned checkpoint | 807.57M (.ckpt file) | 808.28M (.ckpt file) |
 | Script | [link](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/cv/glore_res200) |
@@ -314,9 +314,9 @@ result:{'top_1 acc':0.802303685897436}
 | Model version | Glore_resnet200 | Glore_resnet200 |
 | Resources | Ascend 910 | GPU |
 | Upload date | 2021-03-24 | 2021-05-25 |
-| MindSpore version | 1.1.1 | 1.2.0 |
-| Dataset | 120,000 images | 120,000 images |
-| batch_size | 128 | 64 |
+| MindSpore version | 1.3.0 | 1.2.0 |
+| Dataset | 1.2 million images | 1.2 million images |
+| batch_size | 80 | 64 |
 | Output | probability | probability |
 | Accuracy | 8 devices: 80.23% | 8 devices: 80.603% |
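The corrected numbers are mutually consistent. With 8 devices, steps_per_epoch × batch_size × 8 should cover the training set once per epoch: 2001 × 80 × 8 = 1,280,640 and 2502 × 64 × 8 = 1,281,024, both matching ImageNet2012's ~1.28 million training images, which is why the dataset row moves from 120,000 to 1.2 million images. The old Ascend row (1251 × 128 × 8 = 1,281,024) was arithmetically consistent too; only the batch size and the derived steps, loss, speed, and total time change.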
@@ -17,35 +17,28 @@
 echo "=============================================================================================================="
 echo "Please run the script as: "
 echo "bash run_distribute_train.sh DATA_PATH RANK_SIZE"
-echo "For example: bash run_distribute_train.sh /path/dataset 8"
+echo "For example: bash run_distribute_train.sh /path/dataset /path/rank_table"
 echo "It is better to use the absolute path."
 echo "=============================================================================================================="
 set -e
-DATA_PATH=$1
+get_real_path(){
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    echo "$(realpath -m $PWD/$1)"
+  fi
+}
+DATA_PATH=$(get_real_path $1)
 export DATA_PATH=${DATA_PATH}
-RANK_SIZE=$2
-
-EXEC_PATH=$(pwd)
-
-echo "$EXEC_PATH"
-
-test_dist_8pcs()
-{
-    export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_8pcs.json
-    export RANK_SIZE=8
-}
-
-test_dist_2pcs()
-{
-    export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_2pcs.json
-    export RANK_SIZE=2
-}
-
-test_dist_${RANK_SIZE}pcs
+RANK_TABLE=$(get_real_path $2)
+export RANK_TABLE_FILE=${RANK_TABLE}
+export RANK_SIZE=8

 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

-for((i=1;i<${RANK_SIZE};i++))
+for((i=1;i<8;i++))
 do
 rm -rf device$i
 mkdir device$i
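The new get_real_path helper is what lets callers pass relative paths: absolute arguments pass through, anything else is resolved against the current directory with `realpath -m` (the -m flag tolerates nonexistent components). A rough Python equivalent, for illustration only:

```python
# Illustration: the behavior of the script's get_real_path, in Python.
import os

def get_real_path(path: str) -> str:
    # Absolute paths pass through; relative ones resolve against the CWD,
    # without requiring the target to exist (like `realpath -m`).
    if path.startswith("/"):
        return path
    return os.path.realpath(os.path.join(os.getcwd(), path))

print(get_real_path("rank_table_8pcs.json"))  # -> /current/dir/rank_table_8pcs.json
```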
@@ -75,7 +68,7 @@ export DEVICE_ID=0
 export RANK_ID=0
 echo "start training for device 0"
 env > env0.log
-python3 train.py --data_url $1 --isModelArts False --run_distribute True > train0.log 2>&1
+python3 train.py --data_url $1 --isModelArts False --run_distribute True > train0.log 2>&1 &

 if [ $? -eq 0 ];then
 echo "training success"
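The trailing `&` backgrounds device 0's training like the other ranks, so the launcher no longer blocks for the full run. One side effect worth knowing: `$?` after a backgrounded command reports only that the process was spawned, so the `training success` message now confirms launch rather than completion.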
@@ -18,7 +18,7 @@ network config setting, will be used in train.py
 from easydict import EasyDict
 config1 = EasyDict({
     "class_num": 1000,
-    "batch_size": 128,
+    "batch_size": 80,
     "loss_scale": 1024,
     "momentum": 0.08,
     "weight_decay": 0.0002,
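EasyDict exposes these keys as attributes, which is how the rest of the code reads them (e.g. config1.batch_size). A quick self-contained check with the updated value:

```python
# EasyDict attribute access, using the revised batch size from this commit.
from easydict import EasyDict

config1 = EasyDict({"class_num": 1000, "batch_size": 80, "loss_scale": 1024})
assert config1.batch_size == config1["batch_size"] == 80
```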
@@ -30,6 +30,7 @@ from mindspore.train.loss_scale_manager import FixedLossScaleManager
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.communication.management import init, get_group_size, get_rank
 import mindspore.nn as nn
+from mindspore.common import set_seed
 import mindspore.common.initializer as weight_init
 from src.lr_generator import get_lr
 from src.config import config1, config2
@@ -64,6 +65,7 @@ elif args_opt.device_target == "GPU":
 random.seed(1)
 np.random.seed(1)
 de.config.set_seed(1)
+set_seed(1)

 if __name__ == '__main__':
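With set_seed added, every RNG the script touches is pinned. Collected in one runnable sketch; the `de` alias is assumed to be `mindspore.dataset`, matching the `de.config.set_seed` call in the diff:

```python
# Pinning all four RNG sources, mirroring the seeding block in train.py.
import random
import numpy as np
import mindspore.dataset as de  # assumed alias for the diff's `de`
from mindspore.common import set_seed

random.seed(1)         # Python stdlib RNG
np.random.seed(1)      # NumPy RNG (augmentations, shuffles)
de.config.set_seed(1)  # dataset pipeline RNG
set_seed(1)            # MindSpore global RNG (weight init, stochastic ops)
```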
@@ -296,7 +296,6 @@ class Seq2seqTrainOneStepWithLossScaleCell(nn.Cell):
                                                dtype=mstype.float32), name="loss_scale")
         self.add_flags(has_effect=True)

-        self.loss_scalar = P.ScalarSummary()

     def construct(self,
                   source_eos_ids,
@@ -368,5 +367,4 @@ class Seq2seqTrainOneStepWithLossScaleCell(nn.Cell):
         if not overflow:
             self.optimizer(grads)

-        self.loss_scalar("loss", loss)
         return (loss, cond, scaling_sens)
@@ -229,7 +229,6 @@ class Adam(Optimizer):
         self.one = Tensor(np.array([1.0]).astype(np.float32))
         self.realdiv = P.RealDiv()

-        self.lr_scalar = P.ScalarSummary()

     def construct(self, gradients):
         """Adam optimizer."""
@@ -240,8 +239,6 @@ class Adam(Optimizer):
         gradients = self.scale_grad(gradients)
         lr = self.get_lr()

-        self.lr_scalar("learning_rate", lr)
-
         beta1_power = self.beta1_power * self.beta1
         self.beta1_power = beta1_power
         beta2_power = self.beta2_power * self.beta2
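Read together, the last four hunks remove the P.ScalarSummary collectors (loss_scalar, lr_scalar) and their calls from the seq2seq training cell and the custom Adam. This is the part that matches the PR title: summary operators inside construct are not supported when the network runs in PyNative mode, so the scalar logging is dropped outright rather than made mode-conditional. (This rationale is inferred from the PR title.)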