From 9748a3d2ee9a8665974b7c0c7f8fb1de261090e8 Mon Sep 17 00:00:00 2001 From: wangmin Date: Mon, 31 Aug 2020 10:31:25 +0800 Subject: [PATCH] train parameters for GPU --- .../scripts/run_distribute_train.sh | 20 +++++++++++++------ .../official/cv/resnet_thor/src/config.py | 12 +++++------ model_zoo/official/cv/resnet_thor/train.py | 1 + 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/model_zoo/official/cv/resnet_thor/scripts/run_distribute_train.sh b/model_zoo/official/cv/resnet_thor/scripts/run_distribute_train.sh index c68a4fa159..534a6006c9 100755 --- a/model_zoo/official/cv/resnet_thor/scripts/run_distribute_train.sh +++ b/model_zoo/official/cv/resnet_thor/scripts/run_distribute_train.sh @@ -32,13 +32,21 @@ then exit 1 fi -BASE_PATH=$(cd "`dirname $0`" || exit; pwd) -cd $BASE_PATH/../ || exit +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +PATH1=$(get_real_path $1) +PATH2=$(get_real_path $2) ulimit -u unlimited export DEVICE_NUM=$3 export RANK_SIZE=$3 -export RANK_TABLE_FILE=$1 +export RANK_TABLE_FILE=$PATH1 for((i=0; i<${DEVICE_NUM}; i++)) do @@ -46,12 +54,12 @@ do export RANK_ID=$i rm -rf ./train_parallel$i mkdir ./train_parallel$i - cp *.py ./train_parallel$i - cp -r ./src ./train_parallel$i + cp ../*.py ./train_parallel$i + cp -r ../src ./train_parallel$i cd ./train_parallel$i || exit echo "start training for rank $RANK_ID, device $DEVICE_ID" env > env.log - python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$2 > log 2>&1 & + python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 > log 2>&1 & cd .. 
done diff --git a/model_zoo/official/cv/resnet_thor/src/config.py b/model_zoo/official/cv/resnet_thor/src/config.py index 00de8f985f..4acbebf3f0 100644 --- a/model_zoo/official/cv/resnet_thor/src/config.py +++ b/model_zoo/official/cv/resnet_thor/src/config.py @@ -46,17 +46,17 @@ config_gpu = ed({ "loss_scale": 128, "momentum": 0.9, "weight_decay": 5e-4, - "epoch_size": 45, + "epoch_size": 40, "save_checkpoint": True, "save_checkpoint_epochs": 1, "keep_checkpoint_max": 15, "save_checkpoint_path": "./", "use_label_smooth": True, "label_smooth_factor": 0.1, - "lr_init": 0.04, - "lr_decay": 5, - "lr_end_epoch": 58, - "damping_init": 0.02, - "damping_decay": 0.87, + "lr_init": 0.05672, + "lr_decay": 4.9687, + "lr_end_epoch": 50, + "damping_init": 0.02345, + "damping_decay": 0.5467, "frequency": 834, }) diff --git a/model_zoo/official/cv/resnet_thor/train.py b/model_zoo/official/cv/resnet_thor/train.py index cd6d84a8ee..025431d46c 100644 --- a/model_zoo/official/cv/resnet_thor/train.py +++ b/model_zoo/official/cv/resnet_thor/train.py @@ -109,6 +109,7 @@ if __name__ == '__main__': init() context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) + auto_parallel_context().set_all_reduce_fusion_split_indices([107]) ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/" # create dataset