From 64a3373f31e9c584da4a54b25bb4e0821e70bfe0 Mon Sep 17 00:00:00 2001
From: zhouyaqiang
Date: Fri, 13 Nov 2020 09:35:23 +0800
Subject: [PATCH] extend hccl time out and modify lr schedule

---
 model_zoo/official/cv/inceptionv3/README.md           | 1 +
 .../cv/inceptionv3/scripts/run_distribute_train.sh    | 2 +-
 model_zoo/official/cv/inceptionv3/src/lr_generator.py | 7 ++++---
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/model_zoo/official/cv/inceptionv3/README.md b/model_zoo/official/cv/inceptionv3/README.md
index a59a9446137..a9e5102c10a 100644
--- a/model_zoo/official/cv/inceptionv3/README.md
+++ b/model_zoo/official/cv/inceptionv3/README.md
@@ -133,6 +133,7 @@ sh scripts/run_standalone_train.sh DEVICE_ID DATA_PATH
 ```
 
 > Notes: RANK_TABLE_FILE can refer to [Link](https://www.mindspore.cn/tutorial/training/en/master/advanced_use/distributed_training_ascend.html), and the device_ip can be obtained as described at [Link](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
+ For large models such as InceptionV3, it is better to export the environment variable `export HCCL_CONNECT_TIMEOUT=600` to extend the HCCL connection-check timeout from the default 120 seconds to 600 seconds. Otherwise the connection may time out, because compilation time increases with model size.
 
 > This is a processor-core binding operation based on `device_num` and the total number of processors. If you do not expect this behavior, remove the `taskset` operations in `scripts/run_distribute_train.sh`
 
diff --git a/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train.sh b/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train.sh
index 4e16345ada2..b73a6602293 100644
--- a/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train.sh
+++ b/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train.sh
@@ -17,7 +17,7 @@ DATA_DIR=$2
 export RANK_TABLE_FILE=$1
 export RANK_SIZE=8
-
+export HCCL_CONNECT_TIMEOUT=600
 cores=`cat /proc/cpuinfo|grep "processor" |wc -l`
 echo "the number of logical core" $cores
 
diff --git a/model_zoo/official/cv/inceptionv3/src/lr_generator.py b/model_zoo/official/cv/inceptionv3/src/lr_generator.py
index 6d78a3e59a7..b112112955c 100644
--- a/model_zoo/official/cv/inceptionv3/src/lr_generator.py
+++ b/model_zoo/official/cv/inceptionv3/src/lr_generator.py
@@ -45,7 +45,7 @@ def _generate_steps_lr(lr_init, lr_max, total_steps, warmup_steps, global_step=0
         else:
             lr = lr_max * 0.001
         lr_each_step.append(lr)
-    lr_each_step = np.array(lr_each_step).astype(np.float32)[global_step:]
+    lr_each_step = np.array(lr_each_step).astype(np.float32)[global_step:]
     return lr_each_step
 
 
@@ -81,7 +81,7 @@ def _generate_exponential_lr(lr_init, lr_max, total_steps, warmup_steps, steps_p
     return lr_each_step
 
 
-def _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps):
+def _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps, global_step=0):
     """
     Applies cosine decay to generate learning rate array.
 
@@ -105,6 +105,7 @@ def _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps):
             cosine_decay = 0.5 * (1 + math.cos(math.pi * (i-warmup_steps) / decay_steps))
             lr = (lr_max-lr_end)*cosine_decay + lr_end
         lr_each_step.append(lr)
+    lr_each_step = np.array(lr_each_step).astype(np.float32)[global_step:]
     return lr_each_step
 
 
@@ -155,7 +156,7 @@ def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch
     elif lr_decay_mode == 'steps_decay':
         lr_each_step = _generate_exponential_lr(lr_init, lr_max, total_steps, warmup_steps, steps_per_epoch)
     elif lr_decay_mode == 'cosine':
-        lr_each_step = _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps)
+        lr_each_step = _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps, global_step)
     else:
         lr_each_step = _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps)
     learning_rate = np.array(lr_each_step).astype(np.float32)
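For reference, below is a minimal standalone sketch of the patched cosine schedule. The cosine-decay body and the `[global_step:]` slice are taken from the hunks above; the linear-warmup branch and the call at the bottom are illustrative assumptions, since the warmup code is not part of this diff's context. The slice is the behavioral change: a run resumed from a checkpoint receives the remainder of the schedule instead of replaying the warmup.

```python
import math

import numpy as np


def generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps, global_step=0):
    """Linear warmup from lr_init to lr_max, then cosine decay down to lr_end."""
    decay_steps = total_steps - warmup_steps
    lr_each_step = []
    for i in range(total_steps):
        if i < warmup_steps:
            # Assumed warmup formula; the warmup branch is outside this diff.
            lr = lr_init + (lr_max - lr_init) * i / warmup_steps
        else:
            # Cosine decay, as in the hunk above.
            cosine_decay = 0.5 * (1 + math.cos(math.pi * (i - warmup_steps) / decay_steps))
            lr = (lr_max - lr_end) * cosine_decay + lr_end
        lr_each_step.append(lr)
    # The slice added by this commit: drop the steps already consumed before
    # the checkpoint, so a resumed run continues the schedule mid-way.
    return np.array(lr_each_step).astype(np.float32)[global_step:]


# Hypothetical resume: 500 of 1000 steps already done leaves 500 lr values.
lr = generate_cosine_lr(0.0, 0.0, 0.1, total_steps=1000, warmup_steps=100, global_step=500)
assert lr.shape == (500,)
```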