From d63b1f16bdc81b279d7daa386c4b9097b8f6d209 Mon Sep 17 00:00:00 2001
From: wangmin
Date: Thu, 3 Sep 2020 16:47:06 +0800
Subject: [PATCH] support startup bert_thor script with relative path

---
 model_zoo/official/nlp/bert_thor/README.md                    | 4 ++--
 model_zoo/official/nlp/bert_thor/pretrain_eval.py             | 2 --
 .../official/nlp/bert_thor/scripts/run_distribute_pretrain.sh | 4 ++++
 .../official/nlp/bert_thor/scripts/run_standalone_pretrain.sh | 4 ++++
 model_zoo/official/nlp/bert_thor/src/lr_generator.py          | 2 +-
 5 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/model_zoo/official/nlp/bert_thor/README.md b/model_zoo/official/nlp/bert_thor/README.md
index c8995699653..80ed17b42c0 100644
--- a/model_zoo/official/nlp/bert_thor/README.md
+++ b/model_zoo/official/nlp/bert_thor/README.md
@@ -128,12 +128,12 @@ Parameters for both training and inference can be set in config.py.
 ```
 sh run_distribute_pretrain.sh [DEVICE_NUM] [EPOCH_SIZE] [DATA_DIR] [SCHEMA_DIR] [RANK_TABLE_FILE]
 ```
-We need three parameters for this scripts.
+We need five parameters for this script.
 - `DEVICE_NUM`: the device number for distributed train.
 - `EPOCH_SIZE`: Epoch size used in the model
 - `DATA_DIR`:Data path, it is better to use absolute path.
 - `SCHEMA_DIR `:Schema path, it is better to use absolute path
-- `RANK_TABLE_FILE`: the path of rank_table.json
+- `RANK_TABLE_FILE`: rank table file in JSON format
 
 Training result will be stored in the current path, whose folder name begins with the file name that the user defines. Under this, you can find checkpoint file together with result like the followings in log.
 ```

diff --git a/model_zoo/official/nlp/bert_thor/pretrain_eval.py b/model_zoo/official/nlp/bert_thor/pretrain_eval.py
index 0e64c61700e..4cb501a4a62 100644
--- a/model_zoo/official/nlp/bert_thor/pretrain_eval.py
+++ b/model_zoo/official/nlp/bert_thor/pretrain_eval.py
@@ -153,10 +153,8 @@ def MLM_eval():
     net = Model(net_for_pretraining, eval_network=net_for_pretraining, eval_indexes=[0, 1, 2],
                 metrics={'name': myMetric()})
     res = net.eval(dataset, dataset_sink_mode=False)
-    print("==============================================================")
     for _, v in res.items():
         print("Accuracy is: ", v)
-    print("==============================================================")
 
 
 if __name__ == "__main__":

diff --git a/model_zoo/official/nlp/bert_thor/scripts/run_distribute_pretrain.sh b/model_zoo/official/nlp/bert_thor/scripts/run_distribute_pretrain.sh
index 3ac2db0206d..1b695fd28c4 100644
--- a/model_zoo/official/nlp/bert_thor/scripts/run_distribute_pretrain.sh
+++ b/model_zoo/official/nlp/bert_thor/scripts/run_distribute_pretrain.sh
@@ -25,6 +25,9 @@
 EPOCH_SIZE=$2
 DATA_DIR=$3
 SCHEMA_DIR=$4
+BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
+cd $BASE_PATH/ || exit
+
 ulimit -u unlimited
 export RANK_TABLE_FILE=$5
 export RANK_SIZE=$1
@@ -55,6 +58,7 @@ do
     --load_checkpoint_path="" \
     --save_checkpoint_path='./' \
     --save_checkpoint_steps=1000 \
+    --train_steps=3000 \
     --save_checkpoint_num=30 \
     --data_dir=$DATA_DIR \
     --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &

diff --git a/model_zoo/official/nlp/bert_thor/scripts/run_standalone_pretrain.sh b/model_zoo/official/nlp/bert_thor/scripts/run_standalone_pretrain.sh
index 35d18c2ad00..87098430f02 100644
--- a/model_zoo/official/nlp/bert_thor/scripts/run_standalone_pretrain.sh
+++ b/model_zoo/official/nlp/bert_thor/scripts/run_standalone_pretrain.sh
@@ -24,6 +24,9 @@
 EPOCH_SIZE=$2
 DATA_DIR=$3
 SCHEMA_DIR=$4
+BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
+cd $BASE_PATH/ || exit
+
 ulimit -u unlimited
 export DEVICE_ID=$1
 export RANK_SIZE=1
@@ -51,6 +54,7 @@ python run_pretrain.py \
 --load_checkpoint_path="" \
 --save_checkpoint_path='./' \
 --save_checkpoint_steps=5000 \
+--train_steps=-1 \
 --save_checkpoint_num=20 \
 --data_dir=$DATA_DIR \
 --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &

diff --git a/model_zoo/official/nlp/bert_thor/src/lr_generator.py b/model_zoo/official/nlp/bert_thor/src/lr_generator.py
index d3ca9f458a2..cb761cdccb2 100644
--- a/model_zoo/official/nlp/bert_thor/src/lr_generator.py
+++ b/model_zoo/official/nlp/bert_thor/src/lr_generator.py
@@ -55,7 +55,7 @@ def get_poly_lr(global_step, lr_init, lr_end, lr_max, warmup_steps, total_steps,
     return learning_rate
 
 
-# bert kfac hyperparam setting
+# bert thor hyperparam setting
 def get_bert_lr():
     learning_rate = Tensor(
         get_poly_lr(global_step=0, lr_init=0.0, lr_end=1e-6, lr_max=3.1e-3, warmup_steps=0, total_steps=30000,
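
The `BASE_PATH` lines added to both launch scripts implement a standard shell pattern: resolve the directory that contains the script itself and `cd` into it, so the relative paths used inside the script keep working no matter which directory the user launches it from. A minimal sketch of the pattern, assuming a hypothetical stand-alone `demo.sh` (the `echo` line is illustrative only, not part of this patch):

```
#!/bin/bash
# Resolve the directory containing this script. `dirname "$0"` yields the
# script's path as given on the command line (possibly relative); the
# subshell cd + pwd converts it to an absolute path without changing the
# caller's working directory.
BASE_PATH=$(cd "$(dirname "$0")" || exit; pwd)

# Enter that directory so relative paths below (configs, logs, checkpoints)
# resolve against the script's location rather than the caller's cwd.
cd "$BASE_PATH" || exit

echo "running from: $(pwd)"  # stand-in for the real launch logic
```

The patch spells the same idea with backticks (`` `dirname $0` ``); the `$(...)` form used above is the equivalent modern syntax.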