forked from OSSInnovation/mindspore
!6188 fix loss print of bert, add attention about bind cores in README
Merge pull request !6188 from chenhaozhe/fix-bert-loss-print
This commit is contained in:
commit
105422fbb9
|
@ -399,6 +399,9 @@ epoch: 0.0, current epoch percent: 0.002, step: 200, outpus are (Tensor(shape=[1
|
|||
...
|
||||
```
|
||||
|
||||
> **Attention** This will bind the processor cores according to the `device_num` and the total number of processor cores. If you do not want pretraining to bind processor cores, remove the `taskset` operations in `scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py`
|
||||
|
||||
|
||||
## [Evaluation Process](#contents)
|
||||
### Evaluation
|
||||
#### evaluation on cola dataset when running on Ascend
|
||||
|
|
|
@ -78,7 +78,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
|
|||
update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
|
||||
netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
|
||||
model = Model(netwithgrads)
|
||||
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
|
||||
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
|
||||
model.train(epoch_num, dataset, callbacks=callbacks)
|
||||
|
||||
def eval_result_print(assessment_method="accuracy", callback=None):
|
||||
|
|
|
@ -79,7 +79,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
|
|||
update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
|
||||
netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
|
||||
model = Model(netwithgrads)
|
||||
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
|
||||
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
|
||||
model.train(epoch_num, dataset, callbacks=callbacks)
|
||||
|
||||
def eval_result_print(assessment_method="accuracy", callback=None):
|
||||
|
|
|
@ -81,7 +81,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
|
|||
update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
|
||||
netwithgrads = BertSquadCell(network, optimizer=optimizer, scale_update_cell=update_cell)
|
||||
model = Model(netwithgrads)
|
||||
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
|
||||
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
|
||||
model.train(epoch_num, dataset, callbacks=callbacks)
|
||||
|
||||
|
||||
|
|
|
@ -141,14 +141,18 @@ class LossCallBack(Callback):
|
|||
Args:
|
||||
per_print_times (int): Print the loss once every `per_print_times` steps. Default: 1.
|
||||
"""
|
||||
def __init__(self, dataset_size=-1):
    """Store the per-epoch step count used to derive training progress.

    Args:
        dataset_size (int): Number of steps per epoch. A value <= 0 disables
            the epoch-percent computation in ``step_end``. Default: -1.
    """
    super(LossCallBack, self).__init__()
    self._dataset_size = dataset_size
def step_end(self, run_context):
    """Print the loss at the end of every training step.

    When the dataset size is known (> 0), also report the number of
    completed epochs and the fraction of the current epoch finished;
    otherwise fall back to the framework-reported epoch counter.

    Args:
        run_context (RunContext): Context holding the callback parameters
            (``cur_step_num``, ``cur_epoch_num``, ``net_outputs``).
    """
    cb_params = run_context.original_args()
    if self._dataset_size > 0:
        # cur_step_num counts steps across all epochs, so the ratio's
        # integer part is the completed-epoch count and the fractional
        # part is the progress within the current epoch.
        percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
        print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
              .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
    else:
        print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
                                                           str(cb_params.net_outputs)))
|
||||
def LoadNewestCkpt(load_finetune_checkpoint_dir, steps_per_epoch, epoch_num, prefix):
|
||||
"""
|
||||
|
|
|
@ -1,19 +1,34 @@
|
|||
# Contents
|
||||
- [Contents](#contents)
|
||||
- [TinyBERT Description](#tinybert-description)
|
||||
- [Model Architecture](#model-architecture)
|
||||
- [Dataset](#dataset)
|
||||
- [Environment Requirements](#environment-requirements)
|
||||
- [Quick Start](#quick-start)
|
||||
- [Script Description](#script-description)
|
||||
- [Script and Sample Code](#script-and-sample-code)
|
||||
- [Script Parameters](#script-parameters)
|
||||
- [Dataset Preparation](#dataset-preparation)
|
||||
- [Training Process](#training-process)
|
||||
- [Evaluation Process](#evaluation-process)
|
||||
- [Model Description](#model-description)
|
||||
- [Performance](#performance)
|
||||
- [Training Performance](#training-performance)
|
||||
- [Evaluation Performance](#evaluation-performance)
|
||||
- [Script and Sample Code](#script-and-sample-code)
|
||||
- [Script Parameters](#script-parameters)
|
||||
- [General Distill](#general-distill)
|
||||
- [Task Distill](#task-distill)
|
||||
- [Options and Parameters](#options-and-parameters)
|
||||
- [Options:](#options)
|
||||
- [Parameters:](#parameters)
|
||||
- [Training Process](#training-process)
|
||||
- [Training](#training)
|
||||
- [running on Ascend](#running-on-ascend)
|
||||
- [running on GPU](#running-on-gpu)
|
||||
- [Distributed Training](#distributed-training)
|
||||
- [running on Ascend](#running-on-ascend-1)
|
||||
- [running on GPU](#running-on-gpu-1)
|
||||
- [Evaluation Process](#evaluation-process)
|
||||
- [Evaluation](#evaluation)
|
||||
- [evaluation on SST-2 dataset](#evaluation-on-sst-2-dataset)
|
||||
- [evaluation on MNLI dataset](#evaluation-on-mnli-dataset)
|
||||
- [evaluation on QNLI dataset](#evaluation-on-qnli-dataset)
|
||||
- [Model Description](#model-description)
|
||||
- [Performance](#performance)
|
||||
- [training Performance](#training-performance)
|
||||
- [Inference Performance](#inference-performance)
|
||||
- [Description of Random Situation](#description-of-random-situation)
|
||||
- [ModelZoo Homepage](#modelzoo-homepage)
|
||||
|
||||
|
@ -244,6 +259,8 @@ epoch: 2, step: 200, outpus are (Tensor(shape=[1], dtype=Float32, 30.1724), Tens
|
|||
...
|
||||
```
|
||||
|
||||
> **Attention** This will bind the processor cores according to the `device_num` and the total number of processor cores. If you do not want pretraining to bind processor cores, remove the `taskset` operations in `scripts/run_distributed_gd_ascend.sh`
|
||||
|
||||
#### running on GPU
|
||||
Before running the command below, please check that `load_teacher_ckpt_path`, `data_dir`, `schma_dir` and `device_target=GPU` have been set. Please set each path to an absolute full path, e.g.: "/username/checkpoint_100_300.ckpt".
|
||||
```
|
||||
|
|
Loading…
Reference in New Issue