forked from OSSInnovation/mindspore
!6188 fix loss print of bert, add attention about bind cores in README
Merge pull request !6188 from chenhaozhe/fix-bert-loss-print
This commit is contained in:
commit
105422fbb9
|
@ -399,6 +399,9 @@ epoch: 0.0, current epoch percent: 0.002, step: 200, outpus are (Tensor(shape=[1
|
|||
...
|
||||
```
|
||||
|
||||
> **Attention** This will bind the processor cores according to the `device_num` and the total number of processor cores. If you do not want pretraining to bind processor cores, remove the `taskset` operations in `scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py`
|
||||
|
||||
|
||||
## [Evaluation Process](#contents)
|
||||
### Evaluation
|
||||
#### evaluation on cola dataset when running on Ascend
|
||||
|
|
|
@ -78,7 +78,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
|
|||
update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
|
||||
netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
|
||||
model = Model(netwithgrads)
|
||||
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
|
||||
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
|
||||
model.train(epoch_num, dataset, callbacks=callbacks)
|
||||
|
||||
def eval_result_print(assessment_method="accuracy", callback=None):
|
||||
|
|
|
@ -79,7 +79,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
|
|||
update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
|
||||
netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
|
||||
model = Model(netwithgrads)
|
||||
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
|
||||
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
|
||||
model.train(epoch_num, dataset, callbacks=callbacks)
|
||||
|
||||
def eval_result_print(assessment_method="accuracy", callback=None):
|
||||
|
|
|
@ -81,7 +81,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
|
|||
update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
|
||||
netwithgrads = BertSquadCell(network, optimizer=optimizer, scale_update_cell=update_cell)
|
||||
model = Model(netwithgrads)
|
||||
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
|
||||
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
|
||||
model.train(epoch_num, dataset, callbacks=callbacks)
|
||||
|
||||
|
||||
|
|
|
@ -141,14 +141,18 @@ class LossCallBack(Callback):
|
|||
Args:
|
||||
per_print_times (int): Print the loss once every `per_print_times` steps. Default: 1.
|
||||
"""
|
||||
def __init__(self, dataset_size=-1):
    """Store the per-epoch step count used to derive training progress.

    Args:
        dataset_size (int): Number of steps per epoch. A value <= 0 disables
            the epoch-percent computation in ``step_end``. Default: -1.
    """
    super(LossCallBack, self).__init__()
    self._dataset_size = dataset_size
def step_end(self, run_context):
    """Print the loss at the end of every training step.

    When the dataset size is known (> 0), also report the number of
    completed epochs and the fraction of the current epoch finished;
    otherwise fall back to the framework-reported epoch counter.

    Args:
        run_context (RunContext): Context holding the callback parameters
            (``cur_step_num``, ``cur_epoch_num``, ``net_outputs``).
    """
    cb_params = run_context.original_args()
    if self._dataset_size > 0:
        # cur_step_num counts steps across all epochs, so the ratio's
        # integer part is the completed-epoch count and the fractional
        # part is the progress within the current epoch.
        percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
        print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
              .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
    else:
        print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
                                                           str(cb_params.net_outputs)))
|
||||
def LoadNewestCkpt(load_finetune_checkpoint_dir, steps_per_epoch, epoch_num, prefix):
|
||||
"""
|
||||
|
|
|
@ -1,19 +1,34 @@
|
|||
# Contents
|
||||
- [Contents](#contents)
|
||||
- [TinyBERT Description](#tinybert-description)
|
||||
- [Model Architecture](#model-architecture)
|
||||
- [Dataset](#dataset)
|
||||
- [Environment Requirements](#environment-requirements)
|
||||
- [Quick Start](#quick-start)
|
||||
- [Script Description](#script-description)
|
||||
- [Script and Sample Code](#script-and-sample-code)
|
||||
- [Script Parameters](#script-parameters)
|
||||
- [Dataset Preparation](#dataset-preparation)
|
||||
- [Training Process](#training-process)
|
||||
- [Evaluation Process](#evaluation-process)
|
||||
- [Model Description](#model-description)
|
||||
- [Performance](#performance)
|
||||
- [Training Performance](#training-performance)
|
||||
- [Evaluation Performance](#evaluation-performance)
|
||||
- [Script and Sample Code](#script-and-sample-code)
|
||||
- [Script Parameters](#script-parameters)
|
||||
- [General Distill](#general-distill)
|
||||
- [Task Distill](#task-distill)
|
||||
- [Options and Parameters](#options-and-parameters)
|
||||
- [Options:](#options)
|
||||
- [Parameters:](#parameters)
|
||||
- [Training Process](#training-process)
|
||||
- [Training](#training)
|
||||
- [running on Ascend](#running-on-ascend)
|
||||
- [running on GPU](#running-on-gpu)
|
||||
- [Distributed Training](#distributed-training)
|
||||
- [running on Ascend](#running-on-ascend-1)
|
||||
- [running on GPU](#running-on-gpu-1)
|
||||
- [Evaluation Process](#evaluation-process)
|
||||
- [Evaluation](#evaluation)
|
||||
- [evaluation on SST-2 dataset](#evaluation-on-sst-2-dataset)
|
||||
- [evaluation on MNLI dataset](#evaluation-on-mnli-dataset)
|
||||
- [evaluation on QNLI dataset](#evaluation-on-qnli-dataset)
|
||||
- [Model Description](#model-description)
|
||||
- [Performance](#performance)
|
||||
- [training Performance](#training-performance)
|
||||
- [Inference Performance](#inference-performance)
|
||||
- [Description of Random Situation](#description-of-random-situation)
|
||||
- [ModelZoo Homepage](#modelzoo-homepage)
|
||||
|
||||
|
@ -244,6 +259,8 @@ epoch: 2, step: 200, outpus are (Tensor(shape=[1], dtype=Float32, 30.1724), Tens
|
|||
...
|
||||
```
|
||||
|
||||
> **Attention** This will bind the processor cores according to the `device_num` and the total number of processor cores. If you do not want pretraining to bind processor cores, remove the `taskset` operations in `scripts/run_distributed_gd_ascend.sh`
|
||||
|
||||
#### running on GPU
|
||||
Before running the command below, please check that `load_teacher_ckpt_path`, `data_dir`, `schma_dir` and `device_target=GPU` have been set. Please set each path to an absolute full path, e.g.: "/username/checkpoint_100_300.ckpt".
|
||||
```
|
||||
|
|
Loading…
Reference in New Issue