diff --git a/model_zoo/research/nlp/ternarybert/README.md b/model_zoo/research/nlp/ternarybert/README.md
index 30cc280c9cb..102f7ad02e6 100644
--- a/model_zoo/research/nlp/ternarybert/README.md
+++ b/model_zoo/research/nlp/ternarybert/README.md
@@ -80,7 +80,7 @@ Before running the shell script, please set the `task_name`, `model_dir` and `da
 ```text
 .
-└─bert
+└─ternarybert
   ├─README.md
   ├─scripts
     ├─run_train.sh                  # shell script for training phase
@@ -106,26 +106,12 @@ Before running the shell script, please set the `task_name`, `model_dir` and `da
 ```text
-usage: train.py [--h]
-                [--device_target {GPU,Ascend}]
-                [--do_eval {true,false}]
-                [--epoch_size EPOCH_SIZE]
-                [--device_id DEVICE_ID]
-                [--do_shuffle {true,false}]
-                [--enable_data_sink {true,false}]
-                [--save_ckpt_step SAVE_CKPT_STEP]
-                [--eval_ckpt_step EVAL_CKPT_STEP]
-                [--max_ckpt_num MAX_CKPT_NUM]
-                [--data_sink_steps DATA_SINK_STEPS]
-                [--teacher_model_dir TEACHER_MODEL_DIR]
-                [--student_model_dir STUDENT_MODEL_DIR]
-                [--data_dir DATA_DIR]
-                [--output_dir OUTPUT_DIR]
-                [--task_name {sts-b,qnli,mnli}]
-                [--dataset_type DATASET_TYPE]
-                [--seed SEED]
-                [--train_batch_size TRAIN_BATCH_SIZE]
-                [--eval_batch_size EVAL_BATCH_SIZE]
+usage: train.py [--h] [--device_target {GPU,Ascend}] [--do_eval {true,false}] [--epoch_size EPOCH_SIZE]
+                [--device_id DEVICE_ID] [--do_shuffle {true,false}] [--enable_data_sink {true,false}] [--save_ckpt_step SAVE_CKPT_STEP]
+                [--eval_ckpt_step EVAL_CKPT_STEP] [--max_ckpt_num MAX_CKPT_NUM] [--data_sink_steps DATA_SINK_STEPS]
+                [--teacher_model_dir TEACHER_MODEL_DIR] [--student_model_dir STUDENT_MODEL_DIR] [--data_dir DATA_DIR]
+                [--output_dir OUTPUT_DIR] [--task_name {sts-b,qnli,mnli}] [--dataset_type DATASET_TYPE] [--seed SEED]
+                [--train_batch_size TRAIN_BATCH_SIZE] [--eval_batch_size EVAL_BATCH_SIZE]
 
 options:
     --device_target                 Device where the code will be implemented: "GPU" | "Ascend", default is "GPU"
@@ -154,14 +140,8 @@ options:
 ```text
-usage: eval.py [--h]
-               [--device_target {GPU,Ascend}]
-               [--device_id DEVICE_ID]
-               [--model_dir MODEL_DIR]
-               [--data_dir DATA_DIR]
-               [--task_name {sts-b,qnli,mnli}]
-               [--dataset_type DATASET_TYPE]
-               [--batch_size BATCH_SIZE]
+usage: eval.py [--h] [--device_target {GPU,Ascend}] [--device_id DEVICE_ID] [--model_dir MODEL_DIR] [--data_dir DATA_DIR]
+               [--task_name {sts-b,qnli,mnli}] [--dataset_type DATASET_TYPE] [--batch_size BATCH_SIZE]
 
 options:
    --device_target                 Device where the code will be implemented: "GPU" | "Ascend", default is "GPU"
@@ -205,7 +185,7 @@ Parameters for eval:
 
 Parameters for teacher bert network:
    seq_length                      length of input sequence: N, default is 128
-   vocab_size                      size of each embedding vector: N, must be consistant with the dataset you use. Default is 30522
+   vocab_size                      size of each embedding vector: N, must be consistent with the dataset you use. Default is 30522
    hidden_size                     size of bert encoder layers: N
    num_hidden_layers               number of hidden layers: N
    num_attention_heads             number of attention heads: N, default is 12
@@ -224,7 +204,7 @@
 
 Parameters for student bert network:
    seq_length                      length of input sequence: N, default is 128
-   vocab_size                      size of each embedding vector: N, must be consistant with the dataset you use. Default is 30522
+   vocab_size                      size of each embedding vector: N, must be consistent with the dataset you use. Default is 30522
    hidden_size                     size of bert encoder layers: N
    num_hidden_layers               number of hidden layers: N
    num_attention_heads             number of attention heads: N, default is 12
@@ -348,7 +328,7 @@ eval step: 0, Accuracy: 90.625
 eval step: 1, Accuracy: 81.25
 eval step: 2, Accuracy: 79.16666666666666
 ...
-The best Accuracy: 83.70860927152319
+The best Accuracy: 83.58388835685436
 ```
@@ -362,27 +342,13 @@ The best Accuracy: 83.70860927152319
 | Parameters        | GPU                                                    |
 | ----------------- | :---------------------------------------------------- |
 | Model Version     | TernaryBERT                                            |
 | Resource          | NV SMX2 V100-32G                                       |
-| uploaded Date     | 08/20/2020                                             |
+| uploaded Date     | 02/01/2020                                             |
 | MindSpore Version | 1.1.0                                                  |
-| Dataset           | STS-B, QNLI, MNLI                                      |
-| batch_size        | 16, 16, 16                                             |
-| Metric value      | 87.58388835685437, 90.426505583013, 83.70860927152319 |
-| Speed             |                                                        |
-| Total time        |                                                        |
-
-### Inference Performance
-
-| Parameters        | GPU                                                    |
-| ----------------- | :---------------------------------------------------- |
-| Model Version     | TernaryBERT                                            |
-| Resource          | NV SMX2 V100-32G                                       |
-| uploaded Date     | 08/20/2020                                             |
-| MindSpore Version | 1.1.0                                                  |
-| Dataset           | STS-B, QNLI, MNLI                                      |
-| batch_size        | 32, 32, 32                                             |
-| Accuracy          | 87.58388835685437, 90.426505583013, 83.70860927152319 |
-| Speed             |                                                        |
-| Total time        |                                                        |
+| Dataset           | STS-B                                                  |
+| batch_size        | 16                                                     |
+| Metric value      | 87.5839                                                |
+| Speed             | 0.19s/step                                             |
+| Total time        | 6.7min(3epoch, 1p)                                     |
 
 # [Description of Random Situation](#contents)
diff --git a/model_zoo/research/nlp/ternarybert/eval.py b/model_zoo/research/nlp/ternarybert/eval.py
index d1535f95922..27ad90192ca 100644
--- a/model_zoo/research/nlp/ternarybert/eval.py
+++ b/model_zoo/research/nlp/ternarybert/eval.py
@@ -20,7 +20,7 @@ import re
 import argparse
 from mindspore import context
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
-from src.dataset import create_tinybert_dataset
+from src.dataset import create_dataset
 from src.config import eval_cfg, student_net_cfg, task_cfg
 from src.tinybert_model import BertModelCLS
 
@@ -66,15 +66,15 @@ def do_eval_standalone(args_opt):
     context.set_context(mode=context.GRAPH_MODE,
                         device_target=args_opt.device_target,
                         device_id=args.device_id)
-    eval_dataset = create_tinybert_dataset(batch_size=eval_cfg.batch_size,
-                                           device_num=1,
-                                           rank=0,
-                                           do_shuffle='false',
-                                           data_dir=eval_data_dir,
-                                           data_type=args_opt.dataset_type,
-                                           seq_length=task.seq_length,
-                                           task_type=task.task_type,
-                                           drop_remainder=False)
+    eval_dataset = create_dataset(batch_size=eval_cfg.batch_size,
+                                  device_num=1,
+                                  rank=0,
+                                  do_shuffle='false',
+                                  data_dir=eval_data_dir,
+                                  data_type=args_opt.dataset_type,
+                                  seq_length=task.seq_length,
+                                  task_type=task.task_type,
+                                  drop_remainder=False)
     print('eval dataset size:', eval_dataset.get_dataset_size())
     print('eval dataset batch size:', eval_dataset.get_batch_size())
diff --git a/model_zoo/research/nlp/ternarybert/src/cell_wrapper.py b/model_zoo/research/nlp/ternarybert/src/cell_wrapper.py
index 77b6adc155f..2c505b9db04 100644
--- a/model_zoo/research/nlp/ternarybert/src/cell_wrapper.py
+++ b/model_zoo/research/nlp/ternarybert/src/cell_wrapper.py
@@ -313,7 +313,7 @@ class BertNetworkWithLoss(nn.Cell):
 
 class BertTrainWithLossScaleCell(nn.Cell):
     """
-    Especifically defined for finetuning where only four inputs tensor are needed.
+    Specifically defined for finetuning where only four input tensors are needed.
     """
     def __init__(self, network, optimizer, scale_update_cell=None):
         super(BertTrainWithLossScaleCell, self).__init__(auto_prefix=False)
@@ -333,6 +333,8 @@ class BertTrainWithLossScaleCell(nn.Cell):
         if self.reducer_flag:
             self.degree = get_group_size()
             self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
+        self.clip_type = gradient_cfg.clip_type
+        self.clip_value = gradient_cfg.clip_value
         self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
         self.cast = P.Cast()
         self.alloc_status = P.NPUAllocFloatStatus()
@@ -410,7 +412,7 @@ class BertTrainWithLossScaleCell(nn.Cell):
         # apply grad reducer on grads
         grads = self.grad_reducer(grads)
         grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
-        grads = self.hyper_map(F.partial(clip_grad, gradient_cfg.clip_type, gradient_cfg.clip_value), grads)
+        grads = self.hyper_map(F.partial(clip_grad, self.clip_type, self.clip_value), grads)
         restore = ()
         for i in range(self.length):
             restore = restore + (F.assign(weights[i], self.saved_params[i]),)
@@ -437,7 +439,7 @@ class BertTrainWithLossScaleCell(nn.Cell):
 
 class BertTrainCell(nn.Cell):
     """
-    Especifically defined for finetuning where only four inputs tensor are needed.
+    Specifically defined for finetuning where only four input tensors are needed.
     """
     def __init__(self, network, optimizer, sens=1.0):
         super(BertTrainCell, self).__init__(auto_prefix=False)
@@ -448,6 +450,8 @@ class BertTrainCell(nn.Cell):
         self.sens = sens
         self.grad = C.GradOperation(get_by_list=True,
                                     sens_param=True)
+        self.clip_type = gradient_cfg.clip_type
+        self.clip_value = gradient_cfg.clip_value
         self.reducer_flag = False
         self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
@@ -514,7 +518,7 @@ class BertTrainCell(nn.Cell):
             F.control_depend(input_ids, grads)
         # apply grad reducer on grads
         grads = self.grad_reducer(grads)
-        grads = self.hyper_map(F.partial(clip_grad, gradient_cfg.clip_type, gradient_cfg.clip_value), grads)
+        grads = self.hyper_map(F.partial(clip_grad, self.clip_type, self.clip_value), grads)
         restore = ()
         for i in range(self.length):
             restore = restore + (F.assign(weights[i], self.saved_params[i]),)
diff --git a/model_zoo/research/nlp/ternarybert/src/dataset.py b/model_zoo/research/nlp/ternarybert/src/dataset.py
index 44bb05e4c3e..2a59e5c70c2 100644
--- a/model_zoo/research/nlp/ternarybert/src/dataset.py
+++ b/model_zoo/research/nlp/ternarybert/src/dataset.py
@@ -27,8 +27,8 @@ class DataType(Enum):
     MINDRECORD = 2
 
 
-def create_tinybert_dataset(batch_size=32, device_num=1, rank=0, do_shuffle="true", data_dir=None,
-                            data_type='tfrecord', seq_length=128, task_type=mstype.int32, drop_remainder=True):
+def create_dataset(batch_size=32, device_num=1, rank=0, do_shuffle="true", data_dir=None,
+                   data_type='tfrecord', seq_length=128, task_type=mstype.int32, drop_remainder=True):
     """create tinybert dataset"""
     if isinstance(data_dir, list):
         data_files = data_dir
diff --git a/model_zoo/research/nlp/ternarybert/src/utils.py b/model_zoo/research/nlp/ternarybert/src/utils.py
index 5cd1dfae888..7256c4c33e4 100644
--- a/model_zoo/research/nlp/ternarybert/src/utils.py
+++ b/model_zoo/research/nlp/ternarybert/src/utils.py
@@ -89,12 +89,7 @@ class LossCallBack(Callback):
 
 class StepCallBack(Callback):
     """
-    Monitor the loss in training.
-    If the loss in NAN or INF terminating training.
-    Note:
-        if per_print_times is 0 do not print loss.
-    Args:
-        per_print_times (int): Print loss every times. Default: 1.
+    Monitor the step time during training.
     """
     def __init__(self):
         super(StepCallBack, self).__init__()
diff --git a/model_zoo/research/nlp/ternarybert/train.py b/model_zoo/research/nlp/ternarybert/train.py
index e9e4578486f..93d35c93b28 100644
--- a/model_zoo/research/nlp/ternarybert/train.py
+++ b/model_zoo/research/nlp/ternarybert/train.py
@@ -21,7 +21,7 @@ from mindspore import context
 from mindspore.train.model import Model
 from mindspore.nn.optim import AdamWeightDecay
 from mindspore import set_seed
-from src.dataset import create_tinybert_dataset
+from src.dataset import create_dataset
 from src.utils import StepCallBack, ModelSaveCkpt, EvalCallBack, BertLearningRate
 from src.config import train_cfg, eval_cfg, teacher_net_cfg, student_net_cfg, task_cfg
 from src.cell_wrapper import BertNetworkWithLoss, BertTrainCell
@@ -86,26 +86,26 @@ def run_task_distill(args_opt):
         rank = 0
         device_num = 1
 
-    train_dataset = create_tinybert_dataset(batch_size=train_cfg.batch_size,
-                                            device_num=device_num,
-                                            rank=rank,
-                                            do_shuffle=args_opt.do_shuffle,
-                                            data_dir=train_data_dir,
-                                            data_type=args_opt.dataset_type,
-                                            seq_length=task.seq_length,
-                                            task_type=task.task_type,
-                                            drop_remainder=True)
+    train_dataset = create_dataset(batch_size=train_cfg.batch_size,
+                                   device_num=device_num,
+                                   rank=rank,
+                                   do_shuffle=args_opt.do_shuffle,
+                                   data_dir=train_data_dir,
+                                   data_type=args_opt.dataset_type,
+                                   seq_length=task.seq_length,
+                                   task_type=task.task_type,
+                                   drop_remainder=True)
     dataset_size = train_dataset.get_dataset_size()
     print('train dataset size:', dataset_size)
-    eval_dataset = create_tinybert_dataset(batch_size=eval_cfg.batch_size,
-                                           device_num=device_num,
-                                           rank=rank,
-                                           do_shuffle=args_opt.do_shuffle,
-                                           data_dir=eval_data_dir,
-                                           data_type=args_opt.dataset_type,
-                                           seq_length=task.seq_length,
-                                           task_type=task.task_type,
-                                           drop_remainder=False)
+    eval_dataset = create_dataset(batch_size=eval_cfg.batch_size,
+                                  device_num=device_num,
+                                  rank=rank,
+                                  do_shuffle=args_opt.do_shuffle,
+                                  data_dir=eval_data_dir,
+                                  data_type=args_opt.dataset_type,
+                                  seq_length=task.seq_length,
+                                  task_type=task.task_type,
+                                  drop_remainder=False)
     print('eval dataset size:', eval_dataset.get_dataset_size())
 
     if args_opt.enable_data_sink == 'true':
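Note: as a quick sanity check of the `create_tinybert_dataset` → `create_dataset` rename, below is a minimal usage sketch that mirrors the post-rename call sites in `eval.py` and `train.py` above. The data directory, the STS-B-style `task_type=mstype.float32`, and the `mindspore.common.dtype` import alias are hypothetical placeholders for illustration, not part of this patch.

```python
# Sketch only: mirrors the renamed helper's call sites in train.py/eval.py.
# The path and the regression-style task_type below are hypothetical placeholders.
import mindspore.common.dtype as mstype

from src.dataset import create_dataset

eval_dataset = create_dataset(batch_size=32,                     # eval_cfg.batch_size in eval.py
                              device_num=1,                      # single-device evaluation
                              rank=0,
                              do_shuffle='false',                # string flag, not a Python bool
                              data_dir='/path/to/sts-b/eval',    # hypothetical dataset directory
                              data_type='tfrecord',
                              seq_length=128,
                              task_type=mstype.float32,          # e.g. regression label type for sts-b
                              drop_remainder=False)
print('eval dataset size:', eval_dataset.get_dataset_size())
print('eval dataset batch size:', eval_dataset.get_batch_size())
```

The `cell_wrapper.py` hunks follow a related pattern: reading `gradient_cfg.clip_type`/`gradient_cfg.clip_value` once in `__init__` and using the cached `self.clip_type`/`self.clip_value` inside `construct` avoids repeated lookups on the global config object during graph construction, which is presumably the motivation for that change.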