Fix gradient clipping config usage, rename create_tinybert_dataset to create_dataset, and update the README
commit 0b4b7dfab0
parent ae39553df1
@@ -80,7 +80,7 @@ Before running the shell script, please set the `task_name`, `model_dir` and `data_dir`
 ```text

 .
-└─bert
+└─ternarybert
   ├─README.md
   ├─scripts
     ├─run_train.sh                        # shell script for training phase
@@ -106,26 +106,12 @@ Before running the shell script, please set the `task_name`, `model_dir` and `data_dir`

 ```text

-usage: train.py [--h]
-                [--device_target {GPU,Ascend}]
-                [--do_eval {true,false}]
-                [--epoch_size EPOCH_SIZE]
-                [--device_id DEVICE_ID]
-                [--do_shuffle {true,false}]
-                [--enable_data_sink {true,false}]
-                [--save_ckpt_step SAVE_CKPT_STEP]
-                [--eval_ckpt_step EVAL_CKPT_STEP]
-                [--max_ckpt_num MAX_CKPT_NUM]
-                [--data_sink_steps DATA_SINK_STEPS]
-                [--teacher_model_dir TEACHER_MODEL_DIR]
-                [--student_model_dir STUDENT_MODEL_DIR]
-                [--data_dir DATA_DIR]
-                [--output_dir OUTPUT_DIR]
-                [--task_name {sts-b,qnli,mnli}]
-                [--dataset_type DATASET_TYPE]
-                [--seed SEED]
-                [--train_batch_size TRAIN_BATCH_SIZE]
-                [--eval_batch_size EVAL_BATCH_SIZE]
+usage: train.py [--h] [--device_target {GPU,Ascend}] [--do_eval {true,false}] [--epoch_size EPOCH_SIZE]
+                [--device_id DEVICE_ID] [--do_shuffle {true,false}] [--enable_data_sink {true,false}] [--save_ckpt_step SAVE_CKPT_STEP]
+                [--eval_ckpt_step EVAL_CKPT_STEP] [--max_ckpt_num MAX_CKPT_NUM] [--data_sink_steps DATA_SINK_STEPS]
+                [--teacher_model_dir TEACHER_MODEL_DIR] [--student_model_dir STUDENT_MODEL_DIR] [--data_dir DATA_DIR]
+                [--output_dir OUTPUT_DIR] [--task_name {sts-b,qnli,mnli}] [--dataset_type DATASET_TYPE] [--seed SEED]
+                [--train_batch_size TRAIN_BATCH_SIZE] [--eval_batch_size EVAL_BATCH_SIZE]

 options:
     --device_target            Device where the code will be implemented: "GPU" | "Ascend", default is "GPU"
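The reflowed usage block above is argparse's default help output once the flags are no longer listed one per line. For orientation, a minimal sketch of the parser that would print such a usage line; the flag names and choices come from the usage string itself, while the defaults and help texts below are assumptions, not the repository's values:

```python
import argparse

# Sketch of the parser implied by the usage string above; only a subset of
# the flags is shown, and the defaults here are assumptions.
parser = argparse.ArgumentParser(prog='train.py', description='ternarybert task distill')
parser.add_argument('--device_target', type=str, default='GPU', choices=['GPU', 'Ascend'],
                    help='Device where the code will be implemented')
parser.add_argument('--do_eval', type=str, default='true', choices=['true', 'false'],
                    help='Whether to evaluate during training')
parser.add_argument('--epoch_size', type=int, default=3, help='Number of training epochs')
parser.add_argument('--task_name', type=str, default='sts-b', choices=['sts-b', 'qnli', 'mnli'],
                    help='GLUE task to train on')
args = parser.parse_args()
```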
@@ -154,14 +140,8 @@ options:

 ```text

-usage: eval.py [--h]
-               [--device_target {GPU,Ascend}]
-               [--device_id DEVICE_ID]
-               [--model_dir MODEL_DIR]
-               [--data_dir DATA_DIR]
-               [--task_name {sts-b,qnli,mnli}]
-               [--dataset_type DATASET_TYPE]
-               [--batch_size BATCH_SIZE]
+usage: eval.py [--h] [--device_target {GPU,Ascend}] [--device_id DEVICE_ID] [--model_dir MODEL_DIR] [--data_dir DATA_DIR]
+               [--task_name {sts-b,qnli,mnli}] [--dataset_type DATASET_TYPE] [--batch_size BATCH_SIZE]

 options:
     --device_target            Device where the code will be implemented: "GPU" | "Ascend", default is "GPU"
@@ -205,7 +185,7 @@ Parameters for eval:

 Parameters for teacher bert network:
     seq_length                      length of input sequence: N, default is 128
-    vocab_size                      size of each embedding vector: N, must be consistant with the dataset you use. Default is 30522
+    vocab_size                      size of each embedding vector: N, must be consistent with the dataset you use. Default is 30522
     hidden_size                     size of bert encoder layers: N
     num_hidden_layers               number of hidden layers: N
     num_attention_heads             number of attention heads: N, default is 12
@@ -224,7 +204,7 @@ Parameters for teacher bert network:

 Parameters for student bert network:
     seq_length                      length of input sequence: N, default is 128
-    vocab_size                      size of each embedding vector: N, must be consistant with the dataset you use. Default is 30522
+    vocab_size                      size of each embedding vector: N, must be consistent with the dataset you use. Default is 30522
     hidden_size                     size of bert encoder layers: N
     num_hidden_layers               number of hidden layers: N
     num_attention_heads             number of attention heads: N, default is 12
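The two parameter lists above describe the teacher and student network configs imported from src/config.py elsewhere in this commit. As a rough sketch of the shape such a config takes, keeping only the documented fields; the class layout and any value not quoted above (notably hidden_size) are assumptions:

```python
# Sketch of a BERT net config holding the fields documented above. The class
# shape and hidden_size value are assumptions; the defaults quoted in the
# README (seq_length=128, vocab_size=30522, num_attention_heads=12) are kept.
class BertNetConfig:
    def __init__(self, seq_length=128, vocab_size=30522, hidden_size=768,
                 num_hidden_layers=12, num_attention_heads=12):
        self.seq_length = seq_length                # length of input sequence
        self.vocab_size = vocab_size                # must be consistent with the dataset
        self.hidden_size = hidden_size              # size of bert encoder layers
        self.num_hidden_layers = num_hidden_layers  # number of hidden layers
        self.num_attention_heads = num_attention_heads

teacher_net_cfg = BertNetConfig()   # full-precision teacher
student_net_cfg = BertNetConfig()   # ternary-weight student
```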
@@ -348,7 +328,7 @@ eval step: 0, Accuracy: 90.625
 eval step: 1, Accuracy: 81.25
 eval step: 2, Accuracy: 79.16666666666666
 ...
-The best Accuracy: 83.70860927152319
+The best Accuracy: 83.58388835685436

 ```
@@ -362,27 +342,13 @@ The best Accuracy: 83.70860927152319
 | ----------------- | :---------------------------------------------------- |
 | Model Version     | TernaryBERT                                            |
 | Resource          | NV SMX2 V100-32G                                       |
-| uploaded Date     | 08/20/2020                                             |
+| uploaded Date     | 02/01/2020                                             |
 | MindSpore Version | 1.1.0                                                  |
-| Dataset           | STS-B, QNLI, MNLI                                      |
-| batch_size        | 16, 16, 16                                             |
-| Metric value      | 87.58388835685437, 90.426505583013, 83.70860927152319 |
-| Speed             |                                                        |
-| Total time        |                                                        |
+| Dataset           | STS-B                                                  |
+| batch_size        | 16                                                     |
+| Metric value      | 87.5839                                                |
+| Speed             | 0.19s/step                                             |
+| Total time        | 6.7min(3epoch, 1p)                                     |

-### Inference Performance
-
-| Parameters        | GPU                                                    |
-| ----------------- | :---------------------------------------------------- |
-| Model Version     | TernaryBERT                                            |
-| Resource          | NV SMX2 V100-32G                                       |
-| uploaded Date     | 08/20/2020                                             |
-| MindSpore Version | 1.1.0                                                  |
-| Dataset           | STS-B, QNLI, MNLI                                      |
-| batch_size        | 32, 32, 32                                             |
-| Accuracy          | 87.58388835685437, 90.426505583013, 83.70860927152319 |
-| Speed             |                                                        |
-| Total time        |                                                        |
-
 # [Description of Random Situation](#contents)

@@ -20,7 +20,7 @@ import re
 import argparse
 from mindspore import context
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
-from src.dataset import create_tinybert_dataset
+from src.dataset import create_dataset
 from src.config import eval_cfg, student_net_cfg, task_cfg
 from src.tinybert_model import BertModelCLS

@@ -66,15 +66,15 @@ def do_eval_standalone(args_opt):

     context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, device_id=args.device_id)

-    eval_dataset = create_tinybert_dataset(batch_size=eval_cfg.batch_size,
-                                           device_num=1,
-                                           rank=0,
-                                           do_shuffle='false',
-                                           data_dir=eval_data_dir,
-                                           data_type=args_opt.dataset_type,
-                                           seq_length=task.seq_length,
-                                           task_type=task.task_type,
-                                           drop_remainder=False)
+    eval_dataset = create_dataset(batch_size=eval_cfg.batch_size,
+                                  device_num=1,
+                                  rank=0,
+                                  do_shuffle='false',
+                                  data_dir=eval_data_dir,
+                                  data_type=args_opt.dataset_type,
+                                  seq_length=task.seq_length,
+                                  task_type=task.task_type,
+                                  drop_remainder=False)
     print('eval dataset size:', eval_dataset.get_dataset_size())
     print('eval dataset batch size:', eval_dataset.get_batch_size())

@@ -313,7 +313,7 @@ class BertNetworkWithLoss(nn.Cell):

 class BertTrainWithLossScaleCell(nn.Cell):
     """
-    Especifically defined for finetuning where only four inputs tensor are needed.
+    Specifically defined for finetuning where only four input tensors are needed.
     """
     def __init__(self, network, optimizer, scale_update_cell=None):
         super(BertTrainWithLossScaleCell, self).__init__(auto_prefix=False)
@@ -333,6 +333,8 @@ class BertTrainWithLossScaleCell(nn.Cell):
         if self.reducer_flag:
             self.degree = get_group_size()
             self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
+        self.clip_type = gradient_cfg.clip_type
+        self.clip_value = gradient_cfg.clip_value
         self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
         self.cast = P.Cast()
         self.alloc_status = P.NPUAllocFloatStatus()
@@ -410,7 +412,7 @@ class BertTrainWithLossScaleCell(nn.Cell):
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
             grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
-        grads = self.hyper_map(F.partial(clip_grad, gradient_cfg.clip_type, gradient_cfg.clip_value), grads)
+        grads = self.hyper_map(F.partial(clip_grad, self.clip_type, self.clip_value), grads)
         restore = ()
         for i in range(self.length):
             restore = restore + (F.assign(weights[i], self.saved_params[i]),)
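This switch from `gradient_cfg.clip_type`/`gradient_cfg.clip_value` to the attributes added in `__init__` two hunks up is the bug fix named in the commit title: values used inside `construct` are captured on the cell at build time rather than read from a module-level config while the graph is compiled. A minimal sketch of the pattern; the clipping body is modeled on the common MindSpore recipe and is an assumption, not the repository's exact `clip_grad`:

```python
import mindspore.nn as nn
from mindspore import Tensor, ops
from mindspore import dtype as mstype

class GradClipCell(nn.Cell):
    """Sketch: capture clip settings at build time, apply them in construct."""
    def __init__(self, clip_type=1, clip_value=1.0):
        super(GradClipCell, self).__init__()
        # Cached once as attributes (the pattern this commit adopts) instead of
        # reading a module-level gradient_cfg inside construct().
        self.clip_type = clip_type
        self.clip_value = Tensor(clip_value, mstype.float32)
        self.clip_by_norm = nn.ClipByNorm()

    def construct(self, grad):
        if self.clip_type == 0:
            # clip each element into [-clip_value, clip_value]
            return ops.clip_by_value(grad, -self.clip_value, self.clip_value)
        # otherwise rescale grad so its L2 norm is at most clip_value
        return self.clip_by_norm(grad, self.clip_value)
```

Caching the values also means a later mutation of `gradient_cfg` can no longer silently diverge from what the compiled graph actually uses.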
@@ -437,7 +439,7 @@ class BertTrainWithLossScaleCell(nn.Cell):

 class BertTrainCell(nn.Cell):
     """
-    Especifically defined for finetuning where only four inputs tensor are needed.
+    Specifically defined for finetuning where only four input tensors are needed.
     """
     def __init__(self, network, optimizer, sens=1.0):
         super(BertTrainCell, self).__init__(auto_prefix=False)
@@ -448,6 +450,8 @@ class BertTrainCell(nn.Cell):
         self.sens = sens
         self.grad = C.GradOperation(get_by_list=True,
                                     sens_param=True)
+        self.clip_type = gradient_cfg.clip_type
+        self.clip_value = gradient_cfg.clip_value
         self.reducer_flag = False
         self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
@@ -514,7 +518,7 @@ class BertTrainCell(nn.Cell):
             F.control_depend(input_ids, grads)
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        grads = self.hyper_map(F.partial(clip_grad, gradient_cfg.clip_type, gradient_cfg.clip_value), grads)
+        grads = self.hyper_map(F.partial(clip_grad, self.clip_type, self.clip_value), grads)
         restore = ()
         for i in range(self.length):
             restore = restore + (F.assign(weights[i], self.saved_params[i]),)
@@ -27,8 +27,8 @@ class DataType(Enum):
     MINDRECORD = 2


-def create_tinybert_dataset(batch_size=32, device_num=1, rank=0, do_shuffle="true", data_dir=None,
-                            data_type='tfrecord', seq_length=128, task_type=mstype.int32, drop_remainder=True):
+def create_dataset(batch_size=32, device_num=1, rank=0, do_shuffle="true", data_dir=None,
+                   data_type='tfrecord', seq_length=128, task_type=mstype.int32, drop_remainder=True):
     """create tinybert dataset"""
     if isinstance(data_dir, list):
         data_files = data_dir
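After the rename, call sites (see the eval.py and train.py hunks) pass the same keyword arguments as before. A minimal usage sketch; the data path below is a placeholder and the way `task` is looked up from `task_cfg` is an assumption:

```python
from src.config import eval_cfg, task_cfg
from src.dataset import create_dataset

# Hypothetical call site mirroring eval.py after the rename; the path is a
# placeholder, and indexing task_cfg by task name is an assumption.
task = task_cfg['sts-b']
eval_dataset = create_dataset(batch_size=eval_cfg.batch_size,
                              device_num=1,
                              rank=0,
                              do_shuffle='false',
                              data_dir='/path/to/sts-b/eval.tf_record',
                              data_type='tfrecord',
                              seq_length=task.seq_length,
                              task_type=task.task_type,
                              drop_remainder=False)
print('eval batches:', eval_dataset.get_dataset_size())
```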
@@ -89,12 +89,7 @@ class LossCallBack(Callback):

 class StepCallBack(Callback):
     """
-    Monitor the loss in training.
-    If the loss in NAN or INF terminating training.
-    Note:
-        if per_print_times is 0 do not print loss.
-    Args:
-        per_print_times (int): Print loss every times. Default: 1.
+    Monitor the time in training.
     """
     def __init__(self):
         super(StepCallBack, self).__init__()
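The rewritten docstring now advertises time monitoring rather than loss monitoring. For reference, a minimal sketch of what a per-step timing callback typically looks like in MindSpore; the body here is an assumption and the repository's StepCallBack may record time differently:

```python
import time
from mindspore.train.callback import Callback

class TimingCallback(Callback):
    """Sketch of a per-step timer in the spirit of the revised StepCallBack."""
    def __init__(self):
        super(TimingCallback, self).__init__()
        self.step_start = 0.0

    def step_begin(self, run_context):
        # record wall-clock time when the step starts
        self.step_start = time.time()

    def step_end(self, run_context):
        # report elapsed time for the step that just finished
        cb_params = run_context.original_args()
        print('step {} time: {:.3f}s'.format(cb_params.cur_step_num,
                                             time.time() - self.step_start))
```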
@@ -21,7 +21,7 @@ from mindspore import context
 from mindspore.train.model import Model
 from mindspore.nn.optim import AdamWeightDecay
 from mindspore import set_seed
-from src.dataset import create_tinybert_dataset
+from src.dataset import create_dataset
 from src.utils import StepCallBack, ModelSaveCkpt, EvalCallBack, BertLearningRate
 from src.config import train_cfg, eval_cfg, teacher_net_cfg, student_net_cfg, task_cfg
 from src.cell_wrapper import BertNetworkWithLoss, BertTrainCell
@@ -86,26 +86,26 @@ def run_task_distill(args_opt):

     rank = 0
     device_num = 1
-    train_dataset = create_tinybert_dataset(batch_size=train_cfg.batch_size,
-                                            device_num=device_num,
-                                            rank=rank,
-                                            do_shuffle=args_opt.do_shuffle,
-                                            data_dir=train_data_dir,
-                                            data_type=args_opt.dataset_type,
-                                            seq_length=task.seq_length,
-                                            task_type=task.task_type,
-                                            drop_remainder=True)
+    train_dataset = create_dataset(batch_size=train_cfg.batch_size,
+                                   device_num=device_num,
+                                   rank=rank,
+                                   do_shuffle=args_opt.do_shuffle,
+                                   data_dir=train_data_dir,
+                                   data_type=args_opt.dataset_type,
+                                   seq_length=task.seq_length,
+                                   task_type=task.task_type,
+                                   drop_remainder=True)
     dataset_size = train_dataset.get_dataset_size()
     print('train dataset size:', dataset_size)
-    eval_dataset = create_tinybert_dataset(batch_size=eval_cfg.batch_size,
-                                           device_num=device_num,
-                                           rank=rank,
-                                           do_shuffle=args_opt.do_shuffle,
-                                           data_dir=eval_data_dir,
-                                           data_type=args_opt.dataset_type,
-                                           seq_length=task.seq_length,
-                                           task_type=task.task_type,
-                                           drop_remainder=False)
+    eval_dataset = create_dataset(batch_size=eval_cfg.batch_size,
+                                  device_num=device_num,
+                                  rank=rank,
+                                  do_shuffle=args_opt.do_shuffle,
+                                  data_dir=eval_data_dir,
+                                  data_type=args_opt.dataset_type,
+                                  seq_length=task.seq_length,
+                                  task_type=task.task_type,
+                                  drop_remainder=False)
     print('eval dataset size:', eval_dataset.get_dataset_size())

     if args_opt.enable_data_sink == 'true':
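The hunk ends at the data-sink branch, which the diff does not show in full. In MindSpore, `enable_data_sink` typically toggles `dataset_sink_mode` on `Model.train`, with `data_sink_steps` feeding `sink_size`; a hedged sketch of that wiring, not the repository's actual continuation:

```python
# Hypothetical continuation of run_task_distill: how enable_data_sink and
# data_sink_steps usually map onto Model.train. The real wiring may differ.
def start_training(model, train_dataset, dataset_size, args_opt, callbacks):
    if args_opt.enable_data_sink == 'true':
        # Sink mode: each model.train "epoch" consumes data_sink_steps batches.
        repeat = args_opt.epoch_size * dataset_size // args_opt.data_sink_steps
        model.train(repeat, train_dataset, callbacks=callbacks,
                    dataset_sink_mode=True, sink_size=args_opt.data_sink_steps)
    else:
        model.train(args_opt.epoch_size, train_dataset, callbacks=callbacks,
                    dataset_sink_mode=False)
```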