diff --git a/model_zoo/official/nlp/q8bert/README.md b/model_zoo/official/nlp/q8bert/README.md new file mode 100644 index 00000000000..0b58ef500bf --- /dev/null +++ b/model_zoo/official/nlp/q8bert/README.md @@ -0,0 +1,237 @@ + +# Contents + +- [Contents](#contents) +- [Q8BERT Description](#q8bert-description) +- [Model Architecture](#model-architecture) +- [Dataset](#dataset) +- [Environment Requirements](#environment-requirements) +- [Quick Start](#quick-start) +- [Script Description](#script-description) + - [Script and Sample Code](#script-and-sample-code) + - [Parameters](#parameters) + - [Training Process](#training-process) + - [Training](#training) + - [Model Description](#model-description) + - [Performance](#performance) + - [training Performance](#training-performance) +- [Description of Random Situation](#description-of-random-situation) +- [ModelZoo Homepage](#modelzoo-homepage) + +# [Q8BERT Description](#contents) + +[Q8BERT](https://arxiv.org/abs/1910.06188) applies quantization-aware training during the fine-tuning phase of [BERT](https://arxiv.org/abs/1810.04805) +in order to compress BERT by 4× with minimal accuracy loss. Furthermore, the +produced quantized model can accelerate inference if it is optimized for hardware that supports 8-bit integer arithmetic. + +[Paper](https://arxiv.org/abs/1910.06188): Ofir Zafrir, Guy Boudoukh, Peter Izsak and Moshe Wasserblat. [Q8BERT: Quantized 8Bit BERT](https://arxiv.org/abs/1910.06188). arXiv preprint arXiv:1910.06188. + +# [Model Architecture](#contents) + +The backbone of Q8BERT is a transformer that contains 12 encoder modules; each encoder contains one self-attention module, and each self-attention module contains one attention module. + +# [Dataset](#contents) + +- Download the GLUE dataset for task distillation. Convert the dataset files from json format to tfrecord format; please refer to run_classifier.py in the [BERT](https://github.com/google-research/bert) repository. 
+ +# [Environment Requirements](#contents) + +- Hardware(GPU) + - Prepare hardware environment with GPU processor. +- Framework + - [MindSpore](https://gitee.com/mindspore/mindspore) +- For more information, please check the resources below: + - [MindSpore Tutorials](https://www.mindspore.cn/tutorial/training/en/master/index.html) + - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/en/master/index.html) +- Software: + - numpy + +# [Quick Start](#contents) + +After installing MindSpore via the official website, you can start training and evaluation as follows: + +```bash + +# run training example +run_train.sh + +Before running the shell script, please set the `task_name`, `teacher_model_dir`, `student_model_dir` and `data_dir` in the run_train.sh file first. + +``` + +# [Script Description](#contents) + +## [Script and Sample Code](#contents) + +```text +└─q8bert + ├─README.md + ├─scripts + ├─run_train.sh # shell script for training phase + ├─src + ├─__init__.py + ├─dataset.py # data processing + ├─bert_model.py # backbone code of bert + ├─q8bert_model.py # quantization for Bert + ├─q8bert.py # backbone code of q8bert + ├─utils.py # some utils function of q8bert + ├─__init__.py + ├─run_train.py # train net for task distillation + +## [Script Parameters](#contents) + +### Train + +```text + +usage: run_train.py [--h] [--device_target {GPU,Ascend}][--epoch_num EPOCH_NUM] [--task_name {SST-2,QNLI,MNLI,COLA,QQP,"STS-B,RTE}][--do_shuffle {true,false}] [--enable_data_sink {true,false}][--do_eval {true,false}][--device_id DEVICE_ID] [--save_ckpt_step SAVE_CKPT_STEP] [--eval_ckpt_step EVAL_CKPT_STEP] [--max_ckpt_num MAX_CKPT_NUM] [--load_ckpt_path LOAD_CKPT_PATH] [--train_data_dir TRAIN_DATA_DIR] [--eval_data_dir EVAL_DATA_DIR] [--device_id DEVICE_ID] [--logging_step LOGGIND_STEP] [--do_quant {true,false}] + +options: + --device_target Device where the code will be implemented: "GPU" | "Ascend", default is "GPU" + --do_eval Do eval task during training or 
not: "true" | "false", default is "true" + --epoch_num Epoch num for train phase: N, default is 3 + --device_id Device id: N, default is 0 + --do_shuffle Enable shuffle for train dataset: "true" | "false", default is "true" + --enable_data_sink Enable data sink: "true" | "false", default is "true" + --save_ckpt_step If do_eval is false, the checkpoint will be saved every save_ckpt_step: N, default is 100 + --eval_ckpt_step If do_eval is true, the evaluation will be run every eval_ckpt_step: N, default is 50 + --max_ckpt_num The number of checkpoints will not be larger than max_ckpt_num: N, default is 1 + --data_sink_steps Sink steps for each epoch: N, default is 1 + --load_ckpt_path The checkpoint file path of the model: PATH, default is "" + --train_data_dir Train Data directory: PATH, default is "" + --eval_data_dir Eval Data directory: PATH, default is "" + --task_name The name of the task to train: "SST-2"| "QNLI"| "MNLI"|"COLA"|"QQP"|"STS-B"|"RTE" + --dataset_type The type of the dataset: "tfrecord" | "mindrecord", default is "tfrecord" + --train_batch_size Batch size for training: N, default is 16 + --eval_batch_size Eval Batch size in callback: N, default is 32 + +``` + +## Parameters + +`config.py` contains parameters of glue tasks, train, optimizer, eval, teacher BERT model and student BERT model. + +```text + +Parameters for glue task: + num_labels the number of labels: N. 
+ seq_length length of input sequence: N + task_type the type of task: "classification" | "regression" + metrics the eval metric for task: Accuracy | F1 | Pearsonr | Matthews + +Parameters for train: + batch_size batch size of input dataset: N, default is 16 + loss_scale_value initial value of loss scale: N, default is 2^16 + scale_factor factor used to update loss scale: N, default is 2 + scale_window number of steps between loss scale updates: N, default is 50 + +Parameters for optimizer: + learning_rate value of learning rate: Q, default is 5e-5 + end_learning_rate value of end learning rate: Q, must be positive, default is 1e-14 + power power: Q, default is 1.0 + weight_decay weight decay: Q, default is 1e-4 + eps term added to the denominator to improve numerical stability: Q, default is 1e-6 + warmup_ratio the ratio of warmup steps to total steps: Q, default is 0.1 + +Parameters for eval: + batch_size batch size of input dataset: N, default is 32 + +Parameters for teacher bert network: + seq_length length of input sequence: N, default is 128 + vocab_size size of the vocabulary: N, must be consistent with the dataset you use. 
Default is 30522 + hidden_size size of bert encoder layers: N + num_hidden_layers number of hidden layers: N + num_attention_heads number of attention heads: N, default is 12 + intermediate_size size of intermediate layer: N + hidden_act activation function used: ACTIVATION, default is "gelu" + hidden_dropout_prob dropout probability for BertOutput: Q + attention_probs_dropout_prob dropout probability for BertAttention: Q + max_position_embeddings maximum length of sequences: N, default is 512 + save_ckpt_step number for saving checkponit: N, default is 100 + max_ckpt_num maximum number for saving checkpoint: N, default is 1 + type_vocab_size size of token type vocab: N, default is 2 + initializer_range initialization value of TruncatedNormal: Q, default is 0.02 + use_relative_positions use relative positions or not: True | False, default is False + dtype data type of input: mstype.float16 | mstype.float32, default is mstype.float32 + compute_type compute type in BertTransformer: mstype.float16 | mstype.float32, default is mstype.float32 + +Parameters for student bert network: + seq_length length of input sequence: N, default is 128 + vocab_size size of each embedding vector: N, must be consistent with the dataset you use. 
Default is 30522 + hidden_size size of bert encoder layers: N + num_hidden_layers number of hidden layers: N + num_attention_heads number of attention heads: N, default is 12 + intermediate_size size of intermediate layer: N + hidden_act activation function used: ACTIVATION, default is "gelu" + hidden_dropout_prob dropout probability for BertOutput: Q + attention_probs_dropout_prob dropout probability for BertAttention: Q + max_position_embeddings maximum length of sequences: N, default is 512 + save_ckpt_step number for saving checkponit: N, default is 100 + max_ckpt_num maximum number for saving checkpoint: N, default is 1 + type_vocab_size size of token type vocab: N, default is 2 + initializer_range initialization value of TruncatedNormal: Q, default is 0.02 + use_relative_positions use relative positions or not: True | False, default is False + dtype data type of input: mstype.float16 | mstype.float32, default is mstype.float32 + compute_type compute type in BertTransformer: mstype.float16 | mstype.float32, default is mstype.float32 + do_quant do activation quantilization or not: True | False, default is True + embedding_bits the quant bits of embedding: N, default is 2 + weight_bits the quant bits of weight: N, default is 2 + cls_dropout_prob dropout probability for BertModelCLS: Q + activation_init initialization value of activation quantilization: Q, default is 2.5 + is_lgt_fit use label ground truth loss or not: True | False, default is False + +``` + +## [Training Process](#contents) + +### Training + +Before running the command below, please check `data_dir` and 'load_ckpt_path' has been set. Please set the path to be the absolute full path, e.g:"/home/xxx/model_dir/". 
+ +```text + +python + python ./run_train.py --device_target="GPU" --do_eval="true" --epoch_num=3 --task_name="STS-B" --do_shuffle="true" --enable_data_sink="true" --data_sink_steps=100 --save_ckpt_step=100 --max_ckpt_num=1 --load_ckpt_path="sts-b.ckpt" --train_data_dir="sts-b/train.tf_record" --eval_data_dir="sts-b/eval.tf_record" --device_id=0 --logging_step=100 --do_quant="true" +shell + sh run_train.sh + +``` + +The shell command above runs in the background; you can view the results in the file log.txt. The python command runs in the console, where you can view the results directly. After training, you will find checkpoint files under the script folder by default. The eval metric values are reported as follows: + +```text + +epoch: 1, step: 100, loss are (Tensor(shape=[], dtype=Float32, value= 0.526506), Tensor(shape=[], dtype=Bool, value= False)) The current result is {'pearson': 0.8407084843799768, 'spearmanr': 0.8405771469597393, 'corr': 0.840642815669858} epoch time: 66421.602 ms, per step time: 664.216 ms +epoch: 2, step: 200, loss are (Tensor(shape=[], dtype=Float32, value= 0.406012), Tensor(shape=[], dtype=Bool, value= False)) The current result is {'pearson': 0.826509808575773, 'spearmanr': 0.8274141859302444, 'corr': 0.8269619972530087} epoch time: 47488.633 ms, per step time: 474.886 ms +... +best pearson:0.8753269455187238 + +``` + +## [Model Description](#contents) + +## [Performance](#contents) + +### training Performance + +| Parameters | GPU | +| ----------------- | :---------------------------------------------------- | +| Model Version | Q8BERT | +| Resource | NV GeForce GTX1080ti | +| uploaded Date | 03/01/2020 | +| MindSpore Version | 1.1.0 | +| Dataset | STS-B | +| batch_size | 16 | +| Metric value | 87.5833 | +| Speed | 0.47s/step | +| Total time | 9.1min(3epoch, 1p) | + +# [Description of Random Situation](#contents) + +In run_train.py, we set do_shuffle to shuffle the dataset. 
+ +In config.py, we set hidden_dropout_prob, attention_probs_dropout_prob and cls_dropout_prob to randomly drop out some network nodes. + +# [ModelZoo Homepage](#contents) + +Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo). diff --git a/model_zoo/official/nlp/q8bert/README_CN.md b/model_zoo/official/nlp/q8bert/README_CN.md new file mode 100644 index 00000000000..9097a21784a --- /dev/null +++ b/model_zoo/official/nlp/q8bert/README_CN.md @@ -0,0 +1,260 @@ + +# 目录 + + + +- [目录](#目录) +- [Q8BERT概述](#q8bert概述) +- [模型架构](#模型架构) +- [数据集](#数据集) +- [环境要求](#环境要求) +- [快速入门](#快速入门) +- [脚本说明](#脚本说明) + - [脚本和样例代码](#脚本和样例代码) + - [脚本参数](#脚本参数) + - [一般蒸馏](#一般蒸馏) + - [任务蒸馏](#任务蒸馏) + - [选项及参数](#选项及参数) + - [选项](#选项) + - [参数](#参数) + - [训练流程](#训练流程) + - [用法](#用法) + - [Ascend处理器上运行](#ascend处理器上运行) + - [在GPU处理器上运行](#在gpu处理器上运行) + - [分布式训练](#分布式训练) + - [Ascend处理器上运行](#ascend处理器上运行-1) + - [GPU处理器上运行](#gpu处理器上运行) + - [评估过程](#评估过程) + - [用法](#用法-1) + - [基于SST-2数据集进行评估](#基于sst-2数据集进行评估) + - [基于MNLI数据集进行评估](#基于mnli数据集进行评估) + - [基于QNLI数据集进行评估](#基于qnli数据集进行评估) + - [模型描述](#模型描述) + - [性能](#性能) + - [评估性能](#评估性能) + - [推理性能](#推理性能) +- [随机情况说明](#随机情况说明) +- [ModelZoo主页](#modelzoo主页) + + + +# Q8BERT概述 + +[Q8BERT](https://arxiv.org/abs/1910.06188)是一种在finetune阶段对BERT进行量化感知训练的方法,训练出来的模型在精度损失很小的情况下,模型大小可压缩4倍,而且使用这种算法训练出来的模型在含有8bit算子的硬件上,推理速度也可以相应提高。 + +[论文](https://arxiv.org/abs/1910.06188): Ofir Zafrir, Guy Boudoukh, Peter Izsak and Moshe Wasserblat. [Q8BERT: Quantized 8Bit BERT](https://arxiv.org/abs/1910.06188). arXiv preprint arXiv:1910.06188. 
+ +# 模型架构 + +Q8BERT模型的主干结构是transformer,一个转换器包含12个编码器模块。 + +# 数据集 + +- 下载GLUE数据集进行任务蒸馏。将数据集由JSON格式转化为TFRecord格式。详见[BERT](https://github.com/google-research/bert)代码库中的run_classifier.py文件。 + +# 环境要求 + +- 硬件(Ascend或GPU) + - 使用Ascend或GPU处理器准备硬件环境。如需试用昇腾处理器,请发送[申请表](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx)到ascend@huawei.com。申请通过后,即可获得资源。 +- 框架 + - [MindSpore](https://gitee.com/mindspore/mindspore) +- 更多关于Mindspore的信息,请查看以下资源: + - [MindSpore教程](https://www.mindspore.cn/tutorial/training/zh-CN/master/index.html) + - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/zh-CN/master/index.html) + +# 快速入门 + +从官网下载安装MindSpore之后,可以开始使用如下脚本训练和推理: + +```bash +# 运行训练脚本 +run_train.sh + +Before running the shell script, please set the `task_name`, `teacher_model_dir`, `student_model_dir` and `data_dir` in the run_train.sh file first. + +``` + +若在Ascend设备上运行分布式训练,请提前创建JSON格式的HCCL配置文件。 +详情参见如下链接: +https:gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools. + +如需设置数据集格式和参数,请创建JSON格式的视图配置文件,详见[TFRecord](https://www.mindspore.cn/doc/programming_guide/zh-CN/master/dataset_loading.html#tfrecord) 格式。 + +```text +For general task, schema file contains ["input_ids", "input_mask", "segment_ids"]. + +For task distill and eval phase, schema file contains ["input_ids", "input_mask", "segment_ids", "label_ids"]. + +`numRows` is the only option which could be set by user, the others value must be set according to the dataset. + +For example, the dataset is cn-wiki-128, the schema file for general distill phase as following: +{ + "datasetType": "TF", + "numRows": 7680, + "columns": { + "input_ids": { + "type": "int64", + "rank": 1, + "shape": [256] + }, + "input_mask": { + "type": "int64", + "rank": 1, + "shape": [256] + }, + "segment_ids": { + "type": "int64", + "rank": 1, + "shape": [256] + } + } +} +``` + +# 脚本说明 + +## 脚本和样例代码 + +```shell +. 
+└─q8bert + ├─README.md + ├─scripts + ├─run_train.sh # 运行shell脚本 + ├─src + ├─__init__.py + ├─dataset.py # 数据处理 + ├─bert_model.py # bert模型主体结构 + ├─q8bert_model.py # bert模型量化感知算法 + ├─q8bert.py # q8bert主体结构 + ├─utils.py # utils函数 + ├─__init__.py + ├─run_train.py # 运行main函数 + +``` + +## 脚本和脚本参数 + +```text + +用法: run_train.py [--h] [--device_target {GPU,Ascend}][--epoch_num EPOCH_NUM] [--task_name {SST-2,QNLI,MNLI,COLA,QQP,"STS-B,RTE}][--do_shuffle {true,false}] [--enable_data_sink {true,false}][--do_eval {true,false}][--device_id DEVICE_ID] [--save_ckpt_step SAVE_CKPT_STEP] [--eval_ckpt_step EVAL_CKPT_STEP] [--max_ckpt_num MAX_CKPT_NUM] [--load_ckpt_path LOAD_CKPT_PATH] [--train_data_dir TRAIN_DATA_DIR] [--eval_data_dir EVAL_DATA_DIR] [--device_id DEVICE_ID] [--logging_step LOGGIND_STEP] [--do_quant {true,false}] + +选项: + --device_target 代码实现设备,可选项为Ascend或CPU。默认为GPU + --do_eval 是否在训练的过程中加上推理默认为是 + --epoch_num Epoch数,默认为3 + --device_id 设备ID,默认为0 + --do_shuffle 是否使能轮换,可选项为true或false,默认为true + --enable_data_sink 是否使能数据下沉,可选项为true或false,默认为true + --save_ckpt_step 保存检查点文件的步数,默认为1000 + --eval_ckpt_step 如过do_eval为是, 在训练过程中执行推理的步数 + --max_ckpt_num 保存检查点文件的最大数,默认为1 + --data_sink_steps 设置数据下沉步数,默认为1 + --load_ckpt_path 加载检查点文件的路径,默认为"" + --train_data_dir 训练集路径, 默认为 "" + --eval_data_dir 验证集路径, 默认为 "" + --task_name Glue数据集任务: "SST-2"| "QNLI"| "MNLI"|"COLA"|"QQP"|"STS-B"|"RTE" + --dataset_type 数据集类型,可选项为tfrecord或mindrecord,默认为tfrecord + --train_batch_size 训练batchsize,默认16 + --eval_batch_size 推理batchsize,默认32 + +``` + +## 选项及参数 + +`config.py` 包含BERT模型参数与优化器和损失缩放选项。 + +### 选项 + +```text + +batch_size 输入数据集的批次大小,默认为16 +Parameters for lossscale: + loss_scale_value 损失放大初始值,默认为 + scale_factor 损失放大的更新因子,默认为2 + scale_window 损失放大的一次更新步数,默认为50 + +Parameters for optimizer: + learning_rate 学习率 + end_learning_rate 结束学习率,取值需为正数 + power 幂 + weight_decay 权重衰减 + eps 增加分母,提高小数稳定性 + +``` + +### 参数 + +```text + +Parameters for bert network: + seq_length 输入序列的长度,默认为128 + vocab_size 
各内嵌向量大小,需与所采用的数据集相同。默认为30522 + hidden_size BERT的encoder层数 + num_hidden_layers 隐藏层数 + num_attention_heads 注意头的数量,默认为12 + intermediate_size 中间层数 + hidden_act 所采用的激活函数,默认为gelu + hidden_dropout_prob BERT输出的随机失活可能性 + attention_probs_dropout_prob BERT注意的随机失活可能性 + max_position_embeddings 序列最大长度,默认为512 + save_ckpt_step 保存检查点数量,默认为100 + max_ckpt_num 保存检查点最大数量,默认为1 + type_vocab_size 标记类型的词汇表大小,默认为2 + initializer_range TruncatedNormal的初始值,默认为0.02 + use_relative_positions 是否采用相对位置,可选项为true或false,默认为False + dtype 输入的数据类型,可选项为mstype.float16或mstype.float32,默认为mstype.float32 + compute_type Bert Transformer的计算类型,可选项为mstype.float16或mstype.float32,默认为mstype.float16 + +``` + +## 训练流程 + +### 用法 + +#### Ascend处理器上运行 + +运行以下命令前,确保已设置'data_dir'和'load_ckpt_path'。请将路径设置为绝对全路径,例如/username/checkpoint_100_300.ckpt。 + +```text + +python + python ./run_train.py --device_target="GPU" --do_eval="true" --epoch_num=3 --task_name="STS-B" --do_shuffle="true" --enable_data_sink="true" --data_sink_steps=100 --save_ckpt_step=100 --max_ckpt_num=1 --load_ckpt_path="sts-b.ckpt" --train_data_dir="sts-b/train.tf_record" --eval_data_dir="sts-b/eval.tf_record" --device_id=0 --logging_step=100 --do_quant="true" +shell + sh run_train.sh + +以上命令后台运行,您可以在log.txt文件中查看运行结果。训练结束后,您可以在默认脚本文件夹中找到检查点文件。得到如下损失值: +epoch: 1, step: 100, loss are (Tensor(shape=[], dtype=Float32, value= 0.526506), Tensor(shape=[], dtype=Bool, value= False)) The current result is {'pearson': 0.8407084843799768, 'spearmanr': 0.8405771469597393, 'corr': 0.840642815669858} epoch time: 66421.602 ms, per step time: 664.216 ms +epoch: 2, step: 200, loss are (Tensor(shape=[], dtype=Float32, value= 0.406012), Tensor(shape=[], dtype=Bool, value= False)) The current result is {'pearson': 0.826509808575773, 'spearmanr': 0.8274141859302444, 'corr': 0.8269619972530087} epoch time: 47488.633 ms, per step time: 474.886 ms +... 
+best pearson:0.8753269455187238 + +``` + +## 模型描述 + +## 性能 + +### 评估性能 + +| Parameters | GPU | +| ----------------- | :---------------------------------------------------- | +| 模型 | Q8BERT | +| 资源 | NV GeForce GTX1080ti | +| 测试时间 | 03/01/2020 | +| MindSpore版本 | 1.1.0 | +| 数据集 | STS-B | +| batch size | 16 | +| 结果 | 87.5833 | +| 速度 | 0.47s/step | +| 总时间 | 9.1min(3epoch, 1p) | + +# 随机情况说明 + +run_train.py脚本中设置了do_shuffle来轮换数据集。 + +config.py文件中设置了hidden_dropout_prob和attention_pros_dropout_prob,使网点随机失活。 + +# ModelZoo主页 + +请浏览官网[主页](https://gitee.com/mindspore/mindspore/tree/master/model_zoo)。 diff --git a/model_zoo/official/nlp/q8bert/__init__.py b/model_zoo/official/nlp/q8bert/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/model_zoo/official/nlp/q8bert/run_train.py b/model_zoo/official/nlp/q8bert/run_train.py new file mode 100644 index 00000000000..fad997974ab --- /dev/null +++ b/model_zoo/official/nlp/q8bert/run_train.py @@ -0,0 +1,208 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def parse_args(argv=None):
    """
    Parse the command-line options of the Q8BERT training script.

    Boolean-like switches (``--do_eval``, ``--do_shuffle``, ``--enable_data_sink``,
    ``--do_quant``) are kept as the strings "true"/"false"; callers compare
    against those strings rather than relying on truthiness.

    Args:
        argv (list[str], optional): Argument strings to parse. When None (the
            default) argparse reads ``sys.argv[1:]``, which is the behaviour of
            the original zero-argument call.

    Returns:
        argparse.Namespace: The parsed options.
    """
    parser = argparse.ArgumentParser(description='Q8Bert task distill')
    parser.add_argument("--device_target", type=str, default="Ascend", choices=['Ascend', 'GPU'],
                        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--do_eval", type=str, default="true", choices=["true", "false"],
                        help="Do eval task, default is true.")
    parser.add_argument("--epoch_num", type=int, default=3, help="Epoch number, default is 3.")
    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
    parser.add_argument("--do_shuffle", type=str, default="true", choices=["true", "false"],
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true", choices=["true", "false"],
                        help="Enable data sink, default is true.")
    parser.add_argument("--save_ckpt_step", type=int, default=100,
                        help="Save a checkpoint every save_ckpt_step steps, default is 100.")
    # The original help text here was copy-pasted from --enable_data_sink.
    parser.add_argument("--max_ckpt_num", type=int, default=1,
                        help="Maximum number of checkpoint files to keep, default is 1.")
    parser.add_argument("--data_sink_steps", type=int, default=1, help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--load_ckpt_path", type=str, default="", help="Load checkpoint file path")
    parser.add_argument("--train_data_dir", type=str, default="",
                        help="Train data path, it is better to use absolute path")
    parser.add_argument("--eval_data_dir", type=str, default="",
                        help="Eval data path, it is better to use absolute path")
    parser.add_argument("--do_quant", type=str, default="false", help="Do quant for model")
    parser.add_argument("--logging_step", type=int, default=100,
                        help="Do evaluation every logging_step steps, default is 100.")
    parser.add_argument("--task_name", type=str, default="COLA",
                        choices=["SST-2", "QNLI", "MNLI", "COLA", "QQP", "STS-B", "RTE"],
                        help="The name of the task to train.")
    parser.add_argument("--dataset_type", type=str, default="tfrecord",
                        help="dataset type tfrecord/mindrecord, default is tfrecord")
    return parser.parse_args(argv)
class Task:
    """
    Resolve per-task hyper-parameters from the module-level ``task_params`` table.

    Args:
        task_name (str): Task name used as the key into ``task_params``.
    """
    def __init__(self, task_name):
        self.task_name = task_name

    @property
    def num_labels(self):
        """Number of labels for this task, or DEFAULT_NUM_LABELS when not configured."""
        overrides = task_params.get(self.task_name, {})
        return overrides.get("num_labels", DEFAULT_NUM_LABELS)

    @property
    def seq_length(self):
        """Sequence length for this task, or DEFAULT_SEQ_LENGTH when not configured."""
        overrides = task_params.get(self.task_name, {})
        return overrides.get("seq_length", DEFAULT_SEQ_LENGTH)
def do_train():
    """
    Fine-tune BERT on the GLUE task selected on the command line.

    Reads all settings from the module-level ``args_opt``, ``task``,
    ``train_cfg``, ``eval_cfg`` and ``model_cfg`` objects, builds the network,
    datasets, optimizer and callbacks, then runs ``Model.train``.

    Raises:
        ValueError: If ``--load_ckpt_path`` is empty or the device target is
            neither "Ascend" nor "GPU".
    """
    ckpt_file = args_opt.load_ckpt_path
    if ckpt_file == '':
        raise ValueError("Student ckpt file should not be empty, please set --load_ckpt_path")
    cfg = train_cfg

    if args_opt.device_target not in ("Ascend", "GPU"):
        raise ValueError("Target error, GPU or Ascend is supported.")
    # Honour --device_id on both targets; the GPU branch used to drop it silently.
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
                        device_id=args_opt.device_id)

    # NOTE(review): do_quant is forwarded as the raw "true"/"false" string; confirm that
    # BertNetworkWithLoss_td compares against the string rather than testing truthiness.
    netwithloss = BertNetworkWithLoss_td(student_config=model_cfg, student_ckpt=ckpt_file,
                                         do_quant=args_opt.do_quant, is_training=True,
                                         task_type=glue_output_modes[args_opt.task_name.lower()],
                                         num_labels=task.num_labels, is_predistill=False)
    rank = 0
    device_num = 1
    # NOTE(review): task_type is hard-coded to 'classification' for the datasets even for
    # the regression task STS-B; verify against create_tinybert_dataset.
    train_dataset = create_tinybert_dataset(cfg.batch_size,
                                            device_num, rank, args_opt.do_shuffle,
                                            args_opt.train_data_dir, None,
                                            # Previously only the eval dataset honoured --dataset_type.
                                            data_type=args_opt.dataset_type,
                                            seq_length=task.seq_length,
                                            task_type='classification',
                                            drop_remainder=True)

    dataset_size = train_dataset.get_dataset_size()
    print('td2 train dataset size: ', dataset_size)
    print('td2 train dataset repeatcount: ', train_dataset.get_repeat_count())
    sink_mode = args_opt.enable_data_sink == 'true'
    if sink_mode:
        # In sink mode one "epoch" seen by Model.train is data_sink_steps steps long.
        repeat_count = args_opt.epoch_num * dataset_size // args_opt.data_sink_steps
        time_monitor_steps = args_opt.data_sink_steps
    else:
        repeat_count = args_opt.epoch_num
        time_monitor_steps = dataset_size

    adam_cfg = cfg.optimizer_cfg.AdamWeightDecay
    total_steps = dataset_size * args_opt.epoch_num
    lr_schedule = BertLearningRate(learning_rate=adam_cfg.learning_rate,
                                   end_learning_rate=adam_cfg.end_learning_rate,
                                   warmup_steps=int(total_steps / 10),
                                   decay_steps=int(total_steps),
                                   power=adam_cfg.power)
    params = netwithloss.trainable_params()
    # Weight decay is applied only to the parameters selected by decay_filter.
    decay_params = list(filter(adam_cfg.decay_filter, params))
    other_params = list(filter(lambda x: not adam_cfg.decay_filter(x), params))
    group_params = [{'params': decay_params, 'weight_decay': adam_cfg.weight_decay},
                    {'params': other_params, 'weight_decay': 0.0},
                    {'order_params': params}]

    optimizer = AdamWeightDecay(group_params, learning_rate=lr_schedule, eps=adam_cfg.eps)

    if args_opt.do_eval.lower() == "true":
        # The eval dataset is only needed (and only built) when evaluation is requested,
        # so a checkpoint-only run no longer requires --eval_data_dir.
        eval_dataset = create_tinybert_dataset(eval_cfg.batch_size,
                                               device_num, rank, args_opt.do_shuffle,
                                               args_opt.eval_data_dir, None,
                                               data_type=args_opt.dataset_type,
                                               seq_length=task.seq_length,
                                               task_type='classification',
                                               drop_remainder=False)
        print('td2 eval dataset size: ', eval_dataset.get_dataset_size())
        callback = [TimeMonitor(time_monitor_steps), LossCallBack(),
                    EvalCallBack(netwithloss.bert, eval_dataset, args_opt.task_name, args_opt.logging_step)]
    else:
        callback = [TimeMonitor(time_monitor_steps), LossCallBack(),
                    ModelSaveCkpt(netwithloss.bert,
                                  args_opt.save_ckpt_step,
                                  args_opt.max_ckpt_num,
                                  save_ckpt_dir)]

    # enable_loss_scale is assigned only in the __main__ guard; fall back to True (the value
    # set there) so that do_train() also works when this module is imported.
    if globals().get("enable_loss_scale", True):
        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                                 scale_factor=cfg.scale_factor,
                                                 scale_window=cfg.scale_window)

        netwithgrads = BertEvaluationWithLossScaleCell(netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    else:
        netwithgrads = BertEvaluationCell(netwithloss, optimizer=optimizer)
    model = Model(netwithgrads)
    model.train(repeat_count, train_dataset, callbacks=callback,
                dataset_sink_mode=sink_mode,
                sink_size=args_opt.data_sink_steps)
# Launch Q8BERT training in the background; stdout/stderr go to log.txt and
# MindSpore glog output goes to ./ms_log.
# Fill in --task_name, --load_ckpt_path, --train_data_dir and --eval_data_dir before running.

mkdir -p ms_log
PROJECT_DIR=$(cd "$(dirname "$0")"; pwd)
CUR_DIR=$(pwd)
export GLOG_log_dir=${CUR_DIR}/ms_log
export GLOG_logtostderr=0

# --device_id is passed once only: the former trailing --device_id="" overrode
# --device_id=0 and made argparse reject the empty string as an int.
python "${PROJECT_DIR}/../run_train.py" \
    --device_target="Ascend" \
    --device_id=0 \
    --do_eval="true" \
    --epoch_num=3 \
    --task_name="" \
    --do_shuffle="true" \
    --enable_data_sink="true" \
    --data_sink_steps=100 \
    --save_ckpt_step=100 \
    --max_ckpt_num=1 \
    --load_ckpt_path="" \
    --train_data_dir="" \
    --eval_data_dir="" \
    --logging_step=100 \
    --do_quant="true" > log.txt 2>&1 &
class BertConfig:
    """
    Plain container for the hyper-parameters of `BertModel`.

    Every constructor argument is stored unchanged as an attribute of the same name.

    Args:
        seq_length (int): Length of the input sequence. Default: 128.
        vocab_size (int): Vocabulary size. Default: 32000.
        hidden_size (int): Size of the bert encoder layers. Default: 768.
        num_hidden_layers (int): Number of hidden layers in the BertTransformer
            encoder cell. Default: 12.
        num_attention_heads (int): Number of attention heads in the BertTransformer
            encoder cell. Default: 12.
        intermediate_size (int): Size of the intermediate layer in the BertTransformer
            encoder cell. Default: 3072.
        hidden_act (str): Activation function used in the BertTransformer encoder
            cell. Default: "gelu".
        hidden_dropout_prob (float): Dropout probability for BertOutput. Default: 0.1.
        attention_probs_dropout_prob (float): Dropout probability for
            BertAttention. Default: 0.1.
        max_position_embeddings (int): Maximum sequence length used in this
            model. Default: 512.
        type_vocab_size (int): Size of the token type vocab. Default: 16.
        initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
        use_relative_positions (bool): Whether to use relative positions. Default: False.
        dtype (:class:`mindspore.dtype`): Data type of the input. Default: mstype.float32.
        compute_type (:class:`mindspore.dtype`): Compute type in BertTransformer. Default: mstype.float32.
    """

    def __init__(self,
                 seq_length=128,
                 vocab_size=32000,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=16,
                 initializer_range=0.02,
                 use_relative_positions=False,
                 dtype=mstype.float32,
                 compute_type=mstype.float32):
        # Input / embedding geometry.
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.type_vocab_size = type_vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.use_relative_positions = use_relative_positions

        # Encoder stack.
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act

        # Regularisation and initialisation.
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range

        # Numeric types.
        self.dtype = dtype
        self.compute_type = compute_type
+ """ + + def __init__(self, + vocab_size, + embedding_size, + embedding_shape, + use_one_hot_embeddings=False, + initializer_range=0.02): + super(EmbeddingLookup, self).__init__() + self.vocab_size = vocab_size + self.use_one_hot_embeddings = use_one_hot_embeddings + self.embedding_table = Parameter(initializer + (TruncatedNormal(initializer_range), + [vocab_size, embedding_size])) + self.expand = P.ExpandDims() + self.shape_flat = (-1,) + self.gather = P.GatherV2() + self.one_hot = P.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + self.off_value = Tensor(0.0, mstype.float32) + self.array_mul = P.MatMul() + self.reshape = P.Reshape() + self.shape = tuple(embedding_shape) + + def construct(self, input_ids): + """embedding lookup""" + extended_ids = self.expand(input_ids, -1) + flat_ids = self.reshape(extended_ids, self.shape_flat) + if self.use_one_hot_embeddings: + one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value) + output_for_reshape = self.array_mul( + one_hot_ids, self.embedding_table) + else: + output_for_reshape = self.gather(self.embedding_table, flat_ids, 0) + output = self.reshape(output_for_reshape, self.shape) + return output, self.embedding_table + + +class EmbeddingPostprocessor(nn.Cell): + """ + Postprocessors apply positional and token type embeddings to word embeddings. + + Args: + embedding_size (int): The size of each embedding vector. + embedding_shape (list): [batch_size, seq_length, embedding_size], the shape of + each embedding vector. + use_token_type (bool): Specifies whether to use token type embeddings. Default: False. + token_type_vocab_size (int): Size of token type vocab. Default: 16. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + max_position_embeddings (int): Maximum length of sequences used in this + model. Default: 512. 
+ dropout_prob (float): The dropout probability. Default: 0.1. + """ + + def __init__(self, + use_relative_positions, + embedding_size, + embedding_shape, + use_token_type=False, + token_type_vocab_size=16, + use_one_hot_embeddings=False, + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1): + super(EmbeddingPostprocessor, self).__init__() + self.use_token_type = use_token_type + self.token_type_vocab_size = token_type_vocab_size + self.use_one_hot_embeddings = use_one_hot_embeddings + self.max_position_embeddings = max_position_embeddings + self.embedding_table = Parameter(initializer + (TruncatedNormal(initializer_range), + [token_type_vocab_size, + embedding_size])) + self.shape_flat = (-1,) + self.one_hot = P.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + self.off_value = Tensor(0.1, mstype.float32) + self.array_mul = P.MatMul() + self.reshape = P.Reshape() + self.shape = tuple(embedding_shape) + self.layernorm = nn.LayerNorm((embedding_size,)) + self.dropout = nn.Dropout(1 - dropout_prob) + self.gather = P.GatherV2() + self.use_relative_positions = use_relative_positions + self.slice = P.StridedSlice() + self.full_position_embeddings = Parameter(initializer + (TruncatedNormal(initializer_range), + [max_position_embeddings, + embedding_size])) + + def construct(self, token_type_ids, word_embeddings): + """embedding postprocessor""" + output = word_embeddings + if self.use_token_type: + flat_ids = self.reshape(token_type_ids, self.shape_flat) + if self.use_one_hot_embeddings: + one_hot_ids = self.one_hot(flat_ids, + self.token_type_vocab_size, self.on_value, self.off_value) + token_type_embeddings = self.array_mul(one_hot_ids, + self.embedding_table) + else: + token_type_embeddings = self.gather(self.embedding_table, flat_ids, 0) + token_type_embeddings = self.reshape(token_type_embeddings, self.shape) + output += token_type_embeddings + if not self.use_relative_positions: + _, seq, width = self.shape + position_embeddings = 
self.slice(self.full_position_embeddings, (0, 0), (seq, width), (1, 1)) + position_embeddings = self.reshape(position_embeddings, (1, seq, width)) + output += position_embeddings + output = self.layernorm(output) + output = self.dropout(output) + return output + +class BertOutput(nn.Cell): + """ + Apply a linear computation to hidden status and a residual computation to input. + + Args: + in_channels (int): Input channels. + out_channels (int): Output channels. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + dropout_prob (float): The dropout probability. Default: 0.1. + compute_type (:class:`mindspore.dtype`): Compute type in BertTransformer. Default: mstype.float32. + """ + + def __init__(self, + in_channels, + out_channels, + initializer_range=0.02, + dropout_prob=0.1, + compute_type=mstype.float32,): + super(BertOutput, self).__init__() + self.dense = nn.Dense(in_channels, out_channels, + weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + self.dropout = nn.Dropout(1 - dropout_prob) + self.add = P.TensorAdd() + self.is_gpu = context.get_context('device_target') == "GPU" + if self.is_gpu: + self.layernorm = nn.LayerNorm((out_channels,)).to_float(mstype.float32) + self.compute_type = compute_type + else: + self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type) + self.cast = P.Cast() + + def construct(self, hidden_status, input_tensor): + """bert output""" + output = self.dense(hidden_status) + output = self.dropout(output) + output = self.add(input_tensor, output) + output = self.layernorm(output) + if self.is_gpu: + output = self.cast(output, self.compute_type) + return output + + +class RelaPosMatrixGenerator(nn.Cell): + """ + Generates matrix of relative positions between inputs. + + Args: + length (int): Length of one dim for the matrix to be generated. + max_relative_position (int): Max value of relative position. 
+ """ + + def __init__(self, length, max_relative_position): + super(RelaPosMatrixGenerator, self).__init__() + self._length = length + self._max_relative_position = Tensor(max_relative_position, dtype=mstype.int32) + self._min_relative_position = Tensor(-max_relative_position, dtype=mstype.int32) + self.range_length = -length + 1 + self.tile = P.Tile() + self.range_mat = P.Reshape() + self.sub = P.Sub() + self.expanddims = P.ExpandDims() + self.cast = P.Cast() + + def construct(self): + """position matrix generator""" + range_vec_row_out = self.cast(F.tuple_to_array(F.make_range(self._length)), mstype.int32) + range_vec_col_out = self.range_mat(range_vec_row_out, (self._length, -1)) + tile_row_out = self.tile(range_vec_row_out, (self._length,)) + tile_col_out = self.tile(range_vec_col_out, (1, self._length)) + range_mat_out = self.range_mat(tile_row_out, (self._length, self._length)) + transpose_out = self.range_mat(tile_col_out, (self._length, self._length)) + distance_mat = self.sub(range_mat_out, transpose_out) + distance_mat_clipped = C.clip_by_value(distance_mat, + self._min_relative_position, + self._max_relative_position) + # Shift values to be >=0. Each integer still uniquely identifies a + # relative position difference. + final_mat = distance_mat_clipped + self._max_relative_position + return final_mat + + +class RelaPosEmbeddingsGenerator(nn.Cell): + """ + Generates tensor of size [length, length, depth]. + + Args: + length (int): Length of one dim for the matrix to be generated. + depth (int): Size of each attention head. + max_relative_position (int): Maxmum value of relative position. + initializer_range (float): Initialization value of TruncatedNormal. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. 
+ """ + + def __init__(self, + length, + depth, + max_relative_position, + initializer_range, + use_one_hot_embeddings=False): + super(RelaPosEmbeddingsGenerator, self).__init__() + self.depth = depth + self.vocab_size = max_relative_position * 2 + 1 + self.use_one_hot_embeddings = use_one_hot_embeddings + self.embeddings_table = Parameter( + initializer(TruncatedNormal(initializer_range), + [self.vocab_size, self.depth])) + self.relative_positions_matrix = RelaPosMatrixGenerator(length=length, + max_relative_position=max_relative_position) + self.reshape = P.Reshape() + self.one_hot = P.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + self.off_value = Tensor(0.0, mstype.float32) + self.shape = P.Shape() + self.gather = P.GatherV2() # index_select + self.matmul = P.BatchMatMul() + + def construct(self): + """position embedding generation""" + relative_positions_matrix_out = self.relative_positions_matrix() + # Generate embedding for each relative position of dimension depth. + if self.use_one_hot_embeddings: + flat_relative_positions_matrix = self.reshape(relative_positions_matrix_out, (-1,)) + one_hot_relative_positions_matrix = self.one_hot( + flat_relative_positions_matrix, self.vocab_size, self.on_value, self.off_value) + embeddings = self.matmul(one_hot_relative_positions_matrix, self.embeddings_table) + my_shape = self.shape(relative_positions_matrix_out) + (self.depth,) + embeddings = self.reshape(embeddings, my_shape) + else: + embeddings = self.gather(self.embeddings_table, + relative_positions_matrix_out, 0) + return embeddings + + +class SaturateCast(nn.Cell): + """ + Performs a safe saturating cast. This operation applies proper clamping before casting to prevent + the danger that the value will overflow or underflow. + + Args: + src_type (:class:`mindspore.dtype`): The type of the elements of the input tensor. Default: mstype.float32. + dst_type (:class:`mindspore.dtype`): The type of the elements of the output tensor. Default: mstype.float32. 
+ """ + + def __init__(self, src_type=mstype.float32, dst_type=mstype.float32): + super(SaturateCast, self).__init__() + np_type = mstype.dtype_to_nptype(dst_type) + min_type = np.finfo(np_type).min + max_type = np.finfo(np_type).max + self.tensor_min_type = Tensor([min_type], dtype=src_type) + self.tensor_max_type = Tensor([max_type], dtype=src_type) + self.min_op = P.Minimum() + self.max_op = P.Maximum() + self.cast = P.Cast() + self.dst_type = dst_type + + def construct(self, x): + """saturate cast""" + out = self.max_op(x, self.tensor_min_type) + out = self.min_op(out, self.tensor_max_type) + return self.cast(out, self.dst_type) + + +class BertAttention(nn.Cell): + """ + Apply multi-headed attention from "from_tensor" to "to_tensor". + + Args: + from_tensor_width (int): Size of last dim of from_tensor. + to_tensor_width (int): Size of last dim of to_tensor. + from_seq_length (int): Length of from_tensor sequence. + to_seq_length (int): Length of to_tensor sequence. + num_attention_heads (int): Number of attention heads. Default: 1. + size_per_head (int): Size of each attention head. Default: 512. + query_act (str): Activation function for the query transform. Default: None. + key_act (str): Activation function for the key transform. Default: None. + value_act (str): Activation function for the value transform. Default: None. + has_attention_mask (bool): Specifies whether to use attention mask. Default: False. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.0. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + do_return_2d_tensor (bool): True for return 2d tensor. False for return 3d + tensor. Default: False. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + compute_type (:class:`mindspore.dtype`): Compute type in BertAttention. 
Default: mstype.float32. + """ + + def __init__(self, + from_tensor_width, + to_tensor_width, + from_seq_length, + to_seq_length, + num_attention_heads=1, + size_per_head=512, + query_act=None, + key_act=None, + value_act=None, + has_attention_mask=False, + attention_probs_dropout_prob=0.0, + use_one_hot_embeddings=False, + initializer_range=0.02, + do_return_2d_tensor=False, + use_relative_positions=False, + compute_type=mstype.float32): + super(BertAttention, self).__init__() + self.from_seq_length = from_seq_length + self.to_seq_length = to_seq_length + self.num_attention_heads = num_attention_heads + self.size_per_head = size_per_head + self.has_attention_mask = has_attention_mask + self.use_relative_positions = use_relative_positions + self.scores_mul = Tensor([1.0 / math.sqrt(float(self.size_per_head))], dtype=compute_type) + self.reshape = P.Reshape() + self.shape_from_2d = (-1, from_tensor_width) + self.shape_to_2d = (-1, to_tensor_width) + weight = TruncatedNormal(initializer_range) + units = num_attention_heads * size_per_head + self.query_layer = nn.Dense(from_tensor_width, + units, + activation=query_act, + weight_init=weight).to_float(compute_type) + self.key_layer = nn.Dense(to_tensor_width, + units, + activation=key_act, + weight_init=weight).to_float(compute_type) + self.value_layer = nn.Dense(to_tensor_width, + units, + activation=value_act, + weight_init=weight).to_float(compute_type) + self.shape_from = (-1, from_seq_length, num_attention_heads, size_per_head) + self.shape_to = (-1, to_seq_length, num_attention_heads, size_per_head) + self.matmul_trans_b = P.BatchMatMul(transpose_b=True) + self.multiply = P.Mul() + self.transpose = P.Transpose() + self.trans_shape = (0, 2, 1, 3) + self.trans_shape_relative = (2, 0, 1, 3) + self.trans_shape_position = (1, 2, 0, 3) + self.multiply_data = Tensor([-10000.0,], dtype=compute_type) + self.matmul = P.BatchMatMul() + self.softmax = nn.Softmax() + self.dropout = nn.Dropout(1 - attention_probs_dropout_prob) 
+ if self.has_attention_mask: + self.expand_dims = P.ExpandDims() + self.sub = P.Sub() + self.add = P.TensorAdd() + self.cast = P.Cast() + self.get_dtype = P.DType() + if do_return_2d_tensor: + self.shape_return = (-1, num_attention_heads * size_per_head) + else: + self.shape_return = (-1, from_seq_length, num_attention_heads * size_per_head) + self.cast_compute_type = SaturateCast(dst_type=compute_type) + if self.use_relative_positions: + self._generate_relative_positions_embeddings = \ + RelaPosEmbeddingsGenerator(length=to_seq_length, + depth=size_per_head, + max_relative_position=16, + initializer_range=initializer_range, + use_one_hot_embeddings=use_one_hot_embeddings) + + def construct(self, from_tensor, to_tensor, attention_mask): + """bert attention""" + # reshape 2d/3d input tensors to 2d + from_tensor_2d = self.reshape(from_tensor, self.shape_from_2d) + to_tensor_2d = self.reshape(to_tensor, self.shape_to_2d) + query_out = self.query_layer(from_tensor_2d) + key_out = self.key_layer(to_tensor_2d) + value_out = self.value_layer(to_tensor_2d) + query_layer = self.reshape(query_out, self.shape_from) + query_layer = self.transpose(query_layer, self.trans_shape) + key_layer = self.reshape(key_out, self.shape_to) + key_layer = self.transpose(key_layer, self.trans_shape) + attention_scores = self.matmul_trans_b(query_layer, key_layer) + # use_relative_position, supplementary logic + if self.use_relative_positions: + # relations_keys is [F|T, F|T, H] + relations_keys = self._generate_relative_positions_embeddings() + relations_keys = self.cast_compute_type(relations_keys) + # query_layer_t is [F, B, N, H] + query_layer_t = self.transpose(query_layer, self.trans_shape_relative) + # query_layer_r is [F, B * N, H] + query_layer_r = self.reshape(query_layer_t, + (self.from_seq_length, + -1, + self.size_per_head)) + # key_position_scores is [F, B * N, F|T] + key_position_scores = self.matmul_trans_b(query_layer_r, + relations_keys) + # key_position_scores_r is [F, B, 
N, F|T] + key_position_scores_r = self.reshape(key_position_scores, + (self.from_seq_length, + -1, + self.num_attention_heads, + self.from_seq_length)) + # key_position_scores_r_t is [B, N, F, F|T] + key_position_scores_r_t = self.transpose(key_position_scores_r, + self.trans_shape_position) + attention_scores = attention_scores + key_position_scores_r_t + attention_scores = self.multiply(self.scores_mul, attention_scores) + if self.has_attention_mask: + attention_mask = self.expand_dims(attention_mask, 1) + multiply_out = self.sub(self.cast(F.tuple_to_array((1.0,)), self.get_dtype(attention_scores)), + self.cast(attention_mask, self.get_dtype(attention_scores))) + adder = self.multiply(multiply_out, self.multiply_data) + attention_scores = self.add(adder, attention_scores) + attention_probs = self.softmax(attention_scores) + attention_probs = self.dropout(attention_probs) + value_layer = self.reshape(value_out, self.shape_to) + value_layer = self.transpose(value_layer, self.trans_shape) + context_layer = self.matmul(attention_probs, value_layer) + # use_relative_position, supplementary logic + if self.use_relative_positions: + # relations_values is [F|T, F|T, H] + relations_values = self._generate_relative_positions_embeddings() + relations_values = self.cast_compute_type(relations_values) + # attention_probs_t is [F, B, N, T] + attention_probs_t = self.transpose(attention_probs, self.trans_shape_relative) + # attention_probs_r is [F, B * N, T] + attention_probs_r = self.reshape( + attention_probs_t, + (self.from_seq_length, + -1, + self.to_seq_length)) + # value_position_scores is [F, B * N, H] + value_position_scores = self.matmul(attention_probs_r, + relations_values) + # value_position_scores_r is [F, B, N, H] + value_position_scores_r = self.reshape(value_position_scores, + (self.from_seq_length, + -1, + self.num_attention_heads, + self.size_per_head)) + # value_position_scores_r_t is [B, N, F, H] + value_position_scores_r_t = 
self.transpose(value_position_scores_r, + self.trans_shape_position) + context_layer = context_layer + value_position_scores_r_t + context_layer = self.transpose(context_layer, self.trans_shape) + context_layer = self.reshape(context_layer, self.shape_return) + return context_layer, attention_scores + + +class BertSelfAttention(nn.Cell): + """ + Apply self-attention. + + Args: + seq_length (int): Length of input sequence. + hidden_size (int): Size of the bert encoder layers. + num_attention_heads (int): Number of attention heads. Default: 12. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.1. + use_one_hot_embeddings (bool): Specifies whether to use one_hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + compute_type (:class:`mindspore.dtype`): Compute type in BertSelfAttention. Default: mstype.float32. 
+ """ + + def __init__(self, + seq_length, + hidden_size, + num_attention_heads=12, + attention_probs_dropout_prob=0.1, + use_one_hot_embeddings=False, + initializer_range=0.02, + hidden_dropout_prob=0.1, + use_relative_positions=False, + compute_type=mstype.float32): + super(BertSelfAttention, self).__init__() + if hidden_size % num_attention_heads != 0: + raise ValueError("The hidden size (%d) is not a multiple of the number " + "of attention heads (%d)" % (hidden_size, num_attention_heads)) + self.size_per_head = int(hidden_size / num_attention_heads) + self.attention = BertAttention( + from_tensor_width=hidden_size, + to_tensor_width=hidden_size, + from_seq_length=seq_length, + to_seq_length=seq_length, + num_attention_heads=num_attention_heads, + size_per_head=self.size_per_head, + attention_probs_dropout_prob=attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=initializer_range, + use_relative_positions=use_relative_positions, + has_attention_mask=True, + do_return_2d_tensor=True, + compute_type=compute_type) + self.output = BertOutput(in_channels=hidden_size, + out_channels=hidden_size, + initializer_range=initializer_range, + dropout_prob=hidden_dropout_prob, + compute_type=compute_type) + self.reshape = P.Reshape() + self.shape = (-1, hidden_size) + + def construct(self, input_tensor, attention_mask): + """bert self attention""" + input_tensor = self.reshape(input_tensor, self.shape) + attention_output, attention_scores = self.attention(input_tensor, input_tensor, attention_mask) + output = self.output(attention_output, input_tensor) + return output, attention_scores + + +class BertEncoderCell(nn.Cell): + """ + Encoder cells used in BertTransformer. + + Args: + hidden_size (int): Size of the bert encoder layers. Default: 768. + seq_length (int): Length of input sequence. Default: 512. + num_attention_heads (int): Number of attention heads. Default: 12. + intermediate_size (int): Size of intermediate layer. 
Default: 3072. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.02. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + hidden_act (str): Activation function. Default: "gelu". + compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: mstype.float32. + """ + + def __init__(self, + hidden_size=768, + seq_length=512, + num_attention_heads=12, + intermediate_size=3072, + attention_probs_dropout_prob=0.02, + use_one_hot_embeddings=False, + initializer_range=0.02, + hidden_dropout_prob=0.1, + use_relative_positions=False, + hidden_act="gelu", + compute_type=mstype.float32): + super(BertEncoderCell, self).__init__() + self.attention = BertSelfAttention( + hidden_size=hidden_size, + seq_length=seq_length, + num_attention_heads=num_attention_heads, + attention_probs_dropout_prob=attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=initializer_range, + hidden_dropout_prob=hidden_dropout_prob, + use_relative_positions=use_relative_positions, + compute_type=compute_type) + self.intermediate = nn.Dense(in_channels=hidden_size, + out_channels=intermediate_size, + activation=hidden_act, + weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + self.output = BertOutput(in_channels=intermediate_size, + out_channels=hidden_size, + initializer_range=initializer_range, + dropout_prob=hidden_dropout_prob, + compute_type=compute_type) + + def construct(self, hidden_states, attention_mask): + """bert encoder cell""" + # self-attention + attention_output, attention_scores = self.attention(hidden_states, attention_mask) + # feed construct + 
intermediate_output = self.intermediate(attention_output) + # add and normalize + output = self.output(intermediate_output, attention_output) + return output, attention_scores + + +class BertTransformer(nn.Cell): + """ + Multi-layer bert transformer. + + Args: + hidden_size (int): Size of the encoder layers. + seq_length (int): Length of input sequence. + num_hidden_layers (int): Number of hidden layers in encoder cells. + num_attention_heads (int): Number of attention heads in encoder cells. Default: 12. + intermediate_size (int): Size of intermediate layer in encoder cells. Default: 3072. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.1. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + hidden_act (str): Activation function used in the encoder cells. Default: "gelu". + compute_type (:class:`mindspore.dtype`): Compute type in BertTransformer. Default: mstype.float32. + return_all_encoders (bool): Specifies whether to return all encoders. Default: False. 
+ """ + + def __init__(self, + hidden_size, + seq_length, + num_hidden_layers, + num_attention_heads=12, + intermediate_size=3072, + attention_probs_dropout_prob=0.1, + use_one_hot_embeddings=False, + initializer_range=0.02, + hidden_dropout_prob=0.1, + use_relative_positions=False, + hidden_act="gelu", + compute_type=mstype.float32, + return_all_encoders=False): + super(BertTransformer, self).__init__() + self.return_all_encoders = return_all_encoders + layers = [] + for _ in range(num_hidden_layers): + layer = BertEncoderCell(hidden_size=hidden_size, + seq_length=seq_length, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=initializer_range, + hidden_dropout_prob=hidden_dropout_prob, + use_relative_positions=use_relative_positions, + hidden_act=hidden_act, + compute_type=compute_type) + layers.append(layer) + self.layers = nn.CellList(layers) + self.reshape = P.Reshape() + self.shape = (-1, hidden_size) + self.out_shape = (-1, seq_length, hidden_size) + + def construct(self, input_tensor, attention_mask): + """bert transformer""" + prev_output = self.reshape(input_tensor, self.shape) + all_encoder_layers = () + all_encoder_atts = () + all_encoder_outputs = () + all_encoder_outputs += (prev_output,) + for layer_module in self.layers: + layer_output, encoder_att = layer_module(prev_output, attention_mask) + prev_output = layer_output + if self.return_all_encoders: + all_encoder_outputs += (layer_output,) + layer_output = self.reshape(layer_output, self.out_shape) + all_encoder_layers += (layer_output,) + all_encoder_atts += (encoder_att,) + if not self.return_all_encoders: + prev_output = self.reshape(prev_output, self.out_shape) + all_encoder_layers += (prev_output,) + return all_encoder_layers, all_encoder_outputs, all_encoder_atts + + +class CreateAttentionMaskFromInputMask(nn.Cell): + """ + Create 
attention mask according to input mask. + + Args: + config (Class): Configuration for BertModel. + """ + + def __init__(self, config): + super(CreateAttentionMaskFromInputMask, self).__init__() + self.input_mask = None + self.cast = P.Cast() + self.reshape = P.Reshape() + self.shape = (-1, 1, config.seq_length) + + def construct(self, input_mask): + attention_mask = self.cast(self.reshape(input_mask, self.shape), mstype.float32) + return attention_mask + + +class BertModel(nn.Cell): + """ + Bidirectional Encoder Representations from Transformers. + + Args: + config (Class): Configuration for BertModel. + is_training (bool): True for training mode. False for eval mode. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + """ + + def __init__(self, + config, + is_training, + use_one_hot_embeddings=False): + super(BertModel, self).__init__() + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + self.seq_length = config.seq_length + self.hidden_size = config.hidden_size + self.num_hidden_layers = config.num_hidden_layers + self.embedding_size = config.hidden_size + self.token_type_ids = None + self.last_idx = self.num_hidden_layers - 1 + output_embedding_shape = [-1, self.seq_length, + self.embedding_size] + self.bert_embedding_lookup = EmbeddingLookup( + vocab_size=config.vocab_size, + embedding_size=self.embedding_size, + embedding_shape=output_embedding_shape, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=config.initializer_range) + self.bert_embedding_postprocessor = EmbeddingPostprocessor( + use_relative_positions=config.use_relative_positions, + embedding_size=self.embedding_size, + embedding_shape=output_embedding_shape, + use_token_type=True, + token_type_vocab_size=config.type_vocab_size, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=0.02, + 
max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + self.bert_encoder = BertTransformer( + hidden_size=self.hidden_size, + seq_length=self.seq_length, + num_attention_heads=config.num_attention_heads, + num_hidden_layers=self.num_hidden_layers, + intermediate_size=config.intermediate_size, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=config.initializer_range, + hidden_dropout_prob=config.hidden_dropout_prob, + use_relative_positions=config.use_relative_positions, + hidden_act=config.hidden_act, + compute_type=config.compute_type, + return_all_encoders=True) + self.cast = P.Cast() + self.dtype = config.dtype + self.cast_compute_type = SaturateCast(dst_type=config.compute_type) + self.slice = P.StridedSlice() + self.squeeze_1 = P.Squeeze(axis=1) + self.dense = nn.Dense(self.hidden_size, self.hidden_size, + activation="tanh", + weight_init=TruncatedNormal(config.initializer_range)).to_float(config.compute_type) + self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config) + + def construct(self, input_ids, token_type_ids, input_mask): + """bert model""" + # embedding + word_embeddings, embedding_tables = self.bert_embedding_lookup(input_ids) + embedding_output = self.bert_embedding_postprocessor(token_type_ids, word_embeddings) + # attention mask [batch_size, seq_length, seq_length] + attention_mask = self._create_attention_mask_from_input_mask(input_mask) + # bert encoder + encoder_output, encoder_layers, layer_atts = self.bert_encoder(self.cast_compute_type(embedding_output), + attention_mask) + sequence_output = self.cast(encoder_output[self.last_idx], self.dtype) + # pooler + batch_size = P.Shape()(input_ids)[0] + sequence_slice = self.slice(sequence_output, + (0, 0, 0), + (batch_size, 1, self.hidden_size), + (1, 1, 1)) + first_token = self.squeeze_1(sequence_slice) + pooled_output = 
self.dense(first_token) + pooled_output = self.cast(pooled_output, self.dtype) + encoder_outputs = () + for output in encoder_layers: + encoder_outputs += (self.cast(output, self.dtype),) + attention_outputs = () + for output in layer_atts: + attention_outputs += (self.cast(output, self.dtype),) + return sequence_output, pooled_output, embedding_tables, encoder_outputs, attention_outputs + + +class TinyBertModel(nn.Cell): + """ + Bidirectional Encoder Representations from Transformers. + + Args: + config (Class): Configuration for BertModel. + is_training (bool): True for training mode. False for eval mode. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + """ + + def __init__(self, + config, + is_training, + use_one_hot_embeddings=False): + super(TinyBertModel, self).__init__() + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + self.seq_length = config.seq_length + self.hidden_size = config.hidden_size + self.num_hidden_layers = config.num_hidden_layers + self.embedding_size = config.hidden_size + self.token_type_ids = None + self.last_idx = self.num_hidden_layers - 1 + output_embedding_shape = [-1, self.seq_length, + self.embedding_size] + self.tinybert_embedding_lookup = EmbeddingLookup( + vocab_size=config.vocab_size, + embedding_size=self.embedding_size, + embedding_shape=output_embedding_shape, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=config.initializer_range) + self.tinybert_embedding_postprocessor = EmbeddingPostprocessor( + use_relative_positions=config.use_relative_positions, + embedding_size=self.embedding_size, + embedding_shape=output_embedding_shape, + use_token_type=True, + token_type_vocab_size=config.type_vocab_size, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=0.02, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) 
+ self.tinybert_encoder = BertTransformer( + hidden_size=self.hidden_size, + seq_length=self.seq_length, + num_attention_heads=config.num_attention_heads, + num_hidden_layers=self.num_hidden_layers, + intermediate_size=config.intermediate_size, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=config.initializer_range, + hidden_dropout_prob=config.hidden_dropout_prob, + use_relative_positions=config.use_relative_positions, + hidden_act=config.hidden_act, + compute_type=config.compute_type, + return_all_encoders=True) + self.cast = P.Cast() + self.dtype = config.dtype + self.cast_compute_type = SaturateCast(dst_type=config.compute_type) + self.slice = P.StridedSlice() + self.squeeze_1 = P.Squeeze(axis=1) + self.dense = nn.Dense(self.hidden_size, self.hidden_size, + activation="tanh", + weight_init=TruncatedNormal(config.initializer_range)).to_float(config.compute_type) + self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config) + + def construct(self, input_ids, token_type_ids, input_mask): + """tiny bert model""" + # embedding + word_embeddings, embedding_tables = self.tinybert_embedding_lookup(input_ids) + embedding_output = self.tinybert_embedding_postprocessor(token_type_ids, + word_embeddings) + # attention mask [batch_size, seq_length, seq_length] + attention_mask = self._create_attention_mask_from_input_mask(input_mask) + # bert encoder + encoder_output, encoder_layers, layer_atts = self.tinybert_encoder(self.cast_compute_type(embedding_output), + attention_mask) + sequence_output = self.cast(encoder_output[self.last_idx], self.dtype) + # pooler + batch_size = P.Shape()(input_ids)[0] + sequence_slice = self.slice(sequence_output, + (0, 0, 0), + (batch_size, 1, self.hidden_size), + (1, 1, 1)) + first_token = self.squeeze_1(sequence_slice) + pooled_output = self.dense(first_token) + pooled_output = self.cast(pooled_output, self.dtype) + 
class BertModelCLS(nn.Cell):
    """
    Sequence-classification head on top of `BertModel`.

    The pooled output of the backbone is cast to `config.dtype`, passed through
    a ReLU and then through one dense layer that produces `num_labels` logits.
    Log-probabilities are obtained with `LogSoftmax(axis=-1)`.

    Args:
        config (BertConfig): Backbone configuration; `initializer_range`, `dtype`,
            `hidden_size` and `compute_type` are read here.
        is_training (bool): Forwarded to `BertModel`.
        num_labels (int): Number of output logits. Default: 2.
        dropout_prob (float): Accepted for interface compatibility only; this class
            does not use it. Default: 0.0.
        use_one_hot_embeddings (bool): Forwarded to `BertModel`. Default: False.
        phase_type (str): "teacher" stores the classifier as `self.dense`; any other
            value stores it as `self.dense_1`. Default: "student".

    Returns:
        Tuple `(seq_output, att_output, logits, log_probs)`, where `seq_output` and
        `att_output` are the fourth and fifth outputs of the backbone.
    """

    def __init__(self, config, is_training, num_labels=2, dropout_prob=0.0,
                 use_one_hot_embeddings=False, phase_type="student"):
        super(BertModelCLS, self).__init__()
        self.bert = BertModel(config, is_training, use_one_hot_embeddings)
        self.cast = P.Cast()
        self.weight_init = TruncatedNormal(config.initializer_range)
        self.log_softmax = P.LogSoftmax(axis=-1)
        self.dtype = config.dtype
        self.num_labels = num_labels
        self.phase_type = phase_type
        # The classifier attribute name differs between teacher and student.
        if self.phase_type == "teacher":
            self.dense = nn.Dense(config.hidden_size, self.num_labels, weight_init=self.weight_init,
                                  has_bias=True).to_float(config.compute_type)
        else:
            self.dense_1 = nn.Dense(config.hidden_size, self.num_labels, weight_init=self.weight_init,
                                    has_bias=True).to_float(config.compute_type)
        # NOTE(review): despite its name this attribute is a ReLU, not a dropout
        # layer. It is kept as-is so the forward computation does not change.
        self.dropout = nn.ReLU()

    def construct(self, input_ids, token_type_id, input_mask):
        """Run the backbone and the classification head."""
        _, pooled_output, _, seq_output, att_output = self.bert(input_ids, token_type_id, input_mask)
        cls = self.cast(pooled_output, self.dtype)
        cls = self.dropout(cls)
        if self.phase_type == "teacher":
            logits = self.dense(cls)
        else:
            logits = self.dense_1(cls)
        logits = self.cast(logits, self.dtype)
        log_probs = self.log_softmax(logits)
        # Train, eval, teacher and student all return the same tuple; the former
        # phase check selected between two identical returns and was removed.
        return seq_output, att_output, logits, log_probs
class DataType(Enum):
    """Enumerate supported dataset format"""
    TFRECORD = 1
    MINDRECORD = 2


def create_tinybert_dataset(batch_size=32, device_num=1, rank=0,
                            do_shuffle="true", data_dir=None, schema_dir=None,
                            data_type=DataType.TFRECORD, seq_length=128, task_type=mstype.int32, drop_remainder=True):
    """
    Build the batched task dataset from TFRecord or MindRecord files.

    Args:
        batch_size (int): Rows per batch. Default: 32.
        device_num (int): Number of shards the data is split into. Default: 1.
        rank (int): Shard id of this process. Default: 0.
        do_shuffle (str): Shuffling is enabled only for the exact string "true". Default: "true".
        data_dir (Union[str, list]): One data file or a list of data files.
        schema_dir: Unused; kept for interface compatibility.
        data_type (DataType): File format; anything other than MINDRECORD is read as TFRecord.
        seq_length (int): Every column is sliced to its first `seq_length` elements. Default: 128.
        task_type: Unused; labels are always cast to float32.
        drop_remainder (bool): Forwarded to `batch`. Default: True.

    Returns:
        The batched dataset with columns input_ids, input_mask, segment_ids, label_ids.
    """
    data_files = data_dir if isinstance(data_dir, list) else [data_dir]
    columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    want_shuffle = (do_shuffle == "true")
    single_device = (device_num == 1)

    if data_type == DataType.MINDRECORD:
        data_set = de.MindDataset(data_files, columns_list=columns_list,
                                  shuffle=want_shuffle, num_shards=device_num, shard_id=rank)
    else:
        # On a single device the reader itself does not shuffle and rows are not
        # equalised across shards (unchanged from the original behaviour).
        data_set = de.TFRecordDataset(data_files, None,
                                      columns_list=columns_list,
                                      shuffle=(want_shuffle and not single_device),
                                      num_shards=device_num, shard_id=rank,
                                      shard_equal_rows=not single_device)
        # Fix: the original tested a flag it had just forced to False, so this
        # buffer shuffle could never run and single-device TFRecord data was
        # never shuffled. Test the caller's request instead.
        if single_device and want_shuffle:
            data_set = data_set.shuffle(10000)

    type_cast_op = C.TypeCast(mstype.int32)
    slice_op = C.Slice(slice(0, seq_length, 1))
    label_type = mstype.float32
    for column in ("segment_ids", "input_mask", "input_ids"):
        data_set = data_set.map(operations=[type_cast_op, slice_op], input_columns=[column])
    data_set = data_set.map(operations=[C.TypeCast(label_type), slice_op], input_columns=["label_ids"])
    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=drop_remainder)

    return data_set


def generator_squad(data_features):
    """Yield (input_ids, input_mask, segment_ids, unique_id) for every feature."""
    for feature in data_features:
        yield (feature.input_ids, feature.input_mask, feature.segment_ids, feature.unique_id)


def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, schema_file_path=None,
                         is_training=True, do_shuffle=True):
    """
    Build the SQuAD fine-tuning or evaluation dataset.

    Args:
        batch_size (int): Rows per batch; the last partial batch is dropped. Default: 1.
        repeat_count (int): Number of repetitions of the dataset. Default: 1.
        data_file_path: TFRecord path when training; otherwise the iterable of
            features handed to `generator_squad`.
        schema_file_path (str): Schema file for the TFRecord reader; "" is treated as None.
        is_training (bool): Selects the TFRecord reader (True) or the generator source (False).
        do_shuffle (bool): Forwarded to the dataset source. Default: True.

    Returns:
        The batched dataset.
    """
    type_cast_op = C.TypeCast(mstype.int32)
    if is_training:
        # Fix: this module imports the dataset engine as `de`; the original used
        # an undefined name `ds` here and raised NameError on the first call.
        data_set = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
                                      columns_list=["input_ids", "input_mask", "segment_ids", "start_positions",
                                                    "end_positions", "unique_ids", "is_impossible"],
                                      shuffle=do_shuffle)
        data_set = data_set.map(operations=type_cast_op, input_columns="start_positions")
        data_set = data_set.map(operations=type_cast_op, input_columns="end_positions")
    else:
        data_set = de.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle,
                                       column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"])
    data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
    data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
    data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
    data_set = data_set.map(operations=type_cast_op, input_columns="unique_ids")
    data_set = data_set.repeat(repeat_count)
    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=True)
    return data_set
# Clipping mode and threshold passed to `clip_grad` by the train cells in this
# file: 1 selects clip-by-norm (0 would select clip-by-value).
GRADIENT_CLIP_TYPE = 1
GRADIENT_CLIP_VALUE = 1.0

clip_grad = C.MultitypeFuncGraph("clip_grad")
@clip_grad.register("Number", "Number", "Tensor")
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip one gradient tensor.

    Registered for the signature ("Number", "Number", "Tensor"), i.e. it handles
    a single gradient; the train cells in this file map it over the gradient
    tuple with HyperMap.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'. Any other
            value returns `grad` unchanged.
        clip_value (float): Specifies how much to clip.
        grad (Tensor): Gradient to clip.

    Outputs:
        Tensor, the clipped gradient.
    """
    if clip_type not in (0, 1):
        return grad
    dt = F.dtype(grad)
    if clip_type == 0:
        # clip element-wise into [-clip_value, clip_value], bounds cast to grad's dtype
        new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad

grad_scale = C.MultitypeFuncGraph("grad_scale")
reciprocal = P.Reciprocal()

@grad_scale.register("Tensor", "Tensor")
def tensor_grad_scale(scale, grad):
    """Return `grad` multiplied by the reciprocal of `scale` (i.e. grad / scale)."""
    return grad * reciprocal(scale)

class ClipGradients(nn.Cell):
    """
    Clip every gradient in a sequence of gradients.

    Args:
        grads (tuple[Tensor]): Gradients to clip.
        clip_type (int): 0 clips by value, 1 clips by norm; any other value
            returns `grads` unchanged.
        clip_value (float): Clipping threshold; for clip-by-value the range is
            [-clip_value, clip_value].

    Returns:
        tuple[Tensor], the clipped gradients in the same order.
    """
    def __init__(self):
        super(ClipGradients, self).__init__()
        self.clip_by_norm = nn.ClipByNorm()
        self.cast = P.Cast()
        self.dtype = P.DType()

    def construct(self,
                  grads,
                  clip_type,
                  clip_value):
        """clip gradients"""
        if clip_type not in (0, 1):
            return grads
        new_grads = ()
        for grad in grads:
            # the threshold is cast to each gradient's own dtype before clipping
            dt = self.dtype(grad)
            if clip_type == 0:
                t = C.clip_by_value(grad, self.cast(F.tuple_to_array((-clip_value,)), dt),
                                    self.cast(F.tuple_to_array((clip_value,)), dt))
            else:
                t = self.clip_by_norm(grad, self.cast(F.tuple_to_array((clip_value,)), dt))
            new_grads = new_grads + (t,)
        return new_grads

class SoftCrossEntropy(nn.Cell):
    """
    Soft-target cross entropy between two sets of logits.

    `targets` are turned into probabilities with softmax over the last axis,
    `predicts` into log-probabilities with log-softmax over the last axis, and
    the element-wise product `-target_prob * likelihood` is reduced with
    ReduceMean (called without an axis argument). The result is cast to float32.
    """
    def __init__(self):
        super(SoftCrossEntropy, self).__init__()
        self.log_softmax = P.LogSoftmax(axis=-1)
        self.softmax = P.Softmax(axis=-1)
        self.reduce_mean = P.ReduceMean()
        self.cast = P.Cast()

    def construct(self, predicts, targets):
        """Return the float32 soft cross-entropy of `predicts` against `targets`."""
        likelihood = self.log_softmax(predicts)
        target_prob = self.softmax(targets)
        loss = self.reduce_mean(-target_prob * likelihood)

        return self.cast(loss, mstype.float32)
+ """ + def __init__(self, teacher_config, teacher_ckpt, student_config, is_training, use_one_hot_embeddings=False, + is_att_fit=True, is_rep_fit=True): + super(BertNetworkWithLoss_gd, self).__init__() + # load teacher model + self.teacher = BertModel(teacher_config, False, use_one_hot_embeddings) + param_dict = load_checkpoint(teacher_ckpt) + new_param_dict = {} + for key, value in param_dict.items(): + new_key = re.sub('^bert.bert.', 'teacher.', key) + new_param_dict[new_key] = value + load_param_into_net(self.teacher, new_param_dict) + # no_grad + self.teacher.set_train(False) + params = self.teacher.trainable_params() + for param in params: + param.requires_grad = False + # student model + self.bert = TinyBertModel(student_config, is_training, use_one_hot_embeddings) + self.cast = P.Cast() + self.fit_dense = nn.Dense(student_config.hidden_size, + teacher_config.hidden_size).to_float(teacher_config.compute_type) + self.teacher_layers_num = teacher_config.num_hidden_layers + self.student_layers_num = student_config.num_hidden_layers + self.layers_per_block = int(self.teacher_layers_num / self.student_layers_num) + self.is_att_fit = is_att_fit + self.is_rep_fit = is_rep_fit + self.loss_mse = nn.MSELoss() + self.select = P.Select() + self.zeroslike = P.ZerosLike() + self.dtype = teacher_config.dtype + + def construct(self, + input_ids, + input_mask, + token_type_id): + """general distill network with loss""" + # teacher model + _, _, _, teacher_seq_output, teacher_att_output = self.teacher(input_ids, token_type_id, input_mask) + # student model + _, _, _, student_seq_output, student_att_output = self.bert(input_ids, token_type_id, input_mask) + total_loss = 0 + if self.is_att_fit: + selected_teacher_att_output = () + selected_student_att_output = () + for i in range(self.student_layers_num): + selected_teacher_att_output += (teacher_att_output[(i + 1) * self.layers_per_block - 1],) + selected_student_att_output += (student_att_output[i],) + att_loss = 0 + for i in 
class BertTrainWithLossScaleCell(nn.Cell):
    """
    Encapsulation class of bert network training.

    Append an optimizer to the training network after that the construct
    function can be called to create the backward graph. One call runs the
    forward pass, computes loss-scaled gradients, un-scales and clips them,
    checks the float-status flag for overflow and applies the optimizer only
    when no overflow was reported.

    Args:
        network (Cell): The training network. Note that loss function should have been added.
        optimizer (Optimizer): Optimizer for updating the weights.
        scale_update_cell (Cell): Cell to do the loss scale. Default: None.

    Returns:
        Tuple `(loss, cond, scaling_sens)`: the loss, the overflow condition and
        the loss scale that was used.
    """
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(BertTrainWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True,
                                    sens_param=True)
        self.reducer_flag = False
        self.allreduce = P.AllReduce()
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        # gradients are reduced across devices only in data/hybrid parallel mode
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = F.identity
        self.degree = 1
        if self.reducer_flag:
            self.degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.cast = P.Cast()
        # NOTE(review): the NPU* float-status primitives look Ascend-specific,
        # while the README lists GPU as the hardware target; confirm.
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        # not referenced in construct below
        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()
        # loss_scale stays None unless a scale_update_cell is supplied; construct
        # then needs an explicit `sens`
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))

    @C.add_flags(has_effect=True)
    def construct(self,
                  input_ids,
                  input_mask,
                  token_type_id,
                  sens=None):
        """Defines the computation performed."""
        weights = self.weights
        loss = self.network(input_ids,
                            input_mask,
                            token_type_id)
        # an explicit `sens` overrides the managed loss scale
        if sens is None:
            scaling_sens = self.loss_scale
        else:
            scaling_sens = sens
        # alloc status and clear should be right before gradoperation
        init = self.alloc_status()
        self.clear_before_grad(init)
        grads = self.grad(self.network, weights)(input_ids,
                                                 input_mask,
                                                 token_type_id,
                                                 self.cast(scaling_sens,
                                                           mstype.float32))
        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
        # undo the loss scaling: every gradient is divided by scaling_sens * degree
        grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
        # read the status back and collapse it to one number; >= 1 means overflow
        self.get_status(init)
        flag_sum = self.reduce_sum(init, (0,))
        if self.is_distributed:
            # sum overflow flag over devices
            flag_reduce = self.allreduce(flag_sum)
            cond = self.less_equal(self.base, flag_reduce)
        else:
            cond = self.less_equal(self.base, flag_sum)
        overflow = cond
        # with a managed scale, the manager receives the scale and the condition
        # and its result is what gates the update
        if sens is None:
            overflow = self.loss_scaling_manager(self.loss_scale, cond)
        # skip the parameter update on overflow
        if overflow:
            succ = False
        else:
            succ = self.optimizer(grads)
        ret = (loss, cond, scaling_sens)
        return F.depend(ret, succ)
class BertNetworkWithLoss_td(nn.Cell):
    """
    Task loss of the student classifier.

    Builds the student `BertModelCLS` (quantized or plain), loads its weights
    from a checkpoint and, in `construct`, returns the loss between the student
    logits and the labels: sparse softmax cross entropy when `task_type` is
    "classification", mean squared error otherwise.

    Args:
        student_config (BertConfig): The config of the student model.
        student_ckpt (str): Checkpoint file loaded into the student.
        do_quant (Union[str, bool]): "True"/"true"/True selects `src.q8bert_model`,
            anything else selects `src.bert_model`.
        is_training (bool): Specifies whether to use the training mode.
        task_type (str): "classification" or any other value for the MSE branch.
        num_labels (int): Number of logits produced by the student.
        use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. Default: False.
        is_predistill (bool): When True, checkpoint keys are prefixed with 'bert.'
            before loading. Default: True.
        is_att_fit (bool): Stored on the cell; not read by `construct`. Default: True.
        is_rep_fit (bool): Stored on the cell; not read by `construct`. Default: True.
        temperature (float): Stored on the cell; not read by `construct`. Default: 1.0.
        dropout_prob (float): Forwarded to `BertModelCLS`. Default: 0.1.

    Returns:
        Tensor, the float32 loss of the network.
    """
    def __init__(self, student_config, student_ckpt, do_quant,
                 is_training, task_type, num_labels, use_one_hot_embeddings=False,
                 is_predistill=True, is_att_fit=True, is_rep_fit=True,
                 temperature=1.0, dropout_prob=0.1):
        super(BertNetworkWithLoss_td, self).__init__()

        # Build the student exactly once. The original first built a model from
        # the module-level import and then always replaced it in one of the two
        # branches below, paying for a full network construction that was thrown
        # away. The flag is compared case-insensitively because the README
        # documents the values as `true`/`false`, while the old check matched
        # only the exact string "True".
        if str(do_quant).lower() == "true":
            import src.q8bert_model as quant_bert_model
            self.bert = quant_bert_model.BertModelCLS(student_config, is_training, num_labels, dropout_prob,
                                                      use_one_hot_embeddings, "student")
        else:
            import src.bert_model as bert_model
            self.bert = bert_model.BertModelCLS(student_config, is_training, num_labels, dropout_prob,
                                                use_one_hot_embeddings, "student")

        # Remap checkpoint keys onto the student: 'tinybert_' becomes 'bert_', and
        # pre-distilled checkpoints additionally get a leading 'bert.' prefix.
        key_prefix = 'bert.' if is_predistill else ''
        param_dict = load_checkpoint(student_ckpt)
        new_param_dict = {re.sub('tinybert_', 'bert_', key_prefix + key): value
                          for key, value in param_dict.items()}
        load_param_into_net(self.bert, new_param_dict)

        self.cast = P.Cast()
        self.student_layers_num = student_config.num_hidden_layers
        self.is_predistill = is_predistill
        self.is_att_fit = is_att_fit
        self.is_rep_fit = is_rep_fit
        self.task_type = task_type
        self.temperature = temperature
        self.loss_mse = nn.MSELoss()
        self.select = P.Select()
        self.zeroslike = P.ZerosLike()
        self.dtype = student_config.dtype
        self.num_labels = num_labels
        self.soft_cross_entropy = SoftCrossEntropy()
        self.reshape = P.Reshape()
        self.lgt_fct = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

    def construct(self,
                  input_ids,
                  input_mask,
                  token_type_id,
                  label_ids):
        """task distill network with loss"""
        # student model; note the (input_ids, token_type_id, input_mask) argument order
        _, _, student_logits, _ = self.bert(input_ids, token_type_id, input_mask)
        total_loss = 0

        if self.task_type == "classification":
            # sparse cross entropy needs float32 logits and flat int32 labels
            student_logits = self.cast(student_logits, mstype.float32)
            label_ids_reshape = self.reshape(self.cast(label_ids, mstype.int32), (-1,))
            cls_loss = self.lgt_fct(student_logits, label_ids_reshape)
        else:
            # otherwise: MSE between flattened logits and labels
            student_logits = self.reshape(student_logits, (-1,))
            label_ids = self.reshape(label_ids, (-1,))
            cls_loss = self.loss_mse(student_logits, label_ids)
        total_loss += cls_loss
        return self.cast(total_loss, mstype.float32)
class BertEvaluationCell(nn.Cell):
    """
    One training step for a network that takes four input tensors
    (input_ids, input_mask, token_type_id, label_ids), without loss scaling.

    The step runs the forward pass, computes gradients with a fixed sensitivity
    `sens`, reduces them across devices in data/hybrid parallel mode, clips them
    and applies the optimizer.

    Args:
        network (Cell): The network that returns the loss.
        optimizer (Optimizer): Optimizer for updating the weights.
        sens (Number): Sensitivity fed to the gradient operation. Default: 1.0.

    Returns:
        The loss, made dependent on the optimizer update.
    """
    def __init__(self, network, optimizer, sens=1.0):
        super(BertEvaluationCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.sens = sens
        self.grad = C.GradOperation(get_by_list=True,
                                    sens_param=True)
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        # gradients are reduced across devices only in data/hybrid parallel mode
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = F.identity
        self.degree = 1
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            self.degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, self.degree)
        # not read by construct below
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.cast = P.Cast()
        self.hyper_map = C.HyperMap()

    def construct(self,
                  input_ids,
                  input_mask,
                  token_type_id,
                  label_ids):
        """Defines the computation performed."""
        weights = self.weights
        loss = self.network(input_ids,
                            input_mask,
                            token_type_id,
                            label_ids)
        # the last argument is the sensitivity, passed as a float32 tensor
        grads = self.grad(self.network, weights)(input_ids,
                                                 input_mask,
                                                 token_type_id,
                                                 label_ids,
                                                 self.cast(F.tuple_to_array((self.sens,)),
                                                           mstype.float32))
        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
        succ = self.optimizer(grads)
        return F.depend(loss, succ)
class BertConfig:
    """
    Plain container for the hyper-parameters of `BertModel`.

    Every constructor argument is stored unchanged as an attribute of the same name.

    Args:
        seq_length (int): Length of input sequence. Default: 128.
        vocab_size (int): Number of entries in the word-embedding table. Default: 32000.
        hidden_size (int): Size of the bert encoder layers. Default: 768.
        num_hidden_layers (int): Number of hidden layers in the BertTransformer encoder
                           cell. Default: 12.
        num_attention_heads (int): Number of attention heads in the BertTransformer
                             encoder cell. Default: 12.
        intermediate_size (int): Size of intermediate layer in the BertTransformer
                           encoder cell. Default: 3072.
        hidden_act (str): Activation function used in the BertTransformer encoder
                    cell. Default: "gelu".
        hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1.
        attention_probs_dropout_prob (float): The dropout probability for
                                      BertAttention. Default: 0.1.
        max_position_embeddings (int): Maximum length of sequences used in this
                                 model. Default: 512.
        type_vocab_size (int): Size of token type vocab. Default: 16.
        initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
        use_relative_positions (bool): Specifies whether to use relative positions. Default: False.
        dtype (:class:`mindspore.dtype`): Data type of the input. Default: mstype.float32.
        compute_type (:class:`mindspore.dtype`): Compute type in BertTransformer. Default: mstype.float32.
    """

    def __init__(self,
                 seq_length=128,
                 vocab_size=32000,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=16,
                 initializer_range=0.02,
                 use_relative_positions=False,
                 dtype=mstype.float32,
                 compute_type=mstype.float32):
        # input geometry and vocabularies
        self.seq_length, self.max_position_embeddings = seq_length, max_position_embeddings
        self.vocab_size, self.type_vocab_size = vocab_size, type_vocab_size
        # encoder shape
        self.hidden_size, self.intermediate_size = hidden_size, intermediate_size
        self.num_hidden_layers, self.num_attention_heads = num_hidden_layers, num_attention_heads
        self.hidden_act = hidden_act
        self.use_relative_positions = use_relative_positions
        # regularisation and initialisation
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        # numeric types
        self.dtype, self.compute_type = dtype, compute_type
class EmbeddingLookup(nn.Cell):
    """
    Word embedding table with a fixed vocabulary and embedding size.

    Args:
        vocab_size (int): Size of the dictionary of embeddings.
        embedding_size (int): The size of each embedding vector.
        embedding_shape (list): [batch_size, seq_length, embedding_size], the shape of
                         each embedding vector.
        use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
        initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.

    Returns:
        Tuple of (looked-up embeddings reshaped to `embedding_shape`, the embedding table itself).
    """

    def __init__(self,
                 vocab_size,
                 embedding_size,
                 embedding_shape,
                 use_one_hot_embeddings=False,
                 initializer_range=0.02):
        super().__init__()
        self.vocab_size = vocab_size
        self.use_one_hot_embeddings = use_one_hot_embeddings
        table_init = initializer(TruncatedNormal(initializer_range),
                                 [vocab_size, embedding_size])
        self.embedding_table = Parameter(table_init)
        self.expand = P.ExpandDims()
        self.shape_flat = (-1,)
        self.gather = P.GatherV2()
        self.one_hot = P.OneHot()
        self.on_value = Tensor(1.0, mstype.float32)
        self.off_value = Tensor(0.0, mstype.float32)
        self.array_mul = P.MatMul()
        self.reshape = P.Reshape()
        self.shape = tuple(embedding_shape)

    def construct(self, input_ids):
        """embedding lookup"""
        # Flatten the ids to one dimension before selecting rows of the table.
        token_ids = self.reshape(self.expand(input_ids, -1), self.shape_flat)
        if not self.use_one_hot_embeddings:
            rows = self.gather(self.embedding_table, token_ids, 0)
        else:
            # A one-hot selector times the table picks the same rows as the gather branch.
            selector = self.one_hot(token_ids, self.vocab_size, self.on_value, self.off_value)
            rows = self.array_mul(selector, self.embedding_table)
        return self.reshape(rows, self.shape), self.embedding_table
class EmbeddingPostprocessor(nn.Cell):
    """
    Postprocessors apply positional and token type embeddings to word embeddings.

    Args:
        use_relative_positions (bool): Specifies whether to use relative positions. When
                                True, the absolute position embeddings are not added here.
        embedding_size (int): The size of each embedding vector.
        embedding_shape (list): [batch_size, seq_length, embedding_size], the shape of
                         each embedding vector.
        use_token_type (bool): Specifies whether to use token type embeddings. Default: False.
        token_type_vocab_size (int): Size of token type vocab. Default: 16.
        use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
        initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
        max_position_embeddings (int): Maximum length of sequences used in this
                                 model. Default: 512.
        dropout_prob (float): The dropout probability. Default: 0.1.
    """

    def __init__(self,
                 use_relative_positions,
                 embedding_size,
                 embedding_shape,
                 use_token_type=False,
                 token_type_vocab_size=16,
                 use_one_hot_embeddings=False,
                 initializer_range=0.02,
                 max_position_embeddings=512,
                 dropout_prob=0.1):
        super(EmbeddingPostprocessor, self).__init__()
        self.use_token_type = use_token_type
        self.token_type_vocab_size = token_type_vocab_size
        self.use_one_hot_embeddings = use_one_hot_embeddings
        self.max_position_embeddings = max_position_embeddings
        self.embedding_table = Parameter(initializer
                                         (TruncatedNormal(initializer_range),
                                          [token_type_vocab_size,
                                           embedding_size]))
        self.shape_flat = (-1,)
        self.one_hot = P.OneHot()
        self.on_value = Tensor(1.0, mstype.float32)
        # The off value must be exactly 0 so the one-hot matmul selects a single row
        # of the table, giving the same result as the gather branch. The previous
        # value of 0.1 mixed 10% of every other token-type row into each embedding.
        self.off_value = Tensor(0.0, mstype.float32)
        self.array_mul = P.MatMul()
        self.reshape = P.Reshape()
        self.shape = tuple(embedding_shape)
        self.layernorm = nn.LayerNorm((embedding_size,))
        self.dropout = nn.Dropout(1 - dropout_prob)
        self.gather = P.GatherV2()
        self.use_relative_positions = use_relative_positions
        self.slice = P.StridedSlice()
        self.full_position_embeddings = Parameter(initializer
                                                  (TruncatedNormal(initializer_range),
                                                   [max_position_embeddings,
                                                    embedding_size]))

    def construct(self, token_type_ids, word_embeddings):
        """
        Add token type and absolute position embeddings, then apply layer norm and dropout.

        Args:
            token_type_ids: Token type ids; flattened before the lookup.
            word_embeddings: Word embeddings, expected to match `embedding_shape`.

        Returns:
            The post-processed embeddings.
        """
        output = word_embeddings
        if self.use_token_type:
            flat_ids = self.reshape(token_type_ids, self.shape_flat)
            if self.use_one_hot_embeddings:
                one_hot_ids = self.one_hot(flat_ids,
                                           self.token_type_vocab_size, self.on_value, self.off_value)
                token_type_embeddings = self.array_mul(one_hot_ids,
                                                       self.embedding_table)
            else:
                token_type_embeddings = self.gather(self.embedding_table, flat_ids, 0)
            token_type_embeddings = self.reshape(token_type_embeddings, self.shape)
            output += token_type_embeddings
        if not self.use_relative_positions:
            _, seq, width = self.shape
            # Take the first `seq` rows of the position table and broadcast them over the batch.
            position_embeddings = self.slice(self.full_position_embeddings, (0, 0), (seq, width), (1, 1))
            position_embeddings = self.reshape(position_embeddings, (1, seq, width))
            output += position_embeddings
        output = self.layernorm(output)
        output = self.dropout(output)
        return output
class QuantDense(nn.Cell):
    """
    Fully connected layer with fake quantization on both its weight and its input.

    Args:
        in_channels (int): The number of channels in the input space.
        out_channels (int): The number of channels in the output space.
        weight_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable weight_init parameter. The dtype
            is same as input x. The values of str refer to the function `initializer`. Default: 'normal'.
        bias_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable bias_init parameter. The dtype is
            same as input x. The values of str refer to the function `initializer`. Default: 'zeros'.
        has_bias (bool): Specifies whether the layer uses a bias vector. Default: True.
        activation (Function): activate function applied to the output of the fully connected layer, e.g. 'ReLU'.
            Default: None.
        quant_config (QuantConfig): Quant config whose `weight` factory builds the weight quantizer.
            Default: quant_config_default.
        quant_dtype (QuantDtype): The quantization data type. Default: QuantDtype.INT8.
        activation_init (float): Initial absolute bound of the input fake-quant range, i.e. the
            observer starts at [-activation_init, activation_init]. Default: 2.5.
    """
    def __init__(self,
                 in_channels,
                 out_channels,
                 weight_init='normal',
                 bias_init='zeros',
                 has_bias=True,
                 activation=None,
                 quant_config=quant_config_default,
                 quant_dtype=QuantDtype.INT8,
                 activation_init=2.5):
        super().__init__()
        self.in_channels = Validator.check_positive_int(in_channels)
        self.out_channels = Validator.check_positive_int(out_channels)
        self.has_bias = Validator.check_bool(has_bias)

        # A user-supplied weight tensor must already be [out_channels, in_channels].
        if isinstance(weight_init, Tensor):
            weight_ok = weight_init.dim() == 2 and weight_init.shape[0] == out_channels \
                and weight_init.shape[1] == in_channels
            if not weight_ok:
                raise ValueError("weight_init shape error")
        weight_shape = [out_channels, in_channels]
        self.weight = Parameter(initializer(weight_init, weight_shape), name="weight")

        if self.has_bias:
            if isinstance(bias_init, Tensor):
                bias_ok = bias_init.dim() == 1 and bias_init.shape[0] == out_channels
                if not bias_ok:
                    raise ValueError("bias_init shape error")
            self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias")

        self.matmul = P.MatMul(transpose_b=True)
        self.bias_add = P.BiasAdd()

        # A string names a built-in activation; anything else is used as given.
        if isinstance(activation, str):
            self.activation = nn.get_activation(activation)
        else:
            self.activation = activation
        if activation is not None and not isinstance(self.activation, (nn.Cell, Primitive)):
            raise TypeError("The activation must be str or Cell or Primitive,"" but got {}.".format(activation))
        self.activation_flag = self.activation is not None

        # Per-output-channel weight quantizer, starting from a fixed [-6, 6] range.
        self.fake_quant_weight = quant_config.weight(min_init=-6,
                                                     max_init=6,
                                                     ema=False,
                                                     channel_axis=0,
                                                     num_channels=out_channels,
                                                     quant_dtype=quant_dtype)
        # Input quantizer with an EMA-tracked range.
        self.quant_input = FakeQuantWithMinMax(min_init=-activation_init,
                                               max_init=activation_init,
                                               ema=True)

    def construct(self, x):
        """Fake-quantize weight and input, then apply matmul, optional bias and activation."""
        quantized_weight = self.fake_quant_weight(self.weight)
        quantized_input = self.quant_input(x)
        result = self.matmul(quantized_input, quantized_weight)
        if self.has_bias:
            result = self.bias_add(result, self.bias)
        if not self.activation_flag:
            return result
        return self.activation(result)

    def extend_repr(self):
        """A pretty print for Dense layer."""
        parts = ['in_channels={}, out_channels={}, weight={}, has_bias={}'.format(
            self.in_channels, self.out_channels, self.weight, self.has_bias)]
        if self.has_bias:
            parts.append('bias={}'.format(self.bias))
        if self.activation_flag:
            parts.append('activation={}'.format(self.activation))
        return ', '.join(parts)
class BertOutput(nn.Cell):
    """
    Project the hidden status with a quantized dense layer, add the residual input and normalize.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
        dropout_prob (float): The dropout probability. Default: 0.1.
        compute_type (:class:`mindspore.dtype`): Compute type in BertTransformer. Default: mstype.float32.
        activation_init (float): Initial fake-quant bound forwarded to the QuantDense layer. Default: 2.5.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 initializer_range=0.02,
                 dropout_prob=0.1,
                 compute_type=mstype.float32,
                 activation_init=2.5):
        super().__init__()
        projection = QuantDense(in_channels, out_channels,
                                weight_init=TruncatedNormal(initializer_range),
                                activation_init=activation_init)
        self.dense = projection.to_float(compute_type)
        keep_prob = 1 - dropout_prob
        self.dropout = nn.Dropout(keep_prob)
        self.add = P.TensorAdd()
        self.is_gpu = context.get_context('device_target') == "GPU"
        # On GPU the layer norm runs in float32 and the result is cast back in construct.
        norm_dtype = mstype.float32 if self.is_gpu else compute_type
        self.layernorm = nn.LayerNorm((out_channels,)).to_float(norm_dtype)
        if self.is_gpu:
            self.compute_type = compute_type
        self.cast = P.Cast()

    def construct(self, hidden_status, input_tensor):
        """bert output"""
        projected = self.dropout(self.dense(hidden_status))
        normed = self.layernorm(self.add(input_tensor, projected))
        if not self.is_gpu:
            return normed
        return self.cast(normed, self.compute_type)
class RelaPosMatrixGenerator(nn.Cell):
    """
    Build the [length, length] matrix of clipped relative positions, shifted to be non-negative.

    Args:
        length (int): Length of one dim for the matrix to be generated.
        max_relative_position (int): Max value of relative position.
    """

    def __init__(self, length, max_relative_position):
        super().__init__()
        self._length = length
        self._max_relative_position = Tensor(max_relative_position, dtype=mstype.int32)
        self._min_relative_position = Tensor(-max_relative_position, dtype=mstype.int32)
        self.range_length = -length + 1
        self.tile = P.Tile()
        self.range_mat = P.Reshape()
        self.sub = P.Sub()
        self.expanddims = P.ExpandDims()
        self.cast = P.Cast()

    def construct(self):
        """position matrix generator"""
        positions = self.cast(F.tuple_to_array(F.make_range(self._length)), mstype.int32)
        positions_as_column = self.range_mat(positions, (self._length, -1))
        # Entry [i][j] of `col_index` is j; entry [i][j] of `row_index` is i.
        col_index = self.range_mat(self.tile(positions, (self._length,)),
                                   (self._length, self._length))
        row_index = self.range_mat(self.tile(positions_as_column, (1, self._length)),
                                   (self._length, self._length))
        offsets = self.sub(col_index, row_index)
        clipped = C.clip_by_value(offsets,
                                  self._min_relative_position,
                                  self._max_relative_position)
        # Shift values to be >=0. Each integer still uniquely identifies a
        # relative position difference.
        return clipped + self._max_relative_position
class RelaPosEmbeddingsGenerator(nn.Cell):
    """
    Generates tensor of size [length, length, depth] holding one embedding per relative position.

    Args:
        length (int): Length of one dim for the matrix to be generated.
        depth (int): Size of each attention head.
        max_relative_position (int): Maximum value of relative position.
        initializer_range (float): Initialization value of TruncatedNormal.
        use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
    """

    def __init__(self,
                 length,
                 depth,
                 max_relative_position,
                 initializer_range,
                 use_one_hot_embeddings=False):
        super().__init__()
        self.depth = depth
        # Clipped offsets lie in [-max, max], so the table needs 2 * max + 1 rows.
        self.vocab_size = max_relative_position * 2 + 1
        self.use_one_hot_embeddings = use_one_hot_embeddings
        table_init = initializer(TruncatedNormal(initializer_range),
                                 [self.vocab_size, self.depth])
        self.embeddings_table = Parameter(table_init)
        self.relative_positions_matrix = RelaPosMatrixGenerator(
            length=length,
            max_relative_position=max_relative_position)
        self.reshape = P.Reshape()
        self.one_hot = P.OneHot()
        self.on_value = Tensor(1.0, mstype.float32)
        self.off_value = Tensor(0.0, mstype.float32)
        self.shape = P.Shape()
        self.gather = P.GatherV2()  # index_select
        self.matmul = P.BatchMatMul()

    def construct(self):
        """position embedding generation"""
        position_ids = self.relative_positions_matrix()
        # Generate embedding for each relative position of dimension depth.
        if not self.use_one_hot_embeddings:
            return self.gather(self.embeddings_table, position_ids, 0)
        selector = self.one_hot(self.reshape(position_ids, (-1,)),
                                self.vocab_size, self.on_value, self.off_value)
        flat_embeddings = self.matmul(selector, self.embeddings_table)
        target_shape = self.shape(position_ids) + (self.depth,)
        return self.reshape(flat_embeddings, target_shape)
class SaturateCast(nn.Cell):
    """
    Clamp the input to the finite range of the destination type, then cast to it, so the
    cast cannot overflow or underflow.

    The range comes from `np.finfo`, so `dst_type` must map to a floating-point NumPy type.

    Args:
        src_type (:class:`mindspore.dtype`): The type of the elements of the input tensor. Default: mstype.float32.
        dst_type (:class:`mindspore.dtype`): The type of the elements of the output tensor. Default: mstype.float32.
    """

    def __init__(self, src_type=mstype.float32, dst_type=mstype.float32):
        super().__init__()
        limits = np.finfo(mstype.dtype_to_nptype(dst_type))
        lowest, highest = limits.min, limits.max
        # The bounds are stored in the source type because they are compared against the input.
        self.tensor_min_type = Tensor([lowest], dtype=src_type)
        self.tensor_max_type = Tensor([highest], dtype=src_type)
        self.min_op = P.Minimum()
        self.max_op = P.Maximum()
        self.cast = P.Cast()
        self.dst_type = dst_type

    def construct(self, x):
        """saturate cast"""
        clamped = self.min_op(self.max_op(x, self.tensor_min_type), self.tensor_max_type)
        return self.cast(clamped, self.dst_type)
class BertAttention(nn.Cell):
    """
    Apply multi-headed attention from "from_tensor" to "to_tensor", with fake quantization
    applied to the activations that feed the projections and the two batched matmuls.

    Args:
        from_tensor_width (int): Size of last dim of from_tensor.
        to_tensor_width (int): Size of last dim of to_tensor.
        from_seq_length (int): Length of from_tensor sequence.
        to_seq_length (int): Length of to_tensor sequence.
        num_attention_heads (int): Number of attention heads. Default: 1.
        size_per_head (int): Size of each attention head. Default: 512.
        query_act (str): Activation function for the query transform. Default: None.
        key_act (str): Activation function for the key transform. Default: None.
        value_act (str): Activation function for the value transform. Default: None.
        has_attention_mask (bool): Specifies whether to use attention mask. Default: False.
        attention_probs_dropout_prob (float): The dropout probability for
                                      BertAttention. Default: 0.0.
        use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
        initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
        do_return_2d_tensor (bool): True for return 2d tensor. False for return 3d
                             tensor. Default: False.
        use_relative_positions (bool): Specifies whether to use relative positions. Default: False.
        compute_type (:class:`mindspore.dtype`): Compute type in BertAttention. Default: mstype.float32.
        activation_init (float): Initial absolute bound of every activation fake-quant range
                          created here ([-activation_init, activation_init]). Default: 2.5.

    Returns:
        Tuple of (context_layer reshaped to `shape_return`, attention_scores), where
        attention_scores are the scaled and, if enabled, masked scores before softmax.
    """

    def __init__(self,
                 from_tensor_width,
                 to_tensor_width,
                 from_seq_length,
                 to_seq_length,
                 num_attention_heads=1,
                 size_per_head=512,
                 query_act=None,
                 key_act=None,
                 value_act=None,
                 has_attention_mask=False,
                 attention_probs_dropout_prob=0.0,
                 use_one_hot_embeddings=False,
                 initializer_range=0.02,
                 do_return_2d_tensor=False,
                 use_relative_positions=False,
                 compute_type=mstype.float32,
                 activation_init=2.5):
        super(BertAttention, self).__init__()
        self.from_seq_length = from_seq_length
        self.to_seq_length = to_seq_length
        self.num_attention_heads = num_attention_heads
        self.size_per_head = size_per_head
        self.has_attention_mask = has_attention_mask
        self.use_relative_positions = use_relative_positions
        # Scaling factor 1 / sqrt(size_per_head) applied to the raw attention scores.
        self.scores_mul = Tensor([1.0 / math.sqrt(float(self.size_per_head))], dtype=compute_type)
        self.reshape = P.Reshape()
        self.shape_from_2d = (-1, from_tensor_width)
        self.shape_to_2d = (-1, to_tensor_width)
        weight = TruncatedNormal(initializer_range)
        units = num_attention_heads * size_per_head
        # Quantization is always on in this class; the flag is not exposed as an argument.
        self.do_quant = True
        if self.do_quant:
            # One EMA-tracked observer per quantized activation.
            self.quant_from_tensor_2d = FakeQuantWithMinMax(min_init=-activation_init,
                                                            max_init=activation_init,
                                                            ema=True)
            self.quant_to_tensor_2d = FakeQuantWithMinMax(min_init=-activation_init,
                                                          max_init=activation_init,
                                                          ema=True)
            self.quant_query_layer = FakeQuantWithMinMax(min_init=-activation_init,
                                                         max_init=activation_init,
                                                         ema=True)
            self.quant_key_layer = FakeQuantWithMinMax(min_init=-activation_init,
                                                       max_init=activation_init,
                                                       ema=True)
            self.quant_attention_probs = FakeQuantWithMinMax(min_init=-activation_init,
                                                             max_init=activation_init,
                                                             ema=True)
            self.quant_value_layer = FakeQuantWithMinMax(min_init=-activation_init,
                                                         max_init=activation_init,
                                                         ema=True)
        # NOTE(review): the query/key/value projections are plain nn.Dense layers, so their
        # weights are not fake-quantized here; only their inputs are. Confirm this is intended.
        self.query_layer = nn.Dense(from_tensor_width,
                                    units,
                                    activation=query_act,
                                    weight_init=weight).to_float(compute_type)
        self.key_layer = nn.Dense(to_tensor_width,
                                  units,
                                  activation=key_act,
                                  weight_init=weight).to_float(compute_type)
        self.value_layer = nn.Dense(to_tensor_width,
                                    units,
                                    activation=value_act,
                                    weight_init=weight).to_float(compute_type)
        self.shape_from = (-1, from_seq_length, num_attention_heads, size_per_head)
        self.shape_to = (-1, to_seq_length, num_attention_heads, size_per_head)
        self.matmul_trans_b = P.BatchMatMul(transpose_b=True)
        self.multiply = P.Mul()
        self.transpose = P.Transpose()
        # Permutations used by construct: swap the sequence and head axes, and move the
        # sequence axis to/from the front for the relative-position matmuls.
        self.trans_shape = (0, 2, 1, 3)
        self.trans_shape_relative = (2, 0, 1, 3)
        self.trans_shape_position = (1, 2, 0, 3)
        # Additive penalty multiplied into (1 - mask) for masked-out positions.
        self.multiply_data = Tensor([-10000.0,], dtype=compute_type)
        self.matmul = P.BatchMatMul()
        self.softmax = nn.Softmax()
        self.dropout = nn.Dropout(1 - attention_probs_dropout_prob)
        if self.has_attention_mask:
            self.expand_dims = P.ExpandDims()
            self.sub = P.Sub()
            self.add = P.TensorAdd()
            self.cast = P.Cast()
            self.get_dtype = P.DType()
        if do_return_2d_tensor:
            self.shape_return = (-1, num_attention_heads * size_per_head)
        else:
            self.shape_return = (-1, from_seq_length, num_attention_heads * size_per_head)
        self.cast_compute_type = SaturateCast(dst_type=compute_type)
        if self.use_relative_positions:
            # The maximum relative distance is fixed at 16 here.
            self._generate_relative_positions_embeddings = \
                RelaPosEmbeddingsGenerator(length=to_seq_length,
                                           depth=size_per_head,
                                           max_relative_position=16,
                                           initializer_range=initializer_range,
                                           use_one_hot_embeddings=use_one_hot_embeddings)

    def construct(self, from_tensor, to_tensor, attention_mask):
        """
        Compute quantization-aware multi-head attention.

        In the shape comments below F is from_seq_length, T is to_seq_length, N is
        num_attention_heads and H is size_per_head; B is presumably the batch size.
        """
        # reshape 2d/3d input tensors to 2d
        from_tensor_2d = self.reshape(from_tensor, self.shape_from_2d)
        to_tensor_2d = self.reshape(to_tensor, self.shape_to_2d)
        if self.do_quant:
            # Fake-quantize the inputs of the query/key/value projections.
            from_tensor_2d = self.quant_from_tensor_2d(from_tensor_2d)
            to_tensor_2d = self.quant_to_tensor_2d(to_tensor_2d)
        query_out = self.query_layer(from_tensor_2d)
        key_out = self.key_layer(to_tensor_2d)
        value_out = self.value_layer(to_tensor_2d)
        # Split into heads and move the head axis in front of the sequence axis.
        query_layer = self.reshape(query_out, self.shape_from)
        query_layer = self.transpose(query_layer, self.trans_shape)
        key_layer = self.reshape(key_out, self.shape_to)
        key_layer = self.transpose(key_layer, self.trans_shape)
        if self.do_quant:
            # Fake-quantize both operands of the query x key matmul.
            query_layer = self.quant_query_layer(query_layer)
            key_layer = self.quant_key_layer(key_layer)
        attention_scores = self.matmul_trans_b(query_layer, key_layer)
        # use_relative_position, supplementary logic
        if self.use_relative_positions:
            # relations_keys is [F|T, F|T, H]
            relations_keys = self._generate_relative_positions_embeddings()
            relations_keys = self.cast_compute_type(relations_keys)
            # query_layer_t is [F, B, N, H]
            query_layer_t = self.transpose(query_layer, self.trans_shape_relative)
            # query_layer_r is [F, B * N, H]
            query_layer_r = self.reshape(query_layer_t,
                                         (self.from_seq_length,
                                          -1,
                                          self.size_per_head))
            # key_position_scores is [F, B * N, F|T]
            key_position_scores = self.matmul_trans_b(query_layer_r,
                                                      relations_keys)
            # key_position_scores_r is [F, B, N, F|T]
            key_position_scores_r = self.reshape(key_position_scores,
                                                 (self.from_seq_length,
                                                  -1,
                                                  self.num_attention_heads,
                                                  self.from_seq_length))
            # key_position_scores_r_t is [B, N, F, F|T]
            key_position_scores_r_t = self.transpose(key_position_scores_r,
                                                     self.trans_shape_position)
            attention_scores = attention_scores + key_position_scores_r_t
        attention_scores = self.multiply(self.scores_mul, attention_scores)
        if self.has_attention_mask:
            # Turn the mask into an additive penalty: (1 - mask) * -10000 is added to the
            # scores, so positions where the mask is 0 are pushed far down before softmax.
            attention_mask = self.expand_dims(attention_mask, 1)
            multiply_out = self.sub(self.cast(F.tuple_to_array((1.0,)), self.get_dtype(attention_scores)),
                                    self.cast(attention_mask, self.get_dtype(attention_scores)))
            adder = self.multiply(multiply_out, self.multiply_data)
            attention_scores = self.add(adder, attention_scores)
        attention_probs = self.softmax(attention_scores)
        attention_probs = self.dropout(attention_probs)
        value_layer = self.reshape(value_out, self.shape_to)
        value_layer = self.transpose(value_layer, self.trans_shape)
        if self.do_quant:
            # Fake-quantize both operands of the probabilities x value matmul.
            attention_probs = self.quant_attention_probs(attention_probs)
            value_layer = self.quant_value_layer(value_layer)
        context_layer = self.matmul(attention_probs, value_layer)
        # use_relative_position, supplementary logic
        if self.use_relative_positions:
            # relations_values is [F|T, F|T, H]
            relations_values = self._generate_relative_positions_embeddings()
            relations_values = self.cast_compute_type(relations_values)
            # attention_probs_t is [F, B, N, T]
            attention_probs_t = self.transpose(attention_probs, self.trans_shape_relative)
            # attention_probs_r is [F, B * N, T]
            attention_probs_r = self.reshape(
                attention_probs_t,
                (self.from_seq_length,
                 -1,
                 self.to_seq_length))
            # value_position_scores is [F, B * N, H]
            value_position_scores = self.matmul(attention_probs_r,
                                                relations_values)
            # value_position_scores_r is [F, B, N, H]
            value_position_scores_r = self.reshape(value_position_scores,
                                                   (self.from_seq_length,
                                                    -1,
                                                    self.num_attention_heads,
                                                    self.size_per_head))
            # value_position_scores_r_t is [B, N, F, H]
            value_position_scores_r_t = self.transpose(value_position_scores_r,
                                                       self.trans_shape_position)
            context_layer = context_layer + value_position_scores_r_t
        # Merge the heads back and reshape to the 2d or 3d layout chosen in __init__.
        context_layer = self.transpose(context_layer, self.trans_shape)
        context_layer = self.reshape(context_layer, self.shape_return)
        return context_layer, attention_scores
class BertSelfAttention(nn.Cell):
    """
    Self-attention followed by the BertOutput residual block.

    Args:
        seq_length (int): Length of input sequence.
        hidden_size (int): Size of the bert encoder layers.
        num_attention_heads (int): Number of attention heads. Default: 12.
        attention_probs_dropout_prob (float): The dropout probability for
                                      BertAttention. Default: 0.1.
        use_one_hot_embeddings (bool): Specifies whether to use one_hot encoding form. Default: False.
        initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
        hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1.
        use_relative_positions (bool): Specifies whether to use relative positions. Default: False.
        compute_type (:class:`mindspore.dtype`): Compute type in BertSelfAttention. Default: mstype.float32.
        activation_init (float): Initial fake-quant bound forwarded to BertAttention and
                          BertOutput. Default: 2.5.

    Raises:
        ValueError: If hidden_size is not a multiple of num_attention_heads.
    """

    def __init__(self,
                 seq_length,
                 hidden_size,
                 num_attention_heads=12,
                 attention_probs_dropout_prob=0.1,
                 use_one_hot_embeddings=False,
                 initializer_range=0.02,
                 hidden_dropout_prob=0.1,
                 use_relative_positions=False,
                 compute_type=mstype.float32,
                 activation_init=2.5):
        super().__init__()
        if hidden_size % num_attention_heads:
            raise ValueError("The hidden size (%d) is not a multiple of the number "
                             "of attention heads (%d)" % (hidden_size, num_attention_heads))
        self.size_per_head = int(hidden_size / num_attention_heads)
        # Settings passed unchanged to both sub-cells.
        shared = dict(initializer_range=initializer_range,
                      compute_type=compute_type,
                      activation_init=activation_init)
        # Self-attention: "from" and "to" use the same width and length.
        self.attention = BertAttention(
            from_tensor_width=hidden_size,
            to_tensor_width=hidden_size,
            from_seq_length=seq_length,
            to_seq_length=seq_length,
            num_attention_heads=num_attention_heads,
            size_per_head=self.size_per_head,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            use_one_hot_embeddings=use_one_hot_embeddings,
            use_relative_positions=use_relative_positions,
            has_attention_mask=True,
            do_return_2d_tensor=True,
            **shared)
        self.output = BertOutput(in_channels=hidden_size,
                                 out_channels=hidden_size,
                                 dropout_prob=hidden_dropout_prob,
                                 **shared)
        self.reshape = P.Reshape()
        self.shape = (-1, hidden_size)

    def construct(self, input_tensor, attention_mask):
        """bert self attention"""
        flat_input = self.reshape(input_tensor, self.shape)
        context, scores = self.attention(flat_input, flat_input, attention_mask)
        return self.output(context, flat_input), scores
class BertEncoderCell(nn.Cell):
    """
    Encoder cells used in BertTransformer.

    Args:
        hidden_size (int): Size of the bert encoder layers. Default: 768.
        seq_length (int): Length of input sequence. Default: 512.
        num_attention_heads (int): Number of attention heads. Default: 12.
        intermediate_size (int): Size of intermediate layer. Default: 3072.
        attention_probs_dropout_prob (float): The dropout probability for
                                      BertAttention. Default: 0.02.
        use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
        initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
        hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1.
        use_relative_positions (bool): Specifies whether to use relative positions. Default: False.
        hidden_act (str): Activation function. Default: "gelu".
        compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: mstype.float32.
        activation_init (float): Initial absolute bound of the activation fake-quant ranges,
                          forwarded to the attention, intermediate and output sub-layers.
                          Default: 2.5.

    Returns:
        Tuple of (encoder output, attention scores of the self-attention block).
    """

    def __init__(self,
                 hidden_size=768,
                 seq_length=512,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 attention_probs_dropout_prob=0.02,
                 use_one_hot_embeddings=False,
                 initializer_range=0.02,
                 hidden_dropout_prob=0.1,
                 use_relative_positions=False,
                 hidden_act="gelu",
                 compute_type=mstype.float32,
                 activation_init=2.5):
        super(BertEncoderCell, self).__init__()
        self.attention = BertSelfAttention(
            hidden_size=hidden_size,
            seq_length=seq_length,
            num_attention_heads=num_attention_heads,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            use_one_hot_embeddings=use_one_hot_embeddings,
            initializer_range=initializer_range,
            hidden_dropout_prob=hidden_dropout_prob,
            use_relative_positions=use_relative_positions,
            compute_type=compute_type,
            activation_init=activation_init)
        # activation_init is forwarded here as well; it was previously omitted, so this
        # layer always used QuantDense's own default regardless of the caller's value.
        self.intermediate = QuantDense(in_channels=hidden_size, out_channels=intermediate_size,
                                       activation=hidden_act,
                                       weight_init=TruncatedNormal(initializer_range),
                                       activation_init=activation_init).to_float(compute_type)
        self.output = BertOutput(in_channels=intermediate_size,
                                 out_channels=hidden_size,
                                 initializer_range=initializer_range,
                                 dropout_prob=hidden_dropout_prob,
                                 compute_type=compute_type,
                                 activation_init=activation_init)

    def construct(self, hidden_states, attention_mask):
        """bert encoder cell"""
        # self-attention
        attention_output, attention_scores = self.attention(hidden_states, attention_mask)
        # feed construct
        intermediate_output = self.intermediate(attention_output)
        # add and normalize
        output = self.output(intermediate_output, attention_output)
        return output, attention_scores
class BertTransformer(nn.Cell):
    """
    Multi-layer bert transformer.

    Args:
        hidden_size (int): Size of the encoder layers.
        seq_length (int): Length of input sequence.
        num_hidden_layers (int): Number of hidden layers in encoder cells.
        num_attention_heads (int): Number of attention heads in encoder cells. Default: 12.
        intermediate_size (int): Size of intermediate layer in encoder cells. Default: 3072.
        attention_probs_dropout_prob (float): The dropout probability for
                                      BertAttention. Default: 0.1.
        use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
        initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02.
        hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1.
        use_relative_positions (bool): Specifies whether to use relative positions. Default: False.
        hidden_act (str): Activation function used in the encoder cells. Default: "gelu".
        compute_type (:class:`mindspore.dtype`): Compute type in BertTransformer. Default: mstype.float32.
        return_all_encoders (bool): Specifies whether to return all encoders. Default: False.
        activation_init (float): Initial absolute bound of the activation fake-quant ranges,
                          forwarded to every encoder cell. Default: 2.5.

    Returns:
        Tuple of (all_encoder_layers, all_encoder_outputs, all_encoder_atts). With
        return_all_encoders=False, all_encoder_layers holds only the last layer's output,
        all_encoder_outputs holds only the flattened input, and all_encoder_atts is empty.
    """

    def __init__(self,
                 hidden_size,
                 seq_length,
                 num_hidden_layers,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 attention_probs_dropout_prob=0.1,
                 use_one_hot_embeddings=False,
                 initializer_range=0.02,
                 hidden_dropout_prob=0.1,
                 use_relative_positions=False,
                 hidden_act="gelu",
                 compute_type=mstype.float32,
                 return_all_encoders=False,
                 activation_init=2.5):
        super(BertTransformer, self).__init__()
        self.return_all_encoders = return_all_encoders
        # activation_init is now forwarded to each cell; it was previously accepted but
        # ignored, so a non-default value had no effect.
        layers = [
            BertEncoderCell(hidden_size=hidden_size,
                            seq_length=seq_length,
                            num_attention_heads=num_attention_heads,
                            intermediate_size=intermediate_size,
                            attention_probs_dropout_prob=attention_probs_dropout_prob,
                            use_one_hot_embeddings=use_one_hot_embeddings,
                            initializer_range=initializer_range,
                            hidden_dropout_prob=hidden_dropout_prob,
                            use_relative_positions=use_relative_positions,
                            hidden_act=hidden_act,
                            compute_type=compute_type,
                            activation_init=activation_init)
            for _ in range(num_hidden_layers)
        ]
        self.layers = nn.CellList(layers)
        self.reshape = P.Reshape()
        self.shape = (-1, hidden_size)
        self.out_shape = (-1, seq_length, hidden_size)

    def construct(self, input_tensor, attention_mask):
        """bert transformer"""
        prev_output = self.reshape(input_tensor, self.shape)
        all_encoder_layers = ()
        all_encoder_atts = ()
        all_encoder_outputs = ()
        # The flattened input is always the first entry of all_encoder_outputs.
        all_encoder_outputs += (prev_output,)
        for layer_module in self.layers:
            layer_output, encoder_att = layer_module(prev_output, attention_mask)
            prev_output = layer_output
            if self.return_all_encoders:
                # Keep both the 2d output and its reshaped (-1, seq_length, hidden_size) form.
                all_encoder_outputs += (layer_output,)
                layer_output = self.reshape(layer_output, self.out_shape)
                all_encoder_layers += (layer_output,)
                all_encoder_atts += (encoder_att,)
        if not self.return_all_encoders:
            prev_output = self.reshape(prev_output, self.out_shape)
            all_encoder_layers += (prev_output,)
        return all_encoder_layers, all_encoder_outputs, all_encoder_atts
input_tensor, attention_mask): + """Feed input_tensor through the encoder layers; returns (all_encoder_layers, all_encoder_outputs, all_encoder_atts).""" + prev_output = self.reshape(input_tensor, self.shape) + all_encoder_layers = () + all_encoder_atts = () + all_encoder_outputs = () + all_encoder_outputs += (prev_output,) + for layer_module in self.layers: + layer_output, encoder_att = layer_module(prev_output, attention_mask) + prev_output = layer_output + if self.return_all_encoders: + all_encoder_outputs += (layer_output,) + layer_output = self.reshape(layer_output, self.out_shape) + all_encoder_layers += (layer_output,) + all_encoder_atts += (encoder_att,) + if not self.return_all_encoders: + prev_output = self.reshape(prev_output, self.out_shape) + all_encoder_layers += (prev_output,) + return all_encoder_layers, all_encoder_outputs, all_encoder_atts + + +class CreateAttentionMaskFromInputMask(nn.Cell): + """ + Create attention mask according to input mask: reshape it to (-1, 1, seq_length) and cast it to float32. + + Args: + config (Class): Configuration for BertModel; only config.seq_length is read here. + """ + + def __init__(self, config): + super(CreateAttentionMaskFromInputMask, self).__init__() + self.input_mask = None + self.cast = P.Cast() + self.reshape = P.Reshape() + self.shape = (-1, 1, config.seq_length) + + def construct(self, input_mask): + attention_mask = self.cast(self.reshape(input_mask, self.shape), mstype.float32) + return attention_mask + + +class BertModel(nn.Cell): + """ + Bidirectional Encoder Representations from Transformers. + + Args: + config (Class): Configuration for BertModel. + is_training (bool): True for training mode. False for eval mode. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
+ + Note: + The config is deep-copied, and when is_training is False both dropout probabilities on the copy are set to 0.0. + """ + + def __init__(self, + config, + is_training, + use_one_hot_embeddings=False): + super(BertModel, self).__init__() + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + self.seq_length = config.seq_length + self.hidden_size = config.hidden_size + self.num_hidden_layers = config.num_hidden_layers + self.embedding_size = config.hidden_size + self.token_type_ids = None + self.last_idx = self.num_hidden_layers - 1 + output_embedding_shape = [-1, self.seq_length, + self.embedding_size] + self.bert_embedding_lookup = EmbeddingLookup( + vocab_size=config.vocab_size, + embedding_size=self.embedding_size, + embedding_shape=output_embedding_shape, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=config.initializer_range) + self.bert_embedding_postprocessor = EmbeddingPostprocessor( + use_relative_positions=config.use_relative_positions, + embedding_size=self.embedding_size, + embedding_shape=output_embedding_shape, + use_token_type=True, + token_type_vocab_size=config.type_vocab_size, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=0.02, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + self.bert_encoder = BertTransformer( + hidden_size=self.hidden_size, + seq_length=self.seq_length, + num_attention_heads=config.num_attention_heads, + num_hidden_layers=self.num_hidden_layers, + intermediate_size=config.intermediate_size, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=config.initializer_range, + hidden_dropout_prob=config.hidden_dropout_prob, + use_relative_positions=config.use_relative_positions, + hidden_act=config.hidden_act, + compute_type=config.compute_type, + return_all_encoders=True) + self.cast = P.Cast() + self.dtype = config.dtype + self.cast_compute_type =
SaturateCast(dst_type=config.compute_type) + self.slice = P.StridedSlice() + self.squeeze_1 = P.Squeeze(axis=1) + self.dense = nn.Dense(self.hidden_size, self.hidden_size, + activation="tanh", + weight_init=TruncatedNormal(config.initializer_range)).to_float(config.compute_type) + self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config) + + def construct(self, input_ids, token_type_ids, input_mask): + """Embed the inputs, run the encoder and pool the first token; returns (sequence_output, pooled_output, embedding_tables, encoder_outputs, attention_outputs).""" + # embedding + word_embeddings, embedding_tables = self.bert_embedding_lookup(input_ids) + embedding_output = self.bert_embedding_postprocessor(token_type_ids, word_embeddings) + # attention mask, reshaped to (-1, 1, seq_length) and cast to float32 + attention_mask = self._create_attention_mask_from_input_mask(input_mask) + # bert encoder + encoder_output, encoder_layers, layer_atts = self.bert_encoder(self.cast_compute_type(embedding_output), + attention_mask) + sequence_output = self.cast(encoder_output[self.last_idx], self.dtype) + # pooler + batch_size = P.Shape()(input_ids)[0] + sequence_slice = self.slice(sequence_output, + (0, 0, 0), + (batch_size, 1, self.hidden_size), + (1, 1, 1)) + first_token = self.squeeze_1(sequence_slice) + pooled_output = self.dense(first_token) + pooled_output = self.cast(pooled_output, self.dtype) + encoder_outputs = () + for output in encoder_layers: + encoder_outputs += (self.cast(output, self.dtype),) + attention_outputs = () + for output in layer_atts: + attention_outputs += (self.cast(output, self.dtype),) + return sequence_output, pooled_output, embedding_tables, encoder_outputs, attention_outputs + + +class TinyBertModel(nn.Cell): + """ + Bidirectional Encoder Representations from Transformers. + + Args: + config (Class): Configuration for BertModel. + is_training (bool): True for training mode. False for eval mode. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False.
+ + Note: + The config is deep-copied, and when is_training is False both dropout probabilities on the copy are set to 0.0. + """ + + def __init__(self, + config, + is_training, + use_one_hot_embeddings=False): + super(TinyBertModel, self).__init__() + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + self.seq_length = config.seq_length + self.hidden_size = config.hidden_size + self.num_hidden_layers = config.num_hidden_layers + self.embedding_size = config.hidden_size + self.token_type_ids = None + self.last_idx = self.num_hidden_layers - 1 + output_embedding_shape = [-1, self.seq_length, + self.embedding_size] + self.tinybert_embedding_lookup = EmbeddingLookup( + vocab_size=config.vocab_size, + embedding_size=self.embedding_size, + embedding_shape=output_embedding_shape, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=config.initializer_range) + self.tinybert_embedding_postprocessor = EmbeddingPostprocessor( + use_relative_positions=config.use_relative_positions, + embedding_size=self.embedding_size, + embedding_shape=output_embedding_shape, + use_token_type=True, + token_type_vocab_size=config.type_vocab_size, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=0.02, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + self.tinybert_encoder = BertTransformer( + hidden_size=self.hidden_size, + seq_length=self.seq_length, + num_attention_heads=config.num_attention_heads, + num_hidden_layers=self.num_hidden_layers, + intermediate_size=config.intermediate_size, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=config.initializer_range, + hidden_dropout_prob=config.hidden_dropout_prob, + use_relative_positions=config.use_relative_positions, + hidden_act=config.hidden_act, + compute_type=config.compute_type, + return_all_encoders=True) + self.cast = P.Cast() + self.dtype = config.dtype + self.cast_compute_type =
SaturateCast(dst_type=config.compute_type) + self.slice = P.StridedSlice() + self.squeeze_1 = P.Squeeze(axis=1) + self.dense = nn.Dense(self.hidden_size, self.hidden_size, + activation="tanh", + weight_init=TruncatedNormal(config.initializer_range)).to_float(config.compute_type) + self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config) + + def construct(self, input_ids, token_type_ids, input_mask): + """Embed the inputs, run the encoder and pool the first token; returns (sequence_output, pooled_output, embedding_tables, encoder_outputs, attention_outputs).""" + # embedding + word_embeddings, embedding_tables = self.tinybert_embedding_lookup(input_ids) + embedding_output = self.tinybert_embedding_postprocessor(token_type_ids, + word_embeddings) + # attention mask, reshaped to (-1, 1, seq_length) and cast to float32 + attention_mask = self._create_attention_mask_from_input_mask(input_mask) + # bert encoder + encoder_output, encoder_layers, layer_atts = self.tinybert_encoder(self.cast_compute_type(embedding_output), + attention_mask) + sequence_output = self.cast(encoder_output[self.last_idx], self.dtype) + # pooler + batch_size = P.Shape()(input_ids)[0] + sequence_slice = self.slice(sequence_output, + (0, 0, 0), + (batch_size, 1, self.hidden_size), + (1, 1, 1)) + first_token = self.squeeze_1(sequence_slice) + pooled_output = self.dense(first_token) + pooled_output = self.cast(pooled_output, self.dtype) + encoder_outputs = () + for output in encoder_layers: + encoder_outputs += (self.cast(output, self.dtype),) + attention_outputs = () + for output in layer_atts: + attention_outputs += (self.cast(output, self.dtype),) + return sequence_output, pooled_output, embedding_tables, encoder_outputs, attention_outputs + + +class BertModelCLS(nn.Cell): + """ + This class is responsible for classification task evaluation, + i.e. XNLI(num_labels=3), LCQMC(num_labels=2), Chnsenti(num_labels=2). + The returned output represents the final logits as the results of log_softmax is proportional to that of softmax.
+ """ + + def __init__(self, config, is_training, num_labels=2, dropout_prob=0.0, + use_one_hot_embeddings=False, phase_type="student"): + super(BertModelCLS, self).__init__() + self.bert = BertModel(config, is_training, use_one_hot_embeddings) + self.cast = P.Cast() + self.weight_init = TruncatedNormal(config.initializer_range) + self.log_softmax = P.LogSoftmax(axis=-1) + self.dtype = config.dtype + self.num_labels = num_labels + self.phase_type = phase_type + if self.phase_type == "teacher": + self.dense = nn.Dense(config.hidden_size, self.num_labels, weight_init=self.weight_init, + has_bias=True).to_float(config.compute_type) + else: + self.dense_1 = nn.Dense(config.hidden_size, self.num_labels, weight_init=self.weight_init, + has_bias=True).to_float(config.compute_type) + self.dropout = nn.ReLU() + + def construct(self, input_ids, token_type_id, input_mask): + """Pass the pooled BertModel output through self.dropout (an nn.ReLU cell) and the dense head; both return paths yield (seq_output, att_output, logits, log_probs).""" + _, pooled_output, _, seq_output, att_output = self.bert(input_ids, token_type_id, input_mask) + cls = self.cast(pooled_output, self.dtype) + cls = self.dropout(cls) + if self.phase_type == "teacher": + logits = self.dense(cls) + else: + logits = self.dense_1(cls) + logits = self.cast(logits, self.dtype) + log_probs = self.log_softmax(logits) + if self._phase == 'train' or self.phase_type == "teacher": + return seq_output, att_output, logits, log_probs + return seq_output, att_output, logits, log_probs diff --git a/model_zoo/official/nlp/q8bert/src/utils.py b/model_zoo/official/nlp/q8bert/src/utils.py new file mode 100644 index 00000000000..2e489b0431c --- /dev/null +++ b/model_zoo/official/nlp/q8bert/src/utils.py @@ -0,0 +1,307 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =========================================================================== + +"""q8bert utils""" + +import os +import logging +import numpy as np +from mindspore import Tensor +from mindspore.common import dtype as mstype +from mindspore.train.callback import Callback +from mindspore.train.serialization import save_checkpoint +from mindspore.ops import operations as P +from mindspore.nn.learning_rate_schedule import LearningRateSchedule, PolynomialDecayLR, WarmUpLR +import mindspore.nn as nn + +logger = logging.getLogger(__name__) + +try: + from scipy.stats import pearsonr, spearmanr + from sklearn.metrics import matthews_corrcoef, f1_score + _has_sklearn = True +except (AttributeError, ImportError) as e: + logger.warning("To use data.metrics please install scikit-learn.
See https://scikit-learn.org/stable/index.html") + _has_sklearn = False + +def is_sklearn_available(): + return _has_sklearn + +if _has_sklearn: + + def simple_accuracy(preds, labels): + return (preds == labels).mean() + + + def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = f1_score(y_true=labels, y_pred=preds) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + + def pearson_and_spearman(preds, labels): + pearson_corr = pearsonr(preds, labels)[0] + spearman_corr = spearmanr(preds, labels)[0] + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } + + + def glue_compute_metrics(task_name, preds, labels): + """Return the metric dict for the lower-case GLUE task_name computed from preds and labels; raises KeyError for an unknown task.""" + assert len(preds) == len(labels) + if task_name == "cola": + result = {"mcc": matthews_corrcoef(labels, preds)} + elif task_name == "sst-2": + result = {"acc": simple_accuracy(preds, labels)} + elif task_name == "mrpc": + result = acc_and_f1(preds, labels) + elif task_name == "sts-b": + result = pearson_and_spearman(preds, labels) + elif task_name == "qqp": + result = acc_and_f1(preds, labels) + elif task_name == "mnli": + result = {"acc": simple_accuracy(preds, labels)} + elif task_name == "mnli-mm": + result = {"acc": simple_accuracy(preds, labels)} + elif task_name == "qnli": + result = {"acc": simple_accuracy(preds, labels)} + elif task_name == "rte": + result = {"acc": simple_accuracy(preds, labels)} + elif task_name == "wnli": + result = {"acc": simple_accuracy(preds, labels)} + else: + raise KeyError(task_name) + return result + +glue_output_modes = { + "cola": "classification", + "mnli": "classification", + "mnli-mm": "classification", + "mrpc": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification", +} + +class ModelSaveCkpt(Callback): + """ + Saves checkpoint.
+ Writes a checkpoint every save_ckpt_step steps and removes the oldest one once more than max_ckpt_num have been written. + Args: + network (Network): The train network for training. + save_ckpt_step (int): Number of steps between two saved checkpoints. + max_ckpt_num (int): The max number of checkpoint files to keep. + output_dir (str): Directory the checkpoint files are written to. + """ + def __init__(self, network, save_ckpt_step, max_ckpt_num, output_dir): + super(ModelSaveCkpt, self).__init__() + self.count = 0 + self.network = network + self.save_ckpt_step = save_ckpt_step + self.max_ckpt_num = max_ckpt_num + self.output_dir = output_dir + + def step_end(self, run_context): + """step end and save ckpt""" + cb_params = run_context.original_args() + if cb_params.cur_step_num % self.save_ckpt_step == 0: + saved_ckpt_num = cb_params.cur_step_num / self.save_ckpt_step + if saved_ckpt_num > self.max_ckpt_num: + oldest_ckpt_index = saved_ckpt_num - self.max_ckpt_num + path = os.path.join(self.output_dir, "tiny_bert_{}_{}.ckpt".format(int(oldest_ckpt_index), + self.save_ckpt_step)) + if os.path.exists(path): + os.remove(path) + save_checkpoint(self.network, os.path.join(self.output_dir, + "tiny_bert_{}_{}.ckpt".format(int(saved_ckpt_num), + self.save_ckpt_step))) + +def make_directory(path: str): + """Resolve path with os.path.realpath, create the directory if it is missing and return the resolved path; raises TypeError for a None, non-str or blank path.""" + if path is None or not isinstance(path, str) or path.strip() == "": + logger.error("The path(%r) is invalid type.", path) + raise TypeError("Input path is invalid type") + + # convert the relative paths + path = os.path.realpath(path) + logger.debug("The abs path is %r", path) + + # check the path is exist and write permissions?
+ if os.path.exists(path): + real_path = path + else: + # All exceptions need to be caught because create directory maybe have some limit(permissions) + logger.debug("The directory(%s) doesn't exist, will create it", path) + try: + os.makedirs(path, exist_ok=True) + real_path = path + except PermissionError as e: + logger.error("No write permission on the directory(%r), error = %r", path, e) + raise TypeError("No write permission on the directory.") from e + return real_path + +class LossCallBack(Callback): + """ + Monitor the loss in training. + Prints the epoch number, step number and network outputs at step end. + Note: + if per_print_times is 0 do not print loss. + Args: + per_print_times (int): Print loss every per_print_times steps. Default: 1. + Raises: + ValueError: If per_print_times is not an int or is negative. + """ + def __init__(self, per_print_times=1): + super(LossCallBack, self).__init__() + if not isinstance(per_print_times, int) or per_print_times < 0: + raise ValueError("print_step must be int and >= 0") + self._per_print_times = per_print_times + + def step_end(self, run_context): + """Print the loss when the current step is a multiple of per_print_times; never print when it is 0.""" + cb_params = run_context.original_args() + if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0: + print("epoch: {}, step: {}, loss are {}".format(cb_params.cur_epoch_num, + cb_params.cur_step_num, + str(cb_params.net_outputs))) + +class EvalCallBack(Callback): + """Evaluation callback: every logging_step steps, runs network over dataset and prints the metrics for task_name.""" + def __init__(self, network, dataset, task_name, logging_step): + super(EvalCallBack, self).__init__() + self.network = network + self.global_acc = 0.0 + self.dataset = dataset + self.task_name = task_name + self.logging_step = logging_step + + def step_end(self, run_context): + """Evaluate on the dataset when the current step is a multiple of logging_step and print the result.""" + cb_params = run_context.original_args() + label_nums = 2 + # lower must be called: comparing the bound method itself to a string is always False + if self.task_name.lower() == 'mnli': + label_nums = 3 + if cb_params.cur_step_num % self.logging_step == 0: + columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"] + preds = None + out_label_ids = None + for data in self.dataset.create_dict_iterator(num_epochs=1): + input_data = [] + for i in columns_list: + input_data.append(data[i]) +
input_ids, input_mask, token_type_id, label_ids = input_data + self.network.set_train(False) + _, _, logits, _ = self.network(input_ids, token_type_id, input_mask) + if preds is None: + preds = logits.asnumpy() + preds = np.reshape(preds, [-1, label_nums]) + out_label_ids = label_ids.asnumpy() + else: + preds = np.concatenate((preds, np.reshape(logits.asnumpy(), [-1, label_nums])), axis=0) + out_label_ids = np.append(out_label_ids, label_ids.asnumpy()) + if glue_output_modes[self.task_name.lower()] == "classification": + preds = np.argmax(preds, axis=1) + elif glue_output_modes[self.task_name.lower()] == "regression": + preds = np.reshape(preds, [-1]) + result = glue_compute_metrics(self.task_name.lower(), preds, out_label_ids) + print("The current result is {}".format(result)) + + +def LoadNewestCkpt(load_finetune_checkpoint_dir, steps_per_epoch, epoch_num, prefix): + """ + Find the matching .ckpt file with the largest index in load_finetune_checkpoint_dir and return its path. + + Raises: + FileNotFoundError: If no checkpoint file matching prefix is found. + """ + files = os.listdir(load_finetune_checkpoint_dir) + pre_len = len(prefix) + max_num = 0 + # Start from None so that "nothing matched" is reported explicitly instead of as UnboundLocalError + load_finetune_checkpoint_path = None + for filename in files: + name_ext = os.path.splitext(filename) + if name_ext[-1] != ".ckpt": + continue + if filename.find(prefix) == 0 and not filename[pre_len].isalpha(): + index = filename[pre_len:].find("-") + if index == 0 and max_num == 0: + load_finetune_checkpoint_path = os.path.join(load_finetune_checkpoint_dir, filename) + elif index not in (0, -1): + name_split = name_ext[-2].split('_') + if (steps_per_epoch != int(name_split[len(name_split)-1])) \ + or (epoch_num != int(filename[pre_len + index + 1:pre_len + index + 2])): + continue + num = filename[pre_len + 1:pre_len + index] + if int(num) > max_num: + max_num = int(num) + load_finetune_checkpoint_path = os.path.join(load_finetune_checkpoint_dir, filename) + if load_finetune_checkpoint_path is None: + raise FileNotFoundError("No checkpoint with prefix {!r} found in {!r}".format(prefix, + load_finetune_checkpoint_dir)) + return load_finetune_checkpoint_path + +class BertLearningRate(LearningRateSchedule): + """ + Warmup-decay learning rate for Bert network.
+ Uses the WarmUpLR value while warmup_steps > global_step (only when warmup_steps > 0) and the PolynomialDecayLR value otherwise. + """ + def __init__(self, learning_rate, end_learning_rate, warmup_steps, decay_steps, power): + super(BertLearningRate, self).__init__() + self.warmup_flag = False + if warmup_steps > 0: + self.warmup_flag = True + self.warmup_lr = WarmUpLR(learning_rate, warmup_steps) + self.decay_lr = PolynomialDecayLR(learning_rate, end_learning_rate, decay_steps, power) + self.warmup_steps = Tensor(np.array([warmup_steps]).astype(np.float32)) + + self.greater = P.Greater() + self.one = Tensor(np.array([1.0]).astype(np.float32)) + self.cast = P.Cast() + + def construct(self, global_step): + decay_lr = self.decay_lr(global_step) + if self.warmup_flag: + is_warmup = self.cast(self.greater(self.warmup_steps, global_step), mstype.float32) + warmup_lr = self.warmup_lr(global_step) + lr = (self.one - is_warmup) * decay_lr + is_warmup * warmup_lr + else: + lr = decay_lr + return lr + +class CrossEntropyCalculation(nn.Cell): + """ + Cross Entropy loss: when is_training, the mean over examples of -sum(one_hot(label_ids) * logits); otherwise logits * 1.0. + """ + def __init__(self, is_training=True): + super(CrossEntropyCalculation, self).__init__() + self.onehot = P.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + self.off_value = Tensor(0.0, mstype.float32) + self.reduce_sum = P.ReduceSum() + self.reduce_mean = P.ReduceMean() + self.reshape = P.Reshape() + self.last_idx = (-1,) + self.neg = P.Neg() + self.cast = P.Cast() + self.is_training = is_training + + def construct(self, logits, label_ids, num_labels): + if self.is_training: + label_ids = self.reshape(label_ids, self.last_idx) + one_hot_labels = self.onehot(label_ids, num_labels, self.on_value, self.off_value) + per_example_loss = self.neg(self.reduce_sum(one_hot_labels * logits, self.last_idx)) + loss = self.reduce_mean(per_example_loss, self.last_idx) + return_value = self.cast(loss, mstype.float32) + else: + return_value = logits * 1.0 + return return_value