!18779 modify model_zoo mass for cloud

Merge pull request !18779 from lilei/modify_model_zoo_mass
This commit is contained in:
i-robot 2021-06-24 01:52:37 +00:00 committed by Gitee
commit eb30e1e691
22 changed files with 681 additions and 341 deletions

View File

@ -14,19 +14,13 @@
# ============================================================================
"""post process for 310 inference"""
import os
import argparse
import numpy as np
from PIL import Image
from src.config import config
from src.model_utils.config import config
from src.eval_utils import metrics
batch_size = 1
parser = argparse.ArgumentParser(description="ssd acc calculation")
parser.add_argument("--result_path", type=str, required=True, help="result files path.")
parser.add_argument("--img_path", type=str, required=True, help="image file path.")
parser.add_argument("--drop", action="store_true", help="drop iscrowd images or not.")
args = parser.parse_args()
def get_imgSize(file_name):
img = Image.open(file_name)
@ -35,7 +29,7 @@ def get_imgSize(file_name):
def get_result(result_path, img_id_file_path):
anno_json = os.path.join(config.coco_root, config.instances_set.format(config.val_data_type))
if args.drop:
if config.drop:
from pycocotools.coco import COCO
train_cls = config.classes
train_cls_dict = {}
@ -53,7 +47,7 @@ def get_result(result_path, img_id_file_path):
for file in files:
img_ids_name = file.split('.')[0]
img_id = int(np.squeeze(img_ids_name))
if args.drop:
if config.drop:
anno_ids = coco.getAnnIds(imgIds=img_id, iscrowd=None)
anno = coco.loadAnns(anno_ids)
annos = []
@ -86,4 +80,4 @@ def get_result(result_path, img_id_file_path):
print(f" mAP:{mAP}")
if __name__ == '__main__':
get_result(args.result_path, args.img_path)
get_result(config.result_path, config.img_path)

View File

@ -68,6 +68,10 @@ filter_weight: False
freeze_layer: None
save_best_ckpt: True
result_path: ""
img_path: ""
drop: False
# It is better to use absolute paths for `mindrecord_dir` and `coco_root`.
feature_extractor_base_param: ""
checkpoint_filter_list: ['multi_loc_layers', 'multi_cls_layers']

View File

@ -71,6 +71,10 @@ filter_weight: False
freeze_layer: None
save_best_ckpt: True
result_path: ""
img_path: ""
drop: False
# It is better to use absolute paths for `mindrecord_dir` and `coco_root`.
feature_extractor_base_param: "/ckpt/mobilenet_v1.ckpt"
checkpoint_filter_list: ['network.multi_box.cls_layers.0.weight', 'network.multi_box.cls_layers.0.bias',

View File

@ -72,6 +72,10 @@ filter_weight: False
freeze_layer: None
save_best_ckpt: True
result_path: ""
img_path: ""
drop: False
# It is better to use absolute paths for `mindrecord_dir` and `coco_root`.
feature_extractor_base_param: "/ckpt/resnet50.ckpt"
checkpoint_filter_list: ['network.multi_box.cls_layers.0.weight', 'network.multi_box.cls_layers.0.bias',

View File

@ -67,6 +67,10 @@ filter_weight: False
freeze_layer: None
save_best_ckpt: True
result_path: ""
img_path: ""
drop: False
# It is better to use absolute paths for `mindrecord_dir` and `coco_root`.
feature_extractor_base_param: ""
checkpoint_filter_list: ['multi_loc_layers', 'multi_cls_layers']

View File

@ -81,6 +81,37 @@ First of all, through a sequence to sequence framework, mass only predicts the b
Second, by predicting consecutive tokens on the decoder side, the decoder builds better language modeling capability than predicting only discrete tokens.
Third, by further masking the decoder input tokens that are not masked in the encoder, the decoder is encouraged to extract more useful information from the encoder side rather than relying on the rich information in the preceding tokens.
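As a schematic illustration of this masking scheme (token names and the `[M]` symbol here are illustrative only, not the tokenizer's actual vocabulary):

```python
# Schematic MASS masking: the encoder sees the sentence with a contiguous
# fragment masked, and the decoder predicts only that fragment, with its
# remaining inputs masked in turn.
src = ["x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8"]
span = (3, 6)  # mask the fragment x4..x6
enc_in = ["[M]" if span[0] <= i < span[1] else t for i, t in enumerate(src)]
dec_out = src[span[0]:span[1]]          # target: the masked fragment
dec_in = ["[M]"] + dec_out[:-1]         # shifted fragment fed to the decoder
print(enc_in)   # ['x1', 'x2', 'x3', '[M]', '[M]', '[M]', 'x7', 'x8']
print(dec_out)  # ['x4', 'x5', 'x6']
```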
If you want to run on ModelArts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/); you can start training and evaluation as follows:
```python
# run distributed training on modelarts example
# (1) First, perform a or b.
# a. Set "enable_modelarts=True" on yaml file.
# Set other parameters on yaml file you need.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add other parameters on the website UI interface.
# (2) Set the task: "task=train".
# (3) Set the code directory to "/path/mass" on the website UI interface.
# (4) Set the startup file to "train.py" on the website UI interface.
# (5) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (6) Create your job.
# run evaluation on modelarts example
# (1) Copy or upload your trained model to S3 bucket.
# (2) Perform a or b.
# a. Set "enable_modelarts=True" on yaml file.
# Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on yaml file.
# Set "checkpoint_url=/The path of checkpoint in S3/" on yaml file.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
# Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
# (3) Set the task: "task=infer", and set the vocab path.
# (4) Set the code directory to "/path/mass" on the website UI interface.
# (5) Set the startup file to "eval.py" on the website UI interface.
# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (7) Create your job.
```
# Script description
The MASS script and code structure are as follows:
@ -90,8 +121,12 @@ The MASS script and code structure are as follows:
├── README.md // Introduction of MASS model.
├── config
│ ├──config.py // Configuration instance definition.
│ ├──config.json // Configuration file.
├── src
│ ├── model_utils
│ ├──config.py // parameter configuration
│ ├──device_adapter.py // device adapter
│ ├──local_adapter.py // local adapter
│ ├──moxing_adapter.py // moxing adapter
│ ├──dataset
│ ├──bi_data_loader.py // Dataset loader for fine-tuning or inference.
│ ├──mono_data_loader.py // Dataset loader for pre-training.
@ -134,6 +169,7 @@ The MASS script and code structure are as follows:
├── requirements.txt // Requirements of third party package.
├── train.py // Train API entry.
├── eval.py // Infer API entry.
├── default_config.yaml // parameter configuration
├── tokenize_corpus.py // Corpus tokenization.
├── apply_bpe_encoding.py // Applying bpe encoding.
├── weights_average.py // Average multiple model checkpoints into NPZ format.
@ -332,9 +368,8 @@ python cornell_dialog.py --src_folder /{path}/cornell_dialog \
## Configuration
The `default_config.yaml` file in the project root is the template configuration file.
Almost all of the required options and arguments can be assigned conveniently, including the training platform, dataset and model configurations, and optimizer arguments. Optional features such as loss scale and checkpoint saving are also available by setting the corresponding options.
For more detailed information about the attributes, refer to the file `config/config.py`.
For more detailed information about the attributes, refer to the file `default_config.yaml`.
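Since `src/model_utils/config.py` exposes every top-level key of `default_config.yaml` as a command-line flag, any value can also be overridden at launch time; a minimal sketch:

```python
# Importing the shared config object parses default_config.yaml merged with
# CLI overrides, e.g. `python train.py --batch_size 64 --lr 2e-4`.
from src.model_utils.config import config

print(config.batch_size)    # 192 by default, 64 with the override above
print(config.lr_scheduler)  # "poly" by default
```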
## Training & Evaluation process
@ -357,9 +392,7 @@ The usage of `run_ascend.sh` is shown below:
```text
Usage: run_ascend.sh [-h, --help] [-t, --task <CHAR>] [-n, --device_num <N>]
[-i, --device_id <N>] [-j, --hccl_json <FILE>]
[-c, --config <FILE>] [-o, --output <FILE>]
[-v, --vocab <FILE>]
[-i, --device_id <N>] [-o, --output <FILE>] [-v, --vocab <FILE>]
options:
-h, --help show usage
@ -367,7 +400,6 @@ options:
-n, --device_num device number used for training: N, default is 1.
-i, --device_id device id used for training with single device: N, 0<=N<=7, default is 0.
-j, --hccl_json rank table file used for training with multiple devices: FILE.
-c, --config configuration file as shown in the path 'mass/config': FILE.
-o, --output assign output file of inference: FILE.
-v, --vocab set the vocabulary.
-m, --metric set the metric.
@ -379,15 +411,13 @@ The usage of `run_gpu.sh` is shown below:
```text
Usage: run_gpu.sh [-h, --help] [-t, --task <CHAR>] [-n, --device_num <N>]
[-i, --device_id <N>] [-c, --config <FILE>]
[-o, --output <FILE>] [-v, --vocab <FILE>]
[-i, --device_id <N>] [-o, --output <FILE>] [-v, --vocab <FILE>]
options:
-h, --help show usage
-t, --task select task: CHAR, 't' for train and 'i' for inference.
-n, --device_num device number used for training: N, default is 1.
-i, --device_id device id used for training with single device: N, 0<=N<=7, default is 0.
-c, --config configuration file as shown in the path 'mass/config': FILE.
-o, --output assign output file of inference: FILE.
-v, --vocab set the vocabulary.
-m, --metric set the metric.
@ -397,7 +427,7 @@ The following command shows an example of training with 2 devices.
Ascend:
```ascend
sh run_ascend.sh --task t --device_num 2 --hccl_json /{path}/rank_table.json --config /{path}/config.json
sh run_ascend.sh --task t --device_num 2 --hccl_json /{path}/rank_table.json
```
Note: discontinuous device IDs are not supported in `run_ascend.sh` at present; the device IDs in `rank_table.json` must start from 0.
@ -405,20 +435,20 @@ ps. Discontinuous device id is not supported in `run_ascend.sh` at present, devi
GPU:
```gpu
sh run_gpu.sh --task t --device_num 2 --config /{path}/config.json
sh run_gpu.sh --task t --device_num 2
```
If using a single device, the commands are as follows:
Ascend:
```ascend
sh run_ascend.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json
sh run_ascend.sh --task t --device_num 1 --device_id 0
```
GPU:
```gpu
sh run_gpu.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json
sh run_gpu.sh --task t --device_num 1 --device_id 0
```
## Weights average
@ -427,16 +457,14 @@ sh run_gpu.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json
python weights_average.py --input_files your_checkpoint_list --output_file model.npz
```
The `input_files` argument is a list of your checkpoint files. To use model.npz as the weights, add its path in config.json at "existed_ckpt".
The `input_files` argument is a list of your checkpoint files. To use model.npz as the weights, add its path in default_config.yaml at "checkpoint_file_path".
```yaml
checkpoint_file_path: "/xxx/xxx/model.npz"
save_ckpt_steps: 1000
```
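For reference, a minimal sketch of what checkpoint averaging amounts to (the shipped `weights_average.py` is the actual implementation; this sketch assumes plain MindSpore `.ckpt` inputs and a hypothetical `average_checkpoints` helper):

```python
import numpy as np
from mindspore.train.serialization import load_checkpoint

def average_checkpoints(ckpt_files, output_file="model.npz"):
    """Average same-named parameters across checkpoints and save as NPZ."""
    sums = {}
    for ckpt in ckpt_files:
        for name, param in load_checkpoint(ckpt).items():
            sums[name] = sums.get(name, 0) + param.data.asnumpy()
    np.savez(output_file, **{k: v / len(ckpt_files) for k, v in sums.items()})
```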
@ -448,14 +476,13 @@ Two learning rate schedulers are provided in our model:
1. [Polynomial decay scheduler](https://towardsdatascience.com/learning-rate-schedules-and-adaptive-learning-rate-methods-for-deep-learning-2c8f433990d1).
2. [Inverse square root scheduler](https://ece.uwaterloo.ca/~dwharder/aads/Algorithms/Inverse_square_root/).
The LR scheduler can be configured in `config/config.json`.
The LR scheduler can be configured in `default_config.yaml`.
For the polynomial decay scheduler, the config could look like:
```yaml
optimizer: "adam"
lr: 0.0001
lr_scheduler: "poly"
@ -463,24 +490,21 @@ For Polynomial decay scheduler, config could be like:
"decay_steps": 10000,
"warmup_steps": 2000,
"min_lr": 1e-6
},
...
}
```
For the inverse square root scheduler, the config could look like:
```yaml
optimizer: "adam"
lr: 0.0001
lr_scheduler: "isr"
decay_start_step: 12000
warmup_steps: 2000
min_lr: 0.000001
```
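For orientation, a minimal sketch of the two schedules; the exact formulas live in `src/utils/lr_scheduler.py`, so the warmup and decay forms below are the common textbook definitions, not copied from the source:

```python
def poly_decay_lr(step, lr=1e-4, min_lr=1e-6, decay_steps=10000,
                  warmup_steps=2000, power=0.5):
    # Linear warmup, then polynomial decay toward min_lr.
    if step < warmup_steps:
        return lr * step / warmup_steps
    frac = min(1.0, (step - warmup_steps) / decay_steps)
    return (lr - min_lr) * (1 - frac) ** power + min_lr

def inverse_sqrt_lr(step, lr=1e-4, min_lr=1e-6, decay_start_step=12000,
                    warmup_steps=2000):
    # Linear warmup, flat until decay_start_step, then inverse-sqrt decay.
    if step < warmup_steps:
        return lr * step / warmup_steps
    if step < decay_start_step:
        return lr
    return max(min_lr, lr * (decay_start_step / step) ** 0.5)
```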
@ -516,79 +540,79 @@ MASS pre-trains a sequence to sequence model by predicting the masked fragments
Here we provide a practice example to demonstrate the basic usage of MASS for pre-training, fine-tuning a model, and the inference process. The overall process is as follows:
1. Download and process the dataset.
2. Modify `config.json` to configure the network.
2. Modify `default_config.yaml` to configure the network.
3. Run a task for pre-training and fine-tuning.
4. Perform inference and validation.
## Pre-training
To pre-train a model, first configure the options in `config.json`:
To pre-train a model, first configure the options in `default_config.yaml`:
- Assign `pre_train_dataset` under the `dataset_config` node to the dataset path.
- Choose the optimizer ('momentum', 'adam', or 'lamb' are available).
- Assign 'ckpt_prefix' and 'ckpt_path' under `checkpoint_path` to save the model files.
- Set other arguments, including dataset and network configurations.
- If you already have a trained model, assign `existed_ckpt` to the checkpoint file.
- If you already have a trained model, assign `checkpoint_file_path` to the checkpoint file.
If you use the Ascend chip, run the shell script `run_ascend.sh` as follows:
```ascend
sh run_ascend.sh -t t -n 1 -i 1 -c /mass/config/config.json
sh run_ascend.sh -t t -n 1 -i 1
```
You can also run the shell script `run_gpu.sh` on GPU as follows:
```gpu
sh run_gpu.sh -t t -n 1 -i 1 -c /mass/config/config.json
sh run_gpu.sh -t t -n 1 -i 1
```
The log and output files are under the path `./train_mass_*/`, and the model file is under the path assigned in the `config/config.json` file.
The log and output files are under the path `./train_mass_*/`, and the model file is under the path assigned in the `default_config.yaml` file.
## Fine-tuning
To fine-tune a model, first configure the options in `config.json`:
To fine-tune a model, first configure the options in `default_config.yaml`:
- Assign `fine_tune_dataset` under the `dataset_config` node to the dataset path.
- Assign `existed_ckpt` under the `checkpoint_path` node to the existing model file generated by pre-training.
- Assign `checkpoint_file_path` under the `checkpoint_path` node to the existing model file generated by pre-training.
- Choose the optimizer ('momentum', 'adam', or 'lamb' are available).
- Assign `ckpt_prefix` and `ckpt_path` under the `checkpoint_path` node to save the model files.
- Assign `ckpt_prefix` and `checkpoint_file_path` under the `checkpoint_path` node to save the model files.
- Set other arguments, including dataset and network configurations.
If you use the Ascend chip, run the shell script `run_ascend.sh` as follows:
```ascend
sh run_ascend.sh -t t -n 1 -i 1 -c config/config.json
sh run_ascend.sh -t t -n 1 -i 1
```
You can also run the shell script `run_gpu.sh` on GPU as follows:
```gpu
sh run_gpu.sh -t t -n 1 -i 1 -c config/config.json
sh run_gpu.sh -t t -n 1 -i 1
```
The log and output files are under the path `./train_mass_*/`, and the model file is under the path assigned in the `config/config.json` file.
The log and output files are under the path `./train_mass_*/`, and the model file is under the path assigned in the `default_config.yaml` file.
## Inference
If you need to use the trained model to perform inference on multiple hardware platforms, such as GPU, Ascend 910 or Ascend 310, you can refer to this [Link](https://www.mindspore.cn/tutorial/training/en/master/advanced_use/migrate_3rd_scripts.html).
For inference, first configure the options in `config.json`:
For inference, first configure the options in `default_config.yaml`:
- Assign `test_dataset` under the `dataset_config` node to the dataset path.
- Assign `existed_ckpt` under the `checkpoint_path` node to the model file produced by fine-tuning.
- Assign `data_path` in `default_config.yaml` to the dataset path.
- Assign `checkpoint_file_path` in `default_config.yaml` to the model file produced by fine-tuning.
- Choose the optimizer ('momentum', 'adam', or 'lamb' are available).
- Assign `ckpt_prefix` and `ckpt_path` under the `checkpoint_path` node to save the model files.
- Assign `ckpt_prefix` and `checkpoint_file_path` under the `checkpoint_path` node to save the model files.
- Set other arguments, including dataset and network configurations.
If you use the Ascend chip, run the shell script `run_ascend.sh` as follows:
```bash
sh run_ascend.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile}
sh run_ascend.sh -t i -n 1 -i 1 -o {outputfile}
```
You can also run the shell script `run_gpu.sh` on GPU as follows:
```gpu
sh run_gpu.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile}
sh run_gpu.sh -t i -n 1 -i 1 -o {outputfile}
```
## MindIR Inference Process
@ -596,7 +620,7 @@ sh run_gpu.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile}
### [Export MindIR](#contents)
```shell
python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
```
The checkpoint_file_path parameter is required,
@ -675,7 +699,7 @@ The comparisons between MASS and other baseline methods in terms of PPL on Corne
# Description of random situation
The MASS model contains dropout operations. To disable dropout, set the related dropout_rate to 0 in `config/config.json`.
The MASS model contains dropout operations. To disable dropout, set the related dropout_rate to 0 in `default_config.yaml`.
# Others

View File

@ -80,6 +80,40 @@ MASS jointly pre-trains the encoder and decoder to accomplish language generation tasks.
Second, by predicting consecutive tokens on the decoder side, the decoder builds better language modeling capability than predicting only discrete tokens.
Third, by further masking the decoder input tokens that are not masked in the encoder, the decoder is encouraged to extract more useful information from the encoder side rather than relying on the rich information in the preceding tokens.
To run the model on ModelArts, refer to the official ModelArts documentation (https://support.huaweicloud.com/modelarts/); training and inference can be started as follows:
```python
# Example of running distributed training on ModelArts
# (1) Choose either a or b.
#       a. Set "enable_modelarts=True" in the yaml file.
#          Set the other parameters the network needs in the yaml file.
#       b. Add "enable_modelarts=True" on the ModelArts web UI.
#          Set the other parameters the network needs on the ModelArts web UI.
# (2) Set the task: "task=train".
# (3) Set the code directory to "/path/mass" on the ModelArts web UI.
# (4) Set the startup file to "train.py" on the ModelArts web UI.
# (5) Set the "Dataset path", "Output file path", and "Job log path" on the ModelArts web UI.
# (6) Start the training job.
# Example of running inference on ModelArts
# (1) Upload the trained model to the corresponding location in the S3 bucket.
# (2) Choose either a or b.
#       a. Set "enable_modelarts=True" in the yaml file.
#          Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" in the yaml file.
#          Set "checkpoint_url=/The path of checkpoint in S3/" in the yaml file.
#       b. Add "enable_modelarts=True" on the ModelArts web UI.
#          Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the ModelArts web UI.
#          Add "checkpoint_url=/The path of checkpoint in S3/" on the ModelArts web UI.
# (3) Set the task: "task=infer", and set the vocab path.
# (4) Set the code directory to "/path/mass" on the ModelArts web UI.
# (5) Set the startup file to "eval.py" on the ModelArts web UI.
# (6) Set the "Dataset path", "Output file path", and "Job log path" on the ModelArts web UI.
# (7) Start the inference job.
```
## Script description
The MASS script and code structure are as follows:
@ -88,8 +122,13 @@ The MASS script and code structure are as follows:
├── mass
├── README.md // Introduction of MASS model.
├── config
│ ├──config.py // Configuration instance definition.
│ ├──config.json // Configuration file.
│ ├──config.py // Parameter configuration.
├── src
│ ├──model_utils
│ ├──config.py // Parameter configuration.
│ ├──device_adapter.py // Device adapter.
│ ├──local_adapter.py // Local adapter.
│ ├──moxing_adapter.py // Moxing (ModelArts) adapter.
├──src
│ ├──dataset
│ ├──bi_data_loader.py // Dataset loader for fine-tuning or inference.
@ -133,6 +172,7 @@ The MASS script and code structure are as follows:
├── requirements.txt // Third-party package requirements.
├── train.py // Train API entry.
├── eval.py // Infer API entry.
├── default_config.yaml // Parameter configuration.
├── tokenize_corpus.py // Corpus tokenization.
├── apply_bpe_encoding.py // Apply BPE encoding.
├── weights_average.py // Average multiple model checkpoints into NPZ format.
@ -333,7 +373,7 @@ python cornell_dialog.py --src_folder /{path}/cornell_dialog \
The `default_config.yaml` file in the project root is the template configuration file.
It conveniently assigns most options and arguments, including the training platform, dataset and model configurations, and optimizer arguments. Optional features such as loss scale and checkpoint saving are also available by setting the corresponding options.
For more detailed information about the attributes, refer to the file `config/config.py`.
For more detailed information about the attributes, refer to the file `default_config.yaml`.
## Training & Evaluation process
@ -357,8 +397,7 @@ sh run_gpu.sh [--options]
```text
Usage: run_ascend.sh [-h, --help] [-t, --task <CHAR>] [-n, --device_num <N>]
[-i, --device_id <N>] [-j, --hccl_json <FILE>]
[-c, --config <FILE>] [-o, --output <FILE>]
[-v, --vocab <FILE>]
[-o, --output <FILE>] [-v, --vocab <FILE>]
options:
-h, --help show usage
@ -366,7 +405,6 @@ options:
-n, --device_num device number used for training: N, default is 1.
-i, --device_id device id used for training with single device: N, 0<=N<=7, default is 0.
-j, --hccl_json rank table file used for training with multiple devices: FILE.
-c, --config configuration file as shown in the path 'mass/config': FILE.
-o, --output assign output file of inference: FILE.
-v, --vocab set the vocabulary.
-m, --metric set the metric.
@ -378,15 +416,13 @@ options:
```text
Usage: run_gpu.sh [-h, --help] [-t, --task <CHAR>] [-n, --device_num <N>]
[-i, --device_id <N>] [-c, --config <FILE>]
[-o, --output <FILE>] [-v, --vocab <FILE>]
[-i, --device_id <N>] [-o, --output <FILE>] [-v, --vocab <FILE>]
options:
-h, --help show usage
-t, --task select task: CHAR, 't' for train and 'i' for inference.
-n, --device_num device number used for training: N, default is 1.
-i, --device_id device id used for training with single device: N, 0<=N<=7, default is 0.
-c, --config configuration file as shown in the path 'mass/config': FILE.
-o, --output assign output file of inference: FILE.
-v, --vocab set the vocabulary.
-m, --metric set the metric.
@ -396,7 +432,7 @@ options:
Ascend:
```ascend
sh run_ascend.sh --task t --device_num 2 --hccl_json /{path}/rank_table.json --config /{path}/config.json
sh run_ascend.sh --task t --device_num 2 --hccl_json /{path}/rank_table.json
```
Note: `run_ascend.sh` does not support discontinuous device IDs at present; the device IDs in `rank_table.json` must start from 0.
@ -404,20 +440,20 @@ sh run_ascend.sh --task t --device_num 2 --hccl_json /{path}/rank_table.json --c
GPU:
```gpu
sh run_gpu.sh --task t --device_num 2 --config /{path}/config.json
sh run_gpu.sh --task t --device_num 2
```
Run the following command for single-device training:
Ascend:
```ascend
sh run_ascend.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json
sh run_ascend.sh --task t --device_num 1 --device_id 0
```
GPU:
```gpu
sh run_gpu.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json
sh run_gpu.sh --task t --device_num 1 --device_id 0
```
## Weights average
@ -426,16 +462,14 @@ sh run_gpu.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json
python weights_average.py --input_files your_checkpoint_list --output_file model.npz
```
`input_files` is a list of your checkpoint files. To use `model.npz` as the weights, add its path at "existed_ckpt" in `config.json`.
`input_files` is a list of your checkpoint files. To use `model.npz` as the weights, add its path at "checkpoint_file_path" in `default_config.yaml`.
```yaml
checkpoint_file_path: "/xxx/xxx/model.npz"
save_ckpt_steps: 1000
```
@ -451,10 +485,9 @@ python weights_average.py --input_files your_checkpoint_list --output_file model
An example configuration for the polynomial decay scheduler is as follows:
```yaml
optimizer: "adam"
lr: 0.0001
lr_scheduler: "poly"
@ -462,24 +495,21 @@ python weights_average.py --input_files your_checkpoint_list --output_file model
"decay_steps": 10000,
"warmup_steps": 2000,
"min_lr": 1e-6
},
...
}
```
An example configuration for the inverse square root scheduler is as follows:
```yaml
optimizer: "adam"
lr: 0.0001
lr_scheduler: "isr"
decay_start_step: 12000
warmup_steps: 2000
min_lr: 0.000001
```
@ -515,79 +545,78 @@ MASS pre-trains a sequence-to-sequence model by predicting the masked fragments of the input sequence
Here is a practice example demonstrating the basic usage of MASS for pre-training, fine-tuning a model, and the inference process. The steps are as follows:
1. Download and process the dataset.
2. Modify `config.json` to configure the network.
2. Modify `default_config.yaml` to configure the network.
3. Run the pre-training and fine-tuning tasks.
4. Perform inference and validation.
## Pre-training
To pre-train a model, first configure the options in `config.json`:
To pre-train a model, first configure the options in `default_config.yaml`:
- Assign `pre_train_dataset` under the `dataset_config` node to the dataset path.
- Choose the optimizer ('momentum', 'adam', or 'lamb' are available).
- Assign 'ckpt_prefix' and 'ckpt_path' under `checkpoint_path` to save the model files.
- Assign 'ckpt_prefix' and 'checkpoint_file_path' under `checkpoint_path` to save the model files.
- Set other arguments, including dataset and network configurations.
- If you already have a trained model, assign `existed_ckpt` to the checkpoint file.
- If you already have a trained model, assign `checkpoint_file_path` to the checkpoint file.
If using the Ascend chip, run the shell script `run_ascend.sh` as follows:
```ascend
sh run_ascend.sh -t t -n 1 -i 1 -c /mass/config/config.json
sh run_ascend.sh -t t -n 1 -i 1
```
If using a GPU, run the shell script `run_gpu.sh` as follows:
```gpu
sh run_gpu.sh -t t -n 1 -i 1 -c /mass/config/config.json
sh run_gpu.sh -t t -n 1 -i 1
```
The log and output files can be found under `./train_mass_*/`, and the model files under the path assigned in `config/config.json`.
The log and output files can be found under `./train_mass_*/`, and the model files under the path assigned in `default_config.yaml`.
## Fine-tuning
To fine-tune a model, first configure the options in `config.json`:
To fine-tune a model, first configure the options in `default_config.yaml`:
- Assign `fine_tune_dataset` under the `dataset_config` node to the dataset path.
- Assign `existed_ckpt` under the `checkpoint_path` node to the existing model file generated by pre-training.
- Assign `data_path` in `default_config.yaml` to the dataset path.
- Assign `checkpoint_file_path` in `default_config.yaml` to the existing model file generated by pre-training.
- Choose the optimizer ('momentum', 'adam', or 'lamb' are available).
- Specify 'ckpt_prefix' and 'ckpt_path' under `checkpoint_path` to save the model files.
- Specify 'ckpt_prefix' and 'checkpoint_path' in `default_config.yaml` to save the model files.
- Set other arguments, including dataset and network configurations.
If using the Ascend chip, run the shell script `run_ascend.sh` as follows:
```ascend
sh run_ascend.sh -t t -n 1 -i 1 -c config/config.json
sh run_ascend.sh -t t -n 1 -i 1
```
If using a GPU, run the shell script `run_gpu.sh` as follows:
```gpu
sh run_gpu.sh -t t -n 1 -i 1 -c config/config.json
sh run_gpu.sh -t t -n 1 -i 1
```
The log and output files can be found under `./train_mass_*/`, and the model files under the path assigned in `config/config.json`.
The log and output files can be found under `./train_mass_*/`, and the model files under the path assigned in `default_config.yaml`.
## Inference
To use the trained model for inference on multiple hardware platforms, such as GPU, Ascend 910, or Ascend 310, refer to this [link](https://www.mindspore.cn/tutorial/training/zh-CN/master/advanced_use/migrate_3rd_scripts.html).
For inference, first configure the options in `default_config.yaml`:
- Assign `test_dataset` under the `dataset_config` node to the dataset path.
- Assign `data_path` in `default_config.yaml` to the dataset path.
- Choose the optimizer ('momentum', 'adam', or 'lamb' are available).
- Under `checkpoint_path`, specify 'ckpt_prefix' and 'ckpt_path' to save the model files.
- Specify 'ckpt_prefix' and 'ckpt_path' to save the model files.
- Set other arguments, including dataset and network configurations.
If using the Ascend chip, run the shell script `run_ascend.sh` as follows:
```bash
sh run_ascend.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile}
sh run_ascend.sh -t i -n 1 -i 1 -o {outputfile}
```
If using a GPU, run the shell script `run_gpu.sh` as follows:
```gpu
sh run_gpu.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile}
sh run_gpu.sh -t i -n 1 -i 1 -o {outputfile}
```
## MindIR inference
@ -595,10 +624,10 @@ sh run_gpu.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile}
### [Export model](#contents)
```shell
python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
```
The ckpt_file parameter is required,
The checkpoint_file_path parameter is required,
`EXPORT_FORMAT` must be chosen from ["AIR", "MINDIR"].
### Run inference on Ascend 310
@ -674,7 +703,7 @@ bash run_infer_310.sh [MINDIR_PATH] [CONFIG] [VOCAB] [OUTPUT] [NEED_PREPROCESS]
# Description of random situation
The MASS model contains dropout operations. To disable dropout, set the related dropout_rate to 0 in `config/config.json`.
The MASS model contains dropout operations. To disable dropout, set the related dropout_rate to 0 in `default_config.yaml`.
# Others

View File

@ -1,54 +0,0 @@
{
"dataset_config": {
"epochs": 20,
"batch_size": 192,
"pre_train_dataset": "",
"fine_tune_dataset": "",
"test_dataset": "",
"valid_dataset": "",
"dataset_sink_mode": false,
"dataset_sink_step": 100
},
"model_config": {
"random_seed": 100,
"save_graphs": false,
"seq_length": 64,
"vocab_size": 45744,
"hidden_size": 1024,
"num_hidden_layers": 6,
"num_attention_heads": 8,
"intermediate_size": 4096,
"hidden_act": "relu",
"hidden_dropout_prob": 0.2,
"attention_dropout_prob": 0.2,
"max_position_embeddings": 64,
"initializer_range": 0.02,
"label_smoothing": 0.1,
"beam_width": 4,
"length_penalty_weight": 1.0,
"max_decode_length": 64
},
"loss_scale_config": {
"loss_scale_mode": "dynamic",
"init_loss_scale": 65536,
"loss_scale_factor": 2,
"scale_window": 200
},
"learn_rate_config": {
"optimizer": "adam",
"lr": 1e-4,
"lr_scheduler": "poly",
"poly_lr_scheduler_power": 0.5,
"decay_steps": 10000,
"decay_start_step": 12000,
"warmup_steps": 4000,
"min_lr": 1e-6
},
"checkpoint_options": {
"existed_ckpt": "",
"save_ckpt_steps": 2500,
"keep_ckpt_max": 50,
"ckpt_prefix": "ckpt",
"ckpt_path": "checkpoints"
}
}

View File

@ -0,0 +1,94 @@
# Builtin configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: ''
# ==============================================================================
# Training options
task: "train"
epochs: 20
batch_size: 192
dtype: float32 # only float16 and float32 are supported
compute_type: float16 # only float16 and float32 are supported
pre_train_dataset: ""
fine_tune_dataset: ""
test_dataset: ""
valid_dataset: ""
dataset_sink_mode: false
dataset_sink_step: 100
random_seed: 100
save_graphs: false
seq_length: 64
vocab_size: 45744
hidden_size: 1024
num_hidden_layers: 6
num_attention_heads: 8
intermediate_size: 4096
hidden_act: "relu"
hidden_dropout_prob: 0.2
attention_dropout_prob: 0.2
max_position_embeddings: 64
initializer_range: 0.02
label_smoothing: 0.1
beam_width: 4
length_penalty_weight: 1.0
max_decode_length: 64
loss_scale_mode: "dynamic"
init_loss_scale: 65536
loss_scale_factor: 2
scale_window: 200
optimizer: "adam"
lr: 0.0001
lr_scheduler: "poly"
poly_lr_scheduler_power: 0.5
decay_steps: 10000
decay_start_step: 12000
warmup_steps: 4000
min_lr: 0.000001
existed_ckpt: ""
save_ckpt_steps: 2500
keep_ckpt_max: 50
ckpt_prefix: "ckpt"
metric: "rouge"
vocab: ""
output: ""
# Export options
device_id: 0
ckpt_file: ""
file_name: "mass"
file_format: "AIR"
vocab_file: ""
result_path: "./preprocess_Result/"
source_id_folder: ""
target_id_folder: ""
result_dir: "./result_Files"
---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
keep_checkpoint_max: "keep the last keep_checkpoint_max checkpoint"
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
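Note the `---` separator above: it splits the file into two YAML documents, the configuration values and the per-key help text, both consumed by `src/model_utils/config.py` via `yaml.load_all`. A quick way to inspect them:

```python
import yaml

# Reads both documents from the merged default_config.yaml shown above.
with open("default_config.yaml") as fin:
    cfg, helper = list(yaml.load_all(fin, Loader=yaml.FullLoader))

print(cfg["batch_size"])     # 192
print(helper["batch_size"])  # Batch size for training and evaluation
```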

View File

@ -14,60 +14,52 @@
# ============================================================================
"""Evaluation api."""
import os
import argparse
import pickle
from mindspore.common import dtype as mstype
from mindspore import context
from config import TransformerConfig
from mindspore.common import dtype as mstype
from src.transformer import infer, infer_ppl
from src.utils import Dictionary
from src.utils import get_score
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
parser = argparse.ArgumentParser(description='Evaluation MASS.')
parser.add_argument("--config", type=str, required=True,
help="Model config json file path.")
parser.add_argument("--vocab", type=str, required=True,
help="Vocabulary to use.")
parser.add_argument("--output", type=str, required=True,
help="Result file path.")
parser.add_argument("--metric", type=str, default='rouge',
help='Set eval method.')
parser.add_argument("--platform", type=str, required=True,
help="model working platform.")
def get_config(config):
config = TransformerConfig.from_json_file(config)
config.compute_type = mstype.float16
config.dtype = mstype.float32
return config
if __name__ == '__main__':
args, _ = parser.parse_known_args()
vocab = Dictionary.load_from_persisted_dict(args.vocab)
_config = get_config(args.config)
def get_config():
if config.compute_type == "float16":
config.compute_type = mstype.float16
if config.compute_type == "float32":
config.compute_type = mstype.float32
if config.dtype == "float16":
config.dtype = mstype.float16
if config.dtype == "float32":
config.dtype = mstype.float32
@moxing_wrapper()
def eval_net():
"""eval_net"""
vocab = Dictionary.load_from_persisted_dict(config.vocab)
get_config()
device_id = os.getenv('DEVICE_ID', None)
if device_id is None:
device_id = 0
device_id = int(device_id)
context.set_context(
mode=context.GRAPH_MODE,
device_target=args.platform,
device_target=config.device_target,
reserve_class_name_in_scope=False,
device_id=device_id)
if args.metric == 'rouge':
result = infer(_config)
if config.metric == 'rouge':
result = infer(config)
else:
result = infer_ppl(_config)
result = infer_ppl(config)
with open(args.output, "wb") as f:
with open(config.output, "wb") as f:
pickle.dump(result, f, 1)
# get score by given metric
score = get_score(result, vocab, metric=args.metric)
score = get_score(result, vocab, metric=config.metric)
print(score)
if __name__ == '__main__':
eval_net()

View File

@ -14,7 +14,6 @@
# ============================================================================
"""export checkpoint file into air models"""
import argparse
import numpy as np
from mindspore import Tensor, context
@ -23,33 +22,16 @@ from mindspore.train.serialization import export
from src.utils import Dictionary
from src.utils.load_weights import load_infer_weights
from src.model_utils.config import config
from src.transformer.transformer_for_infer import TransformerInferModel
from config import TransformerConfig
parser = argparse.ArgumentParser(description="mass export")
parser.add_argument("--device_id", type=int, default=0, help="Device id")
parser.add_argument("--file_name", type=str, default="mass", help="output file name.")
parser.add_argument("--file_format", type=str, choices=["AIR", "ONNX", "MINDIR"], default="AIR", help="file format")
parser.add_argument("--device_target", type=str, default="Ascend",
choices=["Ascend", "GPU", "CPU"], help="device target (default: Ascend)")
parser.add_argument('--gigaword_infer_config', type=str, required=True, help='gigaword config file')
parser.add_argument('--vocab_file', type=str, required=True, help='vocabulary file')
args = parser.parse_args()
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
if args.device_target == "Ascend":
context.set_context(device_id=args.device_id)
def get_config(config_file):
tfm_config = TransformerConfig.from_json_file(config_file)
tfm_config.compute_type = mstype.float16
tfm_config.dtype = mstype.float32
return tfm_config
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
if config.device_target == "Ascend":
context.set_context(device_id=config.device_id)
if __name__ == '__main__':
vocab = Dictionary.load_from_persisted_dict(args.vocab_file)
config = get_config(args.gigaword_infer_config)
vocab = Dictionary.load_from_persisted_dict(config.vocab_file)
dec_len = config.max_decode_length
tfm_model = TransformerInferModel(config=config, use_one_hot_embeddings=False)
@ -84,4 +66,4 @@ if __name__ == '__main__':
source_ids = Tensor(np.ones((1, config.seq_length)).astype(np.int32))
source_mask = Tensor(np.ones((1, config.seq_length)).astype(np.int32))
export(tfm_model, source_ids, source_mask, file_name=args.file_name, file_format=args.file_format)
export(tfm_model, source_ids, source_mask, file_name=config.file_name, file_format=config.file_format)

View File

@ -14,34 +14,14 @@
# ============================================================================
"""Evaluation api."""
import os
import argparse
import pickle
import numpy as np
from config import TransformerConfig
from src.model_utils.config import config
from src.utils import Dictionary
from src.utils import get_score
parser = argparse.ArgumentParser(description='postprocess.')
parser.add_argument("--config", type=str, required=True,
help="Model config json file path.")
parser.add_argument("--vocab", type=str, required=True,
help="Vocabulary to use.")
parser.add_argument("--output", type=str, required=True,
help="Result file path.")
parser.add_argument("--metric", type=str, default='rouge',
help='Set eval method.')
parser.add_argument("--source_id_folder", type=str, default='',
help="source_eos_ids folder path.")
parser.add_argument("--target_id_folder", type=str, default='',
help="target_eos_ids folder path.")
parser.add_argument("--result_dir", type=str, default='./result_Files',
help="result dir path.")
args, _ = parser.parse_known_args()
def read_from_file(config):
def read_from_file():
'''
calculate accuracy.
'''
@ -49,16 +29,16 @@ def read_from_file(config):
probs = []
source_sentences = []
target_sentences = []
file_num = len(os.listdir(args.source_id_folder))
file_num = len(os.listdir(config.source_id_folder))
for i in range(file_num):
f_name = "gigaword_bs_" + str(config.batch_size) + "_" + str(i)
source_ids = np.fromfile(os.path.join(args.source_id_folder, f_name + ".bin"), np.int32)
source_ids = np.fromfile(os.path.join(config.source_id_folder, f_name + ".bin"), np.int32)
source_ids = source_ids.reshape(1, config.max_decode_length)
target_ids = np.fromfile(os.path.join(args.target_id_folder, f_name + ".bin"), np.int32)
target_ids = np.fromfile(os.path.join(config.target_id_folder, f_name + ".bin"), np.int32)
target_ids = target_ids.reshape(1, config.max_decode_length)
predicted_ids = np.fromfile(os.path.join(args.result_dir, f_name + "_0.bin"), np.int32)
predicted_ids = np.fromfile(os.path.join(config.result_dir, f_name + "_0.bin"), np.int32)
predicted_ids = predicted_ids.reshape(1, config.max_decode_length + 1)
entire_probs = np.fromfile(os.path.join(args.result_dir, f_name + "_1.bin"), np.float32)
entire_probs = np.fromfile(os.path.join(config.result_dir, f_name + "_1.bin"), np.float32)
entire_probs = entire_probs.reshape(1, config.beam_width, config.max_decode_length + 1)
source_sentences.append(source_ids)
@ -87,13 +67,12 @@ def read_from_file(config):
if __name__ == '__main__':
conf = TransformerConfig.from_json_file(args.config)
result = read_from_file(conf)
vocab = Dictionary.load_from_persisted_dict(args.vocab)
result = read_from_file()
vocab = Dictionary.load_from_persisted_dict(config.vocab)
with open(args.output, "wb") as f:
with open(config.output, "wb") as f:
pickle.dump(result, f, 1)
# get score by given metric
score = get_score(result, vocab, metric=args.metric)
score = get_score(result, vocab, metric=config.metric)
print(score)

View File

@ -14,28 +14,19 @@
# ============================================================================
"""Evaluation api."""
import os
import argparse
from config import TransformerConfig
from src.model_utils.config import config
from src.dataset import load_dataset
parser = argparse.ArgumentParser(description='preprocess.')
parser.add_argument("--config", type=str, required=True,
help="Model config json file path.")
parser.add_argument("--result_path", type=str, default='./preprocess_Result/',
help="preprocess result path.")
args, _ = parser.parse_known_args()
def generate_bin():
'''
Generate bin files.
'''
config = TransformerConfig.from_json_file(args.config)
ds = load_dataset(data_files=config.test_dataset,
batch_size=config.batch_size,
epoch_count=1,
sink_mode=config.dataset_sink_mode,
shuffle=False) if config.test_dataset else None
cur_dir = args.result_path
cur_dir = config.result_path
source_eos_ids_path = os.path.join(cur_dir, "00_source_eos_ids")
source_eos_mask_path = os.path.join(cur_dir, "01_source_eos_mask")
target_eos_ids_path = os.path.join(cur_dir, "target_eos_ids")

View File

@ -32,7 +32,6 @@ echo_help()
echo " -n --device_num training with N devices"
echo " -i --device_id training with device i"
echo " -j --hccl_json set the rank table file"
echo " -c --config set the configuration file"
echo " -o --output set the output file of inference"
echo " -v --vocab set the vocabulary"
echo " -m --metric set the metric"
@ -104,11 +103,6 @@ do
export DEVICE_ID=$2
shift 2
;;
-c|--config)
echo "config";
configurations=$2
shift 2
;;
-o|--output)
echo "output";
output=$2
@ -153,7 +147,8 @@ do
cp train.py ./${task}_mass_$DEVICE_ID
cp eval.py ./${task}_mass_$DEVICE_ID
cp $configurations ./${task}_mass_$DEVICE_ID
cp -r ./src ./${task}_mass_$DEVICE_ID
cp -r ./*.yaml ./${task}_mass_$DEVICE_ID
if [ $vocab ]
then
@ -165,10 +160,10 @@ do
echo $task
if [ "$task" == "train" ]
then
python train.py --config ${configurations##*/} --platform Ascend >>log.log 2>&1 &
python train.py --device_target Ascend --output_path './output' >>log.log 2>&1 &
elif [ "$task" == "infer" ]
then
python eval.py --config ${configurations##*/} --output ${output} --vocab ${vocab##*/} --metric ${metric} --platform Ascend >>log_infer.log 2>&1 &
python eval.py --output ${output} --vocab ${vocab##*/} --metric ${metric} --device_target Ascend >>log_infer.log 2>&1 &
fi
cd ../
done

View File

@ -31,7 +31,6 @@ echo_help()
echo " -t --task select task, 't' for training and 'i' for inference"
echo " -n --device_num training with N devices"
echo " -i --device_id training with device i"
echo " -c --config set the configuration file"
echo " -o --output set the output file of inference"
echo " -v --vocab set the vocabulary"
echo " -m --metric set the metric"
@ -87,11 +86,6 @@ do
export DEVICE_ID=$2
shift 2
;;
-c|--config)
echo "config";
configurations=$2
shift 2
;;
-o|--output)
echo "output";
output=$2
@ -132,7 +126,8 @@ mkdir ./${task}_mass_$DEVICE_ID
cp train.py ./${task}_mass_$DEVICE_ID
cp eval.py ./${task}_mass_$DEVICE_ID
cp $configurations ./${task}_mass_$DEVICE_ID
cp -r ./src ./${task}_mass_$DEVICE_ID
cp -r ./*.yaml ./${task}_mass_$DEVICE_ID
if [ $vocab ]
then
@ -147,13 +142,13 @@ then
if [ $RANK_SIZE -gt 1 ]
then
mpirun -n $RANK_SIZE --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 &
python train.py --device_target GPU --output_path './output' >>log.log 2>&1 &
else
python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 &
python train.py --device_target GPU --output_path './output' >>log.log 2>&1 &
fi
elif [ "$task" == "infer" ]
then
python eval.py --config ${configurations##*/} --output ${output} --vocab ${vocab##*/} --metric ${metric} --platform GPU >>log_infer.log 2>&1 &
python eval.py --output ${output} --vocab ${vocab##*/} --metric ${metric} --device_target GPU >>log_infer.log 2>&1 &
fi
cd ../

View File

@ -0,0 +1,125 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Parse arguments"""
import os
import ast
import argparse
from pprint import pprint, pformat
import yaml
_config_path = "./default_config.yaml"
class Config:
"""
Configuration namespace. Convert dictionary to members.
"""
def __init__(self, cfg_dict):
for k, v in cfg_dict.items():
if isinstance(v, (list, tuple)):
setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v])
else:
setattr(self, k, Config(v) if isinstance(v, dict) else v)
def __str__(self):
return pformat(self.__dict__)
def __repr__(self):
return self.__str__()
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
"""
Parse command line arguments to the configuration according to the default yaml.
Args:
parser: Parent parser.
cfg: Base configuration.
helper: Helper description.
cfg_path: Path to the default yaml config.
"""
parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
parents=[parser])
helper = {} if helper is None else helper
choices = {} if choices is None else choices
for item in cfg:
if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict):
help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path)
choice = choices[item] if item in choices else None
if isinstance(cfg[item], bool):
parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice,
help=help_description)
else:
parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice,
help=help_description)
args = parser.parse_args()
return args
def parse_yaml(yaml_path):
"""
Parse the yaml config file.
Args:
yaml_path: Path to the yaml config.
"""
with open(yaml_path, 'r') as fin:
try:
cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader)
cfgs = [x for x in cfgs]
if len(cfgs) == 1:
cfg_helper = {}
cfg = cfgs[0]
elif len(cfgs) == 2:
cfg, cfg_helper = cfgs
else:
raise ValueError("At most 2 docs (config and help description for help) are supported in config yaml")
print(cfg_helper)
except:
raise ValueError("Failed to parse yaml")
return cfg, cfg_helper
def merge(args, cfg):
"""
Merge the base config from yaml file and command line arguments.
Args:
args: Command line arguments.
cfg: Base configuration.
"""
args_var = vars(args)
for item in args_var:
cfg[item] = args_var[item]
return cfg
def get_config():
"""
Get Config according to the yaml file and cli arguments.
"""
parser = argparse.ArgumentParser(description="default name", add_help=False)
current_dir = os.path.dirname(os.path.abspath(__file__))
parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../../default_config.yaml"),
help="Config file path")
path_args, _ = parser.parse_known_args()
default, helper = parse_yaml(path_args.config_path)
pprint(default)
args = parse_cli_to_yaml(parser, default, helper, path_args.config_path)
final_config = merge(args, default)
return Config(final_config)
config = get_config()
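A brief usage sketch: importing the module is enough to obtain the merged configuration, and the generated `--config_path` flag selects an alternative yaml (the flag names below come from the shipped `default_config.yaml`):

```python
# e.g. python some_script.py --config_path ./default_config.yaml --task infer
from src.model_utils.config import config

print(config.task)           # "infer" with the override above, "train" by default
print(config.device_target)  # "Ascend" unless overridden
```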

View File

@ -0,0 +1,27 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Device adapter for ModelArts"""
from src.model_utils.config import config
if config.enable_modelarts:
from src.model_utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
from src.model_utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
__all__ = [
"get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]
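A brief usage sketch; callers import from the adapter and never branch on `enable_modelarts` themselves:

```python
from src.model_utils.device_adapter import get_device_id, get_device_num

# Resolved via ModelArts when enabled, else from DEVICE_ID/RANK_SIZE env vars.
print(get_device_id(), get_device_num())  # e.g. 0 1 on a single local device
```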

View File

@ -0,0 +1,36 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Local adapter"""
import os
def get_device_id():
device_id = os.getenv('DEVICE_ID', '0')
return int(device_id)
def get_device_num():
device_num = os.getenv('RANK_SIZE', '1')
return int(device_num)
def get_rank_id():
global_rank_id = os.getenv('RANK_ID', '0')
return int(global_rank_id)
def get_job_id():
return "Local Job"

View File

@ -0,0 +1,115 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Moxing adapter for ModelArts"""
import os
import functools
from mindspore import context
from src.model_utils.config import config
_global_sync_count = 0
def get_device_id():
device_id = os.getenv('DEVICE_ID', '0')
return int(device_id)
def get_device_num():
device_num = os.getenv('RANK_SIZE', '1')
return int(device_num)
def get_rank_id():
global_rank_id = os.getenv('RANK_ID', '0')
return int(global_rank_id)
def get_job_id():
job_id = os.getenv('JOB_ID')
job_id = job_id if job_id != "" else "default"
return job_id
def sync_data(from_path, to_path):
"""
Download data from remote OBS to a local directory if the first url is a remote url and the second one is a local path;
otherwise, upload data from the local directory to remote OBS.
"""
import moxing as mox
import time
global _global_sync_count
sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
_global_sync_count += 1
# Each server contains 8 devices at most.
if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
print("from path: ", from_path)
print("to path: ", to_path)
mox.file.copy_parallel(from_path, to_path)
print("===finish data synchronization===")
try:
os.mknod(sync_lock)
except IOError:
pass
print("===save flag===")
while True:
if os.path.exists(sync_lock):
break
time.sleep(1)
print("Finish sync data from {} to {}.".format(from_path, to_path))
def moxing_wrapper(pre_process=None, post_process=None):
"""
Moxing wrapper to download dataset and upload outputs.
"""
def wrapper(run_func):
@functools.wraps(run_func)
def wrapped_func(*args, **kwargs):
# Download data from data_url
if config.enable_modelarts:
if config.data_url:
sync_data(config.data_url, config.data_path)
print("Dataset downloaded: ", os.listdir(config.data_path))
if config.checkpoint_url:
sync_data(config.checkpoint_url, config.load_path)
print("Preload downloaded: ", os.listdir(config.load_path))
if config.train_url:
sync_data(config.train_url, config.output_path)
print("Workspace downloaded: ", os.listdir(config.output_path))
context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
config.device_num = get_device_num()
config.device_id = get_device_id()
if not os.path.exists(config.output_path):
os.makedirs(config.output_path)
if pre_process:
pre_process()
run_func(*args, **kwargs)
# Upload data to train_url
if config.enable_modelarts:
if post_process:
post_process()
if config.train_url:
print("Start to copy output directory")
sync_data(config.output_path, config.train_url)
return wrapped_func
return wrapper
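A minimal sketch of how an entry script applies the decorator (the rewritten train.py and eval.py in this PR follow the same pattern; the `main` function here is illustrative):

```python
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper

@moxing_wrapper()
def main():
    # On ModelArts, data_url/checkpoint_url have already been synced to
    # config.data_path/config.load_path by the wrapper; locally it is a no-op.
    # Anything written under config.output_path is synced back afterwards.
    print("data under:", config.data_path)

if __name__ == "__main__":
    main()
```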

View File

@ -13,6 +13,7 @@
# limitations under the License.
# ============================================================================
"""Infer api."""
import os
import time
import mindspore.nn as nn
@ -143,11 +144,16 @@ def infer(config):
Returns:
list, result with
"""
if config.enable_modelarts:
config.test_dataset = os.path.join(config.data_path, \
"tfrecords/gigaword_new_prob/gigaword_test_dataset.tfrecord-001-of-001")
else:
config.test_dataset = os.path.join(config.data_path, "gigaword_test_dataset.tfrecord-001-of-001")
eval_dataset = load_dataset(data_files=config.test_dataset,
batch_size=config.batch_size,
epoch_count=1,
sink_mode=config.dataset_sink_mode,
shuffle=False) if config.test_dataset else None
shuffle=False) if config.data_path else None
prediction = transformer_infer(config, eval_dataset)
return prediction
@ -269,10 +275,15 @@ def infer_ppl(config):
Returns:
list, result with
"""
if config.enable_modelarts:
config.test_dataset = os.path.join(config.data_path, \
"tfrecords/gigaword_new_prob/gigaword_test_dataset.tfrecord-001-of-001")
else:
config.test_dataset = os.path.join(config.data_path, "gigaword_test_dataset.tfrecord-001-of-001")
eval_dataset = load_dataset(data_files=config.test_dataset,
batch_size=config.batch_size,
epoch_count=1,
sink_mode=config.dataset_sink_mode,
shuffle=False) if config.test_dataset else None
shuffle=False) if config.data_path else None
prediction = transformer_infer_ppl(config, eval_dataset)
return prediction
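The branch added above makes `data_path` the only path a caller needs to set; a hedged sketch of the resulting resolution (the helper name is illustrative):

```python
import os

def resolve_test_dataset(data_path, enable_modelarts):
    # Mirrors the branch above: ModelArts jobs keep the nested tfrecords
    # layout, while local runs expect the file directly under data_path.
    sub = "tfrecords/gigaword_new_prob" if enable_modelarts else ""
    return os.path.join(data_path, sub, "gigaword_test_dataset.tfrecord-001-of-001")

print(resolve_test_dataset("/cache/data", True))   # ModelArts layout
print(resolve_test_dataset("./data", False))       # local layout
```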

View File

@ -27,7 +27,7 @@ def load_infer_weights(config):
Returns:
dict, weights.
"""
model_path = config.existed_ckpt
model_path = config.checkpoint_file_path
if model_path.endswith(".npz"):
ms_ckpt = np.load(model_path)
is_npz = True

View File

@ -14,7 +14,6 @@
# ============================================================================
"""Train api."""
import os
import argparse
import pickle
import numpy as np
@ -32,7 +31,6 @@ from mindspore.communication import management as MultiAscend
from mindspore.train.serialization import load_checkpoint
from mindspore.common import set_seed
from config import TransformerConfig
from src.dataset import load_dataset
from src.transformer import TransformerNetworkWithLoss, TransformerTrainOneStepWithLossScaleCell
from src.transformer.infer_mass import infer
@ -40,27 +38,26 @@ from src.utils import LossCallBack
from src.utils import one_weight, zero_weight, weight_variable
from src.utils import square_root_schedule
from src.utils.lr_scheduler import polynomial_decay_scheduler, BertLearningRate
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
parser = argparse.ArgumentParser(description='MASS train entry point.')
parser.add_argument("--config", type=str, required=True, help="model config json file path.")
parser.add_argument("--platform", type=str, required=True, help="model working platform.")
def get_config():
if config.compute_type == "float16":
config.compute_type = mstype.float16
if config.compute_type == "float32":
config.compute_type = mstype.float32
if config.dtype == "float16":
config.dtype = mstype.float16
if config.dtype == "float32":
config.dtype = mstype.float32
def get_config(config):
config = TransformerConfig.from_json_file(config)
config.compute_type = mstype.float16
config.dtype = mstype.float32
return config
def _train(model, config: TransformerConfig,
pre_training_dataset=None, fine_tune_dataset=None, test_dataset=None,
def _train(model, pre_training_dataset=None, fine_tune_dataset=None, test_dataset=None,
callbacks: list = None):
"""
Train model.
Args:
model (Model): MindSpore model instance.
config (TransformerConfig): Config of mass model.
pre_training_dataset (Dataset): Pre-training dataset.
fine_tune_dataset (Dataset): Fine-tune dataset.
test_dataset (Dataset): Test dataset.
@ -81,7 +78,7 @@ def _train(model, config: TransformerConfig,
# Test the accuracy of the model.
if test_dataset is not None:
print(" | Start test job.")
result = infer(_config)
result = infer(config)
with open("validation_res_after_pre_training.bin", "wb") as f:
pickle.dump(result, f, 1)
@ -95,18 +92,18 @@ def _train(model, config: TransformerConfig,
# Test the accuracy of the model.
if test_dataset is not None:
print(" | Start test job.")
result = infer(_config)
result = infer(config)
with open("validation_res_after_pre_training.bin", "wb") as f:
pickle.dump(result, f, 1)
def _load_checkpoint_to_net(config, network):
def _load_checkpoint_to_net(network):
"""load parameters to network from checkpoint."""
if config.existed_ckpt:
if config.existed_ckpt.endswith(".npz"):
weights = np.load(config.existed_ckpt)
if config.checkpoint_file_path:
if config.checkpoint_file_path.endswith(".npz"):
weights = np.load(config.checkpoint_file_path)
else:
weights = load_checkpoint(config.existed_ckpt)
weights = load_checkpoint(config.checkpoint_file_path)
for param in network.trainable_params():
weights_name = param.name
if weights_name not in weights:
@ -133,7 +130,7 @@ def _load_checkpoint_to_net(config, network):
param.set_data(weight_variable(value.asnumpy().shape))
def _get_lr(config, update_steps):
def _get_lr(update_steps):
"""generate learning rate."""
if config.lr_scheduler == "isr":
lr = Tensor(square_root_schedule(lr=config.lr,
@ -153,7 +150,7 @@ def _get_lr(config, update_steps):
return lr
def _get_optimizer(config, network, lr):
def _get_optimizer(network, lr):
"""get mass optimizer, support Adam, Lamb, Momentum."""
if config.optimizer.lower() == "adam":
optimizer = Adam(network.trainable_params(), lr, beta1=0.9, beta2=0.98)
@ -175,8 +172,7 @@ def _get_optimizer(config, network, lr):
return optimizer
def _build_training_pipeline(config: TransformerConfig,
pre_training_dataset=None,
def _build_training_pipeline(pre_training_dataset=None,
fine_tune_dataset=None,
test_dataset=None,
platform="Ascend"):
@ -184,14 +180,13 @@ def _build_training_pipeline(config: TransformerConfig,
Build training pipeline.
Args:
config (TransformerConfig): Config of mass model.
pre_training_dataset (Dataset): Pre-training dataset.
fine_tune_dataset (Dataset): Fine-tune dataset.
test_dataset (Dataset): Test dataset.
"""
net_with_loss = TransformerNetworkWithLoss(config, is_training=True)
net_with_loss.init_parameters_data()
_load_checkpoint_to_net(config, net_with_loss)
_load_checkpoint_to_net(net_with_loss)
dataset = pre_training_dataset if pre_training_dataset is not None \
else fine_tune_dataset
@ -201,9 +196,9 @@ def _build_training_pipeline(config: TransformerConfig,
update_steps = config.epochs * dataset.get_dataset_size()
lr = _get_lr(config, update_steps)
lr = _get_lr(update_steps)
optimizer = _get_optimizer(config, net_with_loss, lr)
optimizer = _get_optimizer(net_with_loss, lr)
# loss scale.
if config.loss_scale_mode == "dynamic":
@ -223,27 +218,28 @@ def _build_training_pipeline(config: TransformerConfig,
rank_size = os.getenv('RANK_SIZE')
callbacks = []
callbacks.append(time_cb)
ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path)
if rank_size is not None and int(rank_size) > 1:
loss_monitor = LossCallBack(config, rank_id=MultiAscend.get_rank())
callbacks.append(loss_monitor)
if MultiAscend.get_rank() % 8 == 0:
ckpt_callback = ModelCheckpoint(
prefix=config.ckpt_prefix,
directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(MultiAscend.get_rank())),
directory=os.path.join(ckpt_save_dir, 'ckpt_{}'.format(MultiAscend.get_rank())),
config=ckpt_config)
callbacks.append(ckpt_callback)
if rank_size is None or int(rank_size) == 1:
ckpt_callback = ModelCheckpoint(
prefix=config.ckpt_prefix,
directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
directory=os.path.join(ckpt_save_dir, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
config=ckpt_config)
loss_monitor = LossCallBack(config, rank_id=os.getenv('DEVICE_ID'))
callbacks.append(loss_monitor)
callbacks.append(ckpt_callback)
print(f" | ALL SET, PREPARE TO TRAIN.")
_train(model=model, config=config,
_train(model=model,
pre_training_dataset=pre_training_dataset,
fine_tune_dataset=fine_tune_dataset,
test_dataset=test_dataset,
@ -260,17 +256,21 @@ def _setup_parallel_env(platform):
)
def train_parallel(config: TransformerConfig, platform: "Ascend"):
@moxing_wrapper()
def train_parallel(platform: "Ascend"):
"""
Train model with multi ascend chips.
Args:
config (TransformerConfig): Config for MASS model.
"""
_setup_parallel_env(platform)
print(f" | Starting training on {os.getenv('RANK_SIZE', None)} devices.")
if config.task == "train":
filenames = os.listdir(config.data_path)
config.fine_tune_dataset = [os.path.join(config.data_path, filename) for filename in filenames]
else:
config.test_dataset = os.path.join(config.data_path, "gigaword_test_dataset.tfrecord-001-of-001")
pre_train_dataset = load_dataset(
data_files=config.pre_train_dataset,
batch_size=config.batch_size, epoch_count=1,
@ -296,21 +296,23 @@ def train_parallel(config: TransformerConfig, platform: "Ascend"):
rank_id=MultiAscend.get_rank()
) if config.test_dataset else None
_build_training_pipeline(config=config,
pre_training_dataset=pre_train_dataset,
_build_training_pipeline(pre_training_dataset=pre_train_dataset,
fine_tune_dataset=fine_tune_dataset,
test_dataset=test_dataset,
platform=platform)
def train_single(config: TransformerConfig, platform: "Ascend"):
@moxing_wrapper()
def train_single(platform: "Ascend"):
"""
Train model on single device.
Args:
config (TransformerConfig): Config for model.
"""
print(" | Starting training on single device.")
if config.task == "train":
filenames = os.listdir(config.data_path)
config.fine_tune_dataset = [os.path.join(config.data_path, filename) for filename in filenames]
else:
config.test_dataset = os.path.join(config.data_path, "gigaword_test_dataset.tfrecord-001-of-001")
pre_train_dataset = load_dataset(data_files=config.pre_train_dataset,
batch_size=config.batch_size,
epoch_count=1,
@ -327,43 +329,30 @@ def train_single(config: TransformerConfig, platform: "Ascend"):
sink_mode=config.dataset_sink_mode,
sink_step=config.dataset_sink_step) if config.test_dataset else None
_build_training_pipeline(config=config,
pre_training_dataset=pre_train_dataset,
_build_training_pipeline(pre_training_dataset=pre_train_dataset,
fine_tune_dataset=fine_tune_dataset,
test_dataset=test_dataset,
platform=platform)
def _check_args(config):
if not os.path.exists(config):
raise FileNotFoundError("`config` is not existed.")
if not isinstance(config, str):
raise ValueError("`config` must be type of str.")
if __name__ == '__main__':
args, _ = parser.parse_known_args()
device_id = os.getenv('DEVICE_ID', None)
if device_id is None:
device_id = 0
device_id = int(device_id)
context.set_context(
mode=context.GRAPH_MODE,
device_target=args.platform,
device_target=config.device_target,
reserve_class_name_in_scope=False,
device_id=device_id,
max_call_depth=2000)
_rank_size = os.getenv('RANK_SIZE')
_check_args(args.config)
_config = get_config(args.config)
set_seed(_config.random_seed)
context.set_context(save_graphs=_config.save_graphs)
get_config()
set_seed(config.random_seed)
context.set_context(save_graphs=config.save_graphs)
if _rank_size is not None and int(_rank_size) > 1:
train_parallel(_config, args.platform)
train_parallel(config.device_target)
else:
train_single(_config, args.platform)
train_single(config.device_target)