modify model_zoo network for cloud

This commit is contained in:
parent f98497ca09
commit 8388ced9ee
@@ -14,19 +14,13 @@
 # ============================================================================
 """post process for 310 inference"""
 import os
-import argparse
 import numpy as np
 from PIL import Image

-from src.config import config
+from src.model_utils.config import config
 from src.eval_utils import metrics

 batch_size = 1
-parser = argparse.ArgumentParser(description="ssd acc calculation")
-parser.add_argument("--result_path", type=str, required=True, help="result files path.")
-parser.add_argument("--img_path", type=str, required=True, help="image file path.")
-parser.add_argument("--drop", action="store_true", help="drop iscrowd images or not.")
-args = parser.parse_args()

 def get_imgSize(file_name):
     img = Image.open(file_name)

@@ -35,7 +29,7 @@ def get_imgSize(file_name):
 def get_result(result_path, img_id_file_path):
     anno_json = os.path.join(config.coco_root, config.instances_set.format(config.val_data_type))

-    if args.drop:
+    if config.drop:
         from pycocotools.coco import COCO
         train_cls = config.classes
         train_cls_dict = {}

@@ -53,7 +47,7 @@ def get_result(result_path, img_id_file_path):
     for file in files:
         img_ids_name = file.split('.')[0]
         img_id = int(np.squeeze(img_ids_name))
-        if args.drop:
+        if config.drop:
             anno_ids = coco.getAnnIds(imgIds=img_id, iscrowd=None)
             anno = coco.loadAnns(anno_ids)
             annos = []

@@ -86,4 +80,4 @@ def get_result(result_path, img_id_file_path):
     print(f" mAP:{mAP}")

 if __name__ == '__main__':
-    get_result(args.result_path, args.img_path)
+    get_result(config.result_path, config.img_path)
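For orientation, a hedged sketch of the COCO mAP computation that `get_result` feeds into — this is standard pycocotools usage; the repo's `src.eval_utils.metrics` may wire it differently, e.g. with the iscrowd handling gated by the `drop` flag above:

```python
# Minimal sketch, assuming ground truth as a COCO annotation json and
# detections in the COCO results-json format (illustrative, not the repo's code).
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

def coco_map(anno_json, pred_json):
    coco_gt = COCO(anno_json)             # ground-truth annotations
    coco_dt = coco_gt.loadRes(pred_json)  # detection results
    evaluator = COCOeval(coco_gt, coco_dt, iouType='bbox')
    evaluator.evaluate()
    evaluator.accumulate()
    evaluator.summarize()
    return evaluator.stats[0]             # mAP @ IoU=0.50:0.95
```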
@@ -68,6 +68,10 @@ filter_weight: False
 freeze_layer: None
 save_best_ckpt: True

+result_path: ""
+img_path: ""
+drop: False
+
 # `mindrecord_dir` and `coco_root` are better to use absolute path.
 feature_extractor_base_param: ""
 checkpoint_filter_list: ['multi_loc_layers', 'multi_cls_layers']

@@ -71,6 +71,10 @@ filter_weight: False
 freeze_layer: None
 save_best_ckpt: True

+result_path: ""
+img_path: ""
+drop: False
+
 # `mindrecord_dir` and `coco_root` are better to use absolute path.
 feature_extractor_base_param: "/ckpt/mobilenet_v1.ckpt"
 checkpoint_filter_list: ['network.multi_box.cls_layers.0.weight', 'network.multi_box.cls_layers.0.bias',

@@ -72,6 +72,10 @@ filter_weight: False
 freeze_layer: None
 save_best_ckpt: True

+result_path: ""
+img_path: ""
+drop: False
+
 # `mindrecord_dir` and `coco_root` are better to use absolute path.
 feature_extractor_base_param: "/ckpt/resnet50.ckpt"
 checkpoint_filter_list: ['network.multi_box.cls_layers.0.weight', 'network.multi_box.cls_layers.0.bias',

@@ -67,6 +67,10 @@ filter_weight: False
 freeze_layer: None
 save_best_ckpt: True

+result_path: ""
+img_path: ""
+drop: False
+
 # `mindrecord_dir` and `coco_root` are better to use absolute path.
 feature_extractor_base_param: ""
 checkpoint_filter_list: ['multi_loc_layers', 'multi_cls_layers']
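A hedged sketch of how a `checkpoint_filter_list` like the ones above is typically applied when loading a backbone checkpoint (`feature_extractor_base_param`) for fine-tuning: parameters whose names match an entry are dropped before the load. The helper name and wiring are illustrative; the SSD scripts' own implementation may differ.

```python
from mindspore.train.serialization import load_checkpoint, load_param_into_net

def filter_checkpoint_parameter_by_list(param_dict, filter_list):
    """Remove parameters whose names contain any substring in filter_list."""
    for key in list(param_dict):
        if any(name in key for name in filter_list):
            print("Delete parameter from checkpoint:", key)
            del param_dict[key]
    return param_dict

# Illustrative usage, assuming `network` is the SSD net under construction:
# param_dict = load_checkpoint(config.feature_extractor_base_param)
# filter_checkpoint_parameter_by_list(param_dict, config.checkpoint_filter_list)
# load_param_into_net(network, param_dict)
```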
@@ -81,6 +81,37 @@ First of all, through a sequence to sequence framework, mass only predicts the b
 Secondly, by predicting the continuous token of the decoder, the decoder can build better language modeling ability than only predicting discrete token.
 Third, by further shielding the input token of the decoder which is not shielded in the encoder, the decoder is encouraged to extract more useful information from the encoder side, rather than using the rich information in the previous token.

+If you want to run on ModelArts, please check the official documentation of [ModelArts](https://support.huaweicloud.com/modelarts/), and you can start training and evaluation as follows:
+
+```python
+# run distributed training on ModelArts example
+# (1) First, perform a or b.
+#       a. Set "enable_modelarts=True" in the yaml file.
+#          Set the other parameters you need in the yaml file.
+#       b. Add "enable_modelarts=True" on the website UI interface.
+#          Add the other parameters on the website UI interface.
+# (2) Set the task: "task=train".
+# (3) Set the code directory to "/path/mass" on the website UI interface.
+# (4) Set the startup file to "train.py" on the website UI interface.
+# (5) Set the "Dataset path", "Output file path" and "Job log path" to your paths on the website UI interface.
+# (6) Create your job.
+
+# run evaluation on ModelArts example
+# (1) Copy or upload your trained model to the S3 bucket.
+# (2) Perform a or b.
+#       a. Set "enable_modelarts=True" in the yaml file.
+#          Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" in the yaml file.
+#          Set "checkpoint_url=/The path of checkpoint in S3/" in the yaml file.
+#       b. Add "enable_modelarts=True" on the website UI interface.
+#          Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
+#          Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
+# (3) Set the task: "task=infer", and the path of the vocab.
+# (4) Set the code directory to "/path/mass" on the website UI interface.
+# (5) Set the startup file to "eval.py" on the website UI interface.
+# (6) Set the "Dataset path", "Output file path" and "Job log path" to your paths on the website UI interface.
+# (7) Create your job.
+```
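Taken together, the steps above rely on the `src/model_utils` helpers this commit introduces. A hedged sketch of how a training entry point consumes these settings — `config` and `moxing_wrapper` are defined in the diffs further below, but the exact `train.py` wiring is assumed here, not shown by the commit:

```python
from src.model_utils.config import config          # default_config.yaml + CLI overrides
from src.model_utils.moxing_adapter import moxing_wrapper

@moxing_wrapper()  # on ModelArts: syncs data_url -> data_path before running
def train_net():
    print("task:", config.task)                    # "train", as set in step (2)
    print("dataset under:", config.data_path)

if __name__ == '__main__':
    train_net()
```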
 # Script description

 MASS script and code structure are as follows:
@@ -90,8 +121,12 @@ MASS script and code structure are as follows:
 ├── README.md // Introduction of MASS model.
-├── config
-│   ├──config.py // Configuration instance definition.
-│   ├──config.json // Configuration file.
 ├── src
+│   ├── model_utils
+│   │   ├──config.py // parameter configuration
+│   │   ├──device_adapter.py // device adapter
+│   │   ├──local_adapter.py // local adapter
+│   │   ├──moxing_adapter.py // moxing adapter
 │   ├──dataset
 │   │   ├──bi_data_loader.py // Dataset loader for fine-tune or inferring.
 │   │   ├──mono_data_loader.py // Dataset loader for pre-training.

@@ -134,6 +169,7 @@ MASS script and code structure are as follows:
 ├── requirements.txt // Requirements of third party package.
 ├── train.py // Train API entry.
 ├── eval.py // Infer API entry.
+├── default_config.yaml // parameter configuration
 ├── tokenize_corpus.py // Corpus tokenization.
 ├── apply_bpe_encoding.py // Applying bpe encoding.
 ├── weights_average.py // Average multi model checkpoints to NPZ format.
@@ -332,9 +368,8 @@ python cornell_dialog.py --src_folder /{path}/cornell_dialog \
 ## Configuration

-Json file under the path `config/` is the template configuration file.
 Almost all of the options and arguments needed could be assigned conveniently, including the training platform, configurations of dataset and model, arguments of optimizer etc. Optional features such as loss scale and checkpoint are also available by setting the options correspondingly.
-For more detailed information about the attributes, refer to the file `config/config.py`.
+For more detailed information about the attributes, refer to the file `default_config.yaml`.

 ## Training & Evaluation process
@@ -357,9 +392,7 @@ The usage of `run_ascend.sh` is shown as below:
 ```text
 Usage: run_ascend.sh [-h, --help] [-t, --task <CHAR>] [-n, --device_num <N>]
-                     [-i, --device_id <N>] [-j, --hccl_json <FILE>]
-                     [-c, --config <FILE>] [-o, --output <FILE>]
-                     [-v, --vocab <FILE>]
+                     [-i, --device_id <N>] [-o, --output <FILE>] [-v, --vocab <FILE>]

 options:
     -h, --help show usage

@@ -367,7 +400,6 @@ options:
     -n, --device_num device number used for training: N, default is 1.
     -i, --device_id device id used for training with single device: N, 0<=N<=7, default is 0.
     -j, --hccl_json rank table file used for training with multiple devices: FILE.
-    -c, --config configuration file as shown in the path 'mass/config': FILE.
     -o, --output assign output file of inference: FILE.
     -v, --vocab set the vocabulary.
     -m, --metric set the metric.

@@ -379,15 +411,13 @@ The usage of `run_gpu.sh` is shown as below:
 ```text
 Usage: run_gpu.sh [-h, --help] [-t, --task <CHAR>] [-n, --device_num <N>]
-                  [-i, --device_id <N>] [-c, --config <FILE>]
-                  [-o, --output <FILE>] [-v, --vocab <FILE>]
+                  [-i, --device_id <N>] [-o, --output <FILE>] [-v, --vocab <FILE>]

 options:
     -h, --help show usage
     -t, --task select task: CHAR, 't' for train and 'i' for inference.
     -n, --device_num device number used for training: N, default is 1.
     -i, --device_id device id used for training with single device: N, 0<=N<=7, default is 0.
-    -c, --config configuration file as shown in the path 'mass/config': FILE.
     -o, --output assign output file of inference: FILE.
     -v, --vocab set the vocabulary.
     -m, --metric set the metric.
@@ -397,7 +427,7 @@ The command followed shows an example for training with 2 devices.
 Ascend:

 ```ascend
-sh run_ascend.sh --task t --device_num 2 --hccl_json /{path}/rank_table.json --config /{path}/config.json
+sh run_ascend.sh --task t --device_num 2 --hccl_json /{path}/rank_table.json
 ```

 ps. Discontinuous device id is not supported in `run_ascend.sh` at present, device id in `rank_table.json` must start from 0.

@@ -405,20 +435,20 @@ ps. Discontinuous device id is not supported in `run_ascend.sh` at present, devi
 GPU:

 ```gpu
-sh run_gpu.sh --task t --device_num 2 --config /{path}/config.json
+sh run_gpu.sh --task t --device_num 2
 ```

 If use a single chip, it would be like this:
 Ascend:

 ```ascend
-sh run_ascend.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json
+sh run_ascend.sh --task t --device_num 1 --device_id 0
 ```

 GPU:

 ```gpu
-sh run_gpu.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json
+sh run_gpu.sh --task t --device_num 1 --device_id 0
 ```

 ## Weights average
@@ -427,16 +457,14 @@ sh run_gpu.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json
 python weights_average.py --input_files your_checkpoint_list --output_file model.npz
 ```

-The input_files is a list of you checkpoints file. To use model.npz as the weights, add its path in config.json at "existed_ckpt".
+The `input_files` argument is a list of your checkpoint files. To use model.npz as the weights, set "checkpoint_file_path" in default_config.yaml to its path.

-```json
-{
-    ...
-    "checkpoint_options": {
-        "existed_ckpt": "/xxx/xxx/model.npz",
-        "save_ckpt_steps": 1000,
-        ...
-    },
-    ...
-}
+```yaml
+checkpoint_file_path: "/xxx/xxx/model.npz"
+save_ckpt_steps: 1000
 ```
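A hedged sketch of what `weights_average.py` does with `--input_files`: load each MindSpore checkpoint and write the element-wise mean of every parameter to an `.npz` file. This is illustrative; the script's exact behavior may differ.

```python
import numpy as np
from mindspore.train.serialization import load_checkpoint

def average_checkpoints(ckpt_files, output_file):
    """Average parameters with the same name across all checkpoints."""
    stacked = {}
    for path in ckpt_files:
        for name, param in load_checkpoint(path).items():
            stacked.setdefault(name, []).append(param.data.asnumpy())
    np.savez(output_file, **{n: np.mean(v, axis=0) for n, v in stacked.items()})
```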
@@ -448,14 +476,13 @@ Two learning rate scheduler are provided in our model:
 1. [Polynomial decay scheduler](https://towardsdatascience.com/learning-rate-schedules-and-adaptive-learning-rate-methods-for-deep-learning-2c8f433990d1).
 2. [Inverse square root scheduler](https://ece.uwaterloo.ca/~dwharder/aads/Algorithms/Inverse_square_root/).

-LR scheduler could be config in `config/config.json`.
+The LR scheduler can be configured in `default_config.yaml`.

 For Polynomial decay scheduler, config could be like:

-```json
-{
-    ...
-    "learn_rate_config": {
-        "optimizer": "adam",
-        "lr": 1e-4,
-        "lr_scheduler": "poly",

@@ -463,24 +490,21 @@ For Polynomial decay scheduler, config could be like:
-        "decay_steps": 10000,
-        "warmup_steps": 2000,
-        "min_lr": 1e-6
-    },
-    ...
-}
+```yaml
+optimizer: "adam"
+lr: 1e-4
+lr_scheduler: "poly"
+decay_steps: 10000
+warmup_steps: 2000
+min_lr: 1e-6
 ```

 For Inverse square root scheduler, config could be like:

-```json
-{
-    ...
-    "learn_rate_config": {
-        "optimizer": "adam",
-        "lr": 1e-4,
-        "lr_scheduler": "isr",
-        "decay_start_step": 12000,
-        "warmup_steps": 2000,
-        "min_lr": 1e-6
-    },
-    ...
-}
+```yaml
+optimizer: "adam"
+lr: 1e-4
+lr_scheduler: "isr"
+decay_start_step: 12000
+warmup_steps: 2000
+min_lr: 1e-6
 ```
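For concreteness, a hedged sketch of the two schedules configured above, using the standard formulas with linear warmup; the repo's own implementation may differ in details:

```python
def poly_decay_lr(step, lr, min_lr, decay_steps, warmup_steps, power):
    """Polynomial decay from lr to min_lr over decay_steps, after linear warmup."""
    if step < warmup_steps:
        return lr * (step + 1) / warmup_steps
    progress = min(1.0, (step - warmup_steps) / decay_steps)
    return (lr - min_lr) * (1.0 - progress) ** power + min_lr

def inverse_sqrt_lr(step, lr, min_lr, decay_start_step, warmup_steps):
    """Hold lr until decay_start_step, then decay proportionally to 1/sqrt(step)."""
    if step < warmup_steps:
        return lr * (step + 1) / warmup_steps
    if step <= decay_start_step:
        return lr
    return max(min_lr, lr * (decay_start_step / step) ** 0.5)
```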
@@ -516,79 +540,79 @@ MASS pre-trains a sequence to sequence model by predicting the masked fragments
 Here we provide a practice example to demonstrate the basic usage of MASS for pre-training, fine-tuning a model, and the inference process. The overall process is as follows:

 1. Download and process the dataset.
-2. Modify the `config.json` to config the network.
+2. Modify `default_config.yaml` to configure the network.
 3. Run a task for pre-training and fine-tuning.
 4. Perform inference and validation.

 ## Pre-training

-For pre-training a model, config the options in `config.json` firstly:
+For pre-training a model, configure the options in `default_config.yaml` first:

 - Assign the `pre_train_dataset` under `dataset_config` node to the dataset path.
 - Choose the optimizer('momentum/adam/lamb' is available).
 - Assign the 'ckpt_prefix' and 'ckpt_path' under `checkpoint_path` to save the model files.
 - Set other arguments including dataset configurations and network configurations.
-- If you have a trained model already, assign the `existed_ckpt` to the checkpoint file.
+- If you have a trained model already, assign the `checkpoint_file_path` to the checkpoint file.

 If you use the ascend chip, run the shell script `run_ascend.sh` as follows:

 ```ascend
-sh run_ascend.sh -t t -n 1 -i 1 -c /mass/config/config.json
+sh run_ascend.sh -t t -n 1 -i 1
 ```

 You can also run the shell script `run_gpu.sh` on gpu as follows:

 ```gpu
-sh run_gpu.sh -t t -n 1 -i 1 -c /mass/config/config.json
+sh run_gpu.sh -t t -n 1 -i 1
 ```

-Get the log and output files under the path `./train_mass_*/`, and the model file under the path assigned in the `config/config.json` file.
+Get the log and output files under the path `./train_mass_*/`, and the model file under the path assigned in the `default_config.yaml` file.

 ## Fine-tuning

-For fine-tuning a model, config the options in `config.json` firstly:
+For fine-tuning a model, configure the options in `default_config.yaml` first:

 - Assign the `fine_tune_dataset` under `dataset_config` node to the dataset path.
-- Assign the `existed_ckpt` under `checkpoint_path` node to the existed model file generated by pre-training.
+- Assign the `checkpoint_file_path` under `checkpoint_path` node to the existing model file generated by pre-training.
 - Choose the optimizer('momentum/adam/lamb' is available).
-- Assign the `ckpt_prefix` and `ckpt_path` under `checkpoint_path` node to save the model files.
+- Assign the `ckpt_prefix` and `checkpoint_file_path` under `checkpoint_path` node to save the model files.
 - Set other arguments including dataset configurations and network configurations.

 If you use the ascend chip, run the shell script `run_ascend.sh` as follows:

 ```ascend
-sh run_ascend.sh -t t -n 1 -i 1 -c config/config.json
+sh run_ascend.sh -t t -n 1 -i 1
 ```

 You can also run the shell script `run_gpu.sh` on gpu as follows:

 ```gpu
-sh run_gpu.sh -t t -n 1 -i 1 -c config/config.json
+sh run_gpu.sh -t t -n 1 -i 1
 ```

-Get the log and output files under the path `./train_mass_*/`, and the model file under the path assigned in the `config/config.json` file.
+Get the log and output files under the path `./train_mass_*/`, and the model file under the path assigned in the `default_config.yaml` file.

 ## Inference

 If you need to use the trained model to perform inference on multiple hardware platforms, such as GPU, Ascend 910 or Ascend 310, you can refer to this [Link](https://www.mindspore.cn/tutorial/training/en/master/advanced_use/migrate_3rd_scripts.html).
-For inference, config the options in `config.json` firstly:
+For inference, configure the options in `default_config.yaml` first:

-- Assign the `test_dataset` under `dataset_config` node to the dataset path.
-- Assign the `existed_ckpt` under `checkpoint_path` node to the model file produced by fine-tuning.
+- Assign the `test_dataset` in `default_config.yaml` to the dataset path.
+- Assign the `checkpoint_file_path` in `default_config.yaml` to the model file produced by fine-tuning.
 - Choose the optimizer('momentum/adam/lamb' is available).
-- Assign the `ckpt_prefix` and `ckpt_path` under `checkpoint_path` node to save the model files.
+- Assign the `ckpt_prefix` and `checkpoint_file_path` under `checkpoint_path` node to save the model files.
 - Set other arguments including dataset configurations and network configurations.

 If you use the ascend chip, run the shell script `run_ascend.sh` as follows:

 ```bash
-sh run_ascend.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile}
+sh run_ascend.sh -t i -n 1 -i 1 -o {outputfile}
 ```

 You can also run the shell script `run_gpu.sh` on gpu as follows:

 ```gpu
-sh run_gpu.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile}
+sh run_gpu.sh -t i -n 1 -i 1 -o {outputfile}
 ```

 ## Mindir Inference Process
@@ -596,7 +620,7 @@ sh run_gpu.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile}
 ### [Export MindIR](#contents)

 ```shell
-python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
+python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
 ```

 The checkpoint_file_path parameter is required,
@@ -675,7 +699,7 @@ The comparisons between MASS and other baseline methods in terms of PPL on Corne
 # Description of random situation

-MASS model contains dropout operations, if you want to disable dropout, please set related dropout_rate to 0 in `config/config.json`.
+MASS model contains dropout operations; if you want to disable dropout, please set the related dropout_rate to 0 in `default_config.yaml`.

 # others
@@ -80,6 +80,40 @@ MASS jointly pre-trains the encoder and decoder to accomplish language generation tasks.
 Second, by predicting the consecutive tokens of the decoder, it builds better language modeling ability than predicting discrete tokens only.
 Third, by further masking decoder input tokens that are not masked in the encoder, the decoder is encouraged to extract more useful information from the encoder side instead of relying on the rich information in the preceding tokens.

+To train the model on ModelArts, refer to the official ModelArts documentation (https://support.huaweicloud.com/modelarts/),
+then start training and inference as follows:
+
+```python
+# Example of running distributed training on ModelArts:
+# (1) Choose either a or b.
+#       a. Set "enable_modelarts=True" in the yaml file.
+#          Set the parameters the network needs in the yaml file.
+#       b. Add "enable_modelarts=True" on the ModelArts web UI.
+#          Set the parameters the network needs on the ModelArts web UI.
+# (2) Set the training task: "task=train".
+# (3) Set the code directory to "/path/mass" on the ModelArts web UI.
+# (4) Set the startup file to "train.py" on the ModelArts web UI.
+# (5) Set the "Dataset path", "Output file path" and "Job log path" on the ModelArts web UI.
+# (6) Start the training job.
+
+# Example of running inference on ModelArts:
+# (1) Upload the trained model to the corresponding location in the S3 bucket.
+# (2) Choose either a or b.
+#       a. Set "enable_modelarts=True" in the yaml file.
+#          Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" in the yaml file.
+#          Set "checkpoint_url=/The path of checkpoint in S3/" in the yaml file.
+#       b. Add "enable_modelarts=True" on the ModelArts web UI.
+#          Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the ModelArts web UI.
+#          Add "checkpoint_url=/The path of checkpoint in S3/" on the ModelArts web UI.
+# (3) Set the inference task "task=infer" and the vocab path.
+# (4) Set the code directory to "/path/mass" on the ModelArts web UI.
+# (5) Set the startup file to "eval.py" on the ModelArts web UI.
+# (6) Set the "Dataset path", "Output file path" and "Job log path" on the ModelArts web UI.
+# (7) Start the inference job.
+```

 ## Script description

 MASS script and code structure are as follows:
@@ -88,8 +122,13 @@ MASS script and code structure are as follows:
 ├── mass
   ├── README.md // Introduction of the MASS model.
   ├── config
-  │   ├──config.py // Configuration instance definition.
-  │   ├──config.json // Configuration file.
+  │   ├──config.py // Parameter configuration.
 ├── src
+  │   ├──model_utils
+  │   │   ├──config.py // Parameter configuration.
+  │   │   ├──device_adapter.py // Device adapter.
+  │   │   ├──local_adapter.py // Local adapter.
+  │   │   ├──moxing_adapter.py // ModelArts (moxing) adapter.
+  ├──src
   │   ├──dataset
   │   │   ├──bi_data_loader.py // Dataset loader for fine-tuning or inference.

@@ -133,6 +172,7 @@ MASS script and code structure are as follows:
 ├── requirements.txt // Third-party package requirements.
 ├── train.py // Training API entry.
 ├── eval.py // Inference API entry.
+├── default_config.yaml // Parameter configuration.
 ├── tokenize_corpus.py // Corpus tokenization.
 ├── apply_bpe_encoding.py // Apply BPE encoding.
 ├── weights_average.py // Average multiple model checkpoints into NPZ format.
@@ -333,7 +373,7 @@ python cornell_dialog.py --src_folder /{path}/cornell_dialog \
 The JSON file under the `config/` directory is the template configuration file.
 Almost all options and arguments can be assigned conveniently, including the training platform, dataset and model configurations, optimizer arguments, etc. Optional features such as loss scaling and checkpointing are also available by setting the corresponding options.
-For detailed information about the attributes, see the `config/config.py` file.
+For detailed information about the attributes, see the `default_config.yaml` file.

 ## Training & evaluation process
@@ -357,8 +397,7 @@ sh run_gpu.sh [--options]
 ```text
 Usage: run_ascend.sh [-h, --help] [-t, --task <CHAR>] [-n, --device_num <N>]
                      [-i, --device_id <N>] [-j, --hccl_json <FILE>]
-                     [-c, --config <FILE>] [-o, --output <FILE>]
-                     [-v, --vocab <FILE>]
+                     [-o, --output <FILE>] [-v, --vocab <FILE>]

 options:
     -h, --help show usage

@@ -366,7 +405,6 @@ options:
     -n, --device_num device number used for training: N, default is 1.
     -i, --device_id device id used for training with single device: N, 0<=N<=7, default is 0.
     -j, --hccl_json rank table file used for training with multiple devices: FILE.
-    -c, --config configuration file as shown in the path 'mass/config': FILE.
     -o, --output assign output file of inference: FILE.
     -v, --vocab set the vocabulary.
     -m, --metric set the metric.

@@ -378,15 +416,13 @@ options:
 ```text
 Usage: run_gpu.sh [-h, --help] [-t, --task <CHAR>] [-n, --device_num <N>]
-                  [-i, --device_id <N>] [-c, --config <FILE>]
-                  [-o, --output <FILE>] [-v, --vocab <FILE>]
+                  [-i, --device_id <N>] [-o, --output <FILE>] [-v, --vocab <FILE>]

 options:
     -h, --help show usage
     -t, --task select task: CHAR, 't' for train and 'i' for inference.
     -n, --device_num device number used for training: N, default is 1.
     -i, --device_id device id used for training with single device: N, 0<=N<=7, default is 0.
-    -c, --config configuration file as shown in the path 'mass/config': FILE.
     -o, --output assign output file of inference: FILE.
     -v, --vocab set the vocabulary.
     -m, --metric set the metric.
@@ -396,7 +432,7 @@ options:
 Ascend:

 ```ascend
-sh run_ascend.sh --task t --device_num 2 --hccl_json /{path}/rank_table.json --config /{path}/config.json
+sh run_ascend.sh --task t --device_num 2 --hccl_json /{path}/rank_table.json
 ```

 Note: `run_ascend.sh` does not support discontinuous device IDs yet; the device IDs in `rank_table.json` must start from 0.

@@ -404,20 +440,20 @@ sh run_ascend.sh --task t --device_num 2 --hccl_json /{path}/rank_table.json --c
 GPU:

 ```gpu
-sh run_gpu.sh --task t --device_num 2 --config /{path}/config.json
+sh run_gpu.sh --task t --device_num 2
 ```

 Run the following command for single-device training:
 Ascend:

 ```ascend
-sh run_ascend.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json
+sh run_ascend.sh --task t --device_num 1 --device_id 0
 ```

 GPU:

 ```gpu
-sh run_gpu.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json
+sh run_gpu.sh --task t --device_num 1 --device_id 0
 ```

 ## Weights average
@@ -426,16 +462,14 @@ sh run_gpu.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json
 python weights_average.py --input_files your_checkpoint_list --output_file model.npz
 ```

-`input_files` is the list of checkpoint files. To use `model.npz` as the weight file, add the path of `model.npz` at "existed_ckpt" in the `config.json` file.
+`input_files` is the list of checkpoint files. To use `model.npz` as the weight file, add the path of `model.npz` at "checkpoint_file_path" in the `default_config.yaml` file.

-```json
-{
-    ...
-    "checkpoint_options": {
-        "existed_ckpt": "/xxx/xxx/model.npz",
-        "save_ckpt_steps": 1000,
-        ...
-    },
-    ...
-}
+```yaml
+checkpoint_file_path: "/xxx/xxx/model.npz"
+save_ckpt_steps: 1000
 ```
@@ -451,10 +485,9 @@ python weights_average.py --input_files your_checkpoint_list --output_file model
 An example configuration for the polynomial decay scheduler:

-```json
-{
-    ...
-    "learn_rate_config": {
-        "optimizer": "adam",
-        "lr": 1e-4,
-        "lr_scheduler": "poly",

@@ -462,24 +495,21 @@ An example configuration for the polynomial decay scheduler:
-        "decay_steps": 10000,
-        "warmup_steps": 2000,
-        "min_lr": 1e-6
-    },
-    ...
-}
+```yaml
+optimizer: "adam"
+lr: 1e-4
+lr_scheduler: "poly"
+decay_steps: 10000
+warmup_steps: 2000
+min_lr: 1e-6
 ```

 An example configuration for the inverse square root scheduler:

-```json
-{
-    ...
-    "learn_rate_config": {
-        "optimizer": "adam",
-        "lr": 1e-4,
-        "lr_scheduler": "isr",
-        "decay_start_step": 12000,
-        "warmup_steps": 2000,
-        "min_lr": 1e-6
-    },
-    ...
-}
+```yaml
+optimizer: "adam"
+lr: 1e-4
+lr_scheduler: "isr"
+decay_start_step: 12000
+warmup_steps: 2000
+min_lr: 1e-6
 ```
@@ -515,79 +545,78 @@ MASS pre-trains a sequence-to-sequence model by predicting masked fragments of the input sequence.
 Here is a practice example demonstrating the basic usage of MASS for pre-training, fine-tuning, and inference. The steps are as follows:

 1. Download and process the dataset.
-2. Modify `config.json` to configure the network.
+2. Modify `default_config.yaml` to configure the network.
 3. Run the pre-training and fine-tuning tasks.
 4. Perform inference and validation.

 ## Pre-training

-To pre-train a model, first configure the options in `config.json`:
+To pre-train a model, first configure the options in `default_config.yaml`:

 - Set `pre_train_dataset` under the `dataset_config` node to the dataset path.
 - Choose the optimizer ('momentum/adam/lamb' are available).
-- Under `checkpoint_path`, set 'ckpt_prefix' and 'ckpt_path' to save the model files.
+- Under `checkpoint_path`, set 'ckpt_prefix' and 'checkpoint_file_path' to save the model files.
 - Configure other parameters, including the dataset and network configuration.
-- If you already have a trained model, set `existed_ckpt` to that checkpoint file.
+- If you already have a trained model, set `checkpoint_file_path` to that checkpoint file.

 On Ascend chips, run the shell script `run_ascend.sh`:

 ```ascend
-sh run_ascend.sh -t t -n 1 -i 1 -c /mass/config/config.json
+sh run_ascend.sh -t t -n 1 -i 1
 ```

 On GPU, run the shell script `run_gpu.sh`:

 ```gpu
-sh run_gpu.sh -t t -n 1 -i 1 -c /mass/config/config.json
+sh run_gpu.sh -t t -n 1 -i 1
 ```

-Logs and output files are available under `./train_mass_*/`; the model file is saved under the path specified in `config/config.json`.
+Logs and output files are available under `./train_mass_*/`; the model file is saved under the path specified in `default_config.yaml`.

 ## Fine-tuning

-To fine-tune a model, first configure the options in `config.json`:
+To fine-tune a model, first configure the options in `default_config.yaml`:

-- Set `fine_tune_dataset` under the `dataset_config` node to the dataset path.
-- Set `existed_ckpt` under the `checkpoint_path` node to the model file generated by pre-training.
+- Set `data_path` in `default_config.yaml` to the dataset path.
+- Set `checkpoint_file_path` in `default_config.yaml` to the model file generated by pre-training.
 - Choose the optimizer ('momentum/adam/lamb' are available).
-- Under `checkpoint_path`, set 'ckpt_prefix' and 'ckpt_path' to save the model files.
+- In `default_config.yaml`, set 'ckpt_prefix' and 'checkpoint_path' to save the model files.
 - Configure other parameters, including the dataset and network configuration.

 On Ascend chips, run the shell script `run_ascend.sh`:

 ```ascend
-sh run_ascend.sh -t t -n 1 -i 1 -c config/config.json
+sh run_ascend.sh -t t -n 1 -i 1
 ```

 On GPU, run the shell script `run_gpu.sh`:

 ```gpu
-sh run_gpu.sh -t t -n 1 -i 1 -c config/config.json
+sh run_gpu.sh -t t -n 1 -i 1
 ```

-Logs and output files are available under `./train_mass_*/`; the model file is saved under the path specified in `config/config.json`.
+Logs and output files are available under `./train_mass_*/`; the model file is saved under the path specified in `default_config.yaml`.

 ## Inference

 To use the trained model for inference on multiple hardware platforms such as GPU, Ascend 910, or Ascend 310, refer to this [link](https://www.mindspore.cn/tutorial/training/zh-CN/master/advanced_use/migrate_3rd_scripts.html).
-For inference, first configure the options in `config.json`:
+For inference, first configure the options in `default_config.yaml`:

-- Set `test_dataset` under the `dataset_config` node to the dataset path.
+- Set `data_path` in `default_config.yaml` to the dataset path.
 - Choose the optimizer ('momentum/adam/lamb' are available).
-- Under `checkpoint_path`, set 'ckpt_prefix' and 'ckpt_path' to save the model files.
+- Set 'ckpt_prefix' and 'ckpt_path' to save the model files.
 - Configure other parameters, including the dataset and network configuration.

 On Ascend chips, run the shell script `run_ascend.sh`:

 ```bash
-sh run_ascend.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile}
+sh run_ascend.sh -t i -n 1 -i 1 -o {outputfile}
 ```

 On GPU, run the shell script `run_gpu.sh`:

 ```gpu
-sh run_gpu.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile}
+sh run_gpu.sh -t i -n 1 -i 1 -o {outputfile}
 ```

 ## MindIR inference
@@ -595,10 +624,10 @@ sh run_gpu.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile}
 ### [Export the model](#contents)

 ```shell
-python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
+python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
 ```

-The ckpt_file parameter is required.
+The checkpoint_file_path parameter is required.
 `EXPORT_FORMAT` must be chosen from ["AIR", "MINDIR"].

 ### Run inference on Ascend 310
@@ -674,7 +703,7 @@ bash run_infer_310.sh [MINDIR_PATH] [CONFIG] [VOCAB] [OUTPUT] [NEED_PREPROCESS]
 # Description of random situation

-The MASS model involves dropout operations; to disable dropout, set the related dropout_rate to 0 in `config/config.json`.
+The MASS model involves dropout operations; to disable dropout, set the related dropout_rate to 0 in `default_config.yaml`.

 # Others
@@ -1,54 +0,0 @@
-{
-    "dataset_config": {
-        "epochs": 20,
-        "batch_size": 192,
-        "pre_train_dataset": "",
-        "fine_tune_dataset": "",
-        "test_dataset": "",
-        "valid_dataset": "",
-        "dataset_sink_mode": false,
-        "dataset_sink_step": 100
-    },
-    "model_config": {
-        "random_seed": 100,
-        "save_graphs": false,
-        "seq_length": 64,
-        "vocab_size": 45744,
-        "hidden_size": 1024,
-        "num_hidden_layers": 6,
-        "num_attention_heads": 8,
-        "intermediate_size": 4096,
-        "hidden_act": "relu",
-        "hidden_dropout_prob": 0.2,
-        "attention_dropout_prob": 0.2,
-        "max_position_embeddings": 64,
-        "initializer_range": 0.02,
-        "label_smoothing": 0.1,
-        "beam_width": 4,
-        "length_penalty_weight": 1.0,
-        "max_decode_length": 64
-    },
-    "loss_scale_config": {
-        "loss_scale_mode": "dynamic",
-        "init_loss_scale": 65536,
-        "loss_scale_factor": 2,
-        "scale_window": 200
-    },
-    "learn_rate_config": {
-        "optimizer": "adam",
-        "lr": 1e-4,
-        "lr_scheduler": "poly",
-        "poly_lr_scheduler_power": 0.5,
-        "decay_steps": 10000,
-        "decay_start_step": 12000,
-        "warmup_steps": 4000,
-        "min_lr": 1e-6
-    },
-    "checkpoint_options": {
-        "existed_ckpt": "",
-        "save_ckpt_steps": 2500,
-        "keep_ckpt_max": 50,
-        "ckpt_prefix": "ckpt",
-        "ckpt_path": "checkpoints"
-    }
-}
@@ -0,0 +1,94 @@
+# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
+enable_modelarts: False
+# Url for modelarts
+data_url: ""
+train_url: ""
+checkpoint_url: ""
+# Path for local
+run_distribute: False
+enable_profiling: False
+data_path: "/cache/data"
+output_path: "/cache/train"
+load_path: "/cache/checkpoint_path/"
+device_target: 'Ascend'
+checkpoint_path: './checkpoint/'
+checkpoint_file_path: ''
+
+# ==============================================================================
+# Training options
+task: "train"
+epochs: 20
+batch_size: 192
+dtype: float32         # only support float16 and float32
+compute_type: float16  # only support float16 and float32
+pre_train_dataset: ""
+fine_tune_dataset: ""
+test_dataset: ""
+valid_dataset: ""
+dataset_sink_mode: false
+dataset_sink_step: 100
+random_seed: 100
+save_graphs: false
+seq_length: 64
+vocab_size: 45744
+hidden_size: 1024
+num_hidden_layers: 6
+num_attention_heads: 8
+intermediate_size: 4096
+hidden_act: "relu"
+hidden_dropout_prob: 0.2
+attention_dropout_prob: 0.2
+max_position_embeddings: 64
+initializer_range: 0.02
+label_smoothing: 0.1
+beam_width: 4
+length_penalty_weight: 1.0
+max_decode_length: 64
+loss_scale_mode: "dynamic"
+init_loss_scale: 65536
+loss_scale_factor: 2
+scale_window: 200
+optimizer: "adam"
+lr: 0.0001
+lr_scheduler: "poly"
+poly_lr_scheduler_power: 0.5
+decay_steps: 10000
+decay_start_step: 12000
+warmup_steps: 4000
+min_lr: 0.000001
+existed_ckpt: ""
+save_ckpt_steps: 2500
+keep_ckpt_max: 50
+ckpt_prefix: "ckpt"
+metric: "rouge"
+vocab: ""
+output: ""
+
+# Export options
+device_id: 0
+ckpt_file: ""
+file_name: "mass"
+file_format: "AIR"
+vocab_file: ""
+result_path: "./preprocess_Result/"
+source_id_folder: ""
+target_id_folder: ""
+result_dir: "./result_Files"
+
+---
+# Help description for each configuration
+enable_modelarts: 'Whether training on modelarts, default: False'
+data_url: 'Dataset url for obs'
+train_url: 'Training output url for obs'
+checkpoint_url: 'The location of checkpoint for obs'
+data_path: 'Dataset path for local'
+output_path: 'Training output path for local'
+load_path: 'The location of checkpoint for obs'
+device_target: 'Target device type, available: [Ascend, GPU, CPU]'
+enable_profiling: 'Whether enable profiling while training, default: False'
+num_classes: 'Class for dataset'
+batch_size: "Batch size for training and evaluation"
+epoch_size: "Total training epochs."
+keep_checkpoint_max: "keep the last keep_checkpoint_max checkpoint"
+checkpoint_path: "The location of the checkpoint file."
+checkpoint_file_path: "The location of the checkpoint file."
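A hedged usage sketch: `src/model_utils/config.py` (added below) turns every top-level key of this yaml into a command-line flag, so any value can be overridden per run without editing the file. The flag values here are illustrative:

```python
# python train.py --batch_size 96 --lr 0.0002 --device_target GPU
from src.model_utils.config import config  # parsed default_config.yaml + CLI overrides

print(config.batch_size, config.lr, config.device_target)
```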
@@ -14,60 +14,52 @@
 # ============================================================================
 """Evaluation api."""
 import os
-import argparse
 import pickle

-from mindspore.common import dtype as mstype
 from mindspore import context

-from config import TransformerConfig
+from mindspore.common import dtype as mstype
 from src.transformer import infer, infer_ppl
 from src.utils import Dictionary
 from src.utils import get_score
+from src.model_utils.config import config
+from src.model_utils.moxing_adapter import moxing_wrapper

-parser = argparse.ArgumentParser(description='Evaluation MASS.')
-parser.add_argument("--config", type=str, required=True,
-                    help="Model config json file path.")
-parser.add_argument("--vocab", type=str, required=True,
-                    help="Vocabulary to use.")
-parser.add_argument("--output", type=str, required=True,
-                    help="Result file path.")
-parser.add_argument("--metric", type=str, default='rouge',
-                    help='Set eval method.')
-parser.add_argument("--platform", type=str, required=True,
-                    help="model working platform.")

-def get_config(config):
-    config = TransformerConfig.from_json_file(config)
-    config.compute_type = mstype.float16
-    config.dtype = mstype.float32
-    return config

-if __name__ == '__main__':
-    args, _ = parser.parse_known_args()
-    vocab = Dictionary.load_from_persisted_dict(args.vocab)
-    _config = get_config(args.config)
+def get_config():
+    if config.compute_type == "float16":
+        config.compute_type = mstype.float16
+    if config.compute_type == "float32":
+        config.compute_type = mstype.float32
+    if config.dtype == "float16":
+        config.dtype = mstype.float16
+    if config.dtype == "float32":
+        config.dtype = mstype.float32

+@moxing_wrapper()
+def eval_net():
+    """eval_net"""
+    vocab = Dictionary.load_from_persisted_dict(config.vocab)
+    get_config()
     device_id = os.getenv('DEVICE_ID', None)
     if device_id is None:
         device_id = 0
     device_id = int(device_id)
     context.set_context(
         mode=context.GRAPH_MODE,
-        device_target=args.platform,
+        device_target=config.device_target,
         reserve_class_name_in_scope=False,
         device_id=device_id)

-    if args.metric == 'rouge':
-        result = infer(_config)
+    if config.metric == 'rouge':
+        result = infer(config)
     else:
-        result = infer_ppl(_config)
+        result = infer_ppl(config)

-    with open(args.output, "wb") as f:
+    with open(config.output, "wb") as f:
         pickle.dump(result, f, 1)

     # get score by given metric
-    score = get_score(result, vocab, metric=args.metric)
+    score = get_score(result, vocab, metric=config.metric)
     print(score)

+if __name__ == '__main__':
+    eval_net()
@@ -14,7 +14,6 @@
 # ============================================================================
 """export checkpoint file into air models"""

-import argparse
 import numpy as np

 from mindspore import Tensor, context

@@ -23,33 +22,16 @@ from mindspore.train.serialization import export
 from src.utils import Dictionary
 from src.utils.load_weights import load_infer_weights
+from src.model_utils.config import config
 from src.transformer.transformer_for_infer import TransformerInferModel
-from config import TransformerConfig

-parser = argparse.ArgumentParser(description="mass export")
-parser.add_argument("--device_id", type=int, default=0, help="Device id")
-parser.add_argument("--file_name", type=str, default="mass", help="output file name.")
-parser.add_argument("--file_format", type=str, choices=["AIR", "ONNX", "MINDIR"], default="AIR", help="file format")
-parser.add_argument("--device_target", type=str, default="Ascend",
-                    choices=["Ascend", "GPU", "CPU"], help="device target (default: Ascend)")
-parser.add_argument('--gigaword_infer_config', type=str, required=True, help='gigaword config file')
-parser.add_argument('--vocab_file', type=str, required=True, help='vocabulary file')
-args = parser.parse_args()

-context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
-if args.device_target == "Ascend":
-    context.set_context(device_id=args.device_id)

-def get_config(config_file):
-    tfm_config = TransformerConfig.from_json_file(config_file)
-    tfm_config.compute_type = mstype.float16
-    tfm_config.dtype = mstype.float32
-
-    return tfm_config
+context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
+if config.device_target == "Ascend":
+    context.set_context(device_id=config.device_id)

 if __name__ == '__main__':
-    vocab = Dictionary.load_from_persisted_dict(args.vocab_file)
-    config = get_config(args.gigaword_infer_config)
+    vocab = Dictionary.load_from_persisted_dict(config.vocab_file)
     dec_len = config.max_decode_length

     tfm_model = TransformerInferModel(config=config, use_one_hot_embeddings=False)

@@ -84,4 +66,4 @@ if __name__ == '__main__':
     source_ids = Tensor(np.ones((1, config.seq_length)).astype(np.int32))
     source_mask = Tensor(np.ones((1, config.seq_length)).astype(np.int32))

-    export(tfm_model, source_ids, source_mask, file_name=args.file_name, file_format=args.file_format)
+    export(tfm_model, source_ids, source_mask, file_name=config.file_name, file_format=config.file_format)
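For reference, a hedged sketch of the export flow above reduced to its essentials; the real entry point builds `TransformerInferModel` and loads averaged weights first, so the final `export` call is shown commented out:

```python
import numpy as np
from mindspore import Tensor
from mindspore.train.serialization import export  # same import as export.py

seq_length = 64  # config.seq_length in default_config.yaml
source_ids = Tensor(np.ones((1, seq_length)).astype(np.int32))
source_mask = Tensor(np.ones((1, seq_length)).astype(np.int32))
# export(tfm_model, source_ids, source_mask, file_name="mass", file_format="MINDIR")
```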
@@ -14,34 +14,14 @@
 # ============================================================================
 """Evaluation api."""
 import os
-import argparse
 import pickle
 import numpy as np

-from config import TransformerConfig
+from src.model_utils.config import config
 from src.utils import Dictionary
 from src.utils import get_score

-parser = argparse.ArgumentParser(description='postprocess.')
-parser.add_argument("--config", type=str, required=True,
-                    help="Model config json file path.")
-parser.add_argument("--vocab", type=str, required=True,
-                    help="Vocabulary to use.")
-parser.add_argument("--output", type=str, required=True,
-                    help="Result file path.")
-parser.add_argument("--metric", type=str, default='rouge',
-                    help='Set eval method.')
-parser.add_argument("--source_id_folder", type=str, default='',
-                    help="source_eos_ids folder path.")
-parser.add_argument("--target_id_folder", type=str, default='',
-                    help="target_eos_ids folder path.")
-parser.add_argument("--result_dir", type=str, default='./result_Files',
-                    help="result dir path.")
-args, _ = parser.parse_known_args()

-def read_from_file(config):
+def read_from_file():
     '''
     Calculate accuracy.
     '''

@@ -49,16 +29,16 @@ def read_from_file():
     probs = []
     source_sentences = []
     target_sentences = []
-    file_num = len(os.listdir(args.source_id_folder))
+    file_num = len(os.listdir(config.source_id_folder))
     for i in range(file_num):
         f_name = "gigaword_bs_" + str(config.batch_size) + "_" + str(i)
-        source_ids = np.fromfile(os.path.join(args.source_id_folder, f_name + ".bin"), np.int32)
+        source_ids = np.fromfile(os.path.join(config.source_id_folder, f_name + ".bin"), np.int32)
         source_ids = source_ids.reshape(1, config.max_decode_length)
-        target_ids = np.fromfile(os.path.join(args.target_id_folder, f_name + ".bin"), np.int32)
+        target_ids = np.fromfile(os.path.join(config.target_id_folder, f_name + ".bin"), np.int32)
         target_ids = target_ids.reshape(1, config.max_decode_length)
-        predicted_ids = np.fromfile(os.path.join(args.result_dir, f_name + "_0.bin"), np.int32)
+        predicted_ids = np.fromfile(os.path.join(config.result_dir, f_name + "_0.bin"), np.int32)
         predicted_ids = predicted_ids.reshape(1, config.max_decode_length + 1)
-        entire_probs = np.fromfile(os.path.join(args.result_dir, f_name + "_1.bin"), np.float32)
+        entire_probs = np.fromfile(os.path.join(config.result_dir, f_name + "_1.bin"), np.float32)
         entire_probs = entire_probs.reshape(1, config.beam_width, config.max_decode_length + 1)

         source_sentences.append(source_ids)

@@ -87,13 +67,12 @@ def read_from_file():
 if __name__ == '__main__':
-    conf = TransformerConfig.from_json_file(args.config)
-    result = read_from_file(conf)
-    vocab = Dictionary.load_from_persisted_dict(args.vocab)
+    result = read_from_file()
+    vocab = Dictionary.load_from_persisted_dict(config.vocab)

-    with open(args.output, "wb") as f:
+    with open(config.output, "wb") as f:
         pickle.dump(result, f, 1)

     # get score by given metric
-    score = get_score(result, vocab, metric=args.metric)
+    score = get_score(result, vocab, metric=config.metric)
     print(score)
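For clarity, a hedged sketch of the `.bin` layout `read_from_file()` expects: flat int32/float32 dumps that are reshaped with the config values used above. File names follow the `gigaword_bs_<batch_size>_<i>` pattern from the diff; `batch_size=1` and `i=0` here are illustrative.

```python
import numpy as np

max_decode_length, beam_width = 64, 4
predicted_ids = np.fromfile("result_Files/gigaword_bs_1_0_0.bin", np.int32)
predicted_ids = predicted_ids.reshape(1, max_decode_length + 1)        # beam-search output ids
entire_probs = np.fromfile("result_Files/gigaword_bs_1_0_1.bin", np.float32)
entire_probs = entire_probs.reshape(1, beam_width, max_decode_length + 1)  # per-beam probabilities
```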
@@ -14,28 +14,19 @@
 # ============================================================================
 """Evaluation api."""
 import os
-import argparse
-from config import TransformerConfig
+from src.model_utils.config import config
 from src.dataset import load_dataset

-parser = argparse.ArgumentParser(description='preprocess.')
-parser.add_argument("--config", type=str, required=True,
-                    help="Model config json file path.")
-parser.add_argument("--result_path", type=str, default='./preprocess_Result/',
-                    help="preprocess result path.")
-args, _ = parser.parse_known_args()

 def generate_bin():
     '''
     Generate bin files.
     '''
-    config = TransformerConfig.from_json_file(args.config)
     ds = load_dataset(data_files=config.test_dataset,
                       batch_size=config.batch_size,
                       epoch_count=1,
                       sink_mode=config.dataset_sink_mode,
                       shuffle=False) if config.test_dataset else None
-    cur_dir = args.result_path
+    cur_dir = config.result_path
     source_eos_ids_path = os.path.join(cur_dir, "00_source_eos_ids")
     source_eos_mask_path = os.path.join(cur_dir, "01_source_eos_mask")
     target_eos_ids_path = os.path.join(cur_dir, "target_eos_ids")
@@ -32,7 +32,6 @@ echo_help()
     echo " -n --device_num training with N devices"
     echo " -i --device_id training with device i"
     echo " -j --hccl_json set the rank table file"
-    echo " -c --config set the configuration file"
    echo " -o --output set the output file of inference"
    echo " -v --vocab set the vocabulary"
    echo " -m --metric set the metric"

@@ -104,11 +103,6 @@ do
         export DEVICE_ID=$2
         shift 2
         ;;
-        -c|--config)
-            echo "config";
-            configurations=$2
-            shift 2
-            ;;
         -o|--output)
             echo "output";
             output=$2

@@ -153,7 +147,8 @@ do
     cp train.py ./${task}_mass_$DEVICE_ID
     cp eval.py ./${task}_mass_$DEVICE_ID
-    cp $configurations ./${task}_mass_$DEVICE_ID
     cp -r ./src ./${task}_mass_$DEVICE_ID
+    cp -r ./*.yaml ./${task}_mass_$DEVICE_ID

     if [ $vocab ]
     then

@@ -165,10 +160,10 @@ do
     echo $task
     if [ "$task" == "train" ]
     then
-        python train.py --config ${configurations##*/} --platform Ascend >>log.log 2>&1 &
+        python train.py --device_target Ascend --output_path './output' >>log.log 2>&1 &
     elif [ "$task" == "infer" ]
     then
-        python eval.py --config ${configurations##*/} --output ${output} --vocab ${vocab##*/} --metric ${metric} --platform Ascend >>log_infer.log 2>&1 &
+        python eval.py --output ${output} --vocab ${vocab##*/} --metric ${metric} --device_target Ascend >>log_infer.log 2>&1 &
     fi
     cd ../
 done
@@ -31,7 +31,6 @@ echo_help()
     echo " -t --task select task, 't' for training and 'i' for inference"
     echo " -n --device_num training with N devices"
     echo " -i --device_id training with device i"
-    echo " -c --config set the configuration file"
     echo " -o --output set the output file of inference"
     echo " -v --vocab set the vocabulary"
     echo " -m --metric set the metric"

@@ -87,11 +86,6 @@ do
         export DEVICE_ID=$2
         shift 2
         ;;
-        -c|--config)
-            echo "config";
-            configurations=$2
-            shift 2
-            ;;
         -o|--output)
             echo "output";
             output=$2

@@ -132,7 +126,8 @@ mkdir ./${task}_mass_$DEVICE_ID
 cp train.py ./${task}_mass_$DEVICE_ID
 cp eval.py ./${task}_mass_$DEVICE_ID
-cp $configurations ./${task}_mass_$DEVICE_ID
 cp -r ./src ./${task}_mass_$DEVICE_ID
+cp -r ./*.yaml ./${task}_mass_$DEVICE_ID

 if [ $vocab ]
 then

@@ -147,13 +142,13 @@ then
 if [ $RANK_SIZE -gt 1 ]
 then
     mpirun -n $RANK_SIZE --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
-    python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 &
+    python train.py --device_target GPU --output_path './output' >>log.log 2>&1 &
 else
-    python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 &
+    python train.py --device_target GPU --output_path './output' >>log.log 2>&1 &
 fi
 elif [ "$task" == "infer" ]
 then
-    python eval.py --config ${configurations##*/} --output ${output} --vocab ${vocab##*/} --metric ${metric} --platform GPU >>log_infer.log 2>&1 &
+    python eval.py --output ${output} --vocab ${vocab##*/} --metric ${metric} --device_target GPU >>log_infer.log 2>&1 &
 fi
 cd ../
@@ -0,0 +1,125 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Parse arguments"""
+
+import os
+import ast
+import argparse
+from pprint import pprint, pformat
+import yaml
+
+_config_path = "./default_config.yaml"
+
+
+class Config:
+    """
+    Configuration namespace. Convert dictionary to members.
+    """
+    def __init__(self, cfg_dict):
+        for k, v in cfg_dict.items():
+            if isinstance(v, (list, tuple)):
+                setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v])
+            else:
+                setattr(self, k, Config(v) if isinstance(v, dict) else v)
+
+    def __str__(self):
+        return pformat(self.__dict__)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
+    """
+    Parse command line arguments to the configuration according to the default yaml.
+
+    Args:
+        parser: Parent parser.
+        cfg: Base configuration.
+        helper: Helper description.
+        choices: Allowed values per argument, if any.
+        cfg_path: Path to the default yaml config.
+    """
+    parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
+                                     parents=[parser])
+    helper = {} if helper is None else helper
+    choices = {} if choices is None else choices
+    for item in cfg:
+        if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict):
+            help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path)
+            choice = choices[item] if item in choices else None
+            if isinstance(cfg[item], bool):
+                # booleans are parsed with ast.literal_eval so "--flag True/False" works
+                parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice,
+                                    help=help_description)
+            else:
+                parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice,
+                                    help=help_description)
+    args = parser.parse_args()
+    return args
+
+
+def parse_yaml(yaml_path):
+    """
+    Parse the yaml config file.
+
+    Args:
+        yaml_path: Path to the yaml config.
+    """
+    with open(yaml_path, 'r') as fin:
+        try:
+            cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader)
+            cfgs = [x for x in cfgs]
+            if len(cfgs) == 1:
+                cfg_helper = {}
+                cfg = cfgs[0]
+            elif len(cfgs) == 2:
+                cfg, cfg_helper = cfgs
+            else:
+                raise ValueError("At most 2 docs (config and help description for help) are supported in config yaml")
+            print(cfg_helper)
+        except yaml.YAMLError as err:
+            raise ValueError("Failed to parse yaml") from err
+    return cfg, cfg_helper
+
+
+def merge(args, cfg):
+    """
+    Merge the base config from yaml file and command line arguments.
+
+    Args:
+        args: Command line arguments.
+        cfg: Base configuration.
+    """
+    args_var = vars(args)
+    for item in args_var:
+        cfg[item] = args_var[item]
+    return cfg
+
+
+def get_config():
+    """
+    Get Config according to the yaml file and cli arguments.
+    """
+    parser = argparse.ArgumentParser(description="default name", add_help=False)
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../../default_config.yaml"),
+                        help="Config file path")
+    path_args, _ = parser.parse_known_args()
+    default, helper = parse_yaml(path_args.config_path)
+    pprint(default)
+    # pass cfg_path by keyword so it is not mistaken for the `choices` argument
+    args = parse_cli_to_yaml(parser, default, helper, cfg_path=path_args.config_path)
+    final_config = merge(args, default)
+    return Config(final_config)
+
+
+config = get_config()
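A quick usage sketch of the `Config` namespace above, assuming it is imported from `src.model_utils.config` (note that importing the module also builds the global `config` from default_config.yaml and `sys.argv`):

```python
from src.model_utils.config import Config

cfg = Config({"lr": 1e-4, "model": {"hidden_size": 1024, "layers": 6}})
assert cfg.model.hidden_size == 1024  # nested dicts become attribute access
print(cfg)                            # pformat of the attribute dict
```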
@@ -0,0 +1,27 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Device adapter for ModelArts"""
+
+from src.model_utils.config import config
+
+if config.enable_modelarts:
+    from src.model_utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
+else:
+    from src.model_utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
+
+__all__ = [
+    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"
+]
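A hedged usage sketch: the adapter picks the ModelArts (moxing) or local helpers at import time based on `config.enable_modelarts`, so callers stay agnostic of where the job runs.

```python
from src.model_utils.device_adapter import get_device_id, get_device_num

device_id = get_device_id()    # DEVICE_ID env var in the local case
device_num = get_device_num()  # RANK_SIZE env var in the local case
print(device_id, device_num)
```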
@@ -0,0 +1,36 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Local adapter"""

import os


def get_device_id():
    device_id = os.getenv('DEVICE_ID', '0')
    return int(device_id)


def get_device_num():
    device_num = os.getenv('RANK_SIZE', '1')
    return int(device_num)


def get_rank_id():
    global_rank_id = os.getenv('RANK_ID', '0')
    return int(global_rank_id)


def get_job_id():
    return "Local Job"
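For reference, a sketch of the environment-variable contract these helpers assume (the exported values are illustrative; launch scripts normally set them):

import os
from src.model_utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id

os.environ['DEVICE_ID'] = '3'    # illustrative values
os.environ['RANK_SIZE'] = '8'
os.environ['RANK_ID'] = '3'

assert get_device_id() == 3          # parsed as int, default '0'
assert get_device_num() == 8         # default '1'
assert get_rank_id() == 3            # default '0'
assert get_job_id() == "Local Job"   # fixed placeholder outside ModelArts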
@@ -0,0 +1,115 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Moxing adapter for ModelArts"""

import os
import functools
from mindspore import context
from src.model_utils.config import config

_global_sync_count = 0


def get_device_id():
    device_id = os.getenv('DEVICE_ID', '0')
    return int(device_id)


def get_device_num():
    device_num = os.getenv('RANK_SIZE', '1')
    return int(device_num)


def get_rank_id():
    global_rank_id = os.getenv('RANK_ID', '0')
    return int(global_rank_id)


def get_job_id():
    job_id = os.getenv('JOB_ID')
    job_id = job_id if job_id else "default"
    return job_id


def sync_data(from_path, to_path):
    """
    Download data from remote obs to a local directory if the first url is a remote obs url
    and the second one is a local path; otherwise, upload data from the local directory to remote obs.
    """
    import moxing as mox
    import time
    global _global_sync_count
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains at most 8 devices.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            os.mknod(sync_lock)
        except IOError:
            pass
        print("===save flag===")

    while True:
        if os.path.exists(sync_lock):
            break
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))


def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                if not os.path.exists(config.output_path):
                    os.makedirs(config.output_path)

                if pre_process:
                    pre_process()

            run_func(*args, **kwargs)

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
        return wrapped_func
    return wrapper
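A minimal sketch of how the decorator is meant to wrap a training entry point (the function and hook below are hypothetical; the real call sites appear in the train.py hunks that follow):

from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper

def _preprocess():
    # hypothetical hook, e.g. unpack an archive under config.data_path
    pass

@moxing_wrapper(pre_process=_preprocess)
def run_train():
    # When config.enable_modelarts is set, inputs have already been synced
    # from OBS by sync_data(); anything written to config.output_path is
    # synced back to config.train_url after this function returns.
    pass

if __name__ == '__main__':
    run_train()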
@@ -13,6 +13,7 @@
# limitations under the License.
# ============================================================================
"""Infer api."""
import os
import time

import mindspore.nn as nn

@@ -143,11 +144,16 @@ def infer(config):
    Returns:
        list, result with
    """
    if config.enable_modelarts:
        config.test_dataset = os.path.join(config.data_path, \
            "tfrecords/gigaword_new_prob/gigaword_test_dataset.tfrecord-001-of-001")
    else:
        config.test_dataset = os.path.join(config.data_path, "gigaword_test_dataset.tfrecord-001-of-001")
    eval_dataset = load_dataset(data_files=config.test_dataset,
                                batch_size=config.batch_size,
                                epoch_count=1,
                                sink_mode=config.dataset_sink_mode,
                                shuffle=False) if config.test_dataset else None
                                shuffle=False) if config.data_path else None
    prediction = transformer_infer(config, eval_dataset)
    return prediction

@@ -269,10 +275,15 @@ def infer_ppl(config):
    Returns:
        list, result with
    """
    if config.enable_modelarts:
        config.test_dataset = os.path.join(config.data_path, \
            "tfrecords/gigaword_new_prob/gigaword_test_dataset.tfrecord-001-of-001")
    else:
        config.test_dataset = os.path.join(config.data_path, "gigaword_test_dataset.tfrecord-001-of-001")
    eval_dataset = load_dataset(data_files=config.test_dataset,
                                batch_size=config.batch_size,
                                epoch_count=1,
                                sink_mode=config.dataset_sink_mode,
                                shuffle=False) if config.test_dataset else None
                                shuffle=False) if config.data_path else None
    prediction = transformer_infer_ppl(config, eval_dataset)
    return prediction
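The branch added above can be read as a small path-resolution helper (a sketch only; the tfrecords/gigaword_new_prob prefix is the OBS layout this commit hard-codes for ModelArts):

import os

def resolve_test_dataset(data_path, enable_modelarts):
    # On ModelArts the dataset keeps its OBS directory layout under data_path;
    # locally the tfrecord file is expected directly under data_path.
    name = "gigaword_test_dataset.tfrecord-001-of-001"
    if enable_modelarts:
        return os.path.join(data_path, "tfrecords/gigaword_new_prob", name)
    return os.path.join(data_path, name)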
@@ -27,7 +27,7 @@ def load_infer_weights(config):
    Returns:
        dict, weights.
    """
    model_path = config.existed_ckpt
    model_path = config.checkpoint_file_path
    if model_path.endswith(".npz"):
        ms_ckpt = np.load(model_path)
        is_npz = True
@ -14,7 +14,6 @@
|
|||
# ============================================================================
|
||||
"""Train api."""
|
||||
import os
|
||||
import argparse
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
|
@ -32,7 +31,6 @@ from mindspore.communication import management as MultiAscend
|
|||
from mindspore.train.serialization import load_checkpoint
|
||||
from mindspore.common import set_seed
|
||||
|
||||
from config import TransformerConfig
|
||||
from src.dataset import load_dataset
|
||||
from src.transformer import TransformerNetworkWithLoss, TransformerTrainOneStepWithLossScaleCell
|
||||
from src.transformer.infer_mass import infer
|
||||
|
@ -40,27 +38,26 @@ from src.utils import LossCallBack
|
|||
from src.utils import one_weight, zero_weight, weight_variable
|
||||
from src.utils import square_root_schedule
|
||||
from src.utils.lr_scheduler import polynomial_decay_scheduler, BertLearningRate
|
||||
from src.model_utils.config import config
|
||||
from src.model_utils.moxing_adapter import moxing_wrapper
|
||||
|
||||
parser = argparse.ArgumentParser(description='MASS train entry point.')
|
||||
parser.add_argument("--config", type=str, required=True, help="model config json file path.")
|
||||
parser.add_argument("--platform", type=str, required=True, help="model working platform.")
|
||||
def get_config():
|
||||
if config.compute_type == "float16":
|
||||
config.compute_type = mstype.float16
|
||||
if config.compute_type == "float32":
|
||||
config.compute_type = mstype.float32
|
||||
if config.dtype == "float16":
|
||||
config.dtype = mstype.float16
|
||||
if config.dtype == "float32":
|
||||
config.dtype = mstype.float32
|
||||
|
||||
def get_config(config):
|
||||
config = TransformerConfig.from_json_file(config)
|
||||
config.compute_type = mstype.float16
|
||||
config.dtype = mstype.float32
|
||||
return config
|
||||
|
||||
|
||||
def _train(model, config: TransformerConfig,
|
||||
pre_training_dataset=None, fine_tune_dataset=None, test_dataset=None,
|
||||
def _train(model, pre_training_dataset=None, fine_tune_dataset=None, test_dataset=None,
|
||||
callbacks: list = None):
|
||||
"""
|
||||
Train model.
|
||||
|
||||
Args:
|
||||
model (Model): MindSpore model instance.
|
||||
config (TransformerConfig): Config of mass model.
|
||||
pre_training_dataset (Dataset): Pre-training dataset.
|
||||
fine_tune_dataset (Dataset): Fine-tune dataset.
|
||||
test_dataset (Dataset): Test dataset.
|
||||
|
@ -81,7 +78,7 @@ def _train(model, config: TransformerConfig,
|
|||
# Test the accuracy of the model.
|
||||
if test_dataset is not None:
|
||||
print(" | Start test job.")
|
||||
result = infer(_config)
|
||||
result = infer(config)
|
||||
with open("validation_res_after_pre_training.bin", "wb") as f:
|
||||
pickle.dump(result, f, 1)
|
||||
|
||||
|
@ -95,18 +92,18 @@ def _train(model, config: TransformerConfig,
|
|||
# Test the accuracy of the model.
|
||||
if test_dataset is not None:
|
||||
print(" | Start test job.")
|
||||
result = infer(_config)
|
||||
result = infer(config)
|
||||
with open("validation_res_after_pre_training.bin", "wb") as f:
|
||||
pickle.dump(result, f, 1)
|
||||
|
||||
|
||||
def _load_checkpoint_to_net(config, network):
|
||||
def _load_checkpoint_to_net(network):
|
||||
"""load parameters to network from checkpoint."""
|
||||
if config.existed_ckpt:
|
||||
if config.existed_ckpt.endswith(".npz"):
|
||||
weights = np.load(config.existed_ckpt)
|
||||
if config.checkpoint_file_path:
|
||||
if config.checkpoint_file_path.endswith(".npz"):
|
||||
weights = np.load(config.checkpoint_file_path)
|
||||
else:
|
||||
weights = load_checkpoint(config.existed_ckpt)
|
||||
weights = load_checkpoint(config.checkpoint_file_path)
|
||||
for param in network.trainable_params():
|
||||
weights_name = param.name
|
||||
if weights_name not in weights:
|
||||
|
@ -133,7 +130,7 @@ def _load_checkpoint_to_net(config, network):
|
|||
param.set_data(weight_variable(value.asnumpy().shape))
|
||||
|
||||
|
||||
def _get_lr(config, update_steps):
|
||||
def _get_lr(update_steps):
|
||||
"""generate learning rate."""
|
||||
if config.lr_scheduler == "isr":
|
||||
lr = Tensor(square_root_schedule(lr=config.lr,
|
||||
|
@ -153,7 +150,7 @@ def _get_lr(config, update_steps):
|
|||
return lr
|
||||
|
||||
|
||||
def _get_optimizer(config, network, lr):
|
||||
def _get_optimizer(network, lr):
|
||||
"""get mass optimizer, support Adam, Lamb, Momentum."""
|
||||
if config.optimizer.lower() == "adam":
|
||||
optimizer = Adam(network.trainable_params(), lr, beta1=0.9, beta2=0.98)
|
||||
|
@ -175,8 +172,7 @@ def _get_optimizer(config, network, lr):
|
|||
return optimizer
|
||||
|
||||
|
||||
def _build_training_pipeline(config: TransformerConfig,
|
||||
pre_training_dataset=None,
|
||||
def _build_training_pipeline(pre_training_dataset=None,
|
||||
fine_tune_dataset=None,
|
||||
test_dataset=None,
|
||||
platform="Ascend"):
|
||||
|
@ -184,14 +180,13 @@ def _build_training_pipeline(config: TransformerConfig,
|
|||
Build training pipeline.
|
||||
|
||||
Args:
|
||||
config (TransformerConfig): Config of mass model.
|
||||
pre_training_dataset (Dataset): Pre-training dataset.
|
||||
fine_tune_dataset (Dataset): Fine-tune dataset.
|
||||
test_dataset (Dataset): Test dataset.
|
||||
"""
|
||||
net_with_loss = TransformerNetworkWithLoss(config, is_training=True)
|
||||
net_with_loss.init_parameters_data()
|
||||
_load_checkpoint_to_net(config, net_with_loss)
|
||||
_load_checkpoint_to_net(net_with_loss)
|
||||
|
||||
dataset = pre_training_dataset if pre_training_dataset is not None \
|
||||
else fine_tune_dataset
|
||||
|
@ -201,9 +196,9 @@ def _build_training_pipeline(config: TransformerConfig,
|
|||
|
||||
update_steps = config.epochs * dataset.get_dataset_size()
|
||||
|
||||
lr = _get_lr(config, update_steps)
|
||||
lr = _get_lr(update_steps)
|
||||
|
||||
optimizer = _get_optimizer(config, net_with_loss, lr)
|
||||
optimizer = _get_optimizer(net_with_loss, lr)
|
||||
|
||||
# loss scale.
|
||||
if config.loss_scale_mode == "dynamic":
|
||||
|
@ -223,27 +218,28 @@ def _build_training_pipeline(config: TransformerConfig,
|
|||
rank_size = os.getenv('RANK_SIZE')
|
||||
callbacks = []
|
||||
callbacks.append(time_cb)
|
||||
ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path)
|
||||
if rank_size is not None and int(rank_size) > 1:
|
||||
loss_monitor = LossCallBack(config, rank_id=MultiAscend.get_rank())
|
||||
callbacks.append(loss_monitor)
|
||||
if MultiAscend.get_rank() % 8 == 0:
|
||||
ckpt_callback = ModelCheckpoint(
|
||||
prefix=config.ckpt_prefix,
|
||||
directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(MultiAscend.get_rank())),
|
||||
directory=os.path.join(ckpt_save_dir, 'ckpt_{}'.format(MultiAscend.get_rank())),
|
||||
config=ckpt_config)
|
||||
callbacks.append(ckpt_callback)
|
||||
|
||||
if rank_size is None or int(rank_size) == 1:
|
||||
ckpt_callback = ModelCheckpoint(
|
||||
prefix=config.ckpt_prefix,
|
||||
directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
|
||||
directory=os.path.join(ckpt_save_dir, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
|
||||
config=ckpt_config)
|
||||
loss_monitor = LossCallBack(config, rank_id=os.getenv('DEVICE_ID'))
|
||||
callbacks.append(loss_monitor)
|
||||
callbacks.append(ckpt_callback)
|
||||
|
||||
print(f" | ALL SET, PREPARE TO TRAIN.")
|
||||
_train(model=model, config=config,
|
||||
_train(model=model,
|
||||
pre_training_dataset=pre_training_dataset,
|
||||
fine_tune_dataset=fine_tune_dataset,
|
||||
test_dataset=test_dataset,
|
||||
|
@ -260,17 +256,21 @@ def _setup_parallel_env(platform):
|
|||
)
|
||||
|
||||
|
||||
def train_parallel(config: TransformerConfig, platform: "Ascend"):
|
||||
@moxing_wrapper()
|
||||
def train_parallel(platform: "Ascend"):
|
||||
"""
|
||||
Train model with multi ascend chips.
|
||||
|
||||
Args:
|
||||
config (TransformerConfig): Config for MASS model.
|
||||
"""
|
||||
_setup_parallel_env(platform)
|
||||
|
||||
print(f" | Starting training on {os.getenv('RANK_SIZE', None)} devices.")
|
||||
|
||||
if config.task == "train":
|
||||
filenames = os.listdir(config.data_path)
|
||||
config.fine_tune_dataset = [os.path.join(config.data_path, filename) for filename in filenames]
|
||||
else:
|
||||
config.test_dataset = os.path.join(config.data_path, "gigaword_test_dataset.tfrecord-001-of-001")
|
||||
pre_train_dataset = load_dataset(
|
||||
data_files=config.pre_train_dataset,
|
||||
batch_size=config.batch_size, epoch_count=1,
|
||||
|
@ -296,21 +296,23 @@ def train_parallel(config: TransformerConfig, platform: "Ascend"):
|
|||
rank_id=MultiAscend.get_rank()
|
||||
) if config.test_dataset else None
|
||||
|
||||
_build_training_pipeline(config=config,
|
||||
pre_training_dataset=pre_train_dataset,
|
||||
_build_training_pipeline(pre_training_dataset=pre_train_dataset,
|
||||
fine_tune_dataset=fine_tune_dataset,
|
||||
test_dataset=test_dataset,
|
||||
platform=platform)
|
||||
|
||||
|
||||
def train_single(config: TransformerConfig, platform: "Ascend"):
|
||||
@moxing_wrapper()
|
||||
def train_single(platform: "Ascend"):
|
||||
"""
|
||||
Train model on single device.
|
||||
|
||||
Args:
|
||||
config (TransformerConfig): Config for model.
|
||||
"""
|
||||
print(" | Starting training on single device.")
|
||||
if config.task == "train":
|
||||
filenames = os.listdir(config.data_path)
|
||||
config.fine_tune_dataset = [os.path.join(config.data_path, filename) for filename in filenames]
|
||||
else:
|
||||
config.test_dataset = os.path.join(config.data_path, "gigaword_test_dataset.tfrecord-001-of-001")
|
||||
pre_train_dataset = load_dataset(data_files=config.pre_train_dataset,
|
||||
batch_size=config.batch_size,
|
||||
epoch_count=1,
|
||||
|
@ -327,43 +329,30 @@ def train_single(config: TransformerConfig, platform: "Ascend"):
|
|||
sink_mode=config.dataset_sink_mode,
|
||||
sink_step=config.dataset_sink_step) if config.test_dataset else None
|
||||
|
||||
_build_training_pipeline(config=config,
|
||||
pre_training_dataset=pre_train_dataset,
|
||||
_build_training_pipeline(pre_training_dataset=pre_train_dataset,
|
||||
fine_tune_dataset=fine_tune_dataset,
|
||||
test_dataset=test_dataset,
|
||||
platform=platform)
|
||||
|
||||
|
||||
def _check_args(config):
|
||||
if not os.path.exists(config):
|
||||
raise FileNotFoundError("`config` is not existed.")
|
||||
if not isinstance(config, str):
|
||||
raise ValueError("`config` must be type of str.")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
device_id = os.getenv('DEVICE_ID', None)
|
||||
if device_id is None:
|
||||
device_id = 0
|
||||
device_id = int(device_id)
|
||||
context.set_context(
|
||||
mode=context.GRAPH_MODE,
|
||||
device_target=args.platform,
|
||||
device_target=config.device_target,
|
||||
reserve_class_name_in_scope=False,
|
||||
device_id=device_id,
|
||||
max_call_depth=2000)
|
||||
|
||||
_rank_size = os.getenv('RANK_SIZE')
|
||||
|
||||
_check_args(args.config)
|
||||
_config = get_config(args.config)
|
||||
|
||||
set_seed(_config.random_seed)
|
||||
context.set_context(save_graphs=_config.save_graphs)
|
||||
get_config()
|
||||
set_seed(config.random_seed)
|
||||
context.set_context(save_graphs=config.save_graphs)
|
||||
|
||||
if _rank_size is not None and int(_rank_size) > 1:
|
||||
train_parallel(_config, args.platform)
|
||||
train_parallel(config.device_target)
|
||||
else:
|
||||
train_single(_config, args.platform)
|
||||
train_single(config.device_target)
|
||||
|
|
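Condensed, the reworked entry point now behaves like this sketch (the function names match the code above; the launch command in the comment is an assumption):

# e.g. launched as: python train.py --config_path=/path/to/default_config.yaml
import os

def main():
    get_config()                 # map the yaml dtype strings onto mstype values in place
    rank_size = os.getenv('RANK_SIZE')
    if rank_size is not None and int(rank_size) > 1:
        train_parallel(config.device_target)   # multi-chip path; @moxing_wrapper syncs OBS data first
    else:
        train_single(config.device_target)     # single-device path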