This commit is contained in:
huchunmei 2021-06-28 15:28:03 +08:00
parent d919560902
commit 2c4c157ae5
36 changed files with 3180 additions and 215 deletions

View File

@ -18,12 +18,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import ast
import collections
import logging
import numpy as np
import src.tokenization as tokenization
from src.model_utils.config import config
from mindspore.mindrecord import FileWriter
class SampleInstance():
@ -121,33 +120,17 @@ def create_training_instance(source_words, target_words, max_seq_length, clip_to
return instance
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--input_file", type=str, required=True,
help='Input raw text file (or comma-separated list of files).')
parser.add_argument("--output_file", type=str, required=True, help='Output MindRecord file.')
parser.add_argument("--num_splits", type=int, default=16,
help='The MindRecord file will be split into the number of partition.')
parser.add_argument("--vocab_file", type=str, required=True,
help='The vocabulary file that the Transformer model was trained on.')
parser.add_argument("--clip_to_max_len", type=bool, default=False,
help='clip sequences to maximum sequence length.')
parser.add_argument("--max_seq_length", type=int, default=128, help='Maximum sequence length.')
parser.add_argument("--bucket", type=ast.literal_eval, default=[16, 32, 48, 64, 128],
help='bucket sequence length')
args = parser.parse_args()
tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file)
tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=config.vocab_file)
input_files = []
for input_pattern in args.input_file.split(","):
for input_pattern in config.input_file.split(","):
input_files.append(input_pattern)
logging.info("*** Read from input files ***")
for input_file in input_files:
logging.info(" %s", input_file)
output_file = args.output_file
output_file = config.output_file
logging.info("*** Write to output files ***")
logging.info(" %s", output_file)
@ -155,7 +138,7 @@ def main():
total_read = 0
feature_dict = {}
for i in args.bucket:
for i in config.bucket:
feature_dict[i] = []
for input_file in input_files:
@ -174,17 +157,17 @@ def main():
source_tokens = tokenizer.tokenize(source_line)
target_tokens = tokenizer.tokenize(target_line)
if len(source_tokens) >= args.max_seq_length or len(target_tokens) >= args.max_seq_length:
if len(source_tokens) >= config.max_seq_length or len(target_tokens) >= config.max_seq_length:
logging.info("ignore long sentence!")
continue
instance = create_training_instance(source_tokens, target_tokens, args.max_seq_length,
clip_to_max_len=args.clip_to_max_len)
instance = create_training_instance(source_tokens, target_tokens, config.max_seq_length,
clip_to_max_len=config.clip_to_max_len)
if instance is None:
continue
features, seq_max_bucket_length = get_instance_features(instance, tokenizer, args.max_seq_length,
args.bucket)
features, seq_max_bucket_length = get_instance_features(instance, tokenizer, config.max_seq_length,
config.bucket)
for key in feature_dict:
if key == seq_max_bucket_length:
feature_dict[key].append(features)
@ -200,12 +183,12 @@ def main():
feature = features[feature_name]
logging.info("%s: %s", feature_name, feature)
for i in args.bucket:
if args.num_splits == 1:
for i in config.bucket:
if config.num_splits == 1:
output_file_name = output_file
else:
output_file_name = output_file + '_' + str(i) + '_'
writer = FileWriter(output_file_name, args.num_splits)
writer = FileWriter(output_file_name, config.num_splits)
data_schema = {"source_sos_ids": {"type": "int64", "shape": [-1]},
"source_sos_mask": {"type": "int64", "shape": [-1]},
"source_eos_ids": {"type": "int64", "shape": [-1]},

View File

@ -0,0 +1,133 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
data_url: ""
train_url: ""
checkpoint_url: ""
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
checkpoint_path: ''
device_target: Ascend
enable_profiling: False
# ==============================================================================
# config/cfg edict
transformer_network: 'base'
init_loss_scale_value: 1024
scale_factor: 2
scale_window: 2000
optimizer: 'Adam'
optimizer_adam_beta2: 0.997
# lr_schedule: edict({'learning_rate': 2.0, 'warmup_steps': 8000, 'start_decay_step': 16000, 'min_lr': 0.0,})
# transformer_net_cfg
batch_size: 96
seq_length: 128
vocab_size: 36560
hidden_size: 512
num_hidden_layers: 6
num_attention_heads: 8
intermediate_size: 2048
hidden_act: "relu"
hidden_dropout_prob: 0.2
attention_probs_dropout_prob: 0.2
max_position_embeddings: 128
initializer_range: 0.02
label_smoothing: 0.1
dtype: mstype.float32
compute_type: mstype.float16
#eval_config/cfg edict
data_file: '/cache/data'
model_file: './transformer/transformer_trained.ckpt'
output_file: './output_eval.txt'
# transformer_net_cfg
batch_size_ev: 1
hidden_dropout_prob_ev: 0.0
attention_probs_dropout_prob_ev: 0.0
beam_width: 4
max_decode_length: 80
length_penalty_weight: 1.0
# ==============================================================================
# train.py / Argparse init.
distribute: "false"
epoch_size: 52
device_id: 0
device_num: 1
enable_lossscale: "true"
do_shuffle: "true"
enable_save_ckpt: "true"
save_checkpoint_steps: 2500
save_checkpoint_num: 30
save_checkpoint_path: "./"
bucket_boundaries: [16, 32, 48, 64, 128]
accumulation_steps: 1
# export.py /eval_config - transformer export
file_name: "transformer"
file_format: 'AIR'
#'postprocess / from eval_config'
result_dir: "./result_Files"
#'preprocess / from eval_config'
result_path: "./preprocess_Result/"
# src/process_output.py "rescore nbest with smoothed sentence-level bleu."
vocab_file: ""
# create_data.py
input_file: ''
num_splits: 16
clip_to_max_len: False
max_seq_length: 128
bucket: [16, 32, 48, 64, 128]
---
# Config description for each option
enable_modelarts: 'Whether to run training on ModelArts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
data_path: 'Local dataset path; an absolute path is preferred'
output_path: 'Local training output path'
ann_file: 'Ann file, default is val.json.'
device_target: "device where the code will be implemented, default is Ascend"
checkpoint_path: "Checkpoint file path"
data_file: '/your/path/evaluation.mindrecord'
model_file: '/your/path/checkpoint_file'
output_file: './output_eval.txt'
distribute: "Run distribute, default is false."
epoch_size: "Epoch size, default is 52."
device_id: "Device id, default is 0."
device_num: "Use device nums, default is 1."
enable_lossscale: "Use lossscale or not, default is true."
do_shuffle: "Enable shuffle for dataset, default is true."
enable_save_ckpt: "Enable save checkpoint, default is true."
save_checkpoint_steps: "Save checkpoint steps, default is 2500."
save_checkpoint_num: "Save checkpoint numbers, default is 30."
save_checkpoint_path: "Save checkpoint file path"
bucket_boundaries: "sequence length for different bucket"
accumulation_steps: "Gradient accumulation steps, default is 1."
file_name: "output file name."
file_format: 'file format'
result_dir: "./result_Files"
result_path: "./preprocess_Result/"
vocab_file: "vocab file path."
input_file: 'Input raw text file (or comma-separated list of files).'
num_splits: 'The MindRecord file will be split into the number of partition.'
clip_to_max_len: 'clip sequences to maximum sequence length.'
max_seq_length: 'Maximum sequence length.'
bucket: 'bucket sequence length'
---
device_target: ["Ascend", "GPU", "CPU"]
file_format: ["AIR", "ONNX", "MINDIR"]
distribute: ['true', 'false']
enable_lossscale: ['true', 'false']
do_shuffle: ['true', 'false']
enable_save_ckpt: ['true', 'false']
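
The block above is actually three YAML documents separated by `---`: the default values, a help string per option, and the allowed choices per option. The new `src/model_utils/config.py` further down in this commit reads all three with `yaml.load_all`; a minimal sketch of that split, assuming the file is saved as `default_config.yaml`:

import yaml

# Split a three-document config file into values, help text and choices.
with open("default_config.yaml", "r") as fin:
    values, helper, choices = list(yaml.load_all(fin.read(), Loader=yaml.FullLoader))

print(values["batch_size"])    # 96
print(helper["epoch_size"])    # "Epoch size, default is 52."
print(choices["file_format"])  # ["AIR", "ONNX", "MINDIR"]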

View File

@ -0,0 +1,134 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
data_url: ""
train_url: ""
checkpoint_url: ""
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
checkpoint_path: ''
device_target: Ascend
enable_profiling: False
# ==============================================================================
# config/cfg edict
transformer_network: 'large'
init_loss_scale_value: 1024
scale_factor: 2
scale_window: 2000
optimizer: 'Adam'
optimizer_adam_beta2: 0.997
#lr_schedule: edict({'learning_rate': 2.0, 'warmup_steps': 8000, 'start_decay_step': 16000, 'min_lr': 0.0,})
# transformer_net_cfg
batch_size: 96
seq_length: 128
vocab_size: 36560
hidden_size: 1024
num_hidden_layers: 6
num_attention_heads: 16
intermediate_size: 4096
hidden_act: "relu"
hidden_dropout_prob: 0.2
attention_probs_dropout_prob: 0.2
max_position_embeddings: 128
initializer_range: 0.02
label_smoothing: 0.1
dtype: mstype.float32
compute_type: mstype.float16
#eval_config/cfg edict
data_file: '/cache/data'
model_file: './transformer/transformer_trained.ckpt'
output_file: './output_eval.txt'
# transformer_net_cfg
batch_size_ev: 1
hidden_dropout_prob_ev: 0.0
attention_probs_dropout_prob_ev: 0.0
beam_width: 4
max_decode_length: 80
length_penalty_weight: 1.0
# ==============================================================================
# train.py / Argparse init.
distribute: "false"
epoch_size: 52
device_id: 0
device_num: 1
enable_lossscale: "true"
do_shuffle: "true"
enable_save_ckpt: "true"
save_checkpoint_steps: 2500
save_checkpoint_num: 30
save_checkpoint_path: "./"
bucket_boundaries: [16, 32, 48, 64, 128]
accumulation_steps: 1
# export.py /eval_config - transformer export
file_name: "transformer"
file_format: 'AIR'
#'postprocess / from eval_config'
result_dir: "./result_Files"
#'preprocess / from eval_config'
result_path: "./preprocess_Result/"
# src/process_output.py "rescore nbest with smoothed sentence-level bleu."
vocab_file: ""
# create_data.py
input_file: ''
num_splits: 16
clip_to_max_len: False
max_seq_length: 128
bucket: [16, 32, 48, 64, 128]
---
# Config description for each option
enable_modelarts: 'Whether to run training on ModelArts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
data_path: 'Local dataset path; an absolute path is preferred'
output_path: 'Local training output path'
ann_file: 'Ann file, default is val.json.'
device_target: "device where the code will be implemented, default is Ascend"
checkpoint_path: "Checkpoint file path"
data_file: '/your/path/evaluation.mindrecord'
model_file: '/your/path/checkpoint_file'
output_file: './output_eval.txt'
distribute: "Run distribute, default is false."
epoch_size: "Epoch size, default is 52."
device_id: "Device id, default is 0."
device_num: "Use device nums, default is 1."
enable_lossscale: "Use lossscale or not, default is true."
do_shuffle: "Enable shuffle for dataset, default is true."
enable_save_ckpt: "Enable save checkpoint, default is true."
save_checkpoint_steps: "Save checkpoint steps, default is 2500."
save_checkpoint_num: "Save checkpoint numbers, default is 30."
save_checkpoint_path: "Save checkpoint file path"
bucket_boundaries: "sequence length for different bucket"
accumulation_steps: "Gradient accumulation steps, default is 1."
file_name: "output file name."
file_format: 'file format'
result_dir: "./result_Files"
result_path: "./preprocess_Result/"
vocab_file: "vocab file path."
input_file: 'Input raw text file (or comma-separated list of files).'
num_splits: 'The MindRecord file will be split into the number of partition.'
clip_to_max_len: 'clip sequences to maximum sequence length.'
max_seq_length: 'Maximum sequence length.'
bucket: 'bucket sequence length'
---
device_target: ["Ascend", "GPU", "CPU"]
file_format: ["AIR", "ONNX", "MINDIR"]
distribute: ['true', 'false']
enable_lossscale: ['true', 'false']
do_shuffle: ['true', 'false']
enable_save_ckpt: ['true', 'false']

View File

@ -0,0 +1,134 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
data_url: ""
train_url: ""
checkpoint_url: ""
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
checkpoint_path: ''
device_target: GPU
enable_profiling: False
# ==============================================================================
# config/cfg edict
transformer_network: 'large'
init_loss_scale_value: 1024
scale_factor: 2
scale_window: 2000
optimizer: 'Adam'
optimizer_adam_beta2: 0.997
#lr_schedule: edict({'learning_rate': 2.0, 'warmup_steps': 8000, 'start_decay_step': 16000, 'min_lr': 0.0,})
# transformer_net_cfg_gpu
batch_size: 32
seq_length: 128
vocab_size: 36560
hidden_size: 1024
num_hidden_layers: 6
num_attention_heads: 16
intermediate_size: 4096
hidden_act: "relu"
hidden_dropout_prob: 0.2
attention_probs_dropout_prob: 0.2
max_position_embeddings: 128
initializer_range: 0.02
label_smoothing: 0.1
dtype: mstype.float32
compute_type: mstype.float16
#eval_config/cfg edict
data_file: '/cache/data'
model_file: './transformer/transformer_trained.ckpt'
output_file: './output_eval.txt'
# transformer_net_cfg
batch_size_ev: 1
hidden_dropout_prob_ev: 0.0
attention_probs_dropout_prob_ev: 0.0
beam_width: 4
max_decode_length: 80
length_penalty_weight: 1.0
# ==============================================================================
# train.py / Argparse init.
distribute: "false"
epoch_size: 52
device_id: 0
device_num: 1
enable_lossscale: "true"
do_shuffle: "true"
enable_save_ckpt: "true"
save_checkpoint_steps: 2500
save_checkpoint_num: 30
save_checkpoint_path: "./"
bucket_boundaries: [16, 32, 48, 64, 128]
accumulation_steps: 1
# export.py /eval_config - transformer export
file_name: "transformer"
file_format: 'AIR'
#'postprocess / from eval_config'
result_dir: "./result_Files"
#'preprocess / from eval_config'
result_path: "./preprocess_Result/"
# src/process_output.py "rescore nbest with smoothed sentence-level bleu."
vocab_file: ""
# create_data.py
input_file: ''
num_splits: 16
clip_to_max_len: False
max_seq_length: 128
bucket: [16, 32, 48, 64, 128]
---
# Config description for each option
enable_modelarts: 'Whether to run training on ModelArts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
data_path: 'Local dataset path; an absolute path is preferred'
output_path: 'Local training output path'
ann_file: 'Ann file, default is val.json.'
device_target: "device where the code will be implemented, default is Ascend"
checkpoint_path: "Checkpoint file path"
data_file: '/your/path/evaluation.mindrecord'
model_file: '/your/path/checkpoint_file'
output_file: './output_eval.txt'
distribute: "Run distribute, default is false."
epoch_size: "Epoch size, default is 52."
device_id: "Device id, default is 0."
device_num: "Use device nums, default is 1."
enable_lossscale: "Use lossscale or not, default is true."
do_shuffle: "Enable shuffle for dataset, default is true."
enable_save_ckpt: "Enable save checkpoint, default is true."
save_checkpoint_steps: "Save checkpoint steps, default is 2500."
save_checkpoint_num: "Save checkpoint numbers, default is 30."
save_checkpoint_path: "Save checkpoint file path"
bucket_boundaries: "sequence length for different bucket"
accumulation_steps: "Gradient accumulation steps, default is 1."
file_name: "output file name."
file_format: 'file format'
result_dir: "./result_Files"
result_path: "./preprocess_Result/"
vocab_file: "vocab file path."
input_file: 'Input raw text file (or comma-separated list of files).'
num_splits: 'The MindRecord file will be split into the number of partition.'
clip_to_max_len: 'clip sequences to maximum sequence length.'
max_seq_length: 'Maximum sequence length.'
bucket: 'bucket sequence length'
---
device_target: ["Ascend", "GPU", "CPU"]
file_format: ["AIR", "ONNX", "MINDIR"]
distribute: ['true', 'false']
enable_lossscale: ['true', 'false']
do_shuffle: ['true', 'false']
enable_save_ckpt: ['true', 'false']

View File

@ -14,7 +14,7 @@
# ============================================================================
"""Transformer evaluation script."""
import argparse
import os
import numpy as np
import mindspore.nn as nn
@ -28,8 +28,15 @@ import mindspore.dataset.transforms.c_transforms as deC
from mindspore import context
from src.transformer_model import TransformerModel
from src.eval_config import cfg, transformer_net_cfg
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.device_adapter import get_device_id
config.dtype = mstype.float32
config.compute_type = mstype.float16
config.batch_size = config.batch_size_ev
config.hidden_dropout_prob = config.hidden_dropout_prob_ev
config.attention_probs_dropout_prob = config.attention_probs_dropout_prob_ev
def load_test_data(batch_size=1, data_file=None):
"""
@ -57,7 +64,6 @@ class TransformerInferCell(nn.Cell):
"""
Encapsulation class of transformer network infer.
"""
def __init__(self, network):
super(TransformerInferCell, self).__init__(auto_prefix=False)
self.network = network
@ -98,23 +104,22 @@ def load_weights(model_path):
return parameter_dict
def modelarts_pre_process():
config.output_file = os.path.join(config.output_path, config.output_file)
config.data_file = os.path.join(config.data_file, 'ende-l128-mindrecord_128_00')
@moxing_wrapper(pre_process=modelarts_pre_process)
def run_transformer_eval():
"""
Transformer evaluation.
"""
parser = argparse.ArgumentParser(description='tranformer')
parser.add_argument("--device_target", type=str, default="Ascend",
help="device where the code will be implemented, default is Ascend")
parser.add_argument('--device_id', type=int, default=0, help='device id of GPU or Ascend, default is 0')
args = parser.parse_args()
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, reserve_class_name_in_scope=False,
device_id=get_device_id())
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, reserve_class_name_in_scope=False,
device_id=args.device_id)
dataset = load_test_data(batch_size=config.batch_size, data_file=config.data_file)
tfm_model = TransformerModel(config=config, is_training=False, use_one_hot_embeddings=False)
dataset = load_test_data(batch_size=transformer_net_cfg.batch_size, data_file=cfg.data_file)
tfm_model = TransformerModel(config=transformer_net_cfg, is_training=False, use_one_hot_embeddings=False)
parameter_dict = load_weights(cfg.model_file)
parameter_dict = load_weights(config.model_file)
load_param_into_net(tfm_model, parameter_dict)
tfm_infer = TransformerInferCell(tfm_model)
@ -132,9 +137,9 @@ def run_transformer_eval():
predictions.append(predicted_ids.asnumpy())
# decode and write to file
f = open(cfg.output_file, 'w')
f = open(config.output_file, 'w')
for batch_out in predictions:
for i in range(transformer_net_cfg.batch_size):
for i in range(config.batch_size):
if batch_out.ndim == 3:
batch_out = batch_out[:, 0]
token_ids = [str(x) for x in batch_out[i].tolist()]

View File

@ -14,35 +14,41 @@
# ============================================================================
""" export checkpoint file into models"""
import argparse
import numpy as np
from mindspore import Tensor, context
from mindspore.train.serialization import load_param_into_net, export
from src.transformer_model import TransformerModel
from src.eval_config import cfg, transformer_net_cfg
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.device_adapter import get_device_id
from eval import load_weights
parser = argparse.ArgumentParser(description='transformer export')
parser.add_argument("--device_id", type=int, default=0, help="Device id")
parser.add_argument("--file_name", type=str, default="transformer", help="output file name.")
parser.add_argument('--file_format', type=str, choices=["AIR", "ONNX", "MINDIR"], default='AIR', help='file format')
parser.add_argument("--device_target", type=str, default="Ascend",
choices=["Ascend", "GPU", "CPU"], help="device target (default: Ascend)")
args = parser.parse_args()
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
if args.device_target == "Ascend":
context.set_context(device_id=args.device_id)
config.batch_size = config.batch_size_ev
config.hidden_dropout_prob = config.hidden_dropout_prob_ev
config.attention_probs_dropout_prob = config.attention_probs_dropout_prob_ev
if __name__ == '__main__':
tfm_model = TransformerModel(config=transformer_net_cfg, is_training=False, use_one_hot_embeddings=False)
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
if config.device_target == "Ascend":
context.set_context(device_id=get_device_id())
parameter_dict = load_weights(cfg.model_file)
def modelarts_pre_process():
pass
@moxing_wrapper(pre_process=modelarts_pre_process)
def export_transformer():
""" export_transformer """
tfm_model = TransformerModel(config=config, is_training=False, use_one_hot_embeddings=False)
parameter_dict = load_weights(config.model_file)
load_param_into_net(tfm_model, parameter_dict)
source_ids = Tensor(np.ones((transformer_net_cfg.batch_size, transformer_net_cfg.seq_length)).astype(np.int32))
source_mask = Tensor(np.ones((transformer_net_cfg.batch_size, transformer_net_cfg.seq_length)).astype(np.int32))
source_ids = Tensor(np.ones((config.batch_size, config.seq_length)).astype(np.int32))
source_mask = Tensor(np.ones((config.batch_size, config.seq_length)).astype(np.int32))
export(tfm_model, source_ids, source_mask, file_name=args.file_name, file_format=args.file_format)
export(tfm_model, source_ids, source_mask, file_name=config.file_name, file_format=config.file_format)
if __name__ == '__main__':
export_transformer()

View File

@ -15,31 +15,25 @@
"""Transformer evaluation script."""
import os
import argparse
import numpy as np
from src.model_utils.config import config
from src.eval_config import cfg, transformer_net_cfg
parser = argparse.ArgumentParser(description='postprocess')
parser.add_argument("--result_dir", type=str, default="./result_Files",
help="infer result path.")
args = parser.parse_args()
def generate_output():
'''
Generate output.
'''
predictions = []
file_num = len(os.listdir(args.result_dir))
file_num = len(os.listdir(config.result_dir))
for i in range(file_num):
batch = "transformer_bs_" + str(transformer_net_cfg.batch_size) + "_" + str(i) + "_0.bin"
pred = np.fromfile(os.path.join(args.result_dir, batch), np.int32)
predictions.append(pred.reshape(1, 1, transformer_net_cfg.max_decode_length + 1))
batch = "transformer_bs_" + str(config.batch_size) + "_" + str(i) + "_0.bin"
pred = np.fromfile(os.path.join(config.result_dir, batch), np.int32)
predictions.append(pred.reshape(1, 1, config.max_decode_length + 1))
# decode and write to file
f = open(cfg.output_file, 'w')
f = open(config.output_file, 'w')
for batch_out in predictions:
for i in range(transformer_net_cfg.batch_size):
for i in range(config.batch_size):
if batch_out.ndim == 3:
batch_out = batch_out[:, 0]
token_ids = [str(x) for x in batch_out[i].tolist()]

View File

@ -15,23 +15,16 @@
"""Transformer evaluation script."""
import os
import argparse
from src.eval_config import cfg, transformer_net_cfg
from src.model_utils.config import config
from eval import load_test_data
parser = argparse.ArgumentParser(description='preprocess')
parser.add_argument("--result_path", type=str, default="./preprocess_Result/",
help="preprocess result path.")
args = parser.parse_args()
def generate_bin():
'''
Generate bin files.
'''
dataset = load_test_data(batch_size=transformer_net_cfg.batch_size, data_file=cfg.data_file)
cur_dir = args.result_path
dataset = load_test_data(batch_size=config.batch_size, data_file=config.data_file)
cur_dir = config.result_path
source_eos_ids_path = os.path.join(cur_dir, "00_source_eos_ids")
source_eos_mask_path = os.path.join(cur_dir, "01_source_eos_mask")
@ -41,7 +34,7 @@ def generate_bin():
if not os.path.isdir(source_eos_mask_path):
os.makedirs(source_eos_mask_path)
batch_size = transformer_net_cfg.batch_size
batch_size = config.batch_size
for i, data in enumerate(dataset.create_dict_iterator(output_numpy=True, num_epochs=1)):
file_name = "transformer_bs_" + str(batch_size) + "_" + str(i) + ".bin"

View File

@ -28,7 +28,7 @@ eval_output=$2
vocab_file=$3
cat $eval_output \
| python src/process_output.py --vocab_file $vocab_file \
| python src/process_output.py --config_path="./default_config_large.yaml" --vocab_file $vocab_file \
| sed 's/@@ //g' > ${eval_output}.processed
perl -ple 's/(\S)-(\S)/$1 #@#-#@# $2/g' < $ref_data | perl ${BASEDIR}/replace-quote.perl > ${ref_data}.forbleu

View File

@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 4 ] ; then
if [ $# != 5 ] ; then
echo "=============================================================================================================="
echo "Please run the script as: "
echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE"
echo "for example: sh run_distribute_pretrain.sh 8 52 /path/ende-l128-mindrecord00 /path/hccl.json"
echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE CONFIG_PATH"
echo "for example: sh run_distribute_pretrain.sh 8 52 /path/ende-l128-mindrecord00 /path/hccl.json ./default_config_large.yaml"
echo "It is better to use absolute path."
echo "=============================================================================================================="
exit 1;
@ -32,6 +32,7 @@ DATA_PATH=$3
export HCCL_CONNECT_TIMEOUT=600
export RANK_TABLE_FILE=$4
export CONFIG_PATH=$5
export RANK_SIZE=$1
export HCCL_FLAG=1
export DEPLOY_MODE=0
@ -43,11 +44,12 @@ do
export GE_USE_STATIC_MEMORY=1
mkdir helper$i
cp -rf ../src/ ../train.py ./helper$i
cp -rf ../src/ ../train.py ../*.yaml ./helper$i
cd ./helper$i || exit
echo "start training for rank $i, device $DEVICE_ID"
env > env.log
python train.py \
--config_path=$CONFIG_PATH \
--distribute="true" \
--epoch_size=$EPOCH_SIZE \
--device_id=$DEVICE_ID \
@ -58,8 +60,7 @@ do
--checkpoint_path="" \
--save_checkpoint_steps=2500 \
--save_checkpoint_num=30 \
--data_path=$DATA_PATH \
--bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 &
--data_path=$DATA_PATH > log.txt 2>&1 &
cd ../
done
cd ..

View File

@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 5 ] ; then
if [ $# != 6 ] ; then
echo "=============================================================================================================="
echo "Please run the script as: "
echo "sh run_distribute_train_ascend_multi_machines.sh DEVICE_NUM SERVER_ID EPOCH_SIZE DATA_PATH RANK_TABLE_FILE"
echo "for example: sh run_distribute_train_ascend_multi_machines.sh 32 0 52 /path/ende-l128-mindrecord00 /path/hccl.json"
echo "sh run_distribute_train_ascend_multi_machines.sh DEVICE_NUM SERVER_ID EPOCH_SIZE DATA_PATH RANK_TABLE_FILE CONFIG_PATH"
echo "for example: sh run_distribute_train_ascend_multi_machines.sh 32 0 52 /path/ende-l128-mindrecord00 /path/hccl.json ./default_config_large.yaml"
echo "It is better to use absolute path."
echo "=============================================================================================================="
exit 1;
@ -32,6 +32,7 @@ DATA_PATH=$4
export HCCL_CONNECT_TIMEOUT=600
export RANK_TABLE_FILE=$5
export CONFIG_PATH=$6
export RANK_SIZE=$1
export SERVER_ID=$2
export DEVICE_NUM=8
@ -47,11 +48,12 @@ do
export GE_USE_STATIC_MEMORY=1
mkdir helper$i
cp -rf ../src/ ../train.py ./helper$i
cp -rf ../src/ ../train.py ../*.yaml ./helper$i
cd ./helper$i || exit
echo "start training for rank $i, device $DEVICE_ID"
env > env.log
python train.py \
--config_path=$CONFIG_PATH \
--distribute="true" \
--epoch_size=$EPOCH_SIZE \
--device_id=$DEVICE_ID \

View File

@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 3 ] ; then
if [ $# != 4 ] ; then
echo "=============================================================================================================="
echo "Please run the script as: "
echo "sh run_distribute_train_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_PATH"
echo "for example: sh run_distribute_pretrain.sh 8 55 /path/ende-l128-mindrecord00"
echo "sh run_distribute_train_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_PATH CONFIG_PATH"
echo "for example: sh run_distribute_pretrain.sh 8 55 /path/ende-l128-mindrecord00 ./default_config_large_gpu.yaml"
echo "It is better to use absolute path."
echo "=============================================================================================================="
exit 1;
@ -25,16 +25,18 @@ fi
rm -rf run_distribute_train
mkdir run_distribute_train
cp -rf ./src/ train.py ./run_distribute_train
cp -rf ./src/ train.py ./*.yaml ./run_distribute_train
cd run_distribute_train || exit
export RANK_SIZE=$1
export CONFIG_PATH=$4
EPOCH_SIZE=$2
DATA_PATH=$3
echo $RANK_SIZE
mpirun -n $RANK_SIZE \
python train.py \
--config_path=$CONFIG_PATH \
--distribute="true" \
--device_target="GPU" \
--epoch_size=$EPOCH_SIZE \

View File

@ -13,19 +13,36 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 2 ] ; then
if [ $# != 5 ] ; then
echo "=============================================================================================================="
echo "Please run the script as: "
echo "sh run_eval.sh DEVICE_TARGET DEVICE_ID"
echo "for example: sh run_eval.sh Ascend 0"
echo "Note: set the checkpoint and dataset path in src/eval_config.py"
echo "sh run_eval.sh DEVICE_TARGET DEVICE_ID MINDRECORD_DATA CKPT_PATH CONFIG_PATH"
echo "for example: sh run_eval.sh Ascend 0 /your/path/evaluation.mindrecord /your/path/checkpoint_file ./default_config_large_gpu.yaml"
echo "Note: set the checkpoint and dataset path in default_config.yaml"
echo "=============================================================================================================="
exit 1;
fi
export DEVICE_TARGET=$1
export CONFIG_PATH=$5
DEVICE_ID=$2
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
PATH3=$(get_real_path $3)
PATH4=$(get_real_path $4)
echo $PATH3
echo $PATH4
python eval.py \
--config_path=$CONFIG_PATH \
--device_target=$DEVICE_TARGET \
--device_id=$DEVICE_ID \
--data_file=$PATH3 \
--model_file=$PATH4 > log_eval.txt 2>&1 &

View File

@ -66,7 +66,7 @@ function preprocess_data()
rm -rf ./preprocess_Result
fi
mkdir preprocess_Result
python3.7 ../preprocess.py --result_path=./preprocess_Result/
python3.7 ../preprocess.py --config_path="./default_config_large.yaml" --result_path=./preprocess_Result/
}
function compile_app()
@ -93,7 +93,7 @@ function infer()
function cal_acc()
{
python3.7 ../postprocess.py --result_dir=./result_Files &> acc.log
python3.7 ../postprocess.py --config_path="./default_config_large.yaml" --result_dir=./result_Files &> acc.log
}
if [ $need_preprocess == "y" ]; then

View File

@ -25,7 +25,7 @@ fi
rm -rf run_standalone_train
mkdir run_standalone_train
cp -rf ./src/ train.py ./run_standalone_train
cp -rf ./src/ train.py ./*.yaml ./run_standalone_train
cd run_standalone_train || exit
export DEVICE_TARGET=$1
@ -36,6 +36,7 @@ DATA_PATH=$5
if [ $DEVICE_TARGET == 'Ascend' ];then
python train.py \
--config_path="./default_config_large.yaml" \
--distribute="false" \
--epoch_size=$EPOCH_SIZE \
--accumulation_steps=$GRADIENT_ACCUMULATE_STEP \
@ -53,6 +54,7 @@ elif [ $DEVICE_TARGET == 'GPU' ];then
export CUDA_VISIBLE_DEVICES="$2"
python train.py \
--config_path="./default_config_large_gpu.yaml" \
--distribute="false" \
--epoch_size=$EPOCH_SIZE \
--device_target=$DEVICE_TARGET \

View File

@ -17,7 +17,8 @@
import mindspore.common.dtype as mstype
import mindspore.dataset as de
import mindspore.dataset.transforms.c_transforms as deC
from .config import transformer_net_cfg, transformer_net_cfg_gpu
from .model_utils.config import config
de.config.set_seed(1)
def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle="true", dataset_path=None,
bucket_boundaries=None, device_target="Ascend"):
@ -38,11 +39,8 @@ def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle
ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask")
# apply batch operations
if device_target == "Ascend":
ds = ds.batch(transformer_net_cfg.batch_size, drop_remainder=True)
else:
ds = ds.batch(transformer_net_cfg_gpu.batch_size, drop_remainder=True)
ds = ds.batch(config.batch_size, drop_remainder=True)
ds = ds.repeat(epoch_count)
return ds
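
With the Ascend/GPU batch-size branch gone, `create_transformer_dataset` batches with the single `config.batch_size` taken from whichever YAML is passed as `--config_path`. A hedged usage sketch; the call mirrors `train.py` below, and the paths are illustrative:

from src.dataset import create_transformer_dataset
from src.model_utils.config import config

# Single-device example; dataset_path must point at the MindRecord files
# produced by create_data.py.
ds = create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0,
                                do_shuffle=config.do_shuffle,
                                dataset_path=config.data_path,
                                bucket_boundaries=config.bucket_boundaries,
                                device_target=config.device_target)
print(ds.get_dataset_size())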

View File

@ -0,0 +1,127 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Parse arguments"""
import os
import ast
import argparse
from pprint import pprint, pformat
import yaml
class Config:
"""
Configuration namespace. Convert dictionary to members.
"""
def __init__(self, cfg_dict):
for k, v in cfg_dict.items():
if isinstance(v, (list, tuple)):
setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v])
else:
setattr(self, k, Config(v) if isinstance(v, dict) else v)
def __str__(self):
return pformat(self.__dict__)
def __repr__(self):
return self.__str__()
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
"""
Parse command line arguments to the configuration according to the default yaml.
Args:
parser: Parent parser.
cfg: Base configuration.
helper: Help descriptions per option.
choices: Allowed choices per option.
cfg_path: Path to the default yaml config.
"""
parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
parents=[parser])
helper = {} if helper is None else helper
choices = {} if choices is None else choices
for item in cfg:
if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict):
help_description = helper[item] if item in helper else "Please refer to {}".format(cfg_path)
choice = choices[item] if item in choices else None
if isinstance(cfg[item], bool):
parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice,
help=help_description)
else:
parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice,
help=help_description)
args = parser.parse_args()
return args
def parse_yaml(yaml_path):
"""
Parse the yaml config file.
Args:
yaml_path: Path to the yaml config.
"""
with open(yaml_path, 'r') as fin:
try:
cfgs = list(yaml.load_all(fin.read(), Loader=yaml.FullLoader))
if len(cfgs) == 1:
cfg_helper = {}
cfg = cfgs[0]
cfg_choices = {}
elif len(cfgs) == 2:
cfg, cfg_helper = cfgs
cfg_choices = {}
elif len(cfgs) == 3:
cfg, cfg_helper, cfg_choices = cfgs
else:
raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")
print(cfg_helper)
except yaml.YAMLError as err:
raise ValueError("Failed to parse yaml: {}".format(err))
return cfg, cfg_helper, cfg_choices
def merge(args, cfg):
"""
Merge the base config from yaml file and command line arguments.
Args:
args: Command line arguments.
cfg: Base configuration.
"""
args_var = vars(args)
for item in args_var:
cfg[item] = args_var[item]
return cfg
def get_config():
"""
Get Config according to the yaml file and cli arguments.
"""
parser = argparse.ArgumentParser(description="default name", add_help=False)
current_dir = os.path.dirname(os.path.abspath(__file__))
parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../../default_config.yaml"),
help="Config file path")
path_args, _ = parser.parse_known_args()
default, helper, choices = parse_yaml(path_args.config_path)
pprint(default)
args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path)
final_config = merge(args, default)
return Config(final_config)
config = get_config()
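
Since `get_config()` runs at import time, every script in this commit can simply `from src.model_utils.config import config`, and each scalar YAML key also doubles as a `--key` command-line flag. A small sketch of both sides; `consumer.py` is a hypothetical name:

# consumer.py
from src.model_utils.config import config

print(config.batch_size)        # default from the YAML, e.g. 96
print(config.enable_modelarts)  # booleans are parsed with ast.literal_eval

# Overrides come from the command line, for example:
#   python consumer.py --config_path=./default_config_large.yaml --batch_size=32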

View File

@ -0,0 +1,27 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Device adapter for ModelArts"""
from .config import config
if config.enable_modelarts:
from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
__all__ = [
"get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]

View File

@ -0,0 +1,36 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Local adapter"""
import os
def get_device_id():
device_id = os.getenv('DEVICE_ID', '0')
return int(device_id)
def get_device_num():
device_num = os.getenv('RANK_SIZE', '1')
return int(device_num)
def get_rank_id():
global_rank_id = os.getenv('RANK_ID', '0')
return int(global_rank_id)
def get_job_id():
return "Local Job"

View File

@ -0,0 +1,122 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Moxing adapter for ModelArts"""
import os
import functools
from mindspore import context
from mindspore.profiler import Profiler
from .config import config
_global_sync_count = 0
def get_device_id():
device_id = os.getenv('DEVICE_ID', '0')
return int(device_id)
def get_device_num():
device_num = os.getenv('RANK_SIZE', '1')
return int(device_num)
def get_rank_id():
global_rank_id = os.getenv('RANK_ID', '0')
return int(global_rank_id)
def get_job_id():
job_id = os.getenv('JOB_ID')
# os.getenv returns None when JOB_ID is unset, so test truthiness rather than != ""
job_id = job_id if job_id else "default"
return job_id
def sync_data(from_path, to_path):
"""
Download data from remote OBS to a local directory when the first path is a remote URL and the second is a local path.
Upload data from the local directory to remote OBS when the order is reversed.
"""
import moxing as mox
import time
global _global_sync_count
sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
_global_sync_count += 1
# Each server contains at most 8 devices.
if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
print("from path: ", from_path)
print("to path: ", to_path)
mox.file.copy_parallel(from_path, to_path)
print("===finish data synchronization===")
try:
os.mknod(sync_lock)
except IOError:
pass
print("===save flag===")
while True:
if os.path.exists(sync_lock):
break
time.sleep(1)
print("Finish sync data from {} to {}.".format(from_path, to_path))
def moxing_wrapper(pre_process=None, post_process=None):
"""
Moxing wrapper to download dataset and upload outputs.
"""
def wrapper(run_func):
@functools.wraps(run_func)
def wrapped_func(*args, **kwargs):
# Download data from data_url
if config.enable_modelarts:
if config.data_url:
sync_data(config.data_url, config.data_path)
print("Dataset downloaded: ", os.listdir(config.data_path))
if config.checkpoint_url:
sync_data(config.checkpoint_url, config.load_path)
print("Preload downloaded: ", os.listdir(config.load_path))
if config.train_url:
sync_data(config.train_url, config.output_path)
print("Workspace downloaded: ", os.listdir(config.output_path))
context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
config.device_num = get_device_num()
config.device_id = get_device_id()
if not os.path.exists(config.output_path):
os.makedirs(config.output_path)
if pre_process:
pre_process()
if config.enable_profiling:
profiler = Profiler()
run_func(*args, **kwargs)
if config.enable_profiling:
profiler.analyse()
# Upload data to train_url
if config.enable_modelarts:
if post_process:
post_process()
if config.train_url:
print("Start to copy output directory")
sync_data(config.output_path, config.train_url)
return wrapped_func
return wrapper
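
`moxing_wrapper` is meant to decorate a script's entry function, as `train.py` and `eval.py` do later in this commit. A minimal sketch, with hypothetical function names:

from src.model_utils.moxing_adapter import moxing_wrapper

def prepare_paths():
    # hypothetical pre_process hook: rewrite config paths for /cache on ModelArts
    pass

@moxing_wrapper(pre_process=prepare_paths)
def run():
    pass  # training or evaluation body goes here

if __name__ == '__main__':
    run()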

View File

@ -18,22 +18,18 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import sys
import tokenization
from .model_utils.config import config
# Explicitly set the encoding
sys.stdin = open(sys.stdin.fileno(), mode='r', encoding='utf-8', buffering=True)
sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=True)
def main():
parser = argparse.ArgumentParser(
description="recore nbest with smoothed sentence-level bleu.")
parser.add_argument("--vocab_file", type=str, default="", required=True, help="vocab file path.")
args = parser.parse_args()
tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file)
tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=config.vocab_file)
for line in sys.stdin:
token_ids = [int(x) for x in line.strip().split()]

View File

@ -16,9 +16,8 @@
import os
import time
import argparse
import ast
from easydict import EasyDict as edict
import mindspore.common.dtype as mstype
from mindspore.common.tensor import Tensor
from mindspore.nn.optim import Adam
@ -36,21 +35,30 @@ from mindspore.common import set_seed
from src.transformer_for_train import TransformerTrainOneStepCell, TransformerNetworkWithLoss, \
TransformerTrainOneStepWithLossScaleCell, \
TransformerTrainAccumulationAllReducePostWithLossScaleCell
from src.config import cfg, transformer_net_cfg, transformer_net_cfg_gpu
from src.dataset import create_transformer_dataset
from src.lr_schedule import create_dynamic_lr
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.device_adapter import get_device_id
set_seed(1)
def get_ms_timestamp():
t = time.time()
return int(round(t * 1000))
time_stamp_init = False
time_stamp_first = 0
config.dtype = mstype.float32
config.compute_type = mstype.float16
config.lr_schedule = edict({
'learning_rate': 2.0,
'warmup_steps': 8000,
'start_decay_step': 16000,
'min_lr': 0.0,
})
class LossCallBack(Callback):
"""
@ -82,7 +90,12 @@ class LossCallBack(Callback):
cb_params.cur_epoch_num,
cb_params.cur_step_num,
str(cb_params.net_outputs)))
with open("./loss_{}.log".format(self.rank_id), "a+") as f:
loss_file = "./loss_{}.log"
if config.enable_modelarts:
loss_file = "/cache/train/loss_{}.log"
with open(loss_file.format(self.rank_id), "a+") as f:
f.write("time: {}, epoch: {}, step: {}, loss: {}, overflow: {}, loss_scale: {}".format(
time_stamp_current - time_stamp_first,
cb_params.cur_epoch_num,
@ -93,121 +106,90 @@ class LossCallBack(Callback):
f.write('\n')
def argparse_init():
"""
Argparse init.
"""
parser = argparse.ArgumentParser(description='transformer')
parser.add_argument("--distribute", type=str, default="false", choices=['true', 'false'],
help="Run distribute, default is false.")
parser.add_argument("--epoch_size", type=int, default=52, help="Epoch size, default is 52.")
parser.add_argument("--device_target", type=str, default="Ascend",
help="device where the code will be implemented, default is Ascend")
parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
parser.add_argument("--enable_lossscale", type=str, default="true", choices=['true', 'false'],
help="Use lossscale or not, default is true.")
parser.add_argument("--do_shuffle", type=str, default="true", choices=['true', 'false'],
help="Enable shuffle for dataset, default is true.")
parser.add_argument("--checkpoint_path", type=str, default="", help="Checkpoint file path")
parser.add_argument("--enable_save_ckpt", type=str, default="true", choices=['true', 'false'],
help="Enable save checkpoint, default is true.")
parser.add_argument("--save_checkpoint_steps", type=int, default=2500, help="Save checkpoint steps, "
"default is 2500.")
parser.add_argument("--save_checkpoint_num", type=int, default=30, help="Save checkpoint numbers, default is 30.")
parser.add_argument("--save_checkpoint_path", type=str, default="./", help="Save checkpoint file path")
parser.add_argument("--data_path", type=str, default="", help="Data path, it is better to use absolute path")
parser.add_argument("--bucket_boundaries", type=ast.literal_eval, default=[16, 32, 48, 64, 128],
help="sequence length for different bucket")
parser.add_argument("--accumulation_steps", type=int, default=1, help="Gradient accumulation steps, default is 1.")
return parser
def modelarts_pre_process():
config.save_checkpoint_path = config.output_path
config.data_path = os.path.join(config.data_path, 'ende-l128-mindrecord')
@moxing_wrapper(pre_process=modelarts_pre_process)
def run_transformer_train():
"""
Transformer training.
"""
parser = argparse_init()
args, _ = parser.parse_known_args()
if args.device_target == "Ascend":
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.device_id)
if config.device_target == "Ascend":
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id())
else:
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
context.set_context(reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)
if args.device_target == "GPU":
if config.device_target == "GPU":
# Enable graph kernel
context.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion")
if args.distribute == "true":
if args.device_target == "Ascend":
device_num = args.device_num
if config.distribute == "true":
if config.device_target == "Ascend":
device_num = config.device_num
D.init('hccl')
else:
D.init('nccl')
device_num = D.get_group_size()
rank = get_rank()
args.device_id = rank
config.device_id = rank
context.reset_auto_parallel_context()
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
device_num=device_num)
rank_id = args.device_id % device_num
save_ckpt_path = os.path.join(args.save_checkpoint_path, 'ckpt_' + str(get_rank()) + '/')
rank_id = config.device_id % device_num
save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(get_rank()) + '/')
else:
device_num = 1
rank_id = 0
save_ckpt_path = os.path.join(args.save_checkpoint_path, 'ckpt_0/')
save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_0/')
dataset = create_transformer_dataset(epoch_count=1, rank_size=device_num,
rank_id=rank_id, do_shuffle=args.do_shuffle,
dataset_path=args.data_path,
bucket_boundaries=args.bucket_boundaries,
device_target=args.device_target)
if args.device_target == "Ascend":
netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)
else:
netwithloss = TransformerNetworkWithLoss(transformer_net_cfg_gpu, True)
rank_id=rank_id, do_shuffle=config.do_shuffle,
dataset_path=config.data_path,
bucket_boundaries=config.bucket_boundaries,
device_target=config.device_target)
if args.checkpoint_path:
parameter_dict = load_checkpoint(args.checkpoint_path)
netwithloss = TransformerNetworkWithLoss(config, True)
if config.checkpoint_path:
parameter_dict = load_checkpoint(config.checkpoint_path)
load_param_into_net(netwithloss, parameter_dict)
hidden_size = transformer_net_cfg.hidden_size if args.device_target == "Ascend" \
else transformer_net_cfg_gpu.hidden_size
learning_rate = cfg.lr_schedule.learning_rate if args.device_target == "Ascend" \
else 1.0
hidden_size = config.hidden_size
learning_rate = config.lr_schedule.learning_rate if config.device_target == "Ascend" else 1.0
lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
training_steps=dataset.get_dataset_size()*args.epoch_size,
training_steps=dataset.get_dataset_size()*config.epoch_size,
learning_rate=learning_rate,
warmup_steps=cfg.lr_schedule.warmup_steps,
warmup_steps=config.lr_schedule.warmup_steps,
hidden_size=hidden_size,
start_decay_step=cfg.lr_schedule.start_decay_step,
min_lr=cfg.lr_schedule.min_lr), mstype.float32)
start_decay_step=config.lr_schedule.start_decay_step,
min_lr=config.lr_schedule.min_lr), mstype.float32)
if args.device_target == "GPU" and cfg.transformer_network == "large":
optimizer = Adam(netwithloss.trainable_params(), lr, beta2=cfg.optimizer_adam_beta2)
if config.device_target == "GPU" and config.transformer_network == "large":
optimizer = Adam(netwithloss.trainable_params(), lr, beta2=config.optimizer_adam_beta2)
else:
optimizer = Adam(netwithloss.trainable_params(), lr)
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(rank_id=rank_id)]
if args.enable_save_ckpt == "true":
if config.enable_save_ckpt == "true":
if device_num == 1 or (device_num > 1 and rank_id == 0):
if args.device_target == "Ascend":
ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
keep_checkpoint_max=args.save_checkpoint_num)
if config.device_target == "Ascend":
ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps,
keep_checkpoint_max=config.save_checkpoint_num)
else:
ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset.get_dataset_size(),
keep_checkpoint_max=args.save_checkpoint_num)
keep_checkpoint_max=config.save_checkpoint_num)
ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=save_ckpt_path, config=ckpt_config)
callbacks.append(ckpoint_cb)
if args.enable_lossscale == "true":
scale_manager = DynamicLossScaleManager(init_loss_scale=cfg.init_loss_scale_value,
scale_factor=cfg.scale_factor,
scale_window=cfg.scale_window)
if config.enable_lossscale == "true":
scale_manager = DynamicLossScaleManager(init_loss_scale=config.init_loss_scale_value,
scale_factor=config.scale_factor,
scale_window=config.scale_window)
update_cell = scale_manager.get_update_cell()
if args.accumulation_steps > 1:
if config.accumulation_steps > 1:
netwithgrads = TransformerTrainAccumulationAllReducePostWithLossScaleCell(netwithloss, optimizer,
update_cell,
args.accumulation_steps)
config.accumulation_steps)
else:
netwithgrads = TransformerTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
scale_update_cell=update_cell)
@ -217,7 +199,7 @@ def run_transformer_train():
netwithgrads.set_train(True)
model = Model(netwithgrads)
model.train(args.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=False)
model.train(config.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=False)
if __name__ == '__main__':

View File

@ -0,0 +1,281 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Transformer beam search module."""
import numpy as np
import mindspore.common.dtype as mstype
import mindspore.nn as nn
from mindspore.ops import operations as P
from mindspore.common.tensor import Tensor
INF = 1. * 1e9
class LengthPenalty(nn.Cell):
"""
Normalize scores of translations according to their length.
Args:
weight (float): Weight of length penalty. Default: 1.0.
compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
"""
def __init__(self,
weight=1.0,
compute_type=mstype.float32):
super(LengthPenalty, self).__init__()
self.weight = weight
self.add = P.Add()
self.pow = P.Pow()
self.div = P.RealDiv()
self.cast = P.Cast()
self.five = Tensor(5.0, mstype.float32)
self.six = Tensor(6.0, mstype.float32)
def construct(self, length_tensor):
length_tensor = self.cast(length_tensor, mstype.float32)
output = self.add(length_tensor, self.five)
output = self.div(output, self.six)
output = self.pow(output, self.weight)
return output
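# construct implements the GNMT-style length penalty
#     lp(length) = ((5 + length) / 6) ** weight
# so scores of longer hypotheses are divided by a larger normalizer.
# Illustrative plain-Python check of the same arithmetic (not used by the model):
def _length_penalty_check(length, weight=1.0):
    return ((5.0 + length) / 6.0) ** weight
assert _length_penalty_check(1) == 1.0 and _length_penalty_check(13) == 3.0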
class TileBeam(nn.Cell):
"""
TileBeam.
Args:
beam_width (int): beam width setting. Default: 4.
compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
"""
def __init__(self,
beam_width,
compute_type=mstype.float32):
super(TileBeam, self).__init__()
self.beam_width = beam_width
self.expand = P.ExpandDims()
self.tile = P.Tile()
self.reshape = P.Reshape()
self.shape = P.Shape()
def construct(self, input_tensor):
"""
input_tensor: shape [batch, dim1, dim2]
output_tensor: shape [batch*beam, dim1, dim2]
"""
shape = self.shape(input_tensor)
input_tensor = self.expand(input_tensor, 1)
tile_shape = (1,) + (self.beam_width,)
for _ in range(len(shape)-1):
tile_shape = tile_shape + (1,)
output = self.tile(input_tensor, tile_shape)
out_shape = (shape[0]*self.beam_width,) + shape[1:]
output = self.reshape(output, out_shape)
return output
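# Illustrative NumPy equivalent (not used by the model): TileBeam repeats each
# batch item beam_width times along a new axis, then folds that axis into the
# batch dimension:
#   [batch, dim1, dim2] -> [batch, 1, dim1, dim2] -> [batch, beam, dim1, dim2]
#                       -> [batch*beam, dim1, dim2]
def _tile_beam_numpy(x, beam_width):
    y = np.expand_dims(x, 1)
    y = np.tile(y, (1, beam_width) + (1,) * (x.ndim - 1))
    return y.reshape((x.shape[0] * beam_width,) + x.shape[1:])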
class Mod(nn.Cell):
"""
Mod function.
Args:
compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
"""
def __init__(self,
compute_type=mstype.float32):
super(Mod, self).__init__()
self.compute_type = compute_type
self.floor_div = P.FloorDiv()
self.sub = P.Sub()
self.multiply = P.Mul()
def construct(self, input_x, input_y):
x = self.floor_div(input_x, input_y)
x = self.multiply(x, input_y)
x = self.sub(input_x, x)
return x
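# Mod rebuilds the remainder from graph-mode primitives,
#   x mod y = x - floor(x / y) * y,
# composing P.FloorDiv, P.Mul and P.Sub where Python's % operator is not
# available. Plain-Python check: 17 - (17 // 5) * 5 == 17 % 5 == 2.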
class BeamSearchDecoder(nn.Cell):
"""
Beam search decoder.
Args:
batch_size (int): Batch size of input dataset.
seq_length (int): Length of input sequence.
vocab_size (int): Size of vocabulary.
decoder (:class:`TransformerDecoderStep`): Decoder module.
beam_width (int): beam width setting. Default: 4.
length_penalty_weight (float): Weight of length penalty. Default: 1.0.
max_decode_length (int): max decode length. Default: 128.
sos_id (int): Id of sequence start token. Default: 1.
eos_id (int): Id of sequence end token. Default: 2.
compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
"""
def __init__(self,
batch_size,
seq_length,
vocab_size,
decoder,
beam_width=4,
length_penalty_weight=1.0,
max_decode_length=128,
sos_id=1,
eos_id=2,
compute_type=mstype.float32):
super(BeamSearchDecoder, self).__init__(auto_prefix=False)
self.seq_length = seq_length
self.batch_size = batch_size
self.vocab_size = vocab_size
self.beam_width = beam_width
self.length_penalty_weight = length_penalty_weight
self.max_decode_length = max_decode_length
self.decoder = decoder
self.add = P.Add()
self.expand = P.ExpandDims()
self.reshape = P.Reshape()
self.shape_flat = (-1,)
self.shape = P.Shape()
self.zero_tensor = Tensor(np.zeros([batch_size, beam_width]), mstype.float32)
self.ninf_tensor = Tensor(np.full([batch_size, beam_width], -INF), mstype.float32)
self.select = P.Select()
self.flat_shape = (batch_size, beam_width * vocab_size)
self.topk = P.TopK(sorted=True)
self.floor_div = P.FloorDiv()
self.vocab_size_tensor = Tensor(self.vocab_size, mstype.int32)
self.real_div = P.RealDiv()
self.mod = Mod()
self.equal = P.Equal()
self.eos_ids = Tensor(np.full([batch_size, beam_width], eos_id), mstype.int32)
beam_ids = np.tile(np.arange(beam_width).reshape((1, beam_width)), [batch_size, 1])
self.beam_ids = Tensor(beam_ids, mstype.int32)
batch_ids = np.arange(batch_size*beam_width).reshape((batch_size, beam_width)) // beam_width
self.batch_ids = Tensor(batch_ids, mstype.int32)
self.concat = P.Concat(axis=-1)
self.gather_nd = P.GatherNd()
self.greater_equal = P.GreaterEqual()
self.sub = P.Sub()
self.cast = P.Cast()
self.zeroslike = P.ZerosLike()
# init inputs and states
self.start_ids = Tensor(np.full([batch_size * beam_width, 1], sos_id), mstype.int32)
self.init_seq = Tensor(np.full([batch_size, beam_width, 1], sos_id), mstype.int32)
init_scores = np.tile(np.array([[0.] + [-INF]*(beam_width-1)]), [batch_size, 1])
self.init_scores = Tensor(init_scores, mstype.float32)
        self.init_finished = Tensor(np.zeros([batch_size, beam_width], dtype=np.bool_))
self.init_length = Tensor(np.zeros([batch_size, beam_width], dtype=np.int32))
self.length_penalty = LengthPenalty(weight=length_penalty_weight)
self.one = Tensor(1, mstype.int32)
def one_step(self, cur_input_ids, enc_states, enc_attention_mask, state_log_probs,
state_seq, state_finished, state_length):
"""
One step for decode
"""
log_probs = self.decoder(cur_input_ids, enc_states, enc_attention_mask, self.seq_length)
log_probs = self.reshape(log_probs, (self.batch_size, self.beam_width, self.vocab_size))
# select topk indices
total_log_probs = self.add(log_probs, self.expand(state_log_probs, -1))
# mask finished beams
mask_tensor = self.select(state_finished, self.ninf_tensor, self.zero_tensor)
total_log_probs = self.add(total_log_probs, self.expand(mask_tensor, -1))
# reshape scores to [batch, beam*vocab]
flat_scores = self.reshape(total_log_probs, self.flat_shape)
# select topk
topk_scores, topk_indices = self.topk(flat_scores, self.beam_width)
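        # recover beam indices (topk_indices // vocab_size) and word indices
        # (topk_indices % vocab_size) from the flattened top-k indices; the loop
        # below emulates integer division with ops supported in graph mode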
temp = topk_indices
beam_indices = self.zeroslike(topk_indices)
for _ in range(self.beam_width - 1):
temp = self.sub(temp, self.vocab_size_tensor)
res = self.cast(self.greater_equal(temp, 0), mstype.int32)
beam_indices = beam_indices + res
word_indices = topk_indices - beam_indices * self.vocab_size_tensor
#======================================================================
# mask finished indices
beam_indices = self.select(state_finished, self.beam_ids, beam_indices)
word_indices = self.select(state_finished, self.eos_ids, word_indices)
topk_scores = self.select(state_finished, state_log_probs, topk_scores)
###### put finished sequences to the end
# sort according to scores with -inf for finished beams
tmp_log_probs = self.select(
self.equal(word_indices, self.eos_ids),
self.ninf_tensor,
topk_scores)
_, tmp_indices = self.topk(tmp_log_probs, self.beam_width)
# update
tmp_gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(tmp_indices, -1)))
beam_indices = self.gather_nd(beam_indices, tmp_gather_indices)
word_indices = self.gather_nd(word_indices, tmp_gather_indices)
topk_scores = self.gather_nd(topk_scores, tmp_gather_indices)
###### generate new beam_search states
# gather indices for selecting alive beams
gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(beam_indices, -1)))
# length add 1 if not finished in the previous step
length_add = self.add(state_length, self.one)
state_length = self.select(state_finished, state_length, length_add)
state_length = self.gather_nd(state_length, gather_indices)
# concat seq
seq = self.gather_nd(state_seq, gather_indices)
state_seq = self.concat((seq, self.expand(word_indices, -1)))
# new finished flag and log_probs
state_finished = self.equal(word_indices, self.eos_ids)
state_log_probs = topk_scores
###### generate new inputs and decoder states
cur_input_ids = self.reshape(state_seq, (self.batch_size*self.beam_width, -1))
return cur_input_ids, state_log_probs, state_seq, state_finished, state_length
def construct(self, enc_states, enc_attention_mask):
"""Get beam search result."""
cur_input_ids = self.start_ids
# beam search states
state_log_probs = self.init_scores
state_seq = self.init_seq
state_finished = self.init_finished
state_length = self.init_length
for _ in range(self.max_decode_length):
# run one step decoder to get outputs of the current step
# shape [batch*beam, 1, vocab]
cur_input_ids, state_log_probs, state_seq, state_finished, state_length = self.one_step(
cur_input_ids, enc_states, enc_attention_mask, state_log_probs, state_seq, state_finished, state_length)
        # compute length-penalty factors
        penalty_len = self.length_penalty(state_length)
        # normalize scores by sequence length
        log_probs = self.real_div(state_log_probs, penalty_len)
# sort according to scores
_, top_beam_indices = self.topk(log_probs, self.beam_width)
gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(top_beam_indices, -1)))
# sort sequence
predicted_ids = self.gather_nd(state_seq, gather_indices)
# take the first one
        predicted_ids = predicted_ids[:, 0:1, :]
return predicted_ids
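# Reference sketch (added for illustration, not part of the original file): the
# flatten-and-top-k bookkeeping of one_step in plain numpy, assuming no beam has
# finished yet. The flattened top-k indices split into a beam index (which
# hypothesis to extend) and a word index (which token extends it).
def _topk_split_reference(total_log_probs, beam_width):
    """total_log_probs: np.ndarray [batch, beam, vocab] -> (scores, beam_idx, word_idx)."""
    batch, _, vocab = total_log_probs.shape
    flat = total_log_probs.reshape(batch, -1)                  # [batch, beam * vocab]
    topk_indices = np.argsort(-flat, axis=-1)[:, :beam_width]  # sorted, best first
    topk_scores = np.take_along_axis(flat, topk_indices, axis=-1)
    beam_indices = topk_indices // vocab   # same result as the subtraction loop above
    word_indices = topk_indices % vocab    # same result as Mod
    return topk_scores, beam_indices, word_indices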

View File

@ -1,4 +1,4 @@
# Copyright 2020 Huawei Technologies Co., Ltd
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

View File

@ -0,0 +1,58 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Data operations, will be used in train.py."""
import mindspore.common.dtype as mstype
import mindspore.dataset as de
import mindspore.dataset.transforms.c_transforms as deC
from .config import transformer_net_cfg, transformer_net_cfg_gpu
de.config.set_seed(1)
def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle="true", dataset_path=None,
bucket_boundaries=None, device_target="Ascend"):
"""create dataset"""
def batch_per_bucket(bucket_len, dataset_path):
dataset_path = dataset_path + "_" + str(bucket_len) + "_00"
ds = de.MindDataset(dataset_path,
columns_list=["source_eos_ids", "source_eos_mask",
"target_sos_ids", "target_sos_mask",
"target_eos_ids", "target_eos_mask"],
shuffle=(do_shuffle == "true"), num_shards=rank_size, shard_id=rank_id)
type_cast_op = deC.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids")
ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask")
ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids")
ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask")
ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids")
ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask")
# apply batch operations
if device_target == "Ascend":
ds = ds.batch(transformer_net_cfg.batch_size, drop_remainder=True)
else:
ds = ds.batch(transformer_net_cfg_gpu.batch_size, drop_remainder=True)
ds = ds.repeat(epoch_count)
return ds
for i, _ in enumerate(bucket_boundaries):
bucket_len = bucket_boundaries[i]
ds_per = batch_per_bucket(bucket_len, dataset_path)
if i == 0:
ds = ds_per
else:
ds = ds + ds_per
ds = ds.shuffle(ds.get_dataset_size())
ds.channel_name = 'transformer'
return ds
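# Usage sketch (not part of the original file; the dataset path is hypothetical):
#
#   ds = create_transformer_dataset(epoch_count=1,
#                                   dataset_path="/data/transformer/train-mindrecord",
#                                   bucket_boundaries=[16, 32, 48, 64, 128])
#
# This expects one MindRecord file per bucket, named like
# "/data/transformer/train-mindrecord_16_00", matching the suffix added in
# batch_per_bucket above.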

View File

@ -1,4 +1,4 @@
# Copyright 2020 Huawei Technologies Co., Ltd
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

View File

@ -0,0 +1,52 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Learning rate utilities."""
def linear_warmup(warmup_steps, current_step):
return min([1.0, float(current_step)/float(warmup_steps)])
def rsqrt_decay(warmup_steps, current_step):
return float(max([current_step, warmup_steps])) ** -0.5
def rsqrt_hidden(hidden_size):
return float(hidden_size) ** -0.5
def create_dynamic_lr(schedule, training_steps, learning_rate, warmup_steps, hidden_size,
start_decay_step=0, min_lr=0.):
"""
Generate dynamic learning rate.
"""
if start_decay_step < warmup_steps:
start_decay_step = warmup_steps
lr = []
for current_step in range(1, training_steps+1):
cur_lr = 1.0
for name in schedule.split("*"):
if name == "constant":
cur_lr *= float(learning_rate)
elif name == "rsqrt_hidden":
cur_lr *= rsqrt_hidden(hidden_size)
elif name == "linear_warmup":
cur_lr *= linear_warmup(warmup_steps, current_step)
elif name == "rsqrt_decay":
cur_lr *= rsqrt_decay(warmup_steps, current_step-start_decay_step+warmup_steps)
else:
raise ValueError("unknown learning rate schedule")
if warmup_steps < current_step < start_decay_step:
cur_lr = lr[-1]
if current_step > warmup_steps:
cur_lr = max([cur_lr, min_lr])
lr.append(cur_lr)
return lr
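# Example sketch (added for illustration; the argument values are typical, not
# prescribed by this file): the schedule string is a '*'-separated product of
# factors, so the classic Transformer schedule warms up linearly and then decays
# as 1/sqrt(step):
#
#   lr = create_dynamic_lr("constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
#                          training_steps=1000, learning_rate=2.0,
#                          warmup_steps=100, hidden_size=512)
#   # lr rises linearly over the first 100 steps, peaks, then decays ~ step**-0.5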

View File

@ -0,0 +1,47 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Convert ids to tokens."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import sys
import tokenization
# Explicitly set the encoding
sys.stdin = open(sys.stdin.fileno(), mode='r', encoding='utf-8', buffering=True)
sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=True)
def main():
    parser = argparse.ArgumentParser(
        description="Convert token ids back to text using the given vocabulary.")
parser.add_argument("--vocab_file", type=str, default="", required=True, help="vocab file path.")
args = parser.parse_args()
tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file)
for line in sys.stdin:
token_ids = [int(x) for x in line.strip().split()]
tokens = tokenizer.convert_ids_to_tokens(token_ids)
sent = " ".join(tokens)
sent = sent.split("<s>")[-1]
sent = sent.split("</s>")[0]
print(sent.strip())
if __name__ == "__main__":
main()
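# Usage sketch (file names are hypothetical): the script reads whitespace-separated
# token ids from stdin, strips the <s> prefix and </s> suffix, and prints the
# detokenized sentence, e.g.
#
#   cat output_ids.txt | python process_output.py --vocab_file vocab.bpe.32000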

View File

@ -0,0 +1,158 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Tokenization utilities."""
import sys
import collections
import unicodedata
def convert_to_printable(text):
"""
Converts `text` to a printable coding format.
"""
if sys.version_info[0] == 3:
if isinstance(text, str):
return text
if isinstance(text, bytes):
return text.decode("utf-8", "ignore")
raise ValueError("Only support type `str` or `bytes`, while text type is `%s`" % (type(text)))
if sys.version_info[0] == 2:
if isinstance(text, str):
return text
if isinstance(text, unicode):
return text.encode("utf-8")
raise ValueError("Only support type `str` or `unicode`, while text type is `%s`" % (type(text)))
raise ValueError("Only supported when running on Python2 or Python3.")
def convert_to_unicode(text):
"""
Converts `text` to Unicode format.
"""
if sys.version_info[0] == 3:
if isinstance(text, str):
return text
if isinstance(text, bytes):
return text.decode("utf-8", "ignore")
raise ValueError("Only support type `str` or `bytes`, while text type is `%s`" % (type(text)))
if sys.version_info[0] == 2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
if isinstance(text, unicode):
return text
raise ValueError("Only support type `str` or `unicode`, while text type is `%s`" % (type(text)))
raise ValueError("Only supported when running on Python2 or Python3.")
def load_vocab_file(vocab_file):
"""
    Loads a vocabulary file and turns it into a {token: id} dictionary.
"""
vocab_dict = collections.OrderedDict()
index = 0
with open(vocab_file, "r") as vocab:
while True:
token = convert_to_unicode(vocab.readline())
if not token:
break
token = token.strip()
vocab_dict[token] = index
index += 1
return vocab_dict
def convert_by_vocab_dict(vocab_dict, items):
"""
Converts a sequence of [tokens|ids] according to the vocab dict.
"""
output = []
for item in items:
if item in vocab_dict:
output.append(vocab_dict[item])
else:
output.append(vocab_dict["<unk>"])
return output
class WhiteSpaceTokenizer():
"""
Whitespace tokenizer.
"""
def __init__(self, vocab_file):
self.vocab_dict = load_vocab_file(vocab_file)
self.inv_vocab_dict = {index: token for token, index in self.vocab_dict.items()}
def _is_whitespace_char(self, char):
"""
Checks if it is a whitespace character(regard "\t", "\n", "\r" as whitespace here).
"""
if char in (" ", "\t", "\n", "\r"):
return True
uni = unicodedata.category(char)
if uni == "Zs":
return True
return False
def _is_control_char(self, char):
"""
Checks if it is a control character.
"""
if char in ("\t", "\n", "\r"):
return False
uni = unicodedata.category(char)
if uni in ("Cc", "Cf"):
return True
return False
def _clean_text(self, text):
"""
Remove invalid characters and cleanup whitespace.
"""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or self._is_control_char(char):
continue
if self._is_whitespace_char(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
def _whitespace_tokenize(self, text):
"""
Clean whitespace and split text into tokens.
"""
text = text.strip()
if not text:
tokens = []
else:
tokens = text.split()
return tokens
def tokenize(self, text):
"""
Tokenizes text.
"""
text = convert_to_unicode(text)
text = self._clean_text(text)
tokens = self._whitespace_tokenize(text)
return tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab_dict(self.vocab_dict, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab_dict(self.inv_vocab_dict, ids)
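# Round-trip sketch (illustrative; "vocab.txt" is a hypothetical path):
#
#   tokenizer = WhiteSpaceTokenizer(vocab_file="vocab.txt")
#   tokens = tokenizer.tokenize("Hello world")        # ['Hello', 'world']
#   ids = tokenizer.convert_tokens_to_ids(tokens)     # out-of-vocab tokens map to <unk>
#   tokens_back = tokenizer.convert_ids_to_tokens(ids)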

View File

@ -0,0 +1,472 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Transformer for training."""
import numpy as np
from mindspore.common.initializer import initializer
import mindspore.nn as nn
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter
from mindspore.common import dtype as mstype
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
from mindspore.communication.management import get_group_size
from mindspore.context import ParallelMode
from mindspore import context
from .transformer_model import TransformerModel
GRADIENT_CLIP_TYPE = 1
GRADIENT_CLIP_VALUE = 5.0
clip_grad = C.MultitypeFuncGraph("clip_grad")
@clip_grad.register("Number", "Number", "Tensor")
def _clip_grad(clip_type, clip_value, grad):
"""
Clip gradients.
Inputs:
clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
clip_value (float): Specifies how much to clip.
grad (tuple[Tensor]): Gradients.
Outputs:
tuple[Tensor], clipped gradients.
"""
if clip_type not in (0, 1):
return grad
dt = F.dtype(grad)
if clip_type == 0:
new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt),
F.cast(F.tuple_to_array((clip_value,)), dt))
else:
new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
return new_grad
class TransformerTrainingLoss(nn.Cell):
"""
Provide transformer training loss.
Args:
config (TransformerConfig): The config of Transformer.
Returns:
Tensor, total loss.
"""
def __init__(self, config):
super(TransformerTrainingLoss, self).__init__(auto_prefix=False)
self.vocab_size = config.vocab_size
self.onehot = P.OneHot()
self.on_value = Tensor(float(1 - config.label_smoothing), mstype.float32)
self.off_value = Tensor(config.label_smoothing / float(self.vocab_size - 1), mstype.float32)
self.reduce_sum = P.ReduceSum()
self.reduce_mean = P.ReduceMean()
self.reshape = P.Reshape()
self.last_idx = (-1,)
self.flatten = P.Flatten()
self.neg = P.Neg()
self.cast = P.Cast()
self.batch_size = config.batch_size
def construct(self, prediction_scores, label_ids, label_weights, seq_length):
"""Defines the computation performed."""
flat_shape = (self.batch_size * seq_length,)
label_ids = self.reshape(label_ids, flat_shape)
label_weights = self.cast(self.reshape(label_weights, flat_shape), mstype.float32)
one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value)
per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx))
numerator = self.reduce_sum(label_weights * per_example_loss, ())
denominator = self.reduce_sum(label_weights, ()) + \
self.cast(F.tuple_to_array((1e-5,)), mstype.float32)
loss = numerator / denominator
return loss
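# Reference sketch (mine, not part of the original file): the loss above is
# label-smoothed cross entropy masked by label_weights, assuming prediction_scores
# are already log-probabilities. A numpy equivalent over flattened examples:
def _smoothed_loss_reference(log_probs, label_ids, label_weights, smoothing, vocab_size):
    """log_probs: [N, vocab]; label_ids, label_weights: [N]."""
    on_value = 1.0 - smoothing
    off_value = smoothing / float(vocab_size - 1)
    one_hot = np.full(log_probs.shape, off_value, dtype=np.float32)
    one_hot[np.arange(log_probs.shape[0]), label_ids] = on_value
    per_example = -np.sum(log_probs * one_hot, axis=-1)
    return np.sum(label_weights * per_example) / (np.sum(label_weights) + 1e-5)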
class TransformerNetworkWithLoss(nn.Cell):
"""
Provide transformer training loss through network.
Args:
config (TransformerConfig): The config of Transformer.
is_training (bool): Specifies whether to use the training mode.
use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. Default: False.
Returns:
Tensor, the loss of the network.
"""
def __init__(self, config, is_training, use_one_hot_embeddings=False):
super(TransformerNetworkWithLoss, self).__init__(auto_prefix=False)
self.transformer = TransformerModel(config, is_training, use_one_hot_embeddings)
self.loss = TransformerTrainingLoss(config)
self.cast = P.Cast()
self.shape = P.Shape()
def construct(self,
source_ids,
source_mask,
target_ids,
target_mask,
label_ids,
label_weights):
"""Transformer network with loss."""
prediction_scores = self.transformer(source_ids, source_mask, target_ids, target_mask)
seq_length = self.shape(source_ids)[1]
total_loss = self.loss(prediction_scores, label_ids, label_weights, seq_length)
return self.cast(total_loss, mstype.float32)
class TransformerTrainOneStepCell(nn.TrainOneStepCell):
"""
Encapsulation class of transformer network training.
    Appends an optimizer to the training network, after which the construct
    function can be called to create the backward graph.
Args:
network (Cell): The training network. Note that loss function should have been added.
optimizer (Optimizer): Optimizer for updating the weights.
sens (Number): The adjust parameter. Default: 1.0.
"""
def __init__(self, network, optimizer, sens=1.0):
super(TransformerTrainOneStepCell, self).__init__(network, optimizer, sens)
self.cast = P.Cast()
self.hyper_map = C.HyperMap()
def set_sens(self, value):
self.sens = value
def construct(self,
source_eos_ids,
source_eos_mask,
target_sos_ids,
target_sos_mask,
target_eos_ids,
target_eos_mask,):
"""Defines the computation performed."""
source_ids = source_eos_ids
source_mask = source_eos_mask
target_ids = target_sos_ids
target_mask = target_sos_mask
label_ids = target_eos_ids
label_weights = target_eos_mask
weights = self.weights
loss = self.network(source_ids,
source_mask,
target_ids,
target_mask,
label_ids,
label_weights)
grads = self.grad(self.network, weights)(source_ids,
source_mask,
target_ids,
target_mask,
label_ids,
label_weights,
self.cast(F.tuple_to_array((self.sens,)),
mstype.float32))
grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
# apply grad reducer on grads
grads = self.grad_reducer(grads)
succ = self.optimizer(grads)
return F.depend(loss, succ)
grad_scale = C.MultitypeFuncGraph("grad_scale")
reciprocal = P.Reciprocal()
@grad_scale.register("Tensor", "Tensor")
def tensor_grad_scale(scale, grad):
return grad * F.cast(reciprocal(scale), F.dtype(grad))
_grad_overflow = C.MultitypeFuncGraph("_grad_overflow")
grad_overflow = P.FloatStatus()
@_grad_overflow.register("Tensor")
def _tensor_grad_overflow(grad):
return grad_overflow(grad)
class TransformerTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
"""
Encapsulation class of Transformer network training.
    Appends an optimizer to the training network, after which the construct
    function can be called to create the backward graph.
Args:
network (Cell): The training network. Note that loss function should have been added.
optimizer (Optimizer): Optimizer for updating the weights.
scale_update_cell (Cell): Cell to do the loss scale. Default: None.
"""
def __init__(self, network, optimizer, scale_update_cell=None):
super(TransformerTrainOneStepWithLossScaleCell, self).__init__(network, optimizer, scale_update_cell)
self.cast = P.Cast()
self.degree = 1
if self.reducer_flag:
self.degree = get_group_size()
self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
self.loss_scale = None
self.loss_scaling_manager = scale_update_cell
if scale_update_cell:
self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))
def construct(self,
source_eos_ids,
source_eos_mask,
target_sos_ids,
target_sos_mask,
target_eos_ids,
target_eos_mask,
sens=None):
"""Defines the computation performed."""
source_ids = source_eos_ids
source_mask = source_eos_mask
target_ids = target_sos_ids
target_mask = target_sos_mask
label_ids = target_eos_ids
label_weights = target_eos_mask
weights = self.weights
loss = self.network(source_ids,
source_mask,
target_ids,
target_mask,
label_ids,
label_weights)
if sens is None:
scaling_sens = self.loss_scale
else:
scaling_sens = sens
status, scaling_sens = self.start_overflow_check(loss, scaling_sens)
grads = self.grad(self.network, weights)(source_ids,
source_mask,
target_ids,
target_mask,
label_ids,
label_weights,
self.cast(scaling_sens,
mstype.float32))
# apply grad reducer on grads
grads = self.grad_reducer(grads)
grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
cond = self.get_overflow_status(status, grads)
overflow = cond
if sens is None:
overflow = self.loss_scaling_manager(self.loss_scale, cond)
if overflow:
succ = False
else:
succ = self.optimizer(grads)
ret = (loss, cond, scaling_sens)
return F.depend(ret, succ)
cast = P.Cast()
add_grads = C.MultitypeFuncGraph("add_grads")
@add_grads.register("Tensor", "Tensor")
def _add_grads(accu_grad, grad):
return accu_grad + cast(grad, mstype.float32)
update_accu_grads = C.MultitypeFuncGraph("update_accu_grads")
@update_accu_grads.register("Tensor", "Tensor")
def _update_accu_grads(accu_grad, grad):
succ = True
return F.depend(succ, F.assign(accu_grad, cast(grad, mstype.float32)))
accumulate_accu_grads = C.MultitypeFuncGraph("accumulate_accu_grads")
@accumulate_accu_grads.register("Tensor", "Tensor")
def _accumulate_accu_grads(accu_grad, grad):
succ = True
return F.depend(succ, F.assign_add(accu_grad, cast(grad, mstype.float32)))
zeroslike = P.ZerosLike()
reset_accu_grads = C.MultitypeFuncGraph("reset_accu_grads")
@reset_accu_grads.register("Tensor")
def _reset_accu_grads(accu_grad):
succ = True
return F.depend(succ, F.assign(accu_grad, zeroslike(accu_grad)))
class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
"""
    Encapsulation class of Transformer network training.
    Appends an optimizer to the training network, after which the construct
    function can be called to create the backward graph.
    To mimic a larger batch size, gradients are accumulated N times before each weight update.
    In distributed mode, allreduce is applied only at the weight-update step,
    i.e. the sub-step after gradients have been accumulated N times.
Args:
network (Cell): The training network. Note that loss function should have been added.
optimizer (Optimizer): Optimizer for updating the weights.
scale_update_cell (Cell): Cell to do the loss scale. Default: None.
accumulation_steps (int): Number of accumulation steps before gradient update. The global batch size =
batch_size * accumulation_steps. Default: 1.
"""
def __init__(self, network, optimizer, scale_update_cell=None, accumulation_steps=8, enable_global_norm=False):
super(TransformerTrainAccumulationAllReducePostWithLossScaleCell, self).__init__(auto_prefix=False)
self.network = network
self.network.set_grad()
self.weights = optimizer.parameters
self.optimizer = optimizer
self.accumulation_steps = accumulation_steps
self.enable_global_norm = enable_global_norm
self.one = Tensor(np.array([1]).astype(np.int32))
self.zero = Tensor(np.array([0]).astype(np.int32))
self.local_step = Parameter(initializer(0, [1], mstype.int32))
self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros')
self.accu_overflow = Parameter(initializer(0, [1], mstype.int32))
self.accu_loss = Parameter(initializer(0, [1], mstype.float32))
self.grad = C.GradOperation(get_by_list=True, sens_param=True)
self.reducer_flag = False
self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
self.reducer_flag = True
self.grad_reducer = F.identity
self.degree = 1
if self.reducer_flag:
self.degree = get_group_size()
self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
self.overflow_reducer = F.identity
if self.is_distributed:
self.overflow_reducer = P.AllReduce()
self.cast = P.Cast()
self.alloc_status = P.NPUAllocFloatStatus()
self.get_status = P.NPUGetFloatStatus()
self.clear_status = P.NPUClearFloatStatus()
self.reduce_sum = P.ReduceSum(keep_dims=False)
self.base = Tensor(1, mstype.float32)
self.less_equal = P.LessEqual()
self.logical_or = P.LogicalOr()
self.not_equal = P.NotEqual()
self.select = P.Select()
self.reshape = P.Reshape()
self.hyper_map = C.HyperMap()
self.loss_scale = None
self.loss_scaling_manager = scale_update_cell
if scale_update_cell:
self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))
def construct(self,
source_eos_ids,
source_eos_mask,
target_sos_ids,
target_sos_mask,
target_eos_ids,
target_eos_mask,
sens=None):
"""Defines the computation performed."""
source_ids = source_eos_ids
source_mask = source_eos_mask
target_ids = target_sos_ids
target_mask = target_sos_mask
label_ids = target_eos_ids
label_weights = target_eos_mask
weights = self.weights
loss = self.network(source_ids,
source_mask,
target_ids,
target_mask,
label_ids,
label_weights)
if sens is None:
scaling_sens = self.loss_scale
else:
scaling_sens = sens
        # allocating and clearing the float status must happen right before the grad operation
init = self.alloc_status()
init = F.depend(init, loss)
clear_status = self.clear_status(init)
scaling_sens = F.depend(scaling_sens, clear_status)
# update accumulation parameters
is_accu_step = self.not_equal(self.local_step, self.accumulation_steps)
self.local_step = self.select(is_accu_step, self.local_step + self.one, self.one)
self.accu_loss = self.select(is_accu_step, self.accu_loss + loss, loss)
mean_loss = self.accu_loss / self.local_step
is_accu_step = self.not_equal(self.local_step, self.accumulation_steps)
grads = self.grad(self.network, weights)(source_ids,
source_mask,
target_ids,
target_mask,
label_ids,
label_weights,
self.cast(scaling_sens,
mstype.float32))
accu_succ = self.hyper_map(accumulate_accu_grads, self.accu_grads, grads)
mean_loss = F.depend(mean_loss, accu_succ)
init = F.depend(init, mean_loss)
get_status = self.get_status(init)
init = F.depend(init, get_status)
flag_sum = self.reduce_sum(init, (0,))
overflow = self.less_equal(self.base, flag_sum)
overflow = self.logical_or(self.not_equal(self.accu_overflow, self.zero), overflow)
accu_overflow = self.select(overflow, self.one, self.zero)
self.accu_overflow = self.select(is_accu_step, accu_overflow, self.zero)
if is_accu_step:
succ = False
else:
# apply grad reducer on grads
grads = self.grad_reducer(self.accu_grads)
scaling = scaling_sens * self.degree * self.accumulation_steps
grads = self.hyper_map(F.partial(grad_scale, scaling), grads)
if self.enable_global_norm:
grads = C.clip_by_global_norm(grads, 1.0, None)
else:
grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
accu_overflow = F.depend(accu_overflow, grads)
accu_overflow = self.overflow_reducer(accu_overflow)
overflow = self.less_equal(self.base, accu_overflow)
accu_succ = self.hyper_map(reset_accu_grads, self.accu_grads)
overflow = F.depend(overflow, accu_succ)
            overflow = self.reshape(overflow, ())
if sens is None:
overflow = self.loss_scaling_manager(self.loss_scale, overflow)
if overflow:
succ = False
else:
succ = self.optimizer(grads)
ret = (mean_loss, overflow, scaling_sens)
return F.depend(ret, succ)
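# Scaling sketch (illustrative): at the update step the accumulated gradient is
# rescaled by loss_scale * degree * accumulation_steps (see `scaling` above), so an
# N-step accumulated update numerically matches one large-batch step; e.g. with
# batch_size=32 and accumulation_steps=8 the mimicked global batch per device is 256.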

File diff suppressed because it is too large

View File

@ -0,0 +1,52 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Weight init utilities."""
import math
import numpy as np
from mindspore.common.tensor import Tensor
def _average_units(shape):
"""
Average shape dim.
"""
if not shape:
return 1.
if len(shape) == 1:
return float(shape[0])
if len(shape) == 2:
return float(shape[0] + shape[1]) / 2.
raise RuntimeError("not support shape.")
def weight_variable(shape):
scale_shape = shape
avg_units = _average_units(scale_shape)
scale = 1.0 / max(1., avg_units)
limit = math.sqrt(3.0 * scale)
values = np.random.uniform(-limit, limit, shape).astype(np.float32)
return Tensor(values)
def one_weight(shape):
ones = np.ones(shape).astype(np.float32)
return Tensor(ones)
def zero_weight(shape):
zeros = np.zeros(shape).astype(np.float32)
return Tensor(zeros)
def normal_weight(shape, num_units):
norm = np.random.normal(0.0, num_units**-0.5, shape).astype(np.float32)
return Tensor(norm)
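# Sketch (added for illustration): weight_variable draws from U(-limit, limit) with
# limit = sqrt(3 / avg_units), a uniform Glorot/Xavier-style init whose variance is
# 1 / avg_units. Quick check:
#
#   w = weight_variable((512, 512))   # limit = sqrt(3 / 512) ~= 0.0765
#   # w.asnumpy().std() is close to sqrt(1 / 512) ~= 0.0442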

View File

@ -27,12 +27,10 @@ from mindspore.train.callback import Callback
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC
from mindspore import context
from model_zoo.official.nlp.transformer.src.transformer_model import TransformerConfig
from model_zoo.official.nlp.transformer.src.transformer_for_train import TransformerNetworkWithLoss, \
TransformerTrainOneStepWithLossScaleCell
from model_zoo.official.nlp.transformer.src.config import cfg, transformer_net_cfg
from model_zoo.official.nlp.transformer.src.lr_schedule import create_dynamic_lr
from src.transformer_model import TransformerConfig
from src.transformer_for_train import TransformerNetworkWithLoss, TransformerTrainOneStepWithLossScaleCell
from src.config import cfg, transformer_net_cfg
from src.lr_schedule import create_dynamic_lr
from tests.st.model_zoo_tests import utils
DATA_DIR = ["/home/workspace/mindspore_dataset/transformer/test-mindrecord"]
@ -216,14 +214,14 @@ def test_transformer_export_mindir():
export_file = "transformer80_bs_0"
ckpt_path = os.path.join(utils.ckpt_root, "transformer/transformer_trained.ckpt")
print("ckpt_path:", ckpt_path)
old_list = ["'model_file': '/your/path/checkpoint_file'"]
new_list = ["'model_file': '{}'".format(ckpt_path)]
utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "src/eval_config.py"))
old_list = ["context.set_context(device_id=args.device_id)"]
old_list = ["model_file: './transformer/transformer_trained.ckpt'"]
new_list = ["model_file: '{}'".format(ckpt_path)]
utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "default_config_large.yaml"))
old_list = ["context.set_context(device_id=get_device_id())"]
new_list = ["context.set_context()"]
utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "export.py"))
exec_export_shell = "cd transformer; python -u export.py --file_name={}" \
" --file_format=MINDIR".format(export_file)
" --file_format=MINDIR --config_path=./default_config_large.yaml".format(export_file)
os.system(exec_export_shell)
assert os.path.exists(os.path.join(cur_model_path, "{}.mindir".format(export_file)))