cloud
This commit is contained in: parent d919560902, commit 2c4c157ae5
@@ -18,12 +18,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import ast
import collections
import logging
import numpy as np
import src.tokenization as tokenization
from src.model_utils.config import config
from mindspore.mindrecord import FileWriter


class SampleInstance():

@@ -121,33 +120,17 @@ def create_training_instance(source_words, target_words, max_seq_length, clip_to
    return instance


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True,
                        help='Input raw text file (or comma-separated list of files).')
    parser.add_argument("--output_file", type=str, required=True, help='Output MindRecord file.')
    parser.add_argument("--num_splits", type=int, default=16,
                        help='The MindRecord file will be split into the number of partitions.')
    parser.add_argument("--vocab_file", type=str, required=True,
                        help='The vocabulary file that the Transformer model was trained on.')
    parser.add_argument("--clip_to_max_len", type=bool, default=False,
                        help='clip sequences to maximum sequence length.')
    parser.add_argument("--max_seq_length", type=int, default=128, help='Maximum sequence length.')
    parser.add_argument("--bucket", type=ast.literal_eval, default=[16, 32, 48, 64, 128],
                        help='bucket sequence length')

    args = parser.parse_args()

    tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file)
    tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=config.vocab_file)

    input_files = []
    for input_pattern in args.input_file.split(","):
    for input_pattern in config.input_file.split(","):
        input_files.append(input_pattern)

    logging.info("*** Read from input files ***")
    for input_file in input_files:
        logging.info("  %s", input_file)

    output_file = args.output_file
    output_file = config.output_file
    logging.info("*** Write to output files ***")
    logging.info("  %s", output_file)

@@ -155,7 +138,7 @@ def main():
    total_read = 0

    feature_dict = {}
    for i in args.bucket:
    for i in config.bucket:
        feature_dict[i] = []

    for input_file in input_files:

@@ -174,17 +157,17 @@ def main():
            source_tokens = tokenizer.tokenize(source_line)
            target_tokens = tokenizer.tokenize(target_line)

            if len(source_tokens) >= args.max_seq_length or len(target_tokens) >= args.max_seq_length:
            if len(source_tokens) >= config.max_seq_length or len(target_tokens) >= config.max_seq_length:
                logging.info("ignore long sentence!")
                continue

            instance = create_training_instance(source_tokens, target_tokens, args.max_seq_length,
                                                clip_to_max_len=args.clip_to_max_len)
            instance = create_training_instance(source_tokens, target_tokens, config.max_seq_length,
                                                clip_to_max_len=config.clip_to_max_len)
            if instance is None:
                continue

            features, seq_max_bucket_length = get_instance_features(instance, tokenizer, args.max_seq_length,
                                                                    args.bucket)
            features, seq_max_bucket_length = get_instance_features(instance, tokenizer, config.max_seq_length,
                                                                    config.bucket)
            for key in feature_dict:
                if key == seq_max_bucket_length:
                    feature_dict[key].append(features)

@@ -200,12 +183,12 @@ def main():
            feature = features[feature_name]
            logging.info("%s: %s", feature_name, feature)

    for i in args.bucket:
        if args.num_splits == 1:
    for i in config.bucket:
        if config.num_splits == 1:
            output_file_name = output_file
        else:
            output_file_name = output_file + '_' + str(i) + '_'
        writer = FileWriter(output_file_name, args.num_splits)
        writer = FileWriter(output_file_name, config.num_splits)
        data_schema = {"source_sos_ids": {"type": "int64", "shape": [-1]},
                       "source_sos_mask": {"type": "int64", "shape": [-1]},
                       "source_eos_ids": {"type": "int64", "shape": [-1]},
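As a side note on the FileWriter usage above: a schema is registered once, then raw samples are written and committed. A minimal sketch with assumed toy data (the file name, shard count, and feature values here are illustrative, not from this commit):

    import numpy as np
    from mindspore.mindrecord import FileWriter

    # Toy example: write two variable-length records into a MindRecord
    # file split across 4 partitions, mirroring the schema above.
    data_schema = {"source_sos_ids": {"type": "int64", "shape": [-1]},
                   "source_sos_mask": {"type": "int64", "shape": [-1]}}

    writer = FileWriter("demo.mindrecord", 4)           # like FileWriter(output_file_name, config.num_splits)
    writer.add_schema(data_schema, "transformer demo")  # schema must be registered before writing
    samples = [{"source_sos_ids": np.array([1, 5, 9], dtype=np.int64),
                "source_sos_mask": np.array([1, 1, 1], dtype=np.int64)},
               {"source_sos_ids": np.array([1, 7], dtype=np.int64),
                "source_sos_mask": np.array([1, 1], dtype=np.int64)}]
    writer.write_raw_data(samples)                      # buffered write of a batch of dicts
    writer.commit()                                     # flush the index and finalize the files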
@@ -0,0 +1,133 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
data_url: ""
train_url: ""
checkpoint_url: ""
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
checkpoint_path: ''
device_target: Ascend
enable_profiling: False

# ==============================================================================
# config/cfg edict
transformer_network: 'base'
init_loss_scale_value: 1024
scale_factor: 2
scale_window: 2000
optimizer: 'Adam'
optimizer_adam_beta2: 0.997
# lr_schedule: edict({'learning_rate': 2.0, 'warmup_steps': 8000, 'start_decay_step': 16000, 'min_lr': 0.0,})

# transformer_net_cfg
batch_size: 96
seq_length: 128
vocab_size: 36560
hidden_size: 512
num_hidden_layers: 6
num_attention_heads: 8
intermediate_size: 2048
hidden_act: "relu"
hidden_dropout_prob: 0.2
attention_probs_dropout_prob: 0.2
max_position_embeddings: 128
initializer_range: 0.02
label_smoothing: 0.1
dtype: mstype.float32
compute_type: mstype.float16

# eval_config/cfg edict
data_file: '/cache/data'
model_file: './transformer/transformer_trained.ckpt'
output_file: './output_eval.txt'

# transformer_net_cfg
batch_size_ev: 1
hidden_dropout_prob_ev: 0.0
attention_probs_dropout_prob_ev: 0.0
beam_width: 4
max_decode_length: 80
length_penalty_weight: 1.0

# ==============================================================================
# train.py / Argparse init.
distribute: "false"
epoch_size: 52
device_id: 0
device_num: 1
enable_lossscale: "true"
do_shuffle: "true"
enable_save_ckpt: "true"
save_checkpoint_steps: 2500
save_checkpoint_num: 30
save_checkpoint_path: "./"
bucket_boundaries: [16, 32, 48, 64, 128]
accumulation_steps: 1

# export.py / eval_config - transformer export
file_name: "transformer"
file_format: 'AIR'

# postprocess / from eval_config
result_dir: "./result_Files"

# preprocess / from eval_config
result_path: "./preprocess_Result/"

# src/process_output.py "rescore nbest with smoothed sentence-level bleu."
vocab_file: ""

# create_data.py
input_file: ''
num_splits: 16
clip_to_max_len: False
max_seq_length: 128
bucket: [16, 32, 48, 64, 128]

---
# Config description for each option
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
data_path: 'Dataset path for local, it is better to use absolute path'
output_path: 'Training output path for local'
ann_file: 'Ann file, default is val.json.'

device_target: "device where the code will be implemented, default is Ascend"
checkpoint_path: "Checkpoint file path"
data_file: '/your/path/evaluation.mindrecord'
model_file: '/your/path/checkpoint_file'
output_file: './output_eval.txt'

distribute: "Run distribute, default is false."
epoch_size: "Epoch size, default is 52."
device_id: "Device id, default is 0."
device_num: "Use device nums, default is 1."
enable_lossscale: "Use lossscale or not, default is true."
do_shuffle: "Enable shuffle for dataset, default is true."
enable_save_ckpt: "Enable save checkpoint, default is true."
save_checkpoint_steps: "Save checkpoint steps, default is 2500."
save_checkpoint_num: "Save checkpoint numbers, default is 30."
save_checkpoint_path: "Save checkpoint file path"
bucket_boundaries: "sequence length for different bucket"
accumulation_steps: "Gradient accumulation steps, default is 1."

file_name: "output file name."
file_format: 'file format'
result_dir: "./result_Files"
result_path: "./preprocess_Result/"
vocab_file: "vocab file path."
input_file: 'Input raw text file (or comma-separated list of files).'
num_splits: 'The MindRecord file will be split into the number of partitions.'
clip_to_max_len: 'clip sequences to maximum sequence length.'
max_seq_length: 'Maximum sequence length.'
bucket: 'bucket sequence length'

---
device_target: ["Ascend", "GPU", "CPU"]
file_format: ["AIR", "ONNX", "MINDIR"]
distribute: ['true', 'false']
enable_lossscale: ['true', 'false']
do_shuffle: ['true', 'false']
enable_save_ckpt: ['true', 'false']
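Note the file's layout: three YAML documents separated by `---` lines — default values, per-key help text, and per-key allowed choices. A standalone sketch of how such a layout splits into documents (toy keys; only PyYAML assumed):

    import yaml

    # Toy three-document config in the same layout as the yaml above:
    # values, help descriptions, and allowed choices, separated by '---'.
    text = """\
    device_target: Ascend
    batch_size: 96
    ---
    device_target: 'device where the code will be implemented'
    batch_size: 'batch size for training'
    ---
    device_target: ["Ascend", "GPU", "CPU"]
    """

    values, helpers, choices = yaml.load_all(text, Loader=yaml.FullLoader)
    print(values["batch_size"])      # 96
    print(choices["device_target"])  # ['Ascend', 'GPU', 'CPU']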
@@ -0,0 +1,134 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
data_url: ""
train_url: ""
checkpoint_url: ""
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
checkpoint_path: ''
device_target: Ascend
enable_profiling: False

# ==============================================================================
# config/cfg edict
transformer_network: 'large'
init_loss_scale_value: 1024
scale_factor: 2
scale_window: 2000
optimizer: 'Adam'
optimizer_adam_beta2: 0.997
# lr_schedule: edict({'learning_rate': 2.0, 'warmup_steps': 8000, 'start_decay_step': 16000, 'min_lr': 0.0,})

# transformer_net_cfg
batch_size: 96
seq_length: 128
vocab_size: 36560
hidden_size: 1024
num_hidden_layers: 6
num_attention_heads: 16
intermediate_size: 4096
hidden_act: "relu"
hidden_dropout_prob: 0.2
attention_probs_dropout_prob: 0.2
max_position_embeddings: 128
initializer_range: 0.02
label_smoothing: 0.1
dtype: mstype.float32
compute_type: mstype.float16

# eval_config/cfg edict
data_file: '/cache/data'
model_file: './transformer/transformer_trained.ckpt'
output_file: './output_eval.txt'

# transformer_net_cfg
batch_size_ev: 1
hidden_dropout_prob_ev: 0.0
attention_probs_dropout_prob_ev: 0.0
beam_width: 4
max_decode_length: 80
length_penalty_weight: 1.0

# ==============================================================================
# train.py / Argparse init.
distribute: "false"
epoch_size: 52
device_id: 0
device_num: 1
enable_lossscale: "true"
do_shuffle: "true"
enable_save_ckpt: "true"
save_checkpoint_steps: 2500
save_checkpoint_num: 30
save_checkpoint_path: "./"
bucket_boundaries: [16, 32, 48, 64, 128]
accumulation_steps: 1

# export.py / eval_config - transformer export
file_name: "transformer"
file_format: 'AIR'

# postprocess / from eval_config
result_dir: "./result_Files"

# preprocess / from eval_config
result_path: "./preprocess_Result/"

# src/process_output.py "rescore nbest with smoothed sentence-level bleu."
vocab_file: ""

# create_data.py
input_file: ''
num_splits: 16
clip_to_max_len: False
max_seq_length: 128
bucket: [16, 32, 48, 64, 128]

---
# Config description for each option
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
data_path: 'Dataset path for local, it is better to use absolute path'
output_path: 'Training output path for local'
ann_file: 'Ann file, default is val.json.'

device_target: "device where the code will be implemented, default is Ascend"
checkpoint_path: "Checkpoint file path"
data_file: '/your/path/evaluation.mindrecord'
model_file: '/your/path/checkpoint_file'
output_file: './output_eval.txt'

distribute: "Run distribute, default is false."
epoch_size: "Epoch size, default is 52."
device_id: "Device id, default is 0."
device_num: "Use device nums, default is 1."
enable_lossscale: "Use lossscale or not, default is true."
do_shuffle: "Enable shuffle for dataset, default is true."
enable_save_ckpt: "Enable save checkpoint, default is true."
save_checkpoint_steps: "Save checkpoint steps, default is 2500."
save_checkpoint_num: "Save checkpoint numbers, default is 30."
save_checkpoint_path: "Save checkpoint file path"
bucket_boundaries: "sequence length for different bucket"
accumulation_steps: "Gradient accumulation steps, default is 1."

file_name: "output file name."
file_format: 'file format'
result_dir: "./result_Files"
result_path: "./preprocess_Result/"
vocab_file: "vocab file path."
input_file: 'Input raw text file (or comma-separated list of files).'
num_splits: 'The MindRecord file will be split into the number of partitions.'
clip_to_max_len: 'clip sequences to maximum sequence length.'
max_seq_length: 'Maximum sequence length.'
bucket: 'bucket sequence length'

---
device_target: ["Ascend", "GPU", "CPU"]
file_format: ["AIR", "ONNX", "MINDIR"]
distribute: ['true', 'false']
enable_lossscale: ['true', 'false']
do_shuffle: ['true', 'false']
enable_save_ckpt: ['true', 'false']
@@ -0,0 +1,134 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
data_url: ""
train_url: ""
checkpoint_url: ""
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
checkpoint_path: ''
device_target: GPU
enable_profiling: False

# ==============================================================================
# config/cfg edict
transformer_network: 'large'
init_loss_scale_value: 1024
scale_factor: 2
scale_window: 2000
optimizer: 'Adam'
optimizer_adam_beta2: 0.997
# lr_schedule: edict({'learning_rate': 2.0, 'warmup_steps': 8000, 'start_decay_step': 16000, 'min_lr': 0.0,})

# transformer_net_cfg_gpu
batch_size: 32
seq_length: 128
vocab_size: 36560
hidden_size: 1024
num_hidden_layers: 6
num_attention_heads: 16
intermediate_size: 4096
hidden_act: "relu"
hidden_dropout_prob: 0.2
attention_probs_dropout_prob: 0.2
max_position_embeddings: 128
initializer_range: 0.02
label_smoothing: 0.1
dtype: mstype.float32
compute_type: mstype.float16

# eval_config/cfg edict
data_file: '/cache/data'
model_file: './transformer/transformer_trained.ckpt'
output_file: './output_eval.txt'

# transformer_net_cfg
batch_size_ev: 1
hidden_dropout_prob_ev: 0.0
attention_probs_dropout_prob_ev: 0.0
beam_width: 4
max_decode_length: 80
length_penalty_weight: 1.0

# ==============================================================================
# train.py / Argparse init.
distribute: "false"
epoch_size: 52
device_id: 0
device_num: 1
enable_lossscale: "true"
do_shuffle: "true"
enable_save_ckpt: "true"
save_checkpoint_steps: 2500
save_checkpoint_num: 30
save_checkpoint_path: "./"
bucket_boundaries: [16, 32, 48, 64, 128]
accumulation_steps: 1

# export.py / eval_config - transformer export
file_name: "transformer"
file_format: 'AIR'

# postprocess / from eval_config
result_dir: "./result_Files"

# preprocess / from eval_config
result_path: "./preprocess_Result/"

# src/process_output.py "rescore nbest with smoothed sentence-level bleu."
vocab_file: ""

# create_data.py
input_file: ''
num_splits: 16
clip_to_max_len: False
max_seq_length: 128
bucket: [16, 32, 48, 64, 128]

---
# Config description for each option
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
data_path: 'Dataset path for local, it is better to use absolute path'
output_path: 'Training output path for local'
ann_file: 'Ann file, default is val.json.'

device_target: "device where the code will be implemented, default is Ascend"
checkpoint_path: "Checkpoint file path"
data_file: '/your/path/evaluation.mindrecord'
model_file: '/your/path/checkpoint_file'
output_file: './output_eval.txt'

distribute: "Run distribute, default is false."
epoch_size: "Epoch size, default is 52."
device_id: "Device id, default is 0."
device_num: "Use device nums, default is 1."
enable_lossscale: "Use lossscale or not, default is true."
do_shuffle: "Enable shuffle for dataset, default is true."
enable_save_ckpt: "Enable save checkpoint, default is true."
save_checkpoint_steps: "Save checkpoint steps, default is 2500."
save_checkpoint_num: "Save checkpoint numbers, default is 30."
save_checkpoint_path: "Save checkpoint file path"
bucket_boundaries: "sequence length for different bucket"
accumulation_steps: "Gradient accumulation steps, default is 1."

file_name: "output file name."
file_format: 'file format'
result_dir: "./result_Files"
result_path: "./preprocess_Result/"
vocab_file: "vocab file path."
input_file: 'Input raw text file (or comma-separated list of files).'
num_splits: 'The MindRecord file will be split into the number of partitions.'
clip_to_max_len: 'clip sequences to maximum sequence length.'
max_seq_length: 'Maximum sequence length.'
bucket: 'bucket sequence length'

---
device_target: ["Ascend", "GPU", "CPU"]
file_format: ["AIR", "ONNX", "MINDIR"]
distribute: ['true', 'false']
enable_lossscale: ['true', 'false']
do_shuffle: ['true', 'false']
enable_save_ckpt: ['true', 'false']
@@ -14,7 +14,7 @@
# ============================================================================
"""Transformer evaluation script."""

import argparse
import os
import numpy as np

import mindspore.nn as nn

@@ -28,8 +28,15 @@ import mindspore.dataset.transforms.c_transforms as deC
from mindspore import context

from src.transformer_model import TransformerModel
from src.eval_config import cfg, transformer_net_cfg
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.device_adapter import get_device_id

config.dtype = mstype.float32
config.compute_type = mstype.float16
config.batch_size = config.batch_size_ev
config.hidden_dropout_prob = config.hidden_dropout_prob_ev
config.attention_probs_dropout_prob = config.attention_probs_dropout_prob_ev

def load_test_data(batch_size=1, data_file=None):
    """

@@ -57,7 +64,6 @@ class TransformerInferCell(nn.Cell):
    """
    Encapsulation class of transformer network infer.
    """

    def __init__(self, network):
        super(TransformerInferCell, self).__init__(auto_prefix=False)
        self.network = network

@@ -98,23 +104,22 @@ def load_weights(model_path):
    return parameter_dict


def modelarts_pre_process():
    config.output_file = os.path.join(config.output_path, config.output_file)
    config.data_file = os.path.join(config.data_file, 'ende-l128-mindrecord_128_00')

@moxing_wrapper(pre_process=modelarts_pre_process)
def run_transformer_eval():
    """
    Transformer evaluation.
    """
    parser = argparse.ArgumentParser(description='transformer')
    parser.add_argument("--device_target", type=str, default="Ascend",
                        help="device where the code will be implemented, default is Ascend")
    parser.add_argument('--device_id', type=int, default=0, help='device id of GPU or Ascend, default is 0')
    args = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, reserve_class_name_in_scope=False,
                        device_id=get_device_id())

    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, reserve_class_name_in_scope=False,
                        device_id=args.device_id)
    dataset = load_test_data(batch_size=config.batch_size, data_file=config.data_file)
    tfm_model = TransformerModel(config=config, is_training=False, use_one_hot_embeddings=False)

    dataset = load_test_data(batch_size=transformer_net_cfg.batch_size, data_file=cfg.data_file)
    tfm_model = TransformerModel(config=transformer_net_cfg, is_training=False, use_one_hot_embeddings=False)

    parameter_dict = load_weights(cfg.model_file)
    parameter_dict = load_weights(config.model_file)
    load_param_into_net(tfm_model, parameter_dict)

    tfm_infer = TransformerInferCell(tfm_model)

@@ -132,9 +137,9 @@ def run_transformer_eval():
        predictions.append(predicted_ids.asnumpy())

    # decode and write to file
    f = open(cfg.output_file, 'w')
    f = open(config.output_file, 'w')
    for batch_out in predictions:
        for i in range(transformer_net_cfg.batch_size):
        for i in range(config.batch_size):
            if batch_out.ndim == 3:
                batch_out = batch_out[:, 0]
            token_ids = [str(x) for x in batch_out[i].tolist()]
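A note on the `batch_out.ndim == 3` branch above: beam-search output arrives shaped (batch, beam_width, length), and the code keeps only the top beam. A minimal NumPy illustration with toy shapes (not from this commit):

    import numpy as np

    # Toy beam-search output: batch of 2, beam width 4, decode length 5.
    batch_out = np.arange(2 * 4 * 5).reshape(2, 4, 5)

    if batch_out.ndim == 3:          # (batch, beam, length)
        batch_out = batch_out[:, 0]  # keep the highest-scoring beam per sample

    print(batch_out.shape)  # (2, 5): one token-id sequence per sample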
@@ -14,35 +14,41 @@
# ============================================================================
""" export checkpoint file into models"""

import argparse
import numpy as np

from mindspore import Tensor, context
from mindspore.train.serialization import load_param_into_net, export

from src.transformer_model import TransformerModel
from src.eval_config import cfg, transformer_net_cfg
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.device_adapter import get_device_id
from eval import load_weights

parser = argparse.ArgumentParser(description='transformer export')
parser.add_argument("--device_id", type=int, default=0, help="Device id")
parser.add_argument("--file_name", type=str, default="transformer", help="output file name.")
parser.add_argument('--file_format', type=str, choices=["AIR", "ONNX", "MINDIR"], default='AIR', help='file format')
parser.add_argument("--device_target", type=str, default="Ascend",
                    choices=["Ascend", "GPU", "CPU"], help="device target (default: Ascend)")
args = parser.parse_args()

context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
if args.device_target == "Ascend":
    context.set_context(device_id=args.device_id)
config.batch_size = config.batch_size_ev
config.hidden_dropout_prob = config.hidden_dropout_prob_ev
config.attention_probs_dropout_prob = config.attention_probs_dropout_prob_ev

if __name__ == '__main__':
    tfm_model = TransformerModel(config=transformer_net_cfg, is_training=False, use_one_hot_embeddings=False)
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
if config.device_target == "Ascend":
    context.set_context(device_id=get_device_id())

    parameter_dict = load_weights(cfg.model_file)
def modelarts_pre_process():
    pass

@moxing_wrapper(pre_process=modelarts_pre_process)
def export_transformer():
    """ export_transformer """
    tfm_model = TransformerModel(config=config, is_training=False, use_one_hot_embeddings=False)

    parameter_dict = load_weights(config.model_file)
    load_param_into_net(tfm_model, parameter_dict)

    source_ids = Tensor(np.ones((transformer_net_cfg.batch_size, transformer_net_cfg.seq_length)).astype(np.int32))
    source_mask = Tensor(np.ones((transformer_net_cfg.batch_size, transformer_net_cfg.seq_length)).astype(np.int32))
    source_ids = Tensor(np.ones((config.batch_size, config.seq_length)).astype(np.int32))
    source_mask = Tensor(np.ones((config.batch_size, config.seq_length)).astype(np.int32))

    export(tfm_model, source_ids, source_mask, file_name=args.file_name, file_format=args.file_format)
    export(tfm_model, source_ids, source_mask, file_name=config.file_name, file_format=config.file_format)

if __name__ == '__main__':
    export_transformer()
@@ -15,31 +15,25 @@
"""Transformer evaluation script."""

import os
import argparse
import numpy as np
from src.model_utils.config import config

from src.eval_config import cfg, transformer_net_cfg

parser = argparse.ArgumentParser(description='postprocess')
parser.add_argument("--result_dir", type=str, default="./result_Files",
                    help="infer result path.")
args = parser.parse_args()

def generate_output():
    '''
    Generate output.
    '''
    predictions = []
    file_num = len(os.listdir(args.result_dir))
    file_num = len(os.listdir(config.result_dir))
    for i in range(file_num):
        batch = "transformer_bs_" + str(transformer_net_cfg.batch_size) + "_" + str(i) + "_0.bin"
        pred = np.fromfile(os.path.join(args.result_dir, batch), np.int32)
        predictions.append(pred.reshape(1, 1, transformer_net_cfg.max_decode_length + 1))
        batch = "transformer_bs_" + str(config.batch_size) + "_" + str(i) + "_0.bin"
        pred = np.fromfile(os.path.join(config.result_dir, batch), np.int32)
        predictions.append(pred.reshape(1, 1, config.max_decode_length + 1))

    # decode and write to file
    f = open(cfg.output_file, 'w')
    f = open(config.output_file, 'w')
    for batch_out in predictions:
        for i in range(transformer_net_cfg.batch_size):
        for i in range(config.batch_size):
            if batch_out.ndim == 3:
                batch_out = batch_out[:, 0]
            token_ids = [str(x) for x in batch_out[i].tolist()]
@@ -15,23 +15,16 @@
"""Transformer evaluation script."""

import os
import argparse

from src.eval_config import cfg, transformer_net_cfg
from src.model_utils.config import config
from eval import load_test_data

parser = argparse.ArgumentParser(description='preprocess')
parser.add_argument("--result_path", type=str, default="./preprocess_Result/",
                    help="preprocess result path.")
args = parser.parse_args()


def generate_bin():
    '''
    Generate bin files.
    '''
    dataset = load_test_data(batch_size=transformer_net_cfg.batch_size, data_file=cfg.data_file)
    cur_dir = args.result_path
    dataset = load_test_data(batch_size=config.batch_size, data_file=config.data_file)
    cur_dir = config.result_path

    source_eos_ids_path = os.path.join(cur_dir, "00_source_eos_ids")
    source_eos_mask_path = os.path.join(cur_dir, "01_source_eos_mask")

@@ -41,7 +34,7 @@ def generate_bin():
    if not os.path.isdir(source_eos_mask_path):
        os.makedirs(source_eos_mask_path)

    batch_size = transformer_net_cfg.batch_size
    batch_size = config.batch_size

    for i, data in enumerate(dataset.create_dict_iterator(output_numpy=True, num_epochs=1)):
        file_name = "transformer_bs_" + str(batch_size) + "_" + str(i) + ".bin"
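The preprocess/postprocess pair above communicates through flat .bin dumps, so the array shape must be re-imposed on read. A self-contained round-trip illustration (toy sizes; the naming pattern follows the scripts):

    import numpy as np

    batch_size, seq_len = 1, 128
    ids = np.random.randint(0, 36560, size=(batch_size, seq_len)).astype(np.int32)

    # preprocess side: flat binary dump, named like the scripts expect
    file_name = "transformer_bs_{}_{}.bin".format(batch_size, 0)
    ids.tofile(file_name)

    # postprocess side: np.fromfile loses the shape, so it must be restored
    back = np.fromfile(file_name, np.int32).reshape(batch_size, seq_len)
    assert (back == ids).all()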
@@ -28,7 +28,7 @@ eval_output=$2
vocab_file=$3

cat $eval_output \
| python src/process_output.py --vocab_file $vocab_file \
| python src/process_output.py --config_path="./default_config_large.yaml" --vocab_file $vocab_file \
| sed 's/@@ //g' > ${eval_output}.processed

perl -ple 's/(\S)-(\S)/$1 #@#-#@# $2/g' < $ref_data | perl ${BASEDIR}/replace-quote.perl > ${ref_data}.forbleu
@@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 4 ] ; then
if [ $# != 5 ] ; then
echo "=============================================================================================================="
echo "Please run the script as: "
echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE"
echo "for example: sh run_distribute_pretrain.sh 8 52 /path/ende-l128-mindrecord00 /path/hccl.json"
echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE CONFIG_PATH"
echo "for example: sh run_distribute_pretrain.sh 8 52 /path/ende-l128-mindrecord00 /path/hccl.json ./default_config_large.yaml"
echo "It is better to use absolute path."
echo "=============================================================================================================="
exit 1;

@@ -32,6 +32,7 @@ DATA_PATH=$3

export HCCL_CONNECT_TIMEOUT=600
export RANK_TABLE_FILE=$4
export CONFIG_PATH=$5
export RANK_SIZE=$1
export HCCL_FLAG=1
export DEPLOY_MODE=0

@@ -43,11 +44,12 @@ do
    export GE_USE_STATIC_MEMORY=1

    mkdir helper$i
    cp -rf ../src/ ../train.py ./helper$i
    cp -rf ../src/ ../train.py ../*.yaml ./helper$i
    cd ./helper$i || exit
    echo "start training for rank $i, device $DEVICE_ID"
    env > env.log
    python train.py \
    --config_path=$CONFIG_PATH \
    --distribute="true" \
    --epoch_size=$EPOCH_SIZE \
    --device_id=$DEVICE_ID \

@@ -58,8 +60,7 @@ do
    --checkpoint_path="" \
    --save_checkpoint_steps=2500 \
    --save_checkpoint_num=30 \
    --data_path=$DATA_PATH \
    --bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 &
    --data_path=$DATA_PATH > log.txt 2>&1 &
    cd ../
done
cd ..
@@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 5 ] ; then
if [ $# != 6 ] ; then
echo "=============================================================================================================="
echo "Please run the script as: "
echo "sh run_distribute_train_ascend_multi_machines.sh DEVICE_NUM SERVER_ID EPOCH_SIZE DATA_PATH RANK_TABLE_FILE"
echo "for example: sh run_distribute_train_ascend_multi_machines.sh 32 0 52 /path/ende-l128-mindrecord00 /path/hccl.json"
echo "sh run_distribute_train_ascend_multi_machines.sh DEVICE_NUM SERVER_ID EPOCH_SIZE DATA_PATH RANK_TABLE_FILE CONFIG_PATH"
echo "for example: sh run_distribute_train_ascend_multi_machines.sh 32 0 52 /path/ende-l128-mindrecord00 /path/hccl.json ./default_config_large.yaml"
echo "It is better to use absolute path."
echo "=============================================================================================================="
exit 1;

@@ -32,6 +32,7 @@ DATA_PATH=$4

export HCCL_CONNECT_TIMEOUT=600
export RANK_TABLE_FILE=$5
export CONFIG_PATH=$6
export RANK_SIZE=$1
export SERVER_ID=$2
export DEVICE_NUM=8

@@ -47,11 +48,12 @@ do
    export GE_USE_STATIC_MEMORY=1

    mkdir helper$i
    cp -rf ../src/ ../train.py ./helper$i
    cp -rf ../src/ ../train.py ../*.yaml ./helper$i
    cd ./helper$i || exit
    echo "start training for rank $i, device $DEVICE_ID"
    env > env.log
    python train.py \
    --config_path=$CONFIG_PATH \
    --distribute="true" \
    --epoch_size=$EPOCH_SIZE \
    --device_id=$DEVICE_ID \
@@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 3 ] ; then
if [ $# != 4 ] ; then
echo "=============================================================================================================="
echo "Please run the script as: "
echo "sh run_distribute_train_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_PATH"
echo "for example: sh run_distribute_train_gpu.sh 8 55 /path/ende-l128-mindrecord00"
echo "sh run_distribute_train_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_PATH CONFIG_PATH"
echo "for example: sh run_distribute_train_gpu.sh 8 55 /path/ende-l128-mindrecord00 ./default_config_large_gpu.yaml"
echo "It is better to use absolute path."
echo "=============================================================================================================="
exit 1;

@@ -25,16 +25,18 @@ fi

rm -rf run_distribute_train
mkdir run_distribute_train
cp -rf ./src/ train.py ./run_distribute_train
cp -rf ./src/ train.py ./*.yaml ./run_distribute_train
cd run_distribute_train || exit

export RANK_SIZE=$1
export CONFIG_PATH=$4
EPOCH_SIZE=$2
DATA_PATH=$3
echo $RANK_SIZE

mpirun -n $RANK_SIZE \
    python train.py \
    --config_path=$CONFIG_PATH \
    --distribute="true" \
    --device_target="GPU" \
    --epoch_size=$EPOCH_SIZE \
@@ -13,19 +13,36 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 2 ] ; then
if [ $# != 5 ] ; then
echo "=============================================================================================================="
echo "Please run the script as: "
echo "sh run_eval.sh DEVICE_TARGET DEVICE_ID"
echo "for example: sh run_eval.sh Ascend 0"
echo "Note: set the checkpoint and dataset path in src/eval_config.py"
echo "sh run_eval.sh DEVICE_TARGET DEVICE_ID MINDRECORD_DATA CKPT_PATH CONFIG_PATH"
echo "for example: sh run_eval.sh Ascend 0 /your/path/evaluation.mindrecord /your/path/checkpoint_file ./default_config_large_gpu.yaml"
echo "Note: set the checkpoint and dataset path in default_config.yaml"
echo "=============================================================================================================="
exit 1;
fi

export DEVICE_TARGET=$1
export CONFIG_PATH=$5
DEVICE_ID=$2

get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m $PWD/$1)"
    fi
}

PATH3=$(get_real_path $3)
PATH4=$(get_real_path $4)
echo $PATH3
echo $PATH4

python eval.py \
    --config_path=$CONFIG_PATH \
    --device_target=$DEVICE_TARGET \
    --device_id=$DEVICE_ID \
    --data_file=$PATH3 \
    --model_file=$PATH4 > log_eval.txt 2>&1 &
@@ -66,7 +66,7 @@ function preprocess_data()
        rm -rf ./preprocess_Result
    fi
    mkdir preprocess_Result
    python3.7 ../preprocess.py --result_path=./preprocess_Result/
    python3.7 ../preprocess.py --config_path="./default_config_large.yaml" --result_path=./preprocess_Result/
}

function compile_app()

@@ -93,7 +93,7 @@ function infer()

function cal_acc()
{
    python3.7 ../postprocess.py --result_dir=./result_Files &> acc.log
    python3.7 ../postprocess.py --config_path="./default_config_large.yaml" --result_dir=./result_Files &> acc.log
}

if [ $need_preprocess == "y" ]; then
@@ -25,7 +25,7 @@ fi

rm -rf run_standalone_train
mkdir run_standalone_train
cp -rf ./src/ train.py ./run_standalone_train
cp -rf ./src/ train.py ./*.yaml ./run_standalone_train
cd run_standalone_train || exit

export DEVICE_TARGET=$1

@@ -36,6 +36,7 @@ DATA_PATH=$5

if [ $DEVICE_TARGET == 'Ascend' ];then
    python train.py \
    --config_path="./default_config_large.yaml" \
    --distribute="false" \
    --epoch_size=$EPOCH_SIZE \
    --accumulation_steps=$GRADIENT_ACCUMULATE_STEP \

@@ -53,6 +54,7 @@ elif [ $DEVICE_TARGET == 'GPU' ];then
    export CUDA_VISIBLE_DEVICES="$2"

    python train.py \
    --config_path="./default_config_large_gpu.yaml" \
    --distribute="false" \
    --epoch_size=$EPOCH_SIZE \
    --device_target=$DEVICE_TARGET \
@@ -17,7 +17,8 @@
import mindspore.common.dtype as mstype
import mindspore.dataset as de
import mindspore.dataset.transforms.c_transforms as deC
from .config import transformer_net_cfg, transformer_net_cfg_gpu
from .model_utils.config import config

de.config.set_seed(1)
def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle="true", dataset_path=None,
                               bucket_boundaries=None, device_target="Ascend"):

@@ -38,11 +39,8 @@ def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle
    ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask")

    # apply batch operations
    if device_target == "Ascend":
        ds = ds.batch(transformer_net_cfg.batch_size, drop_remainder=True)
    else:
        ds = ds.batch(transformer_net_cfg_gpu.batch_size, drop_remainder=True)

    ds = ds.batch(config.batch_size, drop_remainder=True)
    ds = ds.repeat(epoch_count)
    return ds
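The change above drops the per-backend branch: the batch size now always comes from the merged config. For intuition, batching and repeating compose like this (toy in-memory dataset, same `mindspore.dataset` API):

    import numpy as np
    import mindspore.dataset as de

    data = {"source_eos_ids": np.arange(8).reshape(8, 1).astype(np.int64)}
    ds = de.NumpySlicesDataset(data, shuffle=False)
    ds = ds.batch(4, drop_remainder=True)  # same call for Ascend and GPU now
    ds = ds.repeat(2)
    print(ds.get_dataset_size())  # 4 batches: (8 / 4) per epoch * 2 repeats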
@@ -0,0 +1,127 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Parse arguments"""

import os
import ast
import argparse
from pprint import pprint, pformat
import yaml

class Config:
    """
    Configuration namespace. Convert dictionary to members.
    """
    def __init__(self, cfg_dict):
        for k, v in cfg_dict.items():
            if isinstance(v, (list, tuple)):
                setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v])
            else:
                setattr(self, k, Config(v) if isinstance(v, dict) else v)

    def __str__(self):
        return pformat(self.__dict__)

    def __repr__(self):
        return self.__str__()


def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
    """
    Parse command line arguments to the configuration according to the default yaml.

    Args:
        parser: Parent parser.
        cfg: Base configuration.
        helper: Helper description.
        cfg_path: Path to the default yaml config.
    """
    parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                     parents=[parser])
    helper = {} if helper is None else helper
    choices = {} if choices is None else choices
    for item in cfg:
        if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict):
            help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path)
            choice = choices[item] if item in choices else None
            if isinstance(cfg[item], bool):
                parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice,
                                    help=help_description)
            else:
                parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice,
                                    help=help_description)
    args = parser.parse_args()
    return args


def parse_yaml(yaml_path):
    """
    Parse the yaml config file.

    Args:
        yaml_path: Path to the yaml config.
    """
    with open(yaml_path, 'r') as fin:
        try:
            cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader)
            cfgs = [x for x in cfgs]
            if len(cfgs) == 1:
                cfg_helper = {}
                cfg = cfgs[0]
                cfg_choices = {}
            elif len(cfgs) == 2:
                cfg, cfg_helper = cfgs
                cfg_choices = {}
            elif len(cfgs) == 3:
                cfg, cfg_helper, cfg_choices = cfgs
            else:
                raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")
            print(cfg_helper)
        except:
            raise ValueError("Failed to parse yaml")
    return cfg, cfg_helper, cfg_choices


def merge(args, cfg):
    """
    Merge the base config from yaml file and command line arguments.

    Args:
        args: Command line arguments.
        cfg: Base configuration.
    """
    args_var = vars(args)
    for item in args_var:
        cfg[item] = args_var[item]
    return cfg


def get_config():
    """
    Get Config according to the yaml file and cli arguments.
    """
    parser = argparse.ArgumentParser(description="default name", add_help=False)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../../default_config.yaml"),
                        help="Config file path")
    path_args, _ = parser.parse_known_args()
    default, helper, choices = parse_yaml(path_args.config_path)
    pprint(default)
    args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path)
    final_config = merge(args, default)
    return Config(final_config)

config = get_config()
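To illustrate how this module behaves end to end, here is a hypothetical invocation (the override values are illustrative, not from this commit):

    # Hypothetical usage sketch of src/model_utils/config.py, assuming a
    # default_config_large.yaml with the three documents shown earlier.
    #
    #   $ python train.py --config_path=./default_config_large.yaml --batch_size=64
    #
    # get_config() first parses --config_path, loads the three YAML documents
    # (values, help strings, choices), then re-registers every scalar key as a
    # CLI flag so the command line can override the YAML defaults:

    from src.model_utils.config import config

    print(config.batch_size)     # 64: the CLI value wins over the YAML default of 96
    print(config.device_target)  # "Ascend" unless overridden; constrained by the
                                 # third YAML document to ["Ascend", "GPU", "CPU"]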
@@ -0,0 +1,27 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Device adapter for ModelArts"""

from .config import config

if config.enable_modelarts:
    from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
    from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id

__all__ = [
    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]
@@ -0,0 +1,36 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Local adapter"""

import os

def get_device_id():
    device_id = os.getenv('DEVICE_ID', '0')
    return int(device_id)


def get_device_num():
    device_num = os.getenv('RANK_SIZE', '1')
    return int(device_num)


def get_rank_id():
    global_rank_id = os.getenv('RANK_ID', '0')
    return int(global_rank_id)


def get_job_id():
    return "Local Job"
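These helpers make rank information uniform across launchers; the distributed shell scripts above export DEVICE_ID and RANK_SIZE before train.py starts. A quick illustration with toy values:

    import os

    # Simulate the environment the distributed launch scripts export.
    os.environ["DEVICE_ID"] = "3"
    os.environ["RANK_SIZE"] = "8"

    from src.model_utils.local_adapter import get_device_id, get_device_num

    print(get_device_id())   # 3
    print(get_device_num())  # 8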
@@ -0,0 +1,122 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Moxing adapter for ModelArts"""

import os
import functools
from mindspore import context
from mindspore.profiler import Profiler
from .config import config

_global_sync_count = 0

def get_device_id():
    device_id = os.getenv('DEVICE_ID', '0')
    return int(device_id)


def get_device_num():
    device_num = os.getenv('RANK_SIZE', '1')
    return int(device_num)


def get_rank_id():
    global_rank_id = os.getenv('RANK_ID', '0')
    return int(global_rank_id)


def get_job_id():
    job_id = os.getenv('JOB_ID')
    job_id = job_id if job_id != "" else "default"
    return job_id

def sync_data(from_path, to_path):
    """
    Download data from remote obs to local directory if the first url is remote url and the second one is local path
    Upload data from local directory to remote obs in contrast.
    """
    import moxing as mox
    import time
    global _global_sync_count
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices at most.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            os.mknod(sync_lock)
        except IOError:
            pass
        print("===save flag===")

    while True:
        if os.path.exists(sync_lock):
            break
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))


def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                if not os.path.exists(config.output_path):
                    os.makedirs(config.output_path)

                if pre_process:
                    pre_process()

            if config.enable_profiling:
                profiler = Profiler()

            run_func(*args, **kwargs)

            if config.enable_profiling:
                profiler.analyse()

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
        return wrapped_func
    return wrapper
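A sketch of how an entry point consumes this decorator (hypothetical function names; config keys as in the YAML files above). Run locally, the wrapper only handles optional profiling; with enable_modelarts set, it syncs OBS data in before the run, calls pre_process, and syncs outputs back afterwards:

    from src.model_utils.config import config
    from src.model_utils.moxing_adapter import moxing_wrapper

    def prepare_paths():
        # only reached on ModelArts, after the dataset has been pulled to /cache
        print("data lives in", config.data_path)

    @moxing_wrapper(pre_process=prepare_paths)
    def main():
        print("training with batch_size =", config.batch_size)

    if __name__ == "__main__":
        main()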
@@ -18,22 +18,18 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

import sys
import tokenization
from .model_utils.config import config

# Explicitly set the encoding
sys.stdin = open(sys.stdin.fileno(), mode='r', encoding='utf-8', buffering=True)
sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=True)

def main():
    parser = argparse.ArgumentParser(
        description="rescore nbest with smoothed sentence-level bleu.")
    parser.add_argument("--vocab_file", type=str, default="", required=True, help="vocab file path.")
    args = parser.parse_args()

    tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file)
    tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=config.vocab_file)

    for line in sys.stdin:
        token_ids = [int(x) for x in line.strip().split()]
@ -16,9 +16,8 @@
|
|||
|
||||
import os
|
||||
import time
|
||||
import argparse
|
||||
import ast
|
||||
|
||||
from easydict import EasyDict as edict
|
||||
import mindspore.common.dtype as mstype
|
||||
from mindspore.common.tensor import Tensor
|
||||
from mindspore.nn.optim import Adam
|
||||
|
@ -36,21 +35,30 @@ from mindspore.common import set_seed
|
|||
from src.transformer_for_train import TransformerTrainOneStepCell, TransformerNetworkWithLoss, \
|
||||
TransformerTrainOneStepWithLossScaleCell, \
|
||||
TransformerTrainAccumulationAllReducePostWithLossScaleCell
|
||||
from src.config import cfg, transformer_net_cfg, transformer_net_cfg_gpu
|
||||
from src.dataset import create_transformer_dataset
|
||||
from src.lr_schedule import create_dynamic_lr
|
||||
from src.model_utils.config import config
|
||||
from src.model_utils.moxing_adapter import moxing_wrapper
|
||||
from src.model_utils.device_adapter import get_device_id
|
||||
|
||||
|
||||
set_seed(1)
|
||||
|
||||
|
||||
def get_ms_timestamp():
|
||||
t = time.time()
|
||||
return int(round(t * 1000))
|
||||
|
||||
|
||||
time_stamp_init = False
|
||||
time_stamp_first = 0
|
||||
|
||||
config.dtype = mstype.float32
|
||||
config.compute_type = mstype.float16
|
||||
config.lr_schedule = edict({
|
||||
'learning_rate': 2.0,
|
||||
'warmup_steps': 8000,
|
||||
'start_decay_step': 16000,
|
||||
'min_lr': 0.0,
|
||||
})
|
||||
|
||||
class LossCallBack(Callback):
|
||||
"""
|
||||
|
@ -82,7 +90,12 @@ class LossCallBack(Callback):
|
|||
cb_params.cur_epoch_num,
|
||||
cb_params.cur_step_num,
|
||||
str(cb_params.net_outputs)))
|
||||
with open("./loss_{}.log".format(self.rank_id), "a+") as f:
|
||||
|
||||
loss_file = "./loss_{}.log"
|
||||
if config.enable_modelarts:
|
||||
loss_file = "/cache/train/loss_{}.log"
|
||||
|
||||
with open(loss_file.format(self.rank_id), "a+") as f:
|
||||
f.write("time: {}, epoch: {}, step: {}, loss: {}, overflow: {}, loss_scale: {}".format(
|
||||
time_stamp_current - time_stamp_first,
|
||||
cb_params.cur_epoch_num,
|
||||
|
@@ -93,121 +106,90 @@ class LossCallBack(Callback):
            f.write('\n')


def argparse_init():
    """
    Argparse init.
    """
    parser = argparse.ArgumentParser(description='transformer')
    parser.add_argument("--distribute", type=str, default="false", choices=['true', 'false'],
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size", type=int, default=52, help="Epoch size, default is 52.")
    parser.add_argument("--device_target", type=str, default="Ascend",
                        help="device where the code will be implemented, default is Ascend")
    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
    parser.add_argument("--enable_lossscale", type=str, default="true", choices=['true', 'false'],
                        help="Use lossscale or not, default is true.")
    parser.add_argument("--do_shuffle", type=str, default="true", choices=['true', 'false'],
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--checkpoint_path", type=str, default="", help="Checkpoint file path")
    parser.add_argument("--enable_save_ckpt", type=str, default="true", choices=['true', 'false'],
                        help="Enable save checkpoint, default is true.")
    parser.add_argument("--save_checkpoint_steps", type=int, default=2500, help="Save checkpoint steps, "
                                                                                "default is 2500.")
    parser.add_argument("--save_checkpoint_num", type=int, default=30, help="Save checkpoint numbers, default is 30.")
    parser.add_argument("--save_checkpoint_path", type=str, default="./", help="Save checkpoint file path")
    parser.add_argument("--data_path", type=str, default="", help="Data path, it is better to use absolute path")
    parser.add_argument("--bucket_boundaries", type=ast.literal_eval, default=[16, 32, 48, 64, 128],
                        help="sequence length for different bucket")
    parser.add_argument("--accumulation_steps", type=int, default=1, help="Gradient accumulation steps, default is 1.")

    return parser
def modelarts_pre_process():
    config.save_checkpoint_path = config.output_path
    config.data_path = os.path.join(config.data_path, 'ende-l128-mindrecord')


@moxing_wrapper(pre_process=modelarts_pre_process)
def run_transformer_train():
    """
    Transformer training.
    """
    parser = argparse_init()
    args, _ = parser.parse_known_args()
    if args.device_target == "Ascend":
        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.device_id)
    if config.device_target == "Ascend":
        context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id())
    else:
        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
        context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
    context.set_context(reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)

    if args.device_target == "GPU":
    if config.device_target == "GPU":
        # Enable graph kernel
        context.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion")
    if args.distribute == "true":
        if args.device_target == "Ascend":
            device_num = args.device_num
    if config.distribute == "true":
        if config.device_target == "Ascend":
            device_num = config.device_num
            D.init('hccl')
        else:
            D.init('nccl')
            device_num = D.get_group_size()
            rank = get_rank()
            args.device_id = rank
            config.device_id = rank
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                          device_num=device_num)
        rank_id = args.device_id % device_num
        save_ckpt_path = os.path.join(args.save_checkpoint_path, 'ckpt_' + str(get_rank()) + '/')
        rank_id = config.device_id % device_num
        save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(get_rank()) + '/')
    else:
        device_num = 1
        rank_id = 0
        save_ckpt_path = os.path.join(args.save_checkpoint_path, 'ckpt_0/')
        save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_0/')
    dataset = create_transformer_dataset(epoch_count=1, rank_size=device_num,
                                         rank_id=rank_id, do_shuffle=args.do_shuffle,
                                         dataset_path=args.data_path,
                                         bucket_boundaries=args.bucket_boundaries,
                                         device_target=args.device_target)
    if args.device_target == "Ascend":
        netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)
    else:
        netwithloss = TransformerNetworkWithLoss(transformer_net_cfg_gpu, True)
                                         rank_id=rank_id, do_shuffle=config.do_shuffle,
                                         dataset_path=config.data_path,
                                         bucket_boundaries=config.bucket_boundaries,
                                         device_target=config.device_target)

    if args.checkpoint_path:
        parameter_dict = load_checkpoint(args.checkpoint_path)
    netwithloss = TransformerNetworkWithLoss(config, True)

    if config.checkpoint_path:
        parameter_dict = load_checkpoint(config.checkpoint_path)
        load_param_into_net(netwithloss, parameter_dict)

    hidden_size = transformer_net_cfg.hidden_size if args.device_target == "Ascend" \
        else transformer_net_cfg_gpu.hidden_size
    learning_rate = cfg.lr_schedule.learning_rate if args.device_target == "Ascend" \
        else 1.0
    hidden_size = config.hidden_size
    learning_rate = config.lr_schedule.learning_rate if config.device_target == "Ascend" else 1.0
    lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
                                  training_steps=dataset.get_dataset_size()*args.epoch_size,
                                  training_steps=dataset.get_dataset_size()*config.epoch_size,
                                  learning_rate=learning_rate,
                                  warmup_steps=cfg.lr_schedule.warmup_steps,
                                  warmup_steps=config.lr_schedule.warmup_steps,
                                  hidden_size=hidden_size,
                                  start_decay_step=cfg.lr_schedule.start_decay_step,
                                  min_lr=cfg.lr_schedule.min_lr), mstype.float32)
                                  start_decay_step=config.lr_schedule.start_decay_step,
                                  min_lr=config.lr_schedule.min_lr), mstype.float32)
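    # The schedule string is a product of per-step factors; with warmup steps w,
    # hidden size h and step s it evaluates to roughly
    #   lr(s) = learning_rate * h**-0.5 * min(1.0, s / w) * max(s, w)**-0.5,
    # i.e. a linear warmup followed by inverse-sqrt decay (see src/lr_schedule.py).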

    if args.device_target == "GPU" and cfg.transformer_network == "large":
        optimizer = Adam(netwithloss.trainable_params(), lr, beta2=cfg.optimizer_adam_beta2)
    if config.device_target == "GPU" and config.transformer_network == "large":
        optimizer = Adam(netwithloss.trainable_params(), lr, beta2=config.optimizer_adam_beta2)
    else:
        optimizer = Adam(netwithloss.trainable_params(), lr)

    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(rank_id=rank_id)]
    if args.enable_save_ckpt == "true":
    if config.enable_save_ckpt == "true":
        if device_num == 1 or (device_num > 1 and rank_id == 0):
            if args.device_target == "Ascend":
                ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
                                               keep_checkpoint_max=args.save_checkpoint_num)
            if config.device_target == "Ascend":
                ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps,
                                               keep_checkpoint_max=config.save_checkpoint_num)
            else:
                ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset.get_dataset_size(),
                                               keep_checkpoint_max=args.save_checkpoint_num)
                                               keep_checkpoint_max=config.save_checkpoint_num)
            ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=save_ckpt_path, config=ckpt_config)
            callbacks.append(ckpoint_cb)

    if args.enable_lossscale == "true":
        scale_manager = DynamicLossScaleManager(init_loss_scale=cfg.init_loss_scale_value,
                                                scale_factor=cfg.scale_factor,
                                                scale_window=cfg.scale_window)
    if config.enable_lossscale == "true":
        scale_manager = DynamicLossScaleManager(init_loss_scale=config.init_loss_scale_value,
                                                scale_factor=config.scale_factor,
                                                scale_window=config.scale_window)
        update_cell = scale_manager.get_update_cell()
        if args.accumulation_steps > 1:
        if config.accumulation_steps > 1:
            netwithgrads = TransformerTrainAccumulationAllReducePostWithLossScaleCell(netwithloss, optimizer,
                                                                                      update_cell,
                                                                                      args.accumulation_steps)
                                                                                      config.accumulation_steps)
        else:
            netwithgrads = TransformerTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                                    scale_update_cell=update_cell)

@@ -217,7 +199,7 @@ def run_transformer_train():
    netwithgrads.set_train(True)
    model = Model(netwithgrads)

    model.train(args.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=False)
    model.train(config.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=False)


if __name__ == '__main__':
@@ -0,0 +1,281 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Transformer beam search module."""

import numpy as np
import mindspore.common.dtype as mstype
import mindspore.nn as nn
from mindspore.ops import operations as P
from mindspore.common.tensor import Tensor

INF = 1. * 1e9

class LengthPenalty(nn.Cell):
    """
    Normalize scores of translations according to their length.

    Args:
        weight (float): Weight of length penalty. Default: 1.0.
        compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
    """
    def __init__(self,
                 weight=1.0,
                 compute_type=mstype.float32):
        super(LengthPenalty, self).__init__()
        self.weight = weight
        self.add = P.Add()
        self.pow = P.Pow()
        self.div = P.RealDiv()
        self.cast = P.Cast()
        self.five = Tensor(5.0, mstype.float32)
        self.six = Tensor(6.0, mstype.float32)

    def construct(self, length_tensor):
        length_tensor = self.cast(length_tensor, mstype.float32)
        output = self.add(length_tensor, self.five)
        output = self.div(output, self.six)
        output = self.pow(output, self.weight)
        return output
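
# LengthPenalty implements the GNMT-style normalizer lp(l) = ((l + 5) / 6) ** weight.
# Final beam scores are divided by lp(l), so with weight 1.0 a length-7 hypothesis is
# normalized by (7 + 5) / 6 = 2.0 instead of by its raw length.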


class TileBeam(nn.Cell):
    """
    TileBeam.

    Args:
        beam_width (int): beam width setting. Default: 4.
        compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
    """
    def __init__(self,
                 beam_width,
                 compute_type=mstype.float32):
        super(TileBeam, self).__init__()
        self.beam_width = beam_width
        self.expand = P.ExpandDims()
        self.tile = P.Tile()
        self.reshape = P.Reshape()
        self.shape = P.Shape()

    def construct(self, input_tensor):
        """
        input_tensor: shape [batch, dim1, dim2]
        output_tensor: shape [batch*beam, dim1, dim2]
        """
        shape = self.shape(input_tensor)
        input_tensor = self.expand(input_tensor, 1)
        tile_shape = (1,) + (self.beam_width,)
        for _ in range(len(shape)-1):
            tile_shape = tile_shape + (1,)
        output = self.tile(input_tensor, tile_shape)
        out_shape = (shape[0]*self.beam_width,) + shape[1:]
        output = self.reshape(output, out_shape)
        return output
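
# Shape walk-through for TileBeam with beam_width=4 and input [batch, d1, d2]:
# expand -> [batch, 1, d1, d2]; tile by (1, 4, 1, 1) -> [batch, 4, d1, d2];
# reshape -> [batch*4, d1, d2], i.e. one copy of the encoder states per beam.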


class Mod(nn.Cell):
    """
    Mod function.

    Args:
        compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
    """
    def __init__(self,
                 compute_type=mstype.float32):
        super(Mod, self).__init__()
        self.compute_type = compute_type
        self.floor_div = P.FloorDiv()
        self.sub = P.Sub()
        self.multiply = P.Mul()

    def construct(self, input_x, input_y):
        x = self.floor_div(input_x, input_y)
        x = self.multiply(x, input_y)
        x = self.sub(input_x, x)
        return x
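
# Mod computes input_x - floor(input_x / input_y) * input_y, i.e. an elementwise
# modulo assembled from primitives that are available on all backends.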


class BeamSearchDecoder(nn.Cell):
    """
    Beam search decoder.

    Args:
        batch_size (int): Batch size of input dataset.
        seq_length (int): Length of input sequence.
        vocab_size (int): Size of vocabulary.
        decoder (:class:`TransformerDecoderStep`): Decoder module.
        beam_width (int): beam width setting. Default: 4.
        length_penalty_weight (float): Weight of length penalty. Default: 1.0.
        max_decode_length (int): max decode length. Default: 128.
        sos_id (int): Id of sequence start token. Default: 1.
        eos_id (int): Id of sequence end token. Default: 2.
        compute_type (:class:`mindspore.dtype`): Compute type in Transformer. Default: mstype.float32.
    """
    def __init__(self,
                 batch_size,
                 seq_length,
                 vocab_size,
                 decoder,
                 beam_width=4,
                 length_penalty_weight=1.0,
                 max_decode_length=128,
                 sos_id=1,
                 eos_id=2,
                 compute_type=mstype.float32):
        super(BeamSearchDecoder, self).__init__(auto_prefix=False)
        self.seq_length = seq_length
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.beam_width = beam_width
        self.length_penalty_weight = length_penalty_weight
        self.max_decode_length = max_decode_length
        self.decoder = decoder

        self.add = P.Add()
        self.expand = P.ExpandDims()
        self.reshape = P.Reshape()
        self.shape_flat = (-1,)
        self.shape = P.Shape()

        self.zero_tensor = Tensor(np.zeros([batch_size, beam_width]), mstype.float32)
        self.ninf_tensor = Tensor(np.full([batch_size, beam_width], -INF), mstype.float32)

        self.select = P.Select()
        self.flat_shape = (batch_size, beam_width * vocab_size)
        self.topk = P.TopK(sorted=True)
        self.floor_div = P.FloorDiv()
        self.vocab_size_tensor = Tensor(self.vocab_size, mstype.int32)
        self.real_div = P.RealDiv()
        self.mod = Mod()
        self.equal = P.Equal()
        self.eos_ids = Tensor(np.full([batch_size, beam_width], eos_id), mstype.int32)

        beam_ids = np.tile(np.arange(beam_width).reshape((1, beam_width)), [batch_size, 1])
        self.beam_ids = Tensor(beam_ids, mstype.int32)
        batch_ids = np.arange(batch_size*beam_width).reshape((batch_size, beam_width)) // beam_width
        self.batch_ids = Tensor(batch_ids, mstype.int32)
        self.concat = P.Concat(axis=-1)
        self.gather_nd = P.GatherNd()

        self.greater_equal = P.GreaterEqual()
        self.sub = P.Sub()
        self.cast = P.Cast()
        self.zeroslike = P.ZerosLike()

        # init inputs and states
        self.start_ids = Tensor(np.full([batch_size * beam_width, 1], sos_id), mstype.int32)
        self.init_seq = Tensor(np.full([batch_size, beam_width, 1], sos_id), mstype.int32)
        init_scores = np.tile(np.array([[0.] + [-INF]*(beam_width-1)]), [batch_size, 1])
        self.init_scores = Tensor(init_scores, mstype.float32)
        self.init_finished = Tensor(np.zeros([batch_size, beam_width], dtype=np.bool))
        self.init_length = Tensor(np.zeros([batch_size, beam_width], dtype=np.int32))
        self.length_penalty = LengthPenalty(weight=length_penalty_weight)
        self.one = Tensor(1, mstype.int32)

    def one_step(self, cur_input_ids, enc_states, enc_attention_mask, state_log_probs,
                 state_seq, state_finished, state_length):
        """
        One step for decode
        """
        log_probs = self.decoder(cur_input_ids, enc_states, enc_attention_mask, self.seq_length)
        log_probs = self.reshape(log_probs, (self.batch_size, self.beam_width, self.vocab_size))

        # select topk indices
        total_log_probs = self.add(log_probs, self.expand(state_log_probs, -1))

        # mask finished beams
        mask_tensor = self.select(state_finished, self.ninf_tensor, self.zero_tensor)
        total_log_probs = self.add(total_log_probs, self.expand(mask_tensor, -1))

        # reshape scores to [batch, beam*vocab]
        flat_scores = self.reshape(total_log_probs, self.flat_shape)
        # select topk
        topk_scores, topk_indices = self.topk(flat_scores, self.beam_width)

        temp = topk_indices
        beam_indices = self.zeroslike(topk_indices)
        for _ in range(self.beam_width - 1):
            temp = self.sub(temp, self.vocab_size_tensor)
            res = self.cast(self.greater_equal(temp, 0), mstype.int32)
            beam_indices = beam_indices + res
        word_indices = topk_indices - beam_indices * self.vocab_size_tensor
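        # The loop above recovers beam_indices = topk_indices // vocab_size by repeated
        # subtraction (each subtraction that stays non-negative adds 1), and the line
        # above then gives word_indices = topk_indices % vocab_size, splitting each
        # flat top-k index back into (source beam, token id).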
        #======================================================================

        # mask finished indices
        beam_indices = self.select(state_finished, self.beam_ids, beam_indices)
        word_indices = self.select(state_finished, self.eos_ids, word_indices)
        topk_scores = self.select(state_finished, state_log_probs, topk_scores)

        ###### put finished sequences to the end
        # sort according to scores with -inf for finished beams
        tmp_log_probs = self.select(
            self.equal(word_indices, self.eos_ids),
            self.ninf_tensor,
            topk_scores)
        _, tmp_indices = self.topk(tmp_log_probs, self.beam_width)
        # update
        tmp_gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(tmp_indices, -1)))
        beam_indices = self.gather_nd(beam_indices, tmp_gather_indices)
        word_indices = self.gather_nd(word_indices, tmp_gather_indices)
        topk_scores = self.gather_nd(topk_scores, tmp_gather_indices)

        ###### generate new beam_search states
        # gather indices for selecting alive beams
        gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(beam_indices, -1)))

        # length add 1 if not finished in the previous step
        length_add = self.add(state_length, self.one)
        state_length = self.select(state_finished, state_length, length_add)
        state_length = self.gather_nd(state_length, gather_indices)

        # concat seq
        seq = self.gather_nd(state_seq, gather_indices)
        state_seq = self.concat((seq, self.expand(word_indices, -1)))

        # new finished flag and log_probs
        state_finished = self.equal(word_indices, self.eos_ids)
        state_log_probs = topk_scores

        ###### generate new inputs and decoder states
        cur_input_ids = self.reshape(state_seq, (self.batch_size*self.beam_width, -1))
        return cur_input_ids, state_log_probs, state_seq, state_finished, state_length

    def construct(self, enc_states, enc_attention_mask):
        """Get beam search result."""
        cur_input_ids = self.start_ids
        # beam search states
        state_log_probs = self.init_scores
        state_seq = self.init_seq
        state_finished = self.init_finished
        state_length = self.init_length

        for _ in range(self.max_decode_length):
            # run one step decoder to get outputs of the current step
            # shape [batch*beam, 1, vocab]
            cur_input_ids, state_log_probs, state_seq, state_finished, state_length = self.one_step(
                cur_input_ids, enc_states, enc_attention_mask, state_log_probs, state_seq, state_finished, state_length)

        # add length penalty scores
        penalty_len = self.length_penalty(state_length)
        # get penalty length
        log_probs = self.real_div(state_log_probs, penalty_len)

        # sort according to scores
        _, top_beam_indices = self.topk(log_probs, self.beam_width)
        gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(top_beam_indices, -1)))
        # sort sequence
        predicted_ids = self.gather_nd(state_seq, gather_indices)
        # take the first one
        predicted_ids = predicted_ids[::, 0:1:1, ::]
        return predicted_ids
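
# A minimal usage sketch (wiring is illustrative; the decoder construction and the
# sizes below are assumptions, not part of this file):
#   decoder = TransformerDecoderStep(...)   # any Cell returning per-step log-probs
#   searcher = BeamSearchDecoder(batch_size=1, seq_length=128, vocab_size=36560,
#                                decoder=decoder, beam_width=4)
#   predicted_ids = searcher(enc_states, enc_attention_mask)
#   # shape [batch, 1, max_decode_length + 1]: the best beam, sos id included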

@@ -1,4 +1,4 @@
# Copyright 2020 Huawei Technologies Co., Ltd
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -0,0 +1,58 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Data operations, will be used in train.py."""

import mindspore.common.dtype as mstype
import mindspore.dataset as de
import mindspore.dataset.transforms.c_transforms as deC
from .config import transformer_net_cfg, transformer_net_cfg_gpu

de.config.set_seed(1)

def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle="true", dataset_path=None,
                               bucket_boundaries=None, device_target="Ascend"):
    """create dataset"""
    def batch_per_bucket(bucket_len, dataset_path):
        dataset_path = dataset_path + "_" + str(bucket_len) + "_00"
        ds = de.MindDataset(dataset_path,
                            columns_list=["source_eos_ids", "source_eos_mask",
                                          "target_sos_ids", "target_sos_mask",
                                          "target_eos_ids", "target_eos_mask"],
                            shuffle=(do_shuffle == "true"), num_shards=rank_size, shard_id=rank_id)
        type_cast_op = deC.TypeCast(mstype.int32)
        ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids")
        ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask")
        ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids")
        ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask")
        ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids")
        ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask")

        # apply batch operations
        if device_target == "Ascend":
            ds = ds.batch(transformer_net_cfg.batch_size, drop_remainder=True)
        else:
            ds = ds.batch(transformer_net_cfg_gpu.batch_size, drop_remainder=True)

        ds = ds.repeat(epoch_count)
        return ds

    for i, _ in enumerate(bucket_boundaries):
        bucket_len = bucket_boundaries[i]
        ds_per = batch_per_bucket(bucket_len, dataset_path)
        if i == 0:
            ds = ds_per
        else:
            ds = ds + ds_per
    ds = ds.shuffle(ds.get_dataset_size())
    ds.channel_name = 'transformer'
    return ds
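
# File naming convention implied by batch_per_bucket: each bucket length L reads
# "<dataset_path>_L_00", so with dataset_path ".../ende-l128-mindrecord" and
# bucket_boundaries [16, 32, 48, 64, 128] the loader expects files such as
# ".../ende-l128-mindrecord_16_00" through ".../ende-l128-mindrecord_128_00";
# the per-bucket datasets are then concatenated and shuffled together.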

@@ -1,4 +1,4 @@
# Copyright 2020 Huawei Technologies Co., Ltd
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -0,0 +1,52 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Learning rate utilities."""

def linear_warmup(warmup_steps, current_step):
    return min([1.0, float(current_step)/float(warmup_steps)])

def rsqrt_decay(warmup_steps, current_step):
    return float(max([current_step, warmup_steps])) ** -0.5

def rsqrt_hidden(hidden_size):
    return float(hidden_size) ** -0.5

def create_dynamic_lr(schedule, training_steps, learning_rate, warmup_steps, hidden_size,
                      start_decay_step=0, min_lr=0.):
    """
    Generate dynamic learning rate.
    """
    if start_decay_step < warmup_steps:
        start_decay_step = warmup_steps
    lr = []
    for current_step in range(1, training_steps+1):
        cur_lr = 1.0
        for name in schedule.split("*"):
            if name == "constant":
                cur_lr *= float(learning_rate)
            elif name == "rsqrt_hidden":
                cur_lr *= rsqrt_hidden(hidden_size)
            elif name == "linear_warmup":
                cur_lr *= linear_warmup(warmup_steps, current_step)
            elif name == "rsqrt_decay":
                cur_lr *= rsqrt_decay(warmup_steps, current_step-start_decay_step+warmup_steps)
            else:
                raise ValueError("unknown learning rate schedule")
        if warmup_steps < current_step < start_decay_step:
            cur_lr = lr[-1]
        if current_step > warmup_steps:
            cur_lr = max([cur_lr, min_lr])
        lr.append(cur_lr)
    return lr
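
# Worked example (numbers are illustrative): with schedule
# "constant*rsqrt_hidden*linear_warmup*rsqrt_decay", learning_rate=2.0,
# hidden_size=1024, warmup_steps=4000 and start_decay_step clamped to 4000,
# step 4000 yields 2.0 * 1024**-0.5 * 1.0 * 4000**-0.5 ≈ 9.9e-4, after which
# the rate decays as 1/sqrt(step), never dropping below min_lr.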

@@ -0,0 +1,47 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Convert ids to tokens."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

import tokenization

# Explicitly set the encoding
sys.stdin = open(sys.stdin.fileno(), mode='r', encoding='utf-8', buffering=True)
sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=True)

def main():
    parser = argparse.ArgumentParser(
        description="rescore nbest with smoothed sentence-level bleu.")
parser.add_argument("--vocab_file", type=str, default="", required=True, help="vocab file path.")
|
||||
args = parser.parse_args()
|
||||
|
||||
tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file)
|
||||
|
||||
for line in sys.stdin:
|
||||
token_ids = [int(x) for x in line.strip().split()]
|
||||
tokens = tokenizer.convert_ids_to_tokens(token_ids)
|
||||
sent = " ".join(tokens)
|
||||
sent = sent.split("<s>")[-1]
|
||||
sent = sent.split("</s>")[0]
|
||||
print(sent.strip())
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
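
# Typical invocation (script and file names are assumptions): pipe one line of
# space-separated token ids per sentence into the script, e.g.
#   cat output_ids.txt | python process_output.py --vocab_file vocab.bpe.32000 > output.txt
# Everything before the last "<s>" and after the first "</s>" is stripped.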

@@ -0,0 +1,158 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Tokenization utilities."""

import sys
import collections
import unicodedata

def convert_to_printable(text):
    """
    Converts `text` to a printable coding format.
    """
    if sys.version_info[0] == 3:
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        raise ValueError("Only support type `str` or `bytes`, while text type is `%s`" % (type(text)))
    if sys.version_info[0] == 2:
        if isinstance(text, str):
            return text
        if isinstance(text, unicode):
            return text.encode("utf-8")
        raise ValueError("Only support type `str` or `unicode`, while text type is `%s`" % (type(text)))
    raise ValueError("Only supported when running on Python2 or Python3.")


def convert_to_unicode(text):
    """
    Converts `text` to Unicode format.
    """
    if sys.version_info[0] == 3:
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        raise ValueError("Only support type `str` or `bytes`, while text type is `%s`" % (type(text)))
    if sys.version_info[0] == 2:
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        if isinstance(text, unicode):
            return text
        raise ValueError("Only support type `str` or `unicode`, while text type is `%s`" % (type(text)))
    raise ValueError("Only supported when running on Python2 or Python3.")


def load_vocab_file(vocab_file):
    """
    Loads a vocabulary file and turns into a {token:id} dictionary.
    """
    vocab_dict = collections.OrderedDict()
    index = 0
    with open(vocab_file, "r") as vocab:
        while True:
            token = convert_to_unicode(vocab.readline())
            if not token:
                break
            token = token.strip()
            vocab_dict[token] = index
            index += 1
    return vocab_dict


def convert_by_vocab_dict(vocab_dict, items):
    """
    Converts a sequence of [tokens|ids] according to the vocab dict.
    """
    output = []
    for item in items:
        if item in vocab_dict:
            output.append(vocab_dict[item])
        else:
            output.append(vocab_dict["<unk>"])
    return output


class WhiteSpaceTokenizer():
    """
    Whitespace tokenizer.
    """
    def __init__(self, vocab_file):
        self.vocab_dict = load_vocab_file(vocab_file)
        self.inv_vocab_dict = {index: token for token, index in self.vocab_dict.items()}
    def _is_whitespace_char(self, char):
        """
        Checks if it is a whitespace character (regards "\t", "\n", "\r" as whitespace here).
        """
        if char in (" ", "\t", "\n", "\r"):
            return True
        uni = unicodedata.category(char)
        if uni == "Zs":
            return True
        return False

    def _is_control_char(self, char):
        """
        Checks if it is a control character.
        """
        if char in ("\t", "\n", "\r"):
            return False
        uni = unicodedata.category(char)
        if uni in ("Cc", "Cf"):
            return True
        return False

    def _clean_text(self, text):
        """
        Remove invalid characters and cleanup whitespace.
        """
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or self._is_control_char(char):
                continue
            if self._is_whitespace_char(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _whitespace_tokenize(self, text):
        """
        Clean whitespace and split text into tokens.
        """
        text = text.strip()
        if not text:
            tokens = []
        else:
            tokens = text.split()
        return tokens

    def tokenize(self, text):
        """
        Tokenizes text.
        """
        text = convert_to_unicode(text)
        text = self._clean_text(text)
        tokens = self._whitespace_tokenize(text)
        return tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab_dict(self.vocab_dict, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab_dict(self.inv_vocab_dict, ids)
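
# Usage sketch (vocab layout assumed: one token per line, ids assigned in file
# order, with an "<unk>" entry present):
#   tokenizer = WhiteSpaceTokenizer("vocab.txt")
#   ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))
#   tokens = tokenizer.convert_ids_to_tokens(ids)  # unseen tokens map to "<unk>"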

@@ -0,0 +1,472 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Transformer for training."""
import numpy as np

from mindspore.common.initializer import initializer
import mindspore.nn as nn
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter
from mindspore.common import dtype as mstype
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
from mindspore.communication.management import get_group_size
from mindspore.context import ParallelMode
from mindspore import context

from .transformer_model import TransformerModel

GRADIENT_CLIP_TYPE = 1
GRADIENT_CLIP_VALUE = 5.0

clip_grad = C.MultitypeFuncGraph("clip_grad")


@clip_grad.register("Number", "Number", "Tensor")
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip gradients.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
        clip_value (float): Specifies how much to clip.
        grad (tuple[Tensor]): Gradients.

    Outputs:
        tuple[Tensor], clipped gradients.
    """
    if clip_type not in (0, 1):
        return grad
    dt = F.dtype(grad)
    if clip_type == 0:
        new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad


class TransformerTrainingLoss(nn.Cell):
    """
    Provide transformer training loss.

    Args:
        config (TransformerConfig): The config of Transformer.

    Returns:
        Tensor, total loss.
    """
    def __init__(self, config):
        super(TransformerTrainingLoss, self).__init__(auto_prefix=False)
        self.vocab_size = config.vocab_size
        self.onehot = P.OneHot()
        self.on_value = Tensor(float(1 - config.label_smoothing), mstype.float32)
        self.off_value = Tensor(config.label_smoothing / float(self.vocab_size - 1), mstype.float32)
        self.reduce_sum = P.ReduceSum()
        self.reduce_mean = P.ReduceMean()
        self.reshape = P.Reshape()
        self.last_idx = (-1,)
        self.flatten = P.Flatten()
        self.neg = P.Neg()
        self.cast = P.Cast()
        self.batch_size = config.batch_size

    def construct(self, prediction_scores, label_ids, label_weights, seq_length):
        """Defines the computation performed."""
        flat_shape = (self.batch_size * seq_length,)
        label_ids = self.reshape(label_ids, flat_shape)
        label_weights = self.cast(self.reshape(label_weights, flat_shape), mstype.float32)
        one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value)

        per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx))
        numerator = self.reduce_sum(label_weights * per_example_loss, ())
        denominator = self.reduce_sum(label_weights, ()) + \
                      self.cast(F.tuple_to_array((1e-5,)), mstype.float32)
        loss = numerator / denominator
        return loss
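
# Note on TransformerTrainingLoss: the smoothed target distribution puts
# (1 - label_smoothing) on the gold token and label_smoothing / (vocab_size - 1)
# on every other id; the loss is the padding-weighted mean of
# -sum(targets * log_probs), and the 1e-5 term guards against an all-padding batch.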


class TransformerNetworkWithLoss(nn.Cell):
    """
    Provide transformer training loss through network.

    Args:
        config (TransformerConfig): The config of Transformer.
        is_training (bool): Specifies whether to use the training mode.
        use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. Default: False.

    Returns:
        Tensor, the loss of the network.
    """
    def __init__(self, config, is_training, use_one_hot_embeddings=False):
        super(TransformerNetworkWithLoss, self).__init__(auto_prefix=False)
        self.transformer = TransformerModel(config, is_training, use_one_hot_embeddings)
        self.loss = TransformerTrainingLoss(config)
        self.cast = P.Cast()
        self.shape = P.Shape()

    def construct(self,
                  source_ids,
                  source_mask,
                  target_ids,
                  target_mask,
                  label_ids,
                  label_weights):
        """Transformer network with loss."""
        prediction_scores = self.transformer(source_ids, source_mask, target_ids, target_mask)
        seq_length = self.shape(source_ids)[1]
        total_loss = self.loss(prediction_scores, label_ids, label_weights, seq_length)
        return self.cast(total_loss, mstype.float32)


class TransformerTrainOneStepCell(nn.TrainOneStepCell):
    """
    Encapsulation class of transformer network training.

    Append an optimizer to the training network after that the construct
    function can be called to create the backward graph.

    Args:
        network (Cell): The training network. Note that loss function should have been added.
        optimizer (Optimizer): Optimizer for updating the weights.
        sens (Number): The adjust parameter. Default: 1.0.
    """
    def __init__(self, network, optimizer, sens=1.0):
        super(TransformerTrainOneStepCell, self).__init__(network, optimizer, sens)

        self.cast = P.Cast()
        self.hyper_map = C.HyperMap()

    def set_sens(self, value):
        self.sens = value

    def construct(self,
                  source_eos_ids,
                  source_eos_mask,
                  target_sos_ids,
                  target_sos_mask,
                  target_eos_ids,
                  target_eos_mask,):
        """Defines the computation performed."""
        source_ids = source_eos_ids
        source_mask = source_eos_mask
        target_ids = target_sos_ids
        target_mask = target_sos_mask
        label_ids = target_eos_ids
        label_weights = target_eos_mask

        weights = self.weights
        loss = self.network(source_ids,
                            source_mask,
                            target_ids,
                            target_mask,
                            label_ids,
                            label_weights)
        grads = self.grad(self.network, weights)(source_ids,
                                                 source_mask,
                                                 target_ids,
                                                 target_mask,
                                                 label_ids,
                                                 label_weights,
                                                 self.cast(F.tuple_to_array((self.sens,)),
                                                           mstype.float32))
        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
        succ = self.optimizer(grads)
        return F.depend(loss, succ)


grad_scale = C.MultitypeFuncGraph("grad_scale")
reciprocal = P.Reciprocal()


@grad_scale.register("Tensor", "Tensor")
def tensor_grad_scale(scale, grad):
    return grad * F.cast(reciprocal(scale), F.dtype(grad))
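
# grad_scale undoes loss scaling: the backward pass was run against a loss
# multiplied by `scale` (times the allreduce degree in distributed mode), so each
# gradient is multiplied by 1/scale here to recover its true magnitude before clipping.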

_grad_overflow = C.MultitypeFuncGraph("_grad_overflow")
grad_overflow = P.FloatStatus()

@_grad_overflow.register("Tensor")
def _tensor_grad_overflow(grad):
    return grad_overflow(grad)

class TransformerTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
    """
    Encapsulation class of Transformer network training.

    Append an optimizer to the training network after that the construct
    function can be called to create the backward graph.

    Args:
        network (Cell): The training network. Note that loss function should have been added.
        optimizer (Optimizer): Optimizer for updating the weights.
        scale_update_cell (Cell): Cell to do the loss scale. Default: None.
    """
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(TransformerTrainOneStepWithLossScaleCell, self).__init__(network, optimizer, scale_update_cell)
        self.cast = P.Cast()
        self.degree = 1
        if self.reducer_flag:
            self.degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))

    def construct(self,
                  source_eos_ids,
                  source_eos_mask,
                  target_sos_ids,
                  target_sos_mask,
                  target_eos_ids,
                  target_eos_mask,
                  sens=None):
        """Defines the computation performed."""
        source_ids = source_eos_ids
        source_mask = source_eos_mask
        target_ids = target_sos_ids
        target_mask = target_sos_mask
        label_ids = target_eos_ids
        label_weights = target_eos_mask

        weights = self.weights
        loss = self.network(source_ids,
                            source_mask,
                            target_ids,
                            target_mask,
                            label_ids,
                            label_weights)
        if sens is None:
            scaling_sens = self.loss_scale
        else:
            scaling_sens = sens
        status, scaling_sens = self.start_overflow_check(loss, scaling_sens)
        grads = self.grad(self.network, weights)(source_ids,
                                                 source_mask,
                                                 target_ids,
                                                 target_mask,
                                                 label_ids,
                                                 label_weights,
                                                 self.cast(scaling_sens,
                                                           mstype.float32))

        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
        grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)

        cond = self.get_overflow_status(status, grads)
        overflow = cond
        if sens is None:
            overflow = self.loss_scaling_manager(self.loss_scale, cond)
        if overflow:
            succ = False
        else:
            succ = self.optimizer(grads)
        ret = (loss, cond, scaling_sens)
        return F.depend(ret, succ)
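
# Dynamic loss-scaling flow: the backward pass runs with the loss multiplied by
# scaling_sens; if any gradient overflowed (inf/nan), the optimizer step is skipped
# and the DynamicLossScaleManager shrinks the scale by scale_factor, otherwise the
# scale may grow again after scale_window consecutive clean steps.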


cast = P.Cast()
add_grads = C.MultitypeFuncGraph("add_grads")


@add_grads.register("Tensor", "Tensor")
def _add_grads(accu_grad, grad):
    return accu_grad + cast(grad, mstype.float32)

update_accu_grads = C.MultitypeFuncGraph("update_accu_grads")

@update_accu_grads.register("Tensor", "Tensor")
def _update_accu_grads(accu_grad, grad):
    succ = True
    return F.depend(succ, F.assign(accu_grad, cast(grad, mstype.float32)))

accumulate_accu_grads = C.MultitypeFuncGraph("accumulate_accu_grads")

@accumulate_accu_grads.register("Tensor", "Tensor")
def _accumulate_accu_grads(accu_grad, grad):
    succ = True
    return F.depend(succ, F.assign_add(accu_grad, cast(grad, mstype.float32)))


zeroslike = P.ZerosLike()
reset_accu_grads = C.MultitypeFuncGraph("reset_accu_grads")


@reset_accu_grads.register("Tensor")
def _reset_accu_grads(accu_grad):
    succ = True
    return F.depend(succ, F.assign(accu_grad, zeroslike(accu_grad)))


class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
    """
    Encapsulation class of transformer network training.

    Append an optimizer to the training network after that the construct
    function can be called to create the backward graph.

    To mimic higher batch size, gradients are accumulated N times before weight update.

    For distribution mode, allreduce will only be implemented in the weight updated step,
    i.e. the sub-step after gradients accumulated N times.

    Args:
        network (Cell): The training network. Note that loss function should have been added.
        optimizer (Optimizer): Optimizer for updating the weights.
        scale_update_cell (Cell): Cell to do the loss scale. Default: None.
        accumulation_steps (int): Number of accumulation steps before gradient update. The global batch size =
                                  batch_size * accumulation_steps. Default: 1.
    """

    def __init__(self, network, optimizer, scale_update_cell=None, accumulation_steps=8, enable_global_norm=False):
        super(TransformerTrainAccumulationAllReducePostWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.accumulation_steps = accumulation_steps
        self.enable_global_norm = enable_global_norm
        self.one = Tensor(np.array([1]).astype(np.int32))
        self.zero = Tensor(np.array([0]).astype(np.int32))
        self.local_step = Parameter(initializer(0, [1], mstype.int32))
        self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros')
        self.accu_overflow = Parameter(initializer(0, [1], mstype.int32))
        self.accu_loss = Parameter(initializer(0, [1], mstype.float32))

        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = F.identity
        self.degree = 1
        if self.reducer_flag:
            self.degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.overflow_reducer = F.identity
        if self.is_distributed:
            self.overflow_reducer = P.AllReduce()
        self.cast = P.Cast()
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_status = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.logical_or = P.LogicalOr()
        self.not_equal = P.NotEqual()
        self.select = P.Select()
        self.reshape = P.Reshape()
        self.hyper_map = C.HyperMap()
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))

    def construct(self,
                  source_eos_ids,
                  source_eos_mask,
                  target_sos_ids,
                  target_sos_mask,
                  target_eos_ids,
                  target_eos_mask,
                  sens=None):
        """Defines the computation performed."""
        source_ids = source_eos_ids
        source_mask = source_eos_mask
        target_ids = target_sos_ids
        target_mask = target_sos_mask
        label_ids = target_eos_ids
        label_weights = target_eos_mask

        weights = self.weights
        loss = self.network(source_ids,
                            source_mask,
                            target_ids,
                            target_mask,
                            label_ids,
                            label_weights)
        if sens is None:
            scaling_sens = self.loss_scale
        else:
            scaling_sens = sens
        # alloc status and clear should be right before gradoperation
        init = self.alloc_status()
        init = F.depend(init, loss)
        clear_status = self.clear_status(init)
        scaling_sens = F.depend(scaling_sens, clear_status)
        # update accumulation parameters
        is_accu_step = self.not_equal(self.local_step, self.accumulation_steps)
        self.local_step = self.select(is_accu_step, self.local_step + self.one, self.one)
        self.accu_loss = self.select(is_accu_step, self.accu_loss + loss, loss)
        mean_loss = self.accu_loss / self.local_step
        is_accu_step = self.not_equal(self.local_step, self.accumulation_steps)

        grads = self.grad(self.network, weights)(source_ids,
                                                 source_mask,
                                                 target_ids,
                                                 target_mask,
                                                 label_ids,
                                                 label_weights,
                                                 self.cast(scaling_sens,
                                                           mstype.float32))

        accu_succ = self.hyper_map(accumulate_accu_grads, self.accu_grads, grads)
        mean_loss = F.depend(mean_loss, accu_succ)

        init = F.depend(init, mean_loss)
        get_status = self.get_status(init)
        init = F.depend(init, get_status)
        flag_sum = self.reduce_sum(init, (0,))
        overflow = self.less_equal(self.base, flag_sum)
        overflow = self.logical_or(self.not_equal(self.accu_overflow, self.zero), overflow)
        accu_overflow = self.select(overflow, self.one, self.zero)
        self.accu_overflow = self.select(is_accu_step, accu_overflow, self.zero)

        if is_accu_step:
            succ = False
        else:
            # apply grad reducer on grads
            grads = self.grad_reducer(self.accu_grads)
            scaling = scaling_sens * self.degree * self.accumulation_steps
            grads = self.hyper_map(F.partial(grad_scale, scaling), grads)
            if self.enable_global_norm:
                grads = C.clip_by_global_norm(grads, 1.0, None)
            else:
                grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
            accu_overflow = F.depend(accu_overflow, grads)
            accu_overflow = self.overflow_reducer(accu_overflow)
            overflow = self.less_equal(self.base, accu_overflow)
            accu_succ = self.hyper_map(reset_accu_grads, self.accu_grads)
            overflow = F.depend(overflow, accu_succ)
            overflow = self.reshape(overflow, (()))
            if sens is None:
                overflow = self.loss_scaling_manager(self.loss_scale, overflow)
            if overflow:
                succ = False
            else:
                succ = self.optimizer(grads)

        ret = (mean_loss, overflow, scaling_sens)
        return F.depend(ret, succ)
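
# Accumulation flow: for accumulation_steps N, each micro-step adds its scaled
# gradients into accu_grads and only records overflow; on the N-th step the
# accumulated gradients are allreduced, rescaled by 1/(loss_scale * degree * N),
# clipped, and applied in a single optimizer update, then accu_grads is reset.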

File diff suppressed because it is too large
@@ -0,0 +1,52 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Weight init utilities."""

import math
import numpy as np
from mindspore.common.tensor import Tensor

def _average_units(shape):
    """
    Average shape dim.
    """
    if not shape:
        return 1.
    if len(shape) == 1:
        return float(shape[0])
    if len(shape) == 2:
        return float(shape[0] + shape[1]) / 2.
    raise RuntimeError("not support shape.")

def weight_variable(shape):
    scale_shape = shape
    avg_units = _average_units(scale_shape)
    scale = 1.0 / max(1., avg_units)
    limit = math.sqrt(3.0 * scale)
    values = np.random.uniform(-limit, limit, shape).astype(np.float32)
    return Tensor(values)

def one_weight(shape):
    ones = np.ones(shape).astype(np.float32)
    return Tensor(ones)

def zero_weight(shape):
    zeros = np.zeros(shape).astype(np.float32)
    return Tensor(zeros)

def normal_weight(shape, num_units):
    norm = np.random.normal(0.0, num_units**-0.5, shape).astype(np.float32)
    return Tensor(norm)
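
# weight_variable draws from U(-sqrt(3/avg_units), +sqrt(3/avg_units)), which has
# variance 1/avg_units (a Glorot/LeCun-style uniform init); normal_weight uses
# std = num_units**-0.5, a common choice for Transformer embedding tables.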

@@ -27,12 +27,10 @@ from mindspore.train.callback import Callback
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC
from mindspore import context
from model_zoo.official.nlp.transformer.src.transformer_model import TransformerConfig
from model_zoo.official.nlp.transformer.src.transformer_for_train import TransformerNetworkWithLoss, \
    TransformerTrainOneStepWithLossScaleCell
from model_zoo.official.nlp.transformer.src.config import cfg, transformer_net_cfg
from model_zoo.official.nlp.transformer.src.lr_schedule import create_dynamic_lr

from src.transformer_model import TransformerConfig
from src.transformer_for_train import TransformerNetworkWithLoss, TransformerTrainOneStepWithLossScaleCell
from src.config import cfg, transformer_net_cfg
from src.lr_schedule import create_dynamic_lr
from tests.st.model_zoo_tests import utils

DATA_DIR = ["/home/workspace/mindspore_dataset/transformer/test-mindrecord"]
@@ -216,14 +214,14 @@ def test_transformer_export_mindir():
    export_file = "transformer80_bs_0"
    ckpt_path = os.path.join(utils.ckpt_root, "transformer/transformer_trained.ckpt")
    print("ckpt_path:", ckpt_path)
    old_list = ["'model_file': '/your/path/checkpoint_file'"]
    new_list = ["'model_file': '{}'".format(ckpt_path)]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "src/eval_config.py"))
    old_list = ["context.set_context(device_id=args.device_id)"]
    old_list = ["model_file: './transformer/transformer_trained.ckpt'"]
    new_list = ["model_file: '{}'".format(ckpt_path)]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "default_config_large.yaml"))
    old_list = ["context.set_context(device_id=get_device_id())"]
    new_list = ["context.set_context()"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "export.py"))
    exec_export_shell = "cd transformer; python -u export.py --file_name={}" \
                        " --file_format=MINDIR".format(export_file)
                        " --file_format=MINDIR --config_path=./default_config_large.yaml".format(export_file)
    os.system(exec_export_shell)
    assert os.path.exists(os.path.join(cur_model_path, "{}.mindir".format(export_file)))