From 786de3252e8b07aa5823ae36009c92c8ab10c763 Mon Sep 17 00:00:00 2001 From: yuchaojie Date: Mon, 14 Sep 2020 14:37:22 +0800 Subject: [PATCH] delete transformer's enable_data_sink option && modify create_data --- model_zoo/official/nlp/transformer/README.md | 12 ++- .../official/nlp/transformer/create_data.py | 87 ++++++++++++------- .../scripts/run_distribute_train_ascend.sh | 4 +- .../scripts/run_standalone_train_ascend.sh | 4 +- .../official/nlp/transformer/src/dataset.py | 4 +- model_zoo/official/nlp/transformer/train.py | 16 +--- 6 files changed, 69 insertions(+), 58 deletions(-) diff --git a/model_zoo/official/nlp/transformer/README.md b/model_zoo/official/nlp/transformer/README.md index d789d01711..59439f4232 100644 --- a/model_zoo/official/nlp/transformer/README.md +++ b/model_zoo/official/nlp/transformer/README.md @@ -101,10 +101,9 @@ python eval.py > eval.log 2>&1 & usage: train.py [--distribute DISTRIBUTE] [--epoch_size N] [----device_num N] [--device_id N] [--enable_save_ckpt ENABLE_SAVE_CKPT] [--enable_lossscale ENABLE_LOSSSCALE] [--do_shuffle DO_SHUFFLE] - [--enable_data_sink ENABLE_DATA_SINK] [--save_checkpoint_steps N] - [--save_checkpoint_num N] [--save_checkpoint_path SAVE_CHECKPOINT_PATH] - [--data_path DATA_PATH] - [--bucket_boundaries BUCKET_LENGTH] + [--save_checkpoint_steps N] [--save_checkpoint_num N] + [--save_checkpoint_path SAVE_CHECKPOINT_PATH] + [--data_path DATA_PATH] [--bucket_boundaries BUCKET_LENGTH] options: --distribute pre_training by serveral devices: "true"(training by more than 1 device) | "false", default is "false" @@ -114,7 +113,6 @@ options: --enable_save_ckpt enable save checkpoint: "true" | "false", default is "true" --enable_lossscale enable lossscale: "true" | "false", default is "true" --do_shuffle enable shuffle: "true" | "false", default is "true" - --enable_data_sink enable data sink: "true" | "false", default is "false" --checkpoint_path path to load checkpoint files: PATH, default is "" --save_checkpoint_steps steps for saving checkpoint files: N, default is 2500 --save_checkpoint_num number for saving checkpoint files: N, default is 30 @@ -143,7 +141,7 @@ eval_config.py: ``` Parameters for dataset and network (Training/Evaluation): batch_size batch size of input dataset: N, default is 96 - seq_length length of input sequence: N, default is 128 + seq_length max length of input sequence: N, default is 128 vocab_size size of each embedding vector: N, default is 36560 hidden_size size of Transformer encoder layers: N, default is 1024 num_hidden_layers number of hidden layers: N, default is 6 @@ -181,7 +179,7 @@ Parameters for learning rate: ``` bash paste train.tok.clean.bpe.32000.en train.tok.clean.bpe.32000.de > train.all - python create_data.py --input_file train.all --vocab_file vocab.bpe.32000 --output_file /path/ende-l128-mindrecord --max_seq_length 128 --bucket [16, 32, 48, 64, 128] + python create_data.py --input_file train.all --vocab_file vocab.bpe.32000 --output_file /path/ende-l128-mindrecord --max_seq_length 128 --bucket [16,32,48,64,128] ``` - Convert the original data to mindrecord for evaluation: diff --git a/model_zoo/official/nlp/transformer/create_data.py b/model_zoo/official/nlp/transformer/create_data.py index 6c68c29696..d8c4baeecb 100644 --- a/model_zoo/official/nlp/transformer/create_data.py +++ b/model_zoo/official/nlp/transformer/create_data.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function import argparse +import ast import collections import logging import numpy as np @@ 
-51,23 +52,23 @@ class SampleInstance(): return self.__str__() -def write_instance_to_file(writer, instance, tokenizer, max_seq_length, bucket): - """Create files from `SampleInstance`s.""" - def _find_bucket_length(num): +def get_instance_features(instance, tokenizer, max_seq_length, bucket): + """Get features from `SampleInstance`s.""" + def _find_bucket_length(source_tokens, target_tokens): + source_ids = tokenizer.convert_tokens_to_ids(source_tokens) + target_ids = tokenizer.convert_tokens_to_ids(target_tokens) + num = max(len(source_ids), len(target_ids)) assert num <= bucket[-1] - for index in range(1, len(bucket)): if bucket[index - 1] < num <= bucket[index]: return bucket[index] return bucket[0] - def _convert_ids_and_mask(input_tokens): + def _convert_ids_and_mask(input_tokens, seq_max_bucket_length): input_ids = tokenizer.convert_tokens_to_ids(input_tokens) input_mask = [1] * len(input_ids) assert len(input_ids) <= max_seq_length - seq_max_bucket_length = _find_bucket_length(len(input_ids)) - while len(input_ids) < seq_max_bucket_length: input_ids.append(0) input_mask.append(0) @@ -77,10 +78,11 @@ def write_instance_to_file(writer, instance, tokenizer, max_seq_length, bucket): return input_ids, input_mask - source_sos_ids, source_sos_mask = _convert_ids_and_mask(instance.source_sos_tokens) - source_eos_ids, source_eos_mask = _convert_ids_and_mask(instance.source_eos_tokens) - target_sos_ids, target_sos_mask = _convert_ids_and_mask(instance.target_sos_tokens) - target_eos_ids, target_eos_mask = _convert_ids_and_mask(instance.target_eos_tokens) + seq_max_bucket_length = _find_bucket_length(instance.source_sos_tokens, instance.target_sos_tokens) + source_sos_ids, source_sos_mask = _convert_ids_and_mask(instance.source_sos_tokens, seq_max_bucket_length) + source_eos_ids, source_eos_mask = _convert_ids_and_mask(instance.source_eos_tokens, seq_max_bucket_length) + target_sos_ids, target_sos_mask = _convert_ids_and_mask(instance.target_sos_tokens, seq_max_bucket_length) + target_eos_ids, target_eos_mask = _convert_ids_and_mask(instance.target_eos_tokens, seq_max_bucket_length) features = collections.OrderedDict() features["source_sos_ids"] = np.asarray(source_sos_ids) @@ -92,8 +94,7 @@ def write_instance_to_file(writer, instance, tokenizer, max_seq_length, bucket): features["target_eos_ids"] = np.asarray(target_eos_ids) features["target_eos_mask"] = np.asarray(target_eos_mask) - writer.write_raw_data([features]) - return features + return features, seq_max_bucket_length def create_training_instance(source_words, target_words, max_seq_length, clip_to_max_len): """Creates `SampleInstance`s for a single sentence pair.""" @@ -131,7 +132,8 @@ def main(): parser.add_argument("--clip_to_max_len", type=bool, default=False, help='clip sequences to maximum sequence length.') parser.add_argument("--max_seq_length", type=int, default=128, help='Maximum sequence length.') - parser.add_argument("--bucket", type=list, default=[16, 32, 48, 64, 128], help='bucket sequence length') + parser.add_argument("--bucket", type=ast.literal_eval, default=[16, 32, 48, 64, 128], + help='bucket sequence length') args = parser.parse_args() @@ -141,29 +143,21 @@ def main(): for input_pattern in args.input_file.split(","): input_files.append(input_pattern) - logging.info("*** Reading from input files ***") + logging.info("*** Read from input files ***") for input_file in input_files: logging.info(" %s", input_file) output_file = args.output_file - logging.info("*** Writing to output files ***") + logging.info("*** 
Write to output files ***") logging.info(" %s", output_file) - writer = FileWriter(output_file, args.num_splits) - data_schema = {"source_sos_ids": {"type": "int64", "shape": [-1]}, - "source_sos_mask": {"type": "int64", "shape": [-1]}, - "source_eos_ids": {"type": "int64", "shape": [-1]}, - "source_eos_mask": {"type": "int64", "shape": [-1]}, - "target_sos_ids": {"type": "int64", "shape": [-1]}, - "target_sos_mask": {"type": "int64", "shape": [-1]}, - "target_eos_ids": {"type": "int64", "shape": [-1]}, - "target_eos_mask": {"type": "int64", "shape": [-1]} - } - writer.add_schema(data_schema, "tranformer hisi") - total_written = 0 total_read = 0 + feature_dict = {} + for i in args.bucket: + feature_dict[i] = [] + for input_file in input_files: logging.info("*** Reading from %s ***", input_file) with open(input_file, "r") as reader: @@ -174,7 +168,7 @@ def main(): total_read += 1 if total_read % 100000 == 0: - logging.info("%d ...", total_read) + logging.info("Read %d ...", total_read) source_line, target_line = line.strip().split("\t") source_tokens = tokenizer.tokenize(source_line) @@ -189,10 +183,13 @@ def main(): if instance is None: continue - features = write_instance_to_file(writer, instance, tokenizer, args.max_seq_length, args.bucket) - total_written += 1 + features, seq_max_bucket_length = get_instance_features(instance, tokenizer, args.max_seq_length, + args.bucket) + for key in feature_dict: + if key == seq_max_bucket_length: + feature_dict[key].append(features) - if total_written <= 20: + if total_read <= 10: logging.info("*** Example ***") logging.info("source tokens: %s", " ".join( [tokenization.convert_to_printable(x) for x in instance.source_eos_tokens])) @@ -203,9 +200,33 @@ def main(): feature = features[feature_name] logging.info("%s: %s", feature_name, feature) - writer.commit() + for i in args.bucket: + if args.num_splits == 1: + output_file_name = output_file + else: + output_file_name = output_file + '_' + str(i) + '_' + writer = FileWriter(output_file_name, args.num_splits) + data_schema = {"source_sos_ids": {"type": "int64", "shape": [-1]}, + "source_sos_mask": {"type": "int64", "shape": [-1]}, + "source_eos_ids": {"type": "int64", "shape": [-1]}, + "source_eos_mask": {"type": "int64", "shape": [-1]}, + "target_sos_ids": {"type": "int64", "shape": [-1]}, + "target_sos_mask": {"type": "int64", "shape": [-1]}, + "target_eos_ids": {"type": "int64", "shape": [-1]}, + "target_eos_mask": {"type": "int64", "shape": [-1]} + } + writer.add_schema(data_schema, "tranformer") + features_ = feature_dict[i] + logging.info("Bucket length %d has %d samples, start writing...", i, len(features_)) + + for item in features_: + writer.write_raw_data([item]) + total_written += 1 + writer.commit() + logging.info("Wrote %d total instances", total_written) if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) main() diff --git a/model_zoo/official/nlp/transformer/scripts/run_distribute_train_ascend.sh b/model_zoo/official/nlp/transformer/scripts/run_distribute_train_ascend.sh index ea6ea614dc..057e93a0e5 100644 --- a/model_zoo/official/nlp/transformer/scripts/run_distribute_train_ascend.sh +++ b/model_zoo/official/nlp/transformer/scripts/run_distribute_train_ascend.sh @@ -52,11 +52,11 @@ do --enable_save_ckpt="true" \ --enable_lossscale="true" \ --do_shuffle="true" \ - --enable_data_sink="false" \ --checkpoint_path="" \ --save_checkpoint_steps=2500 \ --save_checkpoint_num=30 \ - --data_path=$DATA_PATH > log.txt 2>&1 & + --data_path=$DATA_PATH \ + 
--bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 & cd ../ done cd .. \ No newline at end of file diff --git a/model_zoo/official/nlp/transformer/scripts/run_standalone_train_ascend.sh b/model_zoo/official/nlp/transformer/scripts/run_standalone_train_ascend.sh index 8e677191a8..1d1e129d14 100644 --- a/model_zoo/official/nlp/transformer/scripts/run_standalone_train_ascend.sh +++ b/model_zoo/official/nlp/transformer/scripts/run_standalone_train_ascend.sh @@ -37,9 +37,9 @@ python train.py \ --enable_save_ckpt="true" \ --enable_lossscale="true" \ --do_shuffle="true" \ - --enable_data_sink="false" \ --checkpoint_path="" \ --save_checkpoint_steps=2500 \ --save_checkpoint_num=30 \ - --data_path=$DATA_PATH > log.txt 2>&1 & + --data_path=$DATA_PATH \ + --bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 & cd .. \ No newline at end of file diff --git a/model_zoo/official/nlp/transformer/src/dataset.py b/model_zoo/official/nlp/transformer/src/dataset.py index 9426f6fd96..ac6ca9479f 100644 --- a/model_zoo/official/nlp/transformer/src/dataset.py +++ b/model_zoo/official/nlp/transformer/src/dataset.py @@ -19,8 +19,8 @@ import mindspore.dataset as de import mindspore.dataset.transforms.c_transforms as deC from .config import transformer_net_cfg de.config.set_seed(1) -def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle="true", enable_data_sink="true", - dataset_path=None, bucket_boundaries=None): +def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle="true", dataset_path=None, + bucket_boundaries=None): """create dataset""" def batch_per_bucket(bucket_len, dataset_path): dataset_path = dataset_path + "_" + str(bucket_len) + "_00" diff --git a/model_zoo/official/nlp/transformer/train.py b/model_zoo/official/nlp/transformer/train.py index 31a9b0dcee..fbb1183d87 100644 --- a/model_zoo/official/nlp/transformer/train.py +++ b/model_zoo/official/nlp/transformer/train.py @@ -16,6 +16,7 @@ import time import argparse +import ast import mindspore.common.dtype as mstype from mindspore.common.tensor import Tensor @@ -94,8 +95,6 @@ def argparse_init(): help="Use lossscale or not, default is true.") parser.add_argument("--do_shuffle", type=str, default="true", choices=['true', 'false'], help="Enable shuffle for dataset, default is true.") - parser.add_argument("--enable_data_sink", type=str, default="false", choices=['true', 'false'], - help="Enable data sink, default is false.") parser.add_argument("--checkpoint_path", type=str, default="", help="Checkpoint file path") parser.add_argument("--enable_save_ckpt", type=str, default="true", choices=['true', 'false'], help="Enable save checkpoint, default is true.") @@ -105,8 +104,8 @@ def argparse_init(): parser.add_argument("--save_checkpoint_path", type=str, default="./checkpoint/", help="Save checkpoint file path, " "default is ./checkpoint/") parser.add_argument("--data_path", type=str, default="", help="Data path, it is better to use absolute path") - parser.add_argument("--bucket_boundaries", type=list, default=[16, 32, 48, 64, 128], help="sequence length for " - "different bucket") + parser.add_argument("--bucket_boundaries", type=ast.literal_eval, default=[16, 32, 48, 64, 128], + help="sequence length for different bucket") return parser @@ -131,7 +130,6 @@ def run_transformer_train(): rank_id = 0 dataset = create_transformer_dataset(epoch_count=1, rank_size=device_num, rank_id=rank_id, do_shuffle=args.do_shuffle, - enable_data_sink=args.enable_data_sink, dataset_path=args.data_path, 
bucket_boundaries=args.bucket_boundaries) @@ -171,13 +169,7 @@ def run_transformer_train(): netwithgrads.set_train(True) model = Model(netwithgrads) - enable_sink = (args.enable_data_sink == "true") - if enable_sink: - sink_size = args.save_checkpoint_steps - model.train(args.epoch_size*dataset.get_dataset_size()//sink_size, dataset, callbacks=callbacks, - dataset_sink_mode=enable_sink, sink_size=sink_size) - else: - model.train(args.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=enable_sink) + model.train(args.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=False) if __name__ == '__main__': run_transformer_train()
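
For reference, the reason `--bucket` in create_data.py and `--bucket_boundaries` in train.py move from `type=list` to `type=ast.literal_eval`: applying `list()` to the raw command-line string splits it into individual characters, while `ast.literal_eval` parses the bracketed literal into a list of integers. This is also why the README example and the shell scripts pass the value without spaces (`[16,32,48,64,128]`), so the shell keeps it as one argument. A minimal standalone sketch (the `--bucket_old` flag is illustrative only, not part of the patch):

```python
# Standalone sketch: old vs. new parsing of list-valued flags.
import argparse
import ast

parser = argparse.ArgumentParser()
# Old behaviour: type=list calls list() on the raw string -> list of characters.
parser.add_argument("--bucket_old", type=list, default=[16, 32, 48, 64, 128])
# New behaviour: ast.literal_eval parses the string as a Python list literal.
parser.add_argument("--bucket", type=ast.literal_eval, default=[16, 32, 48, 64, 128])

args = parser.parse_args(["--bucket_old", "[16,32,48,64,128]",
                          "--bucket", "[16,32,48,64,128]"])
print(args.bucket_old[:4])  # ['[', '1', '6', ','] -- characters, unusable as lengths
print(args.bucket)          # [16, 32, 48, 64, 128]
```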
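
The reworked `get_instance_features` pads each sentence pair to the smallest configured bucket that can hold the longer of its two token-id sequences, rather than bucketing source and target independently as the old `write_instance_to_file` did. A simplified standalone rendition of that rule (plain id lists in place of the tokenizer and `SampleInstance`):

```python
# Simplified sketch of the bucket selection and padding used in create_data.py.
def find_bucket_length(source_ids, target_ids, bucket):
    """Smallest bucket length that fits the longer of the two sequences."""
    num = max(len(source_ids), len(target_ids))
    assert num <= bucket[-1]
    for index in range(1, len(bucket)):
        if bucket[index - 1] < num <= bucket[index]:
            return bucket[index]
    return bucket[0]

def pad_to_bucket(ids, bucket_length):
    """Pad token ids with 0 and build the matching input mask."""
    mask = [1] * len(ids) + [0] * (bucket_length - len(ids))
    return ids + [0] * (bucket_length - len(ids)), mask

bucket = [16, 32, 48, 64, 128]
source_ids, target_ids = list(range(1, 20)), list(range(1, 10))  # 19 and 9 tokens
length = find_bucket_length(source_ids, target_ids, bucket)      # -> 32
padded_ids, mask = pad_to_bucket(target_ids, length)             # both length 32
print(length, sum(mask))                                         # 32 9
```

Because both sides of a pair share one bucket length, every record written under a given bucket has a fixed shape, which is what allows dataset.py to batch each bucket's file separately.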
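
On the output side, create_data.py now writes one MindRecord file (or shard group) per bucket length, and `create_transformer_dataset` opens them as `dataset_path + "_" + str(bucket_len) + "_00"`. The sketch below only strings those two naming rules together for the `num_splits > 1` branch; the trailing `00` is assumed to come from FileWriter's shard-index suffix, which should be checked against the files actually produced.

```python
# Illustrative only: expected alignment between the per-bucket names on the
# write side (create_data.py) and the paths opened on the read side
# (src/dataset.py). ASSUMPTION: FileWriter completes the trailing "_" with a
# zero-padded shard index such as "00" when num_splits > 1.
output_file = "/path/ende-l128-mindrecord"   # same value as --output_file
bucket = [16, 32, 48, 64, 128]               # same value as --bucket / --bucket_boundaries

for bucket_len in bucket:
    written_prefix = output_file + "_" + str(bucket_len) + "_"   # create_data.py, num_splits > 1
    read_path = output_file + "_" + str(bucket_len) + "_00"      # dataset.py, batch_per_bucket
    print(f"bucket {bucket_len:3d}: written as {written_prefix}*  read as {read_path}")
```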