!6174 delete transformer's enable_data_sink option
Merge pull request !6174 from yuchaojie/transformer2
Commit: 122e966277
@@ -101,10 +101,9 @@ python eval.py > eval.log 2>&1 &
 usage: train.py  [--distribute DISTRIBUTE] [--epoch_size N] [--device_num N] [--device_id N]
                  [--enable_save_ckpt ENABLE_SAVE_CKPT]
                  [--enable_lossscale ENABLE_LOSSSCALE] [--do_shuffle DO_SHUFFLE]
-                 [--enable_data_sink ENABLE_DATA_SINK] [--save_checkpoint_steps N]
-                 [--save_checkpoint_num N] [--save_checkpoint_path SAVE_CHECKPOINT_PATH]
-                 [--data_path DATA_PATH]
-                 [--bucket_boundaries BUCKET_LENGTH]
+                 [--save_checkpoint_steps N] [--save_checkpoint_num N]
+                 [--save_checkpoint_path SAVE_CHECKPOINT_PATH]
+                 [--data_path DATA_PATH] [--bucket_boundaries BUCKET_LENGTH]
 
 options:
     --distribute              pre_training by several devices: "true"(training by more than 1 device) | "false", default is "false"
@@ -114,7 +113,6 @@ options:
     --enable_save_ckpt        enable save checkpoint: "true" | "false", default is "true"
     --enable_lossscale        enable lossscale: "true" | "false", default is "true"
     --do_shuffle              enable shuffle: "true" | "false", default is "true"
-    --enable_data_sink        enable data sink: "true" | "false", default is "false"
     --checkpoint_path         path to load checkpoint files: PATH, default is ""
     --save_checkpoint_steps   steps for saving checkpoint files: N, default is 2500
     --save_checkpoint_num     number for saving checkpoint files: N, default is 30
@@ -143,7 +141,7 @@ eval_config.py:
 ```
 Parameters for dataset and network (Training/Evaluation):
     batch_size                batch size of input dataset: N, default is 96
-    seq_length                length of input sequence: N, default is 128
+    seq_length                max length of input sequence: N, default is 128
     vocab_size                size of each embedding vector: N, default is 36560
     hidden_size               size of Transformer encoder layers: N, default is 1024
     num_hidden_layers         number of hidden layers: N, default is 6
@@ -181,7 +179,7 @@ Parameters for learning rate:
 
 ``` bash
 paste train.tok.clean.bpe.32000.en train.tok.clean.bpe.32000.de > train.all
-python create_data.py --input_file train.all --vocab_file vocab.bpe.32000 --output_file /path/ende-l128-mindrecord --max_seq_length 128 --bucket [16, 32, 48, 64, 128]
+python create_data.py --input_file train.all --vocab_file vocab.bpe.32000 --output_file /path/ende-l128-mindrecord --max_seq_length 128 --bucket [16,32,48,64,128]
 ```
 - Convert the original data to mindrecord for evaluation:
 
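The `--bucket` value is now written without spaces. Because create_data.py parses this flag with `ast.literal_eval` (see the argparse change further down), the whole list has to reach Python as a single shell word; `[16, 32, 48, 64, 128]` would be split by the shell into five separate arguments. A minimal sketch of the intended parsing, assuming the value arrives as one token:

```python
import ast

# The list written without spaces survives shell word-splitting as one argument.
bucket = ast.literal_eval("[16,32,48,64,128]")
print(bucket)        # [16, 32, 48, 64, 128]
print(type(bucket))  # <class 'list'>
```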
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
+import ast
 import collections
 import logging
 import numpy as np
@@ -51,23 +52,23 @@ class SampleInstance():
         return self.__str__()
 
 
-def write_instance_to_file(writer, instance, tokenizer, max_seq_length, bucket):
-    """Create files from `SampleInstance`s."""
-    def _find_bucket_length(num):
+def get_instance_features(instance, tokenizer, max_seq_length, bucket):
+    """Get features from `SampleInstance`s."""
+    def _find_bucket_length(source_tokens, target_tokens):
+        source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
+        target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
+        num = max(len(source_ids), len(target_ids))
         assert num <= bucket[-1]
 
         for index in range(1, len(bucket)):
             if bucket[index - 1] < num <= bucket[index]:
                 return bucket[index]
         return bucket[0]
 
-    def _convert_ids_and_mask(input_tokens):
+    def _convert_ids_and_mask(input_tokens, seq_max_bucket_length):
         input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
         input_mask = [1] * len(input_ids)
         assert len(input_ids) <= max_seq_length
 
-        seq_max_bucket_length = _find_bucket_length(len(input_ids))
-
         while len(input_ids) < seq_max_bucket_length:
             input_ids.append(0)
             input_mask.append(0)
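In the updated helper, `_find_bucket_length` now works on the raw source and target token lists and picks the smallest configured bucket that fits the longer of the two id sequences; `_convert_ids_and_mask` then pads only up to that bucket length instead of up to `max_seq_length`. A standalone sketch of the same selection rule (function and variable names here are illustrative):

```python
def find_bucket_length(num_tokens, bucket):
    """Return the smallest bucket boundary that can hold num_tokens."""
    assert num_tokens <= bucket[-1]
    for index in range(1, len(bucket)):
        if bucket[index - 1] < num_tokens <= bucket[index]:
            return bucket[index]
    return bucket[0]

# With the default boundaries, a 20-token pair is padded to 32, a 9-token pair to 16.
print(find_bucket_length(20, [16, 32, 48, 64, 128]))  # 32
print(find_bucket_length(9, [16, 32, 48, 64, 128]))   # 16
```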
@@ -77,10 +78,11 @@ def write_instance_to_file(writer, instance, tokenizer, max_seq_length, bucket):
 
         return input_ids, input_mask
 
-    source_sos_ids, source_sos_mask = _convert_ids_and_mask(instance.source_sos_tokens)
-    source_eos_ids, source_eos_mask = _convert_ids_and_mask(instance.source_eos_tokens)
-    target_sos_ids, target_sos_mask = _convert_ids_and_mask(instance.target_sos_tokens)
-    target_eos_ids, target_eos_mask = _convert_ids_and_mask(instance.target_eos_tokens)
+    seq_max_bucket_length = _find_bucket_length(instance.source_sos_tokens, instance.target_sos_tokens)
+    source_sos_ids, source_sos_mask = _convert_ids_and_mask(instance.source_sos_tokens, seq_max_bucket_length)
+    source_eos_ids, source_eos_mask = _convert_ids_and_mask(instance.source_eos_tokens, seq_max_bucket_length)
+    target_sos_ids, target_sos_mask = _convert_ids_and_mask(instance.target_sos_tokens, seq_max_bucket_length)
+    target_eos_ids, target_eos_mask = _convert_ids_and_mask(instance.target_eos_tokens, seq_max_bucket_length)
 
     features = collections.OrderedDict()
     features["source_sos_ids"] = np.asarray(source_sos_ids)
@@ -92,8 +94,7 @@ def write_instance_to_file(writer, instance, tokenizer, max_seq_length, bucket):
     features["target_eos_ids"] = np.asarray(target_eos_ids)
     features["target_eos_mask"] = np.asarray(target_eos_mask)
 
-    writer.write_raw_data([features])
-    return features
+    return features, seq_max_bucket_length
 
 def create_training_instance(source_words, target_words, max_seq_length, clip_to_max_len):
     """Creates `SampleInstance`s for a single sentence pair."""
@@ -131,7 +132,8 @@ def main():
     parser.add_argument("--clip_to_max_len", type=bool, default=False,
                         help='clip sequences to maximum sequence length.')
     parser.add_argument("--max_seq_length", type=int, default=128, help='Maximum sequence length.')
-    parser.add_argument("--bucket", type=list, default=[16, 32, 48, 64, 128], help='bucket sequence length')
+    parser.add_argument("--bucket", type=ast.literal_eval, default=[16, 32, 48, 64, 128],
+                        help='bucket sequence length')
 
     args = parser.parse_args()
 
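Switching `--bucket` from `type=list` to `type=ast.literal_eval` fixes how an explicit command-line value is parsed: `type=list` applies `list()` to the raw string, which yields individual characters, while `ast.literal_eval` evaluates the string as a Python list literal. A quick illustration:

```python
import ast

raw = "[16,32,48,64,128]"
print(list(raw)[:6])          # ['[', '1', '6', ',', '3', '2']  -- characters, not numbers
print(ast.literal_eval(raw))  # [16, 32, 48, 64, 128]
```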
@@ -141,29 +143,21 @@ def main():
     for input_pattern in args.input_file.split(","):
         input_files.append(input_pattern)
 
-    logging.info("*** Reading from input files ***")
+    logging.info("*** Read from input files ***")
     for input_file in input_files:
         logging.info("  %s", input_file)
 
     output_file = args.output_file
-    logging.info("*** Writing to output files ***")
+    logging.info("*** Write to output files ***")
     logging.info("  %s", output_file)
 
-    writer = FileWriter(output_file, args.num_splits)
-    data_schema = {"source_sos_ids": {"type": "int64", "shape": [-1]},
-                   "source_sos_mask": {"type": "int64", "shape": [-1]},
-                   "source_eos_ids": {"type": "int64", "shape": [-1]},
-                   "source_eos_mask": {"type": "int64", "shape": [-1]},
-                   "target_sos_ids": {"type": "int64", "shape": [-1]},
-                   "target_sos_mask": {"type": "int64", "shape": [-1]},
-                   "target_eos_ids": {"type": "int64", "shape": [-1]},
-                   "target_eos_mask": {"type": "int64", "shape": [-1]}
-                   }
-    writer.add_schema(data_schema, "tranformer hisi")
-
     total_written = 0
     total_read = 0
 
+    feature_dict = {}
+    for i in args.bucket:
+        feature_dict[i] = []
+
     for input_file in input_files:
         logging.info("*** Reading from  %s ***", input_file)
         with open(input_file, "r") as reader:
@@ -174,7 +168,7 @@ def main():
 
                 total_read += 1
                 if total_read % 100000 == 0:
-                    logging.info("%d ...", total_read)
+                    logging.info("Read %d ...", total_read)
 
                 source_line, target_line = line.strip().split("\t")
                 source_tokens = tokenizer.tokenize(source_line)
@@ -189,10 +183,13 @@ def main():
                 if instance is None:
                     continue
 
-                features = write_instance_to_file(writer, instance, tokenizer, args.max_seq_length, args.bucket)
-                total_written += 1
+                features, seq_max_bucket_length = get_instance_features(instance, tokenizer, args.max_seq_length,
+                                                                         args.bucket)
+                for key in feature_dict:
+                    if key == seq_max_bucket_length:
+                        feature_dict[key].append(features)
 
-                if total_written <= 20:
+                if total_read <= 10:
                     logging.info("*** Example ***")
                     logging.info("source tokens: %s", " ".join(
                         [tokenization.convert_to_printable(x) for x in instance.source_eos_tokens]))
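After this loop, `feature_dict` maps each bucket boundary to the list of samples whose padded length equals that boundary; the following hunk then writes each list into its own MindRecord output. A toy illustration of the grouping (the sample data here is invented):

```python
# Group per-sample feature dicts by the bucket length they were padded to.
feature_dict = {i: [] for i in [16, 32, 48, 64, 128]}

samples = [({"id": 0}, 32), ({"id": 1}, 128), ({"id": 2}, 32)]  # (features, seq_max_bucket_length)
for features, seq_max_bucket_length in samples:
    feature_dict[seq_max_bucket_length].append(features)

print({k: len(v) for k, v in feature_dict.items()})  # {16: 0, 32: 2, 48: 0, 64: 0, 128: 1}
```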
@@ -203,9 +200,33 @@ def main():
                         feature = features[feature_name]
                         logging.info("%s: %s", feature_name, feature)
 
-    writer.commit()
+    for i in args.bucket:
+        if args.num_splits == 1:
+            output_file_name = output_file
+        else:
+            output_file_name = output_file + '_' + str(i) + '_'
+        writer = FileWriter(output_file_name, args.num_splits)
+        data_schema = {"source_sos_ids": {"type": "int64", "shape": [-1]},
+                       "source_sos_mask": {"type": "int64", "shape": [-1]},
+                       "source_eos_ids": {"type": "int64", "shape": [-1]},
+                       "source_eos_mask": {"type": "int64", "shape": [-1]},
+                       "target_sos_ids": {"type": "int64", "shape": [-1]},
+                       "target_sos_mask": {"type": "int64", "shape": [-1]},
+                       "target_eos_ids": {"type": "int64", "shape": [-1]},
+                       "target_eos_mask": {"type": "int64", "shape": [-1]}
+                       }
+        writer.add_schema(data_schema, "tranformer")
+        features_ = feature_dict[i]
+        logging.info("Bucket length %d has %d samples, start writing...", i, len(features_))
+
+        for item in features_:
+            writer.write_raw_data([item])
+            total_written += 1
+        writer.commit()
+
     logging.info("Wrote %d total instances", total_written)
 
 
 if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
     main()
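Each bucket now gets its own writer, so the output name carries the bucket length when more than one split is requested, and dataset.py (further down in this diff) reopens the files by appending `_<bucket>_00` to `--data_path`. A small sketch of the naming; `num_splits` here is a hypothetical value, since its default is not shown in this diff:

```python
output_file = "/path/ende-l128-mindrecord"
num_splits = 16  # hypothetical; use whatever --num_splits you actually pass

for bucket_len in [16, 32, 48, 64, 128]:
    written_prefix = output_file if num_splits == 1 else output_file + '_' + str(bucket_len) + '_'
    # create_transformer_dataset later derives the first shard name for each bucket like this:
    read_back = output_file + "_" + str(bucket_len) + "_00"
    print(written_prefix, "->", read_back)
```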
@@ -52,11 +52,11 @@ do
         --enable_save_ckpt="true" \
         --enable_lossscale="true" \
         --do_shuffle="true" \
-        --enable_data_sink="false" \
         --checkpoint_path="" \
         --save_checkpoint_steps=2500 \
         --save_checkpoint_num=30 \
-        --data_path=$DATA_PATH > log.txt 2>&1 &
+        --data_path=$DATA_PATH \
+        --bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 &
     cd ../
 done
 cd ..
@@ -37,9 +37,9 @@ python train.py \
     --enable_save_ckpt="true" \
     --enable_lossscale="true" \
     --do_shuffle="true" \
-    --enable_data_sink="false" \
     --checkpoint_path="" \
     --save_checkpoint_steps=2500 \
     --save_checkpoint_num=30 \
-    --data_path=$DATA_PATH > log.txt 2>&1 &
+    --data_path=$DATA_PATH \
+    --bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 &
 cd ..
@@ -19,8 +19,8 @@ import mindspore.dataset as de
 import mindspore.dataset.transforms.c_transforms as deC
 from .config import transformer_net_cfg
 de.config.set_seed(1)
-def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle="true", enable_data_sink="true",
-                               dataset_path=None, bucket_boundaries=None):
+def create_transformer_dataset(epoch_count=1, rank_size=1, rank_id=0, do_shuffle="true", dataset_path=None,
+                               bucket_boundaries=None):
     """create dataset"""
     def batch_per_bucket(bucket_len, dataset_path):
         dataset_path = dataset_path + "_" + str(bucket_len) + "_00"
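Only the first lines of `batch_per_bucket` appear in this hunk; the body of the function is untouched by this commit. For orientation, a heavily simplified sketch of how a bucketed pipeline along these lines can be assembled with the MindSpore dataset API (an illustration using column names taken from the schema above, not the file's actual body):

```python
import mindspore.dataset as de

def bucketed_dataset_sketch(dataset_path, bucket_boundaries, batch_size, do_shuffle="true"):
    """Illustrative only: load one MindRecord file set per bucket and concatenate them."""
    full_ds = None
    for bucket_len in bucket_boundaries:
        per_bucket_path = dataset_path + "_" + str(bucket_len) + "_00"
        ds = de.MindDataset(per_bucket_path,
                            columns_list=["source_eos_ids", "source_eos_mask",
                                          "target_sos_ids", "target_sos_mask",
                                          "target_eos_ids", "target_eos_mask"],
                            shuffle=(do_shuffle == "true"))
        ds = ds.batch(batch_size, drop_remainder=True)
        full_ds = ds if full_ds is None else full_ds + ds  # '+' concatenates datasets
    return full_ds
```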
@@ -16,6 +16,7 @@
 
 import time
 import argparse
+import ast
 
 import mindspore.common.dtype as mstype
 from mindspore.common.tensor import Tensor
@@ -94,8 +95,6 @@ def argparse_init():
                         help="Use lossscale or not, default is true.")
     parser.add_argument("--do_shuffle", type=str, default="true", choices=['true', 'false'],
                         help="Enable shuffle for dataset, default is true.")
-    parser.add_argument("--enable_data_sink", type=str, default="false", choices=['true', 'false'],
-                        help="Enable data sink, default is false.")
     parser.add_argument("--checkpoint_path", type=str, default="", help="Checkpoint file path")
     parser.add_argument("--enable_save_ckpt", type=str, default="true", choices=['true', 'false'],
                         help="Enable save checkpoint, default is true.")
@@ -105,8 +104,8 @@ def argparse_init():
     parser.add_argument("--save_checkpoint_path", type=str, default="./checkpoint/", help="Save checkpoint file path, "
                         "default is ./checkpoint/")
    parser.add_argument("--data_path", type=str, default="", help="Data path, it is better to use absolute path")
-    parser.add_argument("--bucket_boundaries", type=list, default=[16, 32, 48, 64, 128], help="sequence length for "
-                        "different bucket")
+    parser.add_argument("--bucket_boundaries", type=ast.literal_eval, default=[16, 32, 48, 64, 128],
+                        help="sequence length for different bucket")
 
     return parser
 
@@ -131,7 +130,6 @@ def run_transformer_train():
         rank_id = 0
     dataset = create_transformer_dataset(epoch_count=1, rank_size=device_num,
                                          rank_id=rank_id, do_shuffle=args.do_shuffle,
-                                         enable_data_sink=args.enable_data_sink,
                                          dataset_path=args.data_path,
                                          bucket_boundaries=args.bucket_boundaries)
 
@@ -171,13 +169,7 @@ def run_transformer_train():
     netwithgrads.set_train(True)
     model = Model(netwithgrads)
 
-    enable_sink = (args.enable_data_sink == "true")
-    if enable_sink:
-        sink_size = args.save_checkpoint_steps
-        model.train(args.epoch_size*dataset.get_dataset_size()//sink_size, dataset, callbacks=callbacks,
-                    dataset_sink_mode=enable_sink, sink_size=sink_size)
-    else:
-        model.train(args.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=enable_sink)
+    model.train(args.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=False)
 
 if __name__ == '__main__':
     run_transformer_train()