diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py index 33b9028115f..debd2d6417a 100644 --- a/mindspore/dataset/engine/datasets.py +++ b/mindspore/dataset/engine/datasets.py @@ -135,6 +135,7 @@ class Dataset: """ def __init__(self, num_parallel_workers=None): + # Note: children and parent are internal variables, not recommended for external use. self.children = [] self.parent = [] self.num_parallel_workers = num_parallel_workers diff --git a/mindspore/dataset/text/utils.py b/mindspore/dataset/text/utils.py index 153ec484318..1dc3012ce95 100644 --- a/mindspore/dataset/text/utils.py +++ b/mindspore/dataset/text/utils.py @@ -185,7 +185,14 @@ class SentencePieceVocab(cde.SentencePieceVocab): character set. model_type(SentencePieceModel): Choose from unigram (default), bpe, char, or word. The input sentence must be pretokenized when using word type. - params(dict): A dictionary with no incoming parameters. + params(dict): A dictionary with no incoming parameters (the parameters are derived from the SentencePiece + library). + + .. 
code-block :: + { + "input_sentence_size" : 0, + "max_sentencepiece_length" : 16 + } """ return super().from_file(file_path, vocab_size, character_coverage, DE_C_INTER_SENTENCEPIECE_MODE[model_type], params) diff --git a/tests/ut/python/dataset/test_sentencepiece_tokenizer.py b/tests/ut/python/dataset/test_sentencepiece_tokenizer.py index efcb656c105..e78c58e5a33 100644 --- a/tests/ut/python/dataset/test_sentencepiece_tokenizer.py +++ b/tests/ut/python/dataset/test_sentencepiece_tokenizer.py @@ -21,7 +21,7 @@ VOCAB_FILE = "../data/dataset/test_sentencepiece/botchan.txt" DATA_FILE = "../data/dataset/testTokenizerData/sentencepiece_tokenizer.txt" -def test_from_vocab_to_str(): +def test_from_vocab_to_str_UNIGRAM(): vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {}) tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING) dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) @@ -33,6 +33,43 @@ def test_from_vocab_to_str(): assert value == expect[key] +def test_from_vocab_to_str_BPE(): + vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.BPE, {}) + tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING) + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + dataset = dataset.map(operations=tokenizer) + expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'c', 'ope', '.'] + for i in dataset.create_dict_iterator(): + ret = to_str(i["text"]) + for key, value in enumerate(ret): + assert value == expect[key] + + +def test_from_vocab_to_str_CHAR(): + vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.CHAR, {}) + tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING) + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + dataset = dataset.map(operations=tokenizer) + expect = ['▁', 'I', '▁', 's', 'a', 'w', '▁', 'a', '▁', 'g', 'i', 'r', 
'l', '▁', 'w', 'i', 't', 'h',\ + '▁', 'a', '▁', 't', 'e', 'l', 'e', 's', 'c', 'o', 'p', 'e', '.'] + for i in dataset.create_dict_iterator(): + ret = to_str(i["text"]) + for key, value in enumerate(ret): + assert value == expect[key] + + +def test_from_vocab_to_str_WORD(): + vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.WORD, {}) + tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING) + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + dataset = dataset.map(operations=tokenizer) + expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁telescope.'] + for i in dataset.create_dict_iterator(): + ret = to_str(i["text"]) + for key, value in enumerate(ret): + assert value == expect[key] + + def test_from_vocab_to_int(): vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {}) tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.INT) @@ -85,7 +122,10 @@ def test_build_from_dataset(): if __name__ == "__main__": - test_from_vocab_to_str() + test_from_vocab_to_str_UNIGRAM() + test_from_vocab_to_str_BPE() + test_from_vocab_to_str_CHAR() + test_from_vocab_to_str_WORD() test_from_vocab_to_int() test_from_file_to_str() test_from_file_to_int()