forked from OSSInnovation/mindspore
commit
21a5f06e93
|
@ -135,6 +135,7 @@ class Dataset:
|
|||
"""
|
||||
|
||||
def __init__(self, num_parallel_workers=None):
|
||||
# Note: children and parent are internal variables, not recommended for external use.
|
||||
self.children = []
|
||||
self.parent = []
|
||||
self.num_parallel_workers = num_parallel_workers
|
||||
|
|
|
@ -185,7 +185,14 @@ class SentencePieceVocab(cde.SentencePieceVocab):
|
|||
character set.
|
||||
model_type(SentencePieceModel): Choose from unigram (default), bpe, char, or word. The input sentence
|
||||
must be pretokenized when using word type.
|
||||
params(dict): A dictionary with no incoming parameters.
|
||||
params(dict): A dictionary with no incoming parameters (the parameters are derived from SentencePiece
|
||||
library).
|
||||
|
||||
.. code-block ::
|
||||
{
|
||||
"input_sentence_size" : 0,
|
||||
"max_sentencepiece_length" : 16
|
||||
}
|
||||
"""
|
||||
return super().from_file(file_path, vocab_size, character_coverage,
|
||||
DE_C_INTER_SENTENCEPIECE_MODE[model_type], params)
|
||||
|
|
|
@ -21,7 +21,7 @@ VOCAB_FILE = "../data/dataset/test_sentencepiece/botchan.txt"
|
|||
DATA_FILE = "../data/dataset/testTokenizerData/sentencepiece_tokenizer.txt"
|
||||
|
||||
|
||||
def test_from_vocab_to_str():
|
||||
def test_from_vocab_to_str_UNIGRAM():
|
||||
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
|
||||
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
|
@ -33,6 +33,43 @@ def test_from_vocab_to_str():
|
|||
assert value == expect[key]
|
||||
|
||||
|
||||
def test_from_vocab_to_str_BPE():
    """Check string output of SentencePieceTokenizer with a BPE vocab.

    A vocab is built with SentencePieceVocab.from_file from VOCAB_FILE
    (vocab size 5000, character coverage 0.9995, BPE model, empty params),
    then every row of DATA_FILE is tokenized and compared against the
    expected pieces.

    Note: the comparison is index-wise over the tokens actually produced,
    so a row that yields fewer tokens than expected does not fail here.
    """
    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'c', 'ope', '.']

    bpe_vocab = text.SentencePieceVocab.from_file(
        [VOCAB_FILE], 5000, 0.9995, SentencePieceModel.BPE, {})
    to_pieces = text.SentencePieceTokenizer(bpe_vocab, out_type=SPieceTokenizerOutType.STRING)

    # shuffle=False keeps the rows in file order.
    pipeline = ds.TextFileDataset(DATA_FILE, shuffle=False).map(operations=to_pieces)

    for row in pipeline.create_dict_iterator():
        for index, piece in enumerate(to_str(row["text"])):
            assert piece == expect[index]
|
||||
|
||||
|
||||
def test_from_vocab_to_str_CHAR():
    """Check string output of SentencePieceTokenizer with a CHAR vocab.

    A vocab is built with SentencePieceVocab.from_file from VOCAB_FILE
    (vocab size 5000, character coverage 0.9995, CHAR model, empty params),
    then every row of DATA_FILE is tokenized and compared against the
    expected pieces.

    Note: the comparison is index-wise over the tokens actually produced,
    so a row that yields fewer tokens than expected does not fail here.
    """
    expect = [
        '▁', 'I', '▁', 's', 'a', 'w', '▁', 'a', '▁', 'g', 'i', 'r', 'l', '▁', 'w', 'i', 't', 'h',
        '▁', 'a', '▁', 't', 'e', 'l', 'e', 's', 'c', 'o', 'p', 'e', '.',
    ]

    char_vocab = text.SentencePieceVocab.from_file(
        [VOCAB_FILE], 5000, 0.9995, SentencePieceModel.CHAR, {})
    to_pieces = text.SentencePieceTokenizer(char_vocab, out_type=SPieceTokenizerOutType.STRING)

    # shuffle=False keeps the rows in file order.
    pipeline = ds.TextFileDataset(DATA_FILE, shuffle=False).map(operations=to_pieces)

    for row in pipeline.create_dict_iterator():
        for index, piece in enumerate(to_str(row["text"])):
            assert piece == expect[index]
|
||||
|
||||
|
||||
def test_from_vocab_to_str_WORD():
    """Check string output of SentencePieceTokenizer with a WORD vocab.

    A vocab is built with SentencePieceVocab.from_file from VOCAB_FILE
    (vocab size 5000, character coverage 0.9995, WORD model, empty params),
    then every row of DATA_FILE is tokenized and compared against the
    expected pieces.

    Note: the comparison is index-wise over the tokens actually produced,
    so a row that yields fewer tokens than expected does not fail here.
    """
    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁telescope.']

    word_vocab = text.SentencePieceVocab.from_file(
        [VOCAB_FILE], 5000, 0.9995, SentencePieceModel.WORD, {})
    to_pieces = text.SentencePieceTokenizer(word_vocab, out_type=SPieceTokenizerOutType.STRING)

    # shuffle=False keeps the rows in file order.
    pipeline = ds.TextFileDataset(DATA_FILE, shuffle=False).map(operations=to_pieces)

    for row in pipeline.create_dict_iterator():
        for index, piece in enumerate(to_str(row["text"])):
            assert piece == expect[index]
|
||||
|
||||
|
||||
def test_from_vocab_to_int():
|
||||
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
|
||||
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.INT)
|
||||
|
@ -85,7 +122,10 @@ def test_build_from_dataset():
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_from_vocab_to_str()
|
||||
test_from_vocab_to_str_UNIGRAM()
|
||||
test_from_vocab_to_str_BPE()
|
||||
test_from_vocab_to_str_CHAR()
|
||||
test_from_vocab_to_str_WORD()
|
||||
test_from_vocab_to_int()
|
||||
test_from_file_to_str()
|
||||
test_from_file_to_int()
|
||||
|
|
Loading…
Reference in New Issue