Add testcases

This commit is contained in:
shenwei41 2020-07-22 15:50:41 +08:00
parent 875bdc2ebc
commit 4eaa396ca7
3 changed files with 51 additions and 3 deletions

View File

@ -135,6 +135,7 @@ class Dataset:
"""
def __init__(self, num_parallel_workers=None):
# Note: children and parent are internal variables, not recommended for external use.
self.children = []
self.parent = []
self.num_parallel_workers = num_parallel_workers

View File

@ -185,7 +185,14 @@ class SentencePieceVocab(cde.SentencePieceVocab):
character set.
model_type(SentencePieceModel): Choose from unigram (default), bpe, char, or word. The input sentence
must be pretokenized when using word type.
params(dict): A dictionary with no incoming parameters.
params(dict): A dictionary with no incoming parameters (the parameters are derived from the SentencePiece
library).
.. code-block ::
{
"input_sentence_size" : 0,
"max_sentencepiece_length" : 16
}
"""
return super().from_file(file_path, vocab_size, character_coverage,
DE_C_INTER_SENTENCEPIECE_MODE[model_type], params)

View File

@ -21,7 +21,7 @@ VOCAB_FILE = "../data/dataset/test_sentencepiece/botchan.txt"
DATA_FILE = "../data/dataset/testTokenizerData/sentencepiece_tokenizer.txt"
def test_from_vocab_to_str():
def test_from_vocab_to_str_UNIGRAM():
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
@ -33,6 +33,43 @@ def test_from_vocab_to_str():
assert value == expect[key]
def test_from_vocab_to_str_BPE():
    """Check STRING-output tokenization with a BPE SentencePiece model.

    A vocab is built from VOCAB_FILE with the BPE model type, a
    SentencePieceTokenizer using it is mapped over the lines of DATA_FILE,
    and every produced row is compared against the expected pieces.
    """
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.BPE, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'c', 'ope', '.']
    num_rows = 0
    for row in dataset.create_dict_iterator():
        tokens = to_str(row["text"])
        # Check the length first: a positional comparison alone would accept
        # an output that is only a prefix of the expected pieces.
        assert len(tokens) == len(expect)
        for actual, wanted in zip(tokens, expect):
            assert actual == wanted
        num_rows += 1
    # An empty pipeline must not count as a passing test.
    assert num_rows > 0
def test_from_vocab_to_str_CHAR():
    """Check STRING-output tokenization with a CHAR SentencePiece model.

    A vocab is built from VOCAB_FILE with the CHAR model type, a
    SentencePieceTokenizer using it is mapped over the lines of DATA_FILE,
    and every produced row is compared against the expected pieces.
    """
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.CHAR, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    # SentencePiece marks word boundaries with the meta symbol '▁' (U+2581),
    # so in char mode each boundary is its own one-character piece rather
    # than an empty string.
    expect = ['▁', 'I', '▁', 's', 'a', 'w', '▁', 'a', '▁', 'g', 'i', 'r', 'l', '▁', 'w', 'i', 't', 'h',
              '▁', 'a', '▁', 't', 'e', 'l', 'e', 's', 'c', 'o', 'p', 'e', '.']
    num_rows = 0
    for row in dataset.create_dict_iterator():
        tokens = to_str(row["text"])
        # Check the length first: a positional comparison alone would accept
        # an output that is only a prefix of the expected pieces.
        assert len(tokens) == len(expect)
        for actual, wanted in zip(tokens, expect):
            assert actual == wanted
        num_rows += 1
    # An empty pipeline must not count as a passing test.
    assert num_rows > 0
def test_from_vocab_to_str_WORD():
    """Check STRING-output tokenization with a WORD SentencePiece model.

    A vocab is built from VOCAB_FILE with the WORD model type, a
    SentencePieceTokenizer using it is mapped over the lines of DATA_FILE,
    and every produced row is compared against the expected pieces.
    """
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.WORD, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁telescope.']
    num_rows = 0
    for row in dataset.create_dict_iterator():
        tokens = to_str(row["text"])
        # Check the length first: a positional comparison alone would accept
        # an output that is only a prefix of the expected pieces.
        assert len(tokens) == len(expect)
        for actual, wanted in zip(tokens, expect):
            assert actual == wanted
        num_rows += 1
    # An empty pipeline must not count as a passing test.
    assert num_rows > 0
def test_from_vocab_to_int():
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.INT)
@ -85,7 +122,10 @@ def test_build_from_dataset():
if __name__ == "__main__":
test_from_vocab_to_str()
test_from_vocab_to_str_UNIGRAM()
test_from_vocab_to_str_BPE()
test_from_vocab_to_str_CHAR()
test_from_vocab_to_str_WORD()
test_from_vocab_to_int()
test_from_file_to_str()
test_from_file_to_int()