!13533 [MD] fix python tokenizer

From: @luoyang42
Reviewed-by: @heleiwang,@liucunwei
Signed-off-by: @liucunwei
This commit is contained in:
mindspore-ci-bot 2021-03-18 17:45:19 +08:00 committed by Gitee
commit bf29da4bd5
3 changed files with 13 additions and 6 deletions

View File

@@ -533,7 +533,9 @@ class PythonTokenizer:
         self.random = False
     def __call__(self, in_array):
-        if not isinstance(in_array, str):
+        if not isinstance(in_array, np.ndarray):
+            raise TypeError("input should be a NumPy array. Got {}.".format(type(in_array)))
+        if in_array.dtype.type is np.bytes_:
             in_array = to_str(in_array)
         tokens = self.tokenizer(in_array)
         return tokens

View File

@@ -216,7 +216,7 @@ def to_str(array, encoding='utf8'):
     """
     if not isinstance(array, np.ndarray):
-        raise ValueError('input should be a NumPy array.')
+        raise TypeError('input should be a NumPy array.')
     return np.char.decode(array, encoding)

View File

@@ -52,12 +52,17 @@ def test_python_tokenizer():
         if not words:
             return [""]
         return words
-    txt = "Welcome to Beijing !"
-    txt = T.PythonTokenizer(my_tokenizer)(txt)
-    logger.info("Tokenize result: {}".format(txt))
+    txt1 = np.array("Welcome to Beijing !".encode())
+    txt1 = T.PythonTokenizer(my_tokenizer)(txt1)
+    logger.info("Tokenize result: {}".format(txt1))
+    txt2 = np.array("Welcome to Beijing !")
+    txt2 = T.PythonTokenizer(my_tokenizer)(txt2)
+    logger.info("Tokenize result: {}".format(txt2))
     expected = ['Welcome', 'to', 'Beijing', '!']
-    np.testing.assert_equal(txt, expected)
+    np.testing.assert_equal(txt1, expected)
+    np.testing.assert_equal(txt2, expected)
 if __name__ == '__main__':