forked from mindspore-Ecosystem/mindspore
!13533 [MD] fix python tokenizer
From: @luoyang42
Reviewed-by: @heleiwang, @liucunwei
Signed-off-by: @liucunwei
This commit is contained in:
commit
bf29da4bd5
|
@ -533,7 +533,9 @@ class PythonTokenizer:
|
|||
self.random = False
|
||||
|
||||
def __call__(self, in_array):
|
||||
if not isinstance(in_array, str):
|
||||
if not isinstance(in_array, np.ndarray):
|
||||
raise TypeError("input should be a NumPy array. Got {}.".format(type(in_array)))
|
||||
if in_array.dtype.type is np.bytes_:
|
||||
in_array = to_str(in_array)
|
||||
tokens = self.tokenizer(in_array)
|
||||
return tokens
|
||||
|
|
|
@ -216,7 +216,7 @@ def to_str(array, encoding='utf8'):
|
|||
"""
|
||||
|
||||
if not isinstance(array, np.ndarray):
|
||||
raise ValueError('input should be a NumPy array.')
|
||||
raise TypeError('input should be a NumPy array.')
|
||||
|
||||
return np.char.decode(array, encoding)
|
||||
|
||||
|
|
|
@ -52,12 +52,17 @@ def test_python_tokenizer():
|
|||
if not words:
|
||||
return [""]
|
||||
return words
|
||||
txt = "Welcome to Beijing !"
|
||||
txt = T.PythonTokenizer(my_tokenizer)(txt)
|
||||
logger.info("Tokenize result: {}".format(txt))
|
||||
txt1 = np.array("Welcome to Beijing !".encode())
|
||||
txt1 = T.PythonTokenizer(my_tokenizer)(txt1)
|
||||
logger.info("Tokenize result: {}".format(txt1))
|
||||
|
||||
txt2 = np.array("Welcome to Beijing !")
|
||||
txt2 = T.PythonTokenizer(my_tokenizer)(txt2)
|
||||
logger.info("Tokenize result: {}".format(txt2))
|
||||
|
||||
expected = ['Welcome', 'to', 'Beijing', '!']
|
||||
np.testing.assert_equal(txt, expected)
|
||||
np.testing.assert_equal(txt1, expected)
|
||||
np.testing.assert_equal(txt2, expected)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
Loading…
Reference in New Issue