!13533 [MD] fix python tokenizer

From: @luoyang42 Reviewed-by: @heleiwang,@liucunwei Signed-off-by: @liucunwei
2021-03-18 17:45:19 +08:00 · 2021-03-18 17:45:19 +08:00 · bf29da4bd5
parent 7cd339368d f99204b292
commit bf29da4bd5
3 changed files with 13 additions and 6 deletions
--- a/mindspore/dataset/text/transforms.py
+++ b/mindspore/dataset/text/transforms.py
@@ -533,7 +533,9 @@ class PythonTokenizer:
        self.random = False

    def __call__(self, in_array):
-        if not isinstance(in_array, str):
+        if not isinstance(in_array, np.ndarray):
+            raise TypeError("input should be a NumPy array. Got {}.".format(type(in_array)))
+        if in_array.dtype.type is np.bytes_:
            in_array = to_str(in_array)
        tokens = self.tokenizer(in_array)
        return tokens
--- a/mindspore/dataset/text/utils.py
+++ b/mindspore/dataset/text/utils.py
@@ -216,7 +216,7 @@ def to_str(array, encoding='utf8'):
    """

    if not isinstance(array, np.ndarray):
-        raise ValueError('input should be a NumPy array.')
+        raise TypeError('input should be a NumPy array.')

    return np.char.decode(array, encoding)

--- a/tests/ut/python/dataset/test_eager_text.py
+++ b/tests/ut/python/dataset/test_eager_text.py
@@ -52,12 +52,17 @@ def test_python_tokenizer():
        if not words:
            return [""]
        return words
-    txt = "Welcome to Beijing !"
-    txt = T.PythonTokenizer(my_tokenizer)(txt)
-    logger.info("Tokenize result: {}".format(txt))
+    txt1 = np.array("Welcome to Beijing !".encode())
+    txt1 = T.PythonTokenizer(my_tokenizer)(txt1)
+    logger.info("Tokenize result: {}".format(txt1))
+
+    txt2 = np.array("Welcome to Beijing !")
+    txt2 = T.PythonTokenizer(my_tokenizer)(txt2)
+    logger.info("Tokenize result: {}".format(txt2))

    expected = ['Welcome', 'to', 'Beijing', '!']
-    np.testing.assert_equal(txt, expected)
+    np.testing.assert_equal(txt1, expected)
+    np.testing.assert_equal(txt2, expected)


 if __name__ == '__main__':