[MD] decode numpy strings in cpp for eager mode

This commit is contained in:
mhmotallebi 2022-06-15 16:18:06 -04:00
parent 20e4ef2863
commit 96e21f6ed4
9 changed files with 75 additions and 17 deletions

View File

@ -377,7 +377,7 @@ Status PyExecute::operator()(const std::shared_ptr<Tensor> &input_tensor, std::s
de_tensor_list = std::move(de_output_list);
}
CHECK_FAIL_RETURN_UNEXPECTED(de_tensor_list.size() > 0,
"[internal] transformation resulted in a tensor with size=0!");
"[Internal] Transformation resulted in a tensor with size=0!");
*out = std::move(de_tensor_list.getRow())[0];
} else {
std::string err_msg = "Your input device is not supported. (Option: CPU)";

View File

@ -93,11 +93,23 @@ PYBIND_REGISTER(Tensor, 0, ([](const py::module *m) {
.def("__str__", &Tensor::ToString)
.def("shape", &Tensor::shape)
.def("type", &Tensor::type)
.def("as_array", [](py::object &t) {
.def("as_array",
[](py::object &t) {
auto &tensor = py::cast<Tensor &>(t);
if (tensor.type() == DataType::DE_STRING) {
py::array res;
THROW_IF_ERROR(tensor.GetDataAsNumpyStrings(&res));
return res;
}
py::buffer_info info;
THROW_IF_ERROR(Tensor::GetBufferInfo(&tensor, &info));
return py::array(pybind11::dtype(info), info.shape, info.strides, info.ptr, t);
})
.def("as_decoded_array", [](py::object &t) {
auto &tensor = py::cast<Tensor &>(t);
if (tensor.type() == DataType::DE_STRING) {
py::array res;
THROW_IF_ERROR(tensor.GetDataAsNumpyStrings(&res));
THROW_IF_ERROR(tensor.GetDataAsNumpyUnicodeStrings(&res));
return res;
}
py::buffer_info info;

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -866,6 +866,17 @@ Status Tensor::GetDataAsNumpyStrings(py::array *data) {
data_allocator_->deallocate(reinterpret_cast<uchar *>(tmp_data));
return Status::OK();
}
Status Tensor::GetDataAsNumpyUnicodeStrings(py::array *data) {
RETURN_UNEXPECTED_IF_NULL(data);
std::vector<std::string_view> string_pointers;
string_pointers.reserve(Size());
// Iterate over tensor and create a vector of string_views of strings in the tensor.
(void)std::transform(begin<std::string_view>(), end<std::string_view>(), std::back_inserter(string_pointers),
[](const auto &element) { return element; });
*data = py::array(py::cast(string_pointers));
data->resize(shape_.AsVector());
return Status::OK();
}
#endif
void Tensor::Squeeze() { shape_ = shape_.Squeeze(); }

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -441,12 +441,20 @@ class Tensor {
#ifdef ENABLE_PYTHON
/// Constructs numpy array from input tensor
/// \param[in] data this data is the location of python data
/// \param[out] data this data is the location of python data
/// \return Status code
Status GetDataAsNumpy(py::array *data);
/// Constructs numpy array of strings
/// \param[out] data this data is the location of python data
/// \return Status code
Status GetDataAsNumpyStrings(py::array *data);
/// Constructs numpy array of strings which are already decoded ('U')
/// \param[out] data this data is the location of python data
/// \return Status code
Status GetDataAsNumpyUnicodeStrings(py::array *data);
static Status GetBufferInfo(Tensor *t, py::buffer_info *out);
#endif

View File

@ -183,9 +183,7 @@ class Iterator:
return self._transform_md_to_tensor(t)
def _transform_md_to_tensor(self, t):
array = t.as_array()
if array.dtype.type is np.bytes_:
array = np.char.decode(array)
array = t.as_decoded_array()
if self._do_copy:
return Tensor(array)
return Tensor.from_numpy(array)

View File

@ -65,13 +65,8 @@ class TensorOperation:
if not hasattr(self, 'callable_op_') or self.callable_op_ is None:
self.callable_op_ = cde.Execute(self.parse())
output_tensor_list = self.callable_op_(tensor_row)
for i, element in enumerate(output_tensor_list):
arr = element.as_array()
if arr.dtype.char == 'S':
output_tensor_list[i] = np.char.decode(arr)
else:
output_tensor_list[i] = arr
return output_tensor_list[0] if len(output_tensor_list) == 1 else tuple(output_tensor_list)
output_numpy_list = [x.as_decoded_array() for x in output_tensor_list]
return output_numpy_list[0] if len(output_numpy_list) == 1 else tuple(output_numpy_list)
@staticmethod
def parse():

View File

@ -20,7 +20,6 @@ import mindspore.common.dtype as mstype
import mindspore.dataset.vision as C
import mindspore.dataset.transforms as C2
DATA_DIR = "../data/dataset/testPK/data"
BATCH_SIZE = 2
@ -49,6 +48,30 @@ def test_offload():
break
def test_offload_string():
    """
    Feature: Test map offload flag with string tensors.
    Description: Input is text dataset.
    Expectation: Output should be same with activated or deactivated offload (incl. decoded text).
    """
    # Build two identical text pipelines: one with map offload enabled,
    # one without, so their outputs can be compared element-wise.
    offload_dataset = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False)
    baseline_dataset = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False)

    # Use Data Transforms PadEnd op in operations list for Map
    padend_op = C2.PadEnd([100], pad_value='<pad>')
    offload_dataset = offload_dataset.map(operations=[padend_op], input_columns=["text"], offload=True)
    baseline_dataset = baseline_dataset.map(operations=[padend_op], input_columns=["text"])

    offload_itr = offload_dataset.create_dict_iterator(num_epochs=1, output_numpy=True)
    baseline_itr = baseline_dataset.create_dict_iterator(num_epochs=1, output_numpy=True)
    for row_offload, row_baseline in zip(offload_itr, baseline_itr):
        np.testing.assert_array_equal(row_offload['text'], row_baseline['text'])
def test_auto_offload():
"""
Feature: Test auto_offload config option.
@ -104,6 +127,7 @@ def test_offload_multi_column():
Description: Input is an image dataset, copy the image column and apply map operations to both images.
Expectation: Output should be same with both offload activated and deactivated.
"""
def copy_column(x, y):
return x, x, y
@ -141,6 +165,7 @@ def test_offload_column_mapping():
Description: Input is an image dataset, copy the image column, then apply offload to only copied column.
Expectation: The offload model dataset column index value is 1 (second column).
"""
def copy_column(x, y):
return x, x, y
@ -417,6 +442,7 @@ def test_offload_with_dict_itr():
if __name__ == "__main__":
test_offload()
test_offload_string()
test_auto_offload()
test_offload_column_validation()
test_offload_column_mapping()

View File

@ -45,11 +45,15 @@ def test_basic():
assert n.type() == cde.DataType("int64")
arr2 = n.as_array()
# decoding only impacts string arrays
arr3 = n.as_decoded_array()
arr[0] = 2
x = np.array([2, 2, 3, 4, 5])
np.testing.assert_array_equal(x, arr2)
np.testing.assert_array_equal(x, arr3)
assert n.type() == cde.DataType("int64")
assert arr.__array_interface__['data'] == arr2.__array_interface__['data']
assert arr.__array_interface__['data'] == arr3.__array_interface__['data']
def test_strides():

View File

@ -31,6 +31,10 @@ def test_basic():
arr = n.as_array()
np.testing.assert_array_equal(x, arr)
arr2 = n.as_decoded_array()
np.testing.assert_array_equal(x, np.char.encode(arr2))
np.testing.assert_array_equal(np.char.decode(x), arr2)
def compare(strings, dtype='S'):
arr = np.array(strings, dtype=dtype)