diff --git a/mindspore/ccsrc/minddata/dataset/api/execute.cc b/mindspore/ccsrc/minddata/dataset/api/execute.cc index cfef37d46dc..694da158033 100644 --- a/mindspore/ccsrc/minddata/dataset/api/execute.cc +++ b/mindspore/ccsrc/minddata/dataset/api/execute.cc @@ -377,7 +377,7 @@ Status PyExecute::operator()(const std::shared_ptr &input_tensor, std::s de_tensor_list = std::move(de_output_list); } CHECK_FAIL_RETURN_UNEXPECTED(de_tensor_list.size() > 0, - "[internal] transformation resulted in a tensor with size=0!"); + "[Internal] Transformation resulted in a tensor with size=0!"); *out = std::move(de_tensor_list.getRow())[0]; } else { std::string err_msg = "Your input device is not supported. (Option: CPU)"; diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/core/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/core/bindings.cc index cc4a1ed1f42..b072859e30f 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/core/bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/core/bindings.cc @@ -93,11 +93,23 @@ PYBIND_REGISTER(Tensor, 0, ([](const py::module *m) { .def("__str__", &Tensor::ToString) .def("shape", &Tensor::shape) .def("type", &Tensor::type) - .def("as_array", [](py::object &t) { + .def("as_array", + [](py::object &t) { + auto &tensor = py::cast(t); + if (tensor.type() == DataType::DE_STRING) { + py::array res; + THROW_IF_ERROR(tensor.GetDataAsNumpyStrings(&res)); + return res; + } + py::buffer_info info; + THROW_IF_ERROR(Tensor::GetBufferInfo(&tensor, &info)); + return py::array(pybind11::dtype(info), info.shape, info.strides, info.ptr, t); + }) + .def("as_decoded_array", [](py::object &t) { auto &tensor = py::cast(t); if (tensor.type() == DataType::DE_STRING) { py::array res; - THROW_IF_ERROR(tensor.GetDataAsNumpyStrings(&res)); + THROW_IF_ERROR(tensor.GetDataAsNumpyUnicodeStrings(&res)); return res; } py::buffer_info info; diff --git a/mindspore/ccsrc/minddata/dataset/core/tensor.cc b/mindspore/ccsrc/minddata/dataset/core/tensor.cc index 27baf0303bc..179b05ae719 100644 --- a/mindspore/ccsrc/minddata/dataset/core/tensor.cc +++ b/mindspore/ccsrc/minddata/dataset/core/tensor.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2019-2022 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -866,6 +866,17 @@ Status Tensor::GetDataAsNumpyStrings(py::array *data) { data_allocator_->deallocate(reinterpret_cast(tmp_data)); return Status::OK(); } +Status Tensor::GetDataAsNumpyUnicodeStrings(py::array *data) { + RETURN_UNEXPECTED_IF_NULL(data); + std::vector string_pointers; + string_pointers.reserve(Size()); + // Iterate over tensor and create a vector of string_views of strings in the tensor. + (void)std::transform(begin(), end(), std::back_inserter(string_pointers), + [](const auto &element) { return element; }); + *data = py::array(py::cast(string_pointers)); + data->resize(shape_.AsVector()); + return Status::OK(); +} #endif void Tensor::Squeeze() { shape_ = shape_.Squeeze(); } diff --git a/mindspore/ccsrc/minddata/dataset/core/tensor.h b/mindspore/ccsrc/minddata/dataset/core/tensor.h index a46f27a2da3..39dc1c709bf 100644 --- a/mindspore/ccsrc/minddata/dataset/core/tensor.h +++ b/mindspore/ccsrc/minddata/dataset/core/tensor.h @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2019-2022 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -441,12 +441,20 @@ class Tensor { #ifdef ENABLE_PYTHON /// Constructs numpy array from input tensor - /// \param[in] data this data is the location of python data + /// \param[out] data this data is the location of python data /// \return Status code Status GetDataAsNumpy(py::array *data); + /// Constructs numpy array of strings + /// \param[out] data this data is the location of python data + /// \return Status code Status GetDataAsNumpyStrings(py::array *data); + /// Constructs numpy array of strings which are already decoded ('U') + /// \param[out] data this data is the location of python data + /// \return Status code + Status GetDataAsNumpyUnicodeStrings(py::array *data); + static Status GetBufferInfo(Tensor *t, py::buffer_info *out); #endif diff --git a/mindspore/python/mindspore/dataset/engine/iterators.py b/mindspore/python/mindspore/dataset/engine/iterators.py index a81d93ff1b7..ac2a47c159e 100644 --- a/mindspore/python/mindspore/dataset/engine/iterators.py +++ b/mindspore/python/mindspore/dataset/engine/iterators.py @@ -183,9 +183,7 @@ class Iterator: return self._transform_md_to_tensor(t) def _transform_md_to_tensor(self, t): - array = t.as_array() - if array.dtype.type is np.bytes_: - array = np.char.decode(array) + array = t.as_decoded_array() if self._do_copy: return Tensor(array) return Tensor.from_numpy(array) diff --git a/mindspore/python/mindspore/dataset/transforms/transforms.py b/mindspore/python/mindspore/dataset/transforms/transforms.py index 6ea2c3bc52b..eb2ba5457dc 100644 --- a/mindspore/python/mindspore/dataset/transforms/transforms.py +++ b/mindspore/python/mindspore/dataset/transforms/transforms.py @@ -65,13 +65,8 @@ class TensorOperation: if not hasattr(self, 'callable_op_') or self.callable_op_ is None: self.callable_op_ = cde.Execute(self.parse()) output_tensor_list = self.callable_op_(tensor_row) - for i, element in enumerate(output_tensor_list): - arr = element.as_array() - if arr.dtype.char == 'S': - output_tensor_list[i] = np.char.decode(arr) - else: - output_tensor_list[i] = arr - return output_tensor_list[0] if len(output_tensor_list) == 1 else tuple(output_tensor_list) + output_numpy_list = [x.as_decoded_array() for x in output_tensor_list] + return output_numpy_list[0] if len(output_numpy_list) == 1 else tuple(output_numpy_list) @staticmethod def parse(): diff --git a/tests/ut/python/dataset/test_map_offload.py b/tests/ut/python/dataset/test_map_offload.py index ff20ef5e237..e0dd9034f90 100644 --- a/tests/ut/python/dataset/test_map_offload.py +++ b/tests/ut/python/dataset/test_map_offload.py @@ -20,7 +20,6 @@ import mindspore.common.dtype as mstype import mindspore.dataset.vision as C import mindspore.dataset.transforms as C2 - DATA_DIR = "../data/dataset/testPK/data" BATCH_SIZE = 2 @@ -49,6 +48,30 @@ def test_offload(): break +def test_offload_string(): + """ + Feature: Test map offload flag with string tensors. + Description: Input is text dataset. + Expectation: Output should be same with activated or deactivated offload (incl. decoded text). + """ + + # Dataset with offload activated. + data0 = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False) + + # Dataset with offload not activated. + data1 = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False) + + # Use Data Transforms PadEnd op in operations list for Map + padend_op = C2.PadEnd([100], pad_value='') + + data0 = data0.map(operations=[padend_op], input_columns=["text"], offload=True) + data1 = data1.map(operations=[padend_op], input_columns=["text"]) + + for d0, d1 in zip(data0.create_dict_iterator(num_epochs=1, output_numpy=True), + data1.create_dict_iterator(num_epochs=1, output_numpy=True)): + np.testing.assert_array_equal(d0['text'], (d1['text'])) + + def test_auto_offload(): """ Feature: Test auto_offload config option. @@ -104,6 +127,7 @@ def test_offload_multi_column(): Description: Input is an image dataset, copy the image column and apply map operations to both images. Expectation: Output should be same with both offload activated and deactivated. """ + def copy_column(x, y): return x, x, y @@ -141,6 +165,7 @@ def test_offload_column_mapping(): Description: Input is an image dataset, copy the image column, then apply offload to only copied column. Expectation: The offload model dataset column index value is 1 (second column). """ + def copy_column(x, y): return x, x, y @@ -417,6 +442,7 @@ def test_offload_with_dict_itr(): if __name__ == "__main__": test_offload() + test_offload_string() test_auto_offload() test_offload_column_validation() test_offload_column_mapping() diff --git a/tests/ut/python/dataset/test_tensor.py b/tests/ut/python/dataset/test_tensor.py index e0632795e9e..eaffa920795 100644 --- a/tests/ut/python/dataset/test_tensor.py +++ b/tests/ut/python/dataset/test_tensor.py @@ -45,11 +45,15 @@ def test_basic(): assert n.type() == cde.DataType("int64") arr2 = n.as_array() + # decoding only impacts string arrays + arr3 = n.as_decoded_array() arr[0] = 2 x = np.array([2, 2, 3, 4, 5]) np.testing.assert_array_equal(x, arr2) + np.testing.assert_array_equal(x, arr3) assert n.type() == cde.DataType("int64") assert arr.__array_interface__['data'] == arr2.__array_interface__['data'] + assert arr.__array_interface__['data'] == arr3.__array_interface__['data'] def test_strides(): diff --git a/tests/ut/python/dataset/test_tensor_string.py b/tests/ut/python/dataset/test_tensor_string.py index 69b2dbddb3d..f6f3da03418 100644 --- a/tests/ut/python/dataset/test_tensor_string.py +++ b/tests/ut/python/dataset/test_tensor_string.py @@ -31,6 +31,10 @@ def test_basic(): arr = n.as_array() np.testing.assert_array_equal(x, arr) + arr2 = n.as_decoded_array() + np.testing.assert_array_equal(x, np.char.encode(arr2)) + np.testing.assert_array_equal(np.char.decode(x), arr2) + def compare(strings, dtype='S'): arr = np.array(strings, dtype=dtype)