[MD] decode numpy strings in cpp for eager mode

This commit is contained in:
mhmotallebi 2022-06-15 16:18:06 -04:00
parent 20e4ef2863
commit 96e21f6ed4
9 changed files with 75 additions and 17 deletions

View File

@ -377,7 +377,7 @@ Status PyExecute::operator()(const std::shared_ptr<Tensor> &input_tensor, std::s
de_tensor_list = std::move(de_output_list);
}
CHECK_FAIL_RETURN_UNEXPECTED(de_tensor_list.size() > 0,
"[internal] transformation resulted in a tensor with size=0!");
"[Internal] Transformation resulted in a tensor with size=0!");
*out = std::move(de_tensor_list.getRow())[0];
} else {
std::string err_msg = "Your input device is not supported. (Option: CPU)";

View File

@ -93,11 +93,23 @@ PYBIND_REGISTER(Tensor, 0, ([](const py::module *m) {
.def("__str__", &Tensor::ToString)
.def("shape", &Tensor::shape)
.def("type", &Tensor::type)
.def("as_array", [](py::object &t) {
.def("as_array",
[](py::object &t) {
auto &tensor = py::cast<Tensor &>(t);
if (tensor.type() == DataType::DE_STRING) {
py::array res;
THROW_IF_ERROR(tensor.GetDataAsNumpyStrings(&res));
return res;
}
py::buffer_info info;
THROW_IF_ERROR(Tensor::GetBufferInfo(&tensor, &info));
return py::array(pybind11::dtype(info), info.shape, info.strides, info.ptr, t);
})
.def("as_decoded_array", [](py::object &t) {
auto &tensor = py::cast<Tensor &>(t);
if (tensor.type() == DataType::DE_STRING) {
py::array res;
THROW_IF_ERROR(tensor.GetDataAsNumpyStrings(&res));
THROW_IF_ERROR(tensor.GetDataAsNumpyUnicodeStrings(&res));
return res;
}
py::buffer_info info;

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -866,6 +866,17 @@ Status Tensor::GetDataAsNumpyStrings(py::array *data) {
data_allocator_->deallocate(reinterpret_cast<uchar *>(tmp_data));
return Status::OK();
}
Status Tensor::GetDataAsNumpyUnicodeStrings(py::array *data) {
RETURN_UNEXPECTED_IF_NULL(data);
std::vector<std::string_view> string_pointers;
string_pointers.reserve(Size());
// Iterate over tensor and create a vector of string_views of strings in the tensor.
(void)std::transform(begin<std::string_view>(), end<std::string_view>(), std::back_inserter(string_pointers),
[](const auto &element) { return element; });
*data = py::array(py::cast(string_pointers));
data->resize(shape_.AsVector());
return Status::OK();
}
#endif
void Tensor::Squeeze() { shape_ = shape_.Squeeze(); }

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -441,12 +441,20 @@ class Tensor {
#ifdef ENABLE_PYTHON
/// Constructs numpy array from input tensor
/// \param[in] data this data is the location of python data
/// \param[out] data this data is the location of python data
/// \return Status code
Status GetDataAsNumpy(py::array *data);
/// Constructs numpy array of strings
/// \param[out] data this data is the location of python data
/// \return Status code
Status GetDataAsNumpyStrings(py::array *data);
/// Constructs numpy array of strings which are already decoded ('U')
/// \param[out] data this data is the location of python data
/// \return Status code
Status GetDataAsNumpyUnicodeStrings(py::array *data);
static Status GetBufferInfo(Tensor *t, py::buffer_info *out);
#endif

View File

@ -183,9 +183,7 @@ class Iterator:
return self._transform_md_to_tensor(t)
def _transform_md_to_tensor(self, t):
array = t.as_array()
if array.dtype.type is np.bytes_:
array = np.char.decode(array)
array = t.as_decoded_array()
if self._do_copy:
return Tensor(array)
return Tensor.from_numpy(array)

View File

@ -65,13 +65,8 @@ class TensorOperation:
if not hasattr(self, 'callable_op_') or self.callable_op_ is None:
self.callable_op_ = cde.Execute(self.parse())
output_tensor_list = self.callable_op_(tensor_row)
for i, element in enumerate(output_tensor_list):
arr = element.as_array()
if arr.dtype.char == 'S':
output_tensor_list[i] = np.char.decode(arr)
else:
output_tensor_list[i] = arr
return output_tensor_list[0] if len(output_tensor_list) == 1 else tuple(output_tensor_list)
output_numpy_list = [x.as_decoded_array() for x in output_tensor_list]
return output_numpy_list[0] if len(output_numpy_list) == 1 else tuple(output_numpy_list)
@staticmethod
def parse():

View File

@ -20,7 +20,6 @@ import mindspore.common.dtype as mstype
import mindspore.dataset.vision as C
import mindspore.dataset.transforms as C2
DATA_DIR = "../data/dataset/testPK/data"
BATCH_SIZE = 2
@ -49,6 +48,30 @@ def test_offload():
break
def test_offload_string():
    """
    Feature: Test map offload flag with string tensors.
    Description: Input is text dataset.
    Expectation: Output should be same with activated or deactivated offload (incl. decoded text).
    """
    # Build two identical text pipelines: one with map offload enabled,
    # one without, so their outputs can be compared element-wise.
    offload_dataset = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False)
    baseline_dataset = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False)

    # Use Data Transforms PadEnd op in operations list for Map
    padend_op = C2.PadEnd([100], pad_value='<pad>')
    offload_dataset = offload_dataset.map(operations=[padend_op], input_columns=["text"], offload=True)
    baseline_dataset = baseline_dataset.map(operations=[padend_op], input_columns=["text"])

    offload_itr = offload_dataset.create_dict_iterator(num_epochs=1, output_numpy=True)
    baseline_itr = baseline_dataset.create_dict_iterator(num_epochs=1, output_numpy=True)
    for row_offload, row_baseline in zip(offload_itr, baseline_itr):
        np.testing.assert_array_equal(row_offload['text'], row_baseline['text'])
def test_auto_offload():
"""
Feature: Test auto_offload config option.
@ -104,6 +127,7 @@ def test_offload_multi_column():
Description: Input is an image dataset, copy the image column and apply map operations to both images.
Expectation: Output should be same with both offload activated and deactivated.
"""
def copy_column(x, y):
return x, x, y
@ -141,6 +165,7 @@ def test_offload_column_mapping():
Description: Input is an image dataset, copy the image column, then apply offload to only copied column.
Expectation: The offload model dataset column index value is 1 (second column).
"""
def copy_column(x, y):
return x, x, y
@ -417,6 +442,7 @@ def test_offload_with_dict_itr():
if __name__ == "__main__":
test_offload()
test_offload_string()
test_auto_offload()
test_offload_column_validation()
test_offload_column_mapping()

View File

@ -45,11 +45,15 @@ def test_basic():
assert n.type() == cde.DataType("int64")
arr2 = n.as_array()
# decoding only impacts string arrays
arr3 = n.as_decoded_array()
arr[0] = 2
x = np.array([2, 2, 3, 4, 5])
np.testing.assert_array_equal(x, arr2)
np.testing.assert_array_equal(x, arr3)
assert n.type() == cde.DataType("int64")
assert arr.__array_interface__['data'] == arr2.__array_interface__['data']
assert arr.__array_interface__['data'] == arr3.__array_interface__['data']
def test_strides():

View File

@ -31,6 +31,10 @@ def test_basic():
arr = n.as_array()
np.testing.assert_array_equal(x, arr)
arr2 = n.as_decoded_array()
np.testing.assert_array_equal(x, np.char.encode(arr2))
np.testing.assert_array_equal(np.char.decode(x), arr2)
def compare(strings, dtype='S'):
arr = np.array(strings, dtype=dtype)