[MD] decode numpy strings in cpp for eager mode
This commit is contained in:
parent
20e4ef2863
commit
96e21f6ed4
|
@ -377,7 +377,7 @@ Status PyExecute::operator()(const std::shared_ptr<Tensor> &input_tensor, std::s
|
|||
de_tensor_list = std::move(de_output_list);
|
||||
}
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(de_tensor_list.size() > 0,
|
||||
"[internal] transformation resulted in a tensor with size=0!");
|
||||
"[Internal] Transformation resulted in a tensor with size=0!");
|
||||
*out = std::move(de_tensor_list.getRow())[0];
|
||||
} else {
|
||||
std::string err_msg = "Your input device is not supported. (Option: CPU)";
|
||||
|
|
|
@ -93,7 +93,8 @@ PYBIND_REGISTER(Tensor, 0, ([](const py::module *m) {
|
|||
.def("__str__", &Tensor::ToString)
|
||||
.def("shape", &Tensor::shape)
|
||||
.def("type", &Tensor::type)
|
||||
.def("as_array", [](py::object &t) {
|
||||
.def("as_array",
|
||||
[](py::object &t) {
|
||||
auto &tensor = py::cast<Tensor &>(t);
|
||||
if (tensor.type() == DataType::DE_STRING) {
|
||||
py::array res;
|
||||
|
@ -103,6 +104,17 @@ PYBIND_REGISTER(Tensor, 0, ([](const py::module *m) {
|
|||
py::buffer_info info;
|
||||
THROW_IF_ERROR(Tensor::GetBufferInfo(&tensor, &info));
|
||||
return py::array(pybind11::dtype(info), info.shape, info.strides, info.ptr, t);
|
||||
})
|
||||
.def("as_decoded_array", [](py::object &t) {
|
||||
auto &tensor = py::cast<Tensor &>(t);
|
||||
if (tensor.type() == DataType::DE_STRING) {
|
||||
py::array res;
|
||||
THROW_IF_ERROR(tensor.GetDataAsNumpyUnicodeStrings(&res));
|
||||
return res;
|
||||
}
|
||||
py::buffer_info info;
|
||||
THROW_IF_ERROR(Tensor::GetBufferInfo(&tensor, &info));
|
||||
return py::array(pybind11::dtype(info), info.shape, info.strides, info.ptr, t);
|
||||
});
|
||||
}));
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
* Copyright 2019-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -866,6 +866,17 @@ Status Tensor::GetDataAsNumpyStrings(py::array *data) {
|
|||
data_allocator_->deallocate(reinterpret_cast<uchar *>(tmp_data));
|
||||
return Status::OK();
|
||||
}
|
||||
/// Constructs a numpy array of already-decoded ('U') strings from this tensor's string elements.
/// \param[out] data Receives the numpy array, resized to the vector returned by shape_.AsVector().
///   Must not be null (guarded by RETURN_UNEXPECTED_IF_NULL).
/// \return Status::OK() once the array has been built and resized.
Status Tensor::GetDataAsNumpyUnicodeStrings(py::array *data) {
  RETURN_UNEXPECTED_IF_NULL(data);

  // Gather one non-owning string_view per tensor element, in iteration order.
  std::vector<std::string_view> element_views;
  element_views.reserve(Size());
  for (auto it = begin<std::string_view>(); it != end<std::string_view>(); ++it) {
    element_views.push_back(*it);
  }

  // Hand the views to Python first, then give the resulting array the tensor's shape.
  *data = py::array(py::cast(element_views));
  data->resize(shape_.AsVector());
  return Status::OK();
}
|
||||
#endif
|
||||
|
||||
/// Squeezes the tensor in place: shape_ is overwritten with the value returned by shape_.Squeeze().
void Tensor::Squeeze() {
  shape_ = shape_.Squeeze();
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
* Copyright 2019-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -441,12 +441,20 @@ class Tensor {
|
|||
|
||||
#ifdef ENABLE_PYTHON
|
||||
/// Constructs numpy array from input tensor
|
||||
/// \param[in] data this data is the location of python data
|
||||
/// \param[out] data this data is the location of python data
|
||||
/// \return Status code
|
||||
Status GetDataAsNumpy(py::array *data);
|
||||
|
||||
/// Constructs numpy array of strings
|
||||
/// \param[out] data this data is the location of python data
|
||||
/// \return Status code
|
||||
Status GetDataAsNumpyStrings(py::array *data);
|
||||
|
||||
/// Constructs numpy array of strings which are already decoded ('U')
|
||||
/// \param[out] data this data is the location of python data
|
||||
/// \return Status code
|
||||
Status GetDataAsNumpyUnicodeStrings(py::array *data);
|
||||
|
||||
static Status GetBufferInfo(Tensor *t, py::buffer_info *out);
|
||||
#endif
|
||||
|
||||
|
|
|
@ -183,9 +183,7 @@ class Iterator:
|
|||
return self._transform_md_to_tensor(t)
|
||||
|
||||
def _transform_md_to_tensor(self, t):
|
||||
array = t.as_array()
|
||||
if array.dtype.type is np.bytes_:
|
||||
array = np.char.decode(array)
|
||||
array = t.as_decoded_array()
|
||||
if self._do_copy:
|
||||
return Tensor(array)
|
||||
return Tensor.from_numpy(array)
|
||||
|
|
|
@ -65,13 +65,8 @@ class TensorOperation:
|
|||
if not hasattr(self, 'callable_op_') or self.callable_op_ is None:
|
||||
self.callable_op_ = cde.Execute(self.parse())
|
||||
output_tensor_list = self.callable_op_(tensor_row)
|
||||
for i, element in enumerate(output_tensor_list):
|
||||
arr = element.as_array()
|
||||
if arr.dtype.char == 'S':
|
||||
output_tensor_list[i] = np.char.decode(arr)
|
||||
else:
|
||||
output_tensor_list[i] = arr
|
||||
return output_tensor_list[0] if len(output_tensor_list) == 1 else tuple(output_tensor_list)
|
||||
output_numpy_list = [x.as_decoded_array() for x in output_tensor_list]
|
||||
return output_numpy_list[0] if len(output_numpy_list) == 1 else tuple(output_numpy_list)
|
||||
|
||||
@staticmethod
|
||||
def parse():
|
||||
|
|
|
@ -20,7 +20,6 @@ import mindspore.common.dtype as mstype
|
|||
import mindspore.dataset.vision as C
|
||||
import mindspore.dataset.transforms as C2
|
||||
|
||||
|
||||
DATA_DIR = "../data/dataset/testPK/data"
|
||||
BATCH_SIZE = 2
|
||||
|
||||
|
@ -49,6 +48,30 @@ def test_offload():
|
|||
break
|
||||
|
||||
|
||||
def test_offload_string():
    """
    Feature: Map offload flag applied to string tensors.
    Description: A text file dataset is padded with PadEnd, once with offload=True and once without.
    Expectation: Both pipelines yield equal "text" columns row by row (incl. decoded text).
    """
    words_file = "../data/dataset/testVocab/words.txt"

    # Pipeline that will be mapped with offload activated.
    offloaded = ds.TextFileDataset(words_file, shuffle=False)

    # Reference pipeline, mapped without offload.
    baseline = ds.TextFileDataset(words_file, shuffle=False)

    # The same PadEnd op instance is used in the operations list of both maps.
    pad_to_100 = C2.PadEnd([100], pad_value='<pad>')

    offloaded = offloaded.map(operations=[pad_to_100], input_columns=["text"], offload=True)
    baseline = baseline.map(operations=[pad_to_100], input_columns=["text"])

    offloaded_rows = offloaded.create_dict_iterator(num_epochs=1, output_numpy=True)
    baseline_rows = baseline.create_dict_iterator(num_epochs=1, output_numpy=True)

    # Walk both iterators in lockstep and compare the text column of each row.
    for offloaded_row, baseline_row in zip(offloaded_rows, baseline_rows):
        np.testing.assert_array_equal(offloaded_row['text'], baseline_row['text'])
|
||||
|
||||
|
||||
def test_auto_offload():
|
||||
"""
|
||||
Feature: Test auto_offload config option.
|
||||
|
@ -104,6 +127,7 @@ def test_offload_multi_column():
|
|||
Description: Input is an image dataset, copy the image column and apply map operations to both images.
|
||||
Expectation: Output should be same with both offload activated and deactivated.
|
||||
"""
|
||||
|
||||
def copy_column(x, y):
|
||||
return x, x, y
|
||||
|
||||
|
@ -141,6 +165,7 @@ def test_offload_column_mapping():
|
|||
Description: Input is an image dataset, copy the image column, then apply offload to only copied column.
|
||||
Expectation: The offload model dataset column index value is 1 (second column).
|
||||
"""
|
||||
|
||||
def copy_column(x, y):
|
||||
return x, x, y
|
||||
|
||||
|
@ -417,6 +442,7 @@ def test_offload_with_dict_itr():
|
|||
|
||||
if __name__ == "__main__":
|
||||
test_offload()
|
||||
test_offload_string()
|
||||
test_auto_offload()
|
||||
test_offload_column_validation()
|
||||
test_offload_column_mapping()
|
||||
|
|
|
@ -45,11 +45,15 @@ def test_basic():
|
|||
assert n.type() == cde.DataType("int64")
|
||||
|
||||
arr2 = n.as_array()
|
||||
# decoding only impacts string arrays
|
||||
arr3 = n.as_decoded_array()
|
||||
arr[0] = 2
|
||||
x = np.array([2, 2, 3, 4, 5])
|
||||
np.testing.assert_array_equal(x, arr2)
|
||||
np.testing.assert_array_equal(x, arr3)
|
||||
assert n.type() == cde.DataType("int64")
|
||||
assert arr.__array_interface__['data'] == arr2.__array_interface__['data']
|
||||
assert arr.__array_interface__['data'] == arr3.__array_interface__['data']
|
||||
|
||||
|
||||
def test_strides():
|
||||
|
|
|
@ -31,6 +31,10 @@ def test_basic():
|
|||
arr = n.as_array()
|
||||
np.testing.assert_array_equal(x, arr)
|
||||
|
||||
arr2 = n.as_decoded_array()
|
||||
np.testing.assert_array_equal(x, np.char.encode(arr2))
|
||||
np.testing.assert_array_equal(np.char.decode(x), arr2)
|
||||
|
||||
|
||||
def compare(strings, dtype='S'):
|
||||
arr = np.array(strings, dtype=dtype)
|
||||
|
|
Loading…
Reference in New Issue