Change mem layout of string tensor

add support for MindRecord and TFRecord
----
optimize tensorshape

optimize tensorshape and FlatIndex

TFRecord and MindRecord support for string tensor

Modify mem layout
Add new constructor
Add method Allocate

Change some GetMutableBuffer usages to AllocateBuffer
This commit is contained in:
hesham 2020-05-16 05:21:37 -04:00
parent d9c74e0acd
commit df361d1d26
29 changed files with 439 additions and 247 deletions

View File

@ -1,6 +1,10 @@
ms_protobuf_generate(EXAMPLE_SRCS EXAMPLE_HDRS example.proto)
ms_protobuf_generate(FEATURE_SRCS FEATURE_HDRS feature.proto)
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(core OBJECT
${EXAMPLE_SRCS}
${FEATURE_SRCS}
client.cc
config_manager.cc
cv_tensor.cc
@ -9,4 +13,5 @@ add_library(core OBJECT
tensor.cc
tensor_shape.cc
)
add_dependencies(core mindspore::protobuf)
target_include_directories(core PRIVATE ${pybind11_INCLUDE_DIRS})

View File

@ -25,14 +25,14 @@ namespace dataset {
uint8_t DataType::SizeInBytes() const {
if (type_ < DataType::NUM_OF_TYPES)
return SIZE_IN_BYTES[type_];
return kTypeInfo[type_].sizeInBytes_;
else
return 0;
}
py::dtype DataType::AsNumpyType() const {
if (type_ < DataType::NUM_OF_TYPES)
return py::dtype(PYBIND_TYPES[type_]);
return py::dtype(kTypeInfo[type_].pybindType_);
else
return py::dtype("unknown");
}
@ -40,7 +40,7 @@ py::dtype DataType::AsNumpyType() const {
uint8_t DataType::AsCVType() const {
uint8_t res = kCVInvalidType;
if (type_ < DataType::NUM_OF_TYPES) {
res = CV_TYPES[type_];
res = kTypeInfo[type_].cvType_;
}
if (res == kCVInvalidType) {
@ -108,7 +108,7 @@ DataType::DataType(const std::string &type_str) {
std::string DataType::ToString() const {
if (type_ < DataType::NUM_OF_TYPES)
return TO_STRINGS[type_];
return kTypeInfo[type_].name_;
else
return "unknown";
}
@ -149,7 +149,7 @@ DataType DataType::FromNpArray(const py::array &arr) {
std::string DataType::GetPybindFormat() const {
std::string res;
if (type_ < DataType::NUM_OF_TYPES) {
res = PYBIND_FORMAT_DESCRIPTOR[type_];
res = kTypeInfo[type_].pybindFormatDescriptor_;
}
if (res.empty()) {

View File

@ -51,56 +51,31 @@ class DataType {
NUM_OF_TYPES
};
inline static constexpr uint8_t SIZE_IN_BYTES[] = {0, // DE_UNKNOWN
1, // DE_BOOL
1, // DE_INT8
1, // DE_UINT8
2, // DE_INT16
2, // DE_UINT16
4, // DE_INT32
4, // DE_UINT32
8, // DE_INT64
8, // DE_UINT64
2, // DE_FLOAT16
4, // DE_FLOAT32
8, // DE_FLOAT64
0}; // DE_STRING
struct TypeInfo {
const char *name_; // name used to represent the type while printing
const uint8_t sizeInBytes_; // number of bytes needed for this type
const char *pybindType_; // Python matching type, used in get_output_types
const std::string pybindFormatDescriptor_; // pybind format used for numpy types
const uint8_t cvType_; // OpenCv matching type
};
inline static const char *TO_STRINGS[] = {"unknown", "bool", "int8", "uint8", "int16", "uint16", "int32",
"uint32", "int64", "uint64", "float16", "float32", "float64", "string"};
inline static const char *PYBIND_TYPES[] = {"object", "bool", "int8", "uint8", "int16", "uint16", "int32",
"uint32", "int64", "uint64", "float16", "float32", "double", "bytes"};
inline static const std::string PYBIND_FORMAT_DESCRIPTOR[] = {"", // DE_UNKNOWN
py::format_descriptor<bool>::format(), // DE_BOOL
py::format_descriptor<int8_t>::format(), // DE_INT8
py::format_descriptor<uint8_t>::format(), // DE_UINT8
py::format_descriptor<int16_t>::format(), // DE_INT16
py::format_descriptor<uint16_t>::format(), // DE_UINT16
py::format_descriptor<int32_t>::format(), // DE_INT32
py::format_descriptor<uint32_t>::format(), // DE_UINT32
py::format_descriptor<int64_t>::format(), // DE_INT64
py::format_descriptor<uint64_t>::format(), // DE_UINT64
"e", // DE_FLOAT16
py::format_descriptor<float>::format(), // DE_FLOAT32
py::format_descriptor<double>::format(), // DE_FLOAT64
"S"}; // DE_STRING
inline static constexpr uint8_t CV_TYPES[] = {kCVInvalidType, // DE_UNKNOWN
CV_8U, // DE_BOOL
CV_8S, // DE_INT8
CV_8U, // DE_UINT8
CV_16S, // DE_INT16
CV_16U, // DE_UINT16
CV_32S, // DE_INT32
kCVInvalidType, // DE_UINT32
kCVInvalidType, // DE_INT64
kCVInvalidType, // DE_UINT64
CV_16F, // DE_FLOAT16
CV_32F, // DE_FLOAT32
CV_64F, // DE_FLOAT64
kCVInvalidType}; // DE_STRING
static inline const TypeInfo kTypeInfo[] = {
// name, sizeInBytes, pybindType, formatDescriptor, openCV type
{"unknown", 0, "object", "", kCVInvalidType}, // DE_UNKNOWN
{"bool", 1, "bool", py::format_descriptor<bool>::format(), CV_8U}, // DE_BOOL
{"int8", 1, "int8", py::format_descriptor<int8_t>::format(), CV_8S}, // DE_INT8
{"uint8", 1, "uint8", py::format_descriptor<uint8_t>::format(), CV_8U}, // DE_UINT8
{"int16", 2, "int16", py::format_descriptor<int16_t>::format(), CV_16S}, // DE_INT16
{"uint16", 2, "uint16", py::format_descriptor<uint16_t>::format(), CV_16U}, // DE_UINT16
{"int32", 4, "int32", py::format_descriptor<int32_t>::format(), CV_32S}, // DE_INT32
{"uint32", 4, "uint32", py::format_descriptor<uint32_t>::format(), kCVInvalidType}, // DE_UINT32
{"int64", 8, "int64", py::format_descriptor<int64_t>::format(), kCVInvalidType}, // DE_INT64
{"uint64", 8, "uint64", py::format_descriptor<uint64_t>::format(), kCVInvalidType}, // DE_UINT64
{"float16", 2, "float16", "e", CV_16F}, // DE_FLOAT16
{"float32", 4, "float32", py::format_descriptor<float>::format(), CV_32F}, // DE_FLOAT32
{"float64", 8, "double", py::format_descriptor<double>::format(), CV_64F}, // DE_FLOAT64
{"string", 0, "bytes", "S", kCVInvalidType} // DE_STRING
};
// No arg constructor to create an unknown shape
DataType() : type_(DE_UNKNOWN) {}

View File

@ -57,18 +57,40 @@ Tensor::Tensor(const TensorShape &shape, const DataType &type) : shape_(shape),
}
Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data) : Tensor(shape, type) {
if (type.IsNumeric()) {
// If the data pointer was given, then we can also populate the tensor with data
if (data != nullptr) {
// Given the shape/type of this tensor, compute the data size and copy in the input bytes.
int64_t byte_size = this->SizeInBytes();
Status s = this->AllocateBuffer(byte_size); // Allocates data_ inside itself
if (s.IsOk() && data_ != nullptr) {
int ret_code = memcpy_s(data_, byte_size, data, byte_size);
if (ret_code != 0) {
MS_LOG(ERROR) << "Failed to copy data into Tensor!";
}
} else {
MS_LOG(ERROR) << "Failed to create memory for Tensor!";
}
}
} else {
MS_LOG(ERROR) << "Type should be numeric to use this constructor.";
}
}
Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data, const dsize_t &length)
: Tensor(shape, type) {
// If the data pointer was given, then we can also populate the tensor with data
if (data != nullptr) {
// Given the shape/type of this tensor, compute the data size and copy in the input bytes.
int64_t byte_size = this->SizeInBytes();
static_cast<void>(this->GetMutableBuffer()); // Allocates data_ inside itself
// Allocates data_ inside itself
Status s = AllocateBuffer(length);
if (s.IsError()) {
MS_LOG(ERROR) << "Failed to create memory for Tensor!";
}
if (data_ != nullptr) {
int ret_code = memcpy_s(data_, byte_size, data, byte_size);
int ret_code = memcpy_s(data_, length, data, length);
if (ret_code != 0) {
MS_LOG(ERROR) << "Failed to copy data into Tensor!";
}
} else {
MS_LOG(ERROR) << "Failed to create memory for Tensor!";
}
}
}
@ -98,32 +120,79 @@ Tensor::Tensor(const std::vector<std::string> &strings, const TensorShape &shape
auto length_sum = [](dsize_t sum, const std::string &s) { return s.length() + sum; };
dsize_t total_length = std::accumulate(strings.begin(), strings.end(), 0, length_sum);
dsize_t num_bytes = (kOffsetSize + 1) * shape_.NumOfElements() + total_length;
// total bytes needed = offset array + strings
// offset array needs to store one offset var per element + 1 extra to get the length of the last string.
// strings will be null-terminated --> need 1 extra byte per element
dsize_t num_bytes = (kOffsetSize + 1) * shape_.NumOfElements() + kOffsetSize + total_length;
data_ = data_allocator_->allocate(num_bytes);
auto offset_arr = reinterpret_cast<offset_t *>(data_);
uchar *buf = GetStringsBuffer();
offset_t offset = -1;
offset_t offset = buf - data_; // the first string will start here
uint32_t i = 0;
for (const auto &str : strings) {
// insert the end index of the string
// end index of a string is the end index of previous string + the length (including \0)
offset = offset + str.length() + 1;
// insert the start index of the string.
offset_arr[i++] = offset;
// total bytes are reduced by kOffsetSize
num_bytes -= kOffsetSize;
// insert actual string
memcpy_s(buf, num_bytes, str.c_str(), str.length() + 1);
buf += str.length() + 1;
int ret_code = memcpy_s(data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1);
if (ret_code != 0) MS_LOG(ERROR) << "Cannot copy string into Tensor";
// next string will be stored right after the current one.
offset = offset + str.length() + 1;
// total bytes are reduced by the length of the string
num_bytes -= str.length() + 1;
}
this->data_end_ = buf;
// store one more offset value so we can get the length of the last string
// length[last_element] = offset_arr[last_element + 1] - offset_arr[last_element]
offset_arr[i] = offset;
this->data_end_ = data_ + offset_arr[i];
DS_ASSERT(num_bytes == 0);
if (shape.known()) Tensor::Reshape(shape);
}
Tensor::Tensor(const dataengine::BytesList &bytes_list, const TensorShape &shape)
: Tensor(TensorShape({static_cast<dsize_t>(bytes_list.value_size())}), DataType(DataType::DE_STRING)) {
// total bytes needed = offset array + strings
// offset array needs to store one offset var per element + 1 extra to get the length of the last string.
// strings will be null-terminated --> need 1 extra byte per element
dsize_t num_bytes = (kOffsetSize)*shape_.NumOfElements() + kOffsetSize + bytes_list.ByteSizeLong();
data_ = data_allocator_->allocate(num_bytes);
auto offset_arr = reinterpret_cast<offset_t *>(data_);
uchar *buf = GetStringsBuffer();
offset_t offset = buf - data_; // the first string will start here
uint32_t i = 0;
for (; i < bytes_list.value_size(); i++) {
const std::string &str = bytes_list.value(i);
// insert the start index of the string.
offset_arr[i] = offset;
// total bytes are reduced by kOffsetSize
num_bytes -= kOffsetSize;
// insert actual string
int ret_code = memcpy_s(data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1);
if (ret_code != 0) {
MS_LOG(ERROR) << "Cannot copy string into Tensor";
}
// next string will be stored right after the current one.
offset = offset + str.length() + 1;
// total bytes are reduced by the length of the string
num_bytes -= str.length() + 1;
}
// store one more offset value so we can get the length of the last string
// length[last_element] = offset_arr[last_element + 1] - offset_arr[last_element]
offset_arr[i] = offset;
data_end_ = data_ + offset_arr[i];
DS_ASSERT(num_bytes == 0);
if (shape.known()) Tensor::Reshape(shape);
}
Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, TensorImpl tensor_impl, const TensorShape &shape,
DataType type, const unsigned char *data) {
if (!shape.known()) {
@ -152,20 +221,17 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, TensorImpl tensor_impl
}
return Status::OK(); // returns base-class shared_ptr
}
std::string to(std::string x) { return x; }
Status Tensor::CreateTensorFromNumpyString(std::shared_ptr<Tensor> *ptr, py::array arr) {
std::vector<dsize_t> shape;
for (dsize_t i = 0; i < arr.ndim(); i++) {
shape.push_back(static_cast<dsize_t>(arr.shape()[i]));
}
arr.resize({arr.size()});
auto itr = arr.begin();
arr.resize({arr.size()}); // flatten the py::array so we can iterate once
std::vector<std::string> strings;
for (; itr != arr.end(); itr++) {
std::string s = to(py::cast<py::bytes>(*itr));
strings.push_back(s);
}
arr.resize(shape);
std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast<py::bytes>(s)); });
arr.resize(shape); // resize arr back to the original shape
return CreateTensor(ptr, strings, TensorShape{shape});
}
@ -190,8 +256,9 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, py::array arr) {
std::shared_ptr<MemoryPool> global_pool = GlobalContext::Instance()->mem_pool();
(*ptr)->data_allocator_ = std::make_unique<Allocator<unsigned char>>(global_pool);
static_cast<void>((*ptr)->GetMutableBuffer());
int64_t byte_size = (*ptr)->SizeInBytes();
RETURN_IF_NOT_OK((*ptr)->AllocateBuffer(byte_size));
unsigned char *data = static_cast<unsigned char *>(arr.request().ptr);
if ((*ptr)->data_ == nullptr) {
RETURN_STATUS_UNEXPECTED("Failed to create memory for Tensor.");
@ -232,6 +299,13 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, const std::vector<std:
return Status::OK();
}
Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, const dataengine::BytesList &bytes_list,
const TensorShape &shape) {
const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator();
*ptr = std::allocate_shared<Tensor>(*alloc, bytes_list, shape);
return Status::OK();
}
// Memcpy the given strided array's used part to consecutive memory
// Consider a 3-d array
// A[(i * shape[1] + j) * shape[2] + k] = B[i][j][k] = C[i * strides[0] + j * strides[1] + k * strides[2]]
@ -370,25 +444,20 @@ void Tensor::Print(std::ostream &out) const {
out << "[Data area is null]";
}
}
// Name: ToFlatIndex()
// Description: convert a vector style index to number, used to access memory internal use only
Status Tensor::ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const {
if (!shape_.IsValidIndex(index)) {
std::string err = "Not a valid index";
RETURN_STATUS_UNEXPECTED(err);
}
*flat_index = 0;
for (size_t k = 0; k < index.size(); k++) {
dsize_t product = 1;
for (size_t l = k + 1; l < index.size(); l++) {
product *= shape_[l];
Status Tensor::AllocateBuffer(const dsize_t &length) {
if (data_ == nullptr) {
if (data_allocator_ != nullptr) {
data_ = data_allocator_->allocate(length);
RETURN_UNEXPECTED_IF_NULL(data_);
data_end_ = data_ + length;
} else {
data_ = static_cast<unsigned char *>(malloc(length));
data_end_ = data_ + length;
RETURN_UNEXPECTED_IF_NULL(data_);
}
*flat_index += index[k] * product;
}
return Status::OK();
}
const unsigned char *Tensor::GetBuffer() const {
// This version cannot modify anything. data_ could possibly be null.
return data_;
@ -404,17 +473,11 @@ unsigned char *Tensor::GetMutableBuffer() {
} else {
// If the data area is not created, then identify the memory size based
// on the shape and type and allocate it.
if (data_allocator_ != nullptr) {
data_ = data_allocator_->allocate(this->SizeInBytes());
data_end_ = data_ + SizeInBytes();
if (this->AllocateBuffer(this->SizeInBytes()).IsOk()) {
return data_;
} else {
data_ = static_cast<unsigned char *>(malloc(this->SizeInBytes()));
data_end_ = data_ + SizeInBytes();
if (data_ == nullptr) {
return nullptr;
}
return nullptr;
}
return data_;
}
}
@ -444,7 +507,7 @@ Status Tensor::GetItemPtr(T **ptr, const std::vector<dsize_t> &index) const {
RETURN_STATUS_UNEXPECTED(err);
}
dsize_t flat_idx;
RETURN_IF_NOT_OK(ToFlatIndex(index, &flat_idx));
RETURN_IF_NOT_OK(shape_.ToFlatIndex(index, &flat_idx));
*ptr = reinterpret_cast<T *>(data_ + flat_idx * type_.SizeInBytes());
return Status::OK();
@ -461,7 +524,7 @@ Status Tensor::GetItemPtr(uchar **ptr, const std::vector<dsize_t> &index, offset
RETURN_STATUS_UNEXPECTED(err);
}
dsize_t flat_idx;
RETURN_IF_NOT_OK(ToFlatIndex(index, &flat_idx));
RETURN_IF_NOT_OK(shape_.ToFlatIndex(index, &flat_idx));
offset_t length_temp = 0;
RETURN_IF_NOT_OK(GetStringAt(flat_idx, ptr, &length_temp));
if (length != nullptr) *length = length_temp;
@ -481,7 +544,7 @@ Status Tensor::StartAddrOfIndex(std::vector<dsize_t> ind, uchar **start_addr_of_
std::vector<dsize_t> r(t_shape.begin() + ind.size(), t_shape.end());
*remaining = TensorShape(r);
ind.resize(this->Rank(), 0); // same as -> while (ind.size() < this->Rank()) ind.push_back(0);
RETURN_IF_NOT_OK(ToFlatIndex(ind, &flat_ind));
RETURN_IF_NOT_OK(shape_.ToFlatIndex(ind, &flat_ind));
// check if GetBuffer() returns null, we should flag this as an error, this sanity check will only
// be true is the tensor failed to allocate memory.
if (GetMutableBuffer() == nullptr) {
@ -588,10 +651,10 @@ Status Tensor::GetItemAt(std::string_view *o, const std::vector<dsize_t> &index)
RETURN_UNEXPECTED_IF_NULL(o);
CHECK_FAIL_RETURN_UNEXPECTED(type_ == DataType::DE_STRING, "Type is not DE_STRING");
uchar *buf = nullptr;
uchar *start = nullptr;
offset_t length = 0;
RETURN_IF_NOT_OK(GetItemPtr(&buf, index, &length));
std::string_view sv{reinterpret_cast<const char *>(buf), length};
RETURN_IF_NOT_OK(GetItemPtr(&start, index, &length));
std::string_view sv{reinterpret_cast<const char *>(start)};
o->swap(sv);
return Status::OK();
}
@ -778,13 +841,11 @@ Status Tensor::GetStringAt(dsize_t index, uchar **string_start, offset_t *length
RETURN_UNEXPECTED_IF_NULL(string_start);
RETURN_UNEXPECTED_IF_NULL(length);
auto *offset_ptr = reinterpret_cast<offset_t *>(data_); // offsets starts here
offset_t end = offset_ptr[index];
offset_t start = 0;
if (index != 0) start = offset_ptr[index - 1] + 1; // string starts at where the previous string ends + 1
uchar *buf = GetStringsBuffer(); // string data starts here
*string_start = buf + start;
*length = end - start;
offset_t start = offset_ptr[index];
*string_start = data_ + start;
*length = offset_ptr[index + 1] - start - 1; // -1 to skip the \0 from the string length
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -35,6 +35,7 @@
#include "dataset/util/allocator.h"
#include "dataset/util/de_error.h"
#include "dataset/util/status.h"
#include "proto/example.pb.h"
namespace py = pybind11;
namespace mindspore {
@ -64,6 +65,8 @@ class Tensor {
// @param data unsigned char*, pointer to the data.
Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data);
Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data, const dsize_t &length);
Tensor(const Tensor &other) = delete;
Tensor &operator=(const Tensor &other) = delete;
@ -72,6 +75,8 @@ class Tensor {
Tensor &operator=(Tensor &&other) noexcept;
Status AllocateBuffer(const dsize_t &length);
// type of offset values used to store string information
using offset_t = uint32_t;
// const of the size of the offset variable
@ -84,15 +89,24 @@ class Tensor {
// Construct a tensor from a list of strings. Reshape the tensor with `shape` if given, otherwise assume the shape is
// the size of the vector `strings`.
// The memory layout of a Tensor of strings consists of the Offset_array followed by the strings.
// OFFSET1, OFFSET2, ... String1, String2, ...
// The value of each offset is the end index of the corresponding string
// The offset array will store one extra value to find the length of the last string.
// OFFSET1, OFFSET2, ..., OFFSETn+1, STRING1, STRING2, ..., STRINGn
// The value of each offset is the start index of the corresponding string
// Offsets are of type offset_t
// strings will be null-terminated
// example: Tensor(['abc', 'de'], shape={2}, type=DE_STRING)
// 3 6 a b c \0 d e \0
// |----------------------------------------------------------------|
// | OFFSET ARRAY | STRINGS |
// | bytes 0-3 | bytes 3-6 | bytes 7-10 | bytes 11-14 | bytes 15-17 |
// | 11 | 15 | 18 | abc\0 | de\0 |
// |----------------------------------------------------------------|
explicit Tensor(const std::vector<std::string> &strings,
const TensorShape &shape = TensorShape::CreateUnknownRankShape());
// Same as Tensor(vector<string>) but the input is protobuf bytelist
explicit Tensor(const dataengine::BytesList &bytes_list,
const TensorShape &shape = TensorShape::CreateUnknownRankShape());
// A static factory method to create the given flavour of derived Tensor
// Returns the base class reference for the Tensor.
// @param ptr output argument to hold the created Tensor of given tensor_impl
@ -121,6 +135,9 @@ class Tensor {
static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const std::vector<std::string> &strings,
const TensorShape &shape = TensorShape::CreateUnknownRankShape());
static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const dataengine::BytesList &bytes_list,
const TensorShape &shape);
// Copy raw data of a array based on shape and strides to the destination pointer
// @param dst Pointer to the destination array where the content is to be copied
// @param src Pointer to the source of strided array to be copied
@ -166,7 +183,7 @@ class Tensor {
// @param value of type `T`
template <typename T>
Status SetItemAt(const std::vector<dsize_t> &index, const T &value) {
static_cast<void>(GetMutableBuffer());
RETURN_IF_NOT_OK(AllocateBuffer(SizeInBytes()));
T *ptr = nullptr;
RETURN_IF_NOT_OK(GetItemPtr<T>(&ptr, index));
*ptr = value;
@ -203,7 +220,7 @@ class Tensor {
template <typename T>
Status Fill(const T &value) {
CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use fill on tensor of strings.");
static_cast<void>(GetMutableBuffer());
RETURN_IF_NOT_OK(AllocateBuffer(SizeInBytes()));
int64_t cellSize = type_.SizeInBytes();
if ((data_ != nullptr) && type_.IsCompatible<T>()) {
for (dsize_t i = 0; i < Size(); i++) {
@ -418,32 +435,28 @@ class Tensor {
using pointer = std::string_view *;
using reference = std::string_view &;
explicit TensorIterator(uchar *offset = nullptr, const uchar *buf = nullptr, dsize_t index = 0) {
offset_ = reinterpret_cast<offset_t *>(offset);
buf_ = reinterpret_cast<const char *>(buf);
explicit TensorIterator(uchar *data = nullptr, dsize_t index = 0) {
data_ = reinterpret_cast<const char *>(data);
index_ = index;
}
TensorIterator(const TensorIterator<std::string_view, DUMMY> &raw_iterator) {
offset_ = raw_iterator.offset_;
buf_ = raw_iterator.buf_;
data_ = raw_iterator.data_;
index_ = raw_iterator.index_;
}
~TensorIterator() = default;
bool operator==(const TensorIterator<std::string_view> &rhs) {
return buf_ == rhs.buf_ && offset_ == rhs.offset_ && index_ == rhs.index_;
}
bool operator==(const TensorIterator<std::string_view> &rhs) { return data_ == rhs.data_ && index_ == rhs.index_; }
bool operator!=(const TensorIterator<std::string_view> &rhs) { return !(*this == rhs); }
operator bool() const { return offset_ != nullptr; }
operator bool() const { return data_ != nullptr; }
std::string_view operator*() const {
offset_t start = 0;
if (index_ != 0) start = offset_[index_ - 1] + 1;
return std::string_view{buf_ + start};
auto offset_ = reinterpret_cast<const offset_t *>(data_);
offset_t start = offset_[index_];
return std::string_view{data_ + start};
}
TensorIterator<std::string_view> &operator+=(const dsize_t &inc) {
@ -496,8 +509,7 @@ class Tensor {
protected:
dsize_t index_;
offset_t *offset_;
const char *buf_;
const char *data_;
};
// Return a TensorIterator that points to the start of the Tensor.
@ -518,11 +530,6 @@ class Tensor {
}
protected:
// Returns the location of the item assuming row major memory layout.
// @param index
// @return
Status ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const;
// A function that prints Tensor recursively, first called by print
// @param out
// @param cur_dim
@ -559,7 +566,7 @@ class Tensor {
// Skips the offsets and returns the start of the buffer where the real strings are stored. Caller needs to check if the
// tensor's type is a string, otherwise undefined address would be returned.
// @return address of the first string of the tensor.
uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements(); }
uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements() + kOffsetSize; }
// all access to shape_ should be via shape
TensorShape shape_;
@ -573,14 +580,8 @@ class Tensor {
unsigned char *data_end_ = nullptr;
};
template <>
inline Tensor::TensorIterator<std::string_view> Tensor::begin<std::string_view>() {
uchar *buf = GetStringsBuffer();
return TensorIterator<std::string_view>(data_, buf);
}
template <>
inline Tensor::TensorIterator<std::string_view> Tensor::end<std::string_view>() {
uchar *buf = GetStringsBuffer();
return TensorIterator<std::string_view>(data_, buf, shape_.NumOfElements());
return TensorIterator<std::string_view>(data_, shape_.NumOfElements());
}
} // namespace dataset
} // namespace mindspore

View File

@ -40,16 +40,7 @@ dsize_t TensorShape::NumOfElements() const {
if (!known()) {
return 0;
}
dsize_t num = 1;
for (auto i : raw_shape_) {
if (multi_ok(num, i)) {
num *= i;
} else {
// dsize_t can wrap since it is signed int, we double check here
MS_LOG(ERROR) << "Tensor shape larger than maximum allowed value!";
}
}
return num;
return strides_[0];
}
void TensorShape::Print(std::ostream &out) const {
@ -72,20 +63,23 @@ void TensorShape::Print(std::ostream &out) const {
}
TensorShape::TensorShape(const std::initializer_list<dsize_t> &list)
: raw_shape_(*GlobalContext::Instance()->int_allocator()) {
: raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
AddListToShape(list);
}
TensorShape::TensorShape(const std::vector<dsize_t> &list) : raw_shape_(*GlobalContext::Instance()->int_allocator()) {
TensorShape::TensorShape(const std::vector<dsize_t> &list)
: raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
AddListToShape(list);
}
TensorShape::TensorShape(const TensorShape &shape) : raw_shape_(*GlobalContext::Instance()->int_allocator()) {
TensorShape::TensorShape(const TensorShape &shape)
: raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
AddListToShape(shape.AsVector());
known_ = shape.known_; // override with the input shape in case of unknown-rank tensor shape.
}
TensorShape::TensorShape(py::list l) : raw_shape_(*GlobalContext::Instance()->int_allocator()) {
TensorShape::TensorShape(py::list l)
: raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
std::vector<dsize_t> list_c;
for (auto &i : l) {
if (!i.is_none()) {
@ -97,6 +91,18 @@ TensorShape::TensorShape(py::list l) : raw_shape_(*GlobalContext::Instance()->in
AddListToShape(list_c);
}
TensorShape::TensorShape(cv::MatSize cv_size, uint32_t type)
: raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
for (int i = 0; i < cv_size.dims(); i++) {
raw_shape_.push_back(cv_size[i]);
}
auto channels = static_cast<uint8_t>(1 + (type >> static_cast<uint8_t>(CV_CN_SHIFT)));
if (channels != 1) {
raw_shape_.push_back(channels);
}
known_ = true;
}
TensorShape TensorShape::CreateUnknownRankShape() {
TensorShape s({});
s.known_ = false;
@ -109,17 +115,6 @@ TensorShape TensorShape::InsertDim(dsize_t axis, dsize_t dim) const {
return TensorShape(tmp);
}
TensorShape::TensorShape(cv::MatSize cv_size, uint32_t type) : raw_shape_(*GlobalContext::Instance()->int_allocator()) {
for (int i = 0; i < cv_size.dims(); i++) {
raw_shape_.push_back(cv_size[i]);
}
auto channels = static_cast<uint8_t>(1 + (type >> static_cast<uint8_t>(CV_CN_SHIFT)));
if (channels != 1) {
raw_shape_.push_back(channels);
}
known_ = true;
}
std::vector<dsize_t> TensorShape::AsVector() const {
return std::vector<dsize_t>(raw_shape_.begin(), raw_shape_.end());
}
@ -139,23 +134,28 @@ bool TensorShape::IsValidIndex(const std::vector<dsize_t> &index) const {
template <typename T>
void TensorShape::AddListToShape(const T &list) {
raw_shape_.resize(list.size());
strides_.resize(list.size() + 1);
strides_[list.size()] = 1;
known_ = true;
dsize_t num = 1;
dsize_t size = 0;
for (const auto &itr : list) {
if (itr > 0) {
if (num > std::numeric_limits<int64_t>::max() / itr) {
auto itr = std::rbegin(list); // iterate over the list in reverse order
auto s = list.size() - 1; // to compute strides while adding dims
for (; itr != std::rend(list); itr++, s--) {
dsize_t dim = *itr;
if (dim > 0) {
if (strides_[s + 1] > std::numeric_limits<int64_t>::max() / dim) {
MS_LOG(ERROR) << "Invalid shape data, overflow occurred!";
known_ = false;
raw_shape_.clear();
return;
}
num *= itr;
strides_[s] = dim * strides_[s + 1];
}
if (itr < 0) {
if (dim < 0) {
known_ = false;
}
if (itr > kDeMaxDim) {
if (dim > kDeMaxDim) {
std::stringstream ss;
ss << "Invalid shape data, dim (" << size << ") is larger than the maximum dim size(" << kDeMaxDim << ")!";
MS_LOG(ERROR) << ss.str().c_str();
@ -163,7 +163,7 @@ void TensorShape::AddListToShape(const T &list) {
raw_shape_.clear();
return;
}
raw_shape_.push_back(itr);
raw_shape_[s] = dim;
size++;
}
if (size > kDeMaxRank) {
@ -215,17 +215,18 @@ TensorShape TensorShape::Squeeze() const {
}
return TensorShape(new_shape);
}
std::vector<dsize_t> TensorShape::Strides() {
std::vector<dsize_t> strides(Rank());
dsize_t count = NumOfElements();
for (dsize_t i = 0; i < Rank(); i++) {
if (raw_shape_[i] != 0)
count /= raw_shape_[i];
else
count = 0;
strides[i] = count;
std::vector<dsize_t> TensorShape::Strides() const { return std::vector<dsize_t>{strides_.begin() + 1, strides_.end()}; }
// Name: ToFlatIndex()
// Description: convert a vector style index to number, used to access memory internal use only
Status TensorShape::ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const {
*flat_index = 0;
for (size_t k = 0; k < index.size(); k++) {
*flat_index += index[k] * strides_[k + 1]; // skip the first element of strides_ which is numOfElements
}
return strides;
CHECK_FAIL_RETURN_UNEXPECTED(*flat_index < NumOfElements(), "Not a valid index");
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -156,13 +156,20 @@ class TensorShape {
TensorShape Squeeze() const;
std::vector<dsize_t> Strides();
std::vector<dsize_t> Strides() const;
// Returns the location of the item assuming row major memory layout.
// @param index
// @return
Status ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const;
private:
// True if known and valid shape, false otherwise
bool known_;
// Vector to keep the dims of the shape.
std::vector<dsize_t, IntAlloc> raw_shape_;
// Vector to keep the strides of the shape. The size is rank+1
std::vector<dsize_t, IntAlloc> strides_;
// Internal utility function to iterate over a list, check if the dim is valid and then insert it into the shape.
// @tparam T list

View File

@ -1,5 +1,3 @@
ms_protobuf_generate(EXAMPLE_SRCS EXAMPLE_HDRS example.proto)
ms_protobuf_generate(FEATURE_SRCS FEATURE_HDRS feature.proto)
add_subdirectory(sampler)
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
@ -15,13 +13,9 @@ add_library(engine-datasetops-source OBJECT
image_folder_op.cc
mnist_op.cc
voc_op.cc
${EXAMPLE_SRCS}
${FEATURE_SRCS}
manifest_op.cc
cifar_op.cc
random_data_op.cc
celeba_op.cc
text_file_op.cc
)
add_dependencies(engine-datasetops-source mindspore::protobuf)
)

View File

@ -127,8 +127,10 @@ Status MindRecordOp::Init() {
std::string type_str = mindrecord::ColumnDataTypeNameNormalized[col_data_types[i]];
DataType t_dtype = DataType(type_str); // valid types: {"bytes", "string", "int32", "int64", "float32", "float64"}
if (col_data_types[i] == mindrecord::ColumnBytes || col_data_types[i] == mindrecord::ColumnString) { // rank = 1
if (col_data_types[i] == mindrecord::ColumnBytes) { // rank = 1
col_desc = ColDescriptor(colname, t_dtype, TensorImpl::kFlexible, 1);
} else if (col_data_types[i] == mindrecord::ColumnString) { // rank = 0
col_desc = ColDescriptor(colname, t_dtype, TensorImpl::kFlexible, 0);
} else if (col_shapes[i].size() > 0) {
std::vector<dsize_t> vec(col_shapes[i].size()); // temporary vector to hold shape
(void)std::copy(col_shapes[i].begin(), col_shapes[i].end(), vec.begin());
@ -310,7 +312,10 @@ Status MindRecordOp::LoadTensorRow(TensorRow *tensor_row, const std::vector<uint
// Set shape
auto num_elements = n_bytes / column_data_type_size;
if (column.hasShape()) {
if (type == DataType::DE_STRING) {
std::string s{data, data + n_bytes};
RETURN_IF_NOT_OK(Tensor::CreateTensor(&tensor, {s}, TensorShape::CreateScalar()));
} else if (column.hasShape()) {
auto new_shape = TensorShape(column.shape());
RETURN_IF_NOT_OK(column.MaterializeTensorShape(static_cast<int32_t>(num_elements), &new_shape));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&tensor, column.tensorImpl(), new_shape, type, data));

View File

@ -63,7 +63,8 @@ Status Sampler::CreateSamplerTensor(std::shared_ptr<Tensor> *sample_ids, int64_t
}
TensorShape shape(std::vector<dsize_t>(1, num_elements));
RETURN_IF_NOT_OK(Tensor::CreateTensor(sample_ids, col_desc_->tensorImpl(), shape, col_desc_->type()));
(void)(*sample_ids)->GetMutableBuffer(); // allocate memory in case user forgets!
RETURN_IF_NOT_OK(
(*sample_ids)->AllocateBuffer((*sample_ids)->SizeInBytes())); // allocate memory in case user forgets!
return Status::OK();
}

View File

@ -724,18 +724,26 @@ Status TFReaderOp::LoadBytesList(const ColDescriptor &current_col, const dataeng
// kBytesList can map to the following DE types ONLY!
// DE_UINT8, DE_INT8
// Must be single byte type for each element!
if (current_col.type() != DataType::DE_UINT8 && current_col.type() != DataType::DE_INT8) {
if (current_col.type() != DataType::DE_UINT8 && current_col.type() != DataType::DE_INT8 &&
current_col.type() != DataType::DE_STRING) {
std::string err_msg = "Invalid datatype for Tensor at column: " + current_col.name();
RETURN_STATUS_UNEXPECTED(err_msg);
}
const dataengine::BytesList &bytes_list = column_values_list.bytes_list();
*num_elements = bytes_list.value_size();
if (current_col.type() == DataType::DE_STRING) {
TensorShape shape = TensorShape::CreateScalar();
RETURN_IF_NOT_OK(current_col.MaterializeTensorShape(*num_elements, &shape));
RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, bytes_list, shape));
return Status::OK();
}
uint64_t max_size = 0;
for (uint32_t i = 0; i < bytes_list.value_size(); ++i) max_size = std::max(max_size, bytes_list.value(i).size());
*num_elements = bytes_list.value_size();
int64_t pad_size = max_size;
// if user provides a shape in the form of [-1, d1, 2d, ... , dn], we need to pad to d1 * d2 * ... * dn
@ -879,7 +887,7 @@ Status TFReaderOp::LoadIntList(const ColDescriptor &current_col, const dataengin
RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, current_col.tensorImpl(), current_shape, current_col.type()));
// Tensors are lazily allocated, this eagerly allocates memory for the tensor.
(void)(*tensor)->GetMutableBuffer();
RETURN_IF_NOT_OK((*tensor)->AllocateBuffer((*tensor)->SizeInBytes()));
int64_t i = 0;
auto it = (*tensor)->begin<T>();

View File

@ -162,7 +162,7 @@ void CastFrom(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *out
Status TypeCast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, const DataType &data_type) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), data_type));
static_cast<void>((*output)->GetMutableBuffer());
RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes()));
switch (input->type().value()) {
case DataType::DE_BOOL:
CastFrom<bool>(input, output);
@ -211,7 +211,7 @@ Status ToFloat16(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *
// initiate new tensor for type cast
DataType new_type = DataType("float16");
RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), new_type));
static_cast<void>((*output)->GetMutableBuffer());
RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes()));
auto in_itr = input->begin<float>();
auto out_itr = (*output)->begin<float16>();

View File

@ -64,7 +64,8 @@ Status Flip(std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, int
std::shared_ptr<CVTensor> output_cv = std::make_shared<CVTensor>(input_cv->shape(), input_cv->type());
RETURN_UNEXPECTED_IF_NULL(output_cv);
(void)output_cv->GetMutableBuffer();
RETURN_IF_NOT_OK(output_cv->AllocateBuffer(output_cv->SizeInBytes()));
if (input_cv->mat().data) {
try {
cv::flip(input_cv->mat(), output_cv->mat(), flip_code);

View File

@ -51,7 +51,7 @@ enum ColumnDataType {
// mapping as {"bytes", "string", "int32", "int64", "float32", "float64"};
const uint32_t ColumnDataTypeSize[kDataTypes] = {1, 1, 4, 8, 4, 8};
const std::vector<std::string> ColumnDataTypeNameNormalized = {"uint8", "uint8", "int32",
const std::vector<std::string> ColumnDataTypeNameNormalized = {"uint8", "string", "int32",
"int64", "float32", "float64"};
const std::unordered_map<std::string, ColumnDataType> ColumnDataTypeMap = {

View File

@ -48,6 +48,7 @@ def mstype_to_detype(type_):
mstype.float16: cde.DataType("float16"),
mstype.float32: cde.DataType("float32"),
mstype.float64: cde.DataType("float64"),
mstype.string: cde.DataType("string"),
}[type_]

View File

@ -26,7 +26,7 @@ from . import datasets
INT32_MAX = 2147483647
valid_detype = [
"bool", "int8", "int16", "int32", "int64", "uint8", "uint16",
"uint32", "uint64", "float16", "float32", "float64"
"uint32", "uint64", "float16", "float32", "float64", "string"
]

View File

@ -32,47 +32,47 @@ class MindDataTestDatatype : public UT::Common {
TEST_F(MindDataTestDatatype, TestSizes) {
uint8_t x = DataType::SIZE_IN_BYTES[DataType::DE_BOOL];
uint8_t x = DataType::kTypeInfo[DataType::DE_BOOL].sizeInBytes_;
DataType d = DataType(DataType::DE_BOOL);
ASSERT_EQ(x, 1);
ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_INT8];
x = DataType::kTypeInfo[DataType::DE_INT8].sizeInBytes_;
d = DataType(DataType::DE_INT8);
ASSERT_EQ(x, 1);
ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_UINT8];
x = DataType::kTypeInfo[DataType::DE_UINT8].sizeInBytes_;
d = DataType(DataType::DE_UINT8);
ASSERT_EQ(x, 1);
ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_INT16];
x = DataType::kTypeInfo[DataType::DE_INT16].sizeInBytes_;
d = DataType(DataType::DE_INT16);
ASSERT_EQ(x, 2);
ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_UINT16];
x = DataType::kTypeInfo[DataType::DE_UINT16].sizeInBytes_;
d = DataType(DataType::DE_UINT16);
ASSERT_EQ(x, 2);
ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_INT32];
x = DataType::kTypeInfo[DataType::DE_INT32].sizeInBytes_;
d = DataType(DataType::DE_INT32);
ASSERT_EQ(x, 4);
ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_UINT32];
x = DataType::kTypeInfo[DataType::DE_UINT32].sizeInBytes_;
d = DataType(DataType::DE_UINT32);
ASSERT_EQ(x, 4);
ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_INT64];
x = DataType::kTypeInfo[DataType::DE_INT64].sizeInBytes_;
d = DataType(DataType::DE_INT64);
ASSERT_EQ(x, 8);
ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_UINT64];
x = DataType::kTypeInfo[DataType::DE_UINT64].sizeInBytes_;
d = DataType(DataType::DE_UINT64);
ASSERT_EQ(x, 8);
ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_FLOAT32];
x = DataType::kTypeInfo[DataType::DE_FLOAT32].sizeInBytes_;
d = DataType(DataType::DE_FLOAT32);
ASSERT_EQ(x, 4);
ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_FLOAT64];
x = DataType::kTypeInfo[DataType::DE_FLOAT64].sizeInBytes_;
d = DataType(DataType::DE_FLOAT64);
ASSERT_EQ(x, 8);
ASSERT_EQ(d.SizeInBytes(), x);

View File

@ -14,9 +14,7 @@
* limitations under the License.
*/
#include "common/common.h"
#include "common/cvop_common.h"
#include "dataset/kernels/data/one_hot_op.h"
#include "dataset/core/cv_tensor.h"
#include "utils/log_adapter.h"
using namespace mindspore::dataset;
@ -24,9 +22,9 @@ using mindspore::MsLogLevel::INFO;
using mindspore::ExceptionType::NoExceptionType;
using mindspore::LogStream;
class MindDataTestOneHotOp : public UT::CVOP::CVOpCommon {
class MindDataTestOneHotOp : public UT::Common {
protected:
MindDataTestOneHotOp() : CVOpCommon() {}
MindDataTestOneHotOp() {}
};
TEST_F(MindDataTestOneHotOp, TestOp) {

View File

@ -65,14 +65,14 @@ TEST_F(MindDataTestStringTensorDE, Basics) {
TEST_F(MindDataTestStringTensorDE, Basics2) {
std::shared_ptr<Tensor> t =
std::make_shared<Tensor>(std::vector<std::string>{"abc", "defg", "hi", "klmno", "123", "789"}, TensorShape({2, 3}));
ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 20);
std::vector<uint32_t> offsets = {3, 8, 11, 17, 21, 25};
ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 20 + 4);
std::vector<uint32_t> offsets = {0, 4, 9, 12, 18, 22, 26};
uint32_t ctr = 0;
for (auto i : offsets) {
ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i);
ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i + 28);
ctr += 4;
}
const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4;
const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4 + 4;
std::vector<uint32_t> starts = {0, 4, 9, 12, 18, 22};
uint32_t index = 0;
@ -90,14 +90,14 @@ TEST_F(MindDataTestStringTensorDE, Empty) {
std::shared_ptr<Tensor> t = std::make_shared<Tensor>(strings, TensorShape({2, 3}));
// abc_defg___123__
// 0123456789012345
ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 10);
std::vector<uint32_t> offsets = {3, 8, 9, 10, 14, 15};
ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 10 + 4);
std::vector<uint32_t> offsets = {0, 4, 9, 10, 11, 15, 16};
uint32_t ctr = 0;
for (auto i : offsets) {
ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i);
ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i + 28);
ctr += 4;
}
const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4;
const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4 + 4;
std::vector<uint32_t> starts = {0, 4, 9, 10, 11, 15};
uint32_t index = 0;

View File

@ -41,6 +41,7 @@ class MindDataTestTensorDE : public UT::Common {
TEST_F(MindDataTestTensorDE, Basics) {
std::shared_ptr<Tensor> t = std::make_shared<Tensor>(TensorShape({2, 3}), DataType(DataType::DE_UINT64));
ASSERT_TRUE((t->AllocateBuffer(t->SizeInBytes())).IsOk());
ASSERT_EQ(t->shape(), TensorShape({2, 3}));
ASSERT_EQ(t->type(), DataType::DE_UINT64);
ASSERT_EQ(t->SizeInBytes(), 2 * 3 * 8);

View File

@ -0,0 +1,18 @@
{
"datasetType": "TF",
"numRows": 3,
"columns": {
"line": {
"type": "string",
"rank": 0
},
"words": {
"type": "string",
"rank": 1
},
"chinese": {
"type": "string",
"rank": 0
}
}
}

Binary file not shown.

View File

@ -584,7 +584,7 @@ def test_cv_minddataset_reader_basic_tutorial_5_epoch(add_and_remove_cv_file):
def test_cv_minddataset_reader_basic_tutorial_5_epoch_with_batch(add_and_remove_cv_file):
"""tutorial for cv minderdataset."""
columns_list = ["data", "file_name", "label"]
columns_list = ["data", "label"]
num_readers = 4
data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers)
@ -948,8 +948,7 @@ def test_write_with_multi_bytes_and_array_and_read_by_MindDataset():
data_value_to_list = []
for item in data:
new_data = {}
new_data['file_name'] = np.asarray(
list(bytes(item["file_name"], encoding='utf-8')), dtype=np.uint8)
new_data['file_name'] = np.asarray(item["file_name"], dtype='S')
new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32)
new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8)
new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8)
@ -1153,8 +1152,7 @@ def test_write_with_multi_bytes_and_MindDataset():
data_value_to_list = []
for item in data:
new_data = {}
new_data['file_name'] = np.asarray(
list(bytes(item["file_name"], encoding='utf-8')), dtype=np.uint8)
new_data['file_name'] = np.asarray(item["file_name"], dtype='S')
new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32)
new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8)
new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8)

View File

@ -27,6 +27,7 @@ import mindspore.dataset as ds
import mindspore.dataset.transforms.vision.c_transforms as vision
from mindspore import log as logger
from mindspore.dataset.transforms.vision import Inter
from mindspore.dataset.transforms.text import as_text
from mindspore.mindrecord import FileWriter
FILES_NUM = 4
@ -72,7 +73,7 @@ def test_cv_minddataset_pk_sample_no_column(add_and_remove_cv_file):
for item in data_set.create_dict_iterator():
logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
logger.info("-------------- item[file_name]: \
{}------------------------".format("".join([chr(x) for x in item["file_name"]])))
{}------------------------".format(as_text(item["file_name"])))
logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
num_iter += 1
@ -90,7 +91,7 @@ def test_cv_minddataset_pk_sample_basic(add_and_remove_cv_file):
for item in data_set.create_dict_iterator():
logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
logger.info("-------------- item[file_name]: \
{}------------------------".format("".join([chr(x) for x in item["file_name"]])))
{}------------------------".format(as_text(item["file_name"])))
logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
num_iter += 1
@ -108,7 +109,7 @@ def test_cv_minddataset_pk_sample_shuffle(add_and_remove_cv_file):
for item in data_set.create_dict_iterator():
logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
logger.info("-------------- item[file_name]: \
{}------------------------".format("".join([chr(x) for x in item["file_name"]])))
{}------------------------".format(as_text(item["file_name"])))
logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
num_iter += 1
@ -125,7 +126,7 @@ def test_cv_minddataset_pk_sample_out_of_range(add_and_remove_cv_file):
for item in data_set.create_dict_iterator():
logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
logger.info("-------------- item[file_name]: \
{}------------------------".format("".join([chr(x) for x in item["file_name"]])))
{}------------------------".format(as_text(item["file_name"])))
logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
num_iter += 1

View File

@ -17,16 +17,14 @@ import numpy as np
import pytest
import mindspore.dataset as ds
import mindspore.common.dtype as mstype
def test_basic():
x = np.array([["ab", "cde", "121"], ["x", "km", "789"]], dtype='S')
# x = np.array(["ab", "cde"], dtype='S')
n = cde.Tensor(x)
arr = n.as_array()
y = np.array([1, 2])
assert all(y == y)
# assert np.testing.assert_array_equal(y,y)
np.testing.assert_array_equal(x, arr)
def compare(strings):
@ -59,7 +57,125 @@ def test_batching_strings():
assert "[Batch ERROR] Batch does not support" in str(info)
def test_map():
def gen():
yield np.array(["ab cde 121"], dtype='S'),
data = ds.GeneratorDataset(gen, column_names=["col"])
def split(b):
splits = b.item().decode("utf8").split()
return np.array(splits, dtype='S')
data = data.map(input_columns=["col"], operations=split)
expected = np.array(["ab", "cde", "121"], dtype='S')
for d in data:
np.testing.assert_array_equal(d[0], expected)
def as_str(arr):
def decode(s): return s.decode("utf8")
decode_v = np.vectorize(decode)
return decode_v(arr)
line = np.array(["This is a text file.",
"Be happy every day.",
"Good luck to everyone."])
words = np.array([["This", "text", "file", "a"],
["Be", "happy", "day", "b"],
["", "", "everyone", "c"]])
chinese = np.array(["今天天气太好了我们一起去外面玩吧",
"男默女泪",
"江州市长江大桥参加了长江大桥的通车仪式"])
def test_tfrecord1():
s = ds.Schema()
s.add_column("line", "string", [])
s.add_column("words", "string", [-1])
s.add_column("chinese", "string", [])
data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)
for i, d in enumerate(data.create_dict_iterator()):
assert d["line"].shape == line[i].shape
assert d["words"].shape == words[i].shape
assert d["chinese"].shape == chinese[i].shape
np.testing.assert_array_equal(line[i], as_str(d["line"]))
np.testing.assert_array_equal(words[i], as_str(d["words"]))
np.testing.assert_array_equal(chinese[i], as_str(d["chinese"]))
def test_tfrecord2():
data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False,
schema='../data/dataset/testTextTFRecord/datasetSchema.json')
for i, d in enumerate(data.create_dict_iterator()):
assert d["line"].shape == line[i].shape
assert d["words"].shape == words[i].shape
assert d["chinese"].shape == chinese[i].shape
np.testing.assert_array_equal(line[i], as_str(d["line"]))
np.testing.assert_array_equal(words[i], as_str(d["words"]))
np.testing.assert_array_equal(chinese[i], as_str(d["chinese"]))
def test_tfrecord3():
s = ds.Schema()
s.add_column("line", mstype.string, [])
s.add_column("words", mstype.string, [-1, 2])
s.add_column("chinese", mstype.string, [])
data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)
for i, d in enumerate(data.create_dict_iterator()):
assert d["line"].shape == line[i].shape
assert d["words"].shape == words[i].reshape([2, 2]).shape
assert d["chinese"].shape == chinese[i].shape
np.testing.assert_array_equal(line[i], as_str(d["line"]))
np.testing.assert_array_equal(words[i].reshape([2, 2]), as_str(d["words"]))
np.testing.assert_array_equal(chinese[i], as_str(d["chinese"]))
def create_text_mindrecord():
# methood to create mindrecord with string data, used to generate testTextMindRecord/test.mindrecord
from mindspore.mindrecord import FileWriter
mindrecord_file_name = "test.mindrecord"
data = [{"english": "This is a text file.",
"chinese": "今天天气太好了我们一起去外面玩吧"},
{"english": "Be happy every day.",
"chinese": "男默女泪"},
{"english": "Good luck to everyone.",
"chinese": "江州市长江大桥参加了长江大桥的通车仪式"},
]
writer = FileWriter(mindrecord_file_name)
schema = {"english": {"type": "string"},
"chinese": {"type": "string"},
}
writer.add_schema(schema)
writer.write_raw_data(data)
writer.commit()
def test_mindrecord():
data = ds.MindDataset("../data/dataset/testTextMindRecord/test.mindrecord", shuffle=False)
for i, d in enumerate(data.create_dict_iterator()):
assert d["english"].shape == line[i].shape
assert d["chinese"].shape == chinese[i].shape
np.testing.assert_array_equal(line[i], as_str(d["english"]))
np.testing.assert_array_equal(chinese[i], as_str(d["chinese"]))
if __name__ == '__main__':
test_generator()
test_basic()
test_batching_strings()
# test_generator()
# test_basic()
# test_batching_strings()
test_map()
# test_tfrecord1()
# test_tfrecord2()
# test_tfrecord3()
# test_mindrecord()