diff --git a/mindspore/ccsrc/dataset/api/de_pipeline.cc b/mindspore/ccsrc/dataset/api/de_pipeline.cc
index 0194785090b..452b1fbdf96 100644
--- a/mindspore/ccsrc/dataset/api/de_pipeline.cc
+++ b/mindspore/ccsrc/dataset/api/de_pipeline.cc
@@ -642,18 +642,8 @@ Status DEPipeline::ParseBatchOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
       (void)builder->SetColumnsToMap(ToStringVector(value));
     }
     if (key == "pad_info") {
-      std::map<std::string, std::pair<TensorShape, float>> pad_info;
-      for (auto p : py::reinterpret_borrow<py::dict>(value)) {
-        if (!p.second.is_none()) {
-          py::tuple tp = py::reinterpret_borrow<py::tuple>(p.second);
-          CHECK_FAIL_RETURN_UNEXPECTED(tp.size() == 2, "tuple in pad_info must be (list,int) or (list,float)");
-          TensorShape shape = tp[0].is_none() ? TensorShape::CreateUnknownRankShape() : TensorShape(tp[0]);
-          float pad_val = tp[1].is_none() ? 0 : ToFloat(tp[1]);
-          (void)pad_info.insert({ToString(p.first), {shape, pad_val}});
-        } else {  // tuple is None
-          (void)pad_info.insert({ToString(p.first), {TensorShape({}), 0}});
-        }
-      }
+      PadInfo pad_info;
+      RETURN_IF_NOT_OK(ParsePadInfo(value, &pad_info));
       (void)builder->SetPaddingMap(pad_info, true);
     }
   }
@@ -1166,5 +1156,31 @@ Status DEPipeline::ParseTextFileOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
   return Status::OK();
 }
 
+Status DEPipeline::ParsePadInfo(py::handle value, PadInfo *pad_info) {
+  for (auto p : py::reinterpret_borrow<py::dict>(value)) {
+    if (!p.second.is_none()) {
+      auto tp = py::reinterpret_borrow<py::tuple>(p.second);
+      CHECK_FAIL_RETURN_UNEXPECTED(tp.size() == 2, "tuple in pad_info must be (list,int) or (list,float)");
+      TensorShape shape = tp[0].is_none() ? TensorShape::CreateUnknownRankShape() : TensorShape(tp[0]);
+      std::shared_ptr<Tensor> pad_val = nullptr;
+      if (py::isinstance<py::str>(tp[1])) {
+        std::string pad_val_string = tp[1].is_none() ? "" : ToString(tp[1]);
+        CHECK_FAIL_RETURN_UNEXPECTED(
+          Tensor::CreateTensor(&pad_val, std::vector<std::string>{pad_val_string}, TensorShape::CreateScalar()),
+          "Cannot create pad_value Tensor");
+      } else {
+        float pad_val_float = tp[1].is_none() ? 0 : ToFloat(tp[1]);
+        CHECK_FAIL_RETURN_UNEXPECTED(Tensor::CreateTensor(&pad_val, TensorImpl::kFlexible, TensorShape::CreateScalar(),
+                                                          DataType(DataType::DE_FLOAT32)),
+                                     "Cannot create pad_value Tensor");
+        pad_val->SetItemAt<float>({}, pad_val_float);
+      }
+      (void)pad_info->insert({ToString(p.first), {shape, pad_val}});
+    } else {  // tuple is None
+      (void)pad_info->insert({ToString(p.first), {TensorShape({}), nullptr}});
+    }
+  }
+  return Status::OK();
+}
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/dataset/api/de_pipeline.h b/mindspore/ccsrc/dataset/api/de_pipeline.h
index 4ecfb080c1b..1c9ab9439b6 100644
--- a/mindspore/ccsrc/dataset/api/de_pipeline.h
+++ b/mindspore/ccsrc/dataset/api/de_pipeline.h
@@ -169,6 +169,8 @@ class DEPipeline {
   // Validate required args passed to storage op.
   Status ValidateArgStorageOp(const py::dict &args);
 
+  static Status ParsePadInfo(py::handle value, PadInfo *pad_info);
+
   int batch_size_;
   int repeat_num_;
   int num_rows_;
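Illustration (not part of the patch): with ParsePadInfo accepting string scalars, the Python-side pad_info dict can now carry str/bytes fill values as well as numbers. A minimal usage sketch, with a hypothetical column name:

    import numpy as np
    import mindspore.dataset as ds

    def gen():
        for i in range(4):
            yield (np.array([str(i)]),)

    data = ds.GeneratorDataset(gen, ["col"])
    # pad "col" to shape [2]; a bytes/str value routes to the string branch above
    data = data.batch(batch_size=2, pad_info={"col": ([2], b"-1")})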
diff --git a/mindspore/ccsrc/dataset/core/data_type.cc b/mindspore/ccsrc/dataset/core/data_type.cc
index 744c8c1ca04..71a510d88f0 100644
--- a/mindspore/ccsrc/dataset/core/data_type.cc
+++ b/mindspore/ccsrc/dataset/core/data_type.cc
@@ -138,7 +138,7 @@ DataType DataType::FromNpArray(const py::array &arr) {
     return DataType(DataType::DE_FLOAT32);
   } else if (py::isinstance<py::array_t<double>>(arr)) {
     return DataType(DataType::DE_FLOAT64);
-  } else if (arr.dtype().kind() == 'S') {
+  } else if (arr.dtype().kind() == 'S' || arr.dtype().kind() == 'U') {
     return DataType(DataType::DE_STRING);
   } else {
     MS_LOG(ERROR) << "Cannot convert from numpy type. Unknown data type is returned!";
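Illustration (not part of the patch): the new 'U' branch matches how numpy types Python text. By default np.array of str data produces a unicode ('U') array; only an explicit dtype='S' yields the byte-string kind the code accepted before:

    import numpy as np

    np.array([b"ab"]).dtype.kind  # 'S' (bytes), already mapped to DE_STRING
    np.array(["ab"]).dtype.kind   # 'U' (unicode), now also mapped to DE_STRING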
diff --git a/mindspore/ccsrc/dataset/core/tensor.cc b/mindspore/ccsrc/dataset/core/tensor.cc
index c986e07089d..aa720585316 100644
--- a/mindspore/ccsrc/dataset/core/tensor.cc
+++ b/mindspore/ccsrc/dataset/core/tensor.cc
@@ -229,7 +229,12 @@ Status Tensor::CreateTensorFromNumpyString(std::shared_ptr<Tensor> *ptr, py::array arr) {
   }
   arr.resize({arr.size()});  // flatten the py::array so we can iterate once
   std::vector<std::string> strings;
-  std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast<py::bytes>(s)); });
+
+  if (arr.dtype().kind() == 'U') {
+    std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast<py::str>(s)); });
+  } else {
+    std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast<py::bytes>(s)); });
+  }
 
   arr.resize(shape);  // resize arr back to the original shape
@@ -699,6 +704,8 @@ Status Tensor::GetDataAsNumpyStrings(py::array *data) {
   for (; itr != end<std::string_view>(); itr++) {
     max = std::max((*itr).length(), max);
   }
+  // if all strings are empty, numpy stores a byte for each string |S1
+  max = (max == 0 ? 1 : max);
   uint64_t total_size = shape_.NumOfElements() * max;
   char *tmp_data = reinterpret_cast<char *>(data_allocator_->allocate(total_size));
   if (tmp_data == nullptr) RETURN_STATUS_UNEXPECTED("Cannot create temp array.");
@@ -708,8 +715,10 @@ Status Tensor::GetDataAsNumpyStrings(py::array *data) {
   itr = begin<std::string_view>();
   uint64_t i = 0;
   for (; itr != end<std::string_view>(); itr++, i++) {
-    ret_code = memcpy_s(tmp_data + i * max, total_size, (*itr).data(), (*itr).length());
-    CHECK_FAIL_RETURN_UNEXPECTED(ret_code == 0, "Failed to copy string data.");
+    if (!(*itr).empty()) {
+      ret_code = memcpy_s(tmp_data + i * max, total_size, (*itr).data(), (*itr).length());
+      CHECK_FAIL_RETURN_UNEXPECTED(ret_code == 0, "Failed to copy string data.");
+    }
   }
   auto strides = shape_.Strides();
   std::transform(strides.begin(), strides.end(), strides.begin(), [&max](const auto &s) { return s * max; });
@@ -847,6 +856,21 @@ Status Tensor::GetStringAt(dsize_t index, uchar **string_start, offset_t *length) const {
   *length = offset_ptr[index + 1] - start - 1;  // -1 to skip the \0 from the string length
   return Status::OK();
 }
+Status Tensor::CopyLastDimAt(const std::shared_ptr<Tensor> &src, const std::vector<dsize_t> &index) {
+  CHECK_FAIL_RETURN_UNEXPECTED(src->type() == type_, "Source Tensor has a different type");
+  CHECK_FAIL_RETURN_UNEXPECTED(index.back() == 0, "Last dim in index should be 0");
+
+  uint8_t type_size = type_.SizeInBytes();
+  size_t len = std::min(src->shape()[-1], shape_[-1]) * type_size;
+  dsize_t src_flat_ind = 0, dst_flat_ind = 0;
+  RETURN_IF_NOT_OK(src->shape().ToFlatIndex(index, &src_flat_ind));
+  RETURN_IF_NOT_OK(shape_.ToFlatIndex(index, &dst_flat_ind));
+
+  const unsigned char *src_addr = src->GetBuffer() + src_flat_ind * type_size;
+  unsigned char *dst_addr = GetMutableBuffer() + dst_flat_ind * type_size;
+  CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s(dst_addr, len, src_addr, len) == 0, "memcpy error");
+  return Status::OK();
+}
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/dataset/core/tensor.h b/mindspore/ccsrc/dataset/core/tensor.h
index 5efd989fc9c..1d176004d74 100644
--- a/mindspore/ccsrc/dataset/core/tensor.h
+++ b/mindspore/ccsrc/dataset/core/tensor.h
@@ -529,6 +529,12 @@ class Tensor {
     return TensorIterator<T>(data_end_);
   }
 
+  // Copies the last dimension at `index` from Tensor `src` to this Tensor.
+  // @param src Tensor
+  // @param index vector to the start of the dimension. The last dim should be 0
+  // @return Status
+  Status CopyLastDimAt(const std::shared_ptr<Tensor> &src, const std::vector<dsize_t> &index);
+
  protected:
   // A function that prints Tensor recursively, first called by print
   // @param out
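Illustration (not part of the patch): the max == 0 guard above mirrors numpy's own rule that a byte-string array always has an itemsize of at least one, even when every element is empty:

    import numpy as np

    np.array(["", ""], dtype='S').dtype  # dtype('S1'): one byte per element although all strings are empty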
diff --git a/mindspore/ccsrc/dataset/core/tensor_shape.h b/mindspore/ccsrc/dataset/core/tensor_shape.h
index 6cfb007b561..27fce91aec9 100644
--- a/mindspore/ccsrc/dataset/core/tensor_shape.h
+++ b/mindspore/ccsrc/dataset/core/tensor_shape.h
@@ -118,7 +118,10 @@ class TensorShape {
 
   bool operator!=(const TensorShape &rhs) const { return !(rhs == *this); }
 
-  dsize_t operator[](const dsize_t index) const { return raw_shape_[index]; }
+  dsize_t operator[](const dsize_t index) const {
+    if (index < 0) return raw_shape_[raw_shape_.size() + index];
+    return raw_shape_[index];
+  }
 
   // Return the Shape as a vector
   // @return
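Illustration (not part of the patch): the extended operator[] adopts Python-style negative indexing, which CopyLastDimAt relies on via shape_[-1]. The semantics in Python terms:

    shape = [2, 3]  # stands in for raw_shape_
    assert shape[-1] == shape[len(shape) + -1] == 3  # the exact expansion operator[] now performs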
diff --git a/mindspore/ccsrc/dataset/engine/datasetops/batch_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/batch_op.cc
index 374128eb217..60643c90ba4 100644
--- a/mindspore/ccsrc/dataset/engine/datasetops/batch_op.cc
+++ b/mindspore/ccsrc/dataset/engine/datasetops/batch_op.cc
@@ -23,6 +23,7 @@
 #include "dataset/engine/data_buffer.h"
 #include "dataset/engine/db_connector.h"
 #include "dataset/engine/opt/pass.h"
+#include "dataset/kernels/data/data_utils.h"
 
 using float16 = Eigen::half;
 
@@ -53,7 +54,7 @@ Status BatchOp::Builder::SanityCheck() {
 
 BatchOp::BatchOp(int32_t batch_size, bool drop, bool pad, int32_t op_queue_size, int32_t num_workers,
                  const std::vector<std::string> &cols_to_map, py::function batch_size_func, py::function batch_map_func,
-                 std::map<std::string, std::pair<TensorShape, float>> pad_map)
+                 PadInfo pad_map)
     : ParallelOp(num_workers, op_queue_size),
       start_batch_size_(batch_size),
       drop_(drop),
@@ -75,10 +76,6 @@ Status BatchOp::operator()() {
   std::unique_ptr<TensorQTable> table = std::make_unique<TensorQTable>();
   child_iterator_ = std::make_unique<ChildIterator>(this, 0, 0);
   RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
-  for (const auto &t : new_row) {
-    CHECK_FAIL_RETURN_UNEXPECTED(t->type().IsNumeric(),
-                                 "[Batch ERROR] Batch does not support Tensor of type string yet.");
-  }
   RETURN_IF_NOT_OK(DatasetOp::AssignColMapFromChild());  // must come after the first fetch above
   int32_t cur_batch_size = 0;
   RETURN_IF_NOT_OK(GetBatchSize(&cur_batch_size, CBatchInfo(0, 0, 0)));
@@ -134,49 +131,57 @@ void BatchOp::Print(std::ostream &out, bool show_all) const {
   }
 }
 
-Status BatchOp::BatchRows(const std::unique_ptr<TensorQTable> *source_table,
-                          const std::unique_ptr<TensorQTable> *dest_table, size_t batch_size) {
-  if ((*source_table)->size() < batch_size || (*source_table)->size() == 0) {
-    RETURN_STATUS_UNEXPECTED("[Internal Batch ERROR] Insufficient rows in source_table\n");
+Status BatchOp::BatchRows(const std::unique_ptr<TensorQTable> *src, const std::unique_ptr<TensorQTable> *dest,
+                          dsize_t batch_size) {
+  if ((*src)->size() != batch_size) {
+    RETURN_STATUS_UNEXPECTED("[Internal Batch ERROR] Source table size does not match the batch_size");
   }
-  TensorRow row = std::move((*source_table)->front());
-  (*source_table)->pop_front();
+
   if (batch_size == 1) {
-    for (std::shared_ptr<Tensor> tensor : row) {
+    TensorRow row = std::move((*src)->front());
+    (*src)->pop_front();
+    (*dest)->push_back(row);
+    for (const auto &tensor : (*dest)->front()) {
       RETURN_IF_NOT_OK(tensor->ExpandDim(0));
     }
-    (*dest_table)->push_back(row);
-  } else {  // batch_size > 1
-    std::vector<TensorShape> row_shapes;
-    TensorRow batched_row;
-    for (size_t i = 0; i < row.size(); i++) {  // Handle the first row popped
-      row_shapes.push_back(row[i]->shape());
-      std::shared_ptr<Tensor> ts;
-      RETURN_IF_NOT_OK(Tensor::CreateTensor(
-        &ts, TensorImpl::kFlexible, row[i]->shape().PrependDim(static_cast<dsize_t>(batch_size)), row[i]->type()));
-      batched_row.emplace_back(ts);
-      RETURN_IF_NOT_OK(batched_row[i]->InsertTensor(std::vector<dsize_t>(1, 0), row[i]));  // {j} = 0
-    }
-    for (size_t j = 1; j < batch_size; j++) {  // Handle the rest of the rows
-      row = std::move((*source_table)->front());
-      (*source_table)->pop_front();
-      for (size_t i = 0; i < row.size(); i++) {
-        if (row[i]->shape() == row_shapes[i]) {  // check the newly popped rows have the same dim as the first
-          RETURN_IF_NOT_OK(batched_row[i]->InsertTensor(std::vector<dsize_t>(1, j), row[i]));
+    return Status::OK();
+  }
+
+  TensorRow batched_row;
+  auto num_columns = (*src)->front().size();
+  for (size_t i = 0; i < num_columns; i++) {
+    std::shared_ptr<Tensor> first_tensor = (*src)->at(0).at(i);  // first row, column i
+    TensorShape first_shape = first_tensor->shape();
+    DataType first_type = first_tensor->type();
+    TensorShape new_shape = first_shape.PrependDim(static_cast<dsize_t>(batch_size));
+
+    std::shared_ptr<Tensor> new_tensor;
+    if (first_type.IsNumeric()) {  // numeric tensor
+      RETURN_IF_NOT_OK(Tensor::CreateTensor(&new_tensor, TensorImpl::kFlexible, new_shape, first_type));
+      dsize_t j = 0;
+      for (auto row : **src) {
+        std::shared_ptr<Tensor> old_tensor = row.at(i);  // row j, column i
+        if (old_tensor->shape() == first_shape) {  // check the newly popped rows have the same dim as the first
+          RETURN_IF_NOT_OK(new_tensor->InsertTensor({j++}, old_tensor));
         } else {
-          std::string column_name;
-          for (auto itr : column_name_id_map_) {
-            if (static_cast<size_t>(itr.second) == i) {
-              column_name = itr.first;
-              break;
-            }
-          }
-          RETURN_STATUS_UNEXPECTED("[Batch ERROR] Inconsistent TensorShapes of Column " + column_name);
+          RETURN_STATUS_UNEXPECTED("[Batch ERROR] Inconsistent TensorShapes of Column " + std::to_string(i));
         }
       }
+    } else {  // handle string column differently
+      std::vector<std::string> strings;
+      for (dsize_t j = 0; j < batch_size; j++) {
+        std::shared_ptr<Tensor> old_tensor = (*src)->at(j).at(i);
+        for (auto itr = old_tensor->begin<std::string_view>(); itr != old_tensor->end<std::string_view>(); itr++) {
+          strings.emplace_back(*itr);
+        }
+      }
+      RETURN_IF_NOT_OK(Tensor::CreateTensor(&new_tensor, strings, new_shape));
     }
-    (*dest_table)->emplace_back(batched_row);
+    batched_row.emplace_back(new_tensor);
   }
+
+  (*dest)->emplace_back(batched_row);
+
   return Status::OK();
 }
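Illustration (not part of the patch): for string columns, the new BatchRows collects every element of the batch in row-major order and builds one tensor with the batch dimension prepended. A rough numpy sketch of that branch, with a helper name of my own; it assumes every row matches the first row's shape, which the numeric branch checks explicitly:

    import numpy as np

    def batch_string_column(rows):
        # flatten each row's elements in order, then prepend the batch dimension
        flat = [s for row in rows for s in row.ravel()]
        return np.array(flat).reshape((len(rows),) + rows[0].shape)

    batch_string_column([np.array(["ab", "cde"]), np.array(["x", "km"])])
    # array([['ab', 'cde'],
    #        ['x', 'km']], dtype='<U3')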
@@ -202,8 +207,8 @@ Status BatchOp::WorkerEntry(int32_t workerId) {
 Status BatchOp::MakeBatchedBuffer(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> table_pair,
                                   std::unique_ptr<DataBuffer> *db) {
   RETURN_UNEXPECTED_IF_NULL(table_pair.first);
-  if (!pyfunc_column_names_.empty()) RETURN_IF_NOT_OK(MapColumns(&table_pair));  // pass it through pyfunc
-  if (pad_) RETURN_IF_NOT_OK(PadColumns(&table_pair));  // do padding if needed
+  if (!pyfunc_column_names_.empty()) RETURN_IF_NOT_OK(MapColumns(&table_pair));  // pass it through pyfunc
+  if (pad_) RETURN_IF_NOT_OK(PadColumns(&table_pair.first, pad_info_, column_name_id_map_));  // do padding if needed
   (*db) = std::make_unique<DataBuffer>(table_pair.second.batch_num_, DataBuffer::kDeBFlagNone);
   std::unique_ptr<TensorQTable> dest_table = std::make_unique<TensorQTable>();
   RETURN_IF_NOT_OK(BatchRows(&table_pair.first, &dest_table, table_pair.first->size()));
@@ -333,74 +338,27 @@ Status BatchOp::InvokeBatchMapFunc(TensorBatchTable *input, TensorBatchTable *output, CBatchInfo info) {
   return Status(StatusCode::kOK);
 }
 
-Status BatchOp::PadTensor(std::shared_ptr<Tensor> src, std::shared_ptr<Tensor> *dst,
-                          const std::vector<dsize_t> &pad_shape, float pad_val) {
-  CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr");
-  if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) {
-    (*dst) = src;  // if no padding, copy the pointer
-  } else {
-    CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed");
-    RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, TensorImpl::kFlexible, TensorShape(pad_shape), src->type()));
-    auto tensor_type = src->type().value();
-    if (pad_val == 0) {  // if pad with zero, don't care what type it is
-      RETURN_IF_NOT_OK((*dst)->Zero());
-    } else if (tensor_type == DataType::DE_INT8) {
-      RETURN_IF_NOT_OK((*dst)->Fill<int8_t>(pad_val));
-    } else if (tensor_type == DataType::DE_BOOL) {
-      RETURN_IF_NOT_OK((*dst)->Fill<bool>(pad_val));
-    } else if (tensor_type == DataType::DE_UINT8) {
-      RETURN_IF_NOT_OK((*dst)->Fill<uint8_t>(pad_val));
-    } else if (tensor_type == DataType::DE_INT16) {
-      RETURN_IF_NOT_OK((*dst)->Fill<int16_t>(pad_val));
-    } else if (tensor_type == DataType::DE_FLOAT16) {
-      RETURN_IF_NOT_OK((*dst)->Fill<float16>(static_cast<float16>(pad_val)));
-    } else if (tensor_type == DataType::DE_UINT16) {
-      RETURN_IF_NOT_OK((*dst)->Fill<uint16_t>(pad_val));
-    } else if (tensor_type == DataType::DE_INT32) {
-      RETURN_IF_NOT_OK((*dst)->Fill<int32_t>(pad_val));
-    } else if (tensor_type == DataType::DE_UINT32) {
-      RETURN_IF_NOT_OK((*dst)->Fill<uint32_t>(pad_val));
-    } else if (tensor_type == DataType::DE_INT64) {
-      RETURN_IF_NOT_OK((*dst)->Fill<int64_t>(pad_val));
-    } else if (tensor_type == DataType::DE_UINT64) {
-      RETURN_IF_NOT_OK((*dst)->Fill<uint64_t>(pad_val));
-    } else if (tensor_type == DataType::DE_FLOAT32) {
-      RETURN_IF_NOT_OK((*dst)->Fill<float>(pad_val));
-    } else if (tensor_type == DataType::DE_FLOAT64) {
-      RETURN_IF_NOT_OK((*dst)->Fill<double>(pad_val));
-    } else {
-      RETURN_STATUS_UNEXPECTED("Incorrect/Unknown tensor type");
-    }
-    std::vector<dsize_t> cur_ind(src->Rank(), 0), src_s(src->Rank(), 1), dst_s(src->Rank(), 1);
-    for (dsize_t i = src->Rank() - 2; i >= 0; i--) {
-      src_s[i] = src->shape()[i + 1] * src_s[i + 1];
-      dst_s[i] = pad_shape[i + 1] * dst_s[i + 1];
-    }
-    RETURN_IF_NOT_OK(PadHelper(src, *dst, cur_ind, src_s, dst_s, 0));
-  }
-  return Status::OK();
-}  // namespace dataset
-
-Status BatchOp::PadColumns(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> *table_pair) {
-  RETURN_UNEXPECTED_IF_NULL(table_pair);  // placeholder for now, might need this in the future
-  CHECK_FAIL_RETURN_UNEXPECTED(table_pair->first->front().size() == column_name_id_map_.size(),
-                               "col_name_map mismatch");
-  std::vector<float> pad_vals(column_name_id_map_.size(), 0);  // value to pad each column's tensor with, default 0
+Status BatchOp::PadColumns(std::unique_ptr<TensorQTable> *table, const PadInfo &pad_info,
+                           const std::unordered_map<std::string, int32_t> &column_name_id_map) {
+  RETURN_UNEXPECTED_IF_NULL(table);  // placeholder for now, might need this in the future
+  CHECK_FAIL_RETURN_UNEXPECTED((*table)->front().size() == column_name_id_map.size(), "col_name_map mismatch");
+  std::vector<std::shared_ptr<Tensor>> pad_vals(column_name_id_map.size(),
+                                                0);  // value to pad each column's tensor with, default 0
   std::set<int32_t> pad_cols;
   // padded_shape provided by user, maximum shapes of current batch of tensors
-  std::vector<std::vector<dsize_t>> pad_shapes(column_name_id_map_.size()), max_shapes(column_name_id_map_.size());
-  RETURN_IF_NOT_OK(UnpackPadInfo(&pad_cols, &pad_vals, &pad_shapes));
+  std::vector<std::vector<dsize_t>> pad_shapes(column_name_id_map.size()), max_shapes(column_name_id_map.size());
+  RETURN_IF_NOT_OK(UnpackPadInfo(pad_info, column_name_id_map, &pad_cols, &pad_vals, &pad_shapes));
 
   // init each shape in max_shape to {-1,-1...} init each unspecified shape in pad_shape to -1 as well
   for (size_t col_id : pad_cols) {
-    max_shapes[col_id] = std::vector<dsize_t>(table_pair->first->front()[col_id]->Rank(), -1);
+    max_shapes[col_id] = std::vector<dsize_t>((*table)->front()[col_id]->Rank(), -1);
     if (pad_shapes[col_id].empty()) pad_shapes[col_id] = max_shapes[col_id];  // fill pad shape with -1
     CHECK_FAIL_RETURN_UNEXPECTED(pad_shapes[col_id].size() == max_shapes[col_id].size(), "wrong rank in pad_shape");
   }
 
   // calculate maximum shape for each column that needs to be padded
-  for (const TensorRow &row : *(table_pair->first)) {  // iterator each row in a batch
-    for (size_t col_id : pad_cols) {  // iterator each tensor in a row
+  for (const TensorRow &row : **table) {  // iterate over each row in a batch
+    for (size_t col_id : pad_cols) {      // iterate over each tensor in a row
       CHECK_FAIL_RETURN_UNEXPECTED(row[col_id]->Rank() == max_shapes[col_id].size(),
                                    "Tensor to be padded together need to have the same rank");
       for (size_t dim = 0; dim < row[col_id]->Rank(); dim++) {  // pick the largest number in each dimension
@@ -417,27 +375,29 @@ Status BatchOp::PadColumns(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> *table_pair) {
   }
 
   // call pad on each tensor that needs to be padded
-  for (TensorRow &row : *(table_pair->first)) {
+  for (TensorRow &row : **table) {
     for (size_t col_id : pad_cols) {
       std::shared_ptr<Tensor> pad_tensor;
-      RETURN_IF_NOT_OK(PadTensor(row[col_id], &pad_tensor, pad_shapes[col_id], pad_vals[col_id]));
+      RETURN_IF_NOT_OK(PadEnd(row[col_id], &pad_tensor, pad_shapes[col_id], pad_vals[col_id]));
       row[col_id] = pad_tensor;
     }
   }
   return Status::OK();
 }
 
-Status BatchOp::UnpackPadInfo(std::set<int32_t> *pad_cols, std::vector<float> *pad_vals,
+Status BatchOp::UnpackPadInfo(const PadInfo &pad_info,
+                              const std::unordered_map<std::string, int32_t> &column_name_id_map,
+                              std::set<int32_t> *pad_cols, std::vector<std::shared_ptr<Tensor>> *pad_vals,
                               std::vector<std::vector<dsize_t>> *pad_shapes) {
-  if (pad_info_.empty()) {  // if pad_info empty, pad every columns automatically
-    for (dsize_t col_id = 0; col_id < column_name_id_map_.size(); col_id++) {
+  if (pad_info.empty()) {  // if pad_info empty, pad every column automatically
+    for (dsize_t col_id = 0; col_id < column_name_id_map.size(); col_id++) {
       pad_cols->insert(col_id);
     }
   } else {
-    for (auto p : pad_info_) {
-      CHECK_FAIL_RETURN_UNEXPECTED(column_name_id_map_.find(p.first) != column_name_id_map_.end(),
-                                   "no column exists with name:" + p.first);
-      dsize_t col_id = static_cast<dsize_t>(column_name_id_map_[p.first]);
+    for (const auto &p : pad_info) {
+      auto location = column_name_id_map.find(p.first);
+      CHECK_FAIL_RETURN_UNEXPECTED(location != column_name_id_map.end(), "no column exists with name:" + p.first);
+      auto col_id = static_cast<dsize_t>(location->second);
       CHECK_FAIL_RETURN_UNEXPECTED(col_id < pad_vals->size() && col_id < pad_shapes->size(), "col_id out of bound");
       pad_cols->insert(col_id);
       (*pad_vals)[col_id] = p.second.second;  // set pad values
@@ -447,29 +407,6 @@ Status BatchOp::UnpackPadInfo(std::set<int32_t> *pad_cols, std::vector<float> *pad_vals,
   return Status::OK();
 }
 
-Status BatchOp::PadHelper(std::shared_ptr<Tensor> src, std::shared_ptr<Tensor> dst, std::vector<dsize_t> cur_ind,
-                          const std::vector<dsize_t> &src_s, const std::vector<dsize_t> &dst_s, size_t cur_dim) {
-  if (cur_dim == src->Rank() - 1) {  // if this is the last dimension, copy the data
-    uint8_t type_size = src->type().SizeInBytes();
-    size_t len = std::min(src->shape()[cur_dim], dst->shape()[cur_dim]) * type_size;
-    dsize_t src_flat_ind = 0, dst_flat_ind = 0;
-    for (size_t i = 0; i < src->Rank(); i++) {
-      src_flat_ind += src_s[i] * cur_ind[i];
-      dst_flat_ind += dst_s[i] * cur_ind[i];
-    }
-    unsigned char *src_addr = src->GetMutableBuffer() + src_flat_ind * type_size;
-    unsigned char *dst_addr = dst->GetMutableBuffer() + dst_flat_ind * type_size;
-    CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s(dst_addr, len, src_addr, len) == 0, "memcpy error");
-  } else {  // not the last dimension, keep doing recursion
-    dsize_t min_ind = std::min(dst->shape()[cur_dim], src->shape()[cur_dim]);
-    for (dsize_t i = 0; i < min_ind; i++) {
-      cur_ind[cur_dim] = i;
-      RETURN_IF_NOT_OK(PadHelper(src, dst, cur_ind, src_s, dst_s, cur_dim + 1));
-    }
-  }
-  return Status::OK();
-}
-
 // Visitor accept method for NodePass
 Status BatchOp::Accept(NodePass *p, bool *modified) {
   // Downcast shared pointer then call visitor
diff --git a/mindspore/ccsrc/dataset/engine/datasetops/batch_op.h b/mindspore/ccsrc/dataset/engine/datasetops/batch_op.h
index 1a862acd0bd..d1d7e232cf4 100644
--- a/mindspore/ccsrc/dataset/engine/datasetops/batch_op.h
+++ b/mindspore/ccsrc/dataset/engine/datasetops/batch_op.h
@@ -38,6 +38,7 @@ class DataBuffer;
 
 using TensorBatch = std::vector<std::shared_ptr<Tensor>>;
 using TensorBatchTable = std::vector<TensorBatch>;
+using PadInfo = std::map<std::string, std::pair<TensorShape, std::shared_ptr<Tensor>>>;
 
 class BatchOp : public ParallelOp {
  public:
@@ -66,7 +67,7 @@ class BatchOp : public ParallelOp {
       return *this;
     }
 
-    Builder &SetPaddingMap(const std::map<std::string, std::pair<TensorShape, float>> &pad_map, bool pad = true) {
+    Builder &SetPaddingMap(const PadInfo &pad_map, bool pad = true) {
       builder_pad_ = pad;
       builder_pad_map_ = pad_map;
       return *this;
@@ -119,7 +120,7 @@ class BatchOp : public ParallelOp {
     int32_t builder_num_workers_;
    int32_t builder_op_connector_size_;
     std::vector<std::string> builder_cols_to_map_;
-    std::map<std::string, std::pair<TensorShape, float>> builder_pad_map_;
+    PadInfo builder_pad_map_;
     py::function builder_batch_size_func_;
     py::function builder_batch_map_func_;
   };
@@ -150,8 +151,7 @@ class BatchOp : public ParallelOp {
   // @param int32_t rows_per_buf
   // @param int32_t num_workers
   BatchOp(int32_t batch_size, bool drop, bool pad, int32_t op_queue_size, int32_t num_workers,
-          const std::vector<std::string> &, py::function batch_size_func, py::function batch_map_func,
-          std::map<std::string, std::pair<TensorShape, float>> pad_map);
+          const std::vector<std::string> &, py::function batch_size_func, py::function batch_map_func, PadInfo pad_map);
 
   // BatchOp destructor
   ~BatchOp() {}
@@ -183,15 +183,6 @@ class BatchOp : public ParallelOp {
   // @return Status - The error code return
   Status operator()() override;
 
-  // Pad input tensor according pad_shape, need to have same rank.
-  // @param std::shared_ptr<Tensor> src - tensor to pad from
-  // @param std::shared_ptr<Tensor> *dst - return tensor padded
-  // @param std::vector<dsize_t> pad_shape - shape to pad to
-  // @param float pad_val - value to pad with
-  // @return - The error code return
-  Status PadTensor(std::shared_ptr<Tensor> src, std::shared_ptr<Tensor> *dst, const std::vector<dsize_t> &pad_shape,
-                   float pad_val);
-
   // Base-class override for NodePass visitor acceptor.
   // @param p - Pointer to the NodePass to be accepted.
   // @param modified - Whether this node visit modified the pipeline.
@@ -199,18 +190,6 @@ class BatchOp : public ParallelOp {
   Status Accept(NodePass *p, bool *modified) override;
 
  private:
-  // recursive helper function. This function could be very expensive if called on a multi-dimensional tensor
-  // it is only meant to be called by PadTensor.
-  // @tparam T - type of tensor and fill value
-  // @param std::shared_ptr<Tensor> src - Tensor to pad from
-  // @param std::shared_ptr<Tensor>* dst - Tensor to pad to, return value
-  // @param std::vector<dsize_t> cur_ind - recursion helper
-  // @param T pad_val - value to pad tensor with
-  // @param size_t cur_dim - recursion helper
-  // @return Status - The error code return
-  Status PadHelper(std::shared_ptr<Tensor> src, std::shared_ptr<Tensor> dst, std::vector<dsize_t> cur_ind,
-                   const std::vector<dsize_t> &src_s, const std::vector<dsize_t> &dst_s, size_t cur_dim = 0);
-
   // Worker thread for doing the memcpy of batch
   // @param int32_t param workerId
   // @return Status - The error code return
@@ -225,23 +204,33 @@ class BatchOp : public ParallelOp {
   // @param const std::unique_ptr<TensorQTable> *src - table that has the rows for batching
   // @param const std::unique_ptr<TensorQTable> *dest - dest_table to hold batched rows
   // @param int32_t size - batch_size
+  // @param const std::unordered_map<std::string, int32_t>& column_name_id_map - column names to index mapping
   // @return Status - The error code return
-  Status BatchRows(const std::unique_ptr<TensorQTable> *src, const std::unique_ptr<TensorQTable> *dest, size_t size);
+  static Status BatchRows(const std::unique_ptr<TensorQTable> *src, const std::unique_ptr<TensorQTable> *dest,
+                          dsize_t batch_size);
 
   // Function that calls pyfunc to perform map on batch
   // @param (std::pair<std::unique_ptr<TensorQTable>, batch_stats> *table_pair - contains un-batched tensor
   // @return Status - The error code return
   Status MapColumns(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> *table_pair);
 
+  // @param const PadInfo &pad_info pad info to unpack
+  // @param const std::unordered_map<std::string, int32_t>& column_name_id_map - column names to index mapping
   // @param std::set<int32_t> *cols, col ids to perform pad on
   // @param std::vector<std::shared_ptr<Tensor>> *vals, default padding value for each column
   // @param std::vector<std::vector<dsize_t>> *shapes, padding shape specified by user
   // @return Status - The error code return
-  Status UnpackPadInfo(std::set<int32_t> *cols, std::vector<float> *vals, std::vector<std::vector<dsize_t>> *shapes);
+  static Status UnpackPadInfo(const PadInfo &pad_info,
+                              const std::unordered_map<std::string, int32_t> &column_name_id_map,
+                              std::set<int32_t> *pad_cols, std::vector<std::shared_ptr<Tensor>> *pad_vals,
+                              std::vector<std::vector<dsize_t>> *pad_shapes);
 
-  // @param table_pair
+  // @param table
+  // @param const PadInfo &pad_info pad info
+  // @param const std::unordered_map<std::string, int32_t>& column_name_id_map - column names to index mapping
   // @return Status - The error code return
-  Status PadColumns(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> *table_pair);
+  static Status PadColumns(std::unique_ptr<TensorQTable> *table, const PadInfo &pad_info,
+                           const std::unordered_map<std::string, int32_t> &column_name_id_map);
 
   // the number of thread pulling from the mOutConnector of the Op below
   // @return int32_t, 1
@@ -264,11 +253,11 @@ class BatchOp : public ParallelOp {
   Status InvokeBatchMapFunc(TensorTable *input, TensorTable *output, CBatchInfo info);
 
   int32_t start_batch_size_;
-  bool drop_;                                    // bool for whether to drop remainder or not
-  bool pad_;                                     // bool for whether to perform padding on tensor
-  std::vector<std::string> pyfunc_column_names_; // Name of the columns to perform map op on
-  std::map<std::string, std::pair<TensorShape, float>> pad_info_;  // column names to perform padding on
-  std::unique_ptr<ChildIterator> child_iterator_;  // child iterator for fetching TensorRows 1 by 1
+  bool drop_;                                      // bool for whether to drop remainder or not
+  bool pad_;                                       // bool for whether to perform padding on tensor
+  std::vector<std::string> pyfunc_column_names_;   // Name of the columns to perform map op on
+  PadInfo pad_info_;                               // column names to perform padding on
+  std::unique_ptr<ChildIterator> child_iterator_;  // child iterator for fetching TensorRows 1 by 1
   QueueList<std::pair<std::unique_ptr<TensorQTable>, CBatchInfo>> worker_queues_;  // internal queue for syncing worker
   py::function batch_size_func_;  // Function pointer of batch size function
   py::function batch_map_func_;   // Function pointer of per batch map function
diff --git a/mindspore/ccsrc/dataset/kernels/data/data_utils.cc b/mindspore/ccsrc/dataset/kernels/data/data_utils.cc
index f2635c1fe34..c20d9a4c757 100644
--- a/mindspore/ccsrc/dataset/kernels/data/data_utils.cc
+++ b/mindspore/ccsrc/dataset/kernels/data/data_utils.cc
@@ -15,6 +15,8 @@
  */
 #include "dataset/kernels/data/data_utils.h"
+#include <algorithm>
+#include <string>
 #include <vector>
 #include "dataset/core/constants.h"
 #include "dataset/core/tensor.h"
@@ -220,5 +222,125 @@ Status ToFloat16(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
 
   return Status::OK();
 }
+
+Status PadEnd(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst, const std::vector<dsize_t> &pad_shape,
+              const std::shared_ptr<Tensor> &pad_val) {
+  if (pad_val == nullptr) {
+    if (src->type().IsNumeric()) {
+      return PadEndNumeric(src, dst, pad_shape, 0);
+    } else {
+      return PadEndString(src, dst, pad_shape, "");
+    }
+  }
+  if (pad_val->type().IsNumeric()) {
+    float val = 0;
+    RETURN_IF_NOT_OK(pad_val->GetItemAt<float>(&val, {}));
+    return PadEndNumeric(src, dst, pad_shape, val);
+  }
+  std::string_view val;
+  RETURN_IF_NOT_OK(pad_val->GetItemAt(&val, {}));
+  return PadEndString(src, dst, pad_shape, std::string(val));
+}
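Illustration (not part of the patch): PadEnd's dispatch can be summarized in a few lines of Python. A missing pad value defaults to 0 or "" depending on the tensor type, and otherwise the scalar's own type selects the numeric or string path. A behavioural sketch with numpy, not the C++ implementation itself:

    import numpy as np

    def pad_end(src, pad_shape, pad_val=None):
        src = np.asarray(src)
        if pad_val is None:  # null pad value: type-based default
            pad_val = 0 if np.issubdtype(src.dtype, np.number) else ""
        dtype = object if isinstance(pad_val, (str, bytes)) else src.dtype
        dst = np.full(pad_shape, pad_val, dtype=dtype)
        dst[tuple(slice(0, d) for d in src.shape)] = src  # copy src into the leading corner
        return dst

    pad_end([[1, 2]], (2, 3), -1)
    # array([[ 1,  2, -1],
    #        [-1, -1, -1]])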
+
+Status PadEndNumeric(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
+                     const std::vector<dsize_t> &pad_shape, float pad_val) {
+  CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr");
+  if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) {
+    (*dst) = src;  // if no padding, copy the pointer
+  } else {
+    CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed");
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, TensorImpl::kFlexible, TensorShape(pad_shape), src->type()));
+    auto tensor_type = src->type().value();
+    if (pad_val == 0) {  // if pad with zero, don't care what type it is
+      RETURN_IF_NOT_OK((*dst)->Zero());
+    } else if (tensor_type == DataType::DE_INT8) {
+      RETURN_IF_NOT_OK((*dst)->Fill<int8_t>(pad_val));
+    } else if (tensor_type == DataType::DE_BOOL) {
+      RETURN_IF_NOT_OK((*dst)->Fill<bool>(pad_val));
+    } else if (tensor_type == DataType::DE_UINT8) {
+      RETURN_IF_NOT_OK((*dst)->Fill<uint8_t>(pad_val));
+    } else if (tensor_type == DataType::DE_INT16) {
+      RETURN_IF_NOT_OK((*dst)->Fill<int16_t>(pad_val));
+    } else if (tensor_type == DataType::DE_FLOAT16) {
+      RETURN_IF_NOT_OK((*dst)->Fill<float16>(static_cast<float16>(pad_val)));
+    } else if (tensor_type == DataType::DE_UINT16) {
+      RETURN_IF_NOT_OK((*dst)->Fill<uint16_t>(pad_val));
+    } else if (tensor_type == DataType::DE_INT32) {
+      RETURN_IF_NOT_OK((*dst)->Fill<int32_t>(pad_val));
+    } else if (tensor_type == DataType::DE_UINT32) {
+      RETURN_IF_NOT_OK((*dst)->Fill<uint32_t>(pad_val));
+    } else if (tensor_type == DataType::DE_INT64) {
+      RETURN_IF_NOT_OK((*dst)->Fill<int64_t>(pad_val));
+    } else if (tensor_type == DataType::DE_UINT64) {
+      RETURN_IF_NOT_OK((*dst)->Fill<uint64_t>(pad_val));
+    } else if (tensor_type == DataType::DE_FLOAT32) {
+      RETURN_IF_NOT_OK((*dst)->Fill<float>(pad_val));
+    } else if (tensor_type == DataType::DE_FLOAT64) {
+      RETURN_IF_NOT_OK((*dst)->Fill<double>(pad_val));
+    } else {
+      RETURN_STATUS_UNEXPECTED("Incorrect/Unknown tensor type");
+    }
+    std::vector<dsize_t> cur_ind(src->Rank(), 0);
+    RETURN_IF_NOT_OK(PadEndNumericHelper(src, *dst, cur_ind, 0));
+  }
+  return Status::OK();
+}
+Status PadEndNumericHelper(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> dst,
+                           std::vector<dsize_t> cur_ind, size_t cur_dim) {
+  if (cur_dim == src->Rank() - 1) {  // if this is the last dimension, copy the data
+    dst->CopyLastDimAt(src, cur_ind);
+  } else {  // not the last dimension, keep doing recursion
+    dsize_t min_ind = std::min(dst->shape()[cur_dim], src->shape()[cur_dim]);
+    for (dsize_t i = 0; i < min_ind; i++) {
+      cur_ind[cur_dim] = i;
+      RETURN_IF_NOT_OK(PadEndNumericHelper(src, dst, cur_ind, cur_dim + 1));
+    }
+  }
+  return Status::OK();
+}
+
+Status PadEndString(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
+                    const std::vector<dsize_t> &pad_shape, const std::string &pad_val) {
+  CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr");
+  if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) {
+    (*dst) = src;  // if no padding, copy the pointer
+  } else {
+    CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed");
+    std::vector<dsize_t> cur_ind(src->Rank(), 0);
+    std::vector<std::string> strings;
+    RETURN_IF_NOT_OK(PadEndStringHelper(src, &strings, TensorShape(pad_shape), cur_ind, 0, pad_val));
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, strings, TensorShape(pad_shape)));
+  }
+  return Status::OK();
+}
+
+Status PadEndStringHelper(const std::shared_ptr<Tensor> &src, std::vector<std::string> *dst,
+                          const TensorShape &dst_shape, std::vector<dsize_t> cur_ind, size_t cur_dim,
+                          const std::string &pad_value) {
+  if (cur_dim == src->Rank() - 1) {  // if this is the last dimension, copy the data
+    dsize_t min_ind = std::min(dst_shape[cur_dim], src->shape()[cur_dim]);
+    for (dsize_t i = 0; i < min_ind; i++) {
+      cur_ind[cur_dim] = i;
+      std::string_view item;
+      RETURN_IF_NOT_OK(src->GetItemAt(&item, cur_ind));
+      dst->emplace_back(item);
+    }
+    for (dsize_t i = min_ind; i < dst_shape[cur_dim]; i++) {
+      dst->emplace_back(pad_value);
+    }
+
+  } else {  // not the last dimension, keep doing recursion
+    dsize_t min_ind = std::min(dst_shape[cur_dim], src->shape()[cur_dim]);
+    for (dsize_t i = 0; i < min_ind; i++) {
+      cur_ind[cur_dim] = i;
+      RETURN_IF_NOT_OK(PadEndStringHelper(src, dst, dst_shape, cur_ind, cur_dim + 1, pad_value));
+    }
+    dsize_t count = (dst_shape[cur_dim] - min_ind) * dst_shape.Strides()[cur_dim];
+    for (dsize_t i = 0; i < count; i++) {
+      dst->emplace_back(pad_value);
+    }
+  }
+  return Status::OK();
+}
 }  // namespace dataset
 }  // namespace mindspore
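Illustration (not part of the patch): reusing the pad_end sketch from above on strings shows the behaviour PadEndString produces. Source elements keep their positions and every slot beyond the source extent holds the pad value:

    pad_end(np.array([["100"], ["200"]]), (2, 2), "-2")
    # array([['100', '-2'],
    #        ['200', '-2']], dtype=object)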
diff --git a/mindspore/ccsrc/dataset/kernels/data/data_utils.h b/mindspore/ccsrc/dataset/kernels/data/data_utils.h
index 2bd168a1fec..a0eb15d6d9f 100644
--- a/mindspore/ccsrc/dataset/kernels/data/data_utils.h
+++ b/mindspore/ccsrc/dataset/kernels/data/data_utils.h
@@ -17,6 +17,7 @@
 #define DATASET_KERNELS_DATA_DATA_UTILS_H_
 
 #include <memory>
+#include <string>
 #include <vector>
 #include "dataset/core/constants.h"
 #include "dataset/core/cv_tensor.h"
@@ -58,6 +59,59 @@ void Cast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output);
 
 Status ToFloat16(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output);
 
 Status TypeCast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, const DataType &data_type);
+
+// Pad input tensor according to pad_shape, which needs to have the same rank.
+// Based on the type of the input tensor, PadEndNumeric/String will be called.
+// @param std::shared_ptr<Tensor> src - tensor to pad from
+// @param std::shared_ptr<Tensor> *dst - return tensor padded
+// @param std::vector<dsize_t> pad_shape - shape to pad to
+// @param std::shared_ptr<Tensor> pad_val - value to pad with in Tensor format
+// @return - The error code return
+Status PadEnd(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst, const std::vector<dsize_t> &pad_shape,
+              const std::shared_ptr<Tensor> &pad_val);
+
+// Pad input numeric tensor according to pad_shape, which needs to have the same rank.
+// @param std::shared_ptr<Tensor> src - tensor to pad from
+// @param std::shared_ptr<Tensor> *dst - return tensor padded
+// @param std::vector<dsize_t> pad_shape - shape to pad to
+// @param float pad_val - value to pad with
+// @return - The error code return
+Status PadEndNumeric(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
+                     const std::vector<dsize_t> &pad_shape, float pad_val);
+
+// Recursive helper function for padding numeric tensors. This function could be very expensive if called on a
+// multi-dimensional tensor; it is only meant to be called by PadEndNumeric.
+// @param std::shared_ptr<Tensor> src - Tensor to pad from
+// @param std::shared_ptr<Tensor> dst - Tensor to pad to, return value
+// @param std::vector<dsize_t> cur_ind - recursion helper
+// @param size_t cur_dim - recursion helper
+// @return Status - The error code return
+Status PadEndNumericHelper(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> dst,
+                           std::vector<dsize_t> cur_ind, size_t cur_dim = 0);
+
+// Pad input string tensor according to pad_shape, which needs to have the same rank.
+// @param std::shared_ptr<Tensor> src - tensor to pad from
+// @param std::shared_ptr<Tensor> *dst - return tensor padded
+// @param std::vector<dsize_t> pad_shape - shape to pad to
+// @param std::string pad_val - value to pad with
+// @return - The error code return
+Status PadEndString(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
+                    const std::vector<dsize_t> &pad_shape, const std::string &pad_val);
+
+// Recursive helper function for padding string tensors. This function could be very expensive if called on a
+// multi-dimensional tensor; it is only meant to be called by PadEndString.
+// @param std::shared_ptr<Tensor> src - Tensor to pad from
+// @param std::vector<std::string> *dst - flattened output strings, return value
+// @param TensorShape dst_shape - shape to pad to
+// @param std::vector<dsize_t> cur_ind - recursion helper
+// @param size_t cur_dim - recursion helper
+// @param std::string pad_value - value to pad tensor with
+// @return Status - The error code return
+Status PadEndStringHelper(const std::shared_ptr<Tensor> &src, std::vector<std::string> *dst,
+                          const TensorShape &dst_shape, std::vector<dsize_t> cur_ind, size_t cur_dim,
+                          const std::string &pad_value);
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/dataset/engine/validators.py b/mindspore/dataset/engine/validators.py
index 049931c80e6..12adb6811ef 100644
--- a/mindspore/dataset/engine/validators.py
+++ b/mindspore/dataset/engine/validators.py
@@ -19,10 +19,12 @@ import inspect as ins
 import os
 from functools import wraps
 from multiprocessing import cpu_count
+
 import numpy as np
 from mindspore._c_expression import typing
-from . import samplers
+
 from . import datasets
+from . import samplers
 
 INT32_MAX = 2147483647
 valid_detype = [
@@ -683,7 +685,7 @@ def check_pad_info(key, val):
             check_type(dim, "dim in pad_shape", int)
             assert dim > 0, "pad shape should be positive integers"
     if val[1] is not None:
-        check_type(val[1], "pad_value", (int, float))
+        check_type(val[1], "pad_value", (int, float, str, bytes))
 
 
 def check_batch(method):
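Illustration (not part of the patch): after this change all four value types pass check_pad_info, and shapes behave as before. For example, with hypothetical column names:

    pad_info = {
        "col_int":   ([2, 2], -1),      # int, accepted before
        "col_float": ([2, 2], 0.5),     # float, accepted before
        "col_str":   ([2, 2], "pad"),   # str, newly accepted
        "col_bytes": ([2, 2], b"pad"),  # bytes, newly accepted
    }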
diff --git a/tests/ut/cpp/dataset/batch_op_test.cc b/tests/ut/cpp/dataset/batch_op_test.cc
index 866ebc9b19d..eaa0cd7b7ba 100644
--- a/tests/ut/cpp/dataset/batch_op_test.cc
+++ b/tests/ut/cpp/dataset/batch_op_test.cc
@@ -299,8 +299,11 @@ TEST_F(MindDataTestBatchOp, TestBatchDropTrueRepeat) {
 TEST_F(MindDataTestBatchOp, TestSimpleBatchPadding) {
   std::string schema_file = datasets_root_path_ + "/testBatchDataset";
   std::shared_ptr<de::BatchOp> op;
-  std::map<std::string, std::pair<TensorShape, float>> m;
-  m.insert({"col_1d", std::make_pair(TensorShape({4}), -1)});
+  PadInfo m;
+  std::shared_ptr<Tensor> pad_value;
+  Tensor::CreateTensor(&pad_value, TensorImpl::kFlexible, TensorShape::CreateScalar(), DataType(DataType::DE_FLOAT32));
+  pad_value->SetItemAt<float>({}, -1);
+  m.insert({"col_1d", std::make_pair(TensorShape({4}), pad_value)});
   de::BatchOp::Builder(12).SetDrop(false).SetPaddingMap(m, true).Build(&op);
   auto tree = Build({Storage(schema_file), op});
   tree->Prepare();
@@ -308,9 +311,54 @@ TEST_F(MindDataTestBatchOp, TestSimpleBatchPadding) {
   if (rc.IsError()) {
     MS_LOG(ERROR) << "Return code error detected during tree launch: " << rc.ToString() << ".";
   } else {
-    int64_t payload[] = {-9223372036854775807 - 1, 1, -1, -1, 2,  3,  -1, -1, 4,  5,  -1, -1, 6,  7,  -1, -1,
-                         8,  9,  -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1,
-                         16, 17, -1, -1, 18, 19, -1, -1, 20, 21, -1, -1, 22, 23, -1, -1};
+    int64_t payload[] = {-9223372036854775807 - 1,
+                         1,
+                         -1,
+                         -1,
+                         2,
+                         3,
+                         -1,
+                         -1,
+                         4,
+                         5,
+                         -1,
+                         -1,
+                         6,
+                         7,
+                         -1,
+                         -1,
+                         8,
+                         9,
+                         -1,
+                         -1,
+                         10,
+                         11,
+                         -1,
+                         -1,
+                         12,
+                         13,
+                         -1,
+                         -1,
+                         14,
+                         15,
+                         -1,
+                         -1,
+                         16,
+                         17,
+                         -1,
+                         -1,
+                         18,
+                         19,
+                         -1,
+                         -1,
+                         20,
+                         21,
+                         -1,
+                         -1,
+                         22,
+                         23,
+                         -1,
+                         -1};
     std::shared_ptr<de::Tensor> t;
     rc = de::Tensor::CreateTensor(&t, TensorImpl::kFlexible, de::TensorShape({12, 4}),
                                   de::DataType(DataType::DE_INT64), (unsigned char *)payload);
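Illustration (not part of the patch): the Python-level call corresponding to this C++ test's pad map would be along these lines, with the dataset wiring omitted:

    # pad column "col_1d" to shape [4] with -1, batching 12 rows at a time
    data = data.batch(batch_size=12, pad_info={"col_1d": ([4], -1)})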
diff --git a/tests/ut/python/dataset/test_tensor_string.py b/tests/ut/python/dataset/test_tensor_string.py
index 4005fca0506..c833527bc32 100644
--- a/tests/ut/python/dataset/test_tensor_string.py
+++ b/tests/ut/python/dataset/test_tensor_string.py
@@ -12,15 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-import pytest
+import mindspore._c_dataengine as cde
 import numpy as np
-import mindspore.dataset as ds
-import mindspore._c_dataengine as cde
 import mindspore.common.dtype as mstype
-from mindspore.dataset.text import to_str
+import mindspore.dataset as ds
+from mindspore.dataset.text import to_str, to_bytes
+
 
-# pylint: disable=comparison-with-itself
 def test_basic():
     x = np.array([["ab", "cde", "121"], ["x", "km", "789"]], dtype='S')
     n = cde.Tensor(x)
@@ -28,8 +27,8 @@ def test_basic():
     np.testing.assert_array_equal(x, arr)
 
 
-def compare(strings):
-    arr = np.array(strings, dtype='S')
+def compare(strings, dtype='S'):
+    arr = np.array(strings, dtype=dtype)
 
     def gen():
         (yield arr,)
@@ -37,25 +36,51 @@ def compare(strings):
     data = ds.GeneratorDataset(gen, column_names=["col"])
 
     for d in data:
-        np.testing.assert_array_equal(d[0], arr)
+        np.testing.assert_array_equal(d[0], arr.astype('S'))
 
 
 def test_generator():
     compare(["ab"])
+    compare(["", ""])
+    compare([""])
+    compare(["ab", ""])
     compare(["ab", "cde", "121"])
     compare([["ab", "cde", "121"], ["x", "km", "789"]])
+    compare([["ab", "", "121"], ["", "km", "789"]])
+    compare(["ab"], dtype='U')
+    compare(["", ""], dtype='U')
+    compare([""], dtype='U')
+    compare(["ab", ""], dtype='U')
+    compare(["", ""], dtype='U')
+    compare(["", "ab"], dtype='U')
+    compare(["ab", "cde", "121"], dtype='U')
+    compare([["ab", "cde", "121"], ["x", "km", "789"]], dtype='U')
+    compare([["ab", "", "121"], ["", "km", "789"]], dtype='U')
+
+
+line = np.array(["This is a text file.",
+                 "Be happy every day.",
+                 "Good luck to everyone."])
+
+words = np.array([["This", "text", "file", "a"],
+                  ["Be", "happy", "day", "b"],
+                  ["女", "", "everyone", "c"]])
+
+chinese = np.array(["今天天气太好了我们一起去外面玩吧",
+                    "男默女泪",
+                    "江州市长江大桥参加了长江大桥的通车仪式"])
 
 
 def test_batching_strings():
     def gen():
-        yield (np.array(["ab", "cde", "121"], dtype='S'),)
+        for row in chinese:
+            yield (np.array(row),)
 
-    data = ds.GeneratorDataset(gen, column_names=["col"]).batch(10)
+    data = ds.GeneratorDataset(gen, column_names=["col"])
+    data = data.batch(2, drop_remainder=True)
 
-    with pytest.raises(RuntimeError) as info:
-        for _ in data:
-            pass
-    assert "[Batch ERROR] Batch does not support" in str(info.value)
+    for d in data:
+        np.testing.assert_array_equal(d[0], to_bytes(chinese[0:2]))
 
 
 def test_map():
@@ -67,7 +92,7 @@ def test_map():
     def split(b):
         s = to_str(b)
        splits = s.item().split()
-        return np.array(splits, dtype='S')
+        return np.array(splits)
 
     data = data.map(input_columns=["col"], operations=split)
     expected = np.array(["ab", "cde", "121"], dtype='S')
@@ -91,19 +116,6 @@ def test_map2():
         np.testing.assert_array_equal(d[0], expected)
 
 
-line = np.array(["This is a text file.",
-                 "Be happy every day.",
-                 "Good luck to everyone."])
-
-words = np.array([["This", "text", "file", "a"],
-                  ["Be", "happy", "day", "b"],
-                  ["女", "", "everyone", "c"]])
-
-chinese = np.array(["今天天气太好了我们一起去外面玩吧",
-                    "男默女泪",
-                    "江州市长江大桥参加了长江大桥的通车仪式"])
-
-
 def test_tfrecord1():
     s = ds.Schema()
     s.add_column("line", "string", [])
@@ -181,6 +193,94 @@ def test_mindrecord():
         np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))
 
 
+# The following test cases were copied from test_pad_batch but changed to use strings instead
+
+
+# this generator function yields two columns
+# col1d: [0], [1], [2], [3]
+# col2d: [[100], [200]], [[101], [201]], [[102], [202]], [[103], [203]]
+def gen_2cols(num):
+    for i in range(num):
+        yield (np.array([str(i)]), np.array([[str(i + 100)], [str(i + 200)]]))
+
+
+# this generator function yields one column of variable shapes
+# col: [0], [0,1], [0,1,2], [0,1,2,3]
+def gen_var_col(num):
+    for i in range(num):
+        yield (np.array([str(j) for j in range(i + 1)]),)
+
+
+# this generator function yields two columns of variable shapes
+# col1: [0], [0,1], [0,1,2], [0,1,2,3]
+# col2: [100], [100,101], [100,101,102], [100,101,102,103]
+def gen_var_cols(num):
+    for i in range(num):
+        yield (np.array([str(j) for j in range(i + 1)]), np.array([str(100 + j) for j in range(i + 1)]))
+
+
+# this generator function yields two columns of variable shapes
+# col1: [[0]], [[0,1]], [[0,1,2]], [[0,1,2,3]]
+# col2: [[100]], [[100,101]], [[100,101,102]], [[100,101,102,103]]
+def gen_var_cols_2d(num):
+    for i in range(num):
+        yield (np.array([[str(j) for j in range(i + 1)]]), np.array([[str(100 + j) for j in range(i + 1)]]))
+
+
+def test_batch_padding_01():
+    data1 = ds.GeneratorDataset((lambda: gen_2cols(2)), ["col1d", "col2d"])
+    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col2d": ([2, 2], b"-2"), "col1d": ([2], b"-1")})
+    data1 = data1.repeat(2)
+    for data in data1.create_dict_iterator():
+        np.testing.assert_array_equal([[b"0", b"-1"], [b"1", b"-1"]], data["col1d"])
+        np.testing.assert_array_equal([[[b"100", b"-2"], [b"200", b"-2"]], [[b"101", b"-2"], [b"201", b"-2"]]],
+                                      data["col2d"])
+
+
+def test_batch_padding_02():
+    data1 = ds.GeneratorDataset((lambda: gen_2cols(2)), ["col1d", "col2d"])
+    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col2d": ([1, 2], "")})
+    data1 = data1.repeat(2)
+    for data in data1.create_dict_iterator():
+        np.testing.assert_array_equal([[b"0"], [b"1"]], data["col1d"])
+        np.testing.assert_array_equal([[[b"100", b""]], [[b"101", b""]]], data["col2d"])
+
+
+def test_batch_padding_03():
+    data1 = ds.GeneratorDataset((lambda: gen_var_col(4)), ["col"])
+    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col": (None, "PAD_VALUE")})  # pad automatically
+    data1 = data1.repeat(2)
+    res = dict()
+    for ind, data in enumerate(data1.create_dict_iterator()):
+        res[ind] = data["col"].copy()
+    np.testing.assert_array_equal(res[0], [[b"0", b"PAD_VALUE"], [b"0", b"1"]])
+    np.testing.assert_array_equal(res[1], [[b"0", b"1", b"2", b"PAD_VALUE"], [b"0", b"1", b"2", b"3"]])
+    np.testing.assert_array_equal(res[2], [[b"0", b"PAD_VALUE"], [b"0", b"1"]])
+    np.testing.assert_array_equal(res[3], [[b"0", b"1", b"2", b"PAD_VALUE"], [b"0", b"1", b"2", b"3"]])
+
+
+def test_batch_padding_04():
+    data1 = ds.GeneratorDataset((lambda: gen_var_cols(2)), ["col1", "col2"])
+    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={})  # pad automatically
+    data1 = data1.repeat(2)
+    for data in data1.create_dict_iterator():
+        np.testing.assert_array_equal(data["col1"], [[b"0", b""], [b"0", b"1"]])
+        np.testing.assert_array_equal(data["col2"], [[b"100", b""], [b"100", b"101"]])
+
+
+def test_batch_padding_05():
+    data1 = ds.GeneratorDataset((lambda: gen_var_cols_2d(3)), ["col1", "col2"])
+    data1 = data1.batch(batch_size=3, drop_remainder=False,
+                        pad_info={"col2": ([2, None], "-2"), "col1": (None, "-1")})  # pad automatically
+    for data in data1.create_dict_iterator():
+        np.testing.assert_array_equal(data["col1"],
+                                      [[[b"0", b"-1", b"-1"]], [[b"0", b"1", b"-1"]], [[b"0", b"1", b"2"]]])
+        np.testing.assert_array_equal(data["col2"],
+                                      [[[b"100", b"-2", b"-2"], [b"-2", b"-2", b"-2"]],
+                                       [[b"100", b"101", b"-2"], [b"-2", b"-2", b"-2"]],
+                                       [[b"100", b"101", b"102"], [b"-2", b"-2", b"-2"]]])
+
+
 if __name__ == '__main__':
     test_generator()
     test_basic()
@@ -191,3 +291,8 @@ if __name__ == '__main__':
     test_tfrecord2()
     test_tfrecord3()
     test_mindrecord()
+    test_batch_padding_01()
+    test_batch_padding_02()
+    test_batch_padding_03()
+    test_batch_padding_04()
+    test_batch_padding_05()