From 6a6e73ef824d40fb050bb869dfcd87e98cdab970 Mon Sep 17 00:00:00 2001 From: zhangyuqwer <1228862915@qq.com> Date: Thu, 4 Nov 2021 20:42:41 +0800 Subject: [PATCH] [fix] [assistant] [I3ZSQM] add new data operator Vectors --- .../python/bindings/dataset/text/bindings.cc | 17 +- .../dataset/text/kernels/ir/bindings.cc | 13 + mindspore/ccsrc/minddata/dataset/api/text.cc | 16 + .../minddata/dataset/include/dataset/text.h | 28 +- .../minddata/dataset/kernels/tensor_op.h | 1 + .../minddata/dataset/text/CMakeLists.txt | 1 + .../dataset/text/ir/kernels/text_ir.cc | 22 + .../dataset/text/ir/kernels/text_ir.h | 21 + .../dataset/text/kernels/CMakeLists.txt | 1 + .../dataset/text/kernels/to_vectors_op.cc | 58 +++ .../dataset/text/kernels/to_vectors_op.h | 64 +++ .../ccsrc/minddata/dataset/text/vectors.cc | 145 +++++++ .../ccsrc/minddata/dataset/text/vectors.h | 89 ++++ mindspore/dataset/text/__init__.py | 6 +- mindspore/dataset/text/transforms.py | 33 +- mindspore/dataset/text/utils.py | 36 +- mindspore/dataset/text/validators.py | 41 +- tests/ut/cpp/dataset/CMakeLists.txt | 1 + tests/ut/cpp/dataset/c_api_text_test.cc | 406 +++++++++++++++++- tests/ut/cpp/dataset/execute_test.cc | 136 ++++++ tests/ut/data/dataset/testVectors/vectors.txt | 6 + .../testVectors/vectors_dim_different.txt | 6 + .../dataset/testVectors/vectors_empty.txt | 0 .../dataset/testVectors/vectors_with_info.txt | 7 + .../testVectors/vectors_with_wrong_info.txt | 7 + tests/ut/data/dataset/testVectors/words.txt | 7 + .../testVectors/words_with_big_letter.txt | 7 + tests/ut/python/dataset/test_vectors.py | 236 ++++++++++ 28 files changed, 1391 insertions(+), 20 deletions(-) create mode 100644 mindspore/ccsrc/minddata/dataset/text/kernels/to_vectors_op.cc create mode 100644 mindspore/ccsrc/minddata/dataset/text/kernels/to_vectors_op.h create mode 100644 mindspore/ccsrc/minddata/dataset/text/vectors.cc create mode 100644 mindspore/ccsrc/minddata/dataset/text/vectors.h create mode 100644 tests/ut/data/dataset/testVectors/vectors.txt create mode 100644 tests/ut/data/dataset/testVectors/vectors_dim_different.txt create mode 100644 tests/ut/data/dataset/testVectors/vectors_empty.txt create mode 100644 tests/ut/data/dataset/testVectors/vectors_with_info.txt create mode 100644 tests/ut/data/dataset/testVectors/vectors_with_wrong_info.txt create mode 100644 tests/ut/data/dataset/testVectors/words.txt create mode 100644 tests/ut/data/dataset/testVectors/words_with_big_letter.txt create mode 100644 tests/ut/python/dataset/test_vectors.py diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc index e946e9e88fc..eca9af44569 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,13 +18,13 @@ #include "pybind11/stl_bind.h" #include "minddata/dataset/api/python/pybind_register.h" -#include "minddata/dataset/text/vocab.h" -#include "minddata/dataset/text/sentence_piece_vocab.h" #include "minddata/dataset/include/dataset/constants.h" +#include "minddata/dataset/text/sentence_piece_vocab.h" +#include "minddata/dataset/text/vectors.h" +#include "minddata/dataset/text/vocab.h" namespace mindspore { namespace dataset { - PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) { (void)py::class_>(*m, "Vocab") .def(py::init<>()) @@ -88,5 +88,14 @@ PYBIND_REGISTER(SentencePieceModel, 0, ([](const py::module *m) { .export_values(); })); +PYBIND_REGISTER(Vectors, 0, ([](const py::module *m) { + (void)py::class_>(*m, "Vectors") + .def(py::init<>()) + .def_static("from_file", [](const std::string &path, int32_t max_vectors) { + std::shared_ptr vectors; + THROW_IF_ERROR(Vectors::BuildFromFile(&vectors, path, max_vectors)); + return vectors; + }); + })); } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc index e71073d6aba..f3962ad34ae 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc @@ -19,6 +19,7 @@ #include "minddata/dataset/api/python/pybind_register.h" #include "minddata/dataset/text/ir/kernels/text_ir.h" #include "minddata/dataset/text/sentence_piece_vocab.h" +#include "minddata/dataset/text/vectors.h" #include "minddata/dataset/text/vocab.h" namespace mindspore { @@ -208,6 +209,18 @@ PYBIND_REGISTER(ToNumberOperation, 1, ([](const py::module *m) { })); })); +PYBIND_REGISTER( + ToVectorsOperation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "ToVectorsOperation") + .def(py::init( + [](const std::shared_ptr &vectors, const std::vector &unk_init, bool lower_case_backup) { + auto to_vectors = std::make_shared(vectors, unk_init, lower_case_backup); + THROW_IF_ERROR(to_vectors->ValidateParams()); + return to_vectors; + })); + })); + PYBIND_REGISTER(TruncateSequencePairOperation, 1, ([](const py::module *m) { (void)py::class_>( diff --git a/mindspore/ccsrc/minddata/dataset/api/text.cc b/mindspore/ccsrc/minddata/dataset/api/text.cc index 78069e3c470..8448eaf18e7 100644 --- a/mindspore/ccsrc/minddata/dataset/api/text.cc +++ b/mindspore/ccsrc/minddata/dataset/api/text.cc @@ -358,6 +358,22 @@ ToNumber::ToNumber(mindspore::DataType data_type) : data_(std::make_shared std::shared_ptr ToNumber::Parse() { return std::make_shared(data_->data_type_); } +// ToVectors +struct ToVectors::Data { + Data(const std::shared_ptr &vectors, const std::vector &unk_init, bool lower_case_backup) + : vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {} + std::shared_ptr vectors_; + std::vector unk_init_; + bool lower_case_backup_; +}; + +ToVectors::ToVectors(const std::shared_ptr &vectors, const std::vector unk_init, bool lower_case_backup) + : data_(std::make_shared(vectors, unk_init, lower_case_backup)) {} + +std::shared_ptr ToVectors::Parse() { + return std::make_shared(data_->vectors_, data_->unk_init_, data_->lower_case_backup_); +} + // TruncateSequencePair struct TruncateSequencePair::Data { explicit Data(int32_t max_length) : max_length_(max_length) {} diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/text.h 
b/mindspore/ccsrc/minddata/dataset/include/dataset/text.h
index baea18563c1..742f2908a1e 100644
--- a/mindspore/ccsrc/minddata/dataset/include/dataset/text.h
+++ b/mindspore/ccsrc/minddata/dataset/include/dataset/text.h
@@ -31,13 +31,13 @@
 namespace mindspore {
 namespace dataset {
-class Vocab;
 class SentencePieceVocab;
 class TensorOperation;
+class Vectors;
+class Vocab;
 
 // Transform operations for text
 namespace text {
-
 #ifndef _WIN32
 /// \brief Tokenize a scalar tensor of UTF-8 string by specific rules.
 /// \note BasicTokenizer is not supported on the Windows platform yet.
@@ -629,6 +629,30 @@ class MS_API ToNumber final : public TensorTransform {
   std::shared_ptr<Data> data_;
 };
 
+/// \brief Look up the embedding vector of a token according to the input Vectors table.
+class MS_API ToVectors final : public TensorTransform {
+ public:
+  /// \brief Constructor.
+  /// \param[in] vectors A Vectors object.
+  /// \param[in] unk_init If the token is out-of-vectors (OOV), the result will be initialized with `unk_init`
+  ///     (default={}, which means to initialize with zero vectors).
+  /// \param[in] lower_case_backup Whether to look up the token in the lower case (default=false).
+  explicit ToVectors(const std::shared_ptr<Vectors> &vectors, std::vector<float> unk_init = {},
+                     bool lower_case_backup = false);
+
+  /// \brief Destructor.
+  ~ToVectors() = default;
+
+ protected:
+  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
+  /// \return Shared pointer to the TensorOperation object.
+  std::shared_ptr<TensorOperation> Parse() override;
+
+ private:
+  struct Data;
+  std::shared_ptr<Data> data_;
+};
+
 /// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length.
 class MS_API TruncateSequencePair final : public TensorTransform {
  public:
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
index 0013de66457..15ad88c4e49 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
@@ -133,6 +133,7 @@
 constexpr char kNormalizeUTF8Op[] = "NormalizeUTF8Op";
 constexpr char kRegexReplaceOp[] = "RegexReplaceOp";
 constexpr char kRegexTokenizerOp[] = "RegexTokenizerOp";
 constexpr char kToNumberOp[] = "ToNumberOp";
+constexpr char kToVectorsOp[] = "ToVectorsOp";
 constexpr char kTruncateSequencePairOp[] = "TruncateSequencePairOp";
 constexpr char kUnicodeCharTokenizerOp[] = "UnicodeCharTokenizerOp";
 constexpr char kUnicodeScriptTokenizerOp[] = "UnicodeScriptTokenizerOp";
diff --git a/mindspore/ccsrc/minddata/dataset/text/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/text/CMakeLists.txt
index 844166ff1e8..61397a999b4 100644
--- a/mindspore/ccsrc/minddata/dataset/text/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/text/CMakeLists.txt
@@ -4,6 +4,7 @@ add_subdirectory(kernels)
 file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
 add_library(text OBJECT
+    vectors.cc
     vocab.cc
     sentence_piece_vocab.cc
     )
diff --git a/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc b/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc
index 83c30a3f6c4..d6a9e64be02 100644
--- a/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc
@@ -33,6 +33,7 @@
 #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
 #include
"minddata/dataset/text/kernels/sliding_window_op.h" #include "minddata/dataset/text/kernels/to_number_op.h" +#include "minddata/dataset/text/kernels/to_vectors_op.h" #include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h" #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h" #include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h" @@ -420,6 +421,27 @@ Status ToNumberOperation::from_json(nlohmann::json op_params, std::shared_ptr &vectors, const std::vector &unk_init, + bool lower_case_backup) + : vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {} + +ToVectorsOperation::~ToVectorsOperation() = default; + +Status ToVectorsOperation::ValidateParams() { + if (vectors_ == nullptr) { + std::string err_msg = "ToVectors: vectors can't be nullptr."; + MS_LOG(ERROR) << err_msg; + LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + return Status::OK(); +} + +std::shared_ptr ToVectorsOperation::Build() { + std::shared_ptr tensor_op = std::make_shared(vectors_, unk_init_, lower_case_backup_); + return tensor_op; +} + // TruncateSequencePairOperation TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {} diff --git a/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.h b/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.h index 43dbe213584..2eae4543af9 100644 --- a/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.h +++ b/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.h @@ -27,6 +27,7 @@ namespace mindspore { namespace dataset { +class Vectors; class Vocab; class SentencePieceVocab; @@ -45,6 +46,7 @@ constexpr char kRegexTokenizerOperation[] = "RegexTokenizer"; constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer"; constexpr char kSlidingWindowOperation[] = "SlidingWindow"; constexpr char kToNumberOperation[] = "ToNumber"; +constexpr char kToVectorsOperation[] = "ToVectors"; constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair"; constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer"; constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer"; @@ -294,6 +296,25 @@ class ToNumberOperation : public TensorOperation { DataType data_type_; }; +class ToVectorsOperation : public TensorOperation { + public: + ToVectorsOperation(const std::shared_ptr &vectors, const std::vector &unk_init, + bool lower_case_backup); + + ~ToVectorsOperation(); + + std::shared_ptr Build() override; + + Status ValidateParams() override; + + std::string Name() const override { return kToVectorsOperation; } + + private: + std::shared_ptr vectors_; + std::vector unk_init_; + bool lower_case_backup_; +}; + class TruncateSequencePairOperation : public TensorOperation { public: explicit TruncateSequencePairOperation(int32_t max_length); diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/text/kernels/CMakeLists.txt index 4aa9dc7ae4a..756ac2b1790 100644 --- a/mindspore/ccsrc/minddata/dataset/text/kernels/CMakeLists.txt +++ b/mindspore/ccsrc/minddata/dataset/text/kernels/CMakeLists.txt @@ -22,6 +22,7 @@ add_library(text-kernels OBJECT wordpiece_tokenizer_op.cc truncate_sequence_pair_op.cc to_number_op.cc + to_vectors_op.cc sentence_piece_tokenizer_op.cc ${ICU_DEPEND_FILES} ) diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/to_vectors_op.cc b/mindspore/ccsrc/minddata/dataset/text/kernels/to_vectors_op.cc new file mode 100644 index 
00000000000..9033d5410fa
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/to_vectors_op.cc
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "minddata/dataset/text/kernels/to_vectors_op.h"
+
+namespace mindspore {
+namespace dataset {
+ToVectorsOp::ToVectorsOp(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init,
+                         bool lower_case_backup)
+    : vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {}
+
+Status ToVectorsOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
+  IO_CHECK(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "ToVectors: input tensor type should be string.");
+  CHECK_FAIL_RETURN_UNEXPECTED(unk_init_.size() == 0 || unk_init_.size() == vectors_->Dim(),
+                               "ToVectors: unk_init must be the same length as vectors, but got unk_init: " +
+                                 std::to_string(unk_init_.size()) + " and vectors: " + std::to_string(vectors_->Dim()));
+
+  std::vector<float> vectors_vec;
+  int len = 0;
+  for (auto itr = input->begin<std::string_view>(); itr != input->end<std::string_view>(); ++itr) {
+    std::vector<float> vectors_value = vectors_->Lookup(std::string(*itr), unk_init_, lower_case_backup_);
+    CHECK_FAIL_RETURN_UNEXPECTED(!vectors_value.empty(), "ToVectors: invalid data, token: \"" + std::string(*itr) +
+                                                           "\" doesn't exist in vectors and no unk_init is specified.");
+    vectors_vec.insert(vectors_vec.end(), vectors_value.begin(), vectors_value.end());
+    len++;
+  }
+
+  int dim = static_cast<int>(vectors_vec.size() / len);
+  if (vectors_vec.size() == dim) {
+    RETURN_IF_NOT_OK(Tensor::CreateFromVector(vectors_vec, output));
+  } else {
+    RETURN_IF_NOT_OK(Tensor::CreateFromVector(vectors_vec, TensorShape({len, dim}), output));
+  }
+  return Status::OK();
+}
+
+Status ToVectorsOp::OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) {
+  CHECK_FAIL_RETURN_UNEXPECTED(inputs.size() == NumInput() && outputs.size() == NumOutput(),
+                               "ToVectors: input and output size don't match.");
+  CHECK_FAIL_RETURN_UNEXPECTED(inputs[0] == DataType::DE_STRING, "ToVectors: input tensor type should be string.");
+  outputs[0] = DataType(DataType::DE_FLOAT32);
+  return Status::OK();
+}
+}  // namespace dataset
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/to_vectors_op.h b/mindspore/ccsrc/minddata/dataset/text/kernels/to_vectors_op.h
new file mode 100644
index 00000000000..913b3a91bff
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/to_vectors_op.h
@@ -0,0 +1,64 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_TO_VECTORS_OP_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_TO_VECTORS_OP_H_
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/kernels/tensor_op.h"
+#include "minddata/dataset/text/vectors.h"
+#include "minddata/dataset/util/status.h"
+
+namespace mindspore {
+namespace dataset {
+class ToVectorsOp : public TensorOp {
+ public:
+  /// \brief Constructor.
+  /// \param[in] vectors Vectors used to look up tokens.
+  /// \param[in] unk_init Vector used to initialize OOV tokens.
+  /// \param[in] lower_case_backup Whether to look up the token in the lower case.
+  ToVectorsOp(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init, bool lower_case_backup);
+
+  /// \brief Destructor.
+  ~ToVectorsOp() = default;
+
+  /// \brief Perform actual ToVectors on each tensor.
+  /// \param[in] input Input tensor.
+  /// \param[out] output Output tensor.
+  /// \return Status code.
+  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+
+  /// \brief Infer the output type from the input type.
+  /// \param[in] inputs DataType of input tensor.
+  /// \param[out] outputs DataType of output tensor.
+  /// \return Status code.
+  Status OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) override;
+
+  /// \brief Get Op name.
+  std::string Name() const override { return kToVectorsOp; }
+
+ private:
+  std::shared_ptr<Vectors> vectors_;
+  std::vector<float> unk_init_;
+  bool lower_case_backup_;
+};
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_TO_VECTORS_OP_H_
diff --git a/mindspore/ccsrc/minddata/dataset/text/vectors.cc b/mindspore/ccsrc/minddata/dataset/text/vectors.cc
new file mode 100644
index 00000000000..41b3a5c4e37
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/text/vectors.cc
@@ -0,0 +1,145 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "minddata/dataset/text/vectors.h"
+
+#include "utils/file_utils.h"
+
+namespace mindspore {
+namespace dataset {
+Status Vectors::InferShape(const std::string &path, int32_t max_vectors, int32_t *num_lines, int32_t *header_num_lines,
+                           int32_t *vector_dim) {
+  RETURN_UNEXPECTED_IF_NULL(num_lines);
+  RETURN_UNEXPECTED_IF_NULL(header_num_lines);
+  RETURN_UNEXPECTED_IF_NULL(vector_dim);
+
+  std::ifstream file_reader;
+  file_reader.open(path, std::ios::in);
+  CHECK_FAIL_RETURN_UNEXPECTED(file_reader.is_open(), "Vectors: invalid file, failed to open vector file: " + path);
+
+  *num_lines = 0, *header_num_lines = 0, *vector_dim = -1;
+  std::string line, row;
+  while (std::getline(file_reader, line)) {
+    if (*vector_dim == -1) {
+      std::vector<std::string> vec;
+      std::istringstream line_reader(line);
+      while (std::getline(line_reader, row, ' ')) {
+        vec.push_back(row);
+      }
+      // The number of rows and dimensions can be obtained directly from the information header.
+      const int kInfoHeaderSize = 2;
+      if (vec.size() == kInfoHeaderSize) {
+        (*header_num_lines)++;
+      } else {
+        *vector_dim = vec.size() - 1;
+        (*num_lines)++;
+      }
+    } else {
+      (*num_lines)++;
+    }
+  }
+  CHECK_FAIL_RETURN_UNEXPECTED(*num_lines > 0, "Vectors: invalid file, file is empty.");
+
+  if (max_vectors > 0) {
+    *num_lines = std::min(max_vectors, *num_lines);  // Determine the true rows.
+  }
+  return Status::OK();
+}
+
+Status Vectors::Load(const std::string &path, int32_t max_vectors,
+                     std::unordered_map<std::string, std::vector<float>> *map, int *vector_dim) {
+  RETURN_UNEXPECTED_IF_NULL(map);
+  RETURN_UNEXPECTED_IF_NULL(vector_dim);
+  auto realpath = FileUtils::GetRealPath(common::SafeCStr(path));
+  CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(), "Vectors: get real path failed, path: " + path);
+  auto file_path = realpath.value();
+
+  CHECK_FAIL_RETURN_UNEXPECTED(max_vectors >= 0,
+                               "Vectors: max_vectors must be non-negative, but got: " + std::to_string(max_vectors));
+
+  int num_lines = 0, header_num_lines = 0;
+  RETURN_IF_NOT_OK(InferShape(file_path, max_vectors, &num_lines, &header_num_lines, vector_dim));
+
+  std::fstream file_reader;
+  file_reader.open(file_path, std::ios::in);
+  CHECK_FAIL_RETURN_UNEXPECTED(file_reader.is_open(),
+                               "Vectors: invalid file, failed to open vector file: " + file_path);
+
+  while (header_num_lines > 0) {
+    file_reader.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+    header_num_lines--;
+  }
+
+  std::string line, token, vector_value;
+  for (auto i = 0; i < num_lines; ++i) {
+    std::getline(file_reader, line);
+    std::istringstream line_reader(line);
+    std::getline(line_reader, token, ' ');
+    std::vector<float> vector_values;
+    int dim = 0;
+    while (line_reader >> vector_value) {
+      dim++;
+      vector_values.push_back(atof(vector_value.c_str()));
+    }
+    CHECK_FAIL_RETURN_UNEXPECTED(dim > 1, "Vectors: token with 1-dimensional vector.");
+    CHECK_FAIL_RETURN_UNEXPECTED(dim == *vector_dim,
+                                 "Vectors: all vectors must have the same number of dimensions, but got dim " +
+                                   std::to_string(dim) + " while expecting " + std::to_string(*vector_dim));
+
+    auto token_index = map->find(token);
+    if (token_index == map->end()) {
+      (*map)[token] = vector_values;
+    }
+  }
+  return Status::OK();
+}
+
+Vectors::Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int dim) {
+  map_ = std::move(map);
+  dim_ = dim;
+}
+
+Status Vectors::BuildFromFile(std::shared_ptr<Vectors> *vectors, const std::string &path, int32_t max_vectors) {
+  std::unordered_map<std::string, std::vector<float>> map;
+  int vector_dim = -1;
+  RETURN_IF_NOT_OK(Load(path, max_vectors, &map, &vector_dim));
+  *vectors = std::make_shared<Vectors>(std::move(map), vector_dim);
+  return Status::OK();
+}
+
+std::vector<float> Vectors::Lookup(const std::string &token, const std::vector<float> &unk_init,
+                                   bool lower_case_backup) {
+  std::vector<float> init_vec(dim_, 0);
+  if (!unk_init.empty()) {
+    if (unk_init.size() != dim_) {
+      MS_LOG(WARNING) << "Vectors: size of unk_init is not the same as vectors, will initialize with zero vectors.";
+    } else {
+      init_vec = unk_init;
+    }
+  }
+  std::string lower_token = token;
+  if (lower_case_backup) {
+    transform(lower_token.begin(), lower_token.end(), lower_token.begin(), ::tolower);
+  }
+  auto str_index = map_.find(lower_token);
+  if (str_index == map_.end()) {
+    return init_vec;
+  } else {
+    return str_index->second;
+  }
+}
+}  // namespace dataset
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/text/vectors.h b/mindspore/ccsrc/minddata/dataset/text/vectors.h
new file mode 100644
index 00000000000..ea79ecdee27
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/text/vectors.h
@@ -0,0 +1,89 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_
+
+#include <algorithm>
+#include <fstream>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/include/dataset/iterator.h"
+
+namespace mindspore {
+namespace dataset {
+/// \brief Pre-trained word vectors.
+class Vectors {
+ public:
+  /// Constructor.
+  Vectors() = default;
+
+  /// Constructor.
+  /// \param[in] map A map between strings and vectors.
+  /// \param[in] dim Dimension of the vectors.
+  Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int dim);
+
+  /// Destructor.
+  virtual ~Vectors() = default;
+
+  /// \brief Build Vectors from reading a pre-trained vector file.
+  /// \param[out] vectors Vectors object which contains the pre-trained vectors.
+  /// \param[in] path Path to the pre-trained word vector file.
+  /// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded (default=0, no limit).
+  static Status BuildFromFile(std::shared_ptr<Vectors> *vectors, const std::string &path, int32_t max_vectors = 0);
+
+  /// \brief Look up the embedding vector of a token.
+  /// \param[in] token A token to be looked up.
+  /// \param[in] unk_init If the token is out-of-vectors (OOV), the result will be initialized with `unk_init`
+  ///     (default={}, which means to initialize with zero vectors).
+  /// \param[in] lower_case_backup Whether to look up the token in the lower case (default=false).
+  /// \return The vector of the input token.
+  virtual std::vector<float> Lookup(const std::string &token, const std::vector<float> &unk_init = {},
+                                    bool lower_case_backup = false);
+
+  /// \brief Getter of dimension.
+  const int &Dim() const { return dim_; }
+
+ protected:
+  /// \brief Infer the shape of the pre-trained word vector file.
+  /// \param[in] path Path to the pre-trained word vector file.
+ /// \param[in] max_vectors Maximum number of pre-trained word vectors to be read. + /// \param[out] num_lines The number of lines of the file. + /// \param[out] header_num_lines The number of lines of file header. + /// \param[out] vector_dim The dimension of the vectors in the file. + static Status InferShape(const std::string &path, int32_t max_vectors, int32_t *num_lines, int32_t *header_num_lines, + int32_t *vector_dim); + + /// \brief Load map from reading a pre-train vector file. + /// \param[in] path Path to the pre-trained word vector file. + /// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded, must be non negative. + /// \param[out] map The map between words and vectors. + /// \param[out] vector_dim The dimension of the vectors in the file. + static Status Load(const std::string &path, int32_t max_vectors, + std::unordered_map> *map, int *vector_dim); + + int dim_; + std::unordered_map> map_; +}; +} // namespace dataset +} // namespace mindspore +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_ diff --git a/mindspore/dataset/text/__init__.py b/mindspore/dataset/text/__init__.py index d013599cae9..bf761382692 100644 --- a/mindspore/dataset/text/__init__.py +++ b/mindspore/dataset/text/__init__.py @@ -26,15 +26,15 @@ Common imported modules in corresponding API examples are as follows: """ import platform from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \ - TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer + TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer, ToVectors from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm, SentencePieceVocab, SentencePieceModel, \ - SPieceTokenizerOutType, SPieceTokenizerLoadType + SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors __all__ = [ "Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram", "to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber", "PythonTokenizer", "SlidingWindow", "SentencePieceVocab", "SentencePieceTokenizer", "SPieceTokenizerOutType", - "SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", + "SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", "Vectors", "ToVectors" ] if platform.system().lower() != 'windows': diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py index 12f8f96f807..e3126d04a6b 100644 --- a/mindspore/dataset/text/transforms.py +++ b/mindspore/dataset/text/transforms.py @@ -48,7 +48,7 @@ import mindspore._c_dataengine as cde from mindspore.common import dtype as mstype from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType -from .validators import check_lookup, check_jieba_add_dict, \ +from .validators import check_lookup, check_jieba_add_dict, check_to_vectors, \ check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \ check_wordpiece_tokenizer, check_regex_replace, check_regex_tokenizer, check_basic_tokenizer, check_ngram, \ check_pair_truncate, check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow, \ @@ -345,6 +345,7 @@ class SentencePieceTokenizer(TextTensorOperation): >>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING) >>> text_file_dataset = text_file_dataset.map(operations=tokenizer) """ + @check_sentence_piece_tokenizer def __init__(self, 
mode, out_type):
        self.mode = mode
@@ -421,6 +422,36 @@ class ToNumber(TextTensorOperation):
         return cde.ToNumberOperation(self.data_type)
 
 
+class ToVectors(TextTensorOperation):
+    """
+    Look up a token in the vector table and return its embedding vector.
+
+    Args:
+        vectors (Vectors): A Vectors object.
+        unk_init (sequence, optional): Sequence used to initialize out-of-vectors (OOV) tokens
+            (default=None, initialize with zero vectors).
+        lower_case_backup (bool, optional): Whether to look up the token in the lower case. If False, each token in
+            the original case will be looked up; if True, each token in the original case will be looked up first,
+            and if it is not found in the vector table, the token in the lower case will be looked up
+            (default=False).
+
+    Examples:
+        >>> # Load vectors from file
+        >>> vectors = text.Vectors.from_file("/path/to/vectors/file")
+        >>> # Use ToVectors operator to map tokens to vectors
+        >>> to_vectors = text.ToVectors(vectors)
+        >>> text_file_dataset = text_file_dataset.map(operations=[to_vectors])
+    """
+
+    @check_to_vectors
+    def __init__(self, vectors, unk_init=None, lower_case_backup=False):
+        self.vectors = vectors
+        self.unk_init = unk_init if unk_init is not None else []
+        self.lower_case_backup = lower_case_backup
+
+    def parse(self):
+        return cde.ToVectorsOperation(self.vectors, self.unk_init, self.lower_case_backup)
+
+
 class TruncateSequencePair(TextTensorOperation):
     """
     Truncate a pair of rank-1 tensors such that the total length is less than max_length.
diff --git a/mindspore/dataset/text/utils.py b/mindspore/dataset/text/utils.py
index 529181409eb..3b77294980a 100644
--- a/mindspore/dataset/text/utils.py
+++ b/mindspore/dataset/text/utils.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,16 +16,18 @@
 The module text.utils provides some general methods for NLP text processing.
 For example, you can use Vocab to build a dictionary,
 use to_bytes and to_str to encode and decode strings into a specified format.
 """
+
 from enum import IntEnum
 
 import numpy as np
-import mindspore._c_dataengine as cde
 
+import mindspore._c_dataengine as cde
 from .validators import check_from_file, check_from_list, check_from_dict, check_from_dataset, \
-    check_from_dataset_sentencepiece, check_from_file_sentencepiece, check_save_model
+    check_from_dataset_sentencepiece, check_from_file_sentencepiece, check_save_model, \
+    check_from_file_vectors
 
 __all__ = [
-    "Vocab", "SentencePieceVocab", "to_str", "to_bytes"
+    "Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors"
 ]
@@ -383,3 +385,29 @@ class SPieceTokenizerLoadType(IntEnum):
     """
     FILE = 0
     MODEL = 1
+
+
+class Vectors(cde.Vectors):
+    """
+    Vectors object that is used to map tokens into vectors.
+    """
+
+    @classmethod
+    @check_from_file_vectors
+    def from_file(cls, file_path, max_vectors=None):
+        """
+        Build a Vectors object from a file.
+
+        Args:
+            file_path (str): Path of the file that contains the vectors.
+            max_vectors (int, optional): This can be used to limit the number of pre-trained vectors loaded.
+                Most pre-trained vector sets are sorted in the descending order of word frequency. Thus, in
+                situations where the entire set doesn't fit in memory, or is not needed for another reason,
+                passing max_vectors can limit the size of the loaded set (default=None, no limit).
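+
+        Returns:
+            Vectors, Vectors object built from the file.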
+ + Examples: + >>> vector = text.Vectors.from_file("/path/to/vectors/file", max_vectors=None) + """ + + max_vectors = max_vectors if max_vectors is not None else 0 + return super().from_file(file_path, max_vectors) diff --git a/mindspore/dataset/text/validators.py b/mindspore/dataset/text/validators.py index 883ac42e890..ad0b32bfd1d 100644 --- a/mindspore/dataset/text/validators.py +++ b/mindspore/dataset/text/validators.py @@ -15,15 +15,14 @@ """ validators for text ops """ - from functools import wraps -import mindspore.common.dtype as mstype import mindspore._c_dataengine as cde +import mindspore.common.dtype as mstype from mindspore._c_expression import typing from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_uint32, \ - INT32_MAX, check_value, check_positive, check_pos_int32 + INT32_MAX, check_value, check_positive, check_pos_int32, check_filename, check_non_negative_int32 def check_unique_list_of_words(words, arg_name): @@ -532,3 +531,39 @@ def check_sentence_piece_tokenizer(method): return method(self, *args, **kwargs) return new_method + + +def check_from_file_vectors(method): + """A wrapper that wraps a parameter checker to from_file of class Vectors.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [file_path, max_vectors], _ = parse_user_args(method, *args, **kwargs) + + type_check(file_path, (str,), "file_path") + check_filename(file_path) + if max_vectors is not None: + type_check(max_vectors, (int,), "max_vectors") + check_non_negative_int32(max_vectors, "max_vectors") + + return method(self, *args, **kwargs) + + return new_method + + +def check_to_vectors(method): + """A wrapper that wraps a parameter checker to ToVectors.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [vectors, unk_init, lower_case_backup], _ = parse_user_args(method, *args, **kwargs) + + type_check(vectors, (cde.Vectors,), "vectors") + if unk_init is not None: + type_check(unk_init, (list, tuple), "unk_init") + for i, value in enumerate(unk_init): + type_check(value, (int, float), "unk_init[{0}]".format(i)) + type_check(lower_case_backup, (bool,), "lower_case_backup") + return method(self, *args, **kwargs) + + return new_method diff --git a/tests/ut/cpp/dataset/CMakeLists.txt b/tests/ut/cpp/dataset/CMakeLists.txt index 7692fcf00d8..f96fd193c5a 100644 --- a/tests/ut/cpp/dataset/CMakeLists.txt +++ b/tests/ut/cpp/dataset/CMakeLists.txt @@ -51,6 +51,7 @@ SET(DE_UT_SRCS c_api_samplers_test.cc c_api_text_sentence_piece_vocab_test.cc c_api_text_vocab_test.cc + c_api_text_test.cc c_api_transforms_test.cc c_api_vision_a_to_q_test.cc c_api_vision_affine_test.cc diff --git a/tests/ut/cpp/dataset/c_api_text_test.cc b/tests/ut/cpp/dataset/c_api_text_test.cc index 6596833ae67..262e51f3241 100644 --- a/tests/ut/cpp/dataset/c_api_text_test.cc +++ b/tests/ut/cpp/dataset/c_api_text_test.cc @@ -14,8 +14,8 @@ * limitations under the License. 
*/ #include -#include #include +#include #include "common/common.h" #include "include/api/status.h" @@ -23,12 +23,14 @@ #include "minddata/dataset/include/dataset/datasets.h" #include "minddata/dataset/include/dataset/text.h" #include "minddata/dataset/include/dataset/transforms.h" +#include "minddata/dataset/text/vectors.h" #include "minddata/dataset/text/vocab.h" using namespace mindspore::dataset; using mindspore::Status; using mindspore::dataset::ShuffleMode; using mindspore::dataset::Tensor; +using mindspore::dataset::Vectors; using mindspore::dataset::Vocab; class MindDataTestPipeline : public UT::DatasetOpTesting { @@ -892,7 +894,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess2) { std::vector expected_tokens = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"}; std::vector expected_offsets_start = {0, 12, 21, 27, 33, 36, 42}; std::vector expected_offsets_limit = {12, 21, 27, 33, 36, 42, 48}; - + std::shared_ptr de_expected_tokens; ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens)); mindspore::MSTensor ms_expected_tokens = @@ -1596,7 +1598,8 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess2) { EXPECT_NE(ds, nullptr); // Create ToNumber operation on ds - std::shared_ptr to_number = std::make_shared(mindspore::DataType::kNumberTypeFloat64); + std::shared_ptr to_number = + std::make_shared(mindspore::DataType::kNumberTypeFloat64); EXPECT_NE(to_number, nullptr); // Create a Map operation on ds @@ -3543,3 +3546,400 @@ TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess1) { // Manually terminate the pipeline iter->Stop(); } + +/// Feature: Vectors +/// Description: test with default parameter in function BuildFromFile and function Lookup +/// Expectation: return correct MSTensor which is equal to the expected +TEST_F(MindDataTestPipeline, TestVectorsDefaultParam) { + // Test with default parameter. 
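+  // vectors.txt stores one token per line followed by its 6-dimensional embedding,
+  // e.g. "ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411". Tokens missing
+  // from the table (here ".") fall back to the default zero vector.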
+ MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDefaultParam."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt"; + std::shared_ptr vectors; + Status s = Vectors::BuildFromFile(&vectors, vectors_dir); + EXPECT_EQ(s, Status::OK()); + + std::shared_ptr lookup = std::make_shared(vectors); + EXPECT_NE(lookup, nullptr); + + // Create Map operation on ds + ds = ds->Map({lookup}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map row; + ASSERT_OK(iter->GetNextRow(&row)); + + uint64_t i = 0; + std::vector> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, + {0, 0, 0, 0, 0, 0}, + {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, + {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, + {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, + {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, + {0, 0, 0, 0, 0, 0}}; + while (row.size() != 0) { + auto ind = row["text"]; + MS_LOG(INFO) << ind.Shape(); + TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind); + TensorPtr de_expected_item; + dsize_t dim = 6; + ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item)); + mindspore::MSTensor ms_expected_item = + mindspore::MSTensor(std::make_shared(de_expected_item)); + EXPECT_MSTENSOR_EQ(ind, ms_expected_item); + + ASSERT_OK(iter->GetNextRow(&row)); + i++; + } + + EXPECT_EQ(i, 7); + + // Manually terminate the pipeline + iter->Stop(); +} + +/// Feature: Vectors +/// Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile +/// Expectation: return correct MSTensor which is equal to the expected +TEST_F(MindDataTestPipeline, TestVectorsAllBuildfromfileParams) { + // Test with two parameters. 
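+  // max_vectors = 100 exceeds the 6 vectors stored in the file, so all of them are
+  // loaded and the expected output matches the default-parameter case above.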
+ MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllBuildfromfileParams."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt"; + std::shared_ptr vectors; + Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100); + EXPECT_EQ(s, Status::OK()); + + std::shared_ptr lookup = std::make_shared(vectors); + EXPECT_NE(lookup, nullptr); + + // Create Map operation on ds + ds = ds->Map({lookup}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map row; + ASSERT_OK(iter->GetNextRow(&row)); + + uint64_t i = 0; + std::vector> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, + {0, 0, 0, 0, 0, 0}, + {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, + {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, + {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, + {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, + {0, 0, 0, 0, 0, 0}}; + while (row.size() != 0) { + auto ind = row["text"]; + MS_LOG(INFO) << ind.Shape(); + TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind); + TensorPtr de_expected_item; + dsize_t dim = 6; + ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item)); + mindspore::MSTensor ms_expected_item = + mindspore::MSTensor(std::make_shared(de_expected_item)); + EXPECT_MSTENSOR_EQ(ind, ms_expected_item); + + ASSERT_OK(iter->GetNextRow(&row)); + i++; + } + + EXPECT_EQ(i, 7); + + // Manually terminate the pipeline + iter->Stop(); +} + +/// Feature: Vectors +/// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup +/// Expectation: return correct MSTensor which is equal to the expected +TEST_F(MindDataTestPipeline, TestVectorsUnknownInit) { + // Test with two parameters. 
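+  // With unknown_init = {-1, ...}, OOV tokens (the "." entries in words.txt) now map
+  // to the -1 vector instead of the zero vector used in the default case.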
+ MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsUnknownInit."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt"; + std::shared_ptr vectors; + Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100); + EXPECT_EQ(s, Status::OK()); + + std::vector unknown_init = {-1, -1, -1, -1, -1, -1}; + std::shared_ptr lookup = std::make_shared(vectors, unknown_init); + EXPECT_NE(lookup, nullptr); + + // Create Map operation on ds + ds = ds->Map({lookup}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map row; + ASSERT_OK(iter->GetNextRow(&row)); + + uint64_t i = 0; + std::vector> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, + {-1, -1, -1, -1, -1, -1}, + {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, + {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, + {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, + {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, + {-1, -1, -1, -1, -1, -1}}; + while (row.size() != 0) { + auto ind = row["text"]; + MS_LOG(INFO) << ind.Shape(); + TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind); + TensorPtr de_expected_item; + dsize_t dim = 6; + ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item)); + mindspore::MSTensor ms_expected_item = + mindspore::MSTensor(std::make_shared(de_expected_item)); + EXPECT_MSTENSOR_EQ(ind, ms_expected_item); + + ASSERT_OK(iter->GetNextRow(&row)); + i++; + } + + EXPECT_EQ(i, 7); + + // Manually terminate the pipeline + iter->Stop(); +} + +/// Feature: Vectors +/// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`, +/// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters +/// Expectation: return correct MSTensor which is equal to the expected +TEST_F(MindDataTestPipeline, TestVectorsAllParams) { + // Test with all parameters. 
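+  // words.txt contains only lower-case tokens, so enabling lower_case_backup keeps
+  // the expected output identical to the TestVectorsUnknownInit case.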
+ MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllParams."; + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt"; + std::shared_ptr vectors; + Status s = Vectors::BuildFromFile(&vectors, vectors_dir); + EXPECT_EQ(s, Status::OK()); + + std::vector unknown_init = {-1, -1, -1, -1, -1, -1}; + std::shared_ptr lookup = std::make_shared(vectors, unknown_init, true); + EXPECT_NE(lookup, nullptr); + + // Create Map operation on ds + ds = ds->Map({lookup}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map row; + ASSERT_OK(iter->GetNextRow(&row)); + + uint64_t i = 0; + std::vector> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, + {-1, -1, -1, -1, -1, -1}, + {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, + {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, + {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, + {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, + {-1, -1, -1, -1, -1, -1}}; + while (row.size() != 0) { + auto ind = row["text"]; + MS_LOG(INFO) << ind.Shape(); + TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind); + TensorPtr de_expected_item; + dsize_t dim = 6; + ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item)); + mindspore::MSTensor ms_expected_item = + mindspore::MSTensor(std::make_shared(de_expected_item)); + EXPECT_MSTENSOR_EQ(ind, ms_expected_item); + + ASSERT_OK(iter->GetNextRow(&row)); + i++; + } + + EXPECT_EQ(i, 7); + + // Manually terminate the pipeline + iter->Stop(); +} + +/// Feature: Vectors +/// Description: test with pre-vectors set that have the different dimension +/// Expectation: throw correct error and message +TEST_F(MindDataTestPipeline, TestVectorsDifferentDimension) { + // Tokens don't have the same number of vectors. + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDifferentDimension."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_dim_different.txt"; + std::shared_ptr vectors; + Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100); + EXPECT_NE(s, Status::OK()); +} + +/// Feature: Vectors +/// Description: test with pre-vectors set that has the head-info +/// Expectation: return correct MSTensor which is equal to the expected +TEST_F(MindDataTestPipeline, TestVectorsWithHeadInfo) { + // Test with words that has head info. 
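+  // vectors_with_info.txt starts with the info header "6 6" (number of vectors and
+  // their dimension); InferShape detects the two-column line and skips it on load.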
+ MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithHeadInfo."; + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_with_info.txt"; + std::shared_ptr vectors; + Status s = Vectors::BuildFromFile(&vectors, vectors_dir); + EXPECT_EQ(s, Status::OK()); + + std::vector unknown_init = {-1, -1, -1, -1, -1, -1}; + std::shared_ptr lookup = std::make_shared(vectors, unknown_init, true); + EXPECT_NE(lookup, nullptr); + + // Create Map operation on ds + ds = ds->Map({lookup}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map row; + ASSERT_OK(iter->GetNextRow(&row)); + + uint64_t i = 0; + std::vector> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, + {-1, -1, -1, -1, -1, -1}, + {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, + {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, + {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, + {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, + {-1, -1, -1, -1, -1, -1}}; + while (row.size() != 0) { + auto ind = row["text"]; + MS_LOG(INFO) << ind.Shape(); + TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind); + TensorPtr de_expected_item; + dsize_t dim = 6; + ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item)); + mindspore::MSTensor ms_expected_item = + mindspore::MSTensor(std::make_shared(de_expected_item)); + EXPECT_MSTENSOR_EQ(ind, ms_expected_item); + + ASSERT_OK(iter->GetNextRow(&row)); + i++; + } + + EXPECT_EQ(i, 7); + + // Manually terminate the pipeline + iter->Stop(); +} + +/// Feature: Vectors +/// Description: test with the parameter max_vectors that is <= 0 +/// Expectation: throw correct error and message +TEST_F(MindDataTestPipeline, TestVectorsMaxVectorsLessThanZero) { + // Test with max_vectors <= 0. + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsMaxVectorsLessThanZero."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt"; + std::shared_ptr vectors; + Status s = Vectors::BuildFromFile(&vectors, vectors_dir, -1); + EXPECT_NE(s, Status::OK()); +} + +/// Feature: Vectors +/// Description: test with the pre-vectors file that is empty +/// Expectation: throw correct error and message +TEST_F(MindDataTestPipeline, TestVectorsWithEmptyFile) { + // Read empty file. 
+ MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithEmptyFile."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_empty.txt"; + std::shared_ptr vectors; + Status s = Vectors::BuildFromFile(&vectors, vectors_dir); + EXPECT_NE(s, Status::OK()); +} + +/// Feature: Vectors +/// Description: test with the pre-vectors file that is not exist +/// Expectation: throw correct error and message +TEST_F(MindDataTestPipeline, TestVectorsWithNotExistFile) { + // Test with not exist file. + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithNotExistFile."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + std::string vectors_dir = datasets_root_path_ + "/testVectors/no_vectors.txt"; + std::shared_ptr vectors; + Status s = Vectors::BuildFromFile(&vectors, vectors_dir); + EXPECT_NE(s, Status::OK()); +} + +/// Feature: Vectors +/// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set +/// Expectation: throw correct error and message +TEST_F(MindDataTestPipeline, TestVectorsWithWrongInfoFile) { + // wrong info. + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithWrongInfoFile."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVectors/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_with_wrong_info.txt"; + std::shared_ptr vectors; + Status s = Vectors::BuildFromFile(&vectors, vectors_dir); + EXPECT_NE(s, Status::OK()); +} diff --git a/tests/ut/cpp/dataset/execute_test.cc b/tests/ut/cpp/dataset/execute_test.cc index bc644811ccd..3770032e00d 100644 --- a/tests/ut/cpp/dataset/execute_test.cc +++ b/tests/ut/cpp/dataset/execute_test.cc @@ -23,10 +23,12 @@ #include "minddata/dataset/include/dataset/vision.h" #include "minddata/dataset/include/dataset/audio.h" #include "minddata/dataset/include/dataset/text.h" +#include "minddata/dataset/text/vectors.h" #include "utils/log_adapter.h" using namespace mindspore::dataset; using mindspore::LogStream; +using mindspore::dataset::Vectors; using mindspore::ExceptionType::NoExceptionType; using mindspore::MsLogLevel::INFO; @@ -1529,6 +1531,140 @@ TEST_F(MindDataTestExecute, TestFlangerWithWrongArg) { EXPECT_FALSE(s01.IsOk()); } +/// Feature: Vectors +/// Description: test basic usage of Vectors and the ToVectors with default parameter +/// Expectation: get correct MSTensor +TEST_F(MindDataTestExecute, TestVectorsParam) { + MS_LOG(INFO) << "Doing MindDataTestExecute-TestVectorsParam."; + std::shared_ptr de_tensor; + Tensor::CreateScalar("ok", &de_tensor); + auto token = mindspore::MSTensor(std::make_shared(de_tensor)); + mindspore::MSTensor lookup_result; + + // Create expected output. + std::shared_ptr de_expected; + std::vector expected = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}; + dsize_t dim = 6; + ASSERT_OK(Tensor::CreateFromVector(expected, TensorShape({dim}), &de_expected)); + auto ms_expected = mindspore::MSTensor(std::make_shared(de_expected)); + + // Transform params. 
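+  // Build the same Vectors table three ways: with the default max_vectors, with
+  // max_vectors = 100 (more than the file holds) and with max_vectors = 3. The token
+  // "ok" sits on the first line of vectors.txt, so all three lookups agree.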
+ std::string vectors_dir = "data/dataset/testVectors/vectors.txt"; + std::shared_ptr vectors01; + Status s01 = Vectors::BuildFromFile(&vectors01, vectors_dir); + EXPECT_EQ(s01, Status::OK()); + std::shared_ptr to_vectors01 = std::make_shared(vectors01); + auto transform01 = Execute({to_vectors01}); + Status status01 = transform01(token, &lookup_result); + EXPECT_MSTENSOR_EQ(lookup_result, ms_expected); + EXPECT_TRUE(status01.IsOk()); + + std::shared_ptr vectors02; + Status s02 = Vectors::BuildFromFile(&vectors02, vectors_dir, 100); + EXPECT_EQ(s02, Status::OK()); + std::shared_ptr to_vectors02 = std::make_shared(vectors02); + auto transform02 = Execute({to_vectors02}); + Status status02 = transform02(token, &lookup_result); + EXPECT_MSTENSOR_EQ(lookup_result, ms_expected); + EXPECT_TRUE(status02.IsOk()); + + std::shared_ptr vectors03; + Status s03 = Vectors::BuildFromFile(&vectors03, vectors_dir, 3); + EXPECT_EQ(s03, Status::OK()); + std::shared_ptr to_vectors03 = std::make_shared(vectors03); + auto transform03 = Execute({to_vectors03}); + Status status03 = transform03(token, &lookup_result); + EXPECT_MSTENSOR_EQ(lookup_result, ms_expected); + EXPECT_TRUE(status03.IsOk()); +} + +/// Feature: ToVectors +/// Description: test basic usage of ToVectors and the Vectors with default parameter +/// Expectation: get correct MSTensor +TEST_F(MindDataTestExecute, TestToVectorsParam) { + MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsParam."; + std::shared_ptr de_tensor01; + Tensor::CreateScalar("none", &de_tensor01); + auto token01 = mindspore::MSTensor(std::make_shared(de_tensor01)); + std::shared_ptr de_tensor02; + Tensor::CreateScalar("ok", &de_tensor02); + auto token02 = mindspore::MSTensor(std::make_shared(de_tensor02)); + std::shared_ptr de_tensor03; + Tensor::CreateScalar("OK", &de_tensor03); + auto token03 = mindspore::MSTensor(std::make_shared(de_tensor03)); + mindspore::MSTensor lookup_result; + + // Create expected output. + dsize_t dim = 6; + std::shared_ptr de_expected01; + std::vector expected01 = {0, 0, 0, 0, 0, 0}; + ASSERT_OK(Tensor::CreateFromVector(expected01, TensorShape({dim}), &de_expected01)); + auto ms_expected01 = mindspore::MSTensor(std::make_shared(de_expected01)); + std::shared_ptr de_expected02; + std::vector expected02 = {-1, -1, -1, -1, -1, -1}; + ASSERT_OK(Tensor::CreateFromVector(expected02, TensorShape({dim}), &de_expected02)); + auto ms_expected02 = mindspore::MSTensor(std::make_shared(de_expected02)); + std::shared_ptr de_expected03; + std::vector expected03 = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}; + ASSERT_OK(Tensor::CreateFromVector(expected03, TensorShape({dim}), &de_expected03)); + auto ms_expected03 = mindspore::MSTensor(std::make_shared(de_expected03)); + + // Transform params. 
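+  // The four lookups below cover: an OOV token with the default zero vector, an OOV
+  // token with a custom unk_init, an in-vocabulary token, and an upper-case token
+  // that is only resolved through lower_case_backup.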
+  std::string vectors_dir = "data/dataset/testVectors/vectors.txt";
+  std::shared_ptr<Vectors> vectors;
+  Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
+  EXPECT_EQ(s, Status::OK());
+
+  std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(vectors);
+  auto transform01 = Execute({to_vectors01});
+  Status status01 = transform01(token01, &lookup_result);
+  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected01);
+  EXPECT_TRUE(status01.IsOk());
+  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
+  std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(vectors, unknown_init);
+  auto transform02 = Execute({to_vectors02});
+  Status status02 = transform02(token01, &lookup_result);
+  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected02);
+  EXPECT_TRUE(status02.IsOk());
+  std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(vectors, unknown_init);
+  auto transform03 = Execute({to_vectors03});
+  Status status03 = transform03(token02, &lookup_result);
+  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
+  EXPECT_TRUE(status03.IsOk());
+  std::shared_ptr<TensorTransform> to_vectors04 = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
+  auto transform04 = Execute({to_vectors04});
+  Status status04 = transform04(token03, &lookup_result);
+  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
+  EXPECT_TRUE(status04.IsOk());
+}
+
+/// Feature: ToVectors
+/// Description: test ToVectors with invalid parameters
+/// Expectation: throw correct error and message
+TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParam) {
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsWithInvalidParam.";
+  std::shared_ptr<Tensor> de_tensor;
+  Tensor::CreateScalar<std::string>("none", &de_tensor);
+  auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
+  mindspore::MSTensor lookup_result;
+
+  // Transform params.
+  std::string vectors_dir = "data/dataset/testVectors/vectors.txt";
+  std::shared_ptr<Vectors> vectors01;
+  Status s = Vectors::BuildFromFile(&vectors01, vectors_dir);
+  EXPECT_EQ(s, Status::OK());
+  std::vector<float> unknown_init = {-1, -1, -1, -1};
+  std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(vectors01, unknown_init);
+  auto transform01 = Execute({to_vectors01});
+  Status status01 = transform01(token, &lookup_result);
+  EXPECT_FALSE(status01.IsOk());
+  std::shared_ptr<Vectors> vectors02 = nullptr;
+  std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(vectors02);
+  auto transform02 = Execute({to_vectors02});
+  Status status02 = transform02(token, &lookup_result);
+  EXPECT_FALSE(status02.IsOk());
+}
+
 // Feature: DBToAmplitude
 // Description: test DBToAmplitude in eager mode
 // Expectation: the data is processed successfully
diff --git a/tests/ut/data/dataset/testVectors/vectors.txt b/tests/ut/data/dataset/testVectors/vectors.txt
new file mode 100644
index 00000000000..dc5c942ba1d
--- /dev/null
+++ b/tests/ut/data/dataset/testVectors/vectors.txt
@@ -0,0 +1,6 @@
+ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
+! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
+this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
+is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
+my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
+home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923
diff --git a/tests/ut/data/dataset/testVectors/vectors_dim_different.txt b/tests/ut/data/dataset/testVectors/vectors_dim_different.txt
new file mode 100644
index 00000000000..65830c6aaf0
--- /dev/null
+++ b/tests/ut/data/dataset/testVectors/vectors_dim_different.txt
@@ -0,0 +1,6 @@
+ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
+! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
+this 0.15164 0.30177 -0.16763 0.17684 0.31719
+is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
+my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
+home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923
\ No newline at end of file
diff --git a/tests/ut/data/dataset/testVectors/vectors_empty.txt b/tests/ut/data/dataset/testVectors/vectors_empty.txt
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/ut/data/dataset/testVectors/vectors_with_info.txt b/tests/ut/data/dataset/testVectors/vectors_with_info.txt
new file mode 100644
index 00000000000..b708aa25bc4
--- /dev/null
+++ b/tests/ut/data/dataset/testVectors/vectors_with_info.txt
@@ -0,0 +1,7 @@
+6 6
+ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
+! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
+this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
+is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
+my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
+home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923
\ No newline at end of file
diff --git a/tests/ut/data/dataset/testVectors/vectors_with_wrong_info.txt b/tests/ut/data/dataset/testVectors/vectors_with_wrong_info.txt
new file mode 100644
index 00000000000..86d3cc3952f
--- /dev/null
+++ b/tests/ut/data/dataset/testVectors/vectors_with_wrong_info.txt
@@ -0,0 +1,7 @@
+the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
+, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
+. 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
+6 6
+of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
+to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
+and 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923
\ No newline at end of file
diff --git a/tests/ut/data/dataset/testVectors/words.txt b/tests/ut/data/dataset/testVectors/words.txt
new file mode 100644
index 00000000000..87e004ad8f1
--- /dev/null
+++ b/tests/ut/data/dataset/testVectors/words.txt
@@ -0,0 +1,7 @@
+ok
+.
+this
+is
+my
+home
+.
diff --git a/tests/ut/data/dataset/testVectors/words_with_big_letter.txt b/tests/ut/data/dataset/testVectors/words_with_big_letter.txt
new file mode 100644
index 00000000000..efa25a4b390
--- /dev/null
+++ b/tests/ut/data/dataset/testVectors/words_with_big_letter.txt
@@ -0,0 +1,7 @@
+ok
+!
+This
+iS
+my
+HOME
+.
diff --git a/tests/ut/python/dataset/test_vectors.py b/tests/ut/python/dataset/test_vectors.py
new file mode 100644
index 00000000000..2795d8b79a2
--- /dev/null
+++ b/tests/ut/python/dataset/test_vectors.py
@@ -0,0 +1,236 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import numpy as np
+import pytest
+
+from mindspore import log
+import mindspore.dataset as ds
+import mindspore.dataset.text as text
+import mindspore.dataset.text.transforms as T
+
+DATASET_ROOT_PATH = "../data/dataset/testVectors/"
+
+
+def test_vectors_all_tovectors_params_eager():
+    """
+    Feature: Vectors
+    Description: test ToVectors in eager mode with all parameters, including `unk_init` and `lower_case_backup`
+    Expectation: output is equal to the expected value
+    """
+    vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt", max_vectors=4)
+    my_unk = [-1, -1, -1, -1, -1, -1]
+    to_vectors = T.ToVectors(vectors, unk_init=my_unk, lower_case_backup=True)
+    result1 = to_vectors("Ok")
+    result2 = to_vectors("!")
+    result3 = to_vectors("This")
+    result4 = to_vectors("is")
+    result5 = to_vectors("my")
+    result6 = to_vectors("home")
+    result7 = to_vectors("none")
+    res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
+           [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
+           [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
+           [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
+           [-1, -1, -1, -1, -1, -1],
+           [-1, -1, -1, -1, -1, -1],
+           [-1, -1, -1, -1, -1, -1]]
+    res_array = np.array(res, dtype=np.float32)
+
+    assert np.array_equal(result1, res_array[0])
+    assert np.array_equal(result2, res_array[1])
+    assert np.array_equal(result3, res_array[2])
+    assert np.array_equal(result4, res_array[3])
+    assert np.array_equal(result5, res_array[4])
+    assert np.array_equal(result6, res_array[5])
+    assert np.array_equal(result7, res_array[6])
+
+
+def test_vectors_from_file():
+    """
+    Feature: Vectors
+    Description: test with only default parameters
+    Expectation: output is equal to the expected value
+    """
+    vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt")
+    to_vectors = text.ToVectors(vectors)
+    data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
+    data = data.map(operations=to_vectors, input_columns=["text"])
+    ind = 0
+    res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
+           [0, 0, 0, 0, 0, 0],
+           [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
+           [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
+           [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
+           [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
+           [0, 0, 0, 0, 0, 0]]
+    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
+        res_array = np.array(res[ind], dtype=np.float32)
+        assert np.array_equal(res_array, d["text"]), ind
+        ind += 1
+
+
+def test_vectors_from_file_all_buildfromfile_params():
+    """
+    Feature: Vectors
+    Description: test from_file with all parameters, including the file path and `max_vectors`
+    Expectation: output is equal to the expected value
+    """
+    vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt", max_vectors=100)
+    to_vectors = text.ToVectors(vectors)
+    data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
+    data = data.map(operations=to_vectors, input_columns=["text"])
+    ind = 0
+    res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
+           [0, 0, 0, 0, 0, 0],
+           [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
+           [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
+           [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
+           [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
+           [0, 0, 0, 0, 0, 0]]
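+    # max_vectors=100 exceeds the six entries in vectors.txt, so the result matches
+    # loading the complete file.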
+    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
+        res_array = np.array(res[ind], dtype=np.float32)
+        assert np.array_equal(res_array, d["text"]), ind
+        ind += 1
+
+
+def test_vectors_from_file_all_buildfromfile_params_eager():
+    """
+    Feature: Vectors
+    Description: test from_file with all parameters, including the file path and `max_vectors`, in eager mode
+    Expectation: output is equal to the expected value
+    """
+    vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt", max_vectors=4)
+    to_vectors = T.ToVectors(vectors)
+    result1 = to_vectors("ok")
+    result2 = to_vectors("!")
+    result3 = to_vectors("this")
+    result4 = to_vectors("is")
+    result5 = to_vectors("my")
+    result6 = to_vectors("home")
+    result7 = to_vectors("none")
+    res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
+           [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
+           [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
+           [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
+           [0, 0, 0, 0, 0, 0],
+           [0, 0, 0, 0, 0, 0],
+           [0, 0, 0, 0, 0, 0]]
+    res_array = np.array(res, dtype=np.float32)
+
+    assert np.array_equal(result1, res_array[0])
+    assert np.array_equal(result2, res_array[1])
+    assert np.array_equal(result3, res_array[2])
+    assert np.array_equal(result4, res_array[3])
+    assert np.array_equal(result5, res_array[4])
+    assert np.array_equal(result6, res_array[5])
+    assert np.array_equal(result7, res_array[6])
+
+
+def test_vectors_from_file_eager():
+    """
+    Feature: Vectors
+    Description: test with only default parameters in eager mode
+    Expectation: output is equal to the expected value
+    """
+    vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt")
+    to_vectors = T.ToVectors(vectors)
+    result1 = to_vectors("ok")
+    result2 = to_vectors("!")
+    result3 = to_vectors("this")
+    result4 = to_vectors("is")
+    result5 = to_vectors("my")
+    result6 = to_vectors("home")
+    result7 = to_vectors("none")
+    res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
+           [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
+           [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
+           [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
+           [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
+           [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
+           [0, 0, 0, 0, 0, 0]]
+    res_array = np.array(res, dtype=np.float32)
+
+    assert np.array_equal(result1, res_array[0])
+    assert np.array_equal(result2, res_array[1])
+    assert np.array_equal(result3, res_array[2])
+    assert np.array_equal(result4, res_array[3])
+    assert np.array_equal(result5, res_array[4])
+    assert np.array_equal(result6, res_array[5])
+    assert np.array_equal(result7, res_array[6])
+
+
+def test_vectors_invalid_input():
+    """
+    Feature: Vectors
+    Description: test Vectors and ToVectors with invalid input parameters
+    Expectation: raise the expected error with the correct message
+    """
+    def test_invalid_input(test_name, file_path, error, error_msg, max_vectors=None,
+                           unk_init=None, lower_case_backup=False, token="ok"):
+        log.info("Test Vectors with wrong input: {0}".format(test_name))
+        with pytest.raises(error) as error_info:
+            vectors = text.Vectors.from_file(file_path, max_vectors=max_vectors)
+            to_vectors = T.ToVectors(vectors, unk_init=unk_init, lower_case_backup=lower_case_backup)
+            to_vectors(token)
+        assert error_msg in str(error_info.value)
+
+    test_invalid_input("not all vectors have the same number of dimensions",
+                       DATASET_ROOT_PATH + "vectors_dim_different.txt", error=RuntimeError,
+                       error_msg="all vectors must have the same number of dimensions, "
+                       "but got dim 5 while expecting 6")
+    test_invalid_input("the file is empty", DATASET_ROOT_PATH + "vectors_empty.txt",
+                       error=RuntimeError, error_msg="invalid file, file is empty.")
+    test_invalid_input("the length of `unk_init` differs from the dimension of the word vectors",
+                       DATASET_ROOT_PATH + "vectors.txt",
+                       error=RuntimeError, error_msg="Unexpected error. ToVectors: "
+                       "unk_init must be the same length as vectors, but got unk_init: 2 and vectors: 6",
+                       unk_init=[-1, -1])
+    test_invalid_input("the file does not exist", DATASET_ROOT_PATH + "not_exist.txt", error=RuntimeError,
+                       error_msg="get real path failed")
+    test_invalid_input("the info header appears after 1-dimensional tokens",
+                       DATASET_ROOT_PATH + "vectors_with_wrong_info.txt", error=RuntimeError,
                       error_msg="token with 1-dimensional vector.")
+    test_invalid_input("max_vectors parameter must be greater than 0",
+                       DATASET_ROOT_PATH + "vectors.txt", error=ValueError,
+                       error_msg="Input max_vectors is not within the required interval", max_vectors=-1)
+    test_invalid_input("invalid max_vectors parameter type as a float",
+                       DATASET_ROOT_PATH + "vectors.txt", error=TypeError,
+                       error_msg="Argument max_vectors with value 1.0 is not of type [<class 'int'>],"
+                       " but got <class 'float'>.", max_vectors=1.0)
+    test_invalid_input("invalid max_vectors parameter type as a string",
+                       DATASET_ROOT_PATH + "vectors.txt", error=TypeError,
+                       error_msg="Argument max_vectors with value 1 is not of type [<class 'int'>],"
+                       " but got <class 'str'>.", max_vectors="1")
+    test_invalid_input("invalid token parameter type as a float", DATASET_ROOT_PATH + "vectors.txt",
+                       error=RuntimeError, error_msg="input tensor type should be string.", token=1.0)
+    test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "vectors.txt",
+                       error=TypeError, error_msg="Argument lower_case_backup with "
+                       "value True is not of type [<class 'bool'>],"
+                       " but got <class 'str'>.", lower_case_backup="True")
+
+
+if __name__ == '__main__':
+    test_vectors_all_tovectors_params_eager()
+    test_vectors_from_file()
+    test_vectors_from_file_all_buildfromfile_params()
+    test_vectors_from_file_all_buildfromfile_params_eager()
+    test_vectors_from_file_eager()
+    test_vectors_invalid_input()
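Usage sketch (illustrative only, not part of the patch): the calls below mirror the Python
tests above; the file paths and the max_vectors value are placeholders.

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # Load pre-trained embeddings: one "token v1 v2 ... vn" entry per line, with an
    # optional "<count> <dim>" info header allowed on the first line only.
    vectors = text.Vectors.from_file("/path/to/vectors.txt", max_vectors=10000)

    # Look tokens up: OOV tokens map to zeros unless unk_init is given, and
    # lower_case_backup retries a failed lookup with the lower-cased token.
    to_vectors = text.ToVectors(vectors, unk_init=None, lower_case_backup=True)

    data = ds.TextFileDataset("/path/to/words.txt", shuffle=False)
    data = data.map(operations=to_vectors, input_columns=["text"])
    for row in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        print(row["text"])  # one float32 vector per token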