From 457ee030824afc3dfd502d5b29da59cca3055159 Mon Sep 17 00:00:00 2001
From: liu-yongqi-63
Date: Tue, 8 Mar 2022 16:24:55 +0800
Subject: [PATCH] Vocab and SentencePieceVocab C++ interface alignment and
 Python interface refactoring

---
 .../ccsrc/minddata/dataset/api/datasets.cc    |   3 +-
 .../dataset/engine/ir/datasetops/bindings.cc  |   1 -
 .../python/bindings/dataset/text/bindings.cc  |  40 ++--
 .../dataset/text/kernels/ir/bindings.cc       |   9 +-
 .../dataset/engine/consumers/tree_consumer.h  |   2 +-
 .../build_sentence_piece_vocab_op.h           |   4 +-
 .../engine/datasetops/build_vocab_op.cc       |   6 +-
 .../engine/datasetops/build_vocab_op.h        |   8 +-
 .../minddata/dataset/include/dataset/text.h   | 201 +++++++++++++++++-
 .../dataset/text/ir/kernels/text_ir.cc        |   2 +-
 .../dataset/text/kernels/lookup_op.cc         |   2 +-
 .../minddata/dataset/text/kernels/lookup_op.h |   2 +-
 .../kernels/sentence_piece_tokenizer_op.h     |   2 +-
 .../text/kernels/wordpiece_tokenizer_op.cc    |   2 +-
 .../text/kernels/wordpiece_tokenizer_op.h     |   2 +-
 .../dataset/text/sentence_piece_vocab.cc      |  12 +-
 .../dataset/text/sentence_piece_vocab.h       |  50 -----
 .../ccsrc/minddata/dataset/text/vocab.cc      | 123 ++---------
 mindspore/ccsrc/minddata/dataset/text/vocab.h | 143 -------------
 .../mindspore/dataset/text/transforms.py      |   3 +-
 .../python/mindspore/dataset/text/utils.py    |  27 ++-
 .../mindspore/dataset/text/validators.py      |   4 +-
 tests/ut/cpp/dataset/build_vocab_test.cc      |  26 +--
 .../c_api_text_sentence_piece_vocab_test.cc   |   1 -
 tests/ut/cpp/dataset/c_api_text_test.cc       | 142 ++++++-------
 tests/ut/cpp/dataset/c_api_text_vocab_test.cc |  25 ++-
 .../dataset/sentence_piece_vocab_op_test.cc   |   2 +-
 27 files changed, 371 insertions(+), 473 deletions(-)
 delete mode 100644 mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h
 delete mode 100644 mindspore/ccsrc/minddata/dataset/text/vocab.h

diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
index 7a22c92bee1..fa1a9d4741f 100644
--- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
@@ -38,8 +38,7 @@
 #include "minddata/dataset/util/status.h"
 #ifndef ENABLE_ANDROID
 #include "minddata/dataset/engine/ir/cache/dataset_cache_impl.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
-#include "minddata/dataset/text/vocab.h"
+#include "minddata/dataset/include/dataset/text.h"
 #endif

 // Sampler headers (in alphabetical order)
diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/bindings.cc
index 9622f023dad..87313bf440a 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/bindings.cc
@@ -23,7 +23,6 @@
 #include "minddata/dataset/core/data_type.h"
 #include "minddata/dataset/engine/serdes.h"
 #include "minddata/dataset/include/dataset/constants.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
 #include "minddata/dataset/util/path.h"

 // IR non-leaf nodes
diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc
index 30844a3f26c..c54dc1b91a7 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc
@@ -19,12 +19,11 @@

 #include "minddata/dataset/api/python/pybind_register.h"
 #include "minddata/dataset/include/dataset/constants.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/text/char_n_gram.h"
 #include "minddata/dataset/text/fast_text.h"
 #include "minddata/dataset/text/glove.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
 #include "minddata/dataset/text/vectors.h"
-#include "minddata/dataset/text/vocab.h"

 namespace mindspore {
 namespace dataset {
@@ -32,28 +31,29 @@
 PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) {
                   (void)py::class_<Vocab, std::shared_ptr<Vocab>>(*m, "Vocab")
                     .def(py::init<>())
                     .def_static("from_list",
-                                [](const py::list &words, const py::list &special_tokens, bool special_first) {
+                                [](const std::vector<std::string> &words,
+                                   const std::vector<std::string> &special_tokens, bool special_first) {
                                   std::shared_ptr<Vocab> v;
-                                  THROW_IF_ERROR(Vocab::BuildFromPyList(words, special_tokens, special_first, &v));
+                                  THROW_IF_ERROR(Vocab::BuildFromVector(words, special_tokens, special_first, &v));
                                   return v;
                                 })
                     .def_static(
                       "from_file",
                       [](const std::string &path, const std::string &dlm, int32_t vocab_size,
-                         const py::list &special_tokens, bool special_first) {
+                         const std::vector<std::string> &special_tokens, bool special_first) {
                         std::shared_ptr<Vocab> v;
                         THROW_IF_ERROR(Vocab::BuildFromFile(path, dlm, vocab_size, special_tokens, special_first, &v));
                         return v;
                       })
                     .def_static("from_dict",
-                                [](const py::dict &words) {
+                                [](const std::unordered_map<std::string, int32_t> &words) {
                                   std::shared_ptr<Vocab> v;
-                                  THROW_IF_ERROR(Vocab::BuildFromPyDict(words, &v));
+                                  THROW_IF_ERROR(Vocab::BuildFromUnorderedMap(words, &v));
                                   return v;
                                 })
                     .def("tokens_to_ids",
                          [](Vocab &self, const std::vector<std::string> words) {
-                           auto ids = self.Lookup(words);
+                           auto ids = self.TokensToIds(words);
                            py::object ret;
                            if (ids.size() == 1) {
                              ret = py::int_(ids[0]);
@@ -65,7 +65,7 @@
                          })
                     .def("ids_to_tokens",
                          [](Vocab &self, const std::vector<int32_t> ids) {
-                           auto words = self.ReverseLookup(ids);
+                           auto words = self.IdsToTokens(ids);
                            py::object ret;
                            if (words.size() == 1) {
                              ret = py::str(words[0]);
@@ -75,31 +75,19 @@
                            }
                            return ret;
                          })
-                    .def("vocab", [](Vocab &self) { return self.vocab(); });
+                    .def("vocab", [](Vocab &self) { return self.GetVocab(); });
                 }));

 PYBIND_REGISTER(SentencePieceVocab, 0, ([](const py::module *m) {
                   (void)py::class_<SentencePieceVocab, std::shared_ptr<SentencePieceVocab>>(*m, "SentencePieceVocab")
                     .def(py::init<>())
                     .def_static("from_file",
-                                [](const py::list &paths, const int32_t vocab_size, const float character_coverage,
-                                   const SentencePieceModel model_type, const py::dict &params) {
+                                [](const std::vector<std::string> &paths, const int32_t vocab_size,
+                                   const float character_coverage, const SentencePieceModel model_type,
+                                   const std::unordered_map<std::string, std::string> &params) {
                                   std::shared_ptr<SentencePieceVocab> v;
-                                  std::vector<std::string> path_list;
-                                  for (auto path : paths) {
-                                    path_list.emplace_back(py::str(path));
-                                  }
-                                  std::unordered_map<std::string, std::string> param_map;
-                                  for (auto param : params) {
-                                    std::string key = py::reinterpret_borrow<py::str>(param.first);
-                                    if (key == "input" || key == "vocab_size" || key == "model_prefix" ||
-                                        key == "character_coverage" || key == "model_type") {
-                                      continue;
-                                    }
-                                    param_map[key] = py::reinterpret_borrow<py::str>(param.second);
-                                  }
                                   THROW_IF_ERROR(SentencePieceVocab::BuildFromFile(
-                                    path_list, vocab_size, character_coverage, model_type, param_map, &v));
+                                    paths, vocab_size, character_coverage, model_type, params, &v));
                                   return v;
                                 })
                     .def_static("save_model", [](const std::shared_ptr<SentencePieceVocab> *vocab, std::string path,
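As a usage sketch of the C++ interface these bindings now delegate to (BuildFromVector, TokensToIds and IdsToTokens, as declared in the new text.h below); the word list is illustrative and the wrapper function name is hypothetical:

    #include <memory>
    #include <string>
    #include <vector>

    #include "include/api/status.h"
    #include "minddata/dataset/include/dataset/text.h"

    using mindspore::Status;
    using mindspore::dataset::Vocab;

    void VocabRoundTripSketch() {
      // Ids are assigned in list order, so "home" -> 0, "behind" -> 1, ...
      std::vector<std::string> words = {"home", "behind", "the", "world"};
      std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
      Status s = Vocab::BuildFromVector(words, /*special_tokens=*/{}, /*prepend_special=*/true, &vocab);
      if (s.IsOk()) {
        // Unknown tokens map to Vocab::kNoTokenExists (-1); unknown ids map to an empty string.
        std::vector<int32_t> ids = vocab->TokensToIds(std::vector<std::string>{"home", "unknown"});
        std::vector<std::string> tokens = vocab->IdsToTokens(ids);
      }
    }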
diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc
index 7768cbbe599..07e61d3d123 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc
@@ -14,13 +14,12 @@
  * limitations under the License.
  */

+#include "minddata/dataset/api/python/pybind_register.h"
+#include "minddata/dataset/include/dataset/text.h"
+#include "minddata/dataset/text/ir/kernels/text_ir.h"
+#include "minddata/dataset/text/vectors.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl_bind.h"
-#include "minddata/dataset/api/python/pybind_register.h"
-#include "minddata/dataset/text/ir/kernels/text_ir.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
-#include "minddata/dataset/text/vectors.h"
-#include "minddata/dataset/text/vocab.h"

 namespace mindspore {
 namespace dataset {
diff --git a/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.h b/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.h
index 9806673d08b..dcd167291c0 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.h
@@ -24,7 +24,7 @@
 #include
 #include "minddata/dataset/engine/tree_adapter.h"
-#include "minddata/dataset/text/vocab.h"
+#include "minddata/dataset/include/dataset/text.h"

 namespace mindspore::dataset {
 // Forward declare
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h
index 24f3575666b..2fb6e1bca38 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h
@@ -28,9 +28,9 @@
 #include "minddata/dataset/core/tensor.h"
 #include "minddata/dataset/engine/dataset_iterator.h"
 #include "minddata/dataset/engine/datasetops/pipeline_op.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/util/status.h"
 #include "minddata/dataset/util/queue.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
 #include "pybind11/pybind11.h"

 namespace mindspore {
@@ -54,7 +54,7 @@
     BuildSentencePieceVocabOp *s_p_vocab_ptr_;
   };

-  BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, std::vector<std::string> col_names,
+  BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, std::vector<std::string> col_names,
                             int32_t vocab_size, float character_coverage, SentencePieceModel model_type,
                             const std::unordered_map<std::string, std::string> &params, int32_t op_conn_size);

diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc
index 126453b1877..ec20be4a097 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc
@@ -179,15 +179,15 @@ Status BuildVocabOp::CollectorThread() {
   });

   if (special_first_) {
-    for (const std::string &sp_tk : special_tokens_) vocab_->append_word(sp_tk);
+    for (const std::string &sp_tk : special_tokens_) vocab_->AppendWord(sp_tk);
   }

   for (int64_t i = 0; i < num_words; i++) {
-    vocab_->append_word(words[i]);
+    vocab_->AppendWord(words[i]);
   }

   if (!special_first_) {
-    for (const std::string &sp_tk : special_tokens_) vocab_->append_word(sp_tk);
+    for (const std::string &sp_tk : special_tokens_) vocab_->AppendWord(sp_tk);
   }

   RETURN_IF_NOT_OK(out_connector_->SendEOE());
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.h
index 53b13f47b82..cbc2017b474 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.h
@@ -25,7 +25,7 @@
 #include "minddata/dataset/core/tensor.h"
 #include "minddata/dataset/engine/dataset_iterator.h"
 #include "minddata/dataset/engine/datasetops/parallel_op.h"
-#include "minddata/dataset/text/vocab.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/util/queue.h"
 #include "minddata/dataset/util/status.h"

@@ -33,9 +33,9 @@ namespace mindspore {
 namespace dataset {
 class BuildVocabOp : public ParallelOp {
  public:
-  BuildVocabOp(std::shared_ptr<Vocab> vocab, std::vector<std::string> col_names, std::pair<int64_t, int64_t> freq_range,
-               int64_t top_k, const std::vector<std::string> &tokens, bool prepend, int32_t num_workers,
-               int32_t op_connector_size);
+  BuildVocabOp(std::shared_ptr<Vocab> vocab, std::vector<std::string> col_names,
+               std::pair<int64_t, int64_t> freq_range, int64_t top_k, const std::vector<std::string> &tokens,
+               bool prepend, int32_t num_workers, int32_t op_connector_size);

   ~BuildVocabOp() = default;

diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/text.h b/mindspore/ccsrc/minddata/dataset/include/dataset/text.h
index 644045517cf..168ac20c635 100644
--- a/mindspore/ccsrc/minddata/dataset/include/dataset/text.h
+++ b/mindspore/ccsrc/minddata/dataset/include/dataset/text.h
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include <unordered_map>
 #include
 #include

@@ -30,10 +31,204 @@
 namespace mindspore {
 namespace dataset {
-class SentencePieceVocab;
 class TensorOperation;
 class Vectors;
-class Vocab;
+
+using WordIdType = int32_t;
+using WordType = std::string;
+
+/// \brief Vocab object that stores pairs of words and ids.
+/// \note It contains a map from each word (str) to an id (int), and the reverse.
+class Vocab {
+ public:
+  /// \brief Build a vocab from an unordered_map. IDs must be unique and contiguous.
+  /// \param[in] words An unordered_map containing word-id pairs.
+  /// \param[out] vocab A vocab object.
+  /// \return Status code.
+  /// \par Example
+  /// \code
+  ///     // Build a map
+  ///     std::unordered_map<std::string, int32_t> dict;
+  ///     dict["banana"] = 0;
+  ///     dict["apple"] = 1;
+  ///     dict["cat"] = 2;
+  ///     dict["dog"] = 3;
+  ///     // Build vocab from map
+  ///     std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
+  ///     Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
+  /// \endcode
+  static Status BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
+                                      std::shared_ptr<Vocab> *vocab);
+
+  /// \brief Build a vocab from a C++ vector. IDs are assigned automatically and contiguously; duplicate words
+  ///    are not allowed.
+  /// \param[in] words A vector of strings containing words.
+  /// \param[in] special_tokens A vector of strings containing special tokens.
+  /// \param[in] prepend_special Whether the special_tokens will be prepended (true) or appended (false) to the vocab.
+  /// \param[out] vocab A vocab object.
+  /// \return Status code.
+  /// \par Example
+  /// \code
+  ///     // Build vocab from a vector of words; special tokens are prepended to the vocab
+  ///     std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"};
+  ///     std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
+  ///     Status s = Vocab::BuildFromVector(list, {""}, true, &vocab);
+  /// \endcode
+  static Status BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
+                                bool prepend_special, std::shared_ptr<Vocab> *vocab);
+
+  /// \brief Build a vocab from a vocab file, IDs will be automatically assigned.
+  /// \param[in] path Path to the vocab file; each line of the file is treated as a word (spaces included).
+  /// \param[in] delimiter Delimiter used to break each line; characters after the delimiter will be discarded.
+  /// \param[in] vocab_size Number of lines to be read from the file.
+  /// \param[in] special_tokens A vector of strings containing special tokens.
+  /// \param[in] prepend_special Whether the special_tokens will be prepended (true) or appended (false) to the vocab.
+  /// \param[out] vocab A vocab object.
+  /// \return Status code.
+  /// \par Example
+  /// \code
+  ///     // Build vocab from a local file
+  ///     std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
+  ///     std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
+  ///     Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"", ""}, true, &vocab);
+  /// \endcode
+  static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
+                              const std::vector<WordType> &special_tokens, bool prepend_special,
+                              std::shared_ptr<Vocab> *vocab);
+
+  /// Look up the id of a word; if the word doesn't exist in the vocab, return -1.
+  /// \param word Word to be looked up.
+  /// \return ID of the word in the vocab.
+  /// \par Example
+  /// \code
+  ///     // lookup, convert token to id
+  ///     auto single_index = vocab->TokensToIds("home");
+  ///     single_index = vocab->TokensToIds("hello");
+  /// \endcode
+  WordIdType TokensToIds(const WordType &word) const;
+
+  /// Look up the ids of a vector of words; any word that doesn't exist in the vocab maps to -1.
+  /// \param words Words to be looked up.
+  /// \return IDs of the words in the vocab.
+  /// \par Example
+  /// \code
+  ///     // lookup multiple tokens
+  ///     auto multi_indexs = vocab->TokensToIds(std::vector<std::string>{"", "behind"});
+  ///     std::vector<int32_t> expected_multi_indexs = {0, 4};
+  ///     multi_indexs = vocab->TokensToIds(std::vector<std::string>{"", "apple"});
+  ///     expected_multi_indexs = {0, -1};
+  /// \endcode
+  std::vector<WordIdType> TokensToIds(const std::vector<WordType> &words) const;
+
+  /// Look up the word of an ID; if the ID doesn't exist in the vocab, return an empty string.
+  /// \param id ID to be looked up.
+  /// \return The word corresponding to the ID.
+  /// \par Example
+  /// \code
+  ///     // reverse lookup, convert id to token
+  ///     auto single_word = vocab->IdsToTokens(2);
+  ///     single_word = vocab->IdsToTokens(-1);
+  /// \endcode
+  WordType IdsToTokens(const WordIdType &id);
+
+  /// Look up the words of a vector of IDs; any ID that doesn't exist in the vocab maps to an empty string.
+  /// \param ids IDs to be looked up.
+  /// \return The words corresponding to the IDs.
+  /// \par Example
+  /// \code
+  ///     // reverse lookup multiple ids
+  ///     auto multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 4});
+  ///     std::vector<std::string> expected_multi_words = {"", "behind"};
+  ///     multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 99});
+  ///     expected_multi_words = {"", ""};
+  /// \endcode
+  std::vector<WordType> IdsToTokens(const std::vector<WordIdType> &ids);
+
+  /// Constructor; shouldn't be called directly, can't be private due to std::make_unique().
+  /// \param map Sanitized word2id map.
+  explicit Vocab(std::unordered_map<WordType, WordIdType> map);
+
+  /// \brief Add one word to the vocab, incrementing its index automatically.
+  /// \param word Word to be added; it is skipped if it already exists in the vocab.
+  void AppendWord(const std::string &word);
+
+  /// \brief Return a read-only vocab as an unordered_map.
+  /// \return An unordered_map of word2id.
+  const std::unordered_map<WordType, WordIdType> &GetVocab() { return word2id_; }
+
+  /// \brief Constructor.
+  Vocab() = default;
+
+  /// \brief Destructor.
+  ~Vocab() = default;
+
+  static const WordIdType kNoTokenExists;
+  static const WordType kNoIdExists;
+
+ private:
+  std::unordered_map<WordType, WordIdType> word2id_;
+  std::unordered_map<WordIdType, WordType> id2word_;
+};
+
+/// \brief SentencePiece object that is used to do word segmentation.
+class SentencePieceVocab {
+ public:
+  /// \brief Build a SentencePiece object from files.
+  /// \param[in] path_list Paths to the files which contain the sentences for training.
+  /// \param[in] vocab_size Vocabulary size.
+  /// \param[in] character_coverage Fraction of characters covered by the model. Good defaults are 0.9995 for
+  ///    languages with a rich character set like Japanese or Chinese, and 1.0 for other languages with a small
+  ///    character set.
+  /// \param[in] model_type It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
+  ///    SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
+  ///    sentence must be pre-tokenized when using SentencePieceModel.WORD type.
+  ///    - SentencePieceModel.UNIGRAM, Unigram Language Model means the next word in the sentence is assumed
+  ///      to be independent of the previous words generated by the model.
+  ///    - SentencePieceModel.BPE, refers to the byte pair encoding algorithm, which replaces the most frequent
+  ///      pair of bytes in a sentence with a single, unused byte.
+  ///    - SentencePieceModel.CHAR, refers to the char-based SentencePiece model type.
+  ///    - SentencePieceModel.WORD, refers to the word-based SentencePiece model type.
+  /// \param[in] params A dictionary of extra parameters forwarded to the SentencePiece library (usually empty).
+  /// \param[out] vocab The vocab built from the files.
+  /// \return Status code.
+  /// \par Example
+  /// \code
+  ///     std::string dataset_path;
+  ///     dataset_path = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  ///     std::vector<std::string> path_list;
+  ///     path_list.emplace_back(dataset_path);
+  ///     std::unordered_map<std::string, std::string> param_map;
+  ///     std::shared_ptr<SentencePieceVocab> spm = std::make_unique<SentencePieceVocab>();
+  ///     Status rc = SentencePieceVocab::BuildFromFile(path_list, 5000, 0.9995,
+  ///                                                   SentencePieceModel::kUnigram, param_map, &spm);
+  /// \endcode
+  static Status BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size,
+                              const float character_coverage, const SentencePieceModel model_type,
+                              const std::unordered_map<std::string, std::string> &params,
+                              std::shared_ptr<SentencePieceVocab> *vocab);
+
+  /// \brief Save the SentencePiece model into the given file path.
+  /// \param[in] vocab A SentencePiece object to be saved.
+  /// \param[in] path Path to store the model.
+  /// \param[in] filename The file name of the saved model.
+  /// \return Status code.
+  /// \par Example
+  /// \code
+  ///     // Save vocab model to local
+  ///     vocab->SaveModel(&vocab, datasets_root_path_ + "/test_sentencepiece", "m.model");
+  /// \endcode
+  static Status SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, std::string path, std::string filename);
+
+  /// \brief Constructor.
+  SentencePieceVocab();
+
+  /// \brief Destructor.
+  ~SentencePieceVocab() = default;
+
+  const std::string &model_proto();
+
+  void set_model_proto(const std::string model_proto);
+
+ private:
+  std::string model_proto_;
+};

 // Transform operations for text
 namespace text {
@@ -414,7 +609,7 @@ class MS_API NormalizeUTF8 final : public TensorTransform {
   /// \brief Constructor.
   /// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone, NormalizeForm::kNfc,
   ///    NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd](default=NormalizeForm::kNfkc).
-  ///    See http://unicode.org/reports/tr15/ for details.
+  ///    See <http://unicode.org/reports/tr15/> for details.
   ///    - NormalizeForm.kNone, remain the input string tensor unchanged.
   ///    - NormalizeForm.kNfc, normalizes with Normalization Form C.
   ///    - NormalizeForm.kNfkc, normalizes with Normalization Form KC.
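A matching sketch for the SentencePieceVocab half of the header, following the \par Example blocks above; the corpus path, output directory, and wrapper function name are placeholders:

    #include <memory>
    #include <string>
    #include <unordered_map>
    #include <vector>

    #include "include/api/status.h"
    #include "minddata/dataset/include/dataset/constants.h"
    #include "minddata/dataset/include/dataset/text.h"

    using mindspore::Status;
    using mindspore::dataset::SentencePieceModel;
    using mindspore::dataset::SentencePieceVocab;

    void SentencePieceSketch() {
      // Train a unigram model over one corpus file; extra SentencePiece params stay empty.
      std::vector<std::string> path_list = {"/path/to/test_sentencepiece/botchan.txt"};
      std::unordered_map<std::string, std::string> param_map;
      std::shared_ptr<SentencePieceVocab> spm = std::make_shared<SentencePieceVocab>();
      Status rc = SentencePieceVocab::BuildFromFile(path_list, 5000, 0.9995, SentencePieceModel::kUnigram,
                                                    param_map, &spm);
      if (rc.IsOk()) {
        // Persist the trained model to disk.
        rc = SentencePieceVocab::SaveModel(&spm, "/path/to/test_sentencepiece", "m.model");
      }
    }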
diff --git a/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc b/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc
index c4f98942230..653836c1964 100644
--- a/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc
@@ -217,7 +217,7 @@ Status LookupOperation::ValidateParams() {
     LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
   }
   if (unknown_token_ != std::nullopt) {
-    default_id_ = vocab_->Lookup(*unknown_token_);
+    default_id_ = vocab_->TokensToIds(*unknown_token_);
     if (default_id_ == Vocab::kNoTokenExists) {
       std::string err_msg = "Lookup: \"" + *unknown_token_ + "\" doesn't exist in vocab.";
       LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.cc b/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.cc
index 4f59e6b60d7..2edeb2e1507 100644
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.cc
@@ -30,7 +30,7 @@ Status LookupOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   std::vector<WordIdType> word_ids;
   word_ids.reserve(input->Size());
   for (auto itr = input->begin<std::string_view>(); itr != input->end<std::string_view>(); ++itr) {
-    WordIdType word_id = vocab_->Lookup(std::string(*itr));
+    WordIdType word_id = vocab_->TokensToIds(std::string(*itr));
     word_ids.emplace_back(word_id == Vocab::kNoTokenExists ? default_id_ : word_id);
     CHECK_FAIL_RETURN_UNEXPECTED(word_ids.back() != Vocab::kNoTokenExists,
                                  "Lookup: invalid data, token: \"" + std::string(*itr) +
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.h b/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.h
index 1b6ecf2c2af..4abc5744503 100644
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.h
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.h
@@ -23,9 +23,9 @@
 #include
 #include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/kernels/tensor_op.h"
 #include "minddata/dataset/util/status.h"
-#include "minddata/dataset/text/vocab.h"

 namespace mindspore {
 namespace dataset {
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h b/mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h
index bb2c16bd35c..89f02b2b543 100644
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h
@@ -24,10 +24,10 @@
 #include

 #include "minddata/dataset/include/dataset/constants.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/kernels/tensor_op.h"
 #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
 #include "minddata/dataset/util/status.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"

 namespace mindspore {
 namespace dataset {
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.cc b/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.cc
index d62346a145a..6a7659221e3 100644
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.cc
@@ -46,7 +46,7 @@ Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const Ru
   if (start > 0) {
     word = suffix_indicator_ + word;
   }
-  if (vocab_->Lookup(word) != Vocab::kNoTokenExists) {
+  if (vocab_->TokensToIds(word) != Vocab::kNoTokenExists) {
     *out_found = true;
     break;
   }
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.h b/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.h
index 7c959cda9c2..9766b67a5eb 100644
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.h
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.h
@@ -23,9 +23,9 @@
 #include "cppjieba/Unicode.hpp"

 #include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/kernels/tensor_op.h"
 #include "minddata/dataset/text/kernels/tokenizer_op.h"
-#include "minddata/dataset/text/vocab.h"
 #include "minddata/dataset/util/status.h"

 using cppjieba::DecodeRunesInString;
diff --git a/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc b/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc
index f2b9345043b..ce1b4e96415 100644
--- a/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc
+++ b/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc
@@ -14,16 +14,18 @@
  * limitations under the License.
*/ -#include "minddata/dataset/text/sentence_piece_vocab.h" - -#include #include +#include + #include +#include "include/common/utils/utils.h" +#include "minddata/dataset/include/dataset/constants.h" +#include "minddata/dataset/include/dataset/text.h" +#include "minddata/dataset/util/path.h" +#include "minddata/dataset/util/status.h" #include "utils/file_utils.h" #include "utils/ms_utils.h" -#include "include/common/utils/utils.h" -#include "minddata/dataset/util/path.h" namespace mindspore { namespace dataset { diff --git a/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h b/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h deleted file mode 100644 index 4520c2b6040..00000000000 --- a/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h +++ /dev/null @@ -1,50 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_ -#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_ - -#include -#include -#include -#include -#include "minddata/dataset/util/status.h" -#include "minddata/dataset/include/dataset/constants.h" - -namespace mindspore { -namespace dataset { - -class SentencePieceVocab { - public: - static Status BuildFromFile(const std::vector &path_list, const int32_t vocab_size, - const float character_coverage, const SentencePieceModel model_type, - const std::unordered_map ¶ms, - std::shared_ptr *vocab); - static Status SaveModel(const std::shared_ptr *vocab, std::string path, std::string filename); - SentencePieceVocab(); - - ~SentencePieceVocab() = default; - - const std::string &model_proto(); - - void set_model_proto(const std::string model_proto); - - private: - std::string model_proto_; -}; -} // namespace dataset -} // namespace mindspore -#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_ diff --git a/mindspore/ccsrc/minddata/dataset/text/vocab.cc b/mindspore/ccsrc/minddata/dataset/text/vocab.cc index 82dbb48b0f5..e1151255189 100644 --- a/mindspore/ccsrc/minddata/dataset/text/vocab.cc +++ b/mindspore/ccsrc/minddata/dataset/text/vocab.cc @@ -14,14 +14,14 @@ * limitations under the License. */ -#include "minddata/dataset/text/vocab.h" - -#include -#include -#include -#include #include +#include +#include +#include +#include +#include "minddata/dataset/include/dataset/text.h" +#include "minddata/dataset/util/status.h" #include "utils/file_utils.h" #ifndef ENABLE_ANDROID #include "utils/log_adapter.h" @@ -33,18 +33,18 @@ namespace mindspore { namespace dataset { Vocab::Vocab(std::unordered_map word2id) { word2id_ = std::move(word2id); } -WordIdType Vocab::Lookup(const WordType &word) const { +WordIdType Vocab::TokensToIds(const WordType &word) const { auto itr = word2id_.find(word); return itr == word2id_.end() ? 
kNoTokenExists : itr->second;
 }

-std::vector<WordIdType> Vocab::Lookup(const std::vector<WordType> &words) const {
+std::vector<WordIdType> Vocab::TokensToIds(const std::vector<WordType> &words) const {
   std::vector<WordIdType> ids;
-  std::transform(words.begin(), words.end(), std::back_inserter(ids), [this](auto w) { return Lookup(w); });
+  std::transform(words.begin(), words.end(), std::back_inserter(ids), [this](auto w) { return TokensToIds(w); });
   return ids;
 }

-WordType Vocab::ReverseLookup(const WordIdType &id) {
+WordType Vocab::IdsToTokens(const WordIdType &id) {
   // Lazy initialization: reverse lookup is uncommon, so building id2word_ eagerly would waste memory.
   if (id2word_.empty()) {
     for (const auto [word_, id_] : word2id_) {
@@ -55,7 +55,7 @@
   return itr == id2word_.end() ? kNoIdExists : itr->second;
 }

-std::vector<WordType> Vocab::ReverseLookup(const std::vector<WordIdType> &ids) {
+std::vector<WordType> Vocab::IdsToTokens(const std::vector<WordIdType> &ids) {
   // Lazy initialization: reverse lookup is uncommon, so building id2word_ eagerly would waste memory.
   if (id2word_.empty()) {
     for (const auto [word_, id_] : word2id_) {
@@ -63,50 +63,11 @@
     }
   }
   std::vector<WordType> words;
-  std::transform(ids.begin(), ids.end(), std::back_inserter(words), [this](auto i) { return ReverseLookup(i); });
+  std::transform(ids.begin(), ids.end(), std::back_inserter(words), [this](auto i) { return IdsToTokens(i); });
   return words;
 }

-#ifdef ENABLE_PYTHON
-Status Vocab::BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special,
-                              std::shared_ptr<Vocab> *vocab) {
-  if (vocab == nullptr) {
-    RETURN_STATUS_UNEXPECTED("Vocab::BuildFromPyList: input vocab can not be null");
-  }
-  // check of duplication on both words and special_tokens will be performed in python
-  // special_tokens and words both need to be unique, and shouldn't overlap
-  std::unordered_map<WordType, WordIdType> word2id;
-  // if special is added in front, normal words id will start from number of special tokens
-  WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
-
-  for (auto word : words) {
-    word2id[py::str(word)] = word_id++;
-  }
-
-  word_id = prepend_special ?
0 : word2id.size(); - - for (auto special_token : special_tokens) { - word2id[py::str(special_token)] = word_id++; - } - - *vocab = std::make_shared(std::move(word2id)); - return Status::OK(); -} - -Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr *vocab) { - if (vocab == nullptr) { - RETURN_STATUS_UNEXPECTED("Vocab::BuildFromPyDict: input vocab can not be null"); - } - std::unordered_map word2id; - for (auto p : words) { - word2id[py::str(p.first)] = py::reinterpret_borrow(p.second); - } - *vocab = std::make_shared(std::move(word2id)); - return Status::OK(); -} -#endif - -void Vocab::append_word(const std::string &word) { +void Vocab::AppendWord(const std::string &word) { if (word2id_.find(word) == word2id_.end()) { word2id_[word] = word2id_.size(); } @@ -161,11 +122,11 @@ Status Vocab::BuildFromVector(const std::vector &words, const std::vec return Status::OK(); } -Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size, - const std::vector &special_tokens, bool prepend_special, - std::shared_ptr *vocab) { +Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, + const std::vector &special_tokens, bool prepend_special, + std::shared_ptr *vocab) { if (vocab == nullptr) { - RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFileCpp: input vocab can not be null"); + RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFile: input vocab can not be null"); } // Validate parameters auto realpath = FileUtils::GetRealPath(path.c_str()); @@ -227,56 +188,6 @@ Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delim return Status::OK(); } -Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, - const py::list &special_tokens, bool prepend_special, std::shared_ptr *vocab) { - if (vocab == nullptr) { - RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFile: input vocab can not be null"); - } - // python validator checks special_tokens doesn't contain any duplicate words - std::unordered_set specials; - // used to check that words in file don't contain any special token that already exists - for (auto word : special_tokens) { - specials.insert(py::str(word)); - } - WordIdType word_id = prepend_special ? static_cast(special_tokens.size()) : 0; - std::unordered_map word2id; - - auto realpath = FileUtils::GetRealPath(path.c_str()); - if (!realpath.has_value()) { - RETURN_STATUS_UNEXPECTED("Get real path failed, path=" + path); - } - - std::fstream handle(realpath.value(), std::ios::in); - CHECK_FAIL_RETURN_UNEXPECTED(handle.good() && handle.is_open(), "from_file: fail to open:" + path); - std::string word; - while (std::getline(handle, word)) { - if (!delimiter.empty()) { - // if delimiter is not found, find_first_of would return std::string::npos which is -1 - word = word.substr(0, word.find_first_of(delimiter)); - } - if (word2id.find(word) != word2id.end()) { - handle.close(); - RETURN_STATUS_UNEXPECTED("from_file: duplicate word:" + word + "."); - } - if (specials.find(word) != specials.end()) { - handle.close(); - RETURN_STATUS_UNEXPECTED("from_file: special_tokens and word_list contain duplicate word:" + word); - } - word2id[word] = word_id++; - // break if enough row is read, if vocab_size is smaller than 0 - if (word2id.size() == vocab_size) break; - } - handle.close(); - word_id = prepend_special ? 
0 : word2id.size(); - - for (auto special_token : special_tokens) { - word2id[py::str(special_token)] = word_id++; - } - - *vocab = std::make_shared(std::move(word2id)); - return Status::OK(); -} - const WordIdType Vocab::kNoTokenExists = -1; const WordType Vocab::kNoIdExists = std::string(); diff --git a/mindspore/ccsrc/minddata/dataset/text/vocab.h b/mindspore/ccsrc/minddata/dataset/text/vocab.h deleted file mode 100644 index 2d08a1e94a5..00000000000 --- a/mindspore/ccsrc/minddata/dataset/text/vocab.h +++ /dev/null @@ -1,143 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_ -#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_ - -#include -#include -#include -#include - -#include "minddata/dataset/util/status.h" -#ifdef ENABLE_PYTHON -#include "pybind11/pybind11.h" -#include "pybind11/stl.h" -#endif - -namespace mindspore { -namespace dataset { -#ifdef ENABLE_PYTHON -namespace py = pybind11; -#endif - -using WordIdType = int32_t; -using WordType = std::string; - -class Vocab { - public: -#ifdef ENABLE_PYTHON - // Build a vocab from a python dictionary key is each word ,id needs to start from 2, no duplicate and continuous - // @param const py::dict &words - a dictionary containing word, word id pair. - // @param std::shared_ptr *vocab - return value, vocab object - // @return error code - static Status BuildFromPyDict(const py::dict &words, std::shared_ptr *vocab); - - // Build a vocab from a python list, id will be assigned automatically, start from 2 - // @param const py::list &words - a list of string, used to build vocab, id starts from 2 - // @param std::shared_ptr *vocab - return value, vocab object - // @return error code - static Status BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special, - std::shared_ptr *vocab); - - // Build a vocab from reading a vocab file, id are automatically assigned, start from 2 - // @param std::string &path - path to vocab file , each line is assumed to contain 1 word - // @param std::string &delimiter - delimiter to break each line with - // @param int32_t vocab_size - number of words to read from file - // @param std::shared_ptr *vocab - return value, vocab object - // @return error code - static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, - const py::list &special_tokens, bool prepend_special, std::shared_ptr *vocab); -#endif - - /// \brief Build a vocab from a c++ map. id needs to start from 2, no duplicate and continuous - /// \param[in] words An unordered_map containing word, word id pair. - /// \param[out] vocab A vocab object - /// \return Error code - static Status BuildFromUnorderedMap(const std::unordered_map &words, - std::shared_ptr *vocab); - - /// \brief Build a vocab from a c++ vector. 
id needs to start from 2, no duplicate and continuous - /// \param[in] words A vector of string, used to build vocab, id starts from 2 - /// \param[in] special_tokens A vector of string contain special tokens - /// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab - /// \param[out] vocab A vocab object - /// \return Error code - static Status BuildFromVector(const std::vector &words, const std::vector &special_tokens, - bool prepend_special, std::shared_ptr *vocab); - - /// \brief Build a vocab from reading a vocab file, id are automatically assigned, start from 2 - /// \param[in] path Path to vocab file , each line is assumed to contain 1 word - /// \param[in] delimiter Delimiter to break each line with - /// \param[in] vocab_size Number of words to read from file - /// \param[in] special_tokens A vector of string contain special tokens - /// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab - /// \param[out] vocab A vocab object - /// \return Error code - static Status BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size, - const std::vector &special_tokens, bool prepend_special, - std::shared_ptr *vocab); - - // Lookup the id of a word, if word doesn't exist in vocab, return default_id - // @param const WordType word - word to look up - // @param WordIdType default_id - word id to return to user when its not in the vocab - // @return WordIdType, word_id - WordIdType Lookup(const WordType &word) const; - - // Lookup the ids of a vector of words, if word doesn't exist in vocab, return default_id - // @param const WordType word - word to look up - // @param WordIdType default_id - word id to return to user when its not in the vocab - // @return WordIdType, word_id - std::vector Lookup(const std::vector &words) const; - - // Find the word of a id, if word doesn't exist in vocab, return empty string - // @param const WordIdType id - id to reverse look up - // @return WordType, word - WordType ReverseLookup(const WordIdType &id); - - // Find the words of a vector of ids, if word doesn't exist in vocab, return empty string - // @param const WordIdType id - id to reverse look up - // @return WordType, word - std::vector ReverseLookup(const std::vector &ids); - - // constructor, shouldn't be called directly, can't be private due to std::make_unique() - // @param std::unordered_map map - sanitized word2id map - explicit Vocab(std::unordered_map map); - - Vocab() = default; - - // add one word to vocab, increment it's index automatically - // @param std::string & word - word to be added will skip if word already exists - void append_word(const std::string &word); - - // return a read-only vocab - const std::unordered_map vocab() { return word2id_; } - - // destructor - ~Vocab() = default; - - static const WordIdType kNoTokenExists; - static const WordType kNoIdExists; - - private: - std::unordered_map word2id_; - std::unordered_map id2word_; -}; - -} // namespace dataset -} // namespace mindspore - -#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_ diff --git a/mindspore/python/mindspore/dataset/text/transforms.py b/mindspore/python/mindspore/dataset/text/transforms.py index f7b69c5beca..ed23651b50d 100644 --- a/mindspore/python/mindspore/dataset/text/transforms.py +++ b/mindspore/python/mindspore/dataset/text/transforms.py @@ -47,7 +47,7 @@ import numpy as np import mindspore._c_dataengine as cde from mindspore.common import dtype as mstype -from .utils import JiebaMode, 
NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType +from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType, SentencePieceVocab from .validators import check_lookup, check_jieba_add_dict, check_to_vectors, \ check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \ check_wordpiece_tokenizer, check_regex_replace, check_regex_tokenizer, check_basic_tokenizer, check_ngram, \ @@ -386,6 +386,7 @@ class SentencePieceTokenizer(TextTensorOperation): self.out_type = out_type def parse(self): + self.mode = self.mode.c_sentence_piece_vocab if isinstance(self.mode, SentencePieceVocab) else self.mode return cde.SentencePieceTokenizerOperation(self.mode, DE_C_INTER_SENTENCEPIECE_OUTTYPE[self.out_type]) diff --git a/mindspore/python/mindspore/dataset/text/utils.py b/mindspore/python/mindspore/dataset/text/utils.py index 0ee4d472bbc..51a8e5b8435 100644 --- a/mindspore/python/mindspore/dataset/text/utils.py +++ b/mindspore/python/mindspore/dataset/text/utils.py @@ -141,7 +141,7 @@ class Vocab: >>> dataset = dataset.map(operations=text.Lookup(vocab, ""), input_columns=["text"]) """ - vocab = Vocab() + vocab = cls() vocab.c_vocab = dataset.build_vocab(columns, freq_range, top_k, special_tokens, special_first) return vocab @@ -211,7 +211,7 @@ class Vocab: vocab_size = -1 if special_tokens is None: special_tokens = [] - vocab = Vocab() + vocab = cls() vocab.c_vocab = cde.Vocab.from_file(file_path, delimiter, vocab_size, special_tokens, special_first) return vocab @@ -232,16 +232,19 @@ class Vocab: >>> vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "": 6}) """ - vocab = Vocab() + vocab = cls() vocab.c_vocab = cde.Vocab.from_dict(word_dict) return vocab -class SentencePieceVocab(cde.SentencePieceVocab): +class SentencePieceVocab: """ SentencePiece object that is used to do words segmentation. """ + def __init__(self): + self.c_sentence_piece_vocab = None + @classmethod @check_from_dataset_sentencepiece def from_dataset(cls, dataset, col_names, vocab_size, character_coverage, model_type, params): @@ -278,8 +281,11 @@ class SentencePieceVocab(cde.SentencePieceVocab): ... SentencePieceModel.UNIGRAM, {}) """ - return dataset.build_sentencepiece_vocab(col_names, vocab_size, character_coverage, - model_type, params) + sentence_piece_vocab = cls() + sentence_piece_vocab.c_sentence_piece_vocab = dataset.build_sentencepiece_vocab(col_names, vocab_size, + character_coverage, + model_type, params) + return sentence_piece_vocab @classmethod @check_from_file_sentencepiece @@ -321,8 +327,11 @@ class SentencePieceVocab(cde.SentencePieceVocab): ... 
SentencePieceModel.UNIGRAM, {}) """ - return super().from_file(file_path, vocab_size, character_coverage, - DE_C_INTER_SENTENCEPIECE_MODE[model_type], params) + sentence_piece_vocab = cls() + sentence_piece_vocab.c_sentence_piece_vocab = \ + cde.SentencePieceVocab.from_file(file_path, vocab_size, character_coverage, + DE_C_INTER_SENTENCEPIECE_MODE[model_type], params) + return sentence_piece_vocab @classmethod @check_save_model @@ -342,7 +351,7 @@ class SentencePieceVocab(cde.SentencePieceVocab): >>> text.SentencePieceVocab.save_model(vocab, "./", "m.model") """ - super().save_model(vocab, path, filename) + cde.SentencePieceVocab.save_model(vocab.c_sentence_piece_vocab, path, filename) def to_str(array, encoding='utf8'): diff --git a/mindspore/python/mindspore/dataset/text/validators.py b/mindspore/python/mindspore/dataset/text/validators.py index 5ac05e3932c..b76d617792d 100644 --- a/mindspore/python/mindspore/dataset/text/validators.py +++ b/mindspore/python/mindspore/dataset/text/validators.py @@ -551,7 +551,7 @@ def check_save_model(method): [vocab, path, filename], _ = parse_user_args(method, *args, **kwargs) if vocab is not None: - type_check(vocab, (cde.SentencePieceVocab,), "vocab") + type_check(vocab, (text.SentencePieceVocab,), "vocab") if path is not None: type_check(path, (str,), "path") @@ -573,7 +573,7 @@ def check_sentence_piece_tokenizer(method): def new_method(self, *args, **kwargs): [mode, out_type], _ = parse_user_args(method, *args, **kwargs) - type_check(mode, (str, cde.SentencePieceVocab), "mode is not an instance of str or cde.SentencePieceVocab.") + type_check(mode, (str, text.SentencePieceVocab), "mode is not an instance of str or text.SentencePieceVocab.") type_check(out_type, (SPieceTokenizerOutType,), "out_type is not an instance of SPieceTokenizerOutType") return method(self, *args, **kwargs) diff --git a/tests/ut/cpp/dataset/build_vocab_test.cc b/tests/ut/cpp/dataset/build_vocab_test.cc index 23013fd90c5..01812000816 100644 --- a/tests/ut/cpp/dataset/build_vocab_test.cc +++ b/tests/ut/cpp/dataset/build_vocab_test.cc @@ -20,7 +20,7 @@ #include "common/common.h" #include "include/api/status.h" -#include "minddata/dataset/text/vocab.h" +#include "minddata/dataset/include/dataset/text.h" using mindspore::dataset::Tensor; using mindspore::dataset::Vocab; @@ -47,7 +47,7 @@ TEST_F(MindDataTestVocab, TestVocabFromUnorderedMap) { std::vector words = {"apple", "dog", "egg"}; std::vector expected = {1, 3, -1}; for (uint32_t i = 0; i < words.size(); ++i) { - int32_t x = vocab->Lookup(words[i]); + int32_t x = vocab->TokensToIds(words[i]); EXPECT_EQ(x, expected[i]); } } @@ -65,7 +65,7 @@ TEST_F(MindDataTestVocab, TestVocabFromEmptyMap) { std::vector words = {"apple", "dog", "egg"}; std::vector expected = {-1, -1, -1}; for (uint32_t i = 0; i < words.size(); ++i) { - int32_t x = vocab->Lookup(words[i]); + int32_t x = vocab->TokensToIds(words[i]); EXPECT_EQ(x, expected[i]); } } @@ -96,7 +96,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorPrependSpTokens) { std::vector words = {"apple", "banana", "fox"}; std::vector expected = {1, 2, -1}; for (uint32_t i = 0; i < words.size(); ++i) { - int32_t x = vocab->Lookup(words[i]); + int32_t x = vocab->TokensToIds(words[i]); EXPECT_EQ(x, expected[i]); } } @@ -113,7 +113,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorAppendSpTokens) { std::vector words = {"apple", "", "fox"}; std::vector expected = {0, 5, -1}; for (uint32_t i = 0; i < words.size(); ++i) { - int32_t x = vocab->Lookup(words[i]); + int32_t x = 
vocab->TokensToIds(words[i]); EXPECT_EQ(x, expected[i]); } } @@ -131,7 +131,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorWithNoSpTokens) { std::vector words = {"apple", "banana", "fox", ""}; std::vector expected = {0, 1, -1, -1}; for (uint32_t i = 0; i < words.size(); ++i) { - int32_t x = vocab->Lookup(words[i]); + int32_t x = vocab->TokensToIds(words[i]); EXPECT_EQ(x, expected[i]); } } @@ -149,7 +149,7 @@ TEST_F(MindDataTestVocab, TestVocabFromEmptyVector) { std::vector words = {"apple", "banana", "fox"}; std::vector expected = {-1, -1, -1}; for (uint32_t i = 0; i < words.size(); ++i) { - int32_t x = vocab->Lookup(words[i]); + int32_t x = vocab->TokensToIds(words[i]); EXPECT_EQ(x, expected[i]); } } @@ -195,14 +195,14 @@ TEST_F(MindDataTestVocab, TestVocabFromFile) { // Build vocab from local file std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; std::shared_ptr vocab = std::make_shared(); - Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"", ""}, true, &vocab); + Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"", ""}, true, &vocab); EXPECT_EQ(s, Status::OK()); // Look up specified words std::vector words = {"not", "all"}; std::vector expected = {2, 3}; for (uint32_t i = 0; i < words.size(); ++i) { - int32_t x = vocab->Lookup(words[i]); + int32_t x = vocab->TokensToIds(words[i]); EXPECT_EQ(x, expected[i]); } } @@ -212,7 +212,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail1) { // Build vocab from local file which is not exist std::string vocab_dir = datasets_root_path_ + "/testVocab/not_exist.txt"; std::shared_ptr vocab = std::make_shared(); - Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {}, true, &vocab); + Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {}, true, &vocab); EXPECT_NE(s, Status::OK()); } @@ -223,7 +223,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail2) { std::shared_ptr vocab = std::make_shared(); // Expected failure: vocab_size should be either -1 or positive integer - Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab); + Status s = Vocab::BuildFromFile(vocab_dir, ",", -2, {}, true, &vocab); EXPECT_NE(s, Status::OK()); } @@ -234,7 +234,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail3) { std::shared_ptr vocab = std::make_shared(); // Expected failure: duplicate special token - Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"", ""}, true, &vocab); + Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"", ""}, true, &vocab); EXPECT_NE(s, Status::OK()); } @@ -245,6 +245,6 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail4) { std::shared_ptr vocab = std::make_shared(); // Expected failure: special_tokens and word_list contain duplicate word - Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"home"}, true, &vocab); + Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"home"}, true, &vocab); EXPECT_NE(s, Status::OK()); } diff --git a/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc b/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc index 4668f641ca4..c1c80917237 100644 --- a/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc +++ b/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc @@ -23,7 +23,6 @@ #include "minddata/dataset/include/dataset/datasets.h" #include "minddata/dataset/include/dataset/text.h" #include "minddata/dataset/include/dataset/transforms.h" -#include "minddata/dataset/text/sentence_piece_vocab.h" using namespace mindspore::dataset; using mindspore::dataset::SentencePieceModel; diff --git 
a/tests/ut/cpp/dataset/c_api_text_test.cc b/tests/ut/cpp/dataset/c_api_text_test.cc index ce6a51bccef..51566e92cc3 100644 --- a/tests/ut/cpp/dataset/c_api_text_test.cc +++ b/tests/ut/cpp/dataset/c_api_text_test.cc @@ -27,7 +27,6 @@ #include "minddata/dataset/text/fast_text.h" #include "minddata/dataset/text/glove.h" #include "minddata/dataset/text/vectors.h" -#include "minddata/dataset/text/vocab.h" using namespace mindspore::dataset; using mindspore::Status; @@ -797,7 +796,7 @@ TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) { // Iterate the dataset and get each row std::unordered_map row; ASSERT_OK(iter->GetNextRow(&row)); - std::vector expected = {"welcome to beijing","",""}; + std::vector expected = {"welcome to beijing", "", ""}; uint64_t i = 0; @@ -806,7 +805,7 @@ TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) { std::shared_ptr de_expected_tensor; ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor)); mindspore::MSTensor ms_expected_tensor = - mindspore::MSTensor(std::make_shared(de_expected_tensor)); + mindspore::MSTensor(std::make_shared(de_expected_tensor)); EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor); ASSERT_OK(iter->GetNextRow(&row)); i++; @@ -1709,8 +1708,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail1) { EXPECT_NE(ds, nullptr); // Create ToNumber operation on ds - std::shared_ptr to_number = - std::make_shared(mindspore::DataType::kNumberTypeInt8); + std::shared_ptr to_number = std::make_shared(mindspore::DataType::kNumberTypeInt8); EXPECT_NE(to_number, nullptr); // Create a Map operation on ds @@ -1760,7 +1758,8 @@ TEST_F(MindDataTestPipeline, TestToNumberFail2) { EXPECT_NE(ds, nullptr); // Create ToNumber operation on ds - std::shared_ptr to_number = std::make_shared(mindspore::DataType::kNumberTypeFloat16); + std::shared_ptr to_number = + std::make_shared(mindspore::DataType::kNumberTypeFloat16); EXPECT_NE(to_number, nullptr); // Create a Map operation on ds @@ -2143,8 +2142,7 @@ TEST_F(MindDataTestPipeline, TestNgramSuccess1) { ASSERT_OK(iter->GetNextRow(&row)); std::vector> expected = { - {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a", - "is-a-text", + {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a", "is-a-text", "a-text-file.", "text-file.-&", "file.-&-&"}, {"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every", "happy-every-day.", "every-day.-&", "day.-&-&"}, @@ -4371,8 +4369,7 @@ TEST_F(MindDataTestPipeline, TestGloVeDefaultParam) { Status s = GloVe::BuildFromFile(&glove, vectors_dir); EXPECT_EQ(s, Status::OK()); - std::shared_ptr lookup = - std::make_shared(glove); + std::shared_ptr lookup = std::make_shared(glove); EXPECT_NE(lookup, nullptr); // Create Map operation on ds @@ -4388,14 +4385,13 @@ TEST_F(MindDataTestPipeline, TestGloVeDefaultParam) { ASSERT_OK(iter->GetNextRow(&row)); uint64_t i = 0; - std::vector> expected = { - {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, - {0, 0, 0, 0, 0, 0}, - {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, - {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, - {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, - {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, - {0, 0, 0, 0, 0, 0}}; + std::vector> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, + {0, 0, 0, 0, 0, 0}, + {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, + {0.70853, 0.57088, -0.4716, 0.18048, 
0.54449, 0.72603}, + {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, + {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, + {0, 0, 0, 0, 0, 0}}; while (row.size() != 0) { auto ind = row["text"]; MS_LOG(INFO) << ind.Shape(); @@ -4434,8 +4430,7 @@ TEST_F(MindDataTestPipeline, TestGloVeAllBuildfromfileParams) { Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100); EXPECT_EQ(s, Status::OK()); - std::shared_ptr lookup = - std::make_shared(glove); + std::shared_ptr lookup = std::make_shared(glove); EXPECT_NE(lookup, nullptr); // Create Map operation on ds @@ -4451,14 +4446,13 @@ TEST_F(MindDataTestPipeline, TestGloVeAllBuildfromfileParams) { ASSERT_OK(iter->GetNextRow(&row)); uint64_t i = 0; - std::vector> expected = { - {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, - {0, 0, 0, 0, 0, 0}, - {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, - {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, - {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, - {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, - {0, 0, 0, 0, 0, 0}}; + std::vector> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, + {0, 0, 0, 0, 0, 0}, + {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, + {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, + {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, + {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, + {0, 0, 0, 0, 0, 0}}; while (row.size() != 0) { auto ind = row["text"]; MS_LOG(INFO) << ind.Shape(); @@ -4498,8 +4492,7 @@ TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) { EXPECT_EQ(s, Status::OK()); std::vector unknown_init = {-1, -1, -1, -1, -1, -1}; - std::shared_ptr lookup = - std::make_shared(glove, unknown_init); + std::shared_ptr lookup = std::make_shared(glove, unknown_init); EXPECT_NE(lookup, nullptr); // Create Map operation on ds @@ -4515,14 +4508,13 @@ TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) { ASSERT_OK(iter->GetNextRow(&row)); uint64_t i = 0; - std::vector> expected = { - {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, - {-1, -1, -1, -1, -1, -1}, - {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, - {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, - {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, - {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, - {-1, -1, -1, -1, -1, -1}}; + std::vector> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, + {-1, -1, -1, -1, -1, -1}, + {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, + {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, + {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, + {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, + {-1, -1, -1, -1, -1, -1}}; while (row.size() != 0) { auto ind = row["text"]; MS_LOG(INFO) << ind.Shape(); @@ -4562,8 +4554,7 @@ TEST_F(MindDataTestPipeline, TestGloVeAllParams) { EXPECT_EQ(s, Status::OK()); std::vector unknown_init = {-1, -1, -1, -1, -1, -1}; - std::shared_ptr lookup = - std::make_shared(glove, unknown_init, true); + std::shared_ptr lookup = std::make_shared(glove, unknown_init, true); EXPECT_NE(lookup, nullptr); // Create Map operation on ds @@ -4579,14 +4570,13 @@ TEST_F(MindDataTestPipeline, TestGloVeAllParams) { ASSERT_OK(iter->GetNextRow(&row)); uint64_t i = 0; - std::vector> expected = { - {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, - {-1, -1, -1, -1, -1, -1}, - {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, - {0.70853, 0.57088, 
-    {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
-    {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
-    {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
-    {-1, -1, -1, -1, -1, -1}};
+  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
+                                              {-1, -1, -1, -1, -1, -1},
+                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
+                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
+                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
+                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
+                                              {-1, -1, -1, -1, -1, -1}};
   while (row.size() != 0) {
     auto ind = row["text"];
     MS_LOG(INFO) << ind.Shape();
@@ -4748,13 +4738,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramDefaultParam) {
   ASSERT_OK(iter->GetNextRow(&row));
 
   uint64_t i = 0;
-  std::vector<std::vector<float>> expected = {{0,0,0,0,0},
-                                              {0,0,0,0,0},
-                                              {0.117336,0.362446,-0.983326,0.939264,-0.05648},
-                                              {0.657201,2.11761,-1.59276,0.432072,1.21395},
-                                              {0,0,0,0,0},
-                                              {-2.26956,0.288491,-0.740001,0.661703,0.147355},
-                                              {0,0,0,0,0}};
+  std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
+                                              {0, 0, 0, 0, 0},
+                                              {0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
+                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
+                                              {0, 0, 0, 0, 0},
+                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
+                                              {0, 0, 0, 0, 0}};
   while (row.size() != 0) {
     auto ind = row["text"];
     MS_LOG(INFO) << ind.Shape();
@@ -4810,13 +4800,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramAllBuildfromfileParams) {
   ASSERT_OK(iter->GetNextRow(&row));
 
   uint64_t i = 0;
-  std::vector<std::vector<float>> expected = {{0,0,0,0,0},
-                                              {0,0,0,0,0},
-                                              {-0.155665,0.664073,-0.538499,1.22657,-0.2162},
-                                              {0.657201,2.11761,-1.59276,0.432072,1.21395},
-                                              {0,0,0,0,0},
-                                              {-2.26956,0.288491,-0.740001,0.661703,0.147355},
-                                              {0,0,0,0,0}};
+  std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
+                                              {0, 0, 0, 0, 0},
+                                              {-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
+                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
+                                              {0, 0, 0, 0, 0},
+                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
+                                              {0, 0, 0, 0, 0}};
   while (row.size() != 0) {
     auto ind = row["text"];
     MS_LOG(INFO) << ind.Shape();
@@ -4873,13 +4863,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramUnknownInit) {
   ASSERT_OK(iter->GetNextRow(&row));
 
   uint64_t i = 0;
-  std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1},
-                                              {-1,-1,-1,-1,-1},
-                                              {-0.155665,0.664073,-0.538499,1.22657,-0.2162},
-                                              {0.657201,2.11761,-1.59276,0.432072,1.21395},
-                                              {-1,-1,-1,-1,-1},
-                                              {-2.26956,0.288491,-0.740001,0.661703,0.147355},
-                                              {-1,-1,-1,-1,-1}};
+  std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
+                                              {-1, -1, -1, -1, -1},
+                                              {-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
+                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
+                                              {-1, -1, -1, -1, -1},
+                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
+                                              {-1, -1, -1, -1, -1}};
   while (row.size() != 0) {
     auto ind = row["text"];
     MS_LOG(INFO) << ind.Shape();
@@ -4936,13 +4926,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramAllParams) {
   ASSERT_OK(iter->GetNextRow(&row));
 
   uint64_t i = 0;
-  std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1},
-                                              {-1,-1,-1,-1,-1},
-                                              {0.117336,0.362446,-0.983326,0.939264,-0.05648},
-                                              {0.657201,2.11761,-1.59276,0.432072,1.21395},
-                                              {-1,-1,-1,-1,-1},
-                                              {-2.26956,0.288491,-0.740001,0.661703,0.147355},
-                                              {-1,-1,-1,-1,-1}};
+  std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
+                                              {-1, -1, -1, -1, -1},
+                                              {0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
+                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
+                                              {-1, -1, -1, -1, -1},
+                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
+                                              {-1, -1, -1, -1, -1}};
   while (row.size() != 0) {
     auto ind = row["text"];
     MS_LOG(INFO) << ind.Shape();
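The GloVe and CharNGram hunks above only reflow test data; the lookup pattern they exercise is unchanged. For reference, a minimal sketch of that pattern, assuming a pre-built vectors file ("fake_glove.txt" is a placeholder path, not a file from this patch):

    // Build a GloVe table from file, then map tokens to vectors with text::ToVectors.
    #include "minddata/dataset/include/dataset/text.h"

    using namespace mindspore::dataset;

    std::shared_ptr<GloVe> glove;
    Status s = GloVe::BuildFromFile(&glove, "fake_glove.txt", 100);  // keep at most 100 vectors

    // Tokens missing from the table map to unknown_init; the trailing bool
    // lower-cases each token before the lookup.
    std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
    std::shared_ptr<TensorTransform> lookup =
        std::make_shared<text::ToVectors>(glove, unknown_init, true);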
diff --git a/tests/ut/cpp/dataset/c_api_text_vocab_test.cc b/tests/ut/cpp/dataset/c_api_text_vocab_test.cc
index 4c16e56de6d..f9c1736ecdf 100644
--- a/tests/ut/cpp/dataset/c_api_text_vocab_test.cc
+++ b/tests/ut/cpp/dataset/c_api_text_vocab_test.cc
@@ -22,7 +22,6 @@
 #include "minddata/dataset/include/dataset/datasets.h"
 #include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/include/dataset/transforms.h"
-#include "minddata/dataset/text/vocab.h"
 
 using namespace mindspore::dataset;
 using mindspore::Status;
@@ -42,7 +41,7 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
   } while (false)
 
 /// Feature: C++ text.Vocab class.
-/// Description: test Lookup() ReverseLookup() methods of text::Vocab.
+/// Description: test TokensToIds() IdsToTokens() methods of text::Vocab.
 /// Expectation: success.
 TEST_F(MindDataTestPipeline, TestVocabLookupAndReverseLookup) {
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupAndReverseLookup.";
@@ -53,30 +52,30 @@ TEST_F(MindDataTestPipeline, TestVocabLookupAndReverseLookup) {
   EXPECT_EQ(s, Status::OK());
 
   // lookup, convert token to id
-  auto single_index = vocab->Lookup("home");
+  auto single_index = vocab->TokensToIds("home");
   EXPECT_EQ(single_index, 2);
-  single_index = vocab->Lookup("hello");
+  single_index = vocab->TokensToIds("hello");
   EXPECT_EQ(single_index, -1);
 
   // lookup multiple tokens
-  auto multi_indexs = vocab->Lookup(std::vector<std::string>{"<pad>", "behind"});
+  auto multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "behind"});
   std::vector<int32_t> expected_multi_indexs = {0, 4};
   EXPECT_EQ(multi_indexs, expected_multi_indexs);
-  multi_indexs = vocab->Lookup(std::vector<std::string>{"<pad>", "apple"});
+  multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "apple"});
   expected_multi_indexs = {0, -1};
   EXPECT_EQ(multi_indexs, expected_multi_indexs);
 
   // reverse lookup, convert id to token
-  auto single_word = vocab->ReverseLookup(2);
+  auto single_word = vocab->IdsToTokens(2);
   EXPECT_EQ(single_word, "home");
-  single_word = vocab->ReverseLookup(-1);
+  single_word = vocab->IdsToTokens(-1);
   EXPECT_EQ(single_word, "");
 
   // reverse lookup multiple ids
-  auto multi_words = vocab->ReverseLookup(std::vector<int32_t>{0, 4});
+  auto multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 4});
   std::vector<std::string> expected_multi_words = {"<pad>", "behind"};
   EXPECT_EQ(multi_words, expected_multi_words);
-  multi_words = vocab->ReverseLookup(std::vector<int32_t>{0, 99});
+  multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 99});
   expected_multi_words = {"<pad>", ""};
   EXPECT_EQ(multi_words, expected_multi_words);
 }
@@ -330,7 +329,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
   EXPECT_NE(vocab, nullptr);
 
   // Check if vocab has words or not
-  int32_t home_index = vocab->Lookup("home");
+  int32_t home_index = vocab->TokensToIds("home");
   EXPECT_EQ(home_index, 4);
 
   // Create Lookup operation on ds
@@ -386,7 +385,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
   EXPECT_NE(vocab, nullptr);
 
   // Check if vocab has words or not
-  int32_t home_index = vocab->Lookup("home");
+  int32_t home_index = vocab->TokensToIds("home");
   EXPECT_EQ(home_index, 2);
 
   // Create Lookup operation on ds
@@ -509,7 +508,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) {
   EXPECT_NE(vocab, nullptr);
 
   // Check if vocab has words or not
-  int32_t home_index = vocab->Lookup("home");
+  int32_t home_index = vocab->TokensToIds("home");
   EXPECT_EQ(home_index, 2);
 
   // Create Lookup operation on ds
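The renamed Vocab methods tested above accept either a single token/id or a batch in one call. A minimal sketch of the new interface, assuming a vocab built with Vocab::BuildFromVector (the word list is illustrative, not the test's data file):

    #include "minddata/dataset/include/dataset/text.h"

    using namespace mindspore::dataset;

    std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
    // special_first=true places "<pad>" at id 0 and "<unk>" at id 1.
    Status s = Vocab::BuildFromVector({"home", "IS", "behind"}, {"<pad>", "<unk>"}, true, &vocab);

    auto id = vocab->TokensToIds("home");                                        // 2
    auto ids = vocab->TokensToIds(std::vector<std::string>{"<pad>", "behind"});  // {0, 4}
    auto token = vocab->IdsToTokens(2);                                          // "home"
    auto tokens = vocab->IdsToTokens(std::vector<int32_t>{0, 4});                // {"<pad>", "behind"}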
diff --git a/tests/ut/cpp/dataset/sentence_piece_vocab_op_test.cc b/tests/ut/cpp/dataset/sentence_piece_vocab_op_test.cc
index af0058fd6ba..a86a51c44e1 100644
--- a/tests/ut/cpp/dataset/sentence_piece_vocab_op_test.cc
+++ b/tests/ut/cpp/dataset/sentence_piece_vocab_op_test.cc
@@ -19,7 +19,7 @@
 #include "common/common.h"
 #include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
 #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/engine/datasetops/source/text_file_op.h"
 #include "gtest/gtest.h"
 #include "utils/log_adapter.h"
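With sentence_piece_vocab.h folded into the public dataset/text.h header, tests and callers now reach SentencePieceVocab through the same include as Vocab. A minimal sketch of the aligned C++ entry point, assuming a plain-text corpus file ("corpus.txt" is a placeholder) and no extra trainer params:

    #include "minddata/dataset/include/dataset/text.h"

    using namespace mindspore::dataset;

    std::shared_ptr<SentencePieceVocab> spm;
    std::vector<std::string> corpus_files = {"corpus.txt"};     // hypothetical input file
    std::unordered_map<std::string, std::string> params = {};   // extra trainer flags, if any
    Status s = SentencePieceVocab::BuildFromFile(corpus_files, 5000, 0.9995,
                                                 SentencePieceModel::kUnigram, params, &spm);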