!30982 Vocab and SentencePieceVocab C++ interface alignment and Python interface refactoring

Merge pull request !30982 from 刘勇琪/master-vocab-sentencepiecevocab
This commit is contained in:
i-robot 2022-03-10 07:58:41 +00:00 committed by Gitee
commit 872cb74d3f
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
27 changed files with 371 additions and 473 deletions

View File

@ -38,8 +38,7 @@
#include "minddata/dataset/util/status.h"
#ifndef ENABLE_ANDROID
#include "minddata/dataset/engine/ir/cache/dataset_cache_impl.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/include/dataset/text.h"
#endif
// Sampler headers (in alphabetical order)

View File

@ -23,7 +23,6 @@
#include "minddata/dataset/core/data_type.h"
#include "minddata/dataset/engine/serdes.h"
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/util/path.h"
// IR non-leaf nodes

View File

@ -19,12 +19,11 @@
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/text/char_n_gram.h"
#include "minddata/dataset/text/fast_text.h"
#include "minddata/dataset/text/glove.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/text/vocab.h"
namespace mindspore {
namespace dataset {
@ -32,28 +31,29 @@ PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) {
(void)py::class_<Vocab, std::shared_ptr<Vocab>>(*m, "Vocab")
.def(py::init<>())
.def_static("from_list",
[](const py::list &words, const py::list &special_tokens, bool special_first) {
[](const std::vector<std::string> &words,
const std::vector<std::string> &special_tokens, bool special_first) {
std::shared_ptr<Vocab> v;
THROW_IF_ERROR(Vocab::BuildFromPyList(words, special_tokens, special_first, &v));
THROW_IF_ERROR(Vocab::BuildFromVector(words, special_tokens, special_first, &v));
return v;
})
.def_static(
"from_file",
[](const std::string &path, const std::string &dlm, int32_t vocab_size,
const py::list &special_tokens, bool special_first) {
const std::vector<std::string> &special_tokens, bool special_first) {
std::shared_ptr<Vocab> v;
THROW_IF_ERROR(Vocab::BuildFromFile(path, dlm, vocab_size, special_tokens, special_first, &v));
return v;
})
.def_static("from_dict",
[](const py::dict &words) {
[](const std::unordered_map<WordType, WordIdType> &words) {
std::shared_ptr<Vocab> v;
THROW_IF_ERROR(Vocab::BuildFromPyDict(words, &v));
THROW_IF_ERROR(Vocab::BuildFromUnorderedMap(words, &v));
return v;
})
.def("tokens_to_ids",
[](Vocab &self, const std::vector<std::string> words) {
auto ids = self.Lookup(words);
auto ids = self.TokensToIds(words);
py::object ret;
if (ids.size() == 1) {
ret = py::int_(ids[0]);
@ -65,7 +65,7 @@ PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) {
})
.def("ids_to_tokens",
[](Vocab &self, const std::vector<int32_t> ids) {
auto words = self.ReverseLookup(ids);
auto words = self.IdsToTokens(ids);
py::object ret;
if (words.size() == 1) {
ret = py::str(words[0]);
@ -75,31 +75,19 @@ PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) {
}
return ret;
})
.def("vocab", [](Vocab &self) { return self.vocab(); });
.def("vocab", [](Vocab &self) { return self.GetVocab(); });
}));
PYBIND_REGISTER(SentencePieceVocab, 0, ([](const py::module *m) {
(void)py::class_<SentencePieceVocab, std::shared_ptr<SentencePieceVocab>>(*m, "SentencePieceVocab")
.def(py::init<>())
.def_static("from_file",
[](const py::list &paths, const int32_t vocab_size, const float character_coverage,
const SentencePieceModel model_type, const py::dict &params) {
[](const std::vector<std::string> &paths, const int32_t vocab_size,
const float character_coverage, const SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params) {
std::shared_ptr<SentencePieceVocab> v;
std::vector<std::string> path_list;
for (auto path : paths) {
path_list.emplace_back(py::str(path));
}
std::unordered_map<std::string, std::string> param_map;
for (auto param : params) {
std::string key = py::reinterpret_borrow<py::str>(param.first);
if (key == "input" || key == "vocab_size" || key == "model_prefix" ||
key == "character_coverage" || key == "model_type") {
continue;
}
param_map[key] = py::reinterpret_borrow<py::str>(param.second);
}
THROW_IF_ERROR(SentencePieceVocab::BuildFromFile(
path_list, vocab_size, character_coverage, model_type, param_map, &v));
paths, vocab_size, character_coverage, model_type, params, &v));
return v;
})
.def_static("save_model", [](const std::shared_ptr<SentencePieceVocab> *vocab, std::string path,

View File

@ -14,13 +14,12 @@
* limitations under the License.
*/
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/text/ir/kernels/text_ir.h"
#include "minddata/dataset/text/vectors.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl_bind.h"
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/text/ir/kernels/text_ir.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/text/vocab.h"
namespace mindspore {
namespace dataset {

View File

@ -24,7 +24,7 @@
#include <vector>
#include "minddata/dataset/engine/tree_adapter.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/include/dataset/text.h"
namespace mindspore::dataset {
// Forward declare

View File

@ -28,9 +28,9 @@
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/engine/dataset_iterator.h"
#include "minddata/dataset/engine/datasetops/pipeline_op.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/util/queue.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "pybind11/pybind11.h"
namespace mindspore {
@ -54,7 +54,7 @@ class BuildSentencePieceVocabOp : public PipelineOp {
BuildSentencePieceVocabOp *s_p_vocab_ptr_;
};
BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, std::vector<std::string> col_names,
BuildSentencePieceVocabOp(std::shared_ptr<dataset::SentencePieceVocab> vocab, std::vector<std::string> col_names,
int32_t vocab_size, float character_coverage, SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params, int32_t op_conn_size);

View File

@ -179,15 +179,15 @@ Status BuildVocabOp::CollectorThread() {
});
if (special_first_) {
for (const std::string &sp_tk : special_tokens_) vocab_->append_word(sp_tk);
for (const std::string &sp_tk : special_tokens_) vocab_->AppendWord(sp_tk);
}
for (int64_t i = 0; i < num_words; i++) {
vocab_->append_word(words[i]);
vocab_->AppendWord(words[i]);
}
if (!special_first_) {
for (const std::string &sp_tk : special_tokens_) vocab_->append_word(sp_tk);
for (const std::string &sp_tk : special_tokens_) vocab_->AppendWord(sp_tk);
}
RETURN_IF_NOT_OK(out_connector_->SendEOE());
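The hunk above is what determines special-token IDs: AppendWord assigns indices in insertion order, so prepended specials take the smallest IDs and appended ones the largest. A hedged sketch of the same effect through the public BuildFromVector API; the IDs in the comments follow the assignment logic shown in this PR:

#include <memory>
#include <string>
#include <vector>

#include "minddata/dataset/include/dataset/text.h"

void SpecialTokenOrdering() {
  using mindspore::dataset::Vocab;
  std::vector<std::string> words = {"home", "behind"};

  std::shared_ptr<Vocab> prepended;
  (void)Vocab::BuildFromVector(words, {"<pad>"}, true, &prepended);
  // prepended: <pad>=0, home=1, behind=2

  std::shared_ptr<Vocab> appended;
  (void)Vocab::BuildFromVector(words, {"<pad>"}, false, &appended);
  // appended: home=0, behind=1, <pad>=2
}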

View File

@ -25,7 +25,7 @@
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/engine/dataset_iterator.h"
#include "minddata/dataset/engine/datasetops/parallel_op.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/util/queue.h"
#include "minddata/dataset/util/status.h"
@ -33,9 +33,9 @@ namespace mindspore {
namespace dataset {
class BuildVocabOp : public ParallelOp<TensorRow, TensorRow> {
public:
BuildVocabOp(std::shared_ptr<Vocab> vocab, std::vector<std::string> col_names, std::pair<int64_t, int64_t> freq_range,
int64_t top_k, const std::vector<std::string> &tokens, bool prepend, int32_t num_workers,
int32_t op_connector_size);
BuildVocabOp(std::shared_ptr<dataset::Vocab> vocab, std::vector<std::string> col_names,
std::pair<int64_t, int64_t> freq_range, int64_t top_k, const std::vector<std::string> &tokens,
bool prepend, int32_t num_workers, int32_t op_connector_size);
~BuildVocabOp() = default;

View File

@ -20,6 +20,7 @@
#include <memory>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
@ -30,10 +31,204 @@
namespace mindspore {
namespace dataset {
class SentencePieceVocab;
class TensorOperation;
class Vectors;
class Vocab;
using WordIdType = int32_t;
using WordType = std::string;
/// \brief Vocab object that is used to store pairs of words and ids.
/// \note It contains a map from each word (str) to an id (int), and the reverse.
class Vocab {
public:
/// \brief Build a vocab from an unordered_map. IDs must be unique and contiguous.
/// \param[in] words An unordered_map containing word/id pairs.
/// \param[out] vocab A vocab object.
/// \return Status code.
/// \par Example
/// \code
/// // Build a map
/// std::unordered_map<std::string, int32_t> dict;
/// dict["banana"] = 0;
/// dict["apple"] = 1;
/// dict["cat"] = 2;
/// dict["dog"] = 3;
/// // Build vocab from map
/// std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
/// Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
/// \endcode
static Status BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
std::shared_ptr<Vocab> *vocab);
/// \brief Build a vocab from a C++ vector; IDs are assigned in order, unique and contiguous.
/// \param[in] words A vector of strings containing words.
/// \param[in] special_tokens A vector of strings containing special tokens.
/// \param[in] prepend_special Whether special_tokens are prepended (true) or appended (false) to the vocab.
/// \param[out] vocab A vocab object.
/// \return Status code.
/// \par Example
/// \code
/// // Build vocab from a vector of words, special tokens are prepended to vocab
/// std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"};
/// std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
/// Status s = Vocab::BuildFromVector(list, {"<unk>"}, true, &vocab);
/// \endcode
static Status BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
bool prepend_special, std::shared_ptr<Vocab> *vocab);
/// \brief Build a vocab from a vocab file; IDs are assigned automatically.
/// \param[in] path Path to the vocab file; each line in the file is treated as a word (spaces included).
/// \param[in] delimiter Delimiter used to break each line; characters after the delimiter are discarded.
/// \param[in] vocab_size Number of lines to be read from the file.
/// \param[in] special_tokens A vector of strings containing special tokens.
/// \param[in] prepend_special Whether special_tokens are prepended (true) or appended (false) to the vocab.
/// \param[out] vocab A vocab object.
/// \return Status code.
/// \par Example
/// \code
/// // Build vocab from local file
/// std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
/// std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
/// Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"<pad>", "<unk>"}, true, &vocab);
/// \endcode
static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
const std::vector<WordType> &special_tokens, bool prepend_special,
std::shared_ptr<Vocab> *vocab);
/// \brief Look up the ID of a word; returns -1 if the word does not exist in the vocab.
/// \param word Word to be looked up.
/// \return ID of the word in the vocab.
/// \par Example
/// \code
/// // lookup, convert token to id
/// auto single_index = vocab->TokensToIds("home");
/// single_index = vocab->TokensToIds("hello");
/// \endcode
WordIdType TokensToIds(const WordType &word) const;
/// \brief Look up the IDs of multiple words; -1 is returned for any word not in the vocab.
/// \param words Words to be looked up.
/// \return IDs of the words in the vocab.
/// \par Example
/// \code
/// // lookup multiple tokens
/// auto multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "behind"});
/// std::vector<int32_t> expected_multi_indexs = {0, 4};
/// multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "apple"});
/// expected_multi_indexs = {0, -1};
/// \endcode
std::vector<WordIdType> TokensToIds(const std::vector<WordType> &words) const;
/// \brief Look up the word of an ID; returns an empty string if the ID does not exist in the vocab.
/// \param id ID to be looked up.
/// \return The word corresponding to the ID.
/// \par Example
/// \code
/// // reverse lookup, convert id to token
/// auto single_word = vocab->IdsToTokens(2);
/// single_word = vocab->IdsToTokens(-1);
/// \endcode
WordType IdsToTokens(const WordIdType &id);
/// \brief Look up the words of multiple IDs; an empty string is returned for any ID not in the vocab.
/// \param ids IDs to be looked up.
/// \return The words corresponding to the IDs.
/// \par Example
/// \code
/// // reverse lookup multiple ids
/// auto multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 4});
/// std::vector<std::string> expected_multi_words = {"<pad>", "behind"};
/// multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 99});
/// expected_multi_words = {"<pad>", ""};
/// \endcode
std::vector<WordType> IdsToTokens(const std::vector<WordIdType> &ids);
/// \brief Constructor; should not be called directly, and cannot be private due to std::make_unique().
/// \param map Sanitized word2id map.
explicit Vocab(std::unordered_map<WordType, WordIdType> map);
/// \brief Add one word to the vocab and increment its index automatically.
/// \param word Word to be added; skipped if the word already exists.
void AppendWord(const std::string &word);
/// \brief Return a read-only vocab as an unordered_map.
/// \return An unordered_map from word to id.
const std::unordered_map<WordType, WordIdType> &GetVocab() { return word2id_; }
/// \brief Constructor.
Vocab() = default;
/// \brief Destructor.
~Vocab() = default;
static const WordIdType kNoTokenExists;
static const WordType kNoIdExists;
private:
std::unordered_map<WordType, WordIdType> word2id_;
std::unordered_map<WordIdType, WordType> id2word_;
};
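Pulling the per-method examples above together, a minimal end-to-end sketch of the renamed API, using in-memory data only and with error handling reduced to an early return:

#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "minddata/dataset/include/dataset/text.h"

void VocabRoundTrip() {
  using mindspore::dataset::Status;
  using mindspore::dataset::Vocab;

  std::unordered_map<std::string, int32_t> dict = {{"<pad>", 0}, {"home", 1}, {"behind", 2}};
  std::shared_ptr<Vocab> vocab;
  Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
  if (s != Status::OK()) {
    return;
  }

  // Token -> id: unknown tokens map to Vocab::kNoTokenExists (-1).
  std::vector<int32_t> ids = vocab->TokensToIds({"home", "apple"});  // {1, -1}
  (void)ids;

  // Id -> token: unknown ids map to Vocab::kNoIdExists ("").
  std::vector<std::string> words = vocab->IdsToTokens({0, 99});  // {"<pad>", ""}
  (void)words;
}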
/// \brief SentencePiece object that is used to do word segmentation.
class SentencePieceVocab {
public:
/// \brief Build a SentencePiece object from a file.
/// \param[in] path_list Paths to the files from which the SentencePiece vocab is built.
/// \param[in] vocab_size Vocabulary size.
/// \param[in] character_coverage Fraction of characters covered by the model. Good defaults are 0.9995 for
/// languages with a rich character set, such as Japanese or Chinese, and 1.0 for languages with a small
/// character set.
/// \param[in] model_type It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
/// SentencePieceModel.CHAR, SentencePieceModel.WORD]; the default is SentencePieceModel.UNIGRAM. The input
/// sentence must be pre-tokenized when using the SentencePieceModel.WORD type.
/// - SentencePieceModel.UNIGRAM, unigram language model: each word in the sentence is assumed to be
/// independent of the words the model generated before it.
/// - SentencePieceModel.BPE, byte pair encoding: the most frequent pair of bytes in a sentence is
/// replaced with a single, unused byte.
/// - SentencePieceModel.CHAR, a character-based SentencePiece model.
/// - SentencePieceModel.WORD, a word-based SentencePiece model.
/// \param[in] params An optional map of extra training parameters forwarded to the SentencePiece library;
/// typically left empty.
/// \return Status code; on success, *vocab holds the SentencePieceVocab built from the files.
/// \par Example
/// \code
/// std::string dataset_path;
/// dataset_path = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
/// std::vector<std::string> path_list;
/// path_list.emplace_back(dataset_path);
/// std::unordered_map<std::string, std::string> param_map;
/// std::shared_ptr<SentencePieceVocab> spm = std::make_shared<SentencePieceVocab>();
/// Status rc = SentencePieceVocab::BuildFromFile(path_list, 5000, 0.9995,
/// SentencePieceModel::kUnigram, param_map, &spm);
/// \endcode
static Status BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size,
const float character_coverage, const SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params,
std::shared_ptr<SentencePieceVocab> *vocab);
/// \brief Save the SentencePiece model into given file path.
/// \param[in] vocab A SentencePiece object to be saved.
/// \param[in] path Path to store the model.
/// \param[in] filename Name of the saved model file.
/// \return Status code.
/// \par Example
/// \code
/// // Save vocab model to local
/// vocab->SaveModel(&vocab, datasets_root_path_ + "/test_sentencepiece", "m.model");
/// \endcode
static Status SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, std::string path, std::string filename);
/// \brief Constructor.
SentencePieceVocab();
/// \brief Destructor.
~SentencePieceVocab() = default;
const std::string &model_proto();
void set_model_proto(const std::string model_proto);
private:
std::string model_proto_;
};
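And the matching sketch for the SentencePiece flow documented above, training from a corpus file and saving the model; "corpus.txt" and the output paths are placeholders:

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "minddata/dataset/include/dataset/text.h"

void TrainAndSaveSentencePiece() {
  using mindspore::dataset::SentencePieceModel;
  using mindspore::dataset::SentencePieceVocab;
  using mindspore::dataset::Status;

  std::vector<std::string> corpus = {"corpus.txt"};
  std::unordered_map<std::string, std::string> params;  // extra trainer options, usually empty

  std::shared_ptr<SentencePieceVocab> vocab;
  Status rc = SentencePieceVocab::BuildFromFile(corpus, 5000, 0.9995, SentencePieceModel::kUnigram,
                                                params, &vocab);
  if (rc != Status::OK()) {
    return;
  }
  (void)SentencePieceVocab::SaveModel(&vocab, "./", "m.model");
}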
// Transform operations for text
namespace text {
@ -414,7 +609,7 @@ class MS_API NormalizeUTF8 final : public TensorTransform {
/// \brief Constructor.
/// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone,NormalizeForm::kNfc,
/// NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd](default=NormalizeForm::kNfkc).
/// See http://unicode.org/reports/tr15/ for details.
/// See <http://unicode.org/reports/tr15/> for details.
/// - NormalizeForm.kNone, remain the input string tensor unchanged.
/// - NormalizeForm.kNfc, normalizes with Normalization Form C.
/// - NormalizeForm.kNfkc, normalizes with Normalization Form KC.

View File

@ -217,7 +217,7 @@ Status LookupOperation::ValidateParams() {
LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
if (unknown_token_ != std::nullopt) {
default_id_ = vocab_->Lookup(*unknown_token_);
default_id_ = vocab_->TokensToIds(*unknown_token_);
if (default_id_ == Vocab::kNoTokenExists) {
std::string err_msg = "Lookup: \"" + *unknown_token_ + "\" doesn't exist in vocab.";
LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);

View File

@ -30,7 +30,7 @@ Status LookupOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<T
std::vector<WordIdType> word_ids;
word_ids.reserve(input->Size());
for (auto itr = input->begin<std::string_view>(); itr != input->end<std::string_view>(); ++itr) {
WordIdType word_id = vocab_->Lookup(std::string(*itr));
WordIdType word_id = vocab_->TokensToIds(std::string(*itr));
word_ids.emplace_back(word_id == Vocab::kNoTokenExists ? default_id_ : word_id);
CHECK_FAIL_RETURN_UNEXPECTED(word_ids.back() != Vocab::kNoTokenExists,
"Lookup: invalid data, token: \"" + std::string(*itr) +

View File

@ -23,9 +23,9 @@
#include <vector>
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/text/vocab.h"
namespace mindspore {
namespace dataset {

View File

@ -24,10 +24,10 @@
#include <memory>
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
namespace mindspore {
namespace dataset {

View File

@ -46,7 +46,7 @@ Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const Ru
if (start > 0) {
word = suffix_indicator_ + word;
}
if (vocab_->Lookup(word) != Vocab::kNoTokenExists) {
if (vocab_->TokensToIds(word) != Vocab::kNoTokenExists) {
*out_found = true;
break;
}
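For context, LookupWord above is the inner check of a greedy longest-match loop: the tokenizer shrinks the candidate window until a piece is found in the vocab, prefixing the suffix indicator for non-initial pieces. A self-contained sketch of that algorithm, byte-oriented and using a plain set rather than the real operator's vocab:

#include <string>
#include <unordered_set>
#include <vector>

// Minimal greedy WordPiece split (assumption: "##" as the suffix indicator);
// returns an empty vector when the token cannot be segmented.
std::vector<std::string> GreedyWordPiece(const std::string &token,
                                         const std::unordered_set<std::string> &vocab) {
  std::vector<std::string> pieces;
  size_t start = 0;
  while (start < token.size()) {
    size_t end = token.size();
    bool found = false;
    for (; end > start; --end) {  // shrink the window until a piece is in vocab
      std::string piece = token.substr(start, end - start);
      if (start > 0) {
        piece = "##" + piece;
      }
      if (vocab.count(piece) > 0) {
        pieces.push_back(piece);
        found = true;
        break;
      }
    }
    if (!found) {
      return {};  // unknown token
    }
    start = end;
  }
  return pieces;
}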

View File

@ -23,9 +23,9 @@
#include "cppjieba/Unicode.hpp"
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/text/kernels/tokenizer_op.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/util/status.h"
using cppjieba::DecodeRunesInString;

View File

@ -14,16 +14,18 @@
* limitations under the License.
*/
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include <sentencepiece_trainer.h>
#include <sentencepiece_processor.h>
#include <sentencepiece_trainer.h>
#include <fstream>
#include "include/common/utils/utils.h"
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/util/path.h"
#include "minddata/dataset/util/status.h"
#include "utils/file_utils.h"
#include "utils/ms_utils.h"
#include "include/common/utils/utils.h"
#include "minddata/dataset/util/path.h"
namespace mindspore {
namespace dataset {

View File

@ -1,50 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_
#include <string>
#include <memory>
#include <vector>
#include <unordered_map>
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/include/dataset/constants.h"
namespace mindspore {
namespace dataset {
class SentencePieceVocab {
public:
static Status BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size,
const float character_coverage, const SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params,
std::shared_ptr<SentencePieceVocab> *vocab);
static Status SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, std::string path, std::string filename);
SentencePieceVocab();
~SentencePieceVocab() = default;
const std::string &model_proto();
void set_model_proto(const std::string model_proto);
private:
std::string model_proto_;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_

View File

@ -14,14 +14,14 @@
* limitations under the License.
*/
#include "minddata/dataset/text/vocab.h"
#include <fstream>
#include <unordered_set>
#include <unordered_map>
#include <utility>
#include <algorithm>
#include <fstream>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/util/status.h"
#include "utils/file_utils.h"
#ifndef ENABLE_ANDROID
#include "utils/log_adapter.h"
@ -33,18 +33,18 @@ namespace mindspore {
namespace dataset {
Vocab::Vocab(std::unordered_map<WordType, WordIdType> word2id) { word2id_ = std::move(word2id); }
WordIdType Vocab::Lookup(const WordType &word) const {
WordIdType Vocab::TokensToIds(const WordType &word) const {
auto itr = word2id_.find(word);
return itr == word2id_.end() ? kNoTokenExists : itr->second;
}
std::vector<WordIdType> Vocab::Lookup(const std::vector<WordType> &words) const {
std::vector<WordIdType> Vocab::TokensToIds(const std::vector<WordType> &words) const {
std::vector<WordIdType> ids;
std::transform(words.begin(), words.end(), std::back_inserter(ids), [this](auto w) { return Lookup(w); });
std::transform(words.begin(), words.end(), std::back_inserter(ids), [this](auto w) { return TokensToIds(w); });
return ids;
}
WordType Vocab::ReverseLookup(const WordIdType &id) {
WordType Vocab::IdsToTokens(const WordIdType &id) {
// lazy initialization, since reverse lookup is uncommon and the map would otherwise waste memory
if (id2word_.empty()) {
for (const auto [word_, id_] : word2id_) {
@ -55,7 +55,7 @@ WordType Vocab::ReverseLookup(const WordIdType &id) {
return itr == id2word_.end() ? kNoIdExists : itr->second;
}
std::vector<WordType> Vocab::ReverseLookup(const std::vector<WordIdType> &ids) {
std::vector<WordType> Vocab::IdsToTokens(const std::vector<WordIdType> &ids) {
// lazy initialization, since reverse lookup is uncommon and the map would otherwise waste memory
if (id2word_.empty()) {
for (const auto [word_, id_] : word2id_) {
@ -63,50 +63,11 @@ std::vector<WordType> Vocab::ReverseLookup(const std::vector<WordIdType> &ids) {
}
}
std::vector<WordType> words;
std::transform(ids.begin(), ids.end(), std::back_inserter(words), [this](auto i) { return ReverseLookup(i); });
std::transform(ids.begin(), ids.end(), std::back_inserter(words), [this](auto i) { return IdsToTokens(i); });
return words;
}
#ifdef ENABLE_PYTHON
Status Vocab::BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special,
std::shared_ptr<Vocab> *vocab) {
if (vocab == nullptr) {
RETURN_STATUS_UNEXPECTED("Vocab::BuildFromPyList: input vocab can not be null");
}
// check of duplication on both words and special_tokens will be performed in python
// special_tokens and words both need to be unique, and shouldn't overlap
std::unordered_map<WordType, WordIdType> word2id;
// if special is added in front, normal words id will start from number of special tokens
WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
for (auto word : words) {
word2id[py::str(word)] = word_id++;
}
word_id = prepend_special ? 0 : word2id.size();
for (auto special_token : special_tokens) {
word2id[py::str(special_token)] = word_id++;
}
*vocab = std::make_shared<Vocab>(std::move(word2id));
return Status::OK();
}
Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr<Vocab> *vocab) {
if (vocab == nullptr) {
RETURN_STATUS_UNEXPECTED("Vocab::BuildFromPyDict: input vocab can not be null");
}
std::unordered_map<WordType, WordIdType> word2id;
for (auto p : words) {
word2id[py::str(p.first)] = py::reinterpret_borrow<py::int_>(p.second);
}
*vocab = std::make_shared<Vocab>(std::move(word2id));
return Status::OK();
}
#endif
void Vocab::append_word(const std::string &word) {
void Vocab::AppendWord(const std::string &word) {
if (word2id_.find(word) == word2id_.end()) {
word2id_[word] = word2id_.size();
}
@ -161,11 +122,11 @@ Status Vocab::BuildFromVector(const std::vector<WordType> &words, const std::vec
return Status::OK();
}
Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size,
const std::vector<WordType> &special_tokens, bool prepend_special,
std::shared_ptr<Vocab> *vocab) {
Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
const std::vector<WordType> &special_tokens, bool prepend_special,
std::shared_ptr<Vocab> *vocab) {
if (vocab == nullptr) {
RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFileCpp: input vocab can not be null");
RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFile: input vocab can not be null");
}
// Validate parameters
auto realpath = FileUtils::GetRealPath(path.c_str());
@ -227,56 +188,6 @@ Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delim
return Status::OK();
}
Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
const py::list &special_tokens, bool prepend_special, std::shared_ptr<Vocab> *vocab) {
if (vocab == nullptr) {
RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFile: input vocab can not be null");
}
// python validator checks special_tokens doesn't contain any duplicate words
std::unordered_set<std::string> specials;
// used to check that words in file don't contain any special token that already exists
for (auto word : special_tokens) {
specials.insert(py::str(word));
}
WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
std::unordered_map<WordType, WordIdType> word2id;
auto realpath = FileUtils::GetRealPath(path.c_str());
if (!realpath.has_value()) {
RETURN_STATUS_UNEXPECTED("Get real path failed, path=" + path);
}
std::fstream handle(realpath.value(), std::ios::in);
CHECK_FAIL_RETURN_UNEXPECTED(handle.good() && handle.is_open(), "from_file: fail to open:" + path);
std::string word;
while (std::getline(handle, word)) {
if (!delimiter.empty()) {
// if delimiter is not found, find_first_of would return std::string::npos which is -1
word = word.substr(0, word.find_first_of(delimiter));
}
if (word2id.find(word) != word2id.end()) {
handle.close();
RETURN_STATUS_UNEXPECTED("from_file: duplicate word:" + word + ".");
}
if (specials.find(word) != specials.end()) {
handle.close();
RETURN_STATUS_UNEXPECTED("from_file: special_tokens and word_list contain duplicate word:" + word);
}
word2id[word] = word_id++;
// break if enough row is read, if vocab_size is smaller than 0
if (word2id.size() == vocab_size) break;
}
handle.close();
word_id = prepend_special ? 0 : word2id.size();
for (auto special_token : special_tokens) {
word2id[py::str(special_token)] = word_id++;
}
*vocab = std::make_shared<Vocab>(std::move(word2id));
return Status::OK();
}
const WordIdType Vocab::kNoTokenExists = -1;
const WordType Vocab::kNoIdExists = std::string();
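The id2word_ handling above follows a lazy-inverse pattern: the reverse index is built on the first reverse lookup instead of in the constructor, since reverse lookup is the less common path. A generic sketch of the pattern:

#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>

class LazyReverseIndex {
 public:
  explicit LazyReverseIndex(std::unordered_map<std::string, int32_t> forward)
      : forward_(std::move(forward)) {}

  std::string Reverse(int32_t id) {
    if (reverse_.empty()) {  // build the inverse only when first needed
      for (const auto &[word, word_id] : forward_) {
        reverse_[word_id] = word;
      }
    }
    auto itr = reverse_.find(id);
    return itr == reverse_.end() ? std::string() : itr->second;
  }

 private:
  std::unordered_map<std::string, int32_t> forward_;
  std::unordered_map<int32_t, std::string> reverse_;
};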

View File

@ -1,143 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_
#include <string>
#include <memory>
#include <unordered_map>
#include <vector>
#include "minddata/dataset/util/status.h"
#ifdef ENABLE_PYTHON
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#endif
namespace mindspore {
namespace dataset {
#ifdef ENABLE_PYTHON
namespace py = pybind11;
#endif
using WordIdType = int32_t;
using WordType = std::string;
class Vocab {
public:
#ifdef ENABLE_PYTHON
// Build a vocab from a python dictionary key is each word ,id needs to start from 2, no duplicate and continuous
// @param const py::dict &words - a dictionary containing word, word id pair.
// @param std::shared_ptr<Vocab> *vocab - return value, vocab object
// @return error code
static Status BuildFromPyDict(const py::dict &words, std::shared_ptr<Vocab> *vocab);
// Build a vocab from a python list, id will be assigned automatically, start from 2
// @param const py::list &words - a list of string, used to build vocab, id starts from 2
// @param std::shared_ptr<Vocab> *vocab - return value, vocab object
// @return error code
static Status BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special,
std::shared_ptr<Vocab> *vocab);
// Build a vocab from reading a vocab file, id are automatically assigned, start from 2
// @param std::string &path - path to vocab file , each line is assumed to contain 1 word
// @param std::string &delimiter - delimiter to break each line with
// @param int32_t vocab_size - number of words to read from file
// @param std::shared_ptr<Vocab> *vocab - return value, vocab object
// @return error code
static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
const py::list &special_tokens, bool prepend_special, std::shared_ptr<Vocab> *vocab);
#endif
/// \brief Build a vocab from a c++ map. id needs to start from 2, no duplicate and continuous
/// \param[in] words An unordered_map containing word, word id pair.
/// \param[out] vocab A vocab object
/// \return Error code
static Status BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
std::shared_ptr<Vocab> *vocab);
/// \brief Build a vocab from a c++ vector. id needs to start from 2, no duplicate and continuous
/// \param[in] words A vector of string, used to build vocab, id starts from 2
/// \param[in] special_tokens A vector of string contain special tokens
/// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab
/// \param[out] vocab A vocab object
/// \return Error code
static Status BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
bool prepend_special, std::shared_ptr<Vocab> *vocab);
/// \brief Build a vocab from reading a vocab file, id are automatically assigned, start from 2
/// \param[in] path Path to vocab file , each line is assumed to contain 1 word
/// \param[in] delimiter Delimiter to break each line with
/// \param[in] vocab_size Number of words to read from file
/// \param[in] special_tokens A vector of string contain special tokens
/// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab
/// \param[out] vocab A vocab object
/// \return Error code
static Status BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size,
const std::vector<WordType> &special_tokens, bool prepend_special,
std::shared_ptr<Vocab> *vocab);
// Lookup the id of a word, if word doesn't exist in vocab, return default_id
// @param const WordType word - word to look up
// @param WordIdType default_id - word id to return to user when its not in the vocab
// @return WordIdType, word_id
WordIdType Lookup(const WordType &word) const;
// Lookup the ids of a vector of words, if word doesn't exist in vocab, return default_id
// @param const WordType word - word to look up
// @param WordIdType default_id - word id to return to user when its not in the vocab
// @return WordIdType, word_id
std::vector<WordIdType> Lookup(const std::vector<WordType> &words) const;
// Find the word of a id, if word doesn't exist in vocab, return empty string
// @param const WordIdType id - id to reverse look up
// @return WordType, word
WordType ReverseLookup(const WordIdType &id);
// Find the words of a vector of ids, if word doesn't exist in vocab, return empty string
// @param const WordIdType id - id to reverse look up
// @return WordType, word
std::vector<WordType> ReverseLookup(const std::vector<WordIdType> &ids);
// constructor, shouldn't be called directly, can't be private due to std::make_unique()
// @param std::unordered_map<WordType, WordIdType> map - sanitized word2id map
explicit Vocab(std::unordered_map<WordType, WordIdType> map);
Vocab() = default;
// add one word to vocab, increment it's index automatically
// @param std::string & word - word to be added will skip if word already exists
void append_word(const std::string &word);
// return a read-only vocab
const std::unordered_map<WordType, WordIdType> vocab() { return word2id_; }
// destructor
~Vocab() = default;
static const WordIdType kNoTokenExists;
static const WordType kNoIdExists;
private:
std::unordered_map<WordType, WordIdType> word2id_;
std::unordered_map<WordIdType, WordType> id2word_;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_

View File

@ -47,7 +47,7 @@ import numpy as np
import mindspore._c_dataengine as cde
from mindspore.common import dtype as mstype
from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType
from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType, SentencePieceVocab
from .validators import check_lookup, check_jieba_add_dict, check_to_vectors, \
check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \
check_wordpiece_tokenizer, check_regex_replace, check_regex_tokenizer, check_basic_tokenizer, check_ngram, \
@ -386,6 +386,7 @@ class SentencePieceTokenizer(TextTensorOperation):
self.out_type = out_type
def parse(self):
self.mode = self.mode.c_sentence_piece_vocab if isinstance(self.mode, SentencePieceVocab) else self.mode
return cde.SentencePieceTokenizerOperation(self.mode, DE_C_INTER_SENTENCEPIECE_OUTTYPE[self.out_type])

View File

@ -141,7 +141,7 @@ class Vocab:
>>> dataset = dataset.map(operations=text.Lookup(vocab, "<unk>"), input_columns=["text"])
"""
vocab = Vocab()
vocab = cls()
vocab.c_vocab = dataset.build_vocab(columns, freq_range, top_k, special_tokens, special_first)
return vocab
@ -211,7 +211,7 @@ class Vocab:
vocab_size = -1
if special_tokens is None:
special_tokens = []
vocab = Vocab()
vocab = cls()
vocab.c_vocab = cde.Vocab.from_file(file_path, delimiter, vocab_size, special_tokens, special_first)
return vocab
@ -232,16 +232,19 @@ class Vocab:
>>> vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "<unk>": 6})
"""
vocab = Vocab()
vocab = cls()
vocab.c_vocab = cde.Vocab.from_dict(word_dict)
return vocab
class SentencePieceVocab(cde.SentencePieceVocab):
class SentencePieceVocab:
"""
SentencePiece object that is used to do word segmentation.
"""
def __init__(self):
self.c_sentence_piece_vocab = None
@classmethod
@check_from_dataset_sentencepiece
def from_dataset(cls, dataset, col_names, vocab_size, character_coverage, model_type, params):
@ -278,8 +281,11 @@ class SentencePieceVocab(cde.SentencePieceVocab):
... SentencePieceModel.UNIGRAM, {})
"""
return dataset.build_sentencepiece_vocab(col_names, vocab_size, character_coverage,
model_type, params)
sentence_piece_vocab = cls()
sentence_piece_vocab.c_sentence_piece_vocab = dataset.build_sentencepiece_vocab(col_names, vocab_size,
character_coverage,
model_type, params)
return sentence_piece_vocab
@classmethod
@check_from_file_sentencepiece
@ -321,8 +327,11 @@ class SentencePieceVocab(cde.SentencePieceVocab):
... SentencePieceModel.UNIGRAM, {})
"""
return super().from_file(file_path, vocab_size, character_coverage,
DE_C_INTER_SENTENCEPIECE_MODE[model_type], params)
sentence_piece_vocab = cls()
sentence_piece_vocab.c_sentence_piece_vocab = \
cde.SentencePieceVocab.from_file(file_path, vocab_size, character_coverage,
DE_C_INTER_SENTENCEPIECE_MODE[model_type], params)
return sentence_piece_vocab
@classmethod
@check_save_model
@ -342,7 +351,7 @@ class SentencePieceVocab(cde.SentencePieceVocab):
>>> text.SentencePieceVocab.save_model(vocab, "./", "m.model")
"""
super().save_model(vocab, path, filename)
cde.SentencePieceVocab.save_model(vocab.c_sentence_piece_vocab, path, filename)
def to_str(array, encoding='utf8'):

View File

@ -551,7 +551,7 @@ def check_save_model(method):
[vocab, path, filename], _ = parse_user_args(method, *args, **kwargs)
if vocab is not None:
type_check(vocab, (cde.SentencePieceVocab,), "vocab")
type_check(vocab, (text.SentencePieceVocab,), "vocab")
if path is not None:
type_check(path, (str,), "path")
@ -573,7 +573,7 @@ def check_sentence_piece_tokenizer(method):
def new_method(self, *args, **kwargs):
[mode, out_type], _ = parse_user_args(method, *args, **kwargs)
type_check(mode, (str, cde.SentencePieceVocab), "mode is not an instance of str or cde.SentencePieceVocab.")
type_check(mode, (str, text.SentencePieceVocab), "mode is not an instance of str or text.SentencePieceVocab.")
type_check(out_type, (SPieceTokenizerOutType,), "out_type is not an instance of SPieceTokenizerOutType")
return method(self, *args, **kwargs)

View File

@ -20,7 +20,7 @@
#include "common/common.h"
#include "include/api/status.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/include/dataset/text.h"
using mindspore::dataset::Tensor;
using mindspore::dataset::Vocab;
@ -47,7 +47,7 @@ TEST_F(MindDataTestVocab, TestVocabFromUnorderedMap) {
std::vector<std::string> words = {"apple", "dog", "egg"};
std::vector<int64_t> expected = {1, 3, -1};
for (uint32_t i = 0; i < words.size(); ++i) {
int32_t x = vocab->Lookup(words[i]);
int32_t x = vocab->TokensToIds(words[i]);
EXPECT_EQ(x, expected[i]);
}
}
@ -65,7 +65,7 @@ TEST_F(MindDataTestVocab, TestVocabFromEmptyMap) {
std::vector<std::string> words = {"apple", "dog", "egg"};
std::vector<int64_t> expected = {-1, -1, -1};
for (uint32_t i = 0; i < words.size(); ++i) {
int32_t x = vocab->Lookup(words[i]);
int32_t x = vocab->TokensToIds(words[i]);
EXPECT_EQ(x, expected[i]);
}
}
@ -96,7 +96,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorPrependSpTokens) {
std::vector<std::string> words = {"apple", "banana", "fox"};
std::vector<int64_t> expected = {1, 2, -1};
for (uint32_t i = 0; i < words.size(); ++i) {
int32_t x = vocab->Lookup(words[i]);
int32_t x = vocab->TokensToIds(words[i]);
EXPECT_EQ(x, expected[i]);
}
}
@ -113,7 +113,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorAppendSpTokens) {
std::vector<std::string> words = {"apple", "<unk>", "fox"};
std::vector<int64_t> expected = {0, 5, -1};
for (uint32_t i = 0; i < words.size(); ++i) {
int32_t x = vocab->Lookup(words[i]);
int32_t x = vocab->TokensToIds(words[i]);
EXPECT_EQ(x, expected[i]);
}
}
@ -131,7 +131,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorWithNoSpTokens) {
std::vector<std::string> words = {"apple", "banana", "fox", "<pad>"};
std::vector<int64_t> expected = {0, 1, -1, -1};
for (uint32_t i = 0; i < words.size(); ++i) {
int32_t x = vocab->Lookup(words[i]);
int32_t x = vocab->TokensToIds(words[i]);
EXPECT_EQ(x, expected[i]);
}
}
@ -149,7 +149,7 @@ TEST_F(MindDataTestVocab, TestVocabFromEmptyVector) {
std::vector<std::string> words = {"apple", "banana", "fox"};
std::vector<int64_t> expected = {-1, -1, -1};
for (uint32_t i = 0; i < words.size(); ++i) {
int32_t x = vocab->Lookup(words[i]);
int32_t x = vocab->TokensToIds(words[i]);
EXPECT_EQ(x, expected[i]);
}
}
@ -195,14 +195,14 @@ TEST_F(MindDataTestVocab, TestVocabFromFile) {
// Build vocab from local file
std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"<pad>", "<unk>"}, true, &vocab);
Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"<pad>", "<unk>"}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Look up specified words
std::vector<std::string> words = {"not", "all"};
std::vector<int64_t> expected = {2, 3};
for (uint32_t i = 0; i < words.size(); ++i) {
int32_t x = vocab->Lookup(words[i]);
int32_t x = vocab->TokensToIds(words[i]);
EXPECT_EQ(x, expected[i]);
}
}
@ -212,7 +212,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail1) {
// Build vocab from local file which is not exist
std::string vocab_dir = datasets_root_path_ + "/testVocab/not_exist.txt";
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {}, true, &vocab);
Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {}, true, &vocab);
EXPECT_NE(s, Status::OK());
}
@ -223,7 +223,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail2) {
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
// Expected failure: vocab_size should be either -1 or positive integer
Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab);
Status s = Vocab::BuildFromFile(vocab_dir, ",", -2, {}, true, &vocab);
EXPECT_NE(s, Status::OK());
}
@ -234,7 +234,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail3) {
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
// Expected failure: duplicate special token <unk>
Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"<unk>", "<unk>"}, true, &vocab);
Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"<unk>", "<unk>"}, true, &vocab);
EXPECT_NE(s, Status::OK());
}
@ -245,6 +245,6 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail4) {
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
// Expected failure: special_tokens and word_list contain duplicate word
Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"home"}, true, &vocab);
Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"home"}, true, &vocab);
EXPECT_NE(s, Status::OK());
}

View File

@ -23,7 +23,6 @@
#include "minddata/dataset/include/dataset/datasets.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/include/dataset/transforms.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
using namespace mindspore::dataset;
using mindspore::dataset::SentencePieceModel;

View File

@ -27,7 +27,6 @@
#include "minddata/dataset/text/fast_text.h"
#include "minddata/dataset/text/glove.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/text/vocab.h"
using namespace mindspore::dataset;
using mindspore::Status;
@ -797,7 +796,7 @@ TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) {
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
std::vector<std::string> expected = {"welcome to beijing","",""};
std::vector<std::string> expected = {"welcome to beijing", "", ""};
uint64_t i = 0;
@ -806,7 +805,7 @@ TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) {
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
mindspore::MSTensor ms_expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
ASSERT_OK(iter->GetNextRow(&row));
i++;
@ -1709,8 +1708,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail1) {
EXPECT_NE(ds, nullptr);
// Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number =
std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
EXPECT_NE(to_number, nullptr);
// Create a Map operation on ds
@ -1760,7 +1758,8 @@ TEST_F(MindDataTestPipeline, TestToNumberFail2) {
EXPECT_NE(ds, nullptr);
// Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
std::shared_ptr<TensorTransform> to_number =
std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
EXPECT_NE(to_number, nullptr);
// Create a Map operation on ds
@ -2143,8 +2142,7 @@ TEST_F(MindDataTestPipeline, TestNgramSuccess1) {
ASSERT_OK(iter->GetNextRow(&row));
std::vector<std::vector<std::string>> expected = {
{"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a",
"is-a-text",
{"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a", "is-a-text",
"a-text-file.", "text-file.-&", "file.-&-&"},
{"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every",
"happy-every-day.", "every-day.-&", "day.-&-&"},
@ -4371,8 +4369,7 @@ TEST_F(MindDataTestPipeline, TestGloVeDefaultParam) {
Status s = GloVe::BuildFromFile(&glove, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::ToVectors>(glove);
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
@ -4388,14 +4385,13 @@ TEST_F(MindDataTestPipeline, TestGloVeDefaultParam) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {
{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{0, 0, 0, 0, 0, 0},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{0, 0, 0, 0, 0, 0}};
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{0, 0, 0, 0, 0, 0},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{0, 0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
@ -4434,8 +4430,7 @@ TEST_F(MindDataTestPipeline, TestGloVeAllBuildfromfileParams) {
Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100);
EXPECT_EQ(s, Status::OK());
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::ToVectors>(glove);
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
@ -4451,14 +4446,13 @@ TEST_F(MindDataTestPipeline, TestGloVeAllBuildfromfileParams) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {
{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{0, 0, 0, 0, 0, 0},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{0, 0, 0, 0, 0, 0}};
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{0, 0, 0, 0, 0, 0},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{0, 0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
@ -4498,8 +4492,7 @@ TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) {
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::ToVectors>(glove, unknown_init);
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove, unknown_init);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
@ -4515,14 +4508,13 @@ TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {
{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
@ -4562,8 +4554,7 @@ TEST_F(MindDataTestPipeline, TestGloVeAllParams) {
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::ToVectors>(glove, unknown_init, true);
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove, unknown_init, true);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
@ -4579,14 +4570,13 @@ TEST_F(MindDataTestPipeline, TestGloVeAllParams) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {
{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
@ -4748,13 +4738,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramDefaultParam) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0,0,0,0,0},
{0,0,0,0,0},
{0.117336,0.362446,-0.983326,0.939264,-0.05648},
{0.657201,2.11761,-1.59276,0.432072,1.21395},
{0,0,0,0,0},
{-2.26956,0.288491,-0.740001,0.661703,0.147355},
{0,0,0,0,0}};
std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
{0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
{0, 0, 0, 0, 0},
{-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
{0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
@ -4810,13 +4800,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramAllBuildfromfileParams) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0,0,0,0,0},
{0,0,0,0,0},
{-0.155665,0.664073,-0.538499,1.22657,-0.2162},
{0.657201,2.11761,-1.59276,0.432072,1.21395},
{0,0,0,0,0},
{-2.26956,0.288491,-0.740001,0.661703,0.147355},
{0,0,0,0,0}};
std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
{0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
{0, 0, 0, 0, 0},
{-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
{0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
@ -4873,13 +4863,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramUnknownInit) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1},
{-1,-1,-1,-1,-1},
{-0.155665,0.664073,-0.538499,1.22657,-0.2162},
{0.657201,2.11761,-1.59276,0.432072,1.21395},
{-1,-1,-1,-1,-1},
{-2.26956,0.288491,-0.740001,0.661703,0.147355},
{-1,-1,-1,-1,-1}};
std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
{-1, -1, -1, -1, -1},
{-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
{0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
{-1, -1, -1, -1, -1},
{-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
{-1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
@ -4936,13 +4926,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramAllParams) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1},
{-1,-1,-1,-1,-1},
{0.117336,0.362446,-0.983326,0.939264,-0.05648},
{0.657201,2.11761,-1.59276,0.432072,1.21395},
{-1,-1,-1,-1,-1},
{-2.26956,0.288491,-0.740001,0.661703,0.147355},
{-1,-1,-1,-1,-1}};
std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
{-1, -1, -1, -1, -1},
{0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
{0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
{-1, -1, -1, -1, -1},
{-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
{-1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();

View File

@ -22,7 +22,6 @@
#include "minddata/dataset/include/dataset/datasets.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/include/dataset/transforms.h"
#include "minddata/dataset/text/vocab.h"
using namespace mindspore::dataset;
using mindspore::Status;
@ -42,7 +41,7 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
} while (false)
/// Feature: C++ text.Vocab class.
/// Description: test Lookup() ReverseLookup() methods of text::Vocab.
/// Description: test TokensToIds() IdsToTokens() methods of text::Vocab.
/// Expectation: success.
TEST_F(MindDataTestPipeline, TestVocabLookupAndReverseLookup) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupAndReverseLookup.";
@ -53,30 +52,30 @@ TEST_F(MindDataTestPipeline, TestVocabLookupAndReverseLookup) {
EXPECT_EQ(s, Status::OK());
// lookup, convert token to id
auto single_index = vocab->Lookup("home");
auto single_index = vocab->TokensToIds("home");
EXPECT_EQ(single_index, 2);
single_index = vocab->Lookup("hello");
single_index = vocab->TokensToIds("hello");
EXPECT_EQ(single_index, -1);
// lookup multiple tokens
auto multi_indexs = vocab->Lookup(std::vector<std::string>{"<pad>", "behind"});
auto multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "behind"});
std::vector<int32_t> expected_multi_indexs = {0, 4};
EXPECT_EQ(multi_indexs, expected_multi_indexs);
multi_indexs = vocab->Lookup(std::vector<std::string>{"<pad>", "apple"});
multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "apple"});
expected_multi_indexs = {0, -1};
EXPECT_EQ(multi_indexs, expected_multi_indexs);
// reverse lookup, convert id to token
auto single_word = vocab->ReverseLookup(2);
auto single_word = vocab->IdsToTokens(2);
EXPECT_EQ(single_word, "home");
single_word = vocab->ReverseLookup(-1);
single_word = vocab->IdsToTokens(-1);
EXPECT_EQ(single_word, "");
// reverse lookup multiple ids
auto multi_words = vocab->ReverseLookup(std::vector<int32_t>{0, 4});
auto multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 4});
std::vector<std::string> expected_multi_words = {"<pad>", "behind"};
EXPECT_EQ(multi_words, expected_multi_words);
multi_words = vocab->ReverseLookup(std::vector<int32_t>{0, 99});
multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 99});
expected_multi_words = {"<pad>", ""};
EXPECT_EQ(multi_words, expected_multi_words);
}
@ -330,7 +329,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
EXPECT_NE(vocab, nullptr);
// Check if vocab has words or not
int32_t home_index = vocab->Lookup("home");
int32_t home_index = vocab->TokensToIds("home");
EXPECT_EQ(home_index, 4);
// Create Lookup operation on ds
@ -386,7 +385,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
EXPECT_NE(vocab, nullptr);
// Check if vocab has words or not
int32_t home_index = vocab->Lookup("home");
int32_t home_index = vocab->TokensToIds("home");
EXPECT_EQ(home_index, 2);
// Create Lookup operation on ds
@ -509,7 +508,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) {
EXPECT_NE(vocab, nullptr);
// Check if vocab has words or not
int32_t home_index = vocab->Lookup("home");
int32_t home_index = vocab->TokensToIds("home");
EXPECT_EQ(home_index, 2);
// Create Lookup operation on ds

View File

@ -19,7 +19,7 @@
#include "common/common.h"
#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/engine/datasetops/source/text_file_op.h"
#include "gtest/gtest.h"
#include "utils/log_adapter.h"