From 457ee030824afc3dfd502d5b29da59cca3055159 Mon Sep 17 00:00:00 2001
From: liu-yongqi-63
Date: Tue, 8 Mar 2022 16:24:55 +0800
Subject: [PATCH] Vocab and SentencePieceVocab C++ interface alignment and
 Python interface refactoring

---
 .../ccsrc/minddata/dataset/api/datasets.cc    |   3 +-
 .../dataset/engine/ir/datasetops/bindings.cc  |   1 -
 .../python/bindings/dataset/text/bindings.cc  |  40 ++--
 .../dataset/text/kernels/ir/bindings.cc       |   9 +-
 .../dataset/engine/consumers/tree_consumer.h  |   2 +-
 .../build_sentence_piece_vocab_op.h           |   4 +-
 .../engine/datasetops/build_vocab_op.cc       |   6 +-
 .../engine/datasetops/build_vocab_op.h        |   8 +-
 .../minddata/dataset/include/dataset/text.h   | 201 +++++++++++++++++-
 .../dataset/text/ir/kernels/text_ir.cc        |   2 +-
 .../dataset/text/kernels/lookup_op.cc         |   2 +-
 .../minddata/dataset/text/kernels/lookup_op.h |   2 +-
 .../kernels/sentence_piece_tokenizer_op.h     |   2 +-
 .../text/kernels/wordpiece_tokenizer_op.cc    |   2 +-
 .../text/kernels/wordpiece_tokenizer_op.h     |   2 +-
 .../dataset/text/sentence_piece_vocab.cc      |  12 +-
 .../dataset/text/sentence_piece_vocab.h       |  50 -----
 .../ccsrc/minddata/dataset/text/vocab.cc      | 123 ++---------
 mindspore/ccsrc/minddata/dataset/text/vocab.h | 143 -------------
 .../mindspore/dataset/text/transforms.py      |   3 +-
 .../python/mindspore/dataset/text/utils.py    |  27 ++-
 .../mindspore/dataset/text/validators.py      |   4 +-
 tests/ut/cpp/dataset/build_vocab_test.cc      |  26 +--
 .../c_api_text_sentence_piece_vocab_test.cc   |   1 -
 tests/ut/cpp/dataset/c_api_text_test.cc       | 142 ++++++-------
 tests/ut/cpp/dataset/c_api_text_vocab_test.cc |  25 ++-
 .../dataset/sentence_piece_vocab_op_test.cc   |   2 +-
 27 files changed, 371 insertions(+), 473 deletions(-)
 delete mode 100644 mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h
 delete mode 100644 mindspore/ccsrc/minddata/dataset/text/vocab.h

diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
index 7a22c92bee1..fa1a9d4741f 100644
--- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
@@ -38,8 +38,7 @@
 #include "minddata/dataset/util/status.h"
 #ifndef ENABLE_ANDROID
 #include "minddata/dataset/engine/ir/cache/dataset_cache_impl.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
-#include "minddata/dataset/text/vocab.h"
+#include "minddata/dataset/include/dataset/text.h"
 #endif

 // Sampler headers (in alphabetical order)
diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/bindings.cc
index 9622f023dad..87313bf440a 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/bindings.cc
@@ -23,7 +23,6 @@
 #include "minddata/dataset/core/data_type.h"
 #include "minddata/dataset/engine/serdes.h"
 #include "minddata/dataset/include/dataset/constants.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
 #include "minddata/dataset/util/path.h"

 // IR non-leaf nodes
diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc
index 30844a3f26c..c54dc1b91a7 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc
@@ -19,12 +19,11 @@

 #include "minddata/dataset/api/python/pybind_register.h"
 #include "minddata/dataset/include/dataset/constants.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/text/char_n_gram.h"
 #include "minddata/dataset/text/fast_text.h"
 #include "minddata/dataset/text/glove.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
 #include "minddata/dataset/text/vectors.h"
-#include "minddata/dataset/text/vocab.h"

 namespace mindspore {
 namespace dataset {
@@ -32,28 +31,29 @@
 PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) {
                   (void)py::class_<Vocab, std::shared_ptr<Vocab>>(*m, "Vocab")
                     .def(py::init<>())
                     .def_static("from_list",
-                                [](const py::list &words, const py::list &special_tokens, bool special_first) {
+                                [](const std::vector<std::string> &words,
+                                   const std::vector<std::string> &special_tokens, bool special_first) {
                                   std::shared_ptr<Vocab> v;
-                                  THROW_IF_ERROR(Vocab::BuildFromPyList(words, special_tokens, special_first, &v));
+                                  THROW_IF_ERROR(Vocab::BuildFromVector(words, special_tokens, special_first, &v));
                                   return v;
                                 })
                     .def_static(
                       "from_file",
                       [](const std::string &path, const std::string &dlm, int32_t vocab_size,
-                         const py::list &special_tokens, bool special_first) {
+                         const std::vector<std::string> &special_tokens, bool special_first) {
                         std::shared_ptr<Vocab> v;
                         THROW_IF_ERROR(Vocab::BuildFromFile(path, dlm, vocab_size, special_tokens, special_first, &v));
                         return v;
                       })
                     .def_static("from_dict",
-                                [](const py::dict &words) {
+                                [](const std::unordered_map<std::string, int32_t> &words) {
                                   std::shared_ptr<Vocab> v;
-                                  THROW_IF_ERROR(Vocab::BuildFromPyDict(words, &v));
+                                  THROW_IF_ERROR(Vocab::BuildFromUnorderedMap(words, &v));
                                   return v;
                                 })
                     .def("tokens_to_ids",
                          [](Vocab &self, const std::vector<std::string> words) {
-                           auto ids = self.Lookup(words);
+                           auto ids = self.TokensToIds(words);
                            py::object ret;
                            if (ids.size() == 1) {
                              ret = py::int_(ids[0]);
@@ -65,7 +65,7 @@
                          })
                     .def("ids_to_tokens",
                          [](Vocab &self, const std::vector<int32_t> ids) {
-                           auto words = self.ReverseLookup(ids);
+                           auto words = self.IdsToTokens(ids);
                            py::object ret;
                            if (words.size() == 1) {
                              ret = py::str(words[0]);
@@ -75,31 +75,19 @@
                            }
                            return ret;
                          })
-                    .def("vocab", [](Vocab &self) { return self.vocab(); });
+                    .def("vocab", [](Vocab &self) { return self.GetVocab(); });
                 }));

 PYBIND_REGISTER(SentencePieceVocab, 0, ([](const py::module *m) {
                   (void)py::class_<SentencePieceVocab, std::shared_ptr<SentencePieceVocab>>(*m, "SentencePieceVocab")
                     .def(py::init<>())
                     .def_static("from_file",
-                                [](const py::list &paths, const int32_t vocab_size, const float character_coverage,
-                                   const SentencePieceModel model_type, const py::dict &params) {
+                                [](const std::vector<std::string> &paths, const int32_t vocab_size,
+                                   const float character_coverage, const SentencePieceModel model_type,
+                                   const std::unordered_map<std::string, std::string> &params) {
                                   std::shared_ptr<SentencePieceVocab> v;
-                                  std::vector<std::string> path_list;
-                                  for (auto path : paths) {
-                                    path_list.emplace_back(py::str(path));
-                                  }
-                                  std::unordered_map<std::string, std::string> param_map;
-                                  for (auto param : params) {
-                                    std::string key = py::reinterpret_borrow<py::str>(param.first);
-                                    if (key == "input" || key == "vocab_size" || key == "model_prefix" ||
-                                        key == "character_coverage" || key == "model_type") {
-                                      continue;
-                                    }
-                                    param_map[key] = py::reinterpret_borrow<py::str>(param.second);
-                                  }
                                   THROW_IF_ERROR(SentencePieceVocab::BuildFromFile(
-                                    path_list, vocab_size, character_coverage, model_type, param_map, &v));
+                                    paths, vocab_size, character_coverage, model_type, params, &v));
                                   return v;
                                 })
                     .def_static("save_model", [](const std::shared_ptr<SentencePieceVocab> *vocab, std::string path,
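As a usage sketch of the C++ interface these bindings now delegate to (BuildFromVector, TokensToIds and IdsToTokens, as declared in the new text.h below); the word list is illustrative and the wrapper function name is hypothetical:

    #include <memory>
    #include <string>
    #include <vector>

    #include "include/api/status.h"
    #include "minddata/dataset/include/dataset/text.h"

    using mindspore::Status;
    using mindspore::dataset::Vocab;

    void VocabRoundTripSketch() {
      // Ids are assigned in list order, so "home" -> 0, "behind" -> 1, ...
      std::vector<std::string> words = {"home", "behind", "the", "world"};
      std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
      Status s = Vocab::BuildFromVector(words, /*special_tokens=*/{}, /*prepend_special=*/true, &vocab);
      if (s.IsOk()) {
        // Unknown tokens map to Vocab::kNoTokenExists (-1); unknown ids map to an empty string.
        std::vector<int32_t> ids = vocab->TokensToIds(std::vector<std::string>{"home", "unknown"});
        std::vector<std::string> tokens = vocab->IdsToTokens(ids);
      }
    }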
diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc
index 7768cbbe599..07e61d3d123 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc
@@ -14,13 +14,12 @@
  * limitations under the License.
  */

+#include "minddata/dataset/api/python/pybind_register.h"
+#include "minddata/dataset/include/dataset/text.h"
+#include "minddata/dataset/text/ir/kernels/text_ir.h"
+#include "minddata/dataset/text/vectors.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl_bind.h"
-#include "minddata/dataset/api/python/pybind_register.h"
-#include "minddata/dataset/text/ir/kernels/text_ir.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
-#include "minddata/dataset/text/vectors.h"
-#include "minddata/dataset/text/vocab.h"

 namespace mindspore {
 namespace dataset {
diff --git a/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.h b/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.h
index 9806673d08b..dcd167291c0 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.h
@@ -24,7 +24,7 @@
 #include
 #include "minddata/dataset/engine/tree_adapter.h"
-#include "minddata/dataset/text/vocab.h"
+#include "minddata/dataset/include/dataset/text.h"

 namespace mindspore::dataset {
 // Forward declare
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h
index 24f3575666b..2fb6e1bca38 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h
@@ -28,9 +28,9 @@
 #include "minddata/dataset/core/tensor.h"
 #include "minddata/dataset/engine/dataset_iterator.h"
 #include "minddata/dataset/engine/datasetops/pipeline_op.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/util/status.h"
 #include "minddata/dataset/util/queue.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
 #include "pybind11/pybind11.h"

 namespace mindspore {
@@ -54,7 +54,7 @@
     BuildSentencePieceVocabOp *s_p_vocab_ptr_;
   };

-  BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, std::vector<std::string> col_names,
+  BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, std::vector<std::string> col_names,
                             int32_t vocab_size, float character_coverage, SentencePieceModel model_type,
                             const std::unordered_map<std::string, std::string> &params, int32_t op_conn_size);

diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc
index 126453b1877..ec20be4a097 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc
@@ -179,15 +179,15 @@ Status BuildVocabOp::CollectorThread() {
   });

   if (special_first_) {
-    for (const std::string &sp_tk : special_tokens_) vocab_->append_word(sp_tk);
+    for (const std::string &sp_tk : special_tokens_) vocab_->AppendWord(sp_tk);
   }

   for (int64_t i = 0; i < num_words; i++) {
-    vocab_->append_word(words[i]);
+    vocab_->AppendWord(words[i]);
   }

   if (!special_first_) {
-    for (const std::string &sp_tk : special_tokens_) vocab_->append_word(sp_tk);
+    for (const std::string &sp_tk : special_tokens_) vocab_->AppendWord(sp_tk);
   }

   RETURN_IF_NOT_OK(out_connector_->SendEOE());
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.h
index 53b13f47b82..cbc2017b474 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.h
@@ -25,7 +25,7 @@
 #include "minddata/dataset/core/tensor.h"
 #include "minddata/dataset/engine/dataset_iterator.h"
 #include "minddata/dataset/engine/datasetops/parallel_op.h"
-#include "minddata/dataset/text/vocab.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/util/queue.h"
 #include "minddata/dataset/util/status.h"

@@ -33,9 +33,9 @@ namespace mindspore {
 namespace dataset {
 class BuildVocabOp : public ParallelOp {
  public:
-  BuildVocabOp(std::shared_ptr<Vocab> vocab, std::vector<std::string> col_names, std::pair<int64_t, int64_t> freq_range,
-               int64_t top_k, const std::vector<std::string> &tokens, bool prepend, int32_t num_workers,
-               int32_t op_connector_size);
+  BuildVocabOp(std::shared_ptr<Vocab> vocab, std::vector<std::string> col_names,
+               std::pair<int64_t, int64_t> freq_range, int64_t top_k, const std::vector<std::string> &tokens,
+               bool prepend, int32_t num_workers, int32_t op_connector_size);

   ~BuildVocabOp() = default;

diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/text.h b/mindspore/ccsrc/minddata/dataset/include/dataset/text.h
index 644045517cf..168ac20c635 100644
--- a/mindspore/ccsrc/minddata/dataset/include/dataset/text.h
+++ b/mindspore/ccsrc/minddata/dataset/include/dataset/text.h
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include <unordered_map>
 #include
 #include

@@ -30,10 +31,204 @@
 namespace mindspore {
 namespace dataset {
-class SentencePieceVocab;
 class TensorOperation;
 class Vectors;
-class Vocab;
+
+using WordIdType = int32_t;
+using WordType = std::string;
+
+/// \brief Vocab object that stores pairs of words and ids.
+/// \note It contains a map from each word (str) to an id (int), and the reverse.
+class Vocab {
+ public:
+  /// \brief Build a vocab from an unordered_map. IDs must be unique and contiguous.
+  /// \param[in] words An unordered_map containing word-id pairs.
+  /// \param[out] vocab A vocab object.
+  /// \return Status code.
+  /// \par Example
+  /// \code
+  ///     // Build a map
+  ///     std::unordered_map<std::string, int32_t> dict;
+  ///     dict["banana"] = 0;
+  ///     dict["apple"] = 1;
+  ///     dict["cat"] = 2;
+  ///     dict["dog"] = 3;
+  ///     // Build vocab from map
+  ///     std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
+  ///     Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
+  /// \endcode
+  static Status BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
+                                      std::shared_ptr<Vocab> *vocab);
+
+  /// \brief Build a vocab from a C++ vector. IDs are assigned automatically and contiguously; duplicate words
+  ///    are not allowed.
+  /// \param[in] words A vector of strings containing words.
+  /// \param[in] special_tokens A vector of strings containing special tokens.
+  /// \param[in] prepend_special Whether the special_tokens will be prepended (true) or appended (false) to the vocab.
+  /// \param[out] vocab A vocab object.
+  /// \return Status code.
+  /// \par Example
+  /// \code
+  ///     // Build vocab from a vector of words; special tokens are prepended to the vocab
+  ///     std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"};
+  ///     std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
+  ///     Status s = Vocab::BuildFromVector(list, {""}, true, &vocab);
+  /// \endcode
+  static Status BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
+                                bool prepend_special, std::shared_ptr<Vocab> *vocab);
+
+  /// \brief Build a vocab from a vocab file, IDs will be automatically assigned.
+  /// \param[in] path Path to the vocab file; each line of the file is treated as a word (spaces included).
+  /// \param[in] delimiter Delimiter used to break each line; characters after the delimiter will be discarded.
+  /// \param[in] vocab_size Number of lines to be read from the file.
+  /// \param[in] special_tokens A vector of strings containing special tokens.
+  /// \param[in] prepend_special Whether the special_tokens will be prepended (true) or appended (false) to the vocab.
+  /// \param[out] vocab A vocab object.
+  /// \return Status code.
+  /// \par Example
+  /// \code
+  ///     // Build vocab from a local file
+  ///     std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
+  ///     std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
+  ///     Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"", ""}, true, &vocab);
+  /// \endcode
+  static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
+                              const std::vector<WordType> &special_tokens, bool prepend_special,
+                              std::shared_ptr<Vocab> *vocab);
+
+  /// Look up the id of a word; if the word doesn't exist in the vocab, return -1.
+  /// \param word Word to be looked up.
+  /// \return ID of the word in the vocab.
+  /// \par Example
+  /// \code
+  ///     // lookup, convert token to id
+  ///     auto single_index = vocab->TokensToIds("home");
+  ///     single_index = vocab->TokensToIds("hello");
+  /// \endcode
+  WordIdType TokensToIds(const WordType &word) const;
+
+  /// Look up the ids of a vector of words; any word that doesn't exist in the vocab maps to -1.
+  /// \param words Words to be looked up.
+  /// \return IDs of the words in the vocab.
+  /// \par Example
+  /// \code
+  ///     // lookup multiple tokens
+  ///     auto multi_indexs = vocab->TokensToIds(std::vector<std::string>{"", "behind"});
+  ///     std::vector<int32_t> expected_multi_indexs = {0, 4};
+  ///     multi_indexs = vocab->TokensToIds(std::vector<std::string>{"", "apple"});
+  ///     expected_multi_indexs = {0, -1};
+  /// \endcode
+  std::vector<WordIdType> TokensToIds(const std::vector<WordType> &words) const;
+
+  /// Look up the word of an ID; if the ID doesn't exist in the vocab, return an empty string.
+  /// \param id ID to be looked up.
+  /// \return The word corresponding to the ID.
+  /// \par Example
+  /// \code
+  ///     // reverse lookup, convert id to token
+  ///     auto single_word = vocab->IdsToTokens(2);
+  ///     single_word = vocab->IdsToTokens(-1);
+  /// \endcode
+  WordType IdsToTokens(const WordIdType &id);
+
+  /// Look up the words of a vector of IDs; any ID that doesn't exist in the vocab maps to an empty string.
+  /// \param ids IDs to be looked up.
+  /// \return The words corresponding to the IDs.
+  /// \par Example
+  /// \code
+  ///     // reverse lookup multiple ids
+  ///     auto multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 4});
+  ///     std::vector<std::string> expected_multi_words = {"", "behind"};
+  ///     multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 99});
+  ///     expected_multi_words = {"", ""};
+  /// \endcode
+  std::vector<WordType> IdsToTokens(const std::vector<WordIdType> &ids);
+
+  /// Constructor; shouldn't be called directly, can't be private due to std::make_unique().
+  /// \param map Sanitized word2id map.
+  explicit Vocab(std::unordered_map<WordType, WordIdType> map);
+
+  /// \brief Add one word to the vocab, incrementing its index automatically.
+  /// \param word Word to be added; it is skipped if it already exists in the vocab.
+  void AppendWord(const std::string &word);
+
+  /// \brief Return a read-only vocab as an unordered_map.
+  /// \return An unordered_map of word2id.
+  const std::unordered_map<WordType, WordIdType> &GetVocab() { return word2id_; }
+
+  /// \brief Constructor.
+  Vocab() = default;
+
+  /// \brief Destructor.
+  ~Vocab() = default;
+
+  static const WordIdType kNoTokenExists;
+  static const WordType kNoIdExists;
+
+ private:
+  std::unordered_map<WordType, WordIdType> word2id_;
+  std::unordered_map<WordIdType, WordType> id2word_;
+};
+
+/// \brief SentencePiece object that is used to do word segmentation.
+class SentencePieceVocab {
+ public:
+  /// \brief Build a SentencePiece object from files.
+  /// \param[in] path_list Paths to the files which contain the sentences for training.
+  /// \param[in] vocab_size Vocabulary size.
+  /// \param[in] character_coverage Fraction of characters covered by the model. Good defaults are 0.9995 for
+  ///    languages with a rich character set like Japanese or Chinese, and 1.0 for other languages with a small
+  ///    character set.
+  /// \param[in] model_type It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
+  ///    SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
+  ///    sentence must be pre-tokenized when using SentencePieceModel.WORD type.
+  ///    - SentencePieceModel.UNIGRAM, Unigram Language Model means the next word in the sentence is assumed
+  ///      to be independent of the previous words generated by the model.
+  ///    - SentencePieceModel.BPE, refers to the byte pair encoding algorithm, which replaces the most frequent
+  ///      pair of bytes in a sentence with a single, unused byte.
+  ///    - SentencePieceModel.CHAR, refers to the char-based SentencePiece model type.
+  ///    - SentencePieceModel.WORD, refers to the word-based SentencePiece model type.
+  /// \param[in] params A dictionary of extra parameters forwarded to the SentencePiece library (usually empty).
+  /// \param[out] vocab The vocab built from the files.
+  /// \return Status code.
+  /// \par Example
+  /// \code
+  ///     std::string dataset_path;
+  ///     dataset_path = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  ///     std::vector<std::string> path_list;
+  ///     path_list.emplace_back(dataset_path);
+  ///     std::unordered_map<std::string, std::string> param_map;
+  ///     std::shared_ptr<SentencePieceVocab> spm = std::make_unique<SentencePieceVocab>();
+  ///     Status rc = SentencePieceVocab::BuildFromFile(path_list, 5000, 0.9995,
+  ///                                                   SentencePieceModel::kUnigram, param_map, &spm);
+  /// \endcode
+  static Status BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size,
+                              const float character_coverage, const SentencePieceModel model_type,
+                              const std::unordered_map<std::string, std::string> &params,
+                              std::shared_ptr<SentencePieceVocab> *vocab);
+
+  /// \brief Save the SentencePiece model into the given file path.
+  /// \param[in] vocab A SentencePiece object to be saved.
+  /// \param[in] path Path to store the model.
+  /// \param[in] filename The file name of the saved model.
+  /// \return Status code.
+  /// \par Example
+  /// \code
+  ///     // Save vocab model to local
+  ///     vocab->SaveModel(&vocab, datasets_root_path_ + "/test_sentencepiece", "m.model");
+  /// \endcode
+  static Status SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, std::string path, std::string filename);
+
+  /// \brief Constructor.
+  SentencePieceVocab();
+
+  /// \brief Destructor.
+  ~SentencePieceVocab() = default;
+
+  const std::string &model_proto();
+
+  void set_model_proto(const std::string model_proto);
+
+ private:
+  std::string model_proto_;
+};

 // Transform operations for text
 namespace text {
@@ -414,7 +609,7 @@ class MS_API NormalizeUTF8 final : public TensorTransform {
   /// \brief Constructor.
   /// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone, NormalizeForm::kNfc,
   ///    NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd](default=NormalizeForm::kNfkc).
-  ///    See http://unicode.org/reports/tr15/ for details.
+  ///    See <http://unicode.org/reports/tr15/> for details.
   ///    - NormalizeForm.kNone, remain the input string tensor unchanged.
   ///    - NormalizeForm.kNfc, normalizes with Normalization Form C.
   ///    - NormalizeForm.kNfkc, normalizes with Normalization Form KC.
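A matching sketch for the SentencePieceVocab half of the header, following the \par Example blocks above; the corpus path, output directory, and wrapper function name are placeholders:

    #include <memory>
    #include <string>
    #include <unordered_map>
    #include <vector>

    #include "include/api/status.h"
    #include "minddata/dataset/include/dataset/constants.h"
    #include "minddata/dataset/include/dataset/text.h"

    using mindspore::Status;
    using mindspore::dataset::SentencePieceModel;
    using mindspore::dataset::SentencePieceVocab;

    void SentencePieceSketch() {
      // Train a unigram model over one corpus file; extra SentencePiece params stay empty.
      std::vector<std::string> path_list = {"/path/to/test_sentencepiece/botchan.txt"};
      std::unordered_map<std::string, std::string> param_map;
      std::shared_ptr<SentencePieceVocab> spm = std::make_shared<SentencePieceVocab>();
      Status rc = SentencePieceVocab::BuildFromFile(path_list, 5000, 0.9995, SentencePieceModel::kUnigram,
                                                    param_map, &spm);
      if (rc.IsOk()) {
        // Persist the trained model to disk.
        rc = SentencePieceVocab::SaveModel(&spm, "/path/to/test_sentencepiece", "m.model");
      }
    }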
diff --git a/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc b/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc
index c4f98942230..653836c1964 100644
--- a/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc
@@ -217,7 +217,7 @@ Status LookupOperation::ValidateParams() {
     LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
   }
   if (unknown_token_ != std::nullopt) {
-    default_id_ = vocab_->Lookup(*unknown_token_);
+    default_id_ = vocab_->TokensToIds(*unknown_token_);
     if (default_id_ == Vocab::kNoTokenExists) {
       std::string err_msg = "Lookup: \"" + *unknown_token_ + "\" doesn't exist in vocab.";
       LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.cc b/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.cc
index 4f59e6b60d7..2edeb2e1507 100644
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.cc
@@ -30,7 +30,7 @@ Status LookupOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   std::vector<WordIdType> word_ids;
   word_ids.reserve(input->Size());
   for (auto itr = input->begin<std::string_view>(); itr != input->end<std::string_view>(); ++itr) {
-    WordIdType word_id = vocab_->Lookup(std::string(*itr));
+    WordIdType word_id = vocab_->TokensToIds(std::string(*itr));
     word_ids.emplace_back(word_id == Vocab::kNoTokenExists ? default_id_ : word_id);
     CHECK_FAIL_RETURN_UNEXPECTED(word_ids.back() != Vocab::kNoTokenExists,
                                  "Lookup: invalid data, token: \"" + std::string(*itr) +
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.h b/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.h
index 1b6ecf2c2af..4abc5744503 100644
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.h
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.h
@@ -23,9 +23,9 @@
 #include
 #include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/kernels/tensor_op.h"
 #include "minddata/dataset/util/status.h"
-#include "minddata/dataset/text/vocab.h"

 namespace mindspore {
 namespace dataset {
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h b/mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h
index bb2c16bd35c..89f02b2b543 100644
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h
@@ -24,10 +24,10 @@
 #include

 #include "minddata/dataset/include/dataset/constants.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/kernels/tensor_op.h"
 #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
 #include "minddata/dataset/util/status.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"

 namespace mindspore {
 namespace dataset {
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.cc b/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.cc
index d62346a145a..6a7659221e3 100644
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.cc
@@ -46,7 +46,7 @@ Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const Ru
   if (start > 0) {
     word = suffix_indicator_ + word;
   }
-  if (vocab_->Lookup(word) != Vocab::kNoTokenExists) {
+  if (vocab_->TokensToIds(word) != Vocab::kNoTokenExists) {
     *out_found = true;
     break;
   }
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.h b/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.h
index 7c959cda9c2..9766b67a5eb 100644
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.h
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/wordpiece_tokenizer_op.h
@@ -23,9 +23,9 @@
 #include "cppjieba/Unicode.hpp"

 #include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/kernels/tensor_op.h"
 #include "minddata/dataset/text/kernels/tokenizer_op.h"
-#include "minddata/dataset/text/vocab.h"
 #include "minddata/dataset/util/status.h"

 using cppjieba::DecodeRunesInString;
diff --git a/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc b/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc
index f2b9345043b..ce1b4e96415 100644
--- a/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc
+++ b/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc
@@ -14,16 +14,18 @@
  * limitations under the License.
*/ -#include "minddata/dataset/text/sentence_piece_vocab.h" - -#include #include +#include + #include +#include "include/common/utils/utils.h" +#include "minddata/dataset/include/dataset/constants.h" +#include "minddata/dataset/include/dataset/text.h" +#include "minddata/dataset/util/path.h" +#include "minddata/dataset/util/status.h" #include "utils/file_utils.h" #include "utils/ms_utils.h" -#include "include/common/utils/utils.h" -#include "minddata/dataset/util/path.h" namespace mindspore { namespace dataset { diff --git a/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h b/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h deleted file mode 100644 index 4520c2b6040..00000000000 --- a/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h +++ /dev/null @@ -1,50 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_ -#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_ - -#include -#include -#include -#include -#include "minddata/dataset/util/status.h" -#include "minddata/dataset/include/dataset/constants.h" - -namespace mindspore { -namespace dataset { - -class SentencePieceVocab { - public: - static Status BuildFromFile(const std::vector &path_list, const int32_t vocab_size, - const float character_coverage, const SentencePieceModel model_type, - const std::unordered_map ¶ms, - std::shared_ptr *vocab); - static Status SaveModel(const std::shared_ptr *vocab, std::string path, std::string filename); - SentencePieceVocab(); - - ~SentencePieceVocab() = default; - - const std::string &model_proto(); - - void set_model_proto(const std::string model_proto); - - private: - std::string model_proto_; -}; -} // namespace dataset -} // namespace mindspore -#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_ diff --git a/mindspore/ccsrc/minddata/dataset/text/vocab.cc b/mindspore/ccsrc/minddata/dataset/text/vocab.cc index 82dbb48b0f5..e1151255189 100644 --- a/mindspore/ccsrc/minddata/dataset/text/vocab.cc +++ b/mindspore/ccsrc/minddata/dataset/text/vocab.cc @@ -14,14 +14,14 @@ * limitations under the License. */ -#include "minddata/dataset/text/vocab.h" - -#include -#include -#include -#include #include +#include +#include +#include +#include +#include "minddata/dataset/include/dataset/text.h" +#include "minddata/dataset/util/status.h" #include "utils/file_utils.h" #ifndef ENABLE_ANDROID #include "utils/log_adapter.h" @@ -33,18 +33,18 @@ namespace mindspore { namespace dataset { Vocab::Vocab(std::unordered_map word2id) { word2id_ = std::move(word2id); } -WordIdType Vocab::Lookup(const WordType &word) const { +WordIdType Vocab::TokensToIds(const WordType &word) const { auto itr = word2id_.find(word); return itr == word2id_.end() ? 
kNoTokenExists : itr->second;
 }

-std::vector<WordIdType> Vocab::Lookup(const std::vector<WordType> &words) const {
+std::vector<WordIdType> Vocab::TokensToIds(const std::vector<WordType> &words) const {
   std::vector<WordIdType> ids;
-  std::transform(words.begin(), words.end(), std::back_inserter(ids), [this](auto w) { return Lookup(w); });
+  std::transform(words.begin(), words.end(), std::back_inserter(ids), [this](auto w) { return TokensToIds(w); });
   return ids;
 }

-WordType Vocab::ReverseLookup(const WordIdType &id) {
+WordType Vocab::IdsToTokens(const WordIdType &id) {
   // Lazy initialization: reverse lookup is uncommon, so building id2word_ eagerly would waste memory.
   if (id2word_.empty()) {
     for (const auto [word_, id_] : word2id_) {
@@ -55,7 +55,7 @@
   return itr == id2word_.end() ? kNoIdExists : itr->second;
 }

-std::vector<WordType> Vocab::ReverseLookup(const std::vector<WordIdType> &ids) {
+std::vector<WordType> Vocab::IdsToTokens(const std::vector<WordIdType> &ids) {
   // Lazy initialization: reverse lookup is uncommon, so building id2word_ eagerly would waste memory.
   if (id2word_.empty()) {
     for (const auto [word_, id_] : word2id_) {
@@ -63,50 +63,11 @@
     }
   }
   std::vector<WordType> words;
-  std::transform(ids.begin(), ids.end(), std::back_inserter(words), [this](auto i) { return ReverseLookup(i); });
+  std::transform(ids.begin(), ids.end(), std::back_inserter(words), [this](auto i) { return IdsToTokens(i); });
   return words;
 }

-#ifdef ENABLE_PYTHON
-Status Vocab::BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special,
-                              std::shared_ptr<Vocab> *vocab) {
-  if (vocab == nullptr) {
-    RETURN_STATUS_UNEXPECTED("Vocab::BuildFromPyList: input vocab can not be null");
-  }
-  // check of duplication on both words and special_tokens will be performed in python
-  // special_tokens and words both need to be unique, and shouldn't overlap
-  std::unordered_map<WordType, WordIdType> word2id;
-  // if special is added in front, normal words id will start from number of special tokens
-  WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
-
-  for (auto word : words) {
-    word2id[py::str(word)] = word_id++;
-  }
-
-  word_id = prepend_special ?
0 : word2id.size(); - - for (auto special_token : special_tokens) { - word2id[py::str(special_token)] = word_id++; - } - - *vocab = std::make_shared(std::move(word2id)); - return Status::OK(); -} - -Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr *vocab) { - if (vocab == nullptr) { - RETURN_STATUS_UNEXPECTED("Vocab::BuildFromPyDict: input vocab can not be null"); - } - std::unordered_map word2id; - for (auto p : words) { - word2id[py::str(p.first)] = py::reinterpret_borrow(p.second); - } - *vocab = std::make_shared(std::move(word2id)); - return Status::OK(); -} -#endif - -void Vocab::append_word(const std::string &word) { +void Vocab::AppendWord(const std::string &word) { if (word2id_.find(word) == word2id_.end()) { word2id_[word] = word2id_.size(); } @@ -161,11 +122,11 @@ Status Vocab::BuildFromVector(const std::vector &words, const std::vec return Status::OK(); } -Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size, - const std::vector &special_tokens, bool prepend_special, - std::shared_ptr *vocab) { +Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, + const std::vector &special_tokens, bool prepend_special, + std::shared_ptr *vocab) { if (vocab == nullptr) { - RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFileCpp: input vocab can not be null"); + RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFile: input vocab can not be null"); } // Validate parameters auto realpath = FileUtils::GetRealPath(path.c_str()); @@ -227,56 +188,6 @@ Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delim return Status::OK(); } -Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, - const py::list &special_tokens, bool prepend_special, std::shared_ptr *vocab) { - if (vocab == nullptr) { - RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFile: input vocab can not be null"); - } - // python validator checks special_tokens doesn't contain any duplicate words - std::unordered_set specials; - // used to check that words in file don't contain any special token that already exists - for (auto word : special_tokens) { - specials.insert(py::str(word)); - } - WordIdType word_id = prepend_special ? static_cast(special_tokens.size()) : 0; - std::unordered_map word2id; - - auto realpath = FileUtils::GetRealPath(path.c_str()); - if (!realpath.has_value()) { - RETURN_STATUS_UNEXPECTED("Get real path failed, path=" + path); - } - - std::fstream handle(realpath.value(), std::ios::in); - CHECK_FAIL_RETURN_UNEXPECTED(handle.good() && handle.is_open(), "from_file: fail to open:" + path); - std::string word; - while (std::getline(handle, word)) { - if (!delimiter.empty()) { - // if delimiter is not found, find_first_of would return std::string::npos which is -1 - word = word.substr(0, word.find_first_of(delimiter)); - } - if (word2id.find(word) != word2id.end()) { - handle.close(); - RETURN_STATUS_UNEXPECTED("from_file: duplicate word:" + word + "."); - } - if (specials.find(word) != specials.end()) { - handle.close(); - RETURN_STATUS_UNEXPECTED("from_file: special_tokens and word_list contain duplicate word:" + word); - } - word2id[word] = word_id++; - // break if enough row is read, if vocab_size is smaller than 0 - if (word2id.size() == vocab_size) break; - } - handle.close(); - word_id = prepend_special ? 
0 : word2id.size(); - - for (auto special_token : special_tokens) { - word2id[py::str(special_token)] = word_id++; - } - - *vocab = std::make_shared(std::move(word2id)); - return Status::OK(); -} - const WordIdType Vocab::kNoTokenExists = -1; const WordType Vocab::kNoIdExists = std::string(); diff --git a/mindspore/ccsrc/minddata/dataset/text/vocab.h b/mindspore/ccsrc/minddata/dataset/text/vocab.h deleted file mode 100644 index 2d08a1e94a5..00000000000 --- a/mindspore/ccsrc/minddata/dataset/text/vocab.h +++ /dev/null @@ -1,143 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_ -#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_ - -#include -#include -#include -#include - -#include "minddata/dataset/util/status.h" -#ifdef ENABLE_PYTHON -#include "pybind11/pybind11.h" -#include "pybind11/stl.h" -#endif - -namespace mindspore { -namespace dataset { -#ifdef ENABLE_PYTHON -namespace py = pybind11; -#endif - -using WordIdType = int32_t; -using WordType = std::string; - -class Vocab { - public: -#ifdef ENABLE_PYTHON - // Build a vocab from a python dictionary key is each word ,id needs to start from 2, no duplicate and continuous - // @param const py::dict &words - a dictionary containing word, word id pair. - // @param std::shared_ptr *vocab - return value, vocab object - // @return error code - static Status BuildFromPyDict(const py::dict &words, std::shared_ptr *vocab); - - // Build a vocab from a python list, id will be assigned automatically, start from 2 - // @param const py::list &words - a list of string, used to build vocab, id starts from 2 - // @param std::shared_ptr *vocab - return value, vocab object - // @return error code - static Status BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special, - std::shared_ptr *vocab); - - // Build a vocab from reading a vocab file, id are automatically assigned, start from 2 - // @param std::string &path - path to vocab file , each line is assumed to contain 1 word - // @param std::string &delimiter - delimiter to break each line with - // @param int32_t vocab_size - number of words to read from file - // @param std::shared_ptr *vocab - return value, vocab object - // @return error code - static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, - const py::list &special_tokens, bool prepend_special, std::shared_ptr *vocab); -#endif - - /// \brief Build a vocab from a c++ map. id needs to start from 2, no duplicate and continuous - /// \param[in] words An unordered_map containing word, word id pair. - /// \param[out] vocab A vocab object - /// \return Error code - static Status BuildFromUnorderedMap(const std::unordered_map &words, - std::shared_ptr *vocab); - - /// \brief Build a vocab from a c++ vector. 
id needs to start from 2, no duplicate and continuous - /// \param[in] words A vector of string, used to build vocab, id starts from 2 - /// \param[in] special_tokens A vector of string contain special tokens - /// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab - /// \param[out] vocab A vocab object - /// \return Error code - static Status BuildFromVector(const std::vector &words, const std::vector &special_tokens, - bool prepend_special, std::shared_ptr *vocab); - - /// \brief Build a vocab from reading a vocab file, id are automatically assigned, start from 2 - /// \param[in] path Path to vocab file , each line is assumed to contain 1 word - /// \param[in] delimiter Delimiter to break each line with - /// \param[in] vocab_size Number of words to read from file - /// \param[in] special_tokens A vector of string contain special tokens - /// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab - /// \param[out] vocab A vocab object - /// \return Error code - static Status BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size, - const std::vector &special_tokens, bool prepend_special, - std::shared_ptr *vocab); - - // Lookup the id of a word, if word doesn't exist in vocab, return default_id - // @param const WordType word - word to look up - // @param WordIdType default_id - word id to return to user when its not in the vocab - // @return WordIdType, word_id - WordIdType Lookup(const WordType &word) const; - - // Lookup the ids of a vector of words, if word doesn't exist in vocab, return default_id - // @param const WordType word - word to look up - // @param WordIdType default_id - word id to return to user when its not in the vocab - // @return WordIdType, word_id - std::vector Lookup(const std::vector &words) const; - - // Find the word of a id, if word doesn't exist in vocab, return empty string - // @param const WordIdType id - id to reverse look up - // @return WordType, word - WordType ReverseLookup(const WordIdType &id); - - // Find the words of a vector of ids, if word doesn't exist in vocab, return empty string - // @param const WordIdType id - id to reverse look up - // @return WordType, word - std::vector ReverseLookup(const std::vector &ids); - - // constructor, shouldn't be called directly, can't be private due to std::make_unique() - // @param std::unordered_map map - sanitized word2id map - explicit Vocab(std::unordered_map map); - - Vocab() = default; - - // add one word to vocab, increment it's index automatically - // @param std::string & word - word to be added will skip if word already exists - void append_word(const std::string &word); - - // return a read-only vocab - const std::unordered_map vocab() { return word2id_; } - - // destructor - ~Vocab() = default; - - static const WordIdType kNoTokenExists; - static const WordType kNoIdExists; - - private: - std::unordered_map word2id_; - std::unordered_map id2word_; -}; - -} // namespace dataset -} // namespace mindspore - -#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_ diff --git a/mindspore/python/mindspore/dataset/text/transforms.py b/mindspore/python/mindspore/dataset/text/transforms.py index f7b69c5beca..ed23651b50d 100644 --- a/mindspore/python/mindspore/dataset/text/transforms.py +++ b/mindspore/python/mindspore/dataset/text/transforms.py @@ -47,7 +47,7 @@ import numpy as np import mindspore._c_dataengine as cde from mindspore.common import dtype as mstype -from .utils import JiebaMode, 
NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType +from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType, SentencePieceVocab from .validators import check_lookup, check_jieba_add_dict, check_to_vectors, \ check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \ check_wordpiece_tokenizer, check_regex_replace, check_regex_tokenizer, check_basic_tokenizer, check_ngram, \ @@ -386,6 +386,7 @@ class SentencePieceTokenizer(TextTensorOperation): self.out_type = out_type def parse(self): + self.mode = self.mode.c_sentence_piece_vocab if isinstance(self.mode, SentencePieceVocab) else self.mode return cde.SentencePieceTokenizerOperation(self.mode, DE_C_INTER_SENTENCEPIECE_OUTTYPE[self.out_type]) diff --git a/mindspore/python/mindspore/dataset/text/utils.py b/mindspore/python/mindspore/dataset/text/utils.py index 0ee4d472bbc..51a8e5b8435 100644 --- a/mindspore/python/mindspore/dataset/text/utils.py +++ b/mindspore/python/mindspore/dataset/text/utils.py @@ -141,7 +141,7 @@ class Vocab: >>> dataset = dataset.map(operations=text.Lookup(vocab, ""), input_columns=["text"]) """ - vocab = Vocab() + vocab = cls() vocab.c_vocab = dataset.build_vocab(columns, freq_range, top_k, special_tokens, special_first) return vocab @@ -211,7 +211,7 @@ class Vocab: vocab_size = -1 if special_tokens is None: special_tokens = [] - vocab = Vocab() + vocab = cls() vocab.c_vocab = cde.Vocab.from_file(file_path, delimiter, vocab_size, special_tokens, special_first) return vocab @@ -232,16 +232,19 @@ class Vocab: >>> vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "": 6}) """ - vocab = Vocab() + vocab = cls() vocab.c_vocab = cde.Vocab.from_dict(word_dict) return vocab -class SentencePieceVocab(cde.SentencePieceVocab): +class SentencePieceVocab: """ SentencePiece object that is used to do words segmentation. """ + def __init__(self): + self.c_sentence_piece_vocab = None + @classmethod @check_from_dataset_sentencepiece def from_dataset(cls, dataset, col_names, vocab_size, character_coverage, model_type, params): @@ -278,8 +281,11 @@ class SentencePieceVocab(cde.SentencePieceVocab): ... SentencePieceModel.UNIGRAM, {}) """ - return dataset.build_sentencepiece_vocab(col_names, vocab_size, character_coverage, - model_type, params) + sentence_piece_vocab = cls() + sentence_piece_vocab.c_sentence_piece_vocab = dataset.build_sentencepiece_vocab(col_names, vocab_size, + character_coverage, + model_type, params) + return sentence_piece_vocab @classmethod @check_from_file_sentencepiece @@ -321,8 +327,11 @@ class SentencePieceVocab(cde.SentencePieceVocab): ... 
SentencePieceModel.UNIGRAM, {}) """ - return super().from_file(file_path, vocab_size, character_coverage, - DE_C_INTER_SENTENCEPIECE_MODE[model_type], params) + sentence_piece_vocab = cls() + sentence_piece_vocab.c_sentence_piece_vocab = \ + cde.SentencePieceVocab.from_file(file_path, vocab_size, character_coverage, + DE_C_INTER_SENTENCEPIECE_MODE[model_type], params) + return sentence_piece_vocab @classmethod @check_save_model @@ -342,7 +351,7 @@ class SentencePieceVocab(cde.SentencePieceVocab): >>> text.SentencePieceVocab.save_model(vocab, "./", "m.model") """ - super().save_model(vocab, path, filename) + cde.SentencePieceVocab.save_model(vocab.c_sentence_piece_vocab, path, filename) def to_str(array, encoding='utf8'): diff --git a/mindspore/python/mindspore/dataset/text/validators.py b/mindspore/python/mindspore/dataset/text/validators.py index 5ac05e3932c..b76d617792d 100644 --- a/mindspore/python/mindspore/dataset/text/validators.py +++ b/mindspore/python/mindspore/dataset/text/validators.py @@ -551,7 +551,7 @@ def check_save_model(method): [vocab, path, filename], _ = parse_user_args(method, *args, **kwargs) if vocab is not None: - type_check(vocab, (cde.SentencePieceVocab,), "vocab") + type_check(vocab, (text.SentencePieceVocab,), "vocab") if path is not None: type_check(path, (str,), "path") @@ -573,7 +573,7 @@ def check_sentence_piece_tokenizer(method): def new_method(self, *args, **kwargs): [mode, out_type], _ = parse_user_args(method, *args, **kwargs) - type_check(mode, (str, cde.SentencePieceVocab), "mode is not an instance of str or cde.SentencePieceVocab.") + type_check(mode, (str, text.SentencePieceVocab), "mode is not an instance of str or text.SentencePieceVocab.") type_check(out_type, (SPieceTokenizerOutType,), "out_type is not an instance of SPieceTokenizerOutType") return method(self, *args, **kwargs) diff --git a/tests/ut/cpp/dataset/build_vocab_test.cc b/tests/ut/cpp/dataset/build_vocab_test.cc index 23013fd90c5..01812000816 100644 --- a/tests/ut/cpp/dataset/build_vocab_test.cc +++ b/tests/ut/cpp/dataset/build_vocab_test.cc @@ -20,7 +20,7 @@ #include "common/common.h" #include "include/api/status.h" -#include "minddata/dataset/text/vocab.h" +#include "minddata/dataset/include/dataset/text.h" using mindspore::dataset::Tensor; using mindspore::dataset::Vocab; @@ -47,7 +47,7 @@ TEST_F(MindDataTestVocab, TestVocabFromUnorderedMap) { std::vector words = {"apple", "dog", "egg"}; std::vector expected = {1, 3, -1}; for (uint32_t i = 0; i < words.size(); ++i) { - int32_t x = vocab->Lookup(words[i]); + int32_t x = vocab->TokensToIds(words[i]); EXPECT_EQ(x, expected[i]); } } @@ -65,7 +65,7 @@ TEST_F(MindDataTestVocab, TestVocabFromEmptyMap) { std::vector words = {"apple", "dog", "egg"}; std::vector expected = {-1, -1, -1}; for (uint32_t i = 0; i < words.size(); ++i) { - int32_t x = vocab->Lookup(words[i]); + int32_t x = vocab->TokensToIds(words[i]); EXPECT_EQ(x, expected[i]); } } @@ -96,7 +96,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorPrependSpTokens) { std::vector words = {"apple", "banana", "fox"}; std::vector expected = {1, 2, -1}; for (uint32_t i = 0; i < words.size(); ++i) { - int32_t x = vocab->Lookup(words[i]); + int32_t x = vocab->TokensToIds(words[i]); EXPECT_EQ(x, expected[i]); } } @@ -113,7 +113,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorAppendSpTokens) { std::vector words = {"apple", "", "fox"}; std::vector expected = {0, 5, -1}; for (uint32_t i = 0; i < words.size(); ++i) { - int32_t x = vocab->Lookup(words[i]); + int32_t x = 
vocab->TokensToIds(words[i]); EXPECT_EQ(x, expected[i]); } } @@ -131,7 +131,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorWithNoSpTokens) { std::vector words = {"apple", "banana", "fox", ""}; std::vector expected = {0, 1, -1, -1}; for (uint32_t i = 0; i < words.size(); ++i) { - int32_t x = vocab->Lookup(words[i]); + int32_t x = vocab->TokensToIds(words[i]); EXPECT_EQ(x, expected[i]); } } @@ -149,7 +149,7 @@ TEST_F(MindDataTestVocab, TestVocabFromEmptyVector) { std::vector words = {"apple", "banana", "fox"}; std::vector expected = {-1, -1, -1}; for (uint32_t i = 0; i < words.size(); ++i) { - int32_t x = vocab->Lookup(words[i]); + int32_t x = vocab->TokensToIds(words[i]); EXPECT_EQ(x, expected[i]); } } @@ -195,14 +195,14 @@ TEST_F(MindDataTestVocab, TestVocabFromFile) { // Build vocab from local file std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; std::shared_ptr vocab = std::make_shared(); - Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"", ""}, true, &vocab); + Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"", ""}, true, &vocab); EXPECT_EQ(s, Status::OK()); // Look up specified words std::vector words = {"not", "all"}; std::vector expected = {2, 3}; for (uint32_t i = 0; i < words.size(); ++i) { - int32_t x = vocab->Lookup(words[i]); + int32_t x = vocab->TokensToIds(words[i]); EXPECT_EQ(x, expected[i]); } } @@ -212,7 +212,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail1) { // Build vocab from local file which is not exist std::string vocab_dir = datasets_root_path_ + "/testVocab/not_exist.txt"; std::shared_ptr vocab = std::make_shared(); - Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {}, true, &vocab); + Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {}, true, &vocab); EXPECT_NE(s, Status::OK()); } @@ -223,7 +223,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail2) { std::shared_ptr vocab = std::make_shared(); // Expected failure: vocab_size should be either -1 or positive integer - Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab); + Status s = Vocab::BuildFromFile(vocab_dir, ",", -2, {}, true, &vocab); EXPECT_NE(s, Status::OK()); } @@ -234,7 +234,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail3) { std::shared_ptr vocab = std::make_shared(); // Expected failure: duplicate special token - Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"", ""}, true, &vocab); + Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"", ""}, true, &vocab); EXPECT_NE(s, Status::OK()); } @@ -245,6 +245,6 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail4) { std::shared_ptr vocab = std::make_shared(); // Expected failure: special_tokens and word_list contain duplicate word - Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"home"}, true, &vocab); + Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"home"}, true, &vocab); EXPECT_NE(s, Status::OK()); } diff --git a/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc b/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc index 4668f641ca4..c1c80917237 100644 --- a/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc +++ b/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc @@ -23,7 +23,6 @@ #include "minddata/dataset/include/dataset/datasets.h" #include "minddata/dataset/include/dataset/text.h" #include "minddata/dataset/include/dataset/transforms.h" -#include "minddata/dataset/text/sentence_piece_vocab.h" using namespace mindspore::dataset; using mindspore::dataset::SentencePieceModel; diff --git 
a/tests/ut/cpp/dataset/c_api_text_test.cc b/tests/ut/cpp/dataset/c_api_text_test.cc index ce6a51bccef..51566e92cc3 100644 --- a/tests/ut/cpp/dataset/c_api_text_test.cc +++ b/tests/ut/cpp/dataset/c_api_text_test.cc @@ -27,7 +27,6 @@ #include "minddata/dataset/text/fast_text.h" #include "minddata/dataset/text/glove.h" #include "minddata/dataset/text/vectors.h" -#include "minddata/dataset/text/vocab.h" using namespace mindspore::dataset; using mindspore::Status; @@ -797,7 +796,7 @@ TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) { // Iterate the dataset and get each row std::unordered_map row; ASSERT_OK(iter->GetNextRow(&row)); - std::vector expected = {"welcome to beijing","",""}; + std::vector expected = {"welcome to beijing", "", ""}; uint64_t i = 0; @@ -806,7 +805,7 @@ TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) { std::shared_ptr de_expected_tensor; ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor)); mindspore::MSTensor ms_expected_tensor = - mindspore::MSTensor(std::make_shared(de_expected_tensor)); + mindspore::MSTensor(std::make_shared(de_expected_tensor)); EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor); ASSERT_OK(iter->GetNextRow(&row)); i++; @@ -1709,8 +1708,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail1) { EXPECT_NE(ds, nullptr); // Create ToNumber operation on ds - std::shared_ptr to_number = - std::make_shared(mindspore::DataType::kNumberTypeInt8); + std::shared_ptr to_number = std::make_shared(mindspore::DataType::kNumberTypeInt8); EXPECT_NE(to_number, nullptr); // Create a Map operation on ds @@ -1760,7 +1758,8 @@ TEST_F(MindDataTestPipeline, TestToNumberFail2) { EXPECT_NE(ds, nullptr); // Create ToNumber operation on ds - std::shared_ptr to_number = std::make_shared(mindspore::DataType::kNumberTypeFloat16); + std::shared_ptr to_number = + std::make_shared(mindspore::DataType::kNumberTypeFloat16); EXPECT_NE(to_number, nullptr); // Create a Map operation on ds @@ -2143,8 +2142,7 @@ TEST_F(MindDataTestPipeline, TestNgramSuccess1) { ASSERT_OK(iter->GetNextRow(&row)); std::vector> expected = { - {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a", - "is-a-text", + {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a", "is-a-text", "a-text-file.", "text-file.-&", "file.-&-&"}, {"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every", "happy-every-day.", "every-day.-&", "day.-&-&"}, @@ -4371,8 +4369,7 @@ TEST_F(MindDataTestPipeline, TestGloVeDefaultParam) { Status s = GloVe::BuildFromFile(&glove, vectors_dir); EXPECT_EQ(s, Status::OK()); - std::shared_ptr lookup = - std::make_shared(glove); + std::shared_ptr lookup = std::make_shared(glove); EXPECT_NE(lookup, nullptr); // Create Map operation on ds @@ -4388,14 +4385,13 @@ TEST_F(MindDataTestPipeline, TestGloVeDefaultParam) { ASSERT_OK(iter->GetNextRow(&row)); uint64_t i = 0; - std::vector> expected = { - {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, - {0, 0, 0, 0, 0, 0}, - {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, - {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, - {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, - {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, - {0, 0, 0, 0, 0, 0}}; + std::vector> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, + {0, 0, 0, 0, 0, 0}, + {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, + {0.70853, 0.57088, -0.4716, 0.18048, 
0.54449, 0.72603}, + {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, + {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, + {0, 0, 0, 0, 0, 0}}; while (row.size() != 0) { auto ind = row["text"]; MS_LOG(INFO) << ind.Shape(); @@ -4434,8 +4430,7 @@ TEST_F(MindDataTestPipeline, TestGloVeAllBuildfromfileParams) { Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100); EXPECT_EQ(s, Status::OK()); - std::shared_ptr lookup = - std::make_shared(glove); + std::shared_ptr lookup = std::make_shared(glove); EXPECT_NE(lookup, nullptr); // Create Map operation on ds @@ -4451,14 +4446,13 @@ TEST_F(MindDataTestPipeline, TestGloVeAllBuildfromfileParams) { ASSERT_OK(iter->GetNextRow(&row)); uint64_t i = 0; - std::vector> expected = { - {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, - {0, 0, 0, 0, 0, 0}, - {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, - {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, - {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, - {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, - {0, 0, 0, 0, 0, 0}}; + std::vector> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, + {0, 0, 0, 0, 0, 0}, + {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, + {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, + {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, + {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, + {0, 0, 0, 0, 0, 0}}; while (row.size() != 0) { auto ind = row["text"]; MS_LOG(INFO) << ind.Shape(); @@ -4498,8 +4492,7 @@ TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) { EXPECT_EQ(s, Status::OK()); std::vector unknown_init = {-1, -1, -1, -1, -1, -1}; - std::shared_ptr lookup = - std::make_shared(glove, unknown_init); + std::shared_ptr lookup = std::make_shared(glove, unknown_init); EXPECT_NE(lookup, nullptr); // Create Map operation on ds @@ -4515,14 +4508,13 @@ TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) { ASSERT_OK(iter->GetNextRow(&row)); uint64_t i = 0; - std::vector> expected = { - {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, - {-1, -1, -1, -1, -1, -1}, - {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, - {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, - {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, - {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, - {-1, -1, -1, -1, -1, -1}}; + std::vector> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, + {-1, -1, -1, -1, -1, -1}, + {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, + {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603}, + {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246}, + {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923}, + {-1, -1, -1, -1, -1, -1}}; while (row.size() != 0) { auto ind = row["text"]; MS_LOG(INFO) << ind.Shape(); @@ -4562,8 +4554,7 @@ TEST_F(MindDataTestPipeline, TestGloVeAllParams) { EXPECT_EQ(s, Status::OK()); std::vector unknown_init = {-1, -1, -1, -1, -1, -1}; - std::shared_ptr lookup = - std::make_shared(glove, unknown_init, true); + std::shared_ptr lookup = std::make_shared(glove, unknown_init, true); EXPECT_NE(lookup, nullptr); // Create Map operation on ds @@ -4579,14 +4570,13 @@ TEST_F(MindDataTestPipeline, TestGloVeAllParams) { ASSERT_OK(iter->GetNextRow(&row)); uint64_t i = 0; - std::vector> expected = { - {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411}, - {-1, -1, -1, -1, -1, -1}, - {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973}, - {0.70853, 0.57088, 
-    {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
-    {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
-    {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
-    {-1, -1, -1, -1, -1, -1}};
+  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
+                                              {-1, -1, -1, -1, -1, -1},
+                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
+                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
+                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
+                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
+                                              {-1, -1, -1, -1, -1, -1}};
   while (row.size() != 0) {
     auto ind = row["text"];
     MS_LOG(INFO) << ind.Shape();
@@ -4748,13 +4738,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramDefaultParam) {
   ASSERT_OK(iter->GetNextRow(&row));
 
   uint64_t i = 0;
-  std::vector<std::vector<float>> expected = {{0,0,0,0,0},
-                                              {0,0,0,0,0},
-                                              {0.117336,0.362446,-0.983326,0.939264,-0.05648},
-                                              {0.657201,2.11761,-1.59276,0.432072,1.21395},
-                                              {0,0,0,0,0},
-                                              {-2.26956,0.288491,-0.740001,0.661703,0.147355},
-                                              {0,0,0,0,0}};
+  std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
+                                              {0, 0, 0, 0, 0},
+                                              {0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
+                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
+                                              {0, 0, 0, 0, 0},
+                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
+                                              {0, 0, 0, 0, 0}};
   while (row.size() != 0) {
     auto ind = row["text"];
     MS_LOG(INFO) << ind.Shape();
@@ -4810,13 +4800,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramAllBuildfromfileParams) {
   ASSERT_OK(iter->GetNextRow(&row));
 
   uint64_t i = 0;
-  std::vector<std::vector<float>> expected = {{0,0,0,0,0},
-                                              {0,0,0,0,0},
-                                              {-0.155665,0.664073,-0.538499,1.22657,-0.2162},
-                                              {0.657201,2.11761,-1.59276,0.432072,1.21395},
-                                              {0,0,0,0,0},
-                                              {-2.26956,0.288491,-0.740001,0.661703,0.147355},
-                                              {0,0,0,0,0}};
+  std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
+                                              {0, 0, 0, 0, 0},
+                                              {-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
+                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
+                                              {0, 0, 0, 0, 0},
+                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
+                                              {0, 0, 0, 0, 0}};
   while (row.size() != 0) {
     auto ind = row["text"];
     MS_LOG(INFO) << ind.Shape();
@@ -4873,13 +4863,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramUnknownInit) {
   ASSERT_OK(iter->GetNextRow(&row));
 
   uint64_t i = 0;
-  std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1},
-                                              {-1,-1,-1,-1,-1},
-                                              {-0.155665,0.664073,-0.538499,1.22657,-0.2162},
-                                              {0.657201,2.11761,-1.59276,0.432072,1.21395},
-                                              {-1,-1,-1,-1,-1},
-                                              {-2.26956,0.288491,-0.740001,0.661703,0.147355},
-                                              {-1,-1,-1,-1,-1}};
+  std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
+                                              {-1, -1, -1, -1, -1},
+                                              {-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
+                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
+                                              {-1, -1, -1, -1, -1},
+                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
+                                              {-1, -1, -1, -1, -1}};
   while (row.size() != 0) {
     auto ind = row["text"];
     MS_LOG(INFO) << ind.Shape();
@@ -4936,13 +4926,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramAllParams) {
   ASSERT_OK(iter->GetNextRow(&row));
 
   uint64_t i = 0;
-  std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1},
-                                              {-1,-1,-1,-1,-1},
-                                              {0.117336,0.362446,-0.983326,0.939264,-0.05648},
-                                              {0.657201,2.11761,-1.59276,0.432072,1.21395},
-                                              {-1,-1,-1,-1,-1},
-                                              {-2.26956,0.288491,-0.740001,0.661703,0.147355},
-                                              {-1,-1,-1,-1,-1}};
+  std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
+                                              {-1, -1, -1, -1, -1},
+                                              {0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
+                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
+                                              {-1, -1, -1, -1, -1},
+                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
+                                              {-1, -1, -1, -1, -1}};
   while (row.size() != 0) {
     auto ind = row["text"];
     MS_LOG(INFO) << ind.Shape();
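The GloVe and CharNGram hunks above only reflow test data; the lookup pattern they exercise is unchanged. For reference, a minimal sketch of that pattern, assuming a pre-built vectors file ("fake_glove.txt" is a placeholder path, not a file from this patch):

    // Build a GloVe table from file, then map tokens to vectors with text::ToVectors.
    #include "minddata/dataset/include/dataset/text.h"

    using namespace mindspore::dataset;

    std::shared_ptr<GloVe> glove;
    Status s = GloVe::BuildFromFile(&glove, "fake_glove.txt", 100);  // keep at most 100 vectors

    // Tokens missing from the table map to unknown_init; the trailing bool
    // lower-cases each token before the lookup.
    std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
    std::shared_ptr<TensorTransform> lookup =
        std::make_shared<text::ToVectors>(glove, unknown_init, true);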
diff --git a/tests/ut/cpp/dataset/c_api_text_vocab_test.cc b/tests/ut/cpp/dataset/c_api_text_vocab_test.cc
index 4c16e56de6d..f9c1736ecdf 100644
--- a/tests/ut/cpp/dataset/c_api_text_vocab_test.cc
+++ b/tests/ut/cpp/dataset/c_api_text_vocab_test.cc
@@ -22,7 +22,6 @@
 #include "minddata/dataset/include/dataset/datasets.h"
 #include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/include/dataset/transforms.h"
-#include "minddata/dataset/text/vocab.h"
 
 using namespace mindspore::dataset;
 using mindspore::Status;
@@ -42,7 +41,7 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
   } while (false)
 
 /// Feature: C++ text.Vocab class.
-/// Description: test Lookup() ReverseLookup() methods of text::Vocab.
+/// Description: test TokensToIds() IdsToTokens() methods of text::Vocab.
 /// Expectation: success.
 TEST_F(MindDataTestPipeline, TestVocabLookupAndReverseLookup) {
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupAndReverseLookup.";
@@ -53,30 +52,30 @@ TEST_F(MindDataTestPipeline, TestVocabLookupAndReverseLookup) {
   EXPECT_EQ(s, Status::OK());
 
   // lookup, convert token to id
-  auto single_index = vocab->Lookup("home");
+  auto single_index = vocab->TokensToIds("home");
   EXPECT_EQ(single_index, 2);
-  single_index = vocab->Lookup("hello");
+  single_index = vocab->TokensToIds("hello");
   EXPECT_EQ(single_index, -1);
 
   // lookup multiple tokens
-  auto multi_indexs = vocab->Lookup(std::vector<std::string>{"<pad>", "behind"});
+  auto multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "behind"});
   std::vector<int32_t> expected_multi_indexs = {0, 4};
   EXPECT_EQ(multi_indexs, expected_multi_indexs);
-  multi_indexs = vocab->Lookup(std::vector<std::string>{"<pad>", "apple"});
+  multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "apple"});
   expected_multi_indexs = {0, -1};
   EXPECT_EQ(multi_indexs, expected_multi_indexs);
 
   // reverse lookup, convert id to token
-  auto single_word = vocab->ReverseLookup(2);
+  auto single_word = vocab->IdsToTokens(2);
   EXPECT_EQ(single_word, "home");
-  single_word = vocab->ReverseLookup(-1);
+  single_word = vocab->IdsToTokens(-1);
   EXPECT_EQ(single_word, "");
 
   // reverse lookup multiple ids
-  auto multi_words = vocab->ReverseLookup(std::vector<int32_t>{0, 4});
+  auto multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 4});
   std::vector<std::string> expected_multi_words = {"<pad>", "behind"};
   EXPECT_EQ(multi_words, expected_multi_words);
-  multi_words = vocab->ReverseLookup(std::vector<int32_t>{0, 99});
+  multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 99});
   expected_multi_words = {"<pad>", ""};
   EXPECT_EQ(multi_words, expected_multi_words);
 }
@@ -330,7 +329,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
   EXPECT_NE(vocab, nullptr);
 
   // Check if vocab has words or not
-  int32_t home_index = vocab->Lookup("home");
+  int32_t home_index = vocab->TokensToIds("home");
   EXPECT_EQ(home_index, 4);
 
   // Create Lookup operation on ds
@@ -386,7 +385,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
   EXPECT_NE(vocab, nullptr);
 
   // Check if vocab has words or not
-  int32_t home_index = vocab->Lookup("home");
+  int32_t home_index = vocab->TokensToIds("home");
   EXPECT_EQ(home_index, 2);
 
   // Create Lookup operation on ds
@@ -509,7 +508,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) {
   EXPECT_NE(vocab, nullptr);
 
   // Check if vocab has words or not
-  int32_t home_index = vocab->Lookup("home");
+  int32_t home_index = vocab->TokensToIds("home");
   EXPECT_EQ(home_index, 2);
 
   // Create Lookup operation on ds
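The renamed Vocab methods tested above accept either a single token/id or a batch in one call. A minimal sketch of the new interface, assuming a vocab built with Vocab::BuildFromVector (the word list is illustrative, not the test's data file):

    #include "minddata/dataset/include/dataset/text.h"

    using namespace mindspore::dataset;

    std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
    // special_first=true places "<pad>" at id 0 and "<unk>" at id 1.
    Status s = Vocab::BuildFromVector({"home", "IS", "behind"}, {"<pad>", "<unk>"}, true, &vocab);

    auto id = vocab->TokensToIds("home");                                        // 2
    auto ids = vocab->TokensToIds(std::vector<std::string>{"<pad>", "behind"});  // {0, 4}
    auto token = vocab->IdsToTokens(2);                                          // "home"
    auto tokens = vocab->IdsToTokens(std::vector<int32_t>{0, 4});                // {"<pad>", "behind"}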
diff --git a/tests/ut/cpp/dataset/sentence_piece_vocab_op_test.cc b/tests/ut/cpp/dataset/sentence_piece_vocab_op_test.cc
index af0058fd6ba..a86a51c44e1 100644
--- a/tests/ut/cpp/dataset/sentence_piece_vocab_op_test.cc
+++ b/tests/ut/cpp/dataset/sentence_piece_vocab_op_test.cc
@@ -19,7 +19,7 @@
 #include "common/common.h"
 #include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
 #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/engine/datasetops/source/text_file_op.h"
 #include "gtest/gtest.h"
 #include "utils/log_adapter.h"
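With sentence_piece_vocab.h folded into the public dataset/text.h header, tests and callers now reach SentencePieceVocab through the same include as Vocab. A minimal sketch of the aligned C++ entry point, assuming a plain-text corpus file ("corpus.txt" is a placeholder) and no extra trainer params:

    #include "minddata/dataset/include/dataset/text.h"

    using namespace mindspore::dataset;

    std::shared_ptr<SentencePieceVocab> spm;
    std::vector<std::string> corpus_files = {"corpus.txt"};     // hypothetical input file
    std::unordered_map<std::string, std::string> params = {};   // extra trainer flags, if any
    Status s = SentencePieceVocab::BuildFromFile(corpus_files, 5000, 0.9995,
                                                 SentencePieceModel::kUnigram, params, &spm);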