!9530 add BasicTokenizer and BertTokenizer C++ API

From: @tiancixiao
Reviewed-by: @mikef, @nsyca
Signed-off-by: @nsyca
Committed by mindspore-ci-bot on 2020-12-08 04:09:40 +08:00 via Gitee
commit 6b5626634c
3 changed files with 795 additions and 0 deletions

@@ -18,6 +18,8 @@
#include "minddata/dataset/include/text.h"
#ifndef _WIN32
#include "minddata/dataset/text/kernels/basic_tokenizer_op.h"
#include "minddata/dataset/text/kernels/bert_tokenizer_op.h"
#include "minddata/dataset/text/kernels/case_fold_op.h"
#endif
#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
@@ -47,6 +49,27 @@ namespace text {
// (In alphabetical order)
#ifndef _WIN32
std::shared_ptr<BasicTokenizerOperation> BasicTokenizer(bool lower_case, bool keep_whitespace,
const NormalizeForm normalize_form, bool preserve_unused_token,
bool with_offsets) {
auto op = std::make_shared<BasicTokenizerOperation>(lower_case, keep_whitespace, normalize_form,
preserve_unused_token, with_offsets);
return op->ValidateParams() ? op : nullptr;
}
std::shared_ptr<BertTokenizerOperation> BertTokenizer(const std::shared_ptr<Vocab> &vocab,
const std::string &suffix_indicator, int32_t max_bytes_per_token,
const std::string &unknown_token, bool lower_case,
bool keep_whitespace, const NormalizeForm normalize_form,
bool preserve_unused_token, bool with_offsets) {
auto op =
std::make_shared<BertTokenizerOperation>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case,
keep_whitespace, normalize_form, preserve_unused_token, with_offsets);
return op->ValidateParams() ? op : nullptr;
}
std::shared_ptr<CaseFoldOperation> CaseFold() {
auto op = std::make_shared<CaseFoldOperation>();
@@ -168,6 +191,64 @@ Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::s
// (In alphabetical order)
#ifndef _WIN32
// BasicTokenizerOperation
BasicTokenizerOperation::BasicTokenizerOperation(bool lower_case, bool keep_whitespace,
const NormalizeForm normalize_form, bool preserve_unused_token,
bool with_offsets)
: lower_case_(lower_case),
keep_whitespace_(keep_whitespace),
normalize_form_(normalize_form),
preserve_unused_token_(preserve_unused_token),
with_offsets_(with_offsets) {}
Status BasicTokenizerOperation::ValidateParams() { return Status::OK(); }
std::shared_ptr<TensorOp> BasicTokenizerOperation::Build() {
std::shared_ptr<BasicTokenizerOp> tensor_op = std::make_shared<BasicTokenizerOp>(
lower_case_, keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_);
return tensor_op;
}
// BertTokenizerOperation
BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
int32_t max_bytes_per_token, const std::string &unknown_token,
bool lower_case, bool keep_whitespace,
const NormalizeForm normalize_form, bool preserve_unused_token,
bool with_offsets)
: vocab_(vocab),
suffix_indicator_(suffix_indicator),
max_bytes_per_token_(max_bytes_per_token),
unknown_token_(unknown_token),
lower_case_(lower_case),
keep_whitespace_(keep_whitespace),
normalize_form_(normalize_form),
preserve_unused_token_(preserve_unused_token),
with_offsets_(with_offsets) {}
Status BertTokenizerOperation::ValidateParams() {
if (vocab_ == nullptr) {
std::string err_msg = "BertTokenizer: vocab object type is incorrect or null.";
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
if (max_bytes_per_token_ < 0) {
std::string err_msg = "BertTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " +
std::to_string(max_bytes_per_token_);
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
return Status::OK();
}
std::shared_ptr<TensorOp> BertTokenizerOperation::Build() {
std::shared_ptr<BertTokenizerOp> tensor_op =
std::make_shared<BertTokenizerOp>(vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, lower_case_,
keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_);
return tensor_op;
}
// CaseFoldOperation
Status CaseFoldOperation::ValidateParams() { return Status::OK(); }
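
Note: the factories above share a validate-or-return-nullptr convention: ValidateParams() runs eagerly at construction time, and a null shared pointer signals invalid arguments. A minimal caller-side sketch (the error handling is illustrative and not part of this patch; vocab is assumed to be built as in the tests below):

auto tokenizer = text::BertTokenizer(vocab, "##", -1);  // invalid: negative max_bytes_per_token
if (tokenizer == nullptr) {
  // ValidateParams() rejected the arguments; bail out before building the pipeline.
  MS_LOG(ERROR) << "BertTokenizer parameters failed validation.";
  return;
}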

@@ -38,6 +38,8 @@ namespace dataset {
namespace text {
// Char arrays storing name of corresponding classes (in alphabetical order)
constexpr char kBasicTokenizerOperation[] = "BasicTokenizer";
constexpr char kBertTokenizerOperation[] = "BertTokenizer";
constexpr char kCaseFoldOperation[] = "CaseFold";
constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer";
constexpr char kLookupOperation[] = "Lookup";
@@ -53,6 +55,8 @@ constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer";
// Text Op classes (in alphabetical order)
#ifndef _WIN32
class BasicTokenizerOperation;
class BertTokenizerOperation;
class CaseFoldOperation;
#endif
class JiebaTokenizerOperation;
@@ -72,6 +76,47 @@ class WhitespaceTokenizerOperation;
#endif
#ifndef _WIN32
/// \brief Tokenize a scalar tensor of UTF-8 string by specific rules.
/// \notes BasicTokenizer is not supported on the Windows platform yet.
/// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations on the input
/// text to fold it to lower case and strip accent characters. If false, only apply the NormalizeUTF8
/// ('normalization_form' mode) operation on the input text (default=false).
/// \param[in] keep_whitespace If true, whitespace will be kept in the output tokens (default=false).
/// \param[in] normalize_form Used to specify the normalization form. This is only effective when 'lower_case' is
/// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
/// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and
/// '[MASK]' (default=true).
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<BasicTokenizerOperation> BasicTokenizer(bool lower_case = false, bool keep_whitespace = false,
const NormalizeForm normalize_form = NormalizeForm::kNone,
bool preserve_unused_token = true, bool with_offsets = false);
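
A minimal usage sketch with the pipeline API exercised by the tests below; the corpus path here is hypothetical:

// Read a text file and tokenize each line with the default settings
// (lower_case=false, keep_whitespace=false, NormalizeForm::kNone,
// preserve_unused_token=true, with_offsets=false).
std::shared_ptr<Dataset> ds = TextFile({"/path/to/corpus.txt"}, 0, ShuffleMode::kFalse);
std::shared_ptr<TensorOperation> basic_tokenizer = text::BasicTokenizer();
ds = ds->Map({basic_tokenizer}, {"text"});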
/// \brief Tokenizer used for Bert text processing.
/// \notes BertTokenizer is not supported on the Windows platform yet.
/// \param[in] vocab A Vocab object.
/// \param[in] suffix_indicator Used to show that the subword is the last part of a word (default='##').
/// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
/// \param[in] unknown_token When a token cannot be found: if 'unknown_token' is an empty string, return the token
/// directly; otherwise return the specified string (default='[UNK]').
/// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations on the input
/// text to fold it to lower case and strip accent characters. If false, only apply the NormalizeUTF8
/// ('normalization_form' mode) operation on the input text (default=false).
/// \param[in] keep_whitespace If true, whitespace will be kept in the output tokens (default=false).
/// \param[in] normalize_form Used to specify the normalization form. This is only effective when 'lower_case' is
/// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
/// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and
/// '[MASK]' (default=true).
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<BertTokenizerOperation> BertTokenizer(const std::shared_ptr<Vocab> &vocab,
const std::string &suffix_indicator = "##",
int32_t max_bytes_per_token = 100,
const std::string &unknown_token = "[UNK]",
bool lower_case = false, bool keep_whitespace = false,
const NormalizeForm normalize_form = NormalizeForm::kNone,
bool preserve_unused_token = true, bool with_offsets = false);
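
A similar sketch for BertTokenizer, reusing the dataset ds from the sketch above and building the vocabulary in memory with Vocab::BuildFromVector, as the tests below do; the word list here is hypothetical:

// Build a small in-memory vocabulary; '##'-prefixed entries are subwords.
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector({"i", "am", "work", "##ing"}, {}, true, &vocab);
std::shared_ptr<TensorOperation> bert_tokenizer = text::BertTokenizer(vocab);
if (bert_tokenizer != nullptr) {
  ds = ds->Map({bert_tokenizer}, {"text"});
}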
/// \brief Apply case fold operation on UTF-8 string tensor.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<CaseFoldOperation> CaseFold();
@@ -193,6 +238,54 @@ std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offs
/* ####################################### Derived TensorOperation classes ################################# */
#ifndef _WIN32
class BasicTokenizerOperation : public TensorOperation {
public:
BasicTokenizerOperation(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
bool preserve_unused_token, bool with_offsets);
~BasicTokenizerOperation() = default;
std::shared_ptr<TensorOp> Build() override;
Status ValidateParams() override;
std::string Name() const override { return kBasicTokenizerOperation; }
private:
bool lower_case_;
bool keep_whitespace_;
NormalizeForm normalize_form_;
bool preserve_unused_token_;
bool with_offsets_;
};
class BertTokenizerOperation : public TensorOperation {
public:
BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case,
bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
bool with_offsets);
~BertTokenizerOperation() = default;
std::shared_ptr<TensorOp> Build() override;
Status ValidateParams() override;
std::string Name() const override { return kBertTokenizerOperation; }
private:
std::shared_ptr<Vocab> vocab_;
std::string suffix_indicator_;
int32_t max_bytes_per_token_;
std::string unknown_token_;
bool lower_case_;
bool keep_whitespace_;
NormalizeForm normalize_form_;
bool preserve_unused_token_;
bool with_offsets_;
};
class CaseFoldOperation : public TensorOperation {
public:
CaseFoldOperation() = default;

@@ -34,6 +34,627 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
protected:
};
TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess1.";
// Test BasicTokenizer with default parameters
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(6);
EXPECT_NE(ds, nullptr);
// Create BasicTokenizer operation on ds
std::shared_ptr<TensorOperation> basic_tokenizer = text::BasicTokenizer();
EXPECT_NE(basic_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({basic_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::vector<std::string>> expected = {
{"Welcome", "to", "Beijing", "", "", "", "", ""},
{"", "", "", "", "", "", "", "", "", "", "", "", "", "", ""},
{"😀", "", "", "😃", "", "", "😄", "", "", "😁", "", ""},
{"", "", "", "1368", "", "1644", "", "", "", "", "", "", "1644", "", "1911", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""},
{"", "", "", "1368", "-", "1644", "", "", "", "", "", "1644",
"-", "1911", "", "", "", "", "", "", "", "", "", "",
"", "", "", "における", "", "", "の2つの", "", "", "でした"},
{"명나라", "(", "1368", "-", "1644", ")", "", "청나라", "(", "1644", "-",
"1911", ")", "", "중국", "봉건", "왕조의", "역사에서", "마지막", "", "왕조였다"}};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected[i], &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 6);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess2) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess2.";
// Test BasicTokenizer with lower_case true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(6);
EXPECT_NE(ds, nullptr);
// Create BasicTokenizer operation on ds
std::shared_ptr<TensorOperation> basic_tokenizer = text::BasicTokenizer(true);
EXPECT_NE(basic_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({basic_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"this", "is", "a", "funky", "string"};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected, &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess3) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess3.";
// Test BasicTokenizer with with_offsets true and lower_case true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(6);
EXPECT_NE(ds, nullptr);
// Create BasicTokenizer operation on ds
std::shared_ptr<TensorOperation> basic_tokenizer =
text::BasicTokenizer(true, false, NormalizeForm::kNone, true, true);
EXPECT_NE(basic_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({basic_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected_tokens = {"this", "is", "a", "funky", "string"};
std::vector<uint32_t> expected_offsets_start = {0, 5, 8, 10, 16};
std::vector<uint32_t> expected_offsets_limit = {4, 7, 9, 15, 22};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["token"];
std::shared_ptr<Tensor> expected_token_tensor;
Tensor::CreateFromVector(expected_tokens, &expected_token_tensor);
EXPECT_EQ(*ind, *expected_token_tensor);
auto start = row["offsets_start"];
std::shared_ptr<Tensor> expected_start_tensor;
Tensor::CreateFromVector(expected_offsets_start, &expected_start_tensor);
EXPECT_EQ(*start, *expected_start_tensor);
auto limit = row["offsets_limit"];
std::shared_ptr<Tensor> expected_limit_tensor;
Tensor::CreateFromVector(expected_offsets_limit, &expected_limit_tensor);
EXPECT_EQ(*limit, *expected_limit_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
std::vector<std::string> list = {
"", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "",
"", "i", "am", "mak", "make", "small", "mistake", "##s", "during", "work", "##ing", "hour",
"😀", "😃", "😄", "😁", "+", "/", "-", "=", "12", "28", "40", "16",
" ", "I", "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]", "[unused1]", "[unused10]"};
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess1.";
// Test BertTokenizer with default parameters
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(4);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer = text::BertTokenizer(vocab);
EXPECT_NE(bert_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({bert_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::vector<std::string>> expected = {{"", "", "", "", ""},
{"", "", "", "", ""},
{"", "", "", "", ""},
{"", "", "", "", ""}};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected[i], &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 4);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess2) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess2.";
// Test BertTokenizer with lower_case true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(4);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(1);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer = text::BertTokenizer(vocab, "##", 100, "[UNK]", true);
EXPECT_NE(bert_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({bert_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"i", "am", "mak", "##ing", "small", "mistake",
"##s", "during", "work", "##ing", "hour", "##s"};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected, &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess3) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess3.";
// Test BertTokenizer with normalization_form NFC
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(5);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(2);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer =
text::BertTokenizer(vocab, "##", 100, "[UNK]", false, false, NormalizeForm::kNfc);
EXPECT_NE(bert_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({bert_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::vector<std::string>> expected = {
{"😀", "", "", "😃", "", "", "😄", "", "", "😁", "", ""}, {"", "", ""}};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected[i], &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 2);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess4) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess4.";
// Test BertTokenizer with keep_whitespace true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(7);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(1);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer = text::BertTokenizer(vocab, "##", 100, "[UNK]", false, true);
EXPECT_NE(bert_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({bert_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"[UNK]", " ", "[CLS]"};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected, &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess5) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess5.";
// Test BertTokenizer with unknown_token empty and keep_whitespace true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(7);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(1);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer = text::BertTokenizer(vocab, "##", 100, "", false, true);
EXPECT_NE(bert_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({bert_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"unused", " ", "[CLS]"};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected, &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess6) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess6.";
// Test BertTokenizer with preserve_unused_token false, unknown_token empty and keep_whitespace true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(7);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(1);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer =
text::BertTokenizer(vocab, "##", 100, "", false, true, NormalizeForm::kNone, false);
EXPECT_NE(bert_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({bert_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"unused", " ", "[", "CLS", "]"};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected, &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess7) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess7.";
// Test BertTokenizer with with_offsets true and lower_case true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(4);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(1);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer =
text::BertTokenizer(vocab, "##", 100, "[UNK]", true, false, NormalizeForm::kNone, true, true);
EXPECT_NE(bert_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({bert_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected_tokens = {"i", "am", "mak", "##ing", "small", "mistake",
"##s", "during", "work", "##ing", "hour", "##s"};
std::vector<uint32_t> expected_offsets_start = {0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46};
std::vector<uint32_t> expected_offsets_limit = {1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["token"];
std::shared_ptr<Tensor> expected_token_tensor;
Tensor::CreateFromVector(expected_tokens, &expected_token_tensor);
EXPECT_EQ(*ind, *expected_token_tensor);
auto start = row["offsets_start"];
std::shared_ptr<Tensor> expected_start_tensor;
Tensor::CreateFromVector(expected_offsets_start, &expected_start_tensor);
EXPECT_EQ(*start, *expected_start_tensor);
auto limit = row["offsets_limit"];
std::shared_ptr<Tensor> expected_limit_tensor;
Tensor::CreateFromVector(expected_offsets_limit, &expected_limit_tensor);
EXPECT_EQ(*limit, *expected_limit_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBertTokenizerFail1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail1.";
// Test BertTokenizer with nullptr vocab
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer = text::BertTokenizer(nullptr);
// Expect failure: invalid BertTokenizer input with nullptr vocab
EXPECT_EQ(bert_tokenizer, nullptr);
}
TEST_F(MindDataTestPipeline, TestBertTokenizerFail2) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail2.";
// Test BertTokenizer with negative max_bytes_per_token
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer = text::BertTokenizer(vocab, "##", -1);
// Expect failure: invalid BertTokenizer input with negative max_bytes_per_token
EXPECT_EQ(bert_tokenizer, nullptr);
}
TEST_F(MindDataTestPipeline, TestCaseFoldSuccess) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaseFoldSuccess.";