!12349 [MD] Push down IR files for transforms and text

From: @tina_mengting_zhang
Reviewed-by:
Signed-off-by:

commit ed7fef5d5e
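A note on the pattern before the diff: every IR class this commit moves follows the same contract. A TensorOperation subclass holds the user-facing parameters, ValidateParams() checks them and returns a Status, and Build() materializes the runtime TensorOp kernel. A minimal sketch of a call site (hypothetical; only the interfaces visible in this diff are assumed):

    // Hypothetical usage sketch, not part of the commit.
    std::shared_ptr<TensorOperation> op =
        std::make_shared<text::SlidingWindowOperation>(/*width=*/3, /*axis=*/0);
    Status rc = op->ValidateParams();   // parameter checking is separate from construction
    if (rc.IsError()) {
      return nullptr;                   // the factory functions in this diff do exactly this
    }
    std::shared_ptr<TensorOp> runtime_op = op->Build();  // e.g. a SlidingWindowOp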
@@ -92,11 +92,14 @@ add_dependencies(engine core)
 add_dependencies(callback core)
 add_dependencies(text core)
 add_dependencies(text-kernels core)
+add_dependencies(text-ir core)
+add_dependencies(text-ir-kernels core)
 add_dependencies(cpp-API core)
 add_dependencies(engine-ir-datasetops core)
 add_dependencies(engine-ir-datasetops-source core)
 add_dependencies(engine-ir-cache core)
 add_dependencies(kernels-ir core)
+add_dependencies(kernels-ir-data core)
 add_dependencies(kernels-ir-vision core)

 if(ENABLE_ACL)
@@ -146,7 +149,10 @@ set(submodules
     $<TARGET_OBJECTS:engine>
     $<TARGET_OBJECTS:text>
     $<TARGET_OBJECTS:text-kernels>
+    $<TARGET_OBJECTS:text-ir>
+    $<TARGET_OBJECTS:text-ir-kernels>
     $<TARGET_OBJECTS:kernels-ir>
+    $<TARGET_OBJECTS:kernels-ir-data>
     $<TARGET_OBJECTS:kernels-ir-vision>
 )

@@ -17,9 +17,9 @@
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/core/global_context.h"
#include "minddata/dataset/include/transforms.h"

#include "minddata/dataset/kernels/py_func_op.h"
#include "minddata/dataset/kernels/ir/data/transforms_ir.h"
#include "minddata/dataset/kernels/ir/vision/vision_ir.h"

namespace mindspore {
@@ -18,7 +18,7 @@
#include "pybind11/stl.h"
#include "pybind11/stl_bind.h"
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/include/text.h"
#include "minddata/dataset/text/ir/kernels/text_ir.h"
#include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vocab.h"
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,10 +30,10 @@
#include "pybind11/stl_bind.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/samplers.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/engine/ir/cache/pre_built_dataset_cache.h"
#include "minddata/dataset/engine/ir/datasetops/source/csv_node.h"
#include "minddata/dataset/kernels/ir/data/transforms_ir.h"
#include "minddata/dataset/kernels/py_func_op.h"
namespace py = pybind11;

@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,30 +17,6 @@
#include <unistd.h>

#include "minddata/dataset/include/text.h"
#ifndef _WIN32
#include "minddata/dataset/text/kernels/basic_tokenizer_op.h"
#include "minddata/dataset/text/kernels/bert_tokenizer_op.h"
#include "minddata/dataset/text/kernels/case_fold_op.h"
#endif
#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
#include "minddata/dataset/text/kernels/lookup_op.h"
#include "minddata/dataset/text/kernels/ngram_op.h"
#ifndef _WIN32
#include "minddata/dataset/text/kernels/normalize_utf8_op.h"
#include "minddata/dataset/text/kernels/regex_replace_op.h"
#include "minddata/dataset/text/kernels/regex_tokenizer_op.h"
#endif
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
#include "minddata/dataset/text/kernels/sliding_window_op.h"
#include "minddata/dataset/text/kernels/to_number_op.h"
#include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h"
#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
#ifndef _WIN32
#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
#endif
#include "minddata/dataset/core/data_type.h"
#include "minddata/dataset/util/path.h"

namespace mindspore {
namespace dataset {
@@ -174,426 +150,6 @@ std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offs
  return op->ValidateParams() ? op : nullptr;
}
#endif

/* ####################################### Validator Functions ############################################ */

// Helper function to validate tokenizer directory parameter
Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file) {
  if (tokenizer_file.empty()) {
    std::string err_msg = tokenizer_name + ": tokenizer_file is not specified.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  Path file(tokenizer_file);
  if (!file.Exists()) {
    std::string err_msg = tokenizer_name + ": tokenizer_file: [" + tokenizer_file + "] is an invalid directory path.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  if (access(tokenizer_file.c_str(), R_OK) == -1) {
    std::string err_msg = tokenizer_name + ": No access to specified tokenizer path: " + tokenizer_file;
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

// Helper functions to help validate data type passed by user
bool IsTypeNumeric(const std::string &data_type) {
  if (data_type == "int8" || data_type == "uint8" || data_type == "int16" || data_type == "uint16" ||
      data_type == "int32" || data_type == "uint32" || data_type == "int64" || data_type == "uint64" ||
      data_type == "float16" || data_type == "float32" || data_type == "float64")
    return true;
  return false;
}

bool IsTypeBoolean(const std::string &data_type) { return data_type == "bool"; }

bool IsTypeString(const std::string &data_type) { return data_type == "string"; }

/* ####################################### Derived TensorOperation classes ################################# */

// (In alphabetical order)

#ifndef _WIN32
// BasicTokenizerOperation
BasicTokenizerOperation::BasicTokenizerOperation(bool lower_case, bool keep_whitespace,
                                                 const NormalizeForm normalize_form, bool preserve_unused_token,
                                                 bool with_offsets)
    : lower_case_(lower_case),
      keep_whitespace_(keep_whitespace),
      normalize_form_(normalize_form),
      preserve_unused_token_(preserve_unused_token),
      with_offsets_(with_offsets) {}

Status BasicTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> BasicTokenizerOperation::Build() {
  std::shared_ptr<BasicTokenizerOp> tensor_op = std::make_shared<BasicTokenizerOp>(
    lower_case_, keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_);
  return tensor_op;
}

// BertTokenizerOperation
BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
                                               int32_t max_bytes_per_token, const std::string &unknown_token,
                                               bool lower_case, bool keep_whitespace,
                                               const NormalizeForm normalize_form, bool preserve_unused_token,
                                               bool with_offsets)
    : vocab_(vocab),
      suffix_indicator_(suffix_indicator),
      max_bytes_per_token_(max_bytes_per_token),
      unknown_token_(unknown_token),
      lower_case_(lower_case),
      keep_whitespace_(keep_whitespace),
      normalize_form_(normalize_form),
      preserve_unused_token_(preserve_unused_token),
      with_offsets_(with_offsets) {}

BertTokenizerOperation::~BertTokenizerOperation() = default;

Status BertTokenizerOperation::ValidateParams() {
  if (vocab_ == nullptr) {
    std::string err_msg = "BertTokenizer: vocab object type is incorrect or null.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  if (max_bytes_per_token_ < 0) {
    std::string err_msg = "BertTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " +
                          std::to_string(max_bytes_per_token_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> BertTokenizerOperation::Build() {
  std::shared_ptr<BertTokenizerOp> tensor_op =
    std::make_shared<BertTokenizerOp>(vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, lower_case_,
                                      keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_);
  return tensor_op;
}

// CaseFoldOperation
Status CaseFoldOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> CaseFoldOperation::Build() {
  std::shared_ptr<CaseFoldOp> tensor_op = std::make_shared<CaseFoldOp>();
  return tensor_op;
}
#endif

// JiebaTokenizerOperation
JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path,
                                                 const JiebaMode &mode, bool with_offsets)
    : hmm_path_(hmm_path), mp_path_(mp_path), mode_(mode), with_offsets_(with_offsets) {}

Status JiebaTokenizerOperation::ValidateParams() {
  if (hmm_path_.empty()) {
    std::string err_msg = "JiebaTokenizer: The dict of HMMSegment in cppjieba is not provided.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  if (mp_path_.empty()) {
    std::string err_msg = "JiebaTokenizer: The dict of MPSegment in cppjieba is not provided.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_));
  RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", mp_path_));
  return Status::OK();
}

std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() {
  std::shared_ptr<JiebaTokenizerOp> tensor_op =
    std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_);
  for (auto &word : words_list_) {
    Status rc = tensor_op->AddWord(word.first, word.second);
    if (rc.IsError()) {
      MS_LOG(ERROR) << rc;
      return {};
    }
  }
  return tensor_op;
}

Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) {
  if (word.empty()) {
    std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  if (freq < 0) {
    std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  words_list_.emplace_back(word, freq);
  return Status::OK();
}

// LookupOperation
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
                                 const std::string &data_type)
    : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}

LookupOperation::~LookupOperation() = default;

Status LookupOperation::ValidateParams() {
  if (vocab_ == nullptr) {
    std::string err_msg = "Lookup: vocab object type is incorrect or null.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  if (unknown_token_ != std::nullopt) {
    default_id_ = vocab_->Lookup(*unknown_token_);
    if (default_id_ == Vocab::kNoTokenExists) {
      std::string err_msg = "Lookup: \"" + *unknown_token_ + "\" doesn't exist in vocab.";
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
  }

  if (!IsTypeNumeric(data_type_)) {
    std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> LookupOperation::Build() {
  std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, DataType(data_type_));
  return tensor_op;
}

// NgramOperation
NgramOperation::NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
                               const std::pair<std::string, int32_t> &right_pad, const std::string &separator)
    : ngrams_(ngrams), left_pad_(left_pad), right_pad_(right_pad), separator_(separator) {}

Status NgramOperation::ValidateParams() {
  if (ngrams_.size() == 0) {
    std::string err_msg = "Ngram : Container cannot be empty.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  } else {
    for (int32_t i = 0; i < ngrams_.size(); ++i) {
      if (ngrams_[i] <= 0) {
        std::string err_msg =
          "Ngram : The value of ngrams vector must be greater than 0: " + std::to_string(ngrams_[i]);
        MS_LOG(ERROR) << err_msg;
        RETURN_STATUS_SYNTAX_ERROR(err_msg);
      }
    }
  }

  if (left_pad_.second < 0) {
    std::string err_msg =
      "Ngram : The second parameter pad_width in left_pad vector must be greater than or equal to 0: " +
      std::to_string(left_pad_.second);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  if (right_pad_.second < 0) {
    std::string err_msg =
      "Ngram : The second parameter pad_width in right_pad vector must be greater than or equal to 0: " +
      std::to_string(right_pad_.second);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  return Status::OK();
}

std::shared_ptr<TensorOp> NgramOperation::Build() {
  int32_t l_len = left_pad_.second;
  int32_t r_len = right_pad_.second;
  std::string l_pad = left_pad_.first;
  std::string r_pad = right_pad_.first;
  std::shared_ptr<NgramOp> tensor_op = std::make_shared<NgramOp>(ngrams_, l_len, r_len, l_pad, r_pad, separator_);
  return tensor_op;
}

#ifndef _WIN32
// NormalizeUTF8Operation
NormalizeUTF8Operation::NormalizeUTF8Operation(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}

Status NormalizeUTF8Operation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> NormalizeUTF8Operation::Build() {
  std::shared_ptr<NormalizeUTF8Op> tensor_op = std::make_shared<NormalizeUTF8Op>(normalize_form_);
  return tensor_op;
}

// RegexReplaceOperation
RegexReplaceOperation::RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all)
    : pattern_(pattern), replace_(replace), replace_all_(replace_all) {}

Status RegexReplaceOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> RegexReplaceOperation::Build() {
  std::shared_ptr<RegexReplaceOp> tensor_op = std::make_shared<RegexReplaceOp>(pattern_, replace_, replace_all_);
  return tensor_op;
}

// RegexTokenizerOperation
RegexTokenizerOperation::RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern,
                                                 bool with_offsets)
    : delim_pattern_(delim_pattern), keep_delim_pattern_(keep_delim_pattern), with_offsets_(with_offsets) {}

Status RegexTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> RegexTokenizerOperation::Build() {
  std::shared_ptr<RegexTokenizerOp> tensor_op =
    std::make_shared<RegexTokenizerOp>(delim_pattern_, keep_delim_pattern_, with_offsets_);
  return tensor_op;
}
#endif

// SentencePieceTokenizerOperation
SentencePieceTokenizerOperation::~SentencePieceTokenizerOperation() = default;

SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
                                                                 SPieceTokenizerOutType out_type)
    : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {}

SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::string &vocab_path,
                                                                 SPieceTokenizerOutType out_type)
    : vocab_(nullptr), vocab_path_(vocab_path), load_type_(SPieceTokenizerLoadType::kFile), out_type_(out_type) {}

Status SentencePieceTokenizerOperation::ValidateParams() {
  if (load_type_ == SPieceTokenizerLoadType::kModel) {
    if (vocab_ == nullptr) {
      std::string err_msg = "SentencePieceTokenizer: vocab object type is incorrect or null.";
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
  } else {
    Path vocab_file(vocab_path_);
    if (!vocab_file.Exists() || vocab_file.IsDirectory()) {
      std::string err_msg = "SentencePieceTokenizer : vocab file: [" + vocab_path_ + "] is invalid or does not exist.";
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
    if (access(vocab_file.toString().c_str(), R_OK) == -1) {
      std::string err_msg = "SentencePieceTokenizer : no access to specified dataset file: " + vocab_path_;
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
  }
  return Status::OK();
}

std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() {
  std::shared_ptr<SentencePieceTokenizerOp> tensor_op;
  if (load_type_ == SPieceTokenizerLoadType::kModel) {
    tensor_op = std::make_shared<SentencePieceTokenizerOp>(vocab_, load_type_, out_type_);
  } else {
    Path vocab_file(vocab_path_);
    std::string model_path = vocab_file.ParentPath();
    std::string model_filename = vocab_file.Basename();
    tensor_op = std::make_shared<SentencePieceTokenizerOp>(model_path, model_filename, load_type_, out_type_);
  }
  return tensor_op;
}

// SlidingWindowOperation
SlidingWindowOperation::SlidingWindowOperation(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {}

Status SlidingWindowOperation::ValidateParams() {
  if (width_ < 1) {
    std::string err_msg =
      "SlidingWindow : The parameter width must be greater than or equal to 1: " + std::to_string(width_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  return Status::OK();
}

std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
  std::shared_ptr<SlidingWindowOp> tensor_op = std::make_shared<SlidingWindowOp>(static_cast<uint32_t>(width_), axis_);
  return tensor_op;
}

// ToNumberOperation
ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {}

Status ToNumberOperation::ValidateParams() {
  if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) {
    std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_;
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> ToNumberOperation::Build() {
  std::shared_ptr<ToNumberOp> tensor_op = std::make_shared<ToNumberOp>(data_type_);
  return tensor_op;
}

TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {}

Status TruncateSequencePairOperation::ValidateParams() {
  if (max_length_ < 0) {
    std::string err_msg = "TruncateSequencePair : The parameter max_length must be greater than or equal to 0: " +
                          std::to_string(max_length_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> TruncateSequencePairOperation::Build() {
  std::shared_ptr<TruncateSequencePairOp> tensor_op = std::make_shared<TruncateSequencePairOp>(max_length_);
  return tensor_op;
}

// UnicodeCharTokenizerOperation
UnicodeCharTokenizerOperation::UnicodeCharTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}

Status UnicodeCharTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() {
  std::shared_ptr<UnicodeCharTokenizerOp> tensor_op = std::make_shared<UnicodeCharTokenizerOp>(with_offsets_);
  return tensor_op;
}

#ifndef _WIN32
// UnicodeScriptTokenizerOperation
UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets)
    : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}

Status UnicodeScriptTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> UnicodeScriptTokenizerOperation::Build() {
  std::shared_ptr<UnicodeScriptTokenizerOp> tensor_op =
    std::make_shared<UnicodeScriptTokenizerOp>(keep_whitespace_, with_offsets_);
  return tensor_op;
}

// WhitespaceTokenizerOperation
WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}

Status WhitespaceTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> WhitespaceTokenizerOperation::Build() {
  std::shared_ptr<WhitespaceTokenizerOp> tensor_op = std::make_shared<WhitespaceTokenizerOp>(with_offsets_);
  return tensor_op;
}
#endif

} // namespace text
} // namespace dataset
} // namespace mindspore
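Two stateful validators in the block above are worth pausing on: ValidateTokenizerDirParam rejects empty, missing, and unreadable dictionary files, and LookupOperation::ValidateParams resolves unknown_token to default_id_ as a side effect. A hedged sketch of how a caller hits both paths (paths and vocab are hypothetical; JiebaMode::kMix is assumed from the enum used above):

    // Hypothetical usage sketch, not part of the commit.
    auto jieba = std::make_shared<text::JiebaTokenizerOperation>(
        "/data/hmm_model.utf8",   // hypothetical hmm_path, checked by ValidateTokenizerDirParam
        "/data/jieba.dict.utf8",  // hypothetical mp_path, same empty/missing/unreadable checks
        JiebaMode::kMix, /*with_offsets=*/false);
    Status rc = jieba->AddWord("mindspore", /*freq=*/10);  // queued now, applied inside Build()

    // Lookup: a non-null unknown_token is resolved during validation, so later
    // out-of-vocabulary tokens map to default_id_ instead of failing at runtime.
    auto lookup = std::make_shared<text::LookupOperation>(
        vocab, std::optional<std::string>("<unk>"), "int32");  // "int32" satisfies IsTypeNumeric
    if (lookup->ValidateParams().IsError()) {
      // reached when vocab is null, "<unk>" is absent from vocab, or data_type is non-numeric
    }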
@@ -15,18 +15,6 @@
 */

#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/kernels/ir/validators.h"

// Kernel data headers (in alphabetical order)
#include "minddata/dataset/kernels/data/compose_op.h"
#include "minddata/dataset/kernels/data/duplicate_op.h"
#include "minddata/dataset/kernels/data/one_hot_op.h"
#include "minddata/dataset/kernels/data/random_apply_op.h"
#include "minddata/dataset/kernels/data/random_choice_op.h"
#include "minddata/dataset/kernels/data/type_cast_op.h"
#ifndef ENABLE_ANDROID
#include "minddata/dataset/kernels/data/unique_op.h"
#endif

namespace mindspore {
namespace dataset {
@@ -88,122 +76,6 @@ std::shared_ptr<UniqueOperation> Unique()
  return op->ValidateParams() ? op : nullptr;
}
#endif

/* ####################################### Validator Functions ############################################ */

/* ####################################### Derived TensorOperation classes ################################# */

// (In alphabetical order)

// ComposeOperation
ComposeOperation::ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms)
    : transforms_(transforms) {}

Status ComposeOperation::ValidateParams() {
  RETURN_IF_NOT_OK(ValidateVectorTransforms("Compose", transforms_));
  return Status::OK();
}

std::shared_ptr<TensorOp> ComposeOperation::Build() {
  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
  return std::make_shared<ComposeOp>(tensor_ops);
}

// DuplicateOperation
Status DuplicateOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> DuplicateOperation::Build() { return std::make_shared<DuplicateOp>(); }

// OneHotOperation
OneHotOperation::OneHotOperation(int32_t num_classes) : num_classes_(num_classes) {}

Status OneHotOperation::ValidateParams() {
  if (num_classes_ <= 0) {
    std::string err_msg = "OneHot: Number of classes must be greater than 0, but got: " + std::to_string(num_classes_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> OneHotOperation::Build() { return std::make_shared<OneHotOp>(num_classes_); }

// PreBuiltOperation
PreBuiltOperation::PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op) : op_(tensor_op) {}

Status PreBuiltOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> PreBuiltOperation::Build() { return op_; }

std::string PreBuiltOperation::Name() const { return op_ ? op_->Name() : kPreBuiltOperation; }

Status PreBuiltOperation::to_json(nlohmann::json *out_json) {
  RETURN_IF_NOT_OK(op_->to_json(out_json));
  return Status::OK();
}

// RandomApplyOperation
RandomApplyOperation::RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob)
    : TensorOperation(true), transforms_(transforms), prob_(prob) {}

Status RandomApplyOperation::ValidateParams() {
  RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomApply", transforms_));
  RETURN_IF_NOT_OK(ValidateProbability("RandomApply", prob_));
  return Status::OK();
}

std::shared_ptr<TensorOp> RandomApplyOperation::Build() {
  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
  return std::make_shared<RandomApplyOp>(prob_, tensor_ops);
}

// RandomChoiceOperation
RandomChoiceOperation::RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms)
    : TensorOperation(true), transforms_(transforms) {}

Status RandomChoiceOperation::ValidateParams() {
  RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomChoice", transforms_));
  return Status::OK();
}

std::shared_ptr<TensorOp> RandomChoiceOperation::Build() {
  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
  return std::make_shared<RandomChoiceOp>(tensor_ops);
}

// TypeCastOperation
TypeCastOperation::TypeCastOperation(std::string data_type) : data_type_(data_type) {}

Status TypeCastOperation::ValidateParams() {
  std::vector<std::string> predefine_type = {"bool", "int8", "uint8", "int16", "uint16", "int32", "uint32",
                                             "int64", "uint64", "float16", "float32", "float64", "string"};
  auto itr = std::find(predefine_type.begin(), predefine_type.end(), data_type_);
  if (itr == predefine_type.end()) {
    std::string err_msg = "TypeCast: Invalid data type: " + data_type_;
    MS_LOG(ERROR) << "TypeCast: Only supports data type bool, int8, uint8, int16, uint16, int32, uint32, "
                  << "int64, uint64, float16, float32, float64, string, but got: " << data_type_;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> TypeCastOperation::Build() { return std::make_shared<TypeCastOp>(data_type_); }

#ifndef ENABLE_ANDROID
// UniqueOperation
Status UniqueOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> UniqueOperation::Build() { return std::make_shared<UniqueOp>(); }
#endif

} // namespace transforms
} // namespace dataset
} // namespace mindspore
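The validation style above is uniform across the data transforms: constructors never fail, and ValidateParams() reports the first bad parameter. A quick sketch of what passes and what is rejected (values hypothetical, APIs as in the code above):

    // Hypothetical usage sketch, not part of the commit.
    auto one_hot = std::make_shared<transforms::OneHotOperation>(10);  // num_classes > 0: OK
    auto bad_one_hot = std::make_shared<transforms::OneHotOperation>(0);
    // bad_one_hot->ValidateParams() fails with:
    //   "OneHot: Number of classes must be greater than 0, but got: 0"

    auto cast = std::make_shared<transforms::TypeCastOperation>("float32");  // in predefine_type
    auto bad_cast = std::make_shared<transforms::TypeCastOperation>("complex64");
    // bad_cast->ValidateParams() fails with "TypeCast: Invalid data type: complex64"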
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,9 +21,9 @@
#include "minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.h"

#include "minddata/dataset/engine/ir/datasetops/map_node.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/kernels/image/random_crop_and_resize_op.h"
#include "minddata/dataset/kernels/image/random_crop_decode_resize_op.h"
#include "minddata/dataset/kernels/ir/data/transforms_ir.h"
#include "minddata/dataset/kernels/ir/vision/vision_ir.h"

namespace mindspore {
@@ -27,6 +27,9 @@
 #include "minddata/dataset/include/constants.h"
 #include "minddata/dataset/include/transforms.h"
+
+// FIXME - This internal IR header will be removed when external API classes are provided
+#include "minddata/dataset/text/ir/kernels/text_ir.h"

 namespace mindspore {
 namespace dataset {

@@ -36,24 +39,6 @@ class SentencePieceVocab;
// Transform operations for text
namespace text {

// Char arrays storing name of corresponding classes (in alphabetical order)
constexpr char kBasicTokenizerOperation[] = "BasicTokenizer";
constexpr char kBertTokenizerOperation[] = "BertTokenizer";
constexpr char kCaseFoldOperation[] = "CaseFold";
constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer";
constexpr char kLookupOperation[] = "Lookup";
constexpr char kNgramOperation[] = "Ngram";
constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8";
constexpr char kRegexReplaceOperation[] = "RegexReplace";
constexpr char kRegexTokenizerOperation[] = "RegexTokenizer";
constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer";
constexpr char kSlidingWindowOperation[] = "SlidingWindow";
constexpr char kToNumberOperation[] = "ToNumber";
constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair";
constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer";
constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer";

// Text Op classes (in alphabetical order)
#ifndef _WIN32
class BasicTokenizerOperation;
@@ -255,309 +240,6 @@ std::shared_ptr<UnicodeScriptTokenizerOperation> UnicodeScriptTokenizer(bool kee
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets = false);
#endif

/* ####################################### Derived TensorOperation classes ################################# */

#ifndef _WIN32
class BasicTokenizerOperation : public TensorOperation {
 public:
  BasicTokenizerOperation(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
                          bool preserve_unused_token, bool with_offsets);

  ~BasicTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kBasicTokenizerOperation; }

 private:
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};

class BertTokenizerOperation : public TensorOperation {
 public:
  BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
                         int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case,
                         bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
                         bool with_offsets);

  ~BertTokenizerOperation();

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kBertTokenizerOperation; }

 private:
  std::shared_ptr<Vocab> vocab_;
  std::string suffix_indicator_;
  int32_t max_bytes_per_token_;
  std::string unknown_token_;
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};

class CaseFoldOperation : public TensorOperation {
 public:
  CaseFoldOperation() = default;

  ~CaseFoldOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kCaseFoldOperation; }
};
#endif

class JiebaTokenizerOperation : public TensorOperation {
 public:
  explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode,
                                   bool with_offsets);

  ~JiebaTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kJiebaTokenizerOperation; }

  Status AddWord(const std::string &word, int64_t freq = 0);

 private:
  std::string hmm_path_;
  std::string mp_path_;
  JiebaMode mode_;
  bool with_offsets_;
  std::vector<std::pair<std::string, int64_t>> words_list_;
};

class LookupOperation : public TensorOperation {
 public:
  explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
                           const std::string &data_type);

  ~LookupOperation();

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kLookupOperation; }

 private:
  std::shared_ptr<Vocab> vocab_;
  std::optional<std::string> unknown_token_;
  int32_t default_id_;
  std::string data_type_;
};

class NgramOperation : public TensorOperation {
 public:
  explicit NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
                          const std::pair<std::string, int32_t> &right_pad, const std::string &separator);

  ~NgramOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kNgramOperation; }

 private:
  std::vector<int32_t> ngrams_;
  std::pair<std::string, int32_t> left_pad_;
  std::pair<std::string, int32_t> right_pad_;
  std::string separator_;
};

#ifndef _WIN32
class NormalizeUTF8Operation : public TensorOperation {
 public:
  explicit NormalizeUTF8Operation(NormalizeForm normalize_form);

  ~NormalizeUTF8Operation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kNormalizeUTF8Operation; }

 private:
  NormalizeForm normalize_form_;
};

class RegexReplaceOperation : public TensorOperation {
 public:
  RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all);

  ~RegexReplaceOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kRegexReplaceOperation; }

 private:
  std::string pattern_;
  std::string replace_;
  bool replace_all_;
};

class RegexTokenizerOperation : public TensorOperation {
 public:
  explicit RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, bool with_offsets);

  ~RegexTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kRegexTokenizerOperation; }

 private:
  std::string delim_pattern_;
  std::string keep_delim_pattern_;
  bool with_offsets_;
};
#endif

class SentencePieceTokenizerOperation : public TensorOperation {
 public:
  SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type);

  SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type);

  ~SentencePieceTokenizerOperation();

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kSentencepieceTokenizerOperation; }

 private:
  std::shared_ptr<SentencePieceVocab> vocab_;
  std::string vocab_path_;
  SPieceTokenizerLoadType load_type_;
  SPieceTokenizerOutType out_type_;
};

class SlidingWindowOperation : public TensorOperation {
 public:
  explicit SlidingWindowOperation(const int32_t width, const int32_t axis);

  ~SlidingWindowOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kSlidingWindowOperation; }

 private:
  int32_t width_;
  int32_t axis_;
};

class ToNumberOperation : public TensorOperation {
 public:
  explicit ToNumberOperation(std::string data_type);

  ~ToNumberOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kToNumberOperation; }

 private:
  std::string data_type_;
};

class TruncateSequencePairOperation : public TensorOperation {
 public:
  explicit TruncateSequencePairOperation(int32_t max_length);

  ~TruncateSequencePairOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kTruncateSequencePairOperation; }

 private:
  int32_t max_length_;
};

class UnicodeCharTokenizerOperation : public TensorOperation {
 public:
  explicit UnicodeCharTokenizerOperation(bool with_offsets);

  ~UnicodeCharTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kUnicodeCharTokenizerOperation; }

 private:
  bool with_offsets_;
};

#ifndef _WIN32
class UnicodeScriptTokenizerOperation : public TensorOperation {
 public:
  explicit UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets);

  ~UnicodeScriptTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kUnicodeScriptTokenizerOperation; }

 private:
  bool keep_whitespace_;
  bool with_offsets_;
};

class WhitespaceTokenizerOperation : public TensorOperation {
 public:
  explicit WhitespaceTokenizerOperation(bool with_offsets);

  ~WhitespaceTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kWhitespaceTokenizerOperation; }

 private:
  bool with_offsets_;
};
#endif
} // namespace text
} // namespace dataset
} // namespace mindspore
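The header keeps the user-facing factory functions while the Operation classes migrate behind the IR boundary. The factory contract, as declared above, folds construction and validation into one call; a small hypothetical call site:

    // Hypothetical usage sketch, not part of the commit.
    #ifndef _WIN32  // mirrors the platform guard in the header above
    std::shared_ptr<text::WhitespaceTokenizerOperation> op = text::WhitespaceTokenizer(/*with_offsets=*/true);
    if (op == nullptr) {
      // Factories return nullptr on invalid parameters, via the
      // "return op->ValidateParams() ? op : nullptr;" pattern seen earlier in this diff.
    }
    #endif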
@@ -25,40 +25,12 @@
#include "include/api/status.h"
#include "minddata/dataset/include/constants.h"

// (TEMPORARY) will be removed when Tensor op ir moved down
#include "minddata/dataset/kernels/ir/tensor_operation.h"

#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_
#define INCLUDE_NLOHMANN_JSON_FWD_HPP_
namespace nlohmann {
template <typename T = void, typename SFINAE = void>
struct adl_serializer;
template <template <typename U, typename V, typename... Args> class ObjectType = std::map,
          template <typename U, typename... Args> class ArrayType = std::vector, class StringType = std::string,
          class BooleanType = bool, class NumberIntegerType = std::int64_t, class NumberUnsignedType = std::uint64_t,
          class NumberFloatType = double, template <typename U> class AllocatorType = std::allocator,
          template <typename T, typename SFINAE = void> class JSONSerializer = adl_serializer>
class basic_json;
template <typename BasicJsonType>
class json_pointer;
using json = basic_json<>;
} // namespace nlohmann
#endif // INCLUDE_NLOHMANN_JSON_FWD_HPP_
// FIXME - This internal IR header will be removed when external API classes are provided
#include "minddata/dataset/kernels/ir/data/transforms_ir.h"

namespace mindspore {
namespace dataset {

// Char arrays storing name of corresponding classes (in alphabetical order)
constexpr char kComposeOperation[] = "Compose";
constexpr char kDuplicateOperation[] = "Duplicate";
constexpr char kOneHotOperation[] = "OneHot";
constexpr char kPreBuiltOperation[] = "PreBuilt";
constexpr char kRandomApplyOperation[] = "RandomApply";
constexpr char kRandomChoiceOperation[] = "RandomChoice";
constexpr char kRandomSelectSubpolicyOperation[] = "RandomSelectSubpolicy";
constexpr char kTypeCastOperation[] = "TypeCast";
constexpr char kUniqueOperation[] = "Unique";

// Transform operations for performing data transformation.
namespace transforms {
@@ -119,134 +91,6 @@ std::shared_ptr<TypeCastOperation> TypeCast(std::string data_type);
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<UniqueOperation> Unique();
#endif

/* ####################################### Derived TensorOperation classes ################################# */

class ComposeOperation : public TensorOperation {
 public:
  explicit ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms);

  ~ComposeOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kComposeOperation; }

 private:
  std::vector<std::shared_ptr<TensorOperation>> transforms_;
};

class DuplicateOperation : public TensorOperation {
 public:
  DuplicateOperation() = default;

  ~DuplicateOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kDuplicateOperation; }
};

class OneHotOperation : public TensorOperation {
 public:
  explicit OneHotOperation(int32_t num_classes_);

  ~OneHotOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kOneHotOperation; }

 private:
  float num_classes_;
};

class PreBuiltOperation : public TensorOperation {
 public:
  explicit PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op);

  ~PreBuiltOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override;

  Status to_json(nlohmann::json *out_json) override;

 private:
  std::shared_ptr<TensorOp> op_;
};

class RandomApplyOperation : public TensorOperation {
 public:
  explicit RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob);

  ~RandomApplyOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kRandomApplyOperation; }

 private:
  std::vector<std::shared_ptr<TensorOperation>> transforms_;
  double prob_;
};

class RandomChoiceOperation : public TensorOperation {
 public:
  explicit RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms);

  ~RandomChoiceOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kRandomChoiceOperation; }

 private:
  std::vector<std::shared_ptr<TensorOperation>> transforms_;
};
class TypeCastOperation : public TensorOperation {
 public:
  explicit TypeCastOperation(std::string data_type);

  ~TypeCastOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kTypeCastOperation; }

 private:
  std::string data_type_;
};

#ifndef ENABLE_ANDROID
class UniqueOperation : public TensorOperation {
 public:
  UniqueOperation() = default;

  ~UniqueOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kUniqueOperation; }
};
#endif
} // namespace transforms
} // namespace dataset
} // namespace mindspore
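The nlohmann forward-declaration block removed above existed so transforms.h could declare Status to_json(nlohmann::json *out_json) without forcing every includer to parse the full json header; with the IR pushed down, transforms_ir.h takes over that concern. The underlying technique, reduced to a sketch with a made-up fastjson::Document type (purely illustrative, not the commit's code):

    // Hypothetical sketch of the forward-declaration technique, not part of the commit.
    // header.h: an incomplete type is enough to declare pointer parameters.
    namespace fastjson {
    class Document;  // full definition lives in a heavy header we avoid including here
    }

    class SerializableOperation {
     public:
      Status to_json(fastjson::Document *out_json);  // declared against the incomplete type
    };

    // source.cc would #include the real fastjson header and define to_json there,
    // so only that one translation unit pays the compile-time cost.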
@@ -1,3 +1,4 @@
+add_subdirectory(data)
 add_subdirectory(vision)
 file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
@@ -0,0 +1,8 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)

set(DATASET_KERNELS_IR_DATA_SRC_FILES
    transforms_ir.cc
)

add_library(kernels-ir-data OBJECT ${DATASET_KERNELS_IR_DATA_SRC_FILES})
@@ -0,0 +1,155 @@
/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <algorithm>

#include "minddata/dataset/kernels/ir/data/transforms_ir.h"

// Kernel data headers (in alphabetical order)
#include "minddata/dataset/kernels/data/compose_op.h"
#include "minddata/dataset/kernels/data/duplicate_op.h"
#include "minddata/dataset/kernels/data/one_hot_op.h"
#include "minddata/dataset/kernels/data/random_apply_op.h"
#include "minddata/dataset/kernels/data/random_choice_op.h"
#include "minddata/dataset/kernels/data/type_cast_op.h"
#ifndef ENABLE_ANDROID
#include "minddata/dataset/kernels/data/unique_op.h"
#endif

#include "minddata/dataset/kernels/ir/validators.h"

namespace mindspore {
namespace dataset {

// Transform operations for data.
namespace transforms {

/* ####################################### Derived TensorOperation classes ################################# */

// (In alphabetical order)

// ComposeOperation
ComposeOperation::ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms)
    : transforms_(transforms) {}

Status ComposeOperation::ValidateParams() {
  RETURN_IF_NOT_OK(ValidateVectorTransforms("Compose", transforms_));
  return Status::OK();
}

std::shared_ptr<TensorOp> ComposeOperation::Build() {
  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
  return std::make_shared<ComposeOp>(tensor_ops);
}

// DuplicateOperation
Status DuplicateOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> DuplicateOperation::Build() { return std::make_shared<DuplicateOp>(); }

// OneHotOperation
OneHotOperation::OneHotOperation(int32_t num_classes) : num_classes_(num_classes) {}

Status OneHotOperation::ValidateParams() {
  if (num_classes_ <= 0) {
    std::string err_msg = "OneHot: Number of classes must be greater than 0, but got: " + std::to_string(num_classes_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> OneHotOperation::Build() { return std::make_shared<OneHotOp>(num_classes_); }

// PreBuiltOperation
PreBuiltOperation::PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op) : op_(tensor_op) {}

Status PreBuiltOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> PreBuiltOperation::Build() { return op_; }

std::string PreBuiltOperation::Name() const { return op_ ? op_->Name() : kPreBuiltOperation; }

Status PreBuiltOperation::to_json(nlohmann::json *out_json) {
  RETURN_IF_NOT_OK(op_->to_json(out_json));
  return Status::OK();
}

// RandomApplyOperation
RandomApplyOperation::RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob)
    : TensorOperation(true), transforms_(transforms), prob_(prob) {}

Status RandomApplyOperation::ValidateParams() {
  RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomApply", transforms_));
  RETURN_IF_NOT_OK(ValidateProbability("RandomApply", prob_));
  return Status::OK();
}

std::shared_ptr<TensorOp> RandomApplyOperation::Build() {
  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
  return std::make_shared<RandomApplyOp>(prob_, tensor_ops);
}

// RandomChoiceOperation
RandomChoiceOperation::RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms)
    : TensorOperation(true), transforms_(transforms) {}

Status RandomChoiceOperation::ValidateParams() {
  RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomChoice", transforms_));
  return Status::OK();
}

std::shared_ptr<TensorOp> RandomChoiceOperation::Build() {
  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
  return std::make_shared<RandomChoiceOp>(tensor_ops);
}

// TypeCastOperation
TypeCastOperation::TypeCastOperation(std::string data_type) : data_type_(data_type) {}

Status TypeCastOperation::ValidateParams() {
  std::vector<std::string> predefine_type = {"bool", "int8", "uint8", "int16", "uint16", "int32", "uint32",
                                             "int64", "uint64", "float16", "float32", "float64", "string"};
  auto itr = std::find(predefine_type.begin(), predefine_type.end(), data_type_);
  if (itr == predefine_type.end()) {
    std::string err_msg = "TypeCast: Invalid data type: " + data_type_;
    MS_LOG(ERROR) << "TypeCast: Only supports data type bool, int8, uint8, int16, uint16, int32, uint32, "
                  << "int64, uint64, float16, float32, float64, string, but got: " << data_type_;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> TypeCastOperation::Build() { return std::make_shared<TypeCastOp>(data_type_); }

#ifndef ENABLE_ANDROID
// UniqueOperation
Status UniqueOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> UniqueOperation::Build() { return std::make_shared<UniqueOp>(); }
#endif

} // namespace transforms
} // namespace dataset
} // namespace mindspore
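transforms_ir.cc above repeats one Build() idiom three times (Compose, RandomApply, RandomChoice): convert the child TensorOperation list into runtime TensorOps with std::transform. The idiom in isolation, as a sketch reusing the TensorOperation/TensorOp types from this diff:

    // Hypothetical helper distilling the Build() idiom used in this file.
    #include <algorithm>
    #include <iterator>
    #include <memory>
    #include <vector>

    std::vector<std::shared_ptr<TensorOp>> BuildAll(
        const std::vector<std::shared_ptr<TensorOperation>> &transforms) {
      std::vector<std::shared_ptr<TensorOp>> tensor_ops;
      // (void) discards std::transform's returned iterator, matching the file's style.
      (void)std::transform(transforms.begin(), transforms.end(), std::back_inserter(tensor_ops),
                           [](std::shared_ptr<TensorOperation> op) { return op->Build(); });
      return tensor_ops;
    }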
@ -0,0 +1,172 @@
/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IR_DATA_TRANSFORMS_IR_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IR_DATA_TRANSFORMS_IR_H_

#include <map>
#include <memory>
#include <string>
#include <vector>

#include "minddata/dataset/kernels/ir/tensor_operation.h"

namespace mindspore {
namespace dataset {

// Char arrays storing name of corresponding classes (in alphabetical order)
constexpr char kComposeOperation[] = "Compose";
constexpr char kDuplicateOperation[] = "Duplicate";
constexpr char kOneHotOperation[] = "OneHot";
constexpr char kPreBuiltOperation[] = "PreBuilt";
constexpr char kRandomApplyOperation[] = "RandomApply";
constexpr char kRandomChoiceOperation[] = "RandomChoice";
constexpr char kTypeCastOperation[] = "TypeCast";
constexpr char kUniqueOperation[] = "Unique";

// Transform operations for performing data transformation.
namespace transforms {
/* ####################################### Derived TensorOperation classes ################################# */

class ComposeOperation : public TensorOperation {
 public:
  explicit ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms);

  ~ComposeOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kComposeOperation; }

 private:
  std::vector<std::shared_ptr<TensorOperation>> transforms_;
};

class DuplicateOperation : public TensorOperation {
 public:
  DuplicateOperation() = default;

  ~DuplicateOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kDuplicateOperation; }
};

class OneHotOperation : public TensorOperation {
 public:
  explicit OneHotOperation(int32_t num_classes);

  ~OneHotOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kOneHotOperation; }

 private:
  int32_t num_classes_;
};

class PreBuiltOperation : public TensorOperation {
 public:
  explicit PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op);

  ~PreBuiltOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override;

  Status to_json(nlohmann::json *out_json) override;

 private:
  std::shared_ptr<TensorOp> op_;
};

class RandomApplyOperation : public TensorOperation {
 public:
  explicit RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob);

  ~RandomApplyOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kRandomApplyOperation; }

 private:
  std::vector<std::shared_ptr<TensorOperation>> transforms_;
  double prob_;
};

class RandomChoiceOperation : public TensorOperation {
 public:
  explicit RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms);

  ~RandomChoiceOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kRandomChoiceOperation; }

 private:
  std::vector<std::shared_ptr<TensorOperation>> transforms_;
};

class TypeCastOperation : public TensorOperation {
 public:
  explicit TypeCastOperation(std::string data_type);

  ~TypeCastOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kTypeCastOperation; }

 private:
  std::string data_type_;
};

#ifndef ENABLE_ANDROID
class UniqueOperation : public TensorOperation {
 public:
  UniqueOperation() = default;

  ~UniqueOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kUniqueOperation; }
};
#endif
}  // namespace transforms
}  // namespace dataset
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IR_DATA_TRANSFORMS_IR_H_
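The header above fixes the shape every data-transform IR node shares: a name constant, eager ValidateParams(), Build() to lower to the runtime kernel, and Name(). A hedged sketch of what one more node would look like under this pattern; the Scale class and its kernel are invented for illustration and are not part of this commit:

// Hypothetical ScaleOperation, illustration only.
constexpr char kScaleOperation[] = "Scale";

class ScaleOperation : public TensorOperation {
 public:
  explicit ScaleOperation(float factor) : factor_(factor) {}

  ~ScaleOperation() = default;

  std::shared_ptr<TensorOp> Build() override;  // would create a (hypothetical) runtime ScaleOp

  Status ValidateParams() override {
    if (factor_ <= 0.0f) {
      RETURN_STATUS_SYNTAX_ERROR("Scale: factor must be greater than 0.");
    }
    return Status::OK();
  }

  std::string Name() const override { return kScaleOperation; }

 private:
  float factor_;
};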
@ -1,3 +1,4 @@
add_subdirectory(ir)
add_subdirectory(kernels)

file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
@ -0,0 +1,6 @@
add_subdirectory(kernels)

file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)

add_library(text-ir OBJECT validators.cc)
@ -0,0 +1,8 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)

set(DATASET_TEXT_IR_KERNELS_SRC_FILES
    text_ir.cc
    )

add_library(text-ir-kernels OBJECT ${DATASET_TEXT_IR_KERNELS_SRC_FILES})
@ -0,0 +1,436 @@
/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <unistd.h>

#include "minddata/dataset/text/ir/kernels/text_ir.h"

#ifndef _WIN32
#include "minddata/dataset/text/kernels/basic_tokenizer_op.h"
#include "minddata/dataset/text/kernels/bert_tokenizer_op.h"
#include "minddata/dataset/text/kernels/case_fold_op.h"
#endif
#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
#include "minddata/dataset/text/kernels/lookup_op.h"
#include "minddata/dataset/text/kernels/ngram_op.h"
#ifndef _WIN32
#include "minddata/dataset/text/kernels/normalize_utf8_op.h"
#include "minddata/dataset/text/kernels/regex_replace_op.h"
#include "minddata/dataset/text/kernels/regex_tokenizer_op.h"
#endif
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
#include "minddata/dataset/text/kernels/sliding_window_op.h"
#include "minddata/dataset/text/kernels/to_number_op.h"
#include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h"
#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
#ifndef _WIN32
#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
#endif
#include "minddata/dataset/core/data_type.h"
#include "minddata/dataset/util/path.h"

#include "minddata/dataset/text/ir/validators.h"

namespace mindspore {
namespace dataset {

// Transform operations for text.
namespace text {

/* ####################################### Derived TensorOperation classes ################################# */

// (In alphabetical order)

#ifndef _WIN32
// BasicTokenizerOperation
BasicTokenizerOperation::BasicTokenizerOperation(bool lower_case, bool keep_whitespace,
                                                 const NormalizeForm normalize_form, bool preserve_unused_token,
                                                 bool with_offsets)
    : lower_case_(lower_case),
      keep_whitespace_(keep_whitespace),
      normalize_form_(normalize_form),
      preserve_unused_token_(preserve_unused_token),
      with_offsets_(with_offsets) {}

Status BasicTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> BasicTokenizerOperation::Build() {
  std::shared_ptr<BasicTokenizerOp> tensor_op = std::make_shared<BasicTokenizerOp>(
    lower_case_, keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_);
  return tensor_op;
}

// BertTokenizerOperation
BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
                                               int32_t max_bytes_per_token, const std::string &unknown_token,
                                               bool lower_case, bool keep_whitespace,
                                               const NormalizeForm normalize_form, bool preserve_unused_token,
                                               bool with_offsets)
    : vocab_(vocab),
      suffix_indicator_(suffix_indicator),
      max_bytes_per_token_(max_bytes_per_token),
      unknown_token_(unknown_token),
      lower_case_(lower_case),
      keep_whitespace_(keep_whitespace),
      normalize_form_(normalize_form),
      preserve_unused_token_(preserve_unused_token),
      with_offsets_(with_offsets) {}

BertTokenizerOperation::~BertTokenizerOperation() = default;

Status BertTokenizerOperation::ValidateParams() {
  if (vocab_ == nullptr) {
    std::string err_msg = "BertTokenizer: vocab object type is incorrect or null.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  if (max_bytes_per_token_ < 0) {
    std::string err_msg = "BertTokenizer: The parameter max_bytes_per_token must be greater than or equal to 0: " +
                          std::to_string(max_bytes_per_token_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> BertTokenizerOperation::Build() {
  std::shared_ptr<BertTokenizerOp> tensor_op =
    std::make_shared<BertTokenizerOp>(vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, lower_case_,
                                      keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_);
  return tensor_op;
}

// CaseFoldOperation
Status CaseFoldOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> CaseFoldOperation::Build() {
  std::shared_ptr<CaseFoldOp> tensor_op = std::make_shared<CaseFoldOp>();
  return tensor_op;
}
#endif

// JiebaTokenizerOperation
JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path,
                                                 const JiebaMode &mode, bool with_offsets)
    : hmm_path_(hmm_path), mp_path_(mp_path), mode_(mode), with_offsets_(with_offsets) {}

Status JiebaTokenizerOperation::ValidateParams() {
  if (hmm_path_.empty()) {
    std::string err_msg = "JiebaTokenizer: The dict of HMMSegment in cppjieba is not provided.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  if (mp_path_.empty()) {
    std::string err_msg = "JiebaTokenizer: The dict of MPSegment in cppjieba is not provided.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_));
  RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", mp_path_));
  return Status::OK();
}

std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() {
  std::shared_ptr<JiebaTokenizerOp> tensor_op =
    std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_);
  for (auto &word : words_list_) {
    Status rc = tensor_op->AddWord(word.first, word.second);
    if (rc.IsError()) {
      MS_LOG(ERROR) << rc;
      return {};
    }
  }
  return tensor_op;
}

Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) {
  if (word.empty()) {
    std::string err_msg = "JiebaTokenizer: The parameter word is empty or not provided.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  if (freq < 0) {
    std::string err_msg = "JiebaTokenizer: The parameter freq must be greater than or equal to 0.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  words_list_.emplace_back(word, freq);
  return Status::OK();
}
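// Editorial sketch (not part of the original commit): AddWord only queues
// (word, freq) pairs in words_list_; Build() replays them onto the runtime
// JiebaTokenizerOp. The dictionary paths below are placeholders.
namespace {
std::shared_ptr<TensorOp> BuildJiebaWithCustomWord() {
  auto jieba = std::make_shared<JiebaTokenizerOperation>("/path/to/hmm_model.utf8", "/path/to/jieba.dict.utf8",
                                                         JiebaMode::kMix, false);
  if (jieba->AddWord("MindSpore", 10).IsError()) {  // queued and validated here
    return nullptr;
  }
  return jieba->Build();  // queued words are applied inside Build()
}
}  // anonymous namespace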
// LookupOperation
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
                                 const std::string &data_type)
    : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}

LookupOperation::~LookupOperation() = default;

Status LookupOperation::ValidateParams() {
  if (vocab_ == nullptr) {
    std::string err_msg = "Lookup: vocab object type is incorrect or null.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  if (unknown_token_ != std::nullopt) {
    default_id_ = vocab_->Lookup(*unknown_token_);
    if (default_id_ == Vocab::kNoTokenExists) {
      std::string err_msg = "Lookup: \"" + *unknown_token_ + "\" doesn't exist in vocab.";
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
  }

  if (!IsTypeNumeric(data_type_)) {
    std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> LookupOperation::Build() {
  std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, DataType(data_type_));
  return tensor_op;
}
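// Editorial sketch (not part of the original commit): the optional
// unknown_token drives default_id_. With a token, OOV words map to its id;
// with std::nullopt, OOV words keep Vocab::kNoTokenExists. The vocab is
// assumed to be built elsewhere (e.g. from a vocab file).
namespace {
std::shared_ptr<TensorOp> BuildLookup(const std::shared_ptr<Vocab> &vocab) {
  auto lookup = std::make_shared<LookupOperation>(vocab, std::optional<std::string>("<unk>"), "int32");
  if (lookup->ValidateParams().IsError()) {  // fails if "<unk>" is not in vocab
    return nullptr;
  }
  return lookup->Build();  // LookupOp maps OOV tokens to the id of "<unk>"
}
}  // anonymous namespace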
// NgramOperation
NgramOperation::NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
                               const std::pair<std::string, int32_t> &right_pad, const std::string &separator)
    : ngrams_(ngrams), left_pad_(left_pad), right_pad_(right_pad), separator_(separator) {}

Status NgramOperation::ValidateParams() {
  if (ngrams_.empty()) {
    std::string err_msg = "Ngram: The ngrams vector cannot be empty.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  } else {
    for (int32_t i = 0; i < ngrams_.size(); ++i) {
      if (ngrams_[i] <= 0) {
        std::string err_msg =
          "Ngram: The value of ngrams vector must be greater than 0: " + std::to_string(ngrams_[i]);
        MS_LOG(ERROR) << err_msg;
        RETURN_STATUS_SYNTAX_ERROR(err_msg);
      }
    }
  }

  if (left_pad_.second < 0) {
    std::string err_msg =
      "Ngram: The second parameter pad_width in left_pad vector must be greater than or equal to 0: " +
      std::to_string(left_pad_.second);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  if (right_pad_.second < 0) {
    std::string err_msg =
      "Ngram: The second parameter pad_width in right_pad vector must be greater than or equal to 0: " +
      std::to_string(right_pad_.second);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  return Status::OK();
}

std::shared_ptr<TensorOp> NgramOperation::Build() {
  int32_t l_len = left_pad_.second;
  int32_t r_len = right_pad_.second;
  std::string l_pad = left_pad_.first;
  std::string r_pad = right_pad_.first;
  std::shared_ptr<NgramOp> tensor_op = std::make_shared<NgramOp>(ngrams_, l_len, r_len, l_pad, r_pad, separator_);
  return tensor_op;
}

#ifndef _WIN32
// NormalizeUTF8Operation
NormalizeUTF8Operation::NormalizeUTF8Operation(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}

Status NormalizeUTF8Operation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> NormalizeUTF8Operation::Build() {
  std::shared_ptr<NormalizeUTF8Op> tensor_op = std::make_shared<NormalizeUTF8Op>(normalize_form_);
  return tensor_op;
}

// RegexReplaceOperation
RegexReplaceOperation::RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all)
    : pattern_(pattern), replace_(replace), replace_all_(replace_all) {}

Status RegexReplaceOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> RegexReplaceOperation::Build() {
  std::shared_ptr<RegexReplaceOp> tensor_op = std::make_shared<RegexReplaceOp>(pattern_, replace_, replace_all_);
  return tensor_op;
}

// RegexTokenizerOperation
RegexTokenizerOperation::RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern,
                                                 bool with_offsets)
    : delim_pattern_(delim_pattern), keep_delim_pattern_(keep_delim_pattern), with_offsets_(with_offsets) {}

Status RegexTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> RegexTokenizerOperation::Build() {
  std::shared_ptr<RegexTokenizerOp> tensor_op =
    std::make_shared<RegexTokenizerOp>(delim_pattern_, keep_delim_pattern_, with_offsets_);
  return tensor_op;
}
#endif

// SentencePieceTokenizerOperation
SentencePieceTokenizerOperation::~SentencePieceTokenizerOperation() = default;

SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
                                                                 SPieceTokenizerOutType out_type)
    : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {}

SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::string &vocab_path,
                                                                 SPieceTokenizerOutType out_type)
    : vocab_(nullptr), vocab_path_(vocab_path), load_type_(SPieceTokenizerLoadType::kFile), out_type_(out_type) {}

Status SentencePieceTokenizerOperation::ValidateParams() {
  if (load_type_ == SPieceTokenizerLoadType::kModel) {
    if (vocab_ == nullptr) {
      std::string err_msg = "SentencePieceTokenizer: vocab object type is incorrect or null.";
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
  } else {
    Path vocab_file(vocab_path_);
    if (!vocab_file.Exists() || vocab_file.IsDirectory()) {
      std::string err_msg = "SentencePieceTokenizer: vocab file: [" + vocab_path_ + "] is invalid or does not exist.";
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
    if (access(vocab_file.toString().c_str(), R_OK) == -1) {
      std::string err_msg = "SentencePieceTokenizer: no access to specified dataset file: " + vocab_path_;
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
  }
  return Status::OK();
}

std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() {
  std::shared_ptr<SentencePieceTokenizerOp> tensor_op;
  if (load_type_ == SPieceTokenizerLoadType::kModel) {
    tensor_op = std::make_shared<SentencePieceTokenizerOp>(vocab_, load_type_, out_type_);
  } else {
    Path vocab_file(vocab_path_);
    std::string model_path = vocab_file.ParentPath();
    std::string model_filename = vocab_file.Basename();
    tensor_op = std::make_shared<SentencePieceTokenizerOp>(model_path, model_filename, load_type_, out_type_);
  }
  return tensor_op;
}

// SlidingWindowOperation
SlidingWindowOperation::SlidingWindowOperation(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {}

Status SlidingWindowOperation::ValidateParams() {
  if (width_ < 1) {
    std::string err_msg =
      "SlidingWindow: The parameter width must be greater than or equal to 1: " + std::to_string(width_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  return Status::OK();
}

std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
  std::shared_ptr<SlidingWindowOp> tensor_op = std::make_shared<SlidingWindowOp>(static_cast<uint32_t>(width_), axis_);
  return tensor_op;
}

// ToNumberOperation
ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {}

Status ToNumberOperation::ValidateParams() {
  if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) {
    std::string err_msg = "ToNumber: The parameter data_type must be a numeric type, got: " + data_type_;
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> ToNumberOperation::Build() {
  std::shared_ptr<ToNumberOp> tensor_op = std::make_shared<ToNumberOp>(data_type_);
  return tensor_op;
}

// TruncateSequencePairOperation
TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {}

Status TruncateSequencePairOperation::ValidateParams() {
  if (max_length_ < 0) {
    std::string err_msg = "TruncateSequencePair: The parameter max_length must be greater than or equal to 0: " +
                          std::to_string(max_length_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> TruncateSequencePairOperation::Build() {
  std::shared_ptr<TruncateSequencePairOp> tensor_op = std::make_shared<TruncateSequencePairOp>(max_length_);
  return tensor_op;
}

// UnicodeCharTokenizerOperation
UnicodeCharTokenizerOperation::UnicodeCharTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}

Status UnicodeCharTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() {
  std::shared_ptr<UnicodeCharTokenizerOp> tensor_op = std::make_shared<UnicodeCharTokenizerOp>(with_offsets_);
  return tensor_op;
}

#ifndef _WIN32
// UnicodeScriptTokenizerOperation
UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets)
    : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}

Status UnicodeScriptTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> UnicodeScriptTokenizerOperation::Build() {
  std::shared_ptr<UnicodeScriptTokenizerOp> tensor_op =
    std::make_shared<UnicodeScriptTokenizerOp>(keep_whitespace_, with_offsets_);
  return tensor_op;
}

// WhitespaceTokenizerOperation
WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}

Status WhitespaceTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> WhitespaceTokenizerOperation::Build() {
  std::shared_ptr<WhitespaceTokenizerOp> tensor_op = std::make_shared<WhitespaceTokenizerOp>(with_offsets_);
  return tensor_op;
}
#endif

}  // namespace text
}  // namespace dataset
}  // namespace mindspore
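NgramOperation's pads are (token, width) pairs; Build() splits each pair into its pad string and repeat count before constructing NgramOp. A hedged usage sketch, with illustrative pad tokens:

std::vector<int32_t> ngrams = {2, 3};  // emit bigrams and trigrams
std::pair<std::string, int32_t> left_pad = {"<s>", 1};    // prepend one "<s>"
std::pair<std::string, int32_t> right_pad = {"</s>", 1};  // append one "</s>"
auto ngram = std::make_shared<mindspore::dataset::text::NgramOperation>(ngrams, left_pad, right_pad, " ");
if (ngram->ValidateParams().IsOk()) {
  std::shared_ptr<mindspore::dataset::TensorOp> op = ngram->Build();  // joins tokens with " "
}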
@ -0,0 +1,360 @@
/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_KERNELS_TEXT_IR_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_KERNELS_TEXT_IR_H_

#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#include "minddata/dataset/kernels/ir/tensor_operation.h"

namespace mindspore {
namespace dataset {

class Vocab;
class SentencePieceVocab;

// Transform operations for text
namespace text {

// Char arrays storing name of corresponding classes (in alphabetical order)
constexpr char kBasicTokenizerOperation[] = "BasicTokenizer";
constexpr char kBertTokenizerOperation[] = "BertTokenizer";
constexpr char kCaseFoldOperation[] = "CaseFold";
constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer";
constexpr char kLookupOperation[] = "Lookup";
constexpr char kNgramOperation[] = "Ngram";
constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8";
constexpr char kRegexReplaceOperation[] = "RegexReplace";
constexpr char kRegexTokenizerOperation[] = "RegexTokenizer";
constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer";
constexpr char kSlidingWindowOperation[] = "SlidingWindow";
constexpr char kToNumberOperation[] = "ToNumber";
constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair";
constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer";
constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer";

/* ####################################### Derived TensorOperation classes ################################# */

#ifndef _WIN32
class BasicTokenizerOperation : public TensorOperation {
 public:
  BasicTokenizerOperation(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
                          bool preserve_unused_token, bool with_offsets);

  ~BasicTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kBasicTokenizerOperation; }

 private:
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};

class BertTokenizerOperation : public TensorOperation {
 public:
  BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
                         int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case,
                         bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
                         bool with_offsets);

  ~BertTokenizerOperation();

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kBertTokenizerOperation; }

 private:
  std::shared_ptr<Vocab> vocab_;
  std::string suffix_indicator_;
  int32_t max_bytes_per_token_;
  std::string unknown_token_;
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};

class CaseFoldOperation : public TensorOperation {
 public:
  CaseFoldOperation() = default;

  ~CaseFoldOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kCaseFoldOperation; }
};
#endif

class JiebaTokenizerOperation : public TensorOperation {
 public:
  explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode,
                                   bool with_offsets);

  ~JiebaTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kJiebaTokenizerOperation; }

  Status AddWord(const std::string &word, int64_t freq = 0);

 private:
  std::string hmm_path_;
  std::string mp_path_;
  JiebaMode mode_;
  bool with_offsets_;
  std::vector<std::pair<std::string, int64_t>> words_list_;
};

class LookupOperation : public TensorOperation {
 public:
  explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
                           const std::string &data_type);

  ~LookupOperation();

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kLookupOperation; }

 private:
  std::shared_ptr<Vocab> vocab_;
  std::optional<std::string> unknown_token_;
  int32_t default_id_;
  std::string data_type_;
};

class NgramOperation : public TensorOperation {
 public:
  explicit NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
                          const std::pair<std::string, int32_t> &right_pad, const std::string &separator);

  ~NgramOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kNgramOperation; }

 private:
  std::vector<int32_t> ngrams_;
  std::pair<std::string, int32_t> left_pad_;
  std::pair<std::string, int32_t> right_pad_;
  std::string separator_;
};

#ifndef _WIN32
class NormalizeUTF8Operation : public TensorOperation {
 public:
  explicit NormalizeUTF8Operation(NormalizeForm normalize_form);

  ~NormalizeUTF8Operation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kNormalizeUTF8Operation; }

 private:
  NormalizeForm normalize_form_;
};

class RegexReplaceOperation : public TensorOperation {
 public:
  RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all);

  ~RegexReplaceOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kRegexReplaceOperation; }

 private:
  std::string pattern_;
  std::string replace_;
  bool replace_all_;
};

class RegexTokenizerOperation : public TensorOperation {
 public:
  explicit RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, bool with_offsets);

  ~RegexTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kRegexTokenizerOperation; }

 private:
  std::string delim_pattern_;
  std::string keep_delim_pattern_;
  bool with_offsets_;
};
#endif

class SentencePieceTokenizerOperation : public TensorOperation {
 public:
  SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type);

  SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type);

  ~SentencePieceTokenizerOperation();

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kSentencepieceTokenizerOperation; }

 private:
  std::shared_ptr<SentencePieceVocab> vocab_;
  std::string vocab_path_;
  SPieceTokenizerLoadType load_type_;
  SPieceTokenizerOutType out_type_;
};

class SlidingWindowOperation : public TensorOperation {
 public:
  explicit SlidingWindowOperation(const int32_t width, const int32_t axis);

  ~SlidingWindowOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kSlidingWindowOperation; }

 private:
  int32_t width_;
  int32_t axis_;
};

class ToNumberOperation : public TensorOperation {
 public:
  explicit ToNumberOperation(std::string data_type);

  ~ToNumberOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kToNumberOperation; }

 private:
  std::string data_type_;
};

class TruncateSequencePairOperation : public TensorOperation {
 public:
  explicit TruncateSequencePairOperation(int32_t max_length);

  ~TruncateSequencePairOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kTruncateSequencePairOperation; }

 private:
  int32_t max_length_;
};

class UnicodeCharTokenizerOperation : public TensorOperation {
 public:
  explicit UnicodeCharTokenizerOperation(bool with_offsets);

  ~UnicodeCharTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kUnicodeCharTokenizerOperation; }

 private:
  bool with_offsets_;
};

#ifndef _WIN32
class UnicodeScriptTokenizerOperation : public TensorOperation {
 public:
  explicit UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets);

  ~UnicodeScriptTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kUnicodeScriptTokenizerOperation; }

 private:
  bool keep_whitespace_;
  bool with_offsets_;
};

class WhitespaceTokenizerOperation : public TensorOperation {
 public:
  explicit WhitespaceTokenizerOperation(bool with_offsets);

  ~WhitespaceTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kWhitespaceTokenizerOperation; }

 private:
  bool with_offsets_;
};
#endif
}  // namespace text
}  // namespace dataset
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_KERNELS_TEXT_IR_H_
@ -0,0 +1,60 @@
/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <unistd.h>

#include "minddata/dataset/text/ir/validators.h"

#include "minddata/dataset/util/path.h"

namespace mindspore {
namespace dataset {
/* ####################################### Validator Functions ############################################ */

// Helper function to validate tokenizer directory parameter
Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file) {
  if (tokenizer_file.empty()) {
    std::string err_msg = tokenizer_name + ": tokenizer_file is not specified.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  Path file(tokenizer_file);
  if (!file.Exists()) {
    std::string err_msg = tokenizer_name + ": tokenizer_file: [" + tokenizer_file + "] is invalid or does not exist.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  if (access(tokenizer_file.c_str(), R_OK) == -1) {
    std::string err_msg = tokenizer_name + ": No access to specified tokenizer path: " + tokenizer_file;
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

// Helper functions to help validate data type passed by user
bool IsTypeNumeric(const std::string &data_type) {
  return data_type == "int8" || data_type == "uint8" || data_type == "int16" || data_type == "uint16" ||
         data_type == "int32" || data_type == "uint32" || data_type == "int64" || data_type == "uint64" ||
         data_type == "float16" || data_type == "float32" || data_type == "float64";
}

bool IsTypeBoolean(const std::string &data_type) { return data_type == "bool"; }

bool IsTypeString(const std::string &data_type) { return data_type == "string"; }
}  // namespace dataset
}  // namespace mindspore
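These validators back the type checks in Lookup and ToNumber above as well as the tokenizer path checks. A short hedged sketch of their contract; the function is a hypothetical test helper and the path is a placeholder:

#include <cassert>

#include "minddata/dataset/text/ir/validators.h"

void CheckValidators() {
  using mindspore::dataset::IsTypeBoolean;
  using mindspore::dataset::IsTypeNumeric;
  using mindspore::dataset::Status;
  using mindspore::dataset::ValidateTokenizerDirParam;
  assert(IsTypeNumeric("float32"));  // numeric names pass
  assert(!IsTypeNumeric("string"));  // "string" is rejected, keeping Lookup numeric
  assert(IsTypeBoolean("bool"));     // ToNumber uses this to exclude bool explicitly
  Status rc = ValidateTokenizerDirParam("JiebaTokenizer", "/path/to/jieba.dict.utf8");  // placeholder path
  (void)rc;  // fails unless the file exists and is readable
}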
@ -0,0 +1,41 @@
/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_VALIDATORS_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_VALIDATORS_H_

#include <string>

#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/util/status.h"

namespace mindspore {
namespace dataset {

// Helper function to validate tokenizer directory parameter
Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file);

// Helper function to validate data type passed by user
bool IsTypeNumeric(const std::string &data_type);

// Helper function to validate data type is boolean
bool IsTypeBoolean(const std::string &data_type);

// Helper function to validate data type is string
bool IsTypeString(const std::string &data_type);
}  // namespace dataset
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_VALIDATORS_H_
@ -202,6 +202,7 @@ if(BUILD_MINDDATA STREQUAL "full")
    ${MINDDATA_DIR}/kernels/data/type_cast_op.cc
    ${MINDDATA_DIR}/kernels/image/exif_utils.cc
    ${MINDDATA_DIR}/kernels/ir/validators.cc
    ${MINDDATA_DIR}/kernels/ir/data/transforms_ir.cc
    ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc
    ${MINDDATA_DIR}/callback/callback_manager.cc
    ${MINDDATA_DIR}/util/task_manager.cc

@ -281,6 +282,7 @@ elseif(BUILD_MINDDATA STREQUAL "wrapper")
    ${MINDDATA_DIR}/kernels/data/data_utils.cc
    ${MINDDATA_DIR}/kernels/image/exif_utils.cc
    ${MINDDATA_DIR}/kernels/ir/validators.cc
    ${MINDDATA_DIR}/kernels/ir/data/transforms_ir.cc
    ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/wrapper/MDToDApi.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/wrapper/album_op_android.cc

@ -393,6 +395,7 @@ elseif(BUILD_MINDDATA STREQUAL "lite")
    ${CMAKE_CURRENT_SOURCE_DIR}/../src/common/log_adapter.cc
    ${CORE_DIR}/utils/ms_utils.cc
    ${MINDDATA_DIR}/kernels/ir/validators.cc
    ${MINDDATA_DIR}/kernels/ir/data/transforms_ir.cc
    ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc
    )