diff --git a/mindspore/ccsrc/minddata/dataset/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/CMakeLists.txt index 15db427d9e1..6860095b413 100644 --- a/mindspore/ccsrc/minddata/dataset/CMakeLists.txt +++ b/mindspore/ccsrc/minddata/dataset/CMakeLists.txt @@ -92,11 +92,14 @@ add_dependencies(engine core) add_dependencies(callback core) add_dependencies(text core) add_dependencies(text-kernels core) +add_dependencies(text-ir core) +add_dependencies(text-ir-kernels core) add_dependencies(cpp-API core) add_dependencies(engine-ir-datasetops core) add_dependencies(engine-ir-datasetops-source core) add_dependencies(engine-ir-cache core) add_dependencies(kernels-ir core) +add_dependencies(kernels-ir-data core) add_dependencies(kernels-ir-vision core) if(ENABLE_ACL) @@ -146,7 +149,10 @@ set(submodules $ $ $ + $ + $ $ + $ $ ) diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/kernels/ir/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/kernels/ir/bindings.cc index 76d29f11763..50de865bafa 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/kernels/ir/bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/kernels/ir/bindings.cc @@ -17,9 +17,9 @@ #include "minddata/dataset/api/python/pybind_register.h" #include "minddata/dataset/core/global_context.h" -#include "minddata/dataset/include/transforms.h" #include "minddata/dataset/kernels/py_func_op.h" +#include "minddata/dataset/kernels/ir/data/transforms_ir.h" #include "minddata/dataset/kernels/ir/vision/vision_ir.h" namespace mindspore { diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc index d9a8b4b28bb..5b5c0827185 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc @@ -18,7 +18,7 @@ #include "pybind11/stl.h" #include "pybind11/stl_bind.h" #include "minddata/dataset/api/python/pybind_register.h" -#include "minddata/dataset/include/text.h" +#include "minddata/dataset/text/ir/kernels/text_ir.h" #include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h" #include "minddata/dataset/text/sentence_piece_vocab.h" #include "minddata/dataset/text/vocab.h" diff --git a/mindspore/ccsrc/minddata/dataset/api/python/pybind_conversion.h b/mindspore/ccsrc/minddata/dataset/api/python/pybind_conversion.h index d22a861f5d6..a707ad241c3 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python/pybind_conversion.h +++ b/mindspore/ccsrc/minddata/dataset/api/python/pybind_conversion.h @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,10 +30,10 @@ #include "pybind11/stl_bind.h" #include "minddata/dataset/include/datasets.h" #include "minddata/dataset/include/samplers.h" -#include "minddata/dataset/include/transforms.h" #include "minddata/dataset/api/python/pybind_register.h" #include "minddata/dataset/engine/ir/cache/pre_built_dataset_cache.h" #include "minddata/dataset/engine/ir/datasetops/source/csv_node.h" +#include "minddata/dataset/kernels/ir/data/transforms_ir.h" #include "minddata/dataset/kernels/py_func_op.h" namespace py = pybind11; diff --git a/mindspore/ccsrc/minddata/dataset/api/text.cc b/mindspore/ccsrc/minddata/dataset/api/text.cc index 5d998415c15..39f1b31b7b6 100644 --- a/mindspore/ccsrc/minddata/dataset/api/text.cc +++ b/mindspore/ccsrc/minddata/dataset/api/text.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,30 +17,6 @@ #include #include "minddata/dataset/include/text.h" -#ifndef _WIN32 -#include "minddata/dataset/text/kernels/basic_tokenizer_op.h" -#include "minddata/dataset/text/kernels/bert_tokenizer_op.h" -#include "minddata/dataset/text/kernels/case_fold_op.h" -#endif -#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h" -#include "minddata/dataset/text/kernels/lookup_op.h" -#include "minddata/dataset/text/kernels/ngram_op.h" -#ifndef _WIN32 -#include "minddata/dataset/text/kernels/normalize_utf8_op.h" -#include "minddata/dataset/text/kernels/regex_replace_op.h" -#include "minddata/dataset/text/kernels/regex_tokenizer_op.h" -#endif -#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h" -#include "minddata/dataset/text/kernels/sliding_window_op.h" -#include "minddata/dataset/text/kernels/to_number_op.h" -#include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h" -#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h" -#ifndef _WIN32 -#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h" -#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" -#endif -#include "minddata/dataset/core/data_type.h" -#include "minddata/dataset/util/path.h" namespace mindspore { namespace dataset { @@ -174,426 +150,6 @@ std::shared_ptr WhitespaceTokenizer(bool with_offs return op->ValidateParams() ? op : nullptr; } #endif - -/* ####################################### Validator Functions ############################################ */ - -// Helper function to validate tokenizer directory parameter -Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file) { - if (tokenizer_file.empty()) { - std::string err_msg = tokenizer_name + ": tokenizer_file is not specified."; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - - Path file(tokenizer_file); - if (!file.Exists()) { - std::string err_msg = tokenizer_name + ": tokenizer_file: [" + tokenizer_file + "] is an invalid directory path."; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - - if (access(tokenizer_file.c_str(), R_OK) == -1) { - std::string err_msg = tokenizer_name + ": No access to specified tokenizer path: " + tokenizer_file; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - - return Status::OK(); -} - -// Helper functions to help validate data type passed by user -bool IsTypeNumeric(const std::string &data_type) { - if (data_type == "int8" || data_type == "uint8" || data_type == "int16" || data_type == "uint16" || - data_type == "int32" || data_type == "uint32" || data_type == "int64" || data_type == "uint64" || - data_type == "float16" || data_type == "float32" || data_type == "float64") - return true; - return false; -} - -bool IsTypeBoolean(const std::string &data_type) { return data_type == "bool"; } - -bool IsTypeString(const std::string &data_type) { return data_type == "string"; } - -/* ####################################### Derived TensorOperation classes ################################# */ - -// (In alphabetical order) - -#ifndef _WIN32 -// BasicTokenizerOperation -BasicTokenizerOperation::BasicTokenizerOperation(bool lower_case, bool keep_whitespace, - const NormalizeForm normalize_form, bool preserve_unused_token, - bool with_offsets) - : lower_case_(lower_case), - keep_whitespace_(keep_whitespace), - normalize_form_(normalize_form), - preserve_unused_token_(preserve_unused_token), - with_offsets_(with_offsets) {} - -Status BasicTokenizerOperation::ValidateParams() { return Status::OK(); } - -std::shared_ptr BasicTokenizerOperation::Build() { - std::shared_ptr tensor_op = std::make_shared( - lower_case_, keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_); - return tensor_op; -} - -// BertTokenizerOperation -BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr &vocab, const std::string &suffix_indicator, - int32_t max_bytes_per_token, const std::string &unknown_token, - bool lower_case, bool keep_whitespace, - const NormalizeForm normalize_form, bool preserve_unused_token, - bool with_offsets) - : vocab_(vocab), - suffix_indicator_(suffix_indicator), - max_bytes_per_token_(max_bytes_per_token), - unknown_token_(unknown_token), - lower_case_(lower_case), - keep_whitespace_(keep_whitespace), - normalize_form_(normalize_form), - preserve_unused_token_(preserve_unused_token), - with_offsets_(with_offsets) {} - -BertTokenizerOperation::~BertTokenizerOperation() = default; - -Status BertTokenizerOperation::ValidateParams() { - if (vocab_ == nullptr) { - std::string err_msg = "BertTokenizer: vocab object type is incorrect or null."; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - - if (max_bytes_per_token_ < 0) { - std::string err_msg = "BertTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " + - std::to_string(max_bytes_per_token_); - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - - return Status::OK(); -} - -std::shared_ptr BertTokenizerOperation::Build() { - std::shared_ptr tensor_op = - std::make_shared(vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, lower_case_, - keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_); - return tensor_op; -} - -// CaseFoldOperation -Status CaseFoldOperation::ValidateParams() { return Status::OK(); } - -std::shared_ptr CaseFoldOperation::Build() { - std::shared_ptr tensor_op = std::make_shared(); - return tensor_op; -} -#endif - -// JiebaTokenizerOperation -JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, - const JiebaMode &mode, bool with_offsets) - : hmm_path_(hmm_path), mp_path_(mp_path), mode_(mode), with_offsets_(with_offsets) {} - -Status JiebaTokenizerOperation::ValidateParams() { - if (hmm_path_.empty()) { - std::string err_msg = "JiebaTokenizer: The dict of HMMSegment in cppjieba is not provided."; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - - if (mp_path_.empty()) { - std::string err_msg = "JiebaTokenizer: The dict of MPSegment in cppjieba is not provided."; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - - RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_)); - RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", mp_path_)); - return Status::OK(); -} - -std::shared_ptr JiebaTokenizerOperation::Build() { - std::shared_ptr tensor_op = - std::make_shared(hmm_path_, mp_path_, mode_, with_offsets_); - for (auto &word : words_list_) { - Status rc = tensor_op->AddWord(word.first, word.second); - if (rc.IsError()) { - MS_LOG(ERROR) << rc; - return {}; - } - } - return tensor_op; -} - -Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) { - if (word.empty()) { - std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided."; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - if (freq < 0) { - std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0."; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - words_list_.emplace_back(word, freq); - return Status::OK(); -} - -// LookupOperation -LookupOperation::LookupOperation(const std::shared_ptr &vocab, const std::optional &unknown_token, - const std::string &data_type) - : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {} - -LookupOperation::~LookupOperation() = default; - -Status LookupOperation::ValidateParams() { - if (vocab_ == nullptr) { - std::string err_msg = "Lookup: vocab object type is incorrect or null."; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - if (unknown_token_ != std::nullopt) { - default_id_ = vocab_->Lookup(*unknown_token_); - if (default_id_ == Vocab::kNoTokenExists) { - std::string err_msg = "Lookup: \"" + *unknown_token_ + "\" doesn't exist in vocab."; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - } - - if (!IsTypeNumeric(data_type_)) { - std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric."; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - - return Status::OK(); -} - -std::shared_ptr LookupOperation::Build() { - std::shared_ptr tensor_op = std::make_shared(vocab_, default_id_, DataType(data_type_)); - return tensor_op; -} - -// NgramOperation -NgramOperation::NgramOperation(const std::vector &ngrams, const std::pair &left_pad, - const std::pair &right_pad, const std::string &separator) - : ngrams_(ngrams), left_pad_(left_pad), right_pad_(right_pad), separator_(separator) {} - -Status NgramOperation::ValidateParams() { - if (ngrams_.size() == 0) { - std::string err_msg = "Ngram : Container cannot be empty."; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } else { - for (int32_t i = 0; i < ngrams_.size(); ++i) { - if (ngrams_[i] <= 0) { - std::string err_msg = - "Ngram : The value of ngrams vector must be greater than 0: " + std::to_string(ngrams_[i]); - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - } - } - - if (left_pad_.second < 0) { - std::string err_msg = - "Ngram : The second parameter pad_width in left_pad vector must be greater than or equal to 0: " + - std::to_string(left_pad_.second); - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - - if (right_pad_.second < 0) { - std::string err_msg = - "Ngram : The second parameter pad_width in right_pad vector must be greater than or equal to 0: " + - std::to_string(right_pad_.second); - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - return Status::OK(); -} - -std::shared_ptr NgramOperation::Build() { - int32_t l_len = left_pad_.second; - int32_t r_len = right_pad_.second; - std::string l_pad = left_pad_.first; - std::string r_pad = right_pad_.first; - std::shared_ptr tensor_op = std::make_shared(ngrams_, l_len, r_len, l_pad, r_pad, separator_); - return tensor_op; -} - -#ifndef _WIN32 -// NormalizeUTF8Operation -NormalizeUTF8Operation::NormalizeUTF8Operation(NormalizeForm normalize_form) : normalize_form_(normalize_form) {} - -Status NormalizeUTF8Operation::ValidateParams() { return Status::OK(); } - -std::shared_ptr NormalizeUTF8Operation::Build() { - std::shared_ptr tensor_op = std::make_shared(normalize_form_); - return tensor_op; -} - -// RegexReplaceOperation -RegexReplaceOperation::RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all) - : pattern_(pattern), replace_(replace), replace_all_(replace_all) {} - -Status RegexReplaceOperation::ValidateParams() { return Status::OK(); } - -std::shared_ptr RegexReplaceOperation::Build() { - std::shared_ptr tensor_op = std::make_shared(pattern_, replace_, replace_all_); - return tensor_op; -} - -// RegexTokenizerOperation -RegexTokenizerOperation::RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, - bool with_offsets) - : delim_pattern_(delim_pattern), keep_delim_pattern_(keep_delim_pattern), with_offsets_(with_offsets) {} - -Status RegexTokenizerOperation::ValidateParams() { return Status::OK(); } - -std::shared_ptr RegexTokenizerOperation::Build() { - std::shared_ptr tensor_op = - std::make_shared(delim_pattern_, keep_delim_pattern_, with_offsets_); - return tensor_op; -} -#endif - -// SentencePieceTokenizerOperation -SentencePieceTokenizerOperation::~SentencePieceTokenizerOperation() = default; - -SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr &vocab, - SPieceTokenizerOutType out_type) - : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {} - -SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::string &vocab_path, - SPieceTokenizerOutType out_type) - : vocab_(nullptr), vocab_path_(vocab_path), load_type_(SPieceTokenizerLoadType::kFile), out_type_(out_type) {} - -Status SentencePieceTokenizerOperation::ValidateParams() { - if (load_type_ == SPieceTokenizerLoadType::kModel) { - if (vocab_ == nullptr) { - std::string err_msg = "SentencePieceTokenizer: vocab object type is incorrect or null."; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - } else { - Path vocab_file(vocab_path_); - if (!vocab_file.Exists() || vocab_file.IsDirectory()) { - std::string err_msg = "SentencePieceTokenizer : vocab file: [" + vocab_path_ + "] is invalid or does not exist."; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - if (access(vocab_file.toString().c_str(), R_OK) == -1) { - std::string err_msg = "SentencePieceTokenizer : no access to specified dataset file: " + vocab_path_; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - } - return Status::OK(); -} - -std::shared_ptr SentencePieceTokenizerOperation::Build() { - std::shared_ptr tensor_op; - if (load_type_ == SPieceTokenizerLoadType::kModel) { - tensor_op = std::make_shared(vocab_, load_type_, out_type_); - } else { - Path vocab_file(vocab_path_); - std::string model_path = vocab_file.ParentPath(); - std::string model_filename = vocab_file.Basename(); - tensor_op = std::make_shared(model_path, model_filename, load_type_, out_type_); - } - return tensor_op; -} - -// SlidingWindowOperation -SlidingWindowOperation::SlidingWindowOperation(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {} - -Status SlidingWindowOperation::ValidateParams() { - if (width_ < 1) { - std::string err_msg = - "SlidingWindow : The parameter width must be greater than or equal to 1: " + std::to_string(width_); - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - return Status::OK(); -} - -std::shared_ptr SlidingWindowOperation::Build() { - std::shared_ptr tensor_op = std::make_shared(static_cast(width_), axis_); - return tensor_op; -} - -// ToNumberOperation -ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {} - -Status ToNumberOperation::ValidateParams() { - if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) { - std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - - return Status::OK(); -} - -std::shared_ptr ToNumberOperation::Build() { - std::shared_ptr tensor_op = std::make_shared(data_type_); - return tensor_op; -} - -TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {} - -Status TruncateSequencePairOperation::ValidateParams() { - if (max_length_ < 0) { - std::string err_msg = "TruncateSequencePair : The parameter max_length must be greater than or equal to 0: " + - std::to_string(max_length_); - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - - return Status::OK(); -} - -std::shared_ptr TruncateSequencePairOperation::Build() { - std::shared_ptr tensor_op = std::make_shared(max_length_); - return tensor_op; -} - -// UnicodeCharTokenizerOperation -UnicodeCharTokenizerOperation::UnicodeCharTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {} - -Status UnicodeCharTokenizerOperation::ValidateParams() { return Status::OK(); } - -std::shared_ptr UnicodeCharTokenizerOperation::Build() { - std::shared_ptr tensor_op = std::make_shared(with_offsets_); - return tensor_op; -} - -#ifndef _WIN32 -// UnicodeScriptTokenizerOperation -UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets) - : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {} - -Status UnicodeScriptTokenizerOperation::ValidateParams() { return Status::OK(); } - -std::shared_ptr UnicodeScriptTokenizerOperation::Build() { - std::shared_ptr tensor_op = - std::make_shared(keep_whitespace_, with_offsets_); - return tensor_op; -} - -// WhitespaceTokenizerOperation -WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {} - -Status WhitespaceTokenizerOperation::ValidateParams() { return Status::OK(); } - -std::shared_ptr WhitespaceTokenizerOperation::Build() { - std::shared_ptr tensor_op = std::make_shared(with_offsets_); - return tensor_op; -} -#endif - } // namespace text } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/api/transforms.cc b/mindspore/ccsrc/minddata/dataset/api/transforms.cc index 9491a0e4d1d..59a3277a8a4 100644 --- a/mindspore/ccsrc/minddata/dataset/api/transforms.cc +++ b/mindspore/ccsrc/minddata/dataset/api/transforms.cc @@ -15,18 +15,6 @@ */ #include "minddata/dataset/include/transforms.h" -#include "minddata/dataset/kernels/ir/validators.h" - -// Kernel data headers (in alphabetical order) -#include "minddata/dataset/kernels/data/compose_op.h" -#include "minddata/dataset/kernels/data/duplicate_op.h" -#include "minddata/dataset/kernels/data/one_hot_op.h" -#include "minddata/dataset/kernels/data/random_apply_op.h" -#include "minddata/dataset/kernels/data/random_choice_op.h" -#include "minddata/dataset/kernels/data/type_cast_op.h" -#ifndef ENABLE_ANDROID -#include "minddata/dataset/kernels/data/unique_op.h" -#endif namespace mindspore { namespace dataset { @@ -88,122 +76,6 @@ std::shared_ptr Unique() { return op->ValidateParams() ? op : nullptr; } #endif - -/* ####################################### Validator Functions ############################################ */ - -/* ####################################### Derived TensorOperation classes ################################# */ - -// (In alphabetical order) - -// ComposeOperation -ComposeOperation::ComposeOperation(const std::vector> &transforms) - : transforms_(transforms) {} - -Status ComposeOperation::ValidateParams() { - RETURN_IF_NOT_OK(ValidateVectorTransforms("Compose", transforms_)); - return Status::OK(); -} - -std::shared_ptr ComposeOperation::Build() { - std::vector> tensor_ops; - (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops), - [](std::shared_ptr op) -> std::shared_ptr { return op->Build(); }); - return std::make_shared(tensor_ops); -} - -// DuplicateOperation -Status DuplicateOperation::ValidateParams() { return Status::OK(); } - -std::shared_ptr DuplicateOperation::Build() { return std::make_shared(); } - -// OneHotOperation -OneHotOperation::OneHotOperation(int32_t num_classes) : num_classes_(num_classes) {} - -Status OneHotOperation::ValidateParams() { - if (num_classes_ <= 0) { - std::string err_msg = "OneHot: Number of classes must be greater than 0, but got: " + std::to_string(num_classes_); - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - - return Status::OK(); -} - -std::shared_ptr OneHotOperation::Build() { return std::make_shared(num_classes_); } - -// PreBuiltOperation -PreBuiltOperation::PreBuiltOperation(std::shared_ptr tensor_op) : op_(tensor_op) {} - -Status PreBuiltOperation::ValidateParams() { return Status::OK(); } - -std::shared_ptr PreBuiltOperation::Build() { return op_; } - -std::string PreBuiltOperation::Name() const { return op_ ? op_->Name() : kPreBuiltOperation; } - -Status PreBuiltOperation::to_json(nlohmann::json *out_json) { - RETURN_IF_NOT_OK(op_->to_json(out_json)); - return Status::OK(); -} - -// RandomApplyOperation -RandomApplyOperation::RandomApplyOperation(const std::vector> &transforms, double prob) - : TensorOperation(true), transforms_(transforms), prob_(prob) {} - -Status RandomApplyOperation::ValidateParams() { - RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomApply", transforms_)); - RETURN_IF_NOT_OK(ValidateProbability("RandomApply", prob_)); - return Status::OK(); -} - -std::shared_ptr RandomApplyOperation::Build() { - std::vector> tensor_ops; - (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops), - [](std::shared_ptr op) -> std::shared_ptr { return op->Build(); }); - return std::make_shared(prob_, tensor_ops); -} - -// RandomChoiceOperation -RandomChoiceOperation::RandomChoiceOperation(const std::vector> &transforms) - : TensorOperation(true), transforms_(transforms) {} - -Status RandomChoiceOperation::ValidateParams() { - RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomChoice", transforms_)); - return Status::OK(); -} - -std::shared_ptr RandomChoiceOperation::Build() { - std::vector> tensor_ops; - (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops), - [](std::shared_ptr op) -> std::shared_ptr { return op->Build(); }); - return std::make_shared(tensor_ops); -} - -// TypeCastOperation -TypeCastOperation::TypeCastOperation(std::string data_type) : data_type_(data_type) {} - -Status TypeCastOperation::ValidateParams() { - std::vector predefine_type = {"bool", "int8", "uint8", "int16", "uint16", "int32", "uint32", - "int64", "uint64", "float16", "float32", "float64", "string"}; - auto itr = std::find(predefine_type.begin(), predefine_type.end(), data_type_); - if (itr == predefine_type.end()) { - std::string err_msg = "TypeCast: Invalid data type: " + data_type_; - MS_LOG(ERROR) << "TypeCast: Only supports data type bool, int8, uint8, int16, uint16, int32, uint32, " - << "int64, uint64, float16, float32, float64, string, but got: " << data_type_; - RETURN_STATUS_SYNTAX_ERROR(err_msg); - } - - return Status::OK(); -} - -std::shared_ptr TypeCastOperation::Build() { return std::make_shared(data_type_); } - -#ifndef ENABLE_ANDROID -// UniqueOperation -Status UniqueOperation::ValidateParams() { return Status::OK(); } - -std::shared_ptr UniqueOperation::Build() { return std::make_shared(); } -#endif - } // namespace transforms } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.cc b/mindspore/ccsrc/minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.cc index d9a9b441dfa..23e6e7839bf 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,9 +21,9 @@ #include "minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.h" #include "minddata/dataset/engine/ir/datasetops/map_node.h" -#include "minddata/dataset/include/transforms.h" #include "minddata/dataset/kernels/image/random_crop_and_resize_op.h" #include "minddata/dataset/kernels/image/random_crop_decode_resize_op.h" +#include "minddata/dataset/kernels/ir/data/transforms_ir.h" #include "minddata/dataset/kernels/ir/vision/vision_ir.h" namespace mindspore { diff --git a/mindspore/ccsrc/minddata/dataset/include/text.h b/mindspore/ccsrc/minddata/dataset/include/text.h index dda1c214bae..d53850ea0b7 100644 --- a/mindspore/ccsrc/minddata/dataset/include/text.h +++ b/mindspore/ccsrc/minddata/dataset/include/text.h @@ -27,6 +27,9 @@ #include "minddata/dataset/include/constants.h" #include "minddata/dataset/include/transforms.h" +// FIXME - This internal IR header will be removed when external API classes are provided +#include "minddata/dataset/text/ir/kernels/text_ir.h" + namespace mindspore { namespace dataset { @@ -36,24 +39,6 @@ class SentencePieceVocab; // Transform operations for text namespace text { -// Char arrays storing name of corresponding classes (in alphabetical order) -constexpr char kBasicTokenizerOperation[] = "BasicTokenizer"; -constexpr char kBertTokenizerOperation[] = "BertTokenizer"; -constexpr char kCaseFoldOperation[] = "CaseFold"; -constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer"; -constexpr char kLookupOperation[] = "Lookup"; -constexpr char kNgramOperation[] = "Ngram"; -constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8"; -constexpr char kRegexReplaceOperation[] = "RegexReplace"; -constexpr char kRegexTokenizerOperation[] = "RegexTokenizer"; -constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer"; -constexpr char kSlidingWindowOperation[] = "SlidingWindow"; -constexpr char kToNumberOperation[] = "ToNumber"; -constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair"; -constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer"; -constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer"; -constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer"; - // Text Op classes (in alphabetical order) #ifndef _WIN32 class BasicTokenizerOperation; @@ -255,309 +240,6 @@ std::shared_ptr UnicodeScriptTokenizer(bool kee /// \return Shared pointer to the current TensorOperation. std::shared_ptr WhitespaceTokenizer(bool with_offsets = false); #endif - -/* ####################################### Derived TensorOperation classes ################################# */ - -#ifndef _WIN32 -class BasicTokenizerOperation : public TensorOperation { - public: - BasicTokenizerOperation(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, - bool preserve_unused_token, bool with_offsets); - - ~BasicTokenizerOperation() = default; - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kBasicTokenizerOperation; } - - private: - bool lower_case_; - bool keep_whitespace_; - NormalizeForm normalize_form_; - bool preserve_unused_token_; - bool with_offsets_; -}; - -class BertTokenizerOperation : public TensorOperation { - public: - BertTokenizerOperation(const std::shared_ptr &vocab, const std::string &suffix_indicator, - int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case, - bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token, - bool with_offsets); - - ~BertTokenizerOperation(); - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kBertTokenizerOperation; } - - private: - std::shared_ptr vocab_; - std::string suffix_indicator_; - int32_t max_bytes_per_token_; - std::string unknown_token_; - bool lower_case_; - bool keep_whitespace_; - NormalizeForm normalize_form_; - bool preserve_unused_token_; - bool with_offsets_; -}; - -class CaseFoldOperation : public TensorOperation { - public: - CaseFoldOperation() = default; - - ~CaseFoldOperation() = default; - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kCaseFoldOperation; } -}; -#endif - -class JiebaTokenizerOperation : public TensorOperation { - public: - explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode, - bool with_offsets); - - ~JiebaTokenizerOperation() = default; - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kJiebaTokenizerOperation; } - - Status AddWord(const std::string &word, int64_t freq = 0); - - private: - std::string hmm_path_; - std::string mp_path_; - JiebaMode mode_; - bool with_offsets_; - std::vector> words_list_; -}; - -class LookupOperation : public TensorOperation { - public: - explicit LookupOperation(const std::shared_ptr &vocab, const std::optional &unknown_token, - const std::string &data_type); - - ~LookupOperation(); - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kLookupOperation; } - - private: - std::shared_ptr vocab_; - std::optional unknown_token_; - int32_t default_id_; - std::string data_type_; -}; - -class NgramOperation : public TensorOperation { - public: - explicit NgramOperation(const std::vector &ngrams, const std::pair &left_pad, - const std::pair &right_pad, const std::string &separator); - - ~NgramOperation() = default; - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kNgramOperation; } - - private: - std::vector ngrams_; - std::pair left_pad_; - std::pair right_pad_; - std::string separator_; -}; - -#ifndef _WIN32 -class NormalizeUTF8Operation : public TensorOperation { - public: - explicit NormalizeUTF8Operation(NormalizeForm normalize_form); - - ~NormalizeUTF8Operation() = default; - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kNormalizeUTF8Operation; } - - private: - NormalizeForm normalize_form_; -}; - -class RegexReplaceOperation : public TensorOperation { - public: - RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all); - - ~RegexReplaceOperation() = default; - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kRegexReplaceOperation; } - - private: - std::string pattern_; - std::string replace_; - bool replace_all_; -}; - -class RegexTokenizerOperation : public TensorOperation { - public: - explicit RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, bool with_offsets); - - ~RegexTokenizerOperation() = default; - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kRegexTokenizerOperation; } - - private: - std::string delim_pattern_; - std::string keep_delim_pattern_; - bool with_offsets_; -}; -#endif - -class SentencePieceTokenizerOperation : public TensorOperation { - public: - SentencePieceTokenizerOperation(const std::shared_ptr &vocab, SPieceTokenizerOutType out_type); - - SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type); - - ~SentencePieceTokenizerOperation(); - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kSentencepieceTokenizerOperation; } - - private: - std::shared_ptr vocab_; - std::string vocab_path_; - SPieceTokenizerLoadType load_type_; - SPieceTokenizerOutType out_type_; -}; - -class SlidingWindowOperation : public TensorOperation { - public: - explicit SlidingWindowOperation(const int32_t width, const int32_t axis); - - ~SlidingWindowOperation() = default; - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kSlidingWindowOperation; } - - private: - int32_t width_; - int32_t axis_; -}; - -class ToNumberOperation : public TensorOperation { - public: - explicit ToNumberOperation(std::string data_type); - - ~ToNumberOperation() = default; - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kToNumberOperation; } - - private: - std::string data_type_; -}; - -class TruncateSequencePairOperation : public TensorOperation { - public: - explicit TruncateSequencePairOperation(int32_t max_length); - - ~TruncateSequencePairOperation() = default; - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kTruncateSequencePairOperation; } - - private: - int32_t max_length_; -}; - -class UnicodeCharTokenizerOperation : public TensorOperation { - public: - explicit UnicodeCharTokenizerOperation(bool with_offsets); - - ~UnicodeCharTokenizerOperation() = default; - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kUnicodeCharTokenizerOperation; } - - private: - bool with_offsets_; -}; - -#ifndef _WIN32 -class UnicodeScriptTokenizerOperation : public TensorOperation { - public: - explicit UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets); - - ~UnicodeScriptTokenizerOperation() = default; - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kUnicodeScriptTokenizerOperation; } - - private: - bool keep_whitespace_; - bool with_offsets_; -}; - -class WhitespaceTokenizerOperation : public TensorOperation { - public: - explicit WhitespaceTokenizerOperation(bool with_offsets); - - ~WhitespaceTokenizerOperation() = default; - - std::shared_ptr Build() override; - - Status ValidateParams() override; - - std::string Name() const override { return kWhitespaceTokenizerOperation; } - - private: - bool with_offsets_; -}; -#endif } // namespace text } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/include/transforms.h b/mindspore/ccsrc/minddata/dataset/include/transforms.h index 22455b02e46..5a6c23dbfd3 100644 --- a/mindspore/ccsrc/minddata/dataset/include/transforms.h +++ b/mindspore/ccsrc/minddata/dataset/include/transforms.h @@ -25,40 +25,12 @@ #include "include/api/status.h" #include "minddata/dataset/include/constants.h" -// (TEMPORARY) will be removed when Tensor op ir moved down -#include "minddata/dataset/kernels/ir/tensor_operation.h" - -#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_ -#define INCLUDE_NLOHMANN_JSON_FWD_HPP_ -namespace nlohmann { -template -struct adl_serializer; -template