diff --git a/mindspore/ccsrc/minddata/dataset/api/text.cc b/mindspore/ccsrc/minddata/dataset/api/text.cc
index e763d6fbe59..2fe5c0df459 100644
--- a/mindspore/ccsrc/minddata/dataset/api/text.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/text.cc
@@ -25,6 +25,8 @@
 #include "minddata/dataset/text/kernels/ngram_op.h"
 #ifndef _WIN32
 #include "minddata/dataset/text/kernels/normalize_utf8_op.h"
+#include "minddata/dataset/text/kernels/regex_replace_op.h"
+#include "minddata/dataset/text/kernels/regex_tokenizer_op.h"
 #endif
 #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
 #include "minddata/dataset/text/kernels/sliding_window_op.h"
@@ -80,6 +82,19 @@ std::shared_ptr<TensorOperation> NormalizeUTF8(NormalizeForm normalize_fo
   return op->ValidateParams() ? op : nullptr;
 }
 
+std::shared_ptr<TensorOperation> RegexReplace(std::string pattern, std::string replace, bool replace_all) {
+  auto op = std::make_shared<RegexReplaceOperation>(pattern, replace, replace_all);
+
+  return op->ValidateParams() ? op : nullptr;
+}
+
+std::shared_ptr<TensorOperation> RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern,
+                                                bool with_offsets) {
+  auto op = std::make_shared<RegexTokenizerOperation>(delim_pattern, keep_delim_pattern, with_offsets);
+
+  return op->ValidateParams() ? op : nullptr;
+}
 #endif
 
 std::shared_ptr<TensorOperation> SentencePieceTokenizer(
@@ -276,6 +291,30 @@ std::shared_ptr<TensorOp> NormalizeUTF8Operation::Build() {
   std::shared_ptr<TensorOp> tensor_op = std::make_shared<NormalizeUTF8Op>(normalize_form_);
   return tensor_op;
 }
 
+// RegexReplaceOperation
+RegexReplaceOperation::RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all)
+    : pattern_(pattern), replace_(replace), replace_all_(replace_all) {}
+
+Status RegexReplaceOperation::ValidateParams() { return Status::OK(); }
+
+std::shared_ptr<TensorOp> RegexReplaceOperation::Build() {
+  std::shared_ptr<TensorOp> tensor_op = std::make_shared<RegexReplaceOp>(pattern_, replace_, replace_all_);
+  return tensor_op;
+}
+
+// RegexTokenizerOperation
+RegexTokenizerOperation::RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern,
+                                                 bool with_offsets)
+    : delim_pattern_(delim_pattern), keep_delim_pattern_(keep_delim_pattern), with_offsets_(with_offsets) {}
+
+Status RegexTokenizerOperation::ValidateParams() { return Status::OK(); }
+
+std::shared_ptr<TensorOp> RegexTokenizerOperation::Build() {
+  std::shared_ptr<TensorOp> tensor_op =
+    std::make_shared<RegexTokenizerOp>(delim_pattern_, keep_delim_pattern_, with_offsets_);
+  return tensor_op;
+}
 #endif
 
 // SentencePieceTokenizerOperation
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc
index 2b44015cc6f..1a314858d42 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc
@@ -309,10 +309,10 @@ void DatasetNode::PrintNode(std::ostream &out, int *level) const {
   }
 }
 
-// Add a node as a child, node's parent needs to be nullptr
+// Add a node as a child, node's parent needs to be empty
 // this function will allow child to be a nullptr, in which case it will simply skip
 void DatasetNode::AddChild(std::shared_ptr<DatasetNode> child) {
-  if (child != nullptr && !child->parent_.size()) {
+  if (child != nullptr && child->parent_.empty()) {
     children_.push_back(child);
     child->parent_.push_back(this);
   } else if (child != nullptr) {
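Both new factory functions in text.cc follow the existing pattern in that file: the IR-level *Operation object carries the parameters, ValidateParams() checks them, and the runtime TensorOp is only created later in Build(). Because the factories return nullptr when validation fails, callers can detect bad arguments eagerly. A minimal caller-side sketch of that contract (hypothetical usage, not part of the patch):

    // Hypothetical caller: the factory returns nullptr if ValidateParams() fails,
    // so invalid arguments surface before the pipeline is built.
    std::shared_ptr<TensorOperation> op = text::RegexReplace("\\s+", "_");
    if (op == nullptr) {
      MS_LOG(ERROR) << "RegexReplace parameter validation failed.";
    }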
diff --git a/mindspore/ccsrc/minddata/dataset/include/text.h b/mindspore/ccsrc/minddata/dataset/include/text.h
index 0a437945456..ab6bdf7a360 100644
--- a/mindspore/ccsrc/minddata/dataset/include/text.h
+++ b/mindspore/ccsrc/minddata/dataset/include/text.h
@@ -43,6 +43,8 @@ constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer";
 constexpr char kLookupOperation[] = "Lookup";
 constexpr char kNgramOperation[] = "Ngram";
 constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8";
+constexpr char kRegexReplaceOperation[] = "RegexReplace";
+constexpr char kRegexTokenizerOperation[] = "RegexTokenizer";
 constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer";
 constexpr char kSlidingWindowOperation[] = "SlidingWindow";
 constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
@@ -58,6 +60,8 @@ class LookupOperation;
 class NgramOperation;
 #ifndef _WIN32
 class NormalizeUTF8Operation;
+class RegexReplaceOperation;
+class RegexTokenizerOperation;
 #endif
 class SentencePieceTokenizerOperation;
 class SlidingWindowOperation;
@@ -125,6 +129,24 @@ std::shared_ptr<TensorOperation> Ngram(const std::vector<int32_t> &ngrams,
 /// - NormalizeForm.NFKD, normalize with Normalization Form KD.
 /// \return Shared pointer to the current TensorOperation.
 std::shared_ptr<TensorOperation> NormalizeUTF8(NormalizeForm normalize_form = NormalizeForm::kNfkc);
+
+/// \brief Replace a UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
+/// \param[in] pattern The regex expression pattern.
+/// \param[in] replace The string used to replace a matched element.
+/// \param[in] replace_all Whether to replace all matched elements. If false, only the first matched element is
+///   replaced; if true, all matched elements are replaced (default=true).
+/// \return Shared pointer to the current TensorOperation.
+std::shared_ptr<TensorOperation> RegexReplace(std::string pattern, std::string replace, bool replace_all = true);
+
+/// \brief Tokenize a scalar tensor of UTF-8 string by the regex expression pattern.
+/// \param[in] delim_pattern The pattern of regex delimiters.
+/// \param[in] keep_delim_pattern A string matched by 'delim_pattern' is kept as a token if it can also be
+///   matched by 'keep_delim_pattern'. The default value is an empty string (""),
+///   which means that delimiters will not be kept as output tokens (default="").
+/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
+/// \return Shared pointer to the current TensorOperation.
+std::shared_ptr<TensorOperation> RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "",
+                                                bool with_offsets = false);
 #endif
 
 /// \brief Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
@@ -261,6 +283,42 @@ class NormalizeUTF8Operation : public TensorOperation {
  private:
   NormalizeForm normalize_form_;
 };
 
+class RegexReplaceOperation : public TensorOperation {
+ public:
+  RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all);
+
+  ~RegexReplaceOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;
+
+  Status ValidateParams() override;
+
+  std::string Name() const override { return kRegexReplaceOperation; }
+
+ private:
+  std::string pattern_;
+  std::string replace_;
+  bool replace_all_;
+};
+
+class RegexTokenizerOperation : public TensorOperation {
+ public:
+  explicit RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, bool with_offsets);
+
+  ~RegexTokenizerOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;
+
+  Status ValidateParams() override;
+
+  std::string Name() const override { return kRegexTokenizerOperation; }
+
+ private:
+  std::string delim_pattern_;
+  std::string keep_delim_pattern_;
+  bool with_offsets_;
+};
 #endif
 
 class SentencePieceTokenizerOperation : public TensorOperation {
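To make the new header API concrete, here is a short pipeline sketch assembled from calls that the tests below exercise (TextFile, Map, CreateIterator); the corpus path is a placeholder, and chaining the two new ops in one pipeline is illustrative rather than taken from the patch:

    // Collapse each whitespace run to "_", then tokenize on "_" while keeping
    // the delimiter as its own token (keep_delim_pattern matches the delimiter).
    std::shared_ptr<Dataset> ds = TextFile({"/path/to/corpus.txt"}, 0, ShuffleMode::kFalse);
    std::shared_ptr<TensorOperation> replace_op = text::RegexReplace("\\s+", "_");  // replace_all defaults to true
    std::shared_ptr<TensorOperation> tokenize_op = text::RegexTokenizer("_", "_");  // keep "_" tokens
    ds = ds->Map({replace_op}, {"text"});
    ds = ds->Map({tokenize_op}, {"text"});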
diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py
index af91ec9f148..10cdac774ea 100644
--- a/mindspore/dataset/text/transforms.py
+++ b/mindspore/dataset/text/transforms.py
@@ -479,7 +479,7 @@ if platform.system().lower() != 'windows':
 
     class CaseFold(cde.CaseFoldOp):
         """
-        Apply case fold operation on utf-8 string tensor.
+        Apply case fold operation on UTF-8 string tensor.
 
         Note:
             CaseFold is not supported on Windows platform yet.
@@ -502,7 +502,7 @@ if platform.system().lower() != 'windows':
 
     class NormalizeUTF8(cde.NormalizeUTF8Op):
         """
-        Apply normalize operation on utf-8 string tensor.
+        Apply normalize operation on UTF-8 string tensor.
 
         Note:
             NormalizeUTF8 is not supported on Windows platform yet.
@@ -536,7 +536,7 @@ if platform.system().lower() != 'windows':
 
     class RegexReplace(cde.RegexReplaceOp):
         """
-        Replace utf-8 string tensor with 'replace' according to regular expression 'pattern'.
+        Replace UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
 
         See http://userguide.icu-project.org/strings/regexp for support regex pattern.
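The effect of the replace_all flag added above is easiest to read off the C++ test expectations that follow; as a comment-style worked example (inputs and outputs taken from the tests):

    // RegexReplace("\\s+", "_", true)  : "Welcome to China!" -> "Welcome_to_China!"  (every whitespace run)
    // RegexReplace("\\s+", "_", false) : "Welcome to China!" -> "Welcome_to China!"  (first match only)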
diff --git a/tests/ut/cpp/dataset/c_api_text_test.cc b/tests/ut/cpp/dataset/c_api_text_test.cc
index 522ce7c3a7c..97fa4c5c6d5 100644
--- a/tests/ut/cpp/dataset/c_api_text_test.cc
+++ b/tests/ut/cpp/dataset/c_api_text_test.cc
@@ -691,6 +691,215 @@ TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success3) {
   iter->Stop();
 }
 
+TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess) {
+  // Testing the parameter of the RegexReplace interface when replace_all is true.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create regex_replace operation on ds
+  std::shared_ptr<TensorOperation> regex_replace = text::RegexReplace("\\s+", "_", true);
+  EXPECT_NE(regex_replace, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({regex_replace}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::string> expected = {"Hello_World", "Let's_Go",          "1:hello",        "2:world",
+                                       "31:beijing",  "Welcome_to_China!", "_我_不想_长大_", "Welcome_to_Shenzhen!"};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateScalar(expected[i], &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 8);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess1) {
+  // Testing the parameter of the RegexReplace interface when replace_all is false.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess1.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create regex_replace operation on ds
+  std::shared_ptr<TensorOperation> regex_replace = text::RegexReplace("\\s+", "_", false);
+  EXPECT_NE(regex_replace, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({regex_replace}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::string> expected = {"Hello_World", "Let's_Go",          "1:hello",        "2:world",
+                                       "31:beijing",  "Welcome_to China!", "_我 不想 长大 ", "Welcome_to Shenzhen!"};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateScalar(expected[i], &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 8);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess) {
+  // Testing the parameter of the RegexTokenizer interface when with_offsets is false.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create regex_tokenizer operation on ds
+  std::shared_ptr<TensorOperation> regex_tokenizer = text::RegexTokenizer("\\s+", "\\s+", false);
+  EXPECT_NE(regex_tokenizer, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({regex_tokenizer}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::vector<std::string>> expected = {{"Hello", " ", "World"},
+                                                    {"Let's", " ", "Go"},
+                                                    {"1:hello"},
+                                                    {"2:world"},
+                                                    {"31:beijing"},
+                                                    {"Welcome", " ", "to", " ", "China!"},
+                                                    {" ", "我", " ", "不想", " ", "长大", " "},
+                                                    {"Welcome", " ", "to", " ", "Shenzhen!"}};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    int x = expected[i].size();
+    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 8);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess1) {
+  // Testing the parameter of the RegexTokenizer interface when with_offsets is true.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess1.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create regex_tokenizer operation on ds
+  std::shared_ptr<TensorOperation> regex_tokenizer = text::RegexTokenizer("\\s+", "\\s+", true);
+  EXPECT_NE(regex_tokenizer, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({regex_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
+               {"token", "offsets_start", "offsets_limit"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::vector<std::string>> expected = {{"Hello", " ", "World"},
+                                                    {"Let's", " ", "Go"},
+                                                    {"1:hello"},
+                                                    {"2:world"},
+                                                    {"31:beijing"},
+                                                    {"Welcome", " ", "to", " ", "China!"},
+                                                    {" ", "我", " ", "不想", " ", "长大", " "},
+                                                    {"Welcome", " ", "to", " ", "Shenzhen!"}};
+
+  std::vector<std::vector<uint32_t>> expected_offsets_start = {
+    {0, 5, 6}, {0, 5, 6}, {0}, {0}, {0}, {0, 7, 8, 10, 11}, {0, 2, 5, 6, 12, 14, 20}, {0, 7, 8, 10, 11}};
+  std::vector<std::vector<uint32_t>> expected_offsets_limit = {
+    {5, 6, 11}, {5, 6, 8}, {7}, {7}, {10}, {7, 8, 10, 11, 17}, {2, 5, 6, 12, 14, 20, 21}, {7, 8, 10, 11, 20}};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["offsets_start"];
+    auto ind1 = row["offsets_limit"];
+    auto token = row["token"];
+    std::shared_ptr<Tensor> expected_tensor;
+    std::shared_ptr<Tensor> expected_tensor_offsets_start;
+    std::shared_ptr<Tensor> expected_tensor_offsets_limit;
+    int x = expected[i].size();
+    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
+    Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &expected_tensor_offsets_start);
+    Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &expected_tensor_offsets_limit);
+    EXPECT_EQ(*ind, *expected_tensor_offsets_start);
+    EXPECT_EQ(*ind1, *expected_tensor_offsets_limit);
+    EXPECT_EQ(*token, *expected_tensor);
+
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 8);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
 TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess) {
   // Testing the parameter of UnicodeCharTokenizer interface when the with_offsets is default.
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess.";
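One detail worth noting in TestRegexTokenizerSuccess1: offsets_start and offsets_limit are byte offsets into the UTF-8 string, not character indices, which is why the token "不想" spans [6, 12) in the expectations above (two characters, three bytes each). A tiny standalone sketch of that arithmetic (illustrative, not part of the patch):

    #include <cassert>
    #include <string>

    int main() {
      // UTF-8 encodes these CJK characters in 3 bytes each, so byte length
      // differs from character count; the tokenizer's offsets count bytes.
      std::string s = "不想";
      assert(s.size() == 6);  // 2 characters, 6 bytes
      return 0;
    }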