From 3bea84d0f7e033096a5d01699e25642cd74f154d Mon Sep 17 00:00:00 2001
From: shenwei41
Date: Mon, 30 Nov 2020 14:31:18 +0800
Subject: [PATCH] add four new text APIs

---
 mindspore/ccsrc/minddata/dataset/api/text.cc  |  80 +++
 .../ccsrc/minddata/dataset/core/constants.h   |   9 +
 .../ccsrc/minddata/dataset/include/text.h     | 113 +++-
 .../dataset/text/kernels/normalize_utf8_op.h  |   7 -
 tests/ut/cpp/dataset/c_api_text_test.cc       | 551 ++++++++++++++++++
 5 files changed, 752 insertions(+), 8 deletions(-)

diff --git a/mindspore/ccsrc/minddata/dataset/api/text.cc b/mindspore/ccsrc/minddata/dataset/api/text.cc
index 855bf1b39fd..e763d6fbe59 100644
--- a/mindspore/ccsrc/minddata/dataset/api/text.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/text.cc
@@ -17,12 +17,20 @@
 #include
 #include "minddata/dataset/include/text.h"

+#ifndef _WIN32
+#include "minddata/dataset/text/kernels/case_fold_op.h"
+#endif
 #include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
 #include "minddata/dataset/text/kernels/lookup_op.h"
 #include "minddata/dataset/text/kernels/ngram_op.h"
+#ifndef _WIN32
+#include "minddata/dataset/text/kernels/normalize_utf8_op.h"
+#endif
 #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
 #include "minddata/dataset/text/kernels/sliding_window_op.h"
+#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
 #ifndef _WIN32
+#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
 #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
 #endif
 #include "minddata/dataset/util/path.h"
@@ -36,6 +44,14 @@ namespace mindspore {
 namespace dataset {
 namespace text {

 // FUNCTIONS TO CREATE TEXT OPERATIONS
 // (In alphabetical order)

+#ifndef _WIN32
+std::shared_ptr<CaseFoldOperation> CaseFold() {
+  auto op = std::make_shared<CaseFoldOperation>();
+
+  return op->ValidateParams() ? op : nullptr;
+}
+#endif
+
 std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
                                                         const JiebaMode &mode, bool with_offsets) {
   auto op = std::make_shared<JiebaTokenizerOperation>(hmm_path, mp_path, mode, with_offsets);
@@ -58,6 +74,14 @@ std::shared_ptr<NgramOperation> Ngram(const std::vector<std::string> &ngrams,
   return op->ValidateParams() ? op : nullptr;
 }

+#ifndef _WIN32
+std::shared_ptr<NormalizeUTF8Operation> NormalizeUTF8(NormalizeForm normalize_form) {
+  auto op = std::make_shared<NormalizeUTF8Operation>(normalize_form);
+
+  return op->ValidateParams() ? op : nullptr;
+}
+#endif
+
 std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
   const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type) {
   auto op = std::make_shared<SentencePieceTokenizerOperation>(vocab, out_type);
@@ -78,7 +102,19 @@ std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const
   return op->ValidateParams() ? op : nullptr;
 }

+std::shared_ptr<UnicodeCharTokenizerOperation> UnicodeCharTokenizer(bool with_offsets) {
+  auto op = std::make_shared<UnicodeCharTokenizerOperation>(with_offsets);
+
+  return op->ValidateParams() ? op : nullptr;
+}
+
 #ifndef _WIN32
+std::shared_ptr<UnicodeScriptTokenizerOperation> UnicodeScriptTokenizer(bool keep_whitespace, bool with_offsets) {
+  auto op = std::make_shared<UnicodeScriptTokenizerOperation>(keep_whitespace, with_offsets);
+
+  return op->ValidateParams() ? op : nullptr;
+}
+
 std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets) {
   auto op = std::make_shared<WhitespaceTokenizerOperation>(with_offsets);

@@ -116,6 +152,16 @@ Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::s
 // (In alphabetical order)

+#ifndef _WIN32
+// CaseFoldOperation
+Status CaseFoldOperation::ValidateParams() { return Status::OK(); }
+
+std::shared_ptr<TensorOp> CaseFoldOperation::Build() {
+  std::shared_ptr<CaseFoldOp> tensor_op = std::make_shared<CaseFoldOp>();
+  return tensor_op;
+}
+#endif
+
 // JiebaTokenizerOperation
 JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path,
                                                  const JiebaMode &mode, bool with_offsets)
@@ -220,6 +266,18 @@ std::shared_ptr<TensorOp> NgramOperation::Build() {
   return tensor_op;
 }

+#ifndef _WIN32
+// NormalizeUTF8Operation
+NormalizeUTF8Operation::NormalizeUTF8Operation(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}
+
+Status NormalizeUTF8Operation::ValidateParams() { return Status::OK(); }
+
+std::shared_ptr<TensorOp> NormalizeUTF8Operation::Build() {
+  std::shared_ptr<NormalizeUTF8Op> tensor_op = std::make_shared<NormalizeUTF8Op>(normalize_form_);
+  return tensor_op;
+}
+#endif
+
 // SentencePieceTokenizerOperation
 SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
                                                                  SPieceTokenizerOutType out_type)
@@ -283,7 +341,29 @@ std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
   return tensor_op;
 }

+// UnicodeCharTokenizerOperation
+UnicodeCharTokenizerOperation::UnicodeCharTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}
+
+Status UnicodeCharTokenizerOperation::ValidateParams() { return Status::OK(); }
+
+std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() {
+  std::shared_ptr<UnicodeCharTokenizerOp> tensor_op = std::make_shared<UnicodeCharTokenizerOp>(with_offsets_);
+  return tensor_op;
+}
+
 #ifndef _WIN32
+// UnicodeScriptTokenizerOperation
+UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets)
+    : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
+
+Status UnicodeScriptTokenizerOperation::ValidateParams() { return Status::OK(); }
+
+std::shared_ptr<TensorOp> UnicodeScriptTokenizerOperation::Build() {
+  std::shared_ptr<UnicodeScriptTokenizerOp> tensor_op =
+    std::make_shared<UnicodeScriptTokenizerOp>(keep_whitespace_, with_offsets_);
+  return tensor_op;
+}
+
 // WhitespaceTokenizerOperation
 WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}

diff --git a/mindspore/ccsrc/minddata/dataset/core/constants.h b/mindspore/ccsrc/minddata/dataset/core/constants.h
index d8cb5ad6dc7..0e03df5c509 100644
--- a/mindspore/ccsrc/minddata/dataset/core/constants.h
+++ b/mindspore/ccsrc/minddata/dataset/core/constants.h
@@ -59,6 +59,15 @@ enum class SPieceTokenizerOutType { kString = 0, kInt = 1 };
 // Possible values for SPieceTokenizerLoadType
 enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 };

+// Possible values for NormalizeForm
+enum class NormalizeForm {
+  kNone = 0,
+  kNfc,
+  kNfkc,
+  kNfd,
+  kNfkd,
+};
+
 // convenience functions for 32bit int bitmask
 inline bool BitTest(uint32_t bits, uint32_t bitMask) { return (bits & bitMask) == bitMask; }
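[Editor's sketch, not part of the patch.] The factories above all follow one pattern: construct the operation, then hand it out only if ValidateParams() succeeds, so a caller tests for nullptr instead of handling an exception. A minimal caller-side sketch, assuming a non-Windows build and the include paths of this source tree:

    #include <memory>

    #include "minddata/dataset/include/text.h"

    namespace ds = mindspore::dataset;

    int main() {
      // NormalizeForm now lives in core/constants.h, so the public header can
      // use it without dragging in the kernel header normalize_utf8_op.h.
      std::shared_ptr<ds::text::NormalizeUTF8Operation> op =
          ds::text::NormalizeUTF8(ds::NormalizeForm::kNfc);
      return op == nullptr ? 1 : 0;  // nullptr signals failed validation
    }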
diff --git a/mindspore/ccsrc/minddata/dataset/include/text.h b/mindspore/ccsrc/minddata/dataset/include/text.h
index 2f43ab8cf4c..0a437945456 100644
--- a/mindspore/ccsrc/minddata/dataset/include/text.h
+++ b/mindspore/ccsrc/minddata/dataset/include/text.h
@@ -38,23 +38,41 @@ namespace dataset {
 namespace text {

 // Char arrays storing name of corresponding classes (in alphabetical order)
+constexpr char kCaseFoldOperation[] = "CaseFold";
 constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer";
 constexpr char kLookupOperation[] = "Lookup";
 constexpr char kNgramOperation[] = "Ngram";
+constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8";
 constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer";
 constexpr char kSlidingWindowOperation[] = "SlidingWindow";
+constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
+constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer";
 constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer";

 // Text Op classes (in alphabetical order)
+#ifndef _WIN32
+class CaseFoldOperation;
+#endif
 class JiebaTokenizerOperation;
 class LookupOperation;
 class NgramOperation;
+#ifndef _WIN32
+class NormalizeUTF8Operation;
+#endif
 class SentencePieceTokenizerOperation;
 class SlidingWindowOperation;
+class UnicodeCharTokenizerOperation;
 #ifndef _WIN32
+class UnicodeScriptTokenizerOperation;
 class WhitespaceTokenizerOperation;
 #endif

+#ifndef _WIN32
+/// \brief Apply case fold operation on a UTF-8 string tensor.
+/// \return Shared pointer to the current TensorOperation.
+std::shared_ptr<CaseFoldOperation> CaseFold();
+#endif
+
 /// \brief Tokenize Chinese string into words based on dictionary.
 /// \param[in] hmm_path Dictionary file is used by HMMSegment algorithm. The dictionary can be obtained on the
 ///   official website of cppjieba.
@@ -94,6 +112,21 @@ std::shared_ptr<NgramOperation> Ngram(const std::vector<std::string> &ngrams,
                                       const std::pair<std::string, int32_t> &right_pad = {"", 0},
                                       const std::string &separator = " ");

+#ifndef _WIN32
+/// \brief Apply normalize operation on a UTF-8 string tensor.
+/// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone, NormalizeForm::kNfc,
+///   NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd] (default=NormalizeForm::kNfkc).
+///   See http://unicode.org/reports/tr15/ for details.
+///   - NormalizeForm::kNone, do nothing for the input string tensor.
+///   - NormalizeForm::kNfc, normalize with Normalization Form C.
+///   - NormalizeForm::kNfkc, normalize with Normalization Form KC.
+///   - NormalizeForm::kNfd, normalize with Normalization Form D.
+///   - NormalizeForm::kNfkd, normalize with Normalization Form KD.
+/// \return Shared pointer to the current TensorOperation.
+std::shared_ptr<NormalizeUTF8Operation> NormalizeUTF8(NormalizeForm normalize_form = NormalizeForm::kNfkc);
+#endif
+
 /// \brief Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
 /// \param[in] vocab a SentencePieceVocab object.
 /// \param[in] out_type The type of output.
@@ -116,8 +149,20 @@ std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
 /// \return Shared pointer to the current TensorOperation.
 std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const int32_t axis = 0);

+/// \brief Tokenize a scalar tensor of UTF-8 string to Unicode characters.
+/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
+/// \return Shared pointer to the current TensorOperation.
+std::shared_ptr<UnicodeCharTokenizerOperation> UnicodeCharTokenizer(bool with_offsets = false);
+
 #ifndef _WIN32
+/// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
+/// \param[in] keep_whitespace Whether or not to emit whitespace tokens (default=false).
+/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
+/// \return Shared pointer to the current TensorOperation.
+std::shared_ptr<UnicodeScriptTokenizerOperation> UnicodeScriptTokenizer(bool keep_whitespace = false,
+                                                                        bool with_offsets = false);
+
-/// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces
+/// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces.
 /// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
 /// \return Shared pointer to the current TensorOperation.
 std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets = false);
@@ -125,6 +170,21 @@ std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offs

 /* ####################################### Derived TensorOperation classes ################################# */

+#ifndef _WIN32
+class CaseFoldOperation : public TensorOperation {
+ public:
+  CaseFoldOperation() = default;
+
+  ~CaseFoldOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;
+
+  Status ValidateParams() override;
+
+  std::string Name() const override { return kCaseFoldOperation; }
+};
+#endif
+
 class JiebaTokenizerOperation : public TensorOperation {
  public:
   explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode,
@@ -185,6 +245,24 @@ class NgramOperation : public TensorOperation {
   std::string separator_;
 };

+#ifndef _WIN32
+class NormalizeUTF8Operation : public TensorOperation {
+ public:
+  explicit NormalizeUTF8Operation(NormalizeForm normalize_form);
+
+  ~NormalizeUTF8Operation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;
+
+  Status ValidateParams() override;
+
+  std::string Name() const override { return kNormalizeUTF8Operation; }
+
+ private:
+  NormalizeForm normalize_form_;
+};
+#endif
+
 class SentencePieceTokenizerOperation : public TensorOperation {
  public:
   SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type);
@@ -223,7 +301,40 @@ class SlidingWindowOperation : public TensorOperation {
   int32_t axis_;
 };

+class UnicodeCharTokenizerOperation : public TensorOperation {
+ public:
+  explicit UnicodeCharTokenizerOperation(bool with_offsets);
+
+  ~UnicodeCharTokenizerOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;
+
+  Status ValidateParams() override;
+
+  std::string Name() const override { return kUnicodeCharTokenizerOperation; }
+
+ private:
+  bool with_offsets_;
+};
+
 #ifndef _WIN32
+class UnicodeScriptTokenizerOperation : public TensorOperation {
+ public:
+  explicit UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets);
+
+  ~UnicodeScriptTokenizerOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;
+
+  Status ValidateParams() override;
+
+  std::string Name() const override { return kUnicodeScriptTokenizerOperation; }
+
+ private:
+  bool keep_whitespace_;
+  bool with_offsets_;
+};
+
 class WhitespaceTokenizerOperation : public TensorOperation {
  public:
   explicit WhitespaceTokenizerOperation(bool with_offsets);
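[Editor's sketch, not part of the patch.] The header above is the entire user-facing surface of the four new ops. A usage sketch that mirrors the unit tests later in this patch; "corpus.txt" is a placeholder path and the datasets.h include path is assumed from this source tree:

    #include <memory>

    #include "minddata/dataset/include/datasets.h"
    #include "minddata/dataset/include/text.h"

    using namespace mindspore::dataset;

    std::shared_ptr<Dataset> BuildTokenizerPipeline() {
      std::shared_ptr<Dataset> ds = TextFile({"corpus.txt"}, 0, ShuffleMode::kFalse);
    #ifndef _WIN32
      // CaseFold and NormalizeUTF8 are compiled out on Windows, hence the guard.
      ds = ds->Map({text::CaseFold(), text::NormalizeUTF8(NormalizeForm::kNfkc)}, {"text"});
    #endif
      // Tokenize into Unicode characters and also emit byte offsets.
      return ds->Map({text::UnicodeCharTokenizer(true)}, {"text"},
                     {"token", "offsets_start", "offsets_limit"},
                     {"token", "offsets_start", "offsets_limit"});
    }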
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/normalize_utf8_op.h b/mindspore/ccsrc/minddata/dataset/text/kernels/normalize_utf8_op.h
index 66b630adb11..d05566bc7a6 100644
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/normalize_utf8_op.h
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/normalize_utf8_op.h
@@ -24,13 +24,6 @@
 namespace mindspore {
 namespace dataset {
-enum class NormalizeForm {
-  kNone = 0,
-  kNfc,
-  kNfkc,
-  kNfd,
-  kNfkd,
-};

 class NormalizeUTF8Op : public TensorOp {
  public:
diff --git a/tests/ut/cpp/dataset/c_api_text_test.cc b/tests/ut/cpp/dataset/c_api_text_test.cc
index 5a52bf0a22e..522ce7c3a7c 100644
--- a/tests/ut/cpp/dataset/c_api_text_test.cc
+++ b/tests/ut/cpp/dataset/c_api_text_test.cc
@@ -34,6 +34,49 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
  protected:
 };

+TEST_F(MindDataTestPipeline, TestCaseFoldSuccess) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaseFoldSuccess.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create casefold operation on ds
+  std::shared_ptr<TensorOperation> casefold = text::CaseFold();
+  EXPECT_NE(casefold, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({casefold}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::string> expected = {"welcome to beijing!", "北京欢迎您!", "我喜欢english!", "  "};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateScalar(expected[i], &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 4);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
 TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess) {
   // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is false.
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess.";
@@ -472,6 +515,514 @@ TEST_F(MindDataTestPipeline, TestTextOperationName) {
   EXPECT_EQ(correct_name, sentence_piece_tokenizer_op->Name());
 }

+TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success) {
+  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfkc.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create normalizeutf8 operation on ds
+  std::shared_ptr<TensorOperation> normalizeutf8 = text::NormalizeUTF8(NormalizeForm::kNfkc);
+  EXPECT_NE(normalizeutf8, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({normalizeutf8}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateScalar(expected[i], &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 6);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
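[Editor's aside, not part of the patch.] The expected strings in this test encode what NFKC does: compatibility characters collapse to plain forms, e.g. "2⁵" becomes "25" and the "fi" ligature becomes "fi". The sketch below reproduces that with ICU4C's Normalizer2 directly, on the assumption that the kernel's normalization agrees with ICU:

    #include <string>

    #include <unicode/normalizer2.h>
    #include <unicode/unistr.h>

    // Returns true if NFKC folds the superscript form "2⁵" into plain "25".
    bool NfkcFoldsSuperscript() {
      UErrorCode status = U_ZERO_ERROR;
      const icu::Normalizer2 *nfkc = icu::Normalizer2::getNFKCInstance(status);
      if (U_FAILURE(status)) {
        return false;
      }
      icu::UnicodeString normalized = nfkc->normalize(icu::UnicodeString::fromUTF8("2⁵"), status);
      std::string utf8;
      normalized.toUTF8String(utf8);
      return U_SUCCESS(status) && utf8 == "25";
    }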
+
+TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success1) {
+  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfc.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success1.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create normalizeutf8 operation on ds
+  std::shared_ptr<TensorOperation> normalizeutf8 = text::NormalizeUTF8(NormalizeForm::kNfc);
+  EXPECT_NE(normalizeutf8, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({normalizeutf8}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateScalar(expected[i], &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 6);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success2) {
+  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfd.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success2.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create normalizeutf8 operation on ds
+  std::shared_ptr<TensorOperation> normalizeutf8 = text::NormalizeUTF8(NormalizeForm::kNfd);
+  EXPECT_NE(normalizeutf8, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({normalizeutf8}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateScalar(expected[i], &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 6);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success3) {
+  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfkd.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success3.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create normalizeutf8 operation on ds
+  std::shared_ptr<TensorOperation> normalizeutf8 = text::NormalizeUTF8(NormalizeForm::kNfkd);
+  EXPECT_NE(normalizeutf8, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({normalizeutf8}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateScalar(expected[i], &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 6);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess) {
+  // Testing the parameter of UnicodeCharTokenizer interface when with_offsets is default.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create unicodechar_tokenizer operation on ds
+  std::shared_ptr<TensorOperation> unicodechar_tokenizer = text::UnicodeCharTokenizer();
+  EXPECT_NE(unicodechar_tokenizer, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({unicodechar_tokenizer}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::vector<std::string>> expected = {
+    {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
+    {"北", "京", "欢", "迎", "您", "!"},
+    {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
+    {" ", " "}};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    int x = expected[i].size();
+    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 4);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
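[Editor's aside, not part of the patch.] The offsets checked in the next test are byte positions into the UTF-8 line, not character indices, which is why the CJK rows advance in steps of three. A standalone check of that arithmetic:

    #include <cassert>
    #include <string>

    int main() {
      // Each CJK character occupies 3 bytes in UTF-8, so character tokens over
      // "北京欢迎您" start at bytes 0, 3, 6, 9, 12 — matching the
      // expected_offsets_start rows below.
      assert(std::string("北").size() == 3);
      assert(std::string("北京欢迎您").size() == 15);
      return 0;
    }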
+
+TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess1) {
+  // Testing the parameter of UnicodeCharTokenizer interface when with_offsets is true.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess1.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create unicodechar_tokenizer operation on ds
+  std::shared_ptr<TensorOperation> unicodechar_tokenizer = text::UnicodeCharTokenizer(true);
+  EXPECT_NE(unicodechar_tokenizer, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({unicodechar_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
+               {"token", "offsets_start", "offsets_limit"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::vector<std::string>> expected = {
+    {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
+    {"北", "京", "欢", "迎", "您", "!"},
+    {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
+    {" ", " "}};
+
+  std::vector<std::vector<uint32_t>> expected_offsets_start = {
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
+    {0, 3, 6, 9, 12, 15},
+    {0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16},
+    {0, 1}};
+  std::vector<std::vector<uint32_t>> expected_offsets_limit = {
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
+    {3, 6, 9, 12, 15, 18},
+    {3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17},
+    {1, 2}};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["offsets_start"];
+    auto ind1 = row["offsets_limit"];
+    auto token = row["token"];
+    std::shared_ptr<Tensor> expected_tensor;
+    std::shared_ptr<Tensor> expected_tensor_offsets_start;
+    std::shared_ptr<Tensor> expected_tensor_offsets_limit;
+    int x = expected[i].size();
+    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
+    Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &expected_tensor_offsets_start);
+    Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &expected_tensor_offsets_limit);
+    EXPECT_EQ(*ind, *expected_tensor_offsets_start);
+    EXPECT_EQ(*ind1, *expected_tensor_offsets_limit);
+    EXPECT_EQ(*token, *expected_tensor);
+
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 4);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) {
+  // Testing the parameter of UnicodeScriptTokenizer interface when keep_whitespace and with_offsets are default.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create unicodescript_tokenizer operation on ds
+  std::shared_ptr<TensorOperation> unicodescript_tokenizer = text::UnicodeScriptTokenizer();
+  EXPECT_NE(unicodescript_tokenizer, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({unicodescript_tokenizer}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::vector<std::string>> expected = {
+    {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    int x = expected[i].size();
+    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 4);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess1) {
+  // Testing the parameter of UnicodeScriptTokenizer interface when keep_whitespace is true and with_offsets is
+  // false.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess1.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create unicodescript_tokenizer operation on ds
+  std::shared_ptr<TensorOperation> unicodescript_tokenizer = text::UnicodeScriptTokenizer(true);
+  EXPECT_NE(unicodescript_tokenizer, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({unicodescript_tokenizer}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::vector<std::string>> expected = {
+    {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {"  "}};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    int x = expected[i].size();
+    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 4);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess2) {
+  // Testing the parameter of UnicodeScriptTokenizer interface when keep_whitespace is false and with_offsets is
+  // true.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess2.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create unicodescript_tokenizer operation on ds
+  std::shared_ptr<TensorOperation> unicodescript_tokenizer = text::UnicodeScriptTokenizer(false, true);
+  EXPECT_NE(unicodescript_tokenizer, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
+               {"token", "offsets_start", "offsets_limit"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::vector<std::string>> expected = {
+    {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};
+
+  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
+  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 18, 19}, {15, 18}, {9, 16, 17}, {0}};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["offsets_start"];
+    auto ind1 = row["offsets_limit"];
+    auto token = row["token"];
+    std::shared_ptr<Tensor> expected_tensor;
+    std::shared_ptr<Tensor> expected_tensor_offsets_start;
+    std::shared_ptr<Tensor> expected_tensor_offsets_limit;
+    int x = expected[i].size();
+    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
+    Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &expected_tensor_offsets_start);
+    Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &expected_tensor_offsets_limit);
+    EXPECT_EQ(*ind, *expected_tensor_offsets_start);
+    EXPECT_EQ(*ind1, *expected_tensor_offsets_limit);
+    EXPECT_EQ(*token, *expected_tensor);
+
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 4);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess3) {
+  // Testing the parameter of UnicodeScriptTokenizer interface when keep_whitespace is true and with_offsets is
+  // true.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess3.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create unicodescript_tokenizer operation on ds
+  std::shared_ptr<TensorOperation> unicodescript_tokenizer = text::UnicodeScriptTokenizer(true, true);
+  EXPECT_NE(unicodescript_tokenizer, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
+               {"token", "offsets_start", "offsets_limit"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::vector<std::string>> expected = {
+    {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {"  "}};
+
+  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 7, 8, 10, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
+  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 8, 10, 11, 18, 19}, {15, 18}, {9, 16, 17}, {2}};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["offsets_start"];
+    auto ind1 = row["offsets_limit"];
+    auto token = row["token"];
+    std::shared_ptr<Tensor> expected_tensor;
+    std::shared_ptr<Tensor> expected_tensor_offsets_start;
+    std::shared_ptr<Tensor> expected_tensor_offsets_limit;
+    int x = expected[i].size();
+    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
+    Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &expected_tensor_offsets_start);
+    Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &expected_tensor_offsets_limit);
+    EXPECT_EQ(*ind, *expected_tensor_offsets_start);
+    EXPECT_EQ(*ind1, *expected_tensor_offsets_limit);
+    EXPECT_EQ(*token, *expected_tensor);
+
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 4);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
 TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess) {
   // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is default.
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess.";
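[Editor's sketch, not part of the patch.] An end-to-end sketch chaining the new APIs on a non-Windows build, condensing the test pattern above into a single pipeline; the input path is a placeholder and include paths are assumed from this source tree:

    #include <memory>
    #include <string>
    #include <unordered_map>

    #include "minddata/dataset/include/datasets.h"
    #include "minddata/dataset/include/text.h"

    using namespace mindspore::dataset;

    int main() {
      std::shared_ptr<Dataset> ds = TextFile({"corpus.txt"}, 0, ShuffleMode::kFalse);
      ds = ds->Map({text::CaseFold()}, {"text"});
      ds = ds->Map({text::NormalizeUTF8(NormalizeForm::kNfkc)}, {"text"});
      // Split on script boundaries, dropping whitespace tokens.
      ds = ds->Map({text::UnicodeScriptTokenizer(false, false)}, {"text"});

      std::shared_ptr<Iterator> iter = ds->CreateIterator();
      std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
      iter->GetNextRow(&row);
      while (!row.empty()) {
        iter->GetNextRow(&row);  // row["text"] holds the token tensor per line
      }
      iter->Stop();
      return 0;
    }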