From 4e56618d18f0637956ea5c79fb3234c331df7f47 Mon Sep 17 00:00:00 2001 From: shenwei41 Date: Wed, 25 Nov 2020 12:05:46 +0800 Subject: [PATCH] Add text C++ API --- mindspore/ccsrc/minddata/dataset/api/text.cc | 167 ++++++ .../ccsrc/minddata/dataset/core/constants.h | 9 + .../ccsrc/minddata/dataset/include/text.h | 119 ++++ .../dataset/text/kernels/jieba_tokenizer_op.h | 3 +- .../kernels/sentence_piece_tokenizer_op.h | 3 +- .../c_api_text_sentence_piece_vocab_test.cc | 1 + tests/ut/cpp/dataset/c_api_text_test.cc | 567 ++++++++++++++++++ 7 files changed, 865 insertions(+), 4 deletions(-) create mode 100644 tests/ut/cpp/dataset/c_api_text_test.cc diff --git a/mindspore/ccsrc/minddata/dataset/api/text.cc b/mindspore/ccsrc/minddata/dataset/api/text.cc index 9fed10e30b6..855bf1b39fd 100644 --- a/mindspore/ccsrc/minddata/dataset/api/text.cc +++ b/mindspore/ccsrc/minddata/dataset/api/text.cc @@ -15,9 +15,16 @@ */ #include + #include "minddata/dataset/include/text.h" +#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h" #include "minddata/dataset/text/kernels/lookup_op.h" +#include "minddata/dataset/text/kernels/ngram_op.h" #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h" +#include "minddata/dataset/text/kernels/sliding_window_op.h" +#ifndef _WIN32 +#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" +#endif #include "minddata/dataset/util/path.h" namespace mindspore { @@ -29,6 +36,13 @@ namespace text { // FUNCTIONS TO CREATE TEXT OPERATIONS // (In alphabetical order) +std::shared_ptr JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path, + const JiebaMode &mode, bool with_offsets) { + auto op = std::make_shared(hmm_path, mp_path, mode, with_offsets); + + return op->ValidateParams() ? 
op : nullptr; +} + std::shared_ptr Lookup(const std::shared_ptr &vocab, const std::string &unknown_token, const DataType &data_type) { auto op = std::make_shared(vocab, unknown_token, data_type); @@ -36,6 +50,14 @@ std::shared_ptr Lookup(const std::shared_ptr &vocab, con return op->ValidateParams() ? op : nullptr; } +std::shared_ptr Ngram(const std::vector &ngrams, + const std::pair &left_pad, + const std::pair &right_pad, const std::string &separator) { + auto op = std::make_shared(ngrams, left_pad, right_pad, separator); + + return op->ValidateParams() ? op : nullptr; +} + std::shared_ptr SentencePieceTokenizer( const std::shared_ptr &vocab, SPieceTokenizerOutType out_type) { auto op = std::make_shared(vocab, out_type); @@ -50,12 +72,79 @@ std::shared_ptr SentencePieceTokenizer(const st return op->ValidateParams() ? op : nullptr; } +std::shared_ptr SlidingWindow(const int32_t width, const int32_t axis) { + auto op = std::make_shared(width, axis); + + return op->ValidateParams() ? op : nullptr; +} + +#ifndef _WIN32 +std::shared_ptr WhitespaceTokenizer(bool with_offsets) { + auto op = std::make_shared(with_offsets); + + return op->ValidateParams() ? 
op : nullptr; +} +#endif + /* ####################################### Validator Functions ############################################ */ +// Helper function to validate tokenizer directory parameter +Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file) { + if (tokenizer_file.empty()) { + std::string err_msg = tokenizer_name + ": tokenizer_file is not specified."; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + + Path file(tokenizer_file); + if (!file.Exists()) { + std::string err_msg = tokenizer_name + ": tokenizer_file: [" + tokenizer_file + "] is an invalid directory path."; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + + if (access(tokenizer_file.c_str(), R_OK) == -1) { + std::string err_msg = tokenizer_name + ": No access to specified tokenizer path: " + tokenizer_file; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + + return Status::OK(); +} + /* ####################################### Derived TensorOperation classes ################################# */ // (In alphabetical order) +// JiebaTokenizerOperation +JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, + const JiebaMode &mode, bool with_offsets) + : hmm_path_(hmm_path), mp_path_(mp_path), mode_(mode), with_offsets_(with_offsets) {} + +Status JiebaTokenizerOperation::ValidateParams() { + if (hmm_path_.empty()) { + std::string err_msg = "JiebaTokenizer: The dict of HMMSegment in cppjieba is not provided."; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + + if (mp_path_.empty()) { + std::string err_msg = "JiebaTokenizer: The dict of MPSegment in cppjieba is not provided."; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + + RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_)); + RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", mp_path_)); + return 
Status::OK(); +} + +std::shared_ptr JiebaTokenizerOperation::Build() { + std::shared_ptr tensor_op = + std::make_shared(hmm_path_, mp_path_, mode_, with_offsets_); + return tensor_op; +} + // LookupOperation LookupOperation::LookupOperation(const std::shared_ptr &vocab, const std::string &unknown_token, const DataType &data_type) @@ -83,6 +172,54 @@ std::shared_ptr LookupOperation::Build() { return tensor_op; } +// NgramOperation +NgramOperation::NgramOperation(const std::vector &ngrams, const std::pair &left_pad, + const std::pair &right_pad, const std::string &separator) + : ngrams_(ngrams), left_pad_(left_pad), right_pad_(right_pad), separator_(separator) {} + +Status NgramOperation::ValidateParams() { + if (ngrams_.size() == 0) { + std::string err_msg = "Ngram : Container cannot be empty."; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } else { + for (int32_t i = 0; i < ngrams_.size(); ++i) { + if (ngrams_[i] <= 0) { + std::string err_msg = + "Ngram : The value of ngrams vector must be greater than 0: " + std::to_string(ngrams_[i]); + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + } + } + + if (left_pad_.second < 0) { + std::string err_msg = + "Ngram : The second parameter pad_width in left_pad vector must be greater than or equal to 0: " + + std::to_string(left_pad_.second); + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + + if (right_pad_.second < 0) { + std::string err_msg = + "Ngram : The second parameter pad_width in right_pad vector must be greater than or equal to 0: " + + std::to_string(right_pad_.second); + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + return Status::OK(); +} + +std::shared_ptr NgramOperation::Build() { + int32_t l_len = left_pad_.second; + int32_t r_len = right_pad_.second; + std::string l_pad = left_pad_.first; + std::string r_pad = right_pad_.first; + std::shared_ptr tensor_op = std::make_shared(ngrams_, l_len, r_len, l_pad, r_pad, 
separator_); + return tensor_op; +} + // SentencePieceTokenizerOperation SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr &vocab, SPieceTokenizerOutType out_type) @@ -128,6 +265,36 @@ std::shared_ptr SentencePieceTokenizerOperation::Build() { return tensor_op; } +// SlidingWindowOperation +SlidingWindowOperation::SlidingWindowOperation(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {} + +Status SlidingWindowOperation::ValidateParams() { + if (width_ < 1) { + std::string err_msg = + "SlidingWindow : The parameter width must be greater than or equal to 1: " + std::to_string(width_); + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + return Status::OK(); +} + +std::shared_ptr SlidingWindowOperation::Build() { + std::shared_ptr tensor_op = std::make_shared(static_cast(width_), axis_); + return tensor_op; +} + +#ifndef _WIN32 +// WhitespaceTokenizerOperation +WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {} + +Status WhitespaceTokenizerOperation::ValidateParams() { return Status::OK(); } + +std::shared_ptr WhitespaceTokenizerOperation::Build() { + std::shared_ptr tensor_op = std::make_shared(with_offsets_); + return tensor_op; +} +#endif + } // namespace text } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/core/constants.h b/mindspore/ccsrc/minddata/dataset/core/constants.h index 87fb9b455ec..d8cb5ad6dc7 100644 --- a/mindspore/ccsrc/minddata/dataset/core/constants.h +++ b/mindspore/ccsrc/minddata/dataset/core/constants.h @@ -50,6 +50,15 @@ enum class ImageFormat { HWC = 0, CHW = 1, HW = 2 }; // Possible interpolation modes enum class InterpolationMode { kLinear = 0, kNearestNeighbour = 1, kCubic = 2, kArea = 3 }; +// Possible JiebaMode modes +enum class JiebaMode { kMix = 0, kMp = 1, kHmm = 2 }; + +// Possible values for SPieceTokenizerOutType +enum class SPieceTokenizerOutType { 
kString = 0, kInt = 1 }; + +// Possible values for SPieceTokenizerLoadType +enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 }; + // convenience functions for 32bit int bitmask inline bool BitTest(uint32_t bits, uint32_t bitMask) { return (bits & bitMask) == bitMask; } diff --git a/mindspore/ccsrc/minddata/dataset/include/text.h b/mindspore/ccsrc/minddata/dataset/include/text.h index a03cba422f7..d492334b6bf 100644 --- a/mindspore/ccsrc/minddata/dataset/include/text.h +++ b/mindspore/ccsrc/minddata/dataset/include/text.h @@ -19,6 +19,7 @@ #include #include +#include #include #include "mindspore/ccsrc/minddata/dataset/core/data_type.h" @@ -37,8 +38,29 @@ namespace dataset { namespace text { // Text Op classes (in alphabetical order) +class JiebaTokenizerOperation; class LookupOperation; +class NgramOperation; class SentencePieceTokenizerOperation; +class SlidingWindowOperation; +#ifndef _WIN32 +class WhitespaceTokenizerOperation; +#endif + +/// \brief Tokenize Chinese string into words based on dictionary. +/// \param[in] hmm_path Dictionary file is used by HMMSegment algorithm. The dictionary can be obtained on the +/// official website of cppjieba. +/// \param[in] mp_path Dictionary file is used by MPSegment algorithm. The dictionary can be obtained on the +/// official website of cppjieba. +/// \param[in] mode Valid values can be any of [JiebaMode.MP, JiebaMode.HMM, JiebaMode.MIX](default=JiebaMode.MIX). +/// - JiebaMode.MP, tokenize with MPSegment algorithm. +/// - JiebaMode.HMM, tokenize with Hiddel Markov Model Segment algorithm. +/// - JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm. +/// \param[in] with_offsets If or not output offsets of tokens (default=false). +/// \return Shared pointer to the current TensorOperation. 
+std::shared_ptr JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path, + const JiebaMode &mode = JiebaMode::kMix, + bool with_offsets = false); /// \brief Lookup operator that looks up a word to an id. /// \param[in] vocab a Vocab object. @@ -49,6 +71,21 @@ class SentencePieceTokenizerOperation; std::shared_ptr Lookup(const std::shared_ptr &vocab, const std::string &unknown_token, const mindspore::dataset::DataType &data_type = DataType("int32")); +/// \brief TensorOp to generate n-gram from a 1-D string Tensor. +/// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result +/// would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up +/// for a n-gram, an empty string will be returned. +/// \param[in] left_pad {"pad_token", pad_width}. Padding performed on left side of the sequence. pad_width will +/// be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__" (default={"", 0}}). +/// \param[in] right_pad {"pad_token", pad_width}. Padding performed on right side of the sequence.pad_width will +/// be capped at n-1. right_pad=("-":2) would pad right side of the sequence with "--" (default={"", 0}}). +/// \param[in] separator Symbol used to join strings together (default=" "). +/// \return Shared pointer to the current TensorOperation. +std::shared_ptr Ngram(const std::vector &ngrams, + const std::pair &left_pad = {"", 0}, + const std::pair &right_pad = {"", 0}, + const std::string &separator = " "); + /// \brief Tokenize scalar token or 1-D tokens to tokens by sentencepiece. /// \param[in] vocab a SentencePieceVocab object. /// \param[in] out_type The type of output. 
@@ -63,8 +100,41 @@ std::shared_ptr SentencePieceTokenizer( std::shared_ptr SentencePieceTokenizer( const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type); +/// \brief TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension +/// axis is a slice of data starting at the corresponding position, with a specified width. +/// \param[in] width The width of the window. It must be an integer and greater than zero. +/// \param[in] axis The axis along which the sliding window is computed (default=0), axis support 0 or -1 only +/// for now. +/// \return Shared pointer to the current TensorOperation. +std::shared_ptr SlidingWindow(const int32_t width, const int32_t axis = 0); + +#ifndef _WIN32 +/// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces +/// \param[in] with_offsets If or not output offsets of tokens (default=false). +/// \return Shared pointer to the current TensorOperation. +std::shared_ptr WhitespaceTokenizer(bool with_offsets = false); +#endif + /* ####################################### Derived TensorOperation classes ################################# */ +class JiebaTokenizerOperation : public TensorOperation { + public: + explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode, + bool with_offsets); + + ~JiebaTokenizerOperation() = default; + + std::shared_ptr Build() override; + + Status ValidateParams() override; + + private: + std::string hmm_path_; + std::string mp_path_; + JiebaMode mode_; + bool with_offsets_; +}; + class LookupOperation : public TensorOperation { public: explicit LookupOperation(const std::shared_ptr &vocab, const std::string &unknown_token, @@ -83,6 +153,24 @@ class LookupOperation : public TensorOperation { DataType data_type_; }; +class NgramOperation : public TensorOperation { + public: + explicit NgramOperation(const std::vector &ngrams, const std::pair &left_pad, + const std::pair 
&right_pad, const std::string &separator); + + ~NgramOperation() = default; + + std::shared_ptr Build() override; + + Status ValidateParams() override; + + private: + std::vector ngrams_; + std::pair left_pad_; + std::pair right_pad_; + std::string separator_; +}; + class SentencePieceTokenizerOperation : public TensorOperation { public: SentencePieceTokenizerOperation(const std::shared_ptr &vocab, SPieceTokenizerOutType out_type); @@ -101,6 +189,37 @@ class SentencePieceTokenizerOperation : public TensorOperation { SPieceTokenizerLoadType load_type_; SPieceTokenizerOutType out_type_; }; + +class SlidingWindowOperation : public TensorOperation { + public: + explicit SlidingWindowOperation(const int32_t width, const int32_t axis); + + ~SlidingWindowOperation() = default; + + std::shared_ptr Build() override; + + Status ValidateParams() override; + + private: + int32_t width_; + int32_t axis_; +}; + +#ifndef _WIN32 +class WhitespaceTokenizerOperation : public TensorOperation { + public: + explicit WhitespaceTokenizerOperation(bool with_offsets); + + ~WhitespaceTokenizerOperation() = default; + + std::shared_ptr Build() override; + + Status ValidateParams() override; + + private: + bool with_offsets_; +}; +#endif } // namespace text } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/jieba_tokenizer_op.h b/mindspore/ccsrc/minddata/dataset/text/kernels/jieba_tokenizer_op.h index a319ccd015b..fa29375dcdc 100644 --- a/mindspore/ccsrc/minddata/dataset/text/kernels/jieba_tokenizer_op.h +++ b/mindspore/ccsrc/minddata/dataset/text/kernels/jieba_tokenizer_op.h @@ -20,14 +20,13 @@ #include #include "cppjieba/Jieba.hpp" +#include "minddata/dataset/core/constants.h" #include "minddata/dataset/kernels/tensor_op.h" #include "minddata/dataset/util/status.h" namespace mindspore { namespace dataset { -enum class JiebaMode { kMix = 0, kMp = 1, kHmm = 2 }; - class JiebaTokenizerOp : public TensorOp { public: // default constant for 
Jieba MPSegment algorithm. diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h b/mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h index c7baca00b56..d40005968ed 100644 --- a/mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h +++ b/mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h @@ -23,14 +23,13 @@ #include #include +#include "minddata/dataset/core/constants.h" #include "minddata/dataset/kernels/tensor_op.h" #include "minddata/dataset/util/status.h" #include "minddata/dataset/text/sentence_piece_vocab.h" namespace mindspore { namespace dataset { -enum class SPieceTokenizerOutType { kString = 0, kInt = 1 }; -enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 }; class SentencePieceTokenizerOp : public TensorOp { public: diff --git a/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc b/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc index ba638834b2f..884a542785f 100644 --- a/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc +++ b/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc @@ -18,6 +18,7 @@ #include #include "common/common.h" +#include "minddata/dataset/core/constants.h" #include "minddata/dataset/include/datasets.h" #include "minddata/dataset/include/status.h" #include "minddata/dataset/include/transforms.h" diff --git a/tests/ut/cpp/dataset/c_api_text_test.cc b/tests/ut/cpp/dataset/c_api_text_test.cc new file mode 100644 index 00000000000..491eb920324 --- /dev/null +++ b/tests/ut/cpp/dataset/c_api_text_test.cc @@ -0,0 +1,567 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +#include "common/common.h" +#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/include/status.h" +#include "minddata/dataset/include/transforms.h" +#include "minddata/dataset/include/text.h" + +using namespace mindspore::dataset; +using mindspore::dataset::DataType; +using mindspore::dataset::ShuffleMode; +using mindspore::dataset::Status; +using mindspore::dataset::Tensor; +using mindspore::dataset::Vocab; + +class MindDataTestPipeline : public UT::DatasetOpTesting { + protected: +}; + +TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess) { + // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is false. 
+ MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; + std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; + std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create jieba_tokenizer operation on ds + std::shared_ptr jieba_tokenizer = text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp); + EXPECT_NE(jieba_tokenizer, nullptr); + + // Create Map operation on ds + ds = ds->Map({jieba_tokenizer}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map> row; + iter->GetNextRow(&row); + + std::vector expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"}; + + uint64_t i = 0; + while (row.size() != 0) { + auto ind = row["text"]; + std::shared_ptr expected_tensor; + Tensor::CreateFromVector(expected, &expected_tensor); + EXPECT_EQ(*ind, *expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 1); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess1) { + // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kHmm and the with_offsets is false. 
+ MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess1."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; + std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; + std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create jieba_tokenizer operation on ds + std::shared_ptr jieba_tokenizer = text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kHmm); + EXPECT_NE(jieba_tokenizer, nullptr); + + // Create Map operation on ds + ds = ds->Map({jieba_tokenizer}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map> row; + iter->GetNextRow(&row); + + std::vector expected = {"今天", "天气", "太", "好", "了", "我们", "一起", "去", "外面", "玩", "吧"}; + + uint64_t i = 0; + while (row.size() != 0) { + auto ind = row["text"]; + std::shared_ptr expected_tensor; + Tensor::CreateFromVector(expected, &expected_tensor); + EXPECT_EQ(*ind, *expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 1); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess2) { + // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is true. 
+ MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess2."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; + std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; + std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create jieba_tokenizer operation on ds + std::shared_ptr jieba_tokenizer = text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp, true); + EXPECT_NE(jieba_tokenizer, nullptr); + + // Create Map operation on ds + ds = ds->Map({jieba_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"}, + {"token", "offsets_start", "offsets_limit"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map> row; + iter->GetNextRow(&row); + + std::vector expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"}; + + std::vector expected_offsets_start = {0, 12, 21, 27, 33, 36, 42}; + std::vector expected_offsets_limit = {12, 21, 27, 33, 36, 42, 48}; + + uint64_t i = 0; + while (row.size() != 0) { + auto ind = row["offsets_start"]; + auto ind1 = row["offsets_limit"]; + auto token = row["token"]; + std::shared_ptr expected_tensor; + std::shared_ptr expected_tensor_offsets_start; + std::shared_ptr expected_tensor_offsets_limit; + Tensor::CreateFromVector(expected, &expected_tensor); + Tensor::CreateFromVector(expected_offsets_start, &expected_tensor_offsets_start); + Tensor::CreateFromVector(expected_offsets_limit, &expected_tensor_offsets_limit); + EXPECT_EQ(*ind, *expected_tensor_offsets_start); + EXPECT_EQ(*ind1, *expected_tensor_offsets_limit); + EXPECT_EQ(*token, *expected_tensor); + 
iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 1); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail) { + // Testing the incorrect parameter of JiebaTokenizer interface. + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; + std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; + std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create jieba_tokenizer operation on ds + // Testing the parameter hmm_path is empty + std::shared_ptr jieba_tokenizer = text::JiebaTokenizer("", mp_path, JiebaMode::kMp); + EXPECT_EQ(jieba_tokenizer, nullptr); + // Testing the parameter mp_path is empty + std::shared_ptr jieba_tokenizer1 = text::JiebaTokenizer(hmm_path, "", JiebaMode::kMp); + EXPECT_EQ(jieba_tokenizer1, nullptr); + // Testing the parameter hmm_path is invalid path + std::string hmm_path_invalid = datasets_root_path_ + "/jiebadict/1.txt"; + std::shared_ptr jieba_tokenizer2 = text::JiebaTokenizer(hmm_path_invalid, mp_path, JiebaMode::kMp); + EXPECT_EQ(jieba_tokenizer2, nullptr); + // Testing the parameter mp_path is invalid path + std::string mp_path_invalid = datasets_root_path_ + "/jiebadict/1.txt"; + std::shared_ptr jieba_tokenizer3 = text::JiebaTokenizer(hmm_path, mp_path_invalid, JiebaMode::kMp); + EXPECT_EQ(jieba_tokenizer3, nullptr); +} + +TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) { + // Testing the parameter of SlidingWindow interface when the axis is 0. 
+ MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create white_tokenizer operation on ds + std::shared_ptr white_tokenizer = text::WhitespaceTokenizer(); + EXPECT_NE(white_tokenizer, nullptr); + // Create sliding_window operation on ds + std::shared_ptr sliding_window = text::SlidingWindow(3, 0); + EXPECT_NE(sliding_window, nullptr); + + // Create Map operation on ds + ds = ds->Map({white_tokenizer, sliding_window}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map> row; + iter->GetNextRow(&row); + + std::vector> expected = {{"This", "is", "a", "is", "a", "text", "a", "text", "file."}, + {"Be", "happy", "every", "happy", "every", "day."}, + {"Good", "luck", "to", "luck", "to", "everyone."}}; + + uint64_t i = 0; + while (row.size() != 0) { + auto ind = row["text"]; + std::shared_ptr expected_tensor; + int x = expected[i].size() / 3; + Tensor::CreateFromVector(expected[i], TensorShape({x, 3}), &expected_tensor); + EXPECT_EQ(*ind, *expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 3); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess1) { + // Testing the parameter of SlidingWindow interface when the axis is -1. 
+ MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess1."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create white_tokenizer operation on ds + std::shared_ptr white_tokenizer = text::WhitespaceTokenizer(); + EXPECT_NE(white_tokenizer, nullptr); + // Create sliding_window operation on ds + std::shared_ptr sliding_window = text::SlidingWindow(2, -1); + EXPECT_NE(sliding_window, nullptr); + + // Create Map operation on ds + ds = ds->Map({white_tokenizer, sliding_window}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map> row; + iter->GetNextRow(&row); + + std::vector> expected = {{"This", "is", "is", "a", "a", "text", "text", "file."}, + {"Be", "happy", "happy", "every", "every", "day."}, + {"Good", "luck", "luck", "to", "to", "everyone."}}; + uint64_t i = 0; + while (row.size() != 0) { + auto ind = row["text"]; + std::shared_ptr expected_tensor; + int x = expected[i].size() / 2; + Tensor::CreateFromVector(expected[i], TensorShape({x, 2}), &expected_tensor); + EXPECT_EQ(*ind, *expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 3); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestSlidingWindowFail) { + // Testing the incorrect parameter of SlidingWindow interface. 
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create sliding_window operation on ds
+  // Testing the parameter width less than or equal to 0
+  // The parameter axis support 0 or -1 only for now
+  std::shared_ptr<TensorOperation> sliding_window = text::SlidingWindow(0, 0);
+  EXPECT_EQ(sliding_window, nullptr);
+  // Testing the parameter width less than or equal to 0
+  // The parameter axis support 0 or -1 only for now
+  std::shared_ptr<TensorOperation> sliding_window1 = text::SlidingWindow(-2, 0);
+  EXPECT_EQ(sliding_window1, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestNgramSuccess) {
+  // Testing the parameter of Ngram interface.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create white_tokenizer operation on ds
+  std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer();
+  EXPECT_NE(white_tokenizer, nullptr);
+  // Create ngram operation on ds
+  std::shared_ptr<TensorOperation> ngram_op = text::Ngram({2}, {"_", 1}, {"_", 1}, " ");
+  EXPECT_NE(ngram_op, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::vector<std::string>> expected = {{"_ This", "This is", "is a", "a text", "text file.", "file. _"},
+                                                    {"_ Be", "Be happy", "happy every", "every day.", "day. _"},
+                                                    {"_ Good", "Good luck", "luck to", "to everyone.", "everyone. _"}};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    int x = expected[i].size();
+    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 3);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestNgramSuccess1) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess1.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create white_tokenizer operation on ds
+  std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer();
+  EXPECT_NE(white_tokenizer, nullptr);
+  // Create ngram operation on ds
+  std::shared_ptr<TensorOperation> ngram_op = text::Ngram({2, 3}, {"&", 2}, {"&", 2}, "-");
+  EXPECT_NE(ngram_op, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::vector<std::string>> expected = {
+    {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a", "is-a-text",
+     "a-text-file.", "text-file.-&", "file.-&-&"},
+    {"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every",
+     "happy-every-day.", "every-day.-&", "day.-&-&"},
+    {"&-Good", "Good-luck", "luck-to", "to-everyone.", "everyone.-&", "&-&-Good", "&-Good-luck", "Good-luck-to",
+     "luck-to-everyone.", "to-everyone.-&", "everyone.-&-&"}};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    int x = expected[i].size();
+    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 3);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestNgramFail) {
+  // Testing the incorrect parameter of Ngram interface.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create ngram operation on ds
+  // Testing the vector of ngram is empty
+  std::shared_ptr<TensorOperation> ngram_op = text::Ngram({});
+  EXPECT_EQ(ngram_op, nullptr);
+  // Testing the value of ngrams vector less than and equal to 0
+  std::shared_ptr<TensorOperation> ngram_op1 = text::Ngram({0});
+  EXPECT_EQ(ngram_op1, nullptr);
+  // Testing the value of ngrams vector less than and equal to 0
+  std::shared_ptr<TensorOperation> ngram_op2 = text::Ngram({-2});
+  EXPECT_EQ(ngram_op2, nullptr);
+  // Testing the second parameter pad_width in left_pad vector less than 0
+  std::shared_ptr<TensorOperation> ngram_op3 = text::Ngram({2}, {"", -1});
+  EXPECT_EQ(ngram_op3, nullptr);
+  // Testing the second parameter pad_width in right_pad vector less than 0
+  std::shared_ptr<TensorOperation> ngram_op4 = text::Ngram({2}, {"", 1}, {"", -1});
+  EXPECT_EQ(ngram_op4, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess) {
+  // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is default.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create white_tokenizer operation on ds
+  std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer();
+  EXPECT_NE(white_tokenizer, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({white_tokenizer}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::vector<std::string>> expected = {
+    {"This", "is", "a", "text", "file."}, {"Be", "happy", "every", "day."}, {"Good", "luck", "to", "everyone."}};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    int x = expected[i].size();
+    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 3);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess1) {
+  // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is true.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess1.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create white_tokenizer operation on ds
+  std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer(true);
+  EXPECT_NE(white_tokenizer, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({white_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
+               {"token", "offsets_start", "offsets_limit"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::vector<std::string>> expected = {
+    {"Welcome", "to", "Beijing!"}, {"北京欢迎您!"}, {"我喜欢English!"}, {""}};
+
+  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11}, {0}, {0}, {0}};
+  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 19}, {18}, {17}, {0}};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["offsets_start"];
+    auto ind1 = row["offsets_limit"];
+    auto token = row["token"];
+    std::shared_ptr<Tensor> expected_tensor;
+    std::shared_ptr<Tensor> expected_tensor_offsets_start;
+    std::shared_ptr<Tensor> expected_tensor_offsets_limit;
+    int x = expected[i].size();
+    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
+    Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &expected_tensor_offsets_start);
+    Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &expected_tensor_offsets_limit);
+    EXPECT_EQ(*ind, *expected_tensor_offsets_start);
+    EXPECT_EQ(*ind1, *expected_tensor_offsets_limit);
+    EXPECT_EQ(*token, *expected_tensor);
+
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 4);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}