From 52d278e858b7fd2e9acc505ad947ac89557ca0ee Mon Sep 17 00:00:00 2001
From: luoyang
Date: Tue, 27 Oct 2020 21:17:52 +0800
Subject: [PATCH] [MD] C++ API support build_sentence_piece_vocab_node &
 sentence_piece_tokenizer

---
 .../ccsrc/minddata/dataset/api/datasets.cc    |  30 +++
 mindspore/ccsrc/minddata/dataset/api/text.cc  |  67 +++++-
 .../ccsrc/minddata/dataset/api/vision.cc      |   2 +-
 .../dataset/engine/consumers/tree_consumer.cc |   5 +
 .../build_sentence_piece_vocab_op.cc          |   2 +-
 .../source/sampler/weighted_random_sampler.cc |  11 +
 .../engine/ir/datasetops/CMakeLists.txt       |   1 +
 .../build_sentence_piece_vocab_node.cc        |  82 +++++++
 .../build_sentence_piece_vocab_node.h         |  62 +++++
 .../ccsrc/minddata/dataset/include/datasets.h |  36 ++-
 .../ccsrc/minddata/dataset/include/text.h     |  41 +++-
 mindspore/dataset/engine/samplers.py          |   9 -
 mindspore/lite/minddata/CMakeLists.txt        |   1 +
 tests/ut/cpp/dataset/CMakeLists.txt           |   3 +
 ...cord.cc => c_api_dataset_minddata_test.cc} |   0
 .../c_api_text_sentence_piece_vocab_test.cc   | 224 ++++++++++++++++++
 16 files changed, 549 insertions(+), 27 deletions(-)
 create mode 100644 mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.cc
 create mode 100644 mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h
 rename tests/ut/cpp/dataset/{c_api_dataset_mindrecord.cc => c_api_dataset_minddata_test.cc} (100%)
 create mode 100644 tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc

diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
index bc99c85fcbe..55cb2af1e59 100644
--- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
@@ -67,6 +67,7 @@
 #ifndef ENABLE_ANDROID
 #include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
+#include "minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h"
 #include "minddata/dataset/engine/ir/datasetops/build_vocab_node.h"
 #endif

@@ -506,6 +507,35 @@ std::shared_ptr<BucketBatchByLengthNode> Dataset::BucketBatchByLength(
   return ds;
 }

+// Function to create a SentencePieceVocab from dataset
+std::shared_ptr<SentencePieceVocab> Dataset::BuildSentencePieceVocab(
+  const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage,
+  SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params) {
+  auto vocab = std::make_shared<SentencePieceVocab>();
+  auto ds = std::make_shared<BuildSentenceVocabNode>(shared_from_this(), vocab, col_names, vocab_size,
+                                                     character_coverage, model_type, params);
+
+  // Validate input params
+  if (!ds->ValidateParams()) {
+    return nullptr;
+  }
+
+  // Run tree here to start building vocab
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  if (iter == nullptr) {
+    MS_LOG(ERROR) << "Failed to run iterator in BuildSentencePieceVocab.";
+    return nullptr;
+  }
+
+  // Finish building vocab by triggering GetNextRow
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  if (!iter->GetNextRow(&row)) {
+    return nullptr;
+  }
+
+  return vocab;
+}
+
 // Function to create a Vocab from dataset
 std::shared_ptr<Vocab> Dataset::BuildVocab(const std::vector<std::string> &columns,
                                            const std::pair<int64_t, int64_t> &freq_range, int64_t top_k,
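A minimal usage sketch of the new eager-style API (the corpus path is a placeholder; TextFile, ShuffleMode, and the SentencePiece types are the same ones exercised by the unit tests at the end of this patch):

  // Train a 5000-entry unigram SentencePiece vocab directly from a text pipeline.
  // An empty col_names vector selects the default (first) column, as the tests below do.
  std::shared_ptr<Dataset> corpus = TextFile({"/path/to/corpus.txt"}, 0, ShuffleMode::kFalse);
  std::shared_ptr<SentencePieceVocab> vocab =
    corpus->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, {});
  // Runs the pipeline immediately; returns nullptr if any parameter fails validation.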
diff --git a/mindspore/ccsrc/minddata/dataset/api/text.cc b/mindspore/ccsrc/minddata/dataset/api/text.cc
index 51caa7c3342..2adf7a188e9 100644
--- a/mindspore/ccsrc/minddata/dataset/api/text.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/text.cc
@@ -14,8 +14,11 @@
  * limitations under the License.
  */

+#include <unistd.h>
 #include "minddata/dataset/include/text.h"
 #include "minddata/dataset/text/kernels/lookup_op.h"
+#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
+#include "minddata/dataset/util/path.h"

 namespace mindspore {
 namespace dataset {
@@ -31,10 +34,21 @@ std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, con
                                         const DataType &data_type) {
   auto op = std::make_shared<LookupOperation>(vocab, unknown_token, data_type);

-  if (!op->ValidateParams()) {
-    return nullptr;
-  }
-  return op;
+  return op->ValidateParams() ? op : nullptr;
+}
+
+std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
+  const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type) {
+  auto op = std::make_shared<SentencePieceTokenizerOperation>(vocab, out_type);
+
+  return op->ValidateParams() ? op : nullptr;
+}
+
+std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(const std::string &vocab_path,
+                                                                        SPieceTokenizerOutType out_type) {
+  auto op = std::make_shared<SentencePieceTokenizerOperation>(vocab_path, out_type);
+
+  return op->ValidateParams() ? op : nullptr;
 }

 /* ####################################### Validator Functions ############################################ */
@@ -70,6 +84,51 @@ std::shared_ptr<TensorOp> LookupOperation::Build() {
   return tensor_op;
 }

+// SentencePieceTokenizerOperation
+SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
+                                                                 SPieceTokenizerOutType out_type)
+    : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {}
+
+SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::string &vocab_path,
+                                                                 SPieceTokenizerOutType out_type)
+    : vocab_(nullptr), vocab_path_(vocab_path), load_type_(SPieceTokenizerLoadType::kFile), out_type_(out_type) {}
+
+Status SentencePieceTokenizerOperation::ValidateParams() {
+  if (load_type_ == SPieceTokenizerLoadType::kModel) {
+    if (vocab_ == nullptr) {
+      std::string err_msg = "SentencePieceTokenizer: vocab object type is incorrect or null.";
+      MS_LOG(ERROR) << err_msg;
+      RETURN_STATUS_SYNTAX_ERROR(err_msg);
+    }
+  } else {
+    Path vocab_file(vocab_path_);
+    if (!vocab_file.Exists() || vocab_file.IsDirectory()) {
+      std::string err_msg = "SentencePieceTokenizer: vocab file: [" + vocab_path_ + "] is invalid or does not exist.";
+      MS_LOG(ERROR) << err_msg;
+      RETURN_STATUS_SYNTAX_ERROR(err_msg);
+    }
+    if (access(vocab_file.toString().c_str(), R_OK) == -1) {
+      std::string err_msg = "SentencePieceTokenizer: no access to specified vocab file: " + vocab_path_;
+      MS_LOG(ERROR) << err_msg;
+      RETURN_STATUS_SYNTAX_ERROR(err_msg);
+    }
+  }
+  return Status::OK();
+}
+
+std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() {
+  std::shared_ptr<SentencePieceTokenizerOp> tensor_op;
+  if (load_type_ == SPieceTokenizerLoadType::kModel) {
+    tensor_op = std::make_shared<SentencePieceTokenizerOp>(vocab_, load_type_, out_type_);
+  } else {
+    Path vocab_file(vocab_path_);
+    std::string model_path = vocab_file.ParentPath();
+    std::string model_filename = vocab_file.Basename();
+    tensor_op = std::make_shared<SentencePieceTokenizerOp>(model_path, model_filename, load_type_, out_type_);
+  }
+  return tensor_op;
+}
+
 }  // namespace text
 }  // namespace api
 }  // namespace dataset
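The two factory overloads above let callers build the tokenizer either from an in-memory SentencePieceVocab or from a trained model file. A hedged sketch (the vocab variable and the .model path are illustrative, not part of the patch):

  // From a vocab built in the same process:
  std::shared_ptr<TensorOperation> from_vocab =
    text::SentencePieceTokenizer(vocab, mindspore::dataset::SPieceTokenizerOutType::kString);
  // From a model file saved earlier; ValidateParams checks existence and read access:
  std::shared_ptr<TensorOperation> from_file =
    text::SentencePieceTokenizer("/path/to/m.model", mindspore::dataset::SPieceTokenizerOutType::kString);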
diff --git a/mindspore/ccsrc/minddata/dataset/api/vision.cc b/mindspore/ccsrc/minddata/dataset/api/vision.cc
index 6a3814ac323..8ac4280c887 100644
--- a/mindspore/ccsrc/minddata/dataset/api/vision.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/vision.cc
@@ -563,7 +563,7 @@ Status NormalizeOperation::ValidateParams() {
       RETURN_STATUS_SYNTAX_ERROR(err_msg);
     }
     if (mean_[i] < 0.0f || mean_[i] > 255.0f || CmpFloat(mean_[i], 0.0f)) {
-      std::string err_msg = "Normalize: mean vector has incorrect value: " + std::to_string(std_[i]);
+      std::string err_msg = "Normalize: mean vector has incorrect value: " + std::to_string(mean_[i]);
       MS_LOG(ERROR) << err_msg;
       RETURN_STATUS_SYNTAX_ERROR(err_msg);
     }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc b/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc
index 53910bdcce5..d385e3542c1 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc
@@ -113,6 +113,11 @@ Status SaveToDisk::ValidateParams() {
     MS_LOG(ERROR) << err;
     RETURN_STATUS_SYNTAX_ERROR(err);
   }
+  if (access(dir.ParentPath().c_str(), R_OK) == -1) {
+    std::string err_msg = "CreateSaver failed, no access to specified dataset path: " + dataset_path_;
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
   if (num_files_ <= 0 || num_files_ > 1000) {
     std::string err = "CreateSaver failed, num_files must between 1 and 1000, but got " + std::to_string(num_files_);
     MS_LOG(ERROR) << err;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.cc
index 2674b566734..76312943729 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.cc
@@ -76,7 +76,7 @@ Status BuildSentencePieceVocabOp::SentenceThread() {
   } else {
     auto itr = column_name_id_map_.find(col_names_[0]);
     CHECK_FAIL_RETURN_UNEXPECTED(itr != column_name_id_map_.end(),
-                                 "Invalid parameter, column name: " + col_names_[0] + "does not exist.");
+                                 "Invalid parameter, column name: " + col_names_[0] + " does not exist.");
     col_id_ = itr->second;
   }
   std::unique_ptr<ChildIterator> sentence_iter = std::make_unique<ChildIterator>(this);
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc
index 0a2a3f99253..c8d62c34585 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc
@@ -48,6 +48,17 @@ Status WeightedRandomSampler::InitSampler() {
   CHECK_FAIL_RETURN_UNEXPECTED(samples_per_buffer_ > 0,
                                "Invalid parameter, samples_per_buffer must be greater than 0, but got " +
                                  std::to_string(samples_per_buffer_) + ".\n");
+
+  CHECK_FAIL_RETURN_UNEXPECTED(weights_.size() != 0, "Invalid parameter, weights size must not be 0.\n");
+  int32_t zero_elem = 0;
+  for (auto &elem : weights_) {
+    CHECK_FAIL_RETURN_UNEXPECTED(elem >= 0.0, "Invalid parameter, weights must not contain negative numbers, but got " +
+                                                std::to_string(elem) + ".\n");
+    if (elem == 0.0) zero_elem++;
+  }
+  CHECK_FAIL_RETURN_UNEXPECTED(zero_elem != weights_.size(),
+                               "Invalid parameter, elements of weights must not be all zero.\n");
+
   if (weights_.size() > static_cast<size_t>(num_rows_)) {
     return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
                   "Invalid parameter, size of sample weights must be less than or equal to num of data, "
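These checks replace the Python-side validation removed from samplers.py later in this patch, so invalid weights now fail inside InitSampler regardless of which API constructed the sampler. A standalone sketch of the same logic, for illustration only (not MindSpore code):

  #include <string>
  #include <vector>

  // Mirrors the three weight checks InitSampler now enforces.
  std::string ValidateWeights(const std::vector<double> &weights) {
    if (weights.empty()) return "weights size must not be 0";
    size_t num_zero = 0;
    for (double w : weights) {
      if (w < 0.0) return "weights must not contain negative numbers";
      if (w == 0.0) num_zero++;
    }
    if (num_zero == weights.size()) return "elements of weights must not be all zero";
    return "OK";
  }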
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/CMakeLists.txt
index b771b4cc84a..2c6f8e133df 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/CMakeLists.txt
@@ -5,6 +5,7 @@ add_subdirectory(source)
 set(DATASET_ENGINE_IR_DATASETOPS_SRC_FILES
     batch_node.cc
     bucket_batch_by_length_node.cc
+    build_sentence_piece_vocab_node.cc
     build_vocab_node.cc
     concat_node.cc
     map_node.cc
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.cc
new file mode 100644
index 00000000000..5704617f9a6
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.cc
@@ -0,0 +1,82 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h"
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
+#include "minddata/dataset/util/status.h"
+
+namespace mindspore {
+namespace dataset {
+namespace api {
+
+BuildSentenceVocabNode::BuildSentenceVocabNode(std::shared_ptr<Dataset> child,
+                                               std::shared_ptr<SentencePieceVocab> vocab,
+                                               const std::vector<std::string> &col_names, uint32_t vocab_size,
+                                               float character_coverage, SentencePieceModel model_type,
+                                               const std::unordered_map<std::string, std::string> &params)
+    : vocab_(vocab),
+      col_names_(col_names),
+      vocab_size_(vocab_size),
+      character_coverage_(character_coverage),
+      model_type_(model_type),
+      params_(params) {
+  this->children.push_back(child);
+}
+
+// Function to build BuildSentenceVocabNode
+std::vector<std::shared_ptr<DatasetOp>> BuildSentenceVocabNode::Build() {
+  // A vector containing shared pointer to the Dataset Ops that this object will create
+  std::vector<std::shared_ptr<DatasetOp>> node_ops;
+
+  std::shared_ptr<BuildSentencePieceVocabOp> build_sentence_piece_vocab_op;
+  build_sentence_piece_vocab_op = std::make_shared<BuildSentencePieceVocabOp>(
+    vocab_, col_names_, vocab_size_, character_coverage_, model_type_, params_, connector_que_size_);
+  node_ops.push_back(build_sentence_piece_vocab_op);
+  return node_ops;
+}
+
+Status BuildSentenceVocabNode::ValidateParams() {
+  if (vocab_ == nullptr) {
+    std::string err_msg = "BuildSentenceVocabNode: vocab is null.";
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+
+  if (vocab_size_ <= 0) {
+    std::string err_msg =
+      "BuildSentenceVocabNode: vocab_size should be positive, but got: " + std::to_string(vocab_size_);
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+
+  if (character_coverage_ < 0.98f || character_coverage_ > 1.0f) {
+    std::string err_msg = "BuildSentenceVocabNode: character_coverage should be between 0.98 and 1.0, but got " +
+                          std::to_string(character_coverage_);
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+
+  return Status::OK();
+}
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h
new file mode 100644
index 00000000000..868daaae5ff
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h
@@ -0,0 +1,62 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUILD_SENTENCE_PIECE_VOCAB_NODE_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUILD_SENTENCE_PIECE_VOCAB_NODE_H_
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "minddata/dataset/include/datasets.h"
+
+namespace mindspore {
+namespace dataset {
+namespace api {
+
+class BuildSentenceVocabNode : public Dataset {
+ public:
+  /// \brief Constructor
+  BuildSentenceVocabNode(std::shared_ptr<Dataset> child, std::shared_ptr<SentencePieceVocab> vocab,
+                         const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage,
+                         SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params);
+
+  /// \brief Destructor
+  ~BuildSentenceVocabNode() = default;
+
+  /// \brief a base class override function to create the required runtime dataset op objects for this class
+  /// \return The list of shared pointers to the newly created DatasetOps
+  std::vector<std::shared_ptr<DatasetOp>> Build() override;
+
+  /// \brief Parameters validation
+  /// \return Status Status::OK() if all the parameters are valid
+  Status ValidateParams() override;
+
+ private:
+  std::shared_ptr<SentencePieceVocab> vocab_;
+  std::vector<std::string> col_names_;
+  uint32_t vocab_size_;
+  float character_coverage_;
+  SentencePieceModel model_type_;
+  std::unordered_map<std::string, std::string> params_;
+};
+
+}  // namespace api
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUILD_SENTENCE_PIECE_VOCAB_NODE_H_
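How the pieces connect, as a sketch (source_dataset stands in for any existing Dataset; the names are from this patch): Dataset::BuildSentencePieceVocab constructs this node, ValidateParams gates it, and Build wraps the runtime op.

  auto vocab = std::make_shared<SentencePieceVocab>();
  auto node = std::make_shared<BuildSentenceVocabNode>(source_dataset, vocab, std::vector<std::string>{},
                                                       5000, 0.9995f, SentencePieceModel::kUnigram,
                                                       std::unordered_map<std::string, std::string>{});
  // Rejects a null vocab, a zero vocab_size, and character_coverage outside [0.98, 1.0].
  if (node->ValidateParams().IsOk()) {
    std::vector<std::shared_ptr<DatasetOp>> ops = node->Build();  // one BuildSentencePieceVocabOp
  }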
diff --git a/mindspore/ccsrc/minddata/dataset/include/datasets.h b/mindspore/ccsrc/minddata/dataset/include/datasets.h
index bf80a876d4f..9bad7b15499 100644
--- a/mindspore/ccsrc/minddata/dataset/include/datasets.h
+++ b/mindspore/ccsrc/minddata/dataset/include/datasets.h
@@ -22,6 +22,7 @@
 #include <map>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
@@ -37,6 +38,7 @@
 #include "minddata/dataset/kernels/tensor_op.h"
 #include "minddata/dataset/util/path.h"
 #ifndef ENABLE_ANDROID
+#include "minddata/dataset/text/sentence_piece_vocab.h"
 #include "minddata/dataset/text/vocab.h"
 #endif

@@ -86,7 +88,6 @@ class VOCNode;
 // Dataset Op classes (in alphabetical order)
 #ifndef ENABLE_ANDROID
 class BucketBatchByLengthNode;
-class BuildVocabNode;
 #endif
 class ConcatNode;
 class MapNode;
@@ -635,7 +636,7 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
   /// \brief Function to transfer data through a device.
   /// \notes If device is Ascend, features of data will be transferred one by one. The limitation
-  ///     of data transmission per time is 256M.
+  ///    of data transmission per time is 256M.
   /// \param[in] send_epoch_end Whether to send end of sequence to device or not (default=True).
   /// \return Returns true if no error encountered else false.
   bool DeviceQueue(bool send_epoch_end = true);
@@ -658,7 +659,7 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
   /// \brief Function to create a BatchNode
   /// \notes Combines batch_size number of consecutive rows into batches
-  /// \param[in] batch_size Path to the root directory that contains the dataset
+  /// \param[in] batch_size The number of rows each batch is created with
   /// \param[in] drop_remainder Determines whether or not to drop the last possibly incomplete
   ///    batch. If true, and if there are less than batch_size rows
   ///    available to make the last batch, then those rows will
@@ -668,7 +669,8 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
 #ifndef ENABLE_ANDROID
   /// \brief Function to create a BucketBatchByLengthNode
-  /// \notes Combines batch_size number of consecutive rows into batches
+  /// \notes Bucket elements according to their lengths. Each bucket will be padded and batched when
+  ///    they are full.
   /// \param[in] column_names Columns passed to element_length_function
   /// \param[in] bucket_boundaries A list consisting of the upper boundaries of the buckets.
   ///    Must be strictly increasing. If there are n boundaries, n+1 buckets are created: One bucket for
   ///    [0, bucket_boundaries[0]), one bucket for [bucket_boundaries[i], bucket_boundaries[i+1]) for each
   ///    0<i<n-1, and one bucket for [bucket_boundaries[n-1], inf).
   /// \param[in] pad_info Represents how to batch each column. If a column is not specified, then that column will be
   ///    padded to the longest in the current batch, unless if pad_to_bucket_boundary is true. If no padding is
   ///    wanted, set pad_info to None (default=empty dictionary).
   /// \param[in] pad_to_bucket_boundary If true, will pad each unspecified dimension in pad_info to the
-  ///    bucket_boundary
-  ///    minus 1. If there are any elements that fall into the last bucket, an error will occur (default=false).
+  ///    bucket_boundary minus 1. If there are any elements that fall into the last bucket,
+  ///    an error will occur (default=false).
   /// \param[in] drop_remainder If true, will drop the last batch for each bucket if it is not a full batch
   ///    (default=false).
   /// \return Shared pointer to the current BucketBatchByLengthNode
@@ -699,6 +701,20 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
     const std::map<std::string, std::pair<TensorShape, std::shared_ptr<Tensor>>> &pad_info = {},
     bool pad_to_bucket_boundary = false, bool drop_remainder = false);

+  /// \brief Function to create a SentencePieceVocab from source dataset
+  /// \notes Build a SentencePieceVocab from a dataset.
+  /// \param[in] col_names Column names to get words from. It can be a vector of column names.
+  /// \param[in] vocab_size Vocabulary size. The type is uint32.
+  /// \param[in] character_coverage Percentage of characters covered by the model. Must be between
+  ///    0.98 and 1.0. Good defaults are 0.9995 for languages with rich character sets like
+  ///    Japanese or Chinese, and 1.0 for other languages with small character sets.
+  /// \param[in] model_type Model type. Choose from unigram (default), bpe, char, or word.
+  ///    The input sentence must be pretokenized when using the word type.
+  /// \param[in] params An unordered map containing additional option parameters for the sentencepiece library.
+  std::shared_ptr<SentencePieceVocab> BuildSentencePieceVocab(
+    const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage,
+    SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params);
+
   /// \brief Function to create a Vocab from source dataset
   /// \notes Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
   ///    which contains top_k most frequent words (if top_k is specified)
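Usage note on the params argument: per the doc comment above, the map carries extra options for the sentencepiece library. Assuming the underlying op forwards keys verbatim to the trainer, a trainer option such as input_sentence_size (a real sentencepiece option; the value here is illustrative) could be supplied like this:

  std::unordered_map<std::string, std::string> params = {{"input_sentence_size", "100000"}};
  auto vocab = ds->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, params);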
diff --git a/mindspore/ccsrc/minddata/dataset/include/text.h b/mindspore/ccsrc/minddata/dataset/include/text.h
index 53a31efb506..7e465dc983c 100644
--- a/mindspore/ccsrc/minddata/dataset/include/text.h
+++ b/mindspore/ccsrc/minddata/dataset/include/text.h
@@ -21,11 +21,14 @@
 #include <memory>
 #include <string>

+#include "mindspore/ccsrc/minddata/dataset/core/data_type.h"
 #include "minddata/dataset/core/constants.h"
 #include "minddata/dataset/include/transforms.h"
-#include "minddata/dataset/text/vocab.h"
 #include "minddata/dataset/util/status.h"
-#include "mindspore/ccsrc/minddata/dataset/core/data_type.h"
+
+#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
+#include "minddata/dataset/text/sentence_piece_vocab.h"
+#include "minddata/dataset/text/vocab.h"

 namespace mindspore {
 namespace dataset {
@@ -36,6 +39,7 @@ namespace text {

 // Text Op classes (in alphabetical order)
 class LookupOperation;
+class SentencePieceTokenizerOperation;

 /// \brief Lookup operator that looks up a word to an id.
 /// \param[in] vocab a Vocab object.
 /// \param[in] unknown_token word to use for lookup if the word being looked up is out of vocabulary (oov).
 /// \param[in] data_type type of the tensor produced by the lookup operation.
 /// \return Shared pointer to the current TensorOperation.
 std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
                                         const mindspore::dataset::DataType &data_type = DataType("int32"));

+/// \brief Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
+/// \param[in] vocab a SentencePieceVocab object.
+/// \param[in] out_type The type of output.
+/// \return Shared pointer to the current TensorOperation.
+std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
+  const std::shared_ptr<SentencePieceVocab> &vocab, mindspore::dataset::SPieceTokenizerOutType out_type);
+
+/// \brief Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
+/// \param[in] vocab_path vocab model file path.
+/// \param[in] out_type The type of output.
+/// \return Shared pointer to the current TensorOperation.
+std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
+  const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type);
+
 /* ####################################### Derived TensorOperation classes ################################# */

 class LookupOperation : public TensorOperation {
@@ -65,6 +83,25 @@ class LookupOperation : public TensorOperation {
   int32_t default_id_;
   DataType data_type_;
 };
+
+class SentencePieceTokenizerOperation : public TensorOperation {
+ public:
+  SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type);
+
+  SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type);
+
+  ~SentencePieceTokenizerOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;
+
+  Status ValidateParams() override;
+
+ private:
+  std::shared_ptr<SentencePieceVocab> vocab_;
+  std::string vocab_path_;
+  SPieceTokenizerLoadType load_type_;
+  SPieceTokenizerOutType out_type_;
+};
 }  // namespace text
 }  // namespace api
 }  // namespace dataset
diff --git a/mindspore/dataset/engine/samplers.py b/mindspore/dataset/engine/samplers.py
index 6ff1c3413ba..c5039fb15cf 100644
--- a/mindspore/dataset/engine/samplers.py
+++ b/mindspore/dataset/engine/samplers.py
@@ -585,15 +585,6 @@ class WeightedRandomSampler(BuiltinSampler):
         if not isinstance(weights, list):
             weights = [weights]

-        if weights == []:
-            raise ValueError("weights size should not be 0")
-
-        if list(filter(lambda x: x < 0, weights)):
-            raise ValueError("weights should not contain negative numbers")
-
-        if list(filter(lambda x: x == 0, weights)) == weights:
-            raise ValueError("elements of weights should not be all zero")
-
         if num_samples is not None:
             if num_samples <= 0:
                 raise ValueError("num_samples should be a positive integer "
diff --git a/mindspore/lite/minddata/CMakeLists.txt b/mindspore/lite/minddata/CMakeLists.txt
index a6168113678..f3cb4fd24c9 100644
--- a/mindspore/lite/minddata/CMakeLists.txt
+++ b/mindspore/lite/minddata/CMakeLists.txt
@@ -139,6 +139,7 @@ if (BUILD_MINDDATA STREQUAL "full")
     list(REMOVE_ITEM MINDDATA_ENGINE_IR_DATASETOPS_SRC_FILES
         "${MINDDATA_DIR}/engine/ir/datasetops/bucket_batch_by_length_node.cc"
+        "${MINDDATA_DIR}/engine/ir/datasetops/build_sentence_piece_vocab_node.cc"
         "${MINDDATA_DIR}/engine/ir/datasetops/build_vocab_node.cc"
         "${MINDDATA_DIR}/engine/ir/datasetops/sync_wait_node.cc"
     )
diff --git a/tests/ut/cpp/dataset/CMakeLists.txt b/tests/ut/cpp/dataset/CMakeLists.txt
index 3c21213759c..3b28d1fdf85 100644
--- a/tests/ut/cpp/dataset/CMakeLists.txt
+++ b/tests/ut/cpp/dataset/CMakeLists.txt
@@ -112,12 +112,15 @@ SET(DE_UT_SRCS
     c_api_dataset_config_test.cc
     c_api_dataset_csv_test.cc
     c_api_dataset_manifest_test.cc
+    c_api_dataset_minddata_test.cc
     c_api_dataset_randomdata_test.cc
+    c_api_dataset_save.cc
     c_api_dataset_textfile_test.cc
     c_api_dataset_tfrecord_test.cc
     c_api_dataset_voc_test.cc
     c_api_datasets_test.cc
     c_api_dataset_iterator_test.cc
+    c_api_text_sentence_piece_vocab_test.cc
     c_api_text_vocab_test.cc
     tensor_op_fusion_pass_test.cc
     sliding_window_op_test.cc
diff --git a/tests/ut/cpp/dataset/c_api_dataset_mindrecord.cc b/tests/ut/cpp/dataset/c_api_dataset_minddata_test.cc
similarity index 100%
rename from tests/ut/cpp/dataset/c_api_dataset_mindrecord.cc
rename to tests/ut/cpp/dataset/c_api_dataset_minddata_test.cc
diff --git a/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc b/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc
new file mode 100644
index 00000000000..2d344be1a0c
--- /dev/null
+++ b/tests/ut/cpp/dataset/c_api_text_sentence_piece_vocab_test.cc
@@ -0,0 +1,224 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "common/common.h"
+#include "minddata/dataset/include/datasets.h"
+#include "minddata/dataset/include/status.h"
+#include "minddata/dataset/include/transforms.h"
+#include "minddata/dataset/include/text.h"
+
+// IR non-leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/map_node.h"
+
+// IR leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h"
+
+using namespace mindspore::dataset::api;
+using mindspore::dataset::Tensor;
+using mindspore::dataset::ShuffleMode;
+using mindspore::dataset::SentencePieceModel;
+using mindspore::dataset::SentencePieceVocab;
+
+class MindDataTestPipeline : public UT::DatasetOpTesting {
+ protected:
+};
+
+TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess1) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabSuccess1 plus sentencepiece tokenizer.";
+
+  // Create a TextFile dataset
+  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds_vocab, nullptr);
+
+  // Create vocab from dataset
+  std::shared_ptr<SentencePieceVocab> vocab =
+    ds_vocab->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, {});
+  EXPECT_NE(vocab, nullptr);
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/sentencepiece_tokenizer.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+
+  // Create SentencePieceTokenizer operation from vocab object
+  std::shared_ptr<TensorOperation> sentencepiece_tokenizer =
+    text::SentencePieceTokenizer(vocab, mindspore::dataset::SPieceTokenizerOutType::kString);
+  EXPECT_NE(sentencepiece_tokenizer, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({sentencepiece_tokenizer}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  // Expected result after tokenization
+  std::vector<std::string> expected = {"▁I", "▁sa", "w", "▁a", "▁girl", "▁with", "▁a", "▁te", "les", "co", "pe", "."};
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto txt = row["text"];
+    MS_LOG(INFO) << *txt;
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateFromVector(expected, &expected_tensor);
+    EXPECT_EQ(*txt, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 1);
+}
+
+TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabSuccess2 plus sentencepiece tokenizer.";
+
+  // Create a TextFile dataset
+  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds_vocab, nullptr);
+
+  // Create vocab from dataset
+  std::shared_ptr<SentencePieceVocab> vocab =
+    ds_vocab->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, {});
+  EXPECT_NE(vocab, nullptr);
+
+  // Save vocab model to local
+  vocab->SaveModel(&vocab, datasets_root_path_ + "/test_sentencepiece", "m.model");
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/sentencepiece_tokenizer.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+
+  // Create SentencePieceTokenizer operation from local vocab model
+  std::string vocab_model = datasets_root_path_ + "/test_sentencepiece/m.model";
+  std::shared_ptr<TensorOperation> sentencepiece_tokenizer =
+    text::SentencePieceTokenizer(vocab_model, mindspore::dataset::SPieceTokenizerOutType::kString);
+  EXPECT_NE(sentencepiece_tokenizer, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({sentencepiece_tokenizer}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  // Expected result after tokenization
+  std::vector<std::string> expected = {"▁I", "▁sa", "w", "▁a", "▁girl", "▁with", "▁a", "▁te", "les", "co", "pe", "."};
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto txt = row["text"];
+    MS_LOG(INFO) << *txt;
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateFromVector(expected, &expected_tensor);
+    EXPECT_EQ(*txt, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 1);
+}
+
+TEST_F(MindDataTestPipeline, TestSentencePieceVocabFail) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabFail with incorrect parameter.";
+
+  // Create a TextFile dataset
+  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds_vocab, nullptr);
+
+  // vocab_size cannot be less than or equal to 0
+  std::shared_ptr<SentencePieceVocab> vocab1 =
+    ds_vocab->BuildSentencePieceVocab({}, 0, 0.9995, SentencePieceModel::kUnigram, {});
+  EXPECT_EQ(vocab1, nullptr);
+
+  // character_coverage should be between 0.98 and 1.0
+  std::shared_ptr<SentencePieceVocab> vocab2 =
+    ds_vocab->BuildSentencePieceVocab({}, 1, 0.979, SentencePieceModel::kUnigram, {});
+  EXPECT_EQ(vocab2, nullptr);
+
+  // character_coverage should be between 0.98 and 1.0
+  std::shared_ptr<SentencePieceVocab> vocab3 =
+    ds_vocab->BuildSentencePieceVocab({}, 1, 1.01, SentencePieceModel::kUnigram, {});
+  EXPECT_EQ(vocab3, nullptr);
+
+  // column name does not exist
+  std::shared_ptr<SentencePieceVocab> vocab4 =
+    ds_vocab->BuildSentencePieceVocab({"image"}, 2, 0.98, SentencePieceModel::kUnigram, {});
+  EXPECT_EQ(vocab4, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestSentencePieceTokenizerFail1) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceTokenizerFail with incorrect parameter.";
+
+  // Create SentencePieceTokenizer operation from local vocab model
+  std::string vocab_model1 = "";
+  std::shared_ptr<TensorOperation> sentencepiece_tokenizer1 =
+    text::SentencePieceTokenizer(vocab_model1, mindspore::dataset::SPieceTokenizerOutType::kString);
+  EXPECT_EQ(sentencepiece_tokenizer1, nullptr);
+
+  // Create SentencePieceTokenizer operation from local vocab model
+  std::string vocab_model2 = "m.model";
+  std::shared_ptr<TensorOperation> sentencepiece_tokenizer2 =
+    text::SentencePieceTokenizer(vocab_model2, mindspore::dataset::SPieceTokenizerOutType::kString);
+  EXPECT_EQ(sentencepiece_tokenizer2, nullptr);
+
+  // Create SentencePieceTokenizer operation from vocab object
+  std::shared_ptr<SentencePieceVocab> vocab_model3 = nullptr;
+  std::shared_ptr<TensorOperation> sentencepiece_tokenizer3 =
+    text::SentencePieceTokenizer(vocab_model3, mindspore::dataset::SPieceTokenizerOutType::kString);
+  EXPECT_EQ(sentencepiece_tokenizer3, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestSentencePieceTokenizerFail2) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceTokenizerFail with invalid SentencePieceVocab object.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/sentencepiece_tokenizer.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+
+  // Create SentencePieceTokenizer operation from an empty (unfitted) vocab object
+  std::shared_ptr<SentencePieceVocab> vocab_model4 = std::make_shared<SentencePieceVocab>();
+  std::shared_ptr<TensorOperation> sentencepiece_tokenizer4 =
+    text::SentencePieceTokenizer(vocab_model4, mindspore::dataset::SPieceTokenizerOutType::kString);
+  EXPECT_NE(sentencepiece_tokenizer4, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({sentencepiece_tokenizer4}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset: expected to fail because the vocab was never fitted
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  EXPECT_EQ(iter->GetNextRow(&row), false);
+}