forked from mindspore-Ecosystem/mindspore
[MD] C++ API support build_sentence_piece_vocab_node & sentence_piece_tokenizer
This commit is contained in:
parent
b6b254f6e4
commit
52d278e858
|
@ -67,6 +67,7 @@
|
|||
|
||||
#ifndef ENABLE_ANDROID
|
||||
#include "minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h"
|
||||
#include "minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h"
|
||||
#include "minddata/dataset/engine/ir/datasetops/build_vocab_node.h"
|
||||
#endif
|
||||
|
||||
|
@ -506,6 +507,35 @@ std::shared_ptr<BucketBatchByLengthNode> Dataset::BucketBatchByLength(
|
|||
return ds;
|
||||
}
|
||||
|
||||
// Function to create a SentencePieceVocab from dataset
|
||||
std::shared_ptr<SentencePieceVocab> Dataset::BuildSentencePieceVocab(
|
||||
const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage,
|
||||
SentencePieceModel model_type, const std::unordered_map<std::string, std::string> ¶ms) {
|
||||
auto vocab = std::make_shared<SentencePieceVocab>();
|
||||
auto ds = std::make_shared<BuildSentenceVocabNode>(shared_from_this(), vocab, col_names, vocab_size,
|
||||
character_coverage, model_type, params);
|
||||
|
||||
// Validate input params
|
||||
if (!ds->ValidateParams()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Run tree here to start building vocab
|
||||
std::shared_ptr<Iterator> iter = ds->CreateIterator();
|
||||
if (iter == nullptr) {
|
||||
MS_LOG(ERROR) << "Fail to run iterator in BuildSentencePieceVocab.";
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Finish building vocab by triggering GetNextRow
|
||||
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
|
||||
if (!iter->GetNextRow(&row)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return vocab;
|
||||
}
|
||||
|
||||
// Function to create a Vocab from dataset
|
||||
std::shared_ptr<Vocab> Dataset::BuildVocab(const std::vector<std::string> &columns,
|
||||
const std::pair<int64_t, int64_t> &freq_range, int64_t top_k,
|
||||
|
|
|
@ -14,8 +14,11 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include "minddata/dataset/include/text.h"
|
||||
#include "minddata/dataset/text/kernels/lookup_op.h"
|
||||
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
|
||||
#include "minddata/dataset/util/path.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
@ -31,10 +34,21 @@ std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, con
|
|||
const DataType &data_type) {
|
||||
auto op = std::make_shared<LookupOperation>(vocab, unknown_token, data_type);
|
||||
|
||||
if (!op->ValidateParams()) {
|
||||
return nullptr;
|
||||
}
|
||||
return op;
|
||||
return op->ValidateParams() ? op : nullptr;
|
||||
}
|
||||
|
||||
std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
|
||||
const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type) {
|
||||
auto op = std::make_shared<SentencePieceTokenizerOperation>(vocab, out_type);
|
||||
|
||||
return op->ValidateParams() ? op : nullptr;
|
||||
}
|
||||
|
||||
std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(const std::string &vocab_path,
|
||||
SPieceTokenizerOutType out_type) {
|
||||
auto op = std::make_shared<SentencePieceTokenizerOperation>(vocab_path, out_type);
|
||||
|
||||
return op->ValidateParams() ? op : nullptr;
|
||||
}
|
||||
|
||||
/* ####################################### Validator Functions ############################################ */
|
||||
|
@ -70,6 +84,51 @@ std::shared_ptr<TensorOp> LookupOperation::Build() {
|
|||
return tensor_op;
|
||||
}
|
||||
|
||||
// SentencePieceTokenizerOperation
|
||||
SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
|
||||
SPieceTokenizerOutType out_type)
|
||||
: vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {}
|
||||
|
||||
/// \brief Construct the operation from a vocab file path; the vocab pointer stays null and the load type is kFile.
/// \param[in] vocab_path Path stored in vocab_path_
/// \param[in] out_type The type of output stored in out_type_
SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::string &vocab_path,
                                                                 SPieceTokenizerOutType out_type)
    : vocab_(),  // value-initialized: null shared_ptr
      vocab_path_(vocab_path),
      load_type_(SPieceTokenizerLoadType::kFile),
      out_type_(out_type) {}
|
||||
|
||||
/// \brief Validate the arguments captured by the constructor.
/// \notes With load type kModel the vocab object must be non-null. Otherwise vocab_path_ must name an existing
///     entry that is not a directory and that access() reports as readable.
/// \return Status::OK() when the checks pass; otherwise the status produced by RETURN_STATUS_SYNTAX_ERROR with
///     the logged message
Status SentencePieceTokenizerOperation::ValidateParams() {
  if (load_type_ == SPieceTokenizerLoadType::kModel) {
    if (vocab_ == nullptr) {
      std::string err_msg = "SentencePieceTokenizer: vocab object type is incorrect or null.";
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
    return Status::OK();
  }

  // File-based loading: the model file has to exist and be readable.
  Path vocab_file(vocab_path_);
  if (!vocab_file.Exists() || vocab_file.IsDirectory()) {
    std::string err_msg = "SentencePieceTokenizer : vocab file: [" + vocab_path_ + "] is invalid or does not exist.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  if (access(vocab_file.toString().c_str(), R_OK) == -1) {
    // The path checked here is the vocab model file, so the message names it as such.
    std::string err_msg = "SentencePieceTokenizer : no access to specified vocab file: " + vocab_path_;
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  return Status::OK();
}
|
||||
|
||||
std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() {
|
||||
std::shared_ptr<SentencePieceTokenizerOp> tensor_op;
|
||||
if (load_type_ == SPieceTokenizerLoadType::kModel) {
|
||||
tensor_op = std::make_shared<SentencePieceTokenizerOp>(vocab_, load_type_, out_type_);
|
||||
} else {
|
||||
Path vocab_file(vocab_path_);
|
||||
std::string model_path = vocab_file.ParentPath();
|
||||
std::string model_filename = vocab_file.Basename();
|
||||
tensor_op = std::make_shared<SentencePieceTokenizerOp>(model_path, model_filename, load_type_, out_type_);
|
||||
}
|
||||
return tensor_op;
|
||||
}
|
||||
|
||||
} // namespace text
|
||||
} // namespace api
|
||||
} // namespace dataset
|
||||
|
|
|
@ -563,7 +563,7 @@ Status NormalizeOperation::ValidateParams() {
|
|||
RETURN_STATUS_SYNTAX_ERROR(err_msg);
|
||||
}
|
||||
if (mean_[i] < 0.0f || mean_[i] > 255.0f || CmpFloat(mean_[i], 0.0f)) {
|
||||
std::string err_msg = "Normalize: mean vector has incorrect value: " + std::to_string(std_[i]);
|
||||
std::string err_msg = "Normalize: mean vector has incorrect value: " + std::to_string(mean_[i]);
|
||||
MS_LOG(ERROR) << err_msg;
|
||||
RETURN_STATUS_SYNTAX_ERROR(err_msg);
|
||||
}
|
||||
|
|
|
@ -113,6 +113,11 @@ Status SaveToDisk::ValidateParams() {
|
|||
MS_LOG(ERROR) << err;
|
||||
RETURN_STATUS_SYNTAX_ERROR(err);
|
||||
}
|
||||
if (access(dir.ParentPath().c_str(), R_OK) == -1) {
|
||||
std::string err_msg = "CreateSaver failed, no access to specified dataset path: " + dataset_path_;
|
||||
MS_LOG(ERROR) << err_msg;
|
||||
RETURN_STATUS_SYNTAX_ERROR(err_msg);
|
||||
}
|
||||
if (num_files_ <= 0 || num_files_ > 1000) {
|
||||
std::string err = "CreateSaver failed, num_files must between 1 and 1000, but got " + std::to_string(num_files_);
|
||||
MS_LOG(ERROR) << err;
|
||||
|
|
|
@ -76,7 +76,7 @@ Status BuildSentencePieceVocabOp::SentenceThread() {
|
|||
} else {
|
||||
auto itr = column_name_id_map_.find(col_names_[0]);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(itr != column_name_id_map_.end(),
|
||||
"Invalid parameter, column name: " + col_names_[0] + "does not exist.");
|
||||
"Invalid parameter, column name: " + col_names_[0] + " does not exist.");
|
||||
col_id_ = itr->second;
|
||||
}
|
||||
std::unique_ptr<DatasetSentenceIterator> sentence_iter = std::make_unique<DatasetSentenceIterator>(this);
|
||||
|
|
|
@ -48,6 +48,17 @@ Status WeightedRandomSampler::InitSampler() {
|
|||
CHECK_FAIL_RETURN_UNEXPECTED(samples_per_buffer_ > 0,
|
||||
"Invalid parameter, samples_per_buffer must be greater than 0, but got " +
|
||||
std::to_string(samples_per_buffer_) + ".\n");
|
||||
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(weights_.size() != 0, "Invalid parameter, weights size must not be 0.\n");
|
||||
int32_t zero_elem = 0;
|
||||
for (auto &elem : weights_) {
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(elem >= 0.0, "Invalid parameter, weights must not contain negative number, but got " +
|
||||
std::to_string(elem) + ".\n");
|
||||
if (elem == 0.0) zero_elem++;
|
||||
}
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(zero_elem != weights_.size(),
|
||||
"Invalid parameter, elements of weights must not be all zero.\n");
|
||||
|
||||
if (weights_.size() > static_cast<size_t>(num_rows_)) {
|
||||
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
|
||||
"Invalid parameter, size of sample weights must be less than or equal to num of data, "
|
||||
|
|
|
@ -5,6 +5,7 @@ add_subdirectory(source)
|
|||
set(DATASET_ENGINE_IR_DATASETOPS_SRC_FILES
|
||||
batch_node.cc
|
||||
bucket_batch_by_length_node.cc
|
||||
build_sentence_piece_vocab_node.cc
|
||||
build_vocab_node.cc
|
||||
concat_node.cc
|
||||
map_node.cc
|
||||
|
|
|
@ -0,0 +1,82 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h"
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
namespace api {
|
||||
|
||||
/// \brief Constructor: stores the build settings and registers the child dataset.
/// \param[in] child Dataset appended to this node's children
/// \param[in] vocab Vocab object stored in vocab_
/// \param[in] col_names Column names stored in col_names_
/// \param[in] vocab_size Vocabulary size stored in vocab_size_
/// \param[in] character_coverage Character coverage stored in character_coverage_
/// \param[in] model_type Model type stored in model_type_
/// \param[in] params Extra key/value options stored in params_
BuildSentenceVocabNode::BuildSentenceVocabNode(std::shared_ptr<Dataset> child,
                                               std::shared_ptr<SentencePieceVocab> vocab,
                                               const std::vector<std::string> &col_names, uint32_t vocab_size,
                                               float character_coverage, SentencePieceModel model_type,
                                               const std::unordered_map<std::string, std::string> &params)
    : vocab_(vocab),
      col_names_(col_names),
      vocab_size_(vocab_size),
      character_coverage_(character_coverage),
      model_type_(model_type),
      params_(params) {
  this->children.push_back(child);
}
|
||||
|
||||
// Function to build BuildSentenceVocabNode
|
||||
std::vector<std::shared_ptr<DatasetOp>> BuildSentenceVocabNode::Build() {
|
||||
// A vector containing shared pointer to the Dataset Ops that this object will create
|
||||
std::vector<std::shared_ptr<DatasetOp>> node_ops;
|
||||
|
||||
std::shared_ptr<BuildSentencePieceVocabOp> build_sentence_piece_vocab_op;
|
||||
build_sentence_piece_vocab_op = std::make_shared<BuildSentencePieceVocabOp>(
|
||||
vocab_, col_names_, vocab_size_, character_coverage_, model_type_, params_, connector_que_size_);
|
||||
node_ops.push_back(build_sentence_piece_vocab_op);
|
||||
return node_ops;
|
||||
}
|
||||
|
||||
/// \brief Parameters validation
/// \notes Rejects a null vocab, a zero vocab_size and a character_coverage outside [0.98, 1.0] (NaN included).
/// \return Status::OK() if all the parameters are valid; otherwise the status produced by
///     RETURN_STATUS_SYNTAX_ERROR with the logged message
Status BuildSentenceVocabNode::ValidateParams() {
  if (vocab_ == nullptr) {
    std::string err_msg = "BuildSentenceVocabNode: vocab is null.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  // vocab_size_ is unsigned, so "not positive" can only mean zero.
  if (vocab_size_ == 0) {
    std::string err_msg =
      "BuildSentenceVocabNode: vocab_size should be positive, but got: " + std::to_string(vocab_size_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  // Written as a negated in-range test so that NaN, which fails every ordered comparison, is rejected too.
  const bool coverage_in_range = character_coverage_ >= 0.98f && character_coverage_ <= 1.0f;
  if (!coverage_in_range) {
    std::string err_msg = "BuildSentenceVocabNode: character_coverage should to be between 0.98 and 1.0, but got " +
                          std::to_string(character_coverage_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}
|
||||
} // namespace api
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,62 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUILD_SENTENCE_PIECE_VOCAB_NODE_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUILD_SENTENCE_PIECE_VOCAB_NODE_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "minddata/dataset/include/datasets.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
namespace api {
|
||||
|
||||
class BuildSentenceVocabNode : public Dataset {
|
||||
public:
|
||||
/// \brief Constructor
|
||||
BuildSentenceVocabNode(std::shared_ptr<Dataset> child, std::shared_ptr<SentencePieceVocab> vocab,
|
||||
const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage,
|
||||
SentencePieceModel model_type, const std::unordered_map<std::string, std::string> ¶ms);
|
||||
|
||||
/// \brief Destructor
|
||||
~BuildSentenceVocabNode() = default;
|
||||
|
||||
/// \brief a base class override function to create the required runtime dataset op objects for this class
|
||||
/// \return The list of shared pointers to the newly created DatasetOps
|
||||
std::vector<std::shared_ptr<DatasetOp>> Build() override;
|
||||
|
||||
/// \brief Parameters validation
|
||||
/// \return Status Status::OK() if all the parameters are valid
|
||||
Status ValidateParams() override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<SentencePieceVocab> vocab_;
|
||||
std::vector<std::string> col_names_;
|
||||
uint32_t vocab_size_;
|
||||
float character_coverage_;
|
||||
SentencePieceModel model_type_;
|
||||
std::unordered_map<std::string, std::string> params_;
|
||||
};
|
||||
|
||||
} // namespace api
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_BUILD_SENTENCE_PIECE_VOCAB_NODE_H_
|
|
@ -22,6 +22,7 @@
|
|||
#include <memory>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
@ -37,6 +38,7 @@
|
|||
#include "minddata/dataset/kernels/tensor_op.h"
|
||||
#include "minddata/dataset/util/path.h"
|
||||
#ifndef ENABLE_ANDROID
|
||||
#include "minddata/dataset/text/sentence_piece_vocab.h"
|
||||
#include "minddata/dataset/text/vocab.h"
|
||||
#endif
|
||||
|
||||
|
@ -86,7 +88,6 @@ class VOCNode;
|
|||
// Dataset Op classes (in alphabetical order)
|
||||
#ifndef ENABLE_ANDROID
|
||||
class BucketBatchByLengthNode;
|
||||
class BuildVocabNode;
|
||||
#endif
|
||||
class ConcatNode;
|
||||
class MapNode;
|
||||
|
@ -635,7 +636,7 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
|
|||
|
||||
/// \brief Function to transfer data through a device.
|
||||
/// \notes If device is Ascend, features of data will be transferred one by one. The limitation
|
||||
/// of data transmission per time is 256M.
|
||||
/// of data transmission per time is 256M.
|
||||
/// \param[in] send_epoch_end Whether to send end of sequence to device or not (default=True).
|
||||
/// \return Returns true if no error encountered else false.
|
||||
bool DeviceQueue(bool send_epoch_end = true);
|
||||
|
@ -658,7 +659,7 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
|
|||
|
||||
/// \brief Function to create a BatchNode
|
||||
/// \notes Combines batch_size number of consecutive rows into batches
|
||||
/// \param[in] batch_size Path to the root directory that contains the dataset
|
||||
/// \param[in] batch_size The number of rows each batch is created with
|
||||
/// \param[in] drop_remainder Determines whether or not to drop the last possibly incomplete
|
||||
/// batch. If true, and if there are less than batch_size rows
|
||||
/// available to make the last batch, then those rows will
|
||||
|
@ -668,7 +669,8 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
|
|||
|
||||
#ifndef ENABLE_ANDROID
|
||||
/// \brief Function to create a BucketBatchByLengthNode
|
||||
/// \notes Combines batch_size number of consecutive rows into batches
|
||||
/// \notes Bucket elements according to their lengths. Each bucket will be padded and batched when
|
||||
/// they are full.
|
||||
/// \param[in] column_names Columns passed to element_length_function
|
||||
/// \param[in] bucket_boundaries A list consisting of the upper boundaries of the buckets.
|
||||
/// Must be strictly increasing. If there are n boundaries, n+1 buckets are created: One bucket for
|
||||
|
@ -676,10 +678,10 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
|
|||
/// 0<i<n, and one bucket for [bucket_boundaries[n-1], inf).
|
||||
/// \param[in] bucket_batch_sizes A list consisting of the batch sizes for each bucket.
|
||||
/// Must contain elements equal to the size of bucket_boundaries + 1.
|
||||
/// \param[in] element_length_function A function pointer that takes in TensorRow and outputs a TensorRow. The
|
||||
/// output
|
||||
/// must contain a single tensor containing a single int32_t. If no value is provided, then size of column_names
|
||||
/// must be 1, and the size of the first dimension of that column will be taken as the length (default=nullptr)
|
||||
/// \param[in] element_length_function A function pointer that takes in TensorRow and outputs a TensorRow.
|
||||
/// The output must contain a single tensor containing a single int32_t. If no value is provided,
|
||||
/// then size of column_names must be 1, and the size of the first dimension of that column will be taken
|
||||
/// as the length (default=nullptr)
|
||||
/// \param[in] pad_info Represents how to batch each column. The key corresponds to the column name, the value must
|
||||
/// be a tuple of 2 elements. The first element corresponds to the shape to pad to, and the second element
|
||||
/// corresponds to the value to pad with. If a column is not specified, then that column will be padded to the
|
||||
|
@ -687,8 +689,8 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
|
|||
/// padded to the longest in the current batch, unless if pad_to_bucket_boundary is true. If no padding is
|
||||
/// wanted, set pad_info to None (default=empty dictionary).
|
||||
/// \param[in] pad_to_bucket_boundary If true, will pad each unspecified dimension in pad_info to the
|
||||
/// bucket_boundary
|
||||
/// minus 1. If there are any elements that fall into the last bucket, an error will occur (default=false).
|
||||
/// bucket_boundary minus 1. If there are any elements that fall into the last bucket,
|
||||
/// an error will occur (default=false).
|
||||
/// \param[in] drop_remainder If true, will drop the last batch for each bucket if it is not a full batch
|
||||
/// (default=false).
|
||||
/// \return Shared pointer to the current BucketBatchByLengthNode
|
||||
|
@ -699,6 +701,20 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
|
|||
const std::map<std::string, std::pair<TensorShape, std::shared_ptr<Tensor>>> &pad_info = {},
|
||||
bool pad_to_bucket_boundary = false, bool drop_remainder = false);
|
||||
|
||||
/// \brief Function to create a SentencePieceVocab from source dataset
|
||||
/// \notes Build a SentencePieceVocab from a dataset.
|
||||
/// \param[in] col_names Column names to get words from. It can be a vector of column names
|
||||
/// \param[in] vocab_size Vocabulary size. The type is uint32
|
||||
/// \param[in] character_coverage Percentage of characters covered by the model, must be between
|
||||
/// 0.98 and 1.0 Good defaults are: 0.9995 for languages with rich character sets like
|
||||
/// Japanese or Chinese character sets, and 1.0 for other languages with small character sets.
|
||||
/// \param[in] model_type Model type. Choose from unigram (default), bpe, char, or word.
|
||||
/// The input sentence must be pretokenized when using word type.
|
||||
/// \param[in] params A vector contains more option parameters of sentencepiece library
|
||||
std::shared_ptr<SentencePieceVocab> BuildSentencePieceVocab(
|
||||
const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage,
|
||||
SentencePieceModel model_type, const std::unordered_map<std::string, std::string> ¶ms);
|
||||
|
||||
/// \brief Function to create a Vocab from source dataset
|
||||
/// \notes Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
|
||||
/// which contains top_k most frequent words (if top_k is specified)
|
||||
|
|
|
@ -21,11 +21,14 @@
|
|||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "mindspore/ccsrc/minddata/dataset/core/data_type.h"
|
||||
#include "minddata/dataset/core/constants.h"
|
||||
#include "minddata/dataset/include/transforms.h"
|
||||
#include "minddata/dataset/text/vocab.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
#include "mindspore/ccsrc/minddata/dataset/core/data_type.h"
|
||||
|
||||
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
|
||||
#include "minddata/dataset/text/sentence_piece_vocab.h"
|
||||
#include "minddata/dataset/text/vocab.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
@ -36,6 +39,7 @@ namespace text {
|
|||
|
||||
// Text Op classes (in alphabetical order)
|
||||
class LookupOperation;
|
||||
class SentencePieceTokenizerOperation;
|
||||
|
||||
/// \brief Lookup operator that looks up a word to an id.
|
||||
/// \param[in] vocab a Vocab object.
|
||||
|
@ -46,6 +50,20 @@ class LookupOperation;
|
|||
std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
|
||||
const mindspore::dataset::DataType &data_type = DataType("int32"));
|
||||
|
||||
/// \brief Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
|
||||
/// \param[in] vocab a SentencePieceVocab object.
|
||||
/// \param[in] out_type The type of output.
|
||||
/// \return Shared pointer to the current TensorOperation.
|
||||
std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
|
||||
const std::shared_ptr<SentencePieceVocab> &vocab, mindspore::dataset::SPieceTokenizerOutType out_type);
|
||||
|
||||
/// \brief Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
|
||||
/// \param[in] vocab_path vocab model file path.
|
||||
/// \param[in] out_type The type of output.
|
||||
/// \return Shared pointer to the current TensorOperation.
|
||||
std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
|
||||
const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type);
|
||||
|
||||
/* ####################################### Derived TensorOperation classes ################################# */
|
||||
|
||||
class LookupOperation : public TensorOperation {
|
||||
|
@ -65,6 +83,25 @@ class LookupOperation : public TensorOperation {
|
|||
int32_t default_id_;
|
||||
DataType data_type_;
|
||||
};
|
||||
|
||||
class SentencePieceTokenizerOperation : public TensorOperation {
|
||||
public:
|
||||
SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type);
|
||||
|
||||
SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type);
|
||||
|
||||
~SentencePieceTokenizerOperation() = default;
|
||||
|
||||
std::shared_ptr<TensorOp> Build() override;
|
||||
|
||||
Status ValidateParams() override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<SentencePieceVocab> vocab_;
|
||||
std::string vocab_path_;
|
||||
SPieceTokenizerLoadType load_type_;
|
||||
SPieceTokenizerOutType out_type_;
|
||||
};
|
||||
} // namespace text
|
||||
} // namespace api
|
||||
} // namespace dataset
|
||||
|
|
|
@ -585,15 +585,6 @@ class WeightedRandomSampler(BuiltinSampler):
|
|||
if not isinstance(weights, list):
|
||||
weights = [weights]
|
||||
|
||||
if weights == []:
|
||||
raise ValueError("weights size should not be 0")
|
||||
|
||||
if list(filter(lambda x: x < 0, weights)):
|
||||
raise ValueError("weights should not contain negative numbers")
|
||||
|
||||
if list(filter(lambda x: x == 0, weights)) == weights:
|
||||
raise ValueError("elements of weights should not be all zero")
|
||||
|
||||
if num_samples is not None:
|
||||
if num_samples <= 0:
|
||||
raise ValueError("num_samples should be a positive integer "
|
||||
|
|
|
@ -139,6 +139,7 @@ if (BUILD_MINDDATA STREQUAL "full")
|
|||
|
||||
list(REMOVE_ITEM MINDDATA_ENGINE_IR_DATASETOPS_SRC_FILES
|
||||
"${MINDDATA_DIR}/engine/ir/datasetops/bucket_batch_by_length_node.cc"
|
||||
"${MINDDATA_DIR}/engine/ir/datasetops/build_sentence_piece_vocab_node.cc"
|
||||
"${MINDDATA_DIR}/engine/ir/datasetops/build_vocab_node.cc"
|
||||
"${MINDDATA_DIR}/engine/ir/datasetops/sync_wait_node.cc"
|
||||
)
|
||||
|
|
|
@ -112,12 +112,15 @@ SET(DE_UT_SRCS
|
|||
c_api_dataset_config_test.cc
|
||||
c_api_dataset_csv_test.cc
|
||||
c_api_dataset_manifest_test.cc
|
||||
c_api_dataset_minddata_test.cc
|
||||
c_api_dataset_randomdata_test.cc
|
||||
c_api_dataset_save.cc
|
||||
c_api_dataset_textfile_test.cc
|
||||
c_api_dataset_tfrecord_test.cc
|
||||
c_api_dataset_voc_test.cc
|
||||
c_api_datasets_test.cc
|
||||
c_api_dataset_iterator_test.cc
|
||||
c_api_text_sentence_piece_vocab_test.cc
|
||||
c_api_text_vocab_test.cc
|
||||
tensor_op_fusion_pass_test.cc
|
||||
sliding_window_op_test.cc
|
||||
|
|
|
@ -0,0 +1,224 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "common/common.h"
|
||||
#include "minddata/dataset/include/datasets.h"
|
||||
#include "minddata/dataset/include/status.h"
|
||||
#include "minddata/dataset/include/transforms.h"
|
||||
#include "minddata/dataset/include/text.h"
|
||||
|
||||
// IR non-leaf nodes
|
||||
#include "minddata/dataset/engine/ir/datasetops/map_node.h"
|
||||
|
||||
// IR leaf nodes
|
||||
#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h"
|
||||
|
||||
using namespace mindspore::dataset::api;
|
||||
using mindspore::dataset::Tensor;
|
||||
using mindspore::dataset::ShuffleMode;
|
||||
using mindspore::dataset::SentencePieceModel;
|
||||
using mindspore::dataset::SentencePieceVocab;
|
||||
|
||||
// Test fixture for the pipeline tests in this file; it adds no members of its own to UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};
|
||||
|
||||
// Builds a SentencePieceVocab from a TextFile dataset, tokenizes a second TextFile dataset with the in-memory
// vocab object, and expects exactly one row equal to the expected token sequence.
// Pointers that are used afterwards are checked with ASSERT_NE so a null result ends the test instead of
// being dereferenced.
TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabSuccess1 plus sentencepiece tokenizer.";

  // Create a TextFile dataset
  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
  ASSERT_NE(ds_vocab, nullptr);

  // Create vocab from dataset
  std::shared_ptr<SentencePieceVocab> vocab =
    ds_vocab->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, {});
  ASSERT_NE(vocab, nullptr);

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/sentencepiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  ASSERT_NE(ds, nullptr);

  // Create SentencePieceTokenizer operation from vocab object
  std::shared_ptr<TensorOperation> sentencepiece_tokenizer =
    text::SentencePieceTokenizer(vocab, mindspore::dataset::SPieceTokenizerOutType::kString);
  ASSERT_NE(sentencepiece_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({sentencepiece_tokenizer}, {"text"});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // Expected result after tokenization; the tensor is the same for every row, so it is created once.
  std::vector<std::string> expected = {"▁I", "▁sa", "w", "▁a", "▁girl", "▁with", "▁a", "▁te", "les", "co", "pe", "."};
  std::shared_ptr<Tensor> expected_tensor;
  Tensor::CreateFromVector(expected, &expected_tensor);

  uint64_t i = 0;
  while (row.size() != 0) {
    auto txt = row["text"];
    MS_LOG(INFO) << *txt;
    EXPECT_EQ(*txt, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 1);
}
|
||||
|
||||
// Builds a SentencePieceVocab from a TextFile dataset, saves it as a model file, tokenizes a second TextFile
// dataset with a tokenizer created from that file, and expects exactly one row equal to the expected tokens.
// Pointers that are used afterwards are checked with ASSERT_NE so a null result ends the test instead of
// being dereferenced.
TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabSuccess2 plus sentencepiece tokenizer.";

  // Create a TextFile dataset
  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
  ASSERT_NE(ds_vocab, nullptr);

  // Create vocab from dataset
  std::shared_ptr<SentencePieceVocab> vocab =
    ds_vocab->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, {});
  ASSERT_NE(vocab, nullptr);

  // Save vocab model to local
  vocab->SaveModel(&vocab, datasets_root_path_ + "/test_sentencepiece", "m.model");

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/sentencepiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  ASSERT_NE(ds, nullptr);

  // Create SentencePieceTokenizer operation from local vocab model
  std::string vocab_model = datasets_root_path_ + "/test_sentencepiece/m.model";
  std::shared_ptr<TensorOperation> sentencepiece_tokenizer =
    text::SentencePieceTokenizer(vocab_model, mindspore::dataset::SPieceTokenizerOutType::kString);
  ASSERT_NE(sentencepiece_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({sentencepiece_tokenizer}, {"text"});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // Expected result after tokenization; the tensor is the same for every row, so it is created once.
  std::vector<std::string> expected = {"▁I", "▁sa", "w", "▁a", "▁girl", "▁with", "▁a", "▁te", "les", "co", "pe", "."};
  std::shared_ptr<Tensor> expected_tensor;
  Tensor::CreateFromVector(expected, &expected_tensor);

  uint64_t i = 0;
  while (row.size() != 0) {
    auto txt = row["text"];
    MS_LOG(INFO) << *txt;
    EXPECT_EQ(*txt, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 1);
}
|
||||
|
||||
// Feature: BuildSentencePieceVocab parameter handling.
// Description: calls BuildSentencePieceVocab on a TextFile dataset with a zero vocab_size,
//              out-of-range character_coverage values and a non-existent column name.
// Expectation: every call returns nullptr.
TEST_F(MindDataTestPipeline, TestSentencePieceVocabFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabFail with incorrect parameter.";

  // Create a TextFile dataset
  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
  // Fatal assertion: ds_vocab is dereferenced by every case below.
  ASSERT_NE(ds_vocab, nullptr);

  // vocab_size must be greater than 0
  std::shared_ptr<SentencePieceVocab> vocab1 =
    ds_vocab->BuildSentencePieceVocab({}, 0, 0.9995, SentencePieceModel::kUnigram, {});
  EXPECT_EQ(vocab1, nullptr);

  // character_coverage must be between 0.98 and 1.0: 0.979 is below the lower bound
  std::shared_ptr<SentencePieceVocab> vocab2 =
    ds_vocab->BuildSentencePieceVocab({}, 1, 0.979, SentencePieceModel::kUnigram, {});
  EXPECT_EQ(vocab2, nullptr);

  // character_coverage must be between 0.98 and 1.0: 1.01 is above the upper bound
  std::shared_ptr<SentencePieceVocab> vocab3 =
    ds_vocab->BuildSentencePieceVocab({}, 1, 1.01, SentencePieceModel::kUnigram, {});
  EXPECT_EQ(vocab3, nullptr);

  // column name does not exist
  std::shared_ptr<SentencePieceVocab> vocab4 =
    ds_vocab->BuildSentencePieceVocab({"image"}, 2, 0.98, SentencePieceModel::kUnigram, {});
  EXPECT_EQ(vocab4, nullptr);
}
|
||||
|
||||
// Feature: SentencePieceTokenizer parameter handling.
// Description: creates SentencePieceTokenizer operations from an empty model path, from the
//              bare file name "m.model", and from a null SentencePieceVocab pointer.
// Expectation: every creation call returns nullptr.
TEST_F(MindDataTestPipeline, TestSentencePieceTokenizerFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceTokenizerFail1 with incorrect parameter.";

  // Create SentencePieceTokenizer operation from an empty vocab model path
  std::string vocab_model1 = "";
  std::shared_ptr<TensorOperation> sentencepiece_tokenizer1 =
    text::SentencePieceTokenizer(vocab_model1, mindspore::dataset::SPieceTokenizerOutType::kString);
  EXPECT_EQ(sentencepiece_tokenizer1, nullptr);

  // Create SentencePieceTokenizer operation from a bare file name
  // NOTE(review): presumably rejected because "m.model" does not resolve to an existing file - confirm.
  std::string vocab_model2 = "m.model";
  std::shared_ptr<TensorOperation> sentencepiece_tokenizer2 =
    text::SentencePieceTokenizer(vocab_model2, mindspore::dataset::SPieceTokenizerOutType::kString);
  EXPECT_EQ(sentencepiece_tokenizer2, nullptr);

  // Create SentencePieceTokenizer operation from a null vocab object
  std::shared_ptr<SentencePieceVocab> vocab_model3 = nullptr;
  std::shared_ptr<TensorOperation> sentencepiece_tokenizer3 =
    text::SentencePieceTokenizer(vocab_model3, mindspore::dataset::SPieceTokenizerOutType::kString);
  EXPECT_EQ(sentencepiece_tokenizer3, nullptr);
}
|
||||
|
||||
// Feature: SentencePieceTokenizer with an invalid SentencePieceVocab object.
// Description: creates a tokenizer from a default-constructed SentencePieceVocab, maps it over
//              the "text" column of a TextFile dataset and tries to fetch the first row.
// Expectation: the tokenizer, the mapped dataset and the iterator are all created, but
//              GetNextRow reports failure.
TEST_F(MindDataTestPipeline, TestSentencePieceTokenizerFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceTokenizerFail2 with invalid SentencePieceVocab object.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/sentencepiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  // Fatal assertion: ds is dereferenced by Map below.
  ASSERT_NE(ds, nullptr);

  // Create SentencePieceTokenizer operation from vocab object
  std::shared_ptr<SentencePieceVocab> vocab_model4 = std::make_shared<SentencePieceVocab>();
  std::shared_ptr<TensorOperation> sentencepiece_tokenizer4 =
    text::SentencePieceTokenizer(vocab_model4, mindspore::dataset::SPieceTokenizerOutType::kString);
  EXPECT_NE(sentencepiece_tokenizer4, nullptr);

  // Create Map operation on ds
  ds = ds->Map({sentencepiece_tokenizer4}, {"text"});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row: fetching the first row is where the failure surfaces
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  EXPECT_FALSE(iter->GetNextRow(&row));
}
|
Loading…
Reference in New Issue