forked from mindspore-Ecosystem/mindspore
!12349 [MD] Push down IR files for transforms and text
From: @tina_mengting_zhang Reviewed-by: Signed-off-by:
This commit is contained in:
commit ed7fef5d5e
@@ -92,11 +92,14 @@ add_dependencies(engine core)
 add_dependencies(callback core)
 add_dependencies(text core)
 add_dependencies(text-kernels core)
+add_dependencies(text-ir core)
+add_dependencies(text-ir-kernels core)
 add_dependencies(cpp-API core)
 add_dependencies(engine-ir-datasetops core)
 add_dependencies(engine-ir-datasetops-source core)
 add_dependencies(engine-ir-cache core)
 add_dependencies(kernels-ir core)
+add_dependencies(kernels-ir-data core)
 add_dependencies(kernels-ir-vision core)

 if(ENABLE_ACL)

@@ -146,7 +149,10 @@ set(submodules
   $<TARGET_OBJECTS:engine>
   $<TARGET_OBJECTS:text>
   $<TARGET_OBJECTS:text-kernels>
+  $<TARGET_OBJECTS:text-ir>
+  $<TARGET_OBJECTS:text-ir-kernels>
   $<TARGET_OBJECTS:kernels-ir>
+  $<TARGET_OBJECTS:kernels-ir-data>
   $<TARGET_OBJECTS:kernels-ir-vision>
 )

@@ -17,9 +17,9 @@

 #include "minddata/dataset/api/python/pybind_register.h"
 #include "minddata/dataset/core/global_context.h"
-#include "minddata/dataset/include/transforms.h"

 #include "minddata/dataset/kernels/py_func_op.h"
+#include "minddata/dataset/kernels/ir/data/transforms_ir.h"
 #include "minddata/dataset/kernels/ir/vision/vision_ir.h"

 namespace mindspore {

@@ -18,7 +18,7 @@
 #include "pybind11/stl.h"
 #include "pybind11/stl_bind.h"
 #include "minddata/dataset/api/python/pybind_register.h"
-#include "minddata/dataset/include/text.h"
+#include "minddata/dataset/text/ir/kernels/text_ir.h"
 #include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h"
 #include "minddata/dataset/text/sentence_piece_vocab.h"
 #include "minddata/dataset/text/vocab.h"

@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.

@@ -30,10 +30,10 @@
 #include "pybind11/stl_bind.h"
 #include "minddata/dataset/include/datasets.h"
 #include "minddata/dataset/include/samplers.h"
-#include "minddata/dataset/include/transforms.h"
 #include "minddata/dataset/api/python/pybind_register.h"
 #include "minddata/dataset/engine/ir/cache/pre_built_dataset_cache.h"
 #include "minddata/dataset/engine/ir/datasetops/source/csv_node.h"
+#include "minddata/dataset/kernels/ir/data/transforms_ir.h"
 #include "minddata/dataset/kernels/py_func_op.h"
 namespace py = pybind11;

@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.

@@ -17,30 +17,6 @@
 #include <unistd.h>

 #include "minddata/dataset/include/text.h"
-#ifndef _WIN32
-#include "minddata/dataset/text/kernels/basic_tokenizer_op.h"
-#include "minddata/dataset/text/kernels/bert_tokenizer_op.h"
-#include "minddata/dataset/text/kernels/case_fold_op.h"
-#endif
-#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
-#include "minddata/dataset/text/kernels/lookup_op.h"
-#include "minddata/dataset/text/kernels/ngram_op.h"
-#ifndef _WIN32
-#include "minddata/dataset/text/kernels/normalize_utf8_op.h"
-#include "minddata/dataset/text/kernels/regex_replace_op.h"
-#include "minddata/dataset/text/kernels/regex_tokenizer_op.h"
-#endif
-#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
-#include "minddata/dataset/text/kernels/sliding_window_op.h"
-#include "minddata/dataset/text/kernels/to_number_op.h"
-#include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h"
-#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
-#ifndef _WIN32
-#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
-#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
-#endif
-#include "minddata/dataset/core/data_type.h"
-#include "minddata/dataset/util/path.h"

 namespace mindspore {
 namespace dataset {

@@ -174,426 +150,6 @@ std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offs
   return op->ValidateParams() ? op : nullptr;
 }
 #endif
-
-/* ####################################### Validator Functions ############################################ */
-
-// Helper function to validate tokenizer directory parameter
-Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file) {
-  if (tokenizer_file.empty()) {
-    std::string err_msg = tokenizer_name + ": tokenizer_file is not specified.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  Path file(tokenizer_file);
-  if (!file.Exists()) {
-    std::string err_msg = tokenizer_name + ": tokenizer_file: [" + tokenizer_file + "] is an invalid directory path.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  if (access(tokenizer_file.c_str(), R_OK) == -1) {
-    std::string err_msg = tokenizer_name + ": No access to specified tokenizer path: " + tokenizer_file;
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  return Status::OK();
-}
-
-// Helper functions to help validate data type passed by user
-bool IsTypeNumeric(const std::string &data_type) {
-  if (data_type == "int8" || data_type == "uint8" || data_type == "int16" || data_type == "uint16" ||
-      data_type == "int32" || data_type == "uint32" || data_type == "int64" || data_type == "uint64" ||
-      data_type == "float16" || data_type == "float32" || data_type == "float64")
-    return true;
-  return false;
-}
-
-bool IsTypeBoolean(const std::string &data_type) { return data_type == "bool"; }
-
-bool IsTypeString(const std::string &data_type) { return data_type == "string"; }
-
-/* ####################################### Derived TensorOperation classes ################################# */
-
-// (In alphabetical order)
-
-#ifndef _WIN32
-// BasicTokenizerOperation
-BasicTokenizerOperation::BasicTokenizerOperation(bool lower_case, bool keep_whitespace,
-                                                 const NormalizeForm normalize_form, bool preserve_unused_token,
-                                                 bool with_offsets)
-    : lower_case_(lower_case),
-      keep_whitespace_(keep_whitespace),
-      normalize_form_(normalize_form),
-      preserve_unused_token_(preserve_unused_token),
-      with_offsets_(with_offsets) {}
-
-Status BasicTokenizerOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> BasicTokenizerOperation::Build() {
-  std::shared_ptr<BasicTokenizerOp> tensor_op = std::make_shared<BasicTokenizerOp>(
-    lower_case_, keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_);
-  return tensor_op;
-}
-
-// BertTokenizerOperation
-BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
-                                               int32_t max_bytes_per_token, const std::string &unknown_token,
-                                               bool lower_case, bool keep_whitespace,
-                                               const NormalizeForm normalize_form, bool preserve_unused_token,
-                                               bool with_offsets)
-    : vocab_(vocab),
-      suffix_indicator_(suffix_indicator),
-      max_bytes_per_token_(max_bytes_per_token),
-      unknown_token_(unknown_token),
-      lower_case_(lower_case),
-      keep_whitespace_(keep_whitespace),
-      normalize_form_(normalize_form),
-      preserve_unused_token_(preserve_unused_token),
-      with_offsets_(with_offsets) {}
-
-BertTokenizerOperation::~BertTokenizerOperation() = default;
-
-Status BertTokenizerOperation::ValidateParams() {
-  if (vocab_ == nullptr) {
-    std::string err_msg = "BertTokenizer: vocab object type is incorrect or null.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  if (max_bytes_per_token_ < 0) {
-    std::string err_msg = "BertTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " +
-                          std::to_string(max_bytes_per_token_);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> BertTokenizerOperation::Build() {
-  std::shared_ptr<BertTokenizerOp> tensor_op =
-    std::make_shared<BertTokenizerOp>(vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, lower_case_,
-                                      keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_);
-  return tensor_op;
-}
-
-// CaseFoldOperation
-Status CaseFoldOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> CaseFoldOperation::Build() {
-  std::shared_ptr<CaseFoldOp> tensor_op = std::make_shared<CaseFoldOp>();
-  return tensor_op;
-}
-#endif
-
-// JiebaTokenizerOperation
-JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path,
-                                                 const JiebaMode &mode, bool with_offsets)
-    : hmm_path_(hmm_path), mp_path_(mp_path), mode_(mode), with_offsets_(with_offsets) {}
-
-Status JiebaTokenizerOperation::ValidateParams() {
-  if (hmm_path_.empty()) {
-    std::string err_msg = "JiebaTokenizer: The dict of HMMSegment in cppjieba is not provided.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  if (mp_path_.empty()) {
-    std::string err_msg = "JiebaTokenizer: The dict of MPSegment in cppjieba is not provided.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_));
-  RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", mp_path_));
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() {
-  std::shared_ptr<JiebaTokenizerOp> tensor_op =
-    std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_);
-  for (auto &word : words_list_) {
-    Status rc = tensor_op->AddWord(word.first, word.second);
-    if (rc.IsError()) {
-      MS_LOG(ERROR) << rc;
-      return {};
-    }
-  }
-  return tensor_op;
-}
-
-Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) {
-  if (word.empty()) {
-    std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-  if (freq < 0) {
-    std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-  words_list_.emplace_back(word, freq);
-  return Status::OK();
-}
-
-// LookupOperation
-LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
-                                 const std::string &data_type)
-    : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}
-
-LookupOperation::~LookupOperation() = default;
-
-Status LookupOperation::ValidateParams() {
-  if (vocab_ == nullptr) {
-    std::string err_msg = "Lookup: vocab object type is incorrect or null.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-  if (unknown_token_ != std::nullopt) {
-    default_id_ = vocab_->Lookup(*unknown_token_);
-    if (default_id_ == Vocab::kNoTokenExists) {
-      std::string err_msg = "Lookup: \"" + *unknown_token_ + "\" doesn't exist in vocab.";
-      MS_LOG(ERROR) << err_msg;
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-  }
-
-  if (!IsTypeNumeric(data_type_)) {
-    std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> LookupOperation::Build() {
-  std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, DataType(data_type_));
-  return tensor_op;
-}
-
-// NgramOperation
-NgramOperation::NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
-                               const std::pair<std::string, int32_t> &right_pad, const std::string &separator)
-    : ngrams_(ngrams), left_pad_(left_pad), right_pad_(right_pad), separator_(separator) {}
-
-Status NgramOperation::ValidateParams() {
-  if (ngrams_.size() == 0) {
-    std::string err_msg = "Ngram : Container cannot be empty.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  } else {
-    for (int32_t i = 0; i < ngrams_.size(); ++i) {
-      if (ngrams_[i] <= 0) {
-        std::string err_msg =
-          "Ngram : The value of ngrams vector must be greater than 0: " + std::to_string(ngrams_[i]);
-        MS_LOG(ERROR) << err_msg;
-        RETURN_STATUS_SYNTAX_ERROR(err_msg);
-      }
-    }
-  }
-
-  if (left_pad_.second < 0) {
-    std::string err_msg =
-      "Ngram : The second parameter pad_width in left_pad vector must be greater than or equal to 0: " +
-      std::to_string(left_pad_.second);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  if (right_pad_.second < 0) {
-    std::string err_msg =
-      "Ngram : The second parameter pad_width in right_pad vector must be greater than or equal to 0: " +
-      std::to_string(right_pad_.second);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> NgramOperation::Build() {
-  int32_t l_len = left_pad_.second;
-  int32_t r_len = right_pad_.second;
-  std::string l_pad = left_pad_.first;
-  std::string r_pad = right_pad_.first;
-  std::shared_ptr<NgramOp> tensor_op = std::make_shared<NgramOp>(ngrams_, l_len, r_len, l_pad, r_pad, separator_);
-  return tensor_op;
-}
-
-#ifndef _WIN32
-// NormalizeUTF8Operation
-NormalizeUTF8Operation::NormalizeUTF8Operation(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}
-
-Status NormalizeUTF8Operation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> NormalizeUTF8Operation::Build() {
-  std::shared_ptr<NormalizeUTF8Op> tensor_op = std::make_shared<NormalizeUTF8Op>(normalize_form_);
-  return tensor_op;
-}
-
-// RegexReplaceOperation
-RegexReplaceOperation::RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all)
-    : pattern_(pattern), replace_(replace), replace_all_(replace_all) {}
-
-Status RegexReplaceOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> RegexReplaceOperation::Build() {
-  std::shared_ptr<RegexReplaceOp> tensor_op = std::make_shared<RegexReplaceOp>(pattern_, replace_, replace_all_);
-  return tensor_op;
-}
-
-// RegexTokenizerOperation
-RegexTokenizerOperation::RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern,
-                                                 bool with_offsets)
-    : delim_pattern_(delim_pattern), keep_delim_pattern_(keep_delim_pattern), with_offsets_(with_offsets) {}
-
-Status RegexTokenizerOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> RegexTokenizerOperation::Build() {
-  std::shared_ptr<RegexTokenizerOp> tensor_op =
-    std::make_shared<RegexTokenizerOp>(delim_pattern_, keep_delim_pattern_, with_offsets_);
-  return tensor_op;
-}
-#endif
-
-// SentencePieceTokenizerOperation
-SentencePieceTokenizerOperation::~SentencePieceTokenizerOperation() = default;
-
-SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
-                                                                 SPieceTokenizerOutType out_type)
-    : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {}
-
-SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::string &vocab_path,
-                                                                 SPieceTokenizerOutType out_type)
-    : vocab_(nullptr), vocab_path_(vocab_path), load_type_(SPieceTokenizerLoadType::kFile), out_type_(out_type) {}
-
-Status SentencePieceTokenizerOperation::ValidateParams() {
-  if (load_type_ == SPieceTokenizerLoadType::kModel) {
-    if (vocab_ == nullptr) {
-      std::string err_msg = "SentencePieceTokenizer: vocab object type is incorrect or null.";
-      MS_LOG(ERROR) << err_msg;
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-  } else {
-    Path vocab_file(vocab_path_);
-    if (!vocab_file.Exists() || vocab_file.IsDirectory()) {
-      std::string err_msg = "SentencePieceTokenizer : vocab file: [" + vocab_path_ + "] is invalid or does not exist.";
-      MS_LOG(ERROR) << err_msg;
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-    if (access(vocab_file.toString().c_str(), R_OK) == -1) {
-      std::string err_msg = "SentencePieceTokenizer : no access to specified dataset file: " + vocab_path_;
-      MS_LOG(ERROR) << err_msg;
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-  }
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() {
-  std::shared_ptr<SentencePieceTokenizerOp> tensor_op;
-  if (load_type_ == SPieceTokenizerLoadType::kModel) {
-    tensor_op = std::make_shared<SentencePieceTokenizerOp>(vocab_, load_type_, out_type_);
-  } else {
-    Path vocab_file(vocab_path_);
-    std::string model_path = vocab_file.ParentPath();
-    std::string model_filename = vocab_file.Basename();
-    tensor_op = std::make_shared<SentencePieceTokenizerOp>(model_path, model_filename, load_type_, out_type_);
-  }
-  return tensor_op;
-}
-
-// SlidingWindowOperation
-SlidingWindowOperation::SlidingWindowOperation(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {}
-
-Status SlidingWindowOperation::ValidateParams() {
-  if (width_ < 1) {
-    std::string err_msg =
-      "SlidingWindow : The parameter width must be greater than or equal to 1: " + std::to_string(width_);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
-  std::shared_ptr<SlidingWindowOp> tensor_op = std::make_shared<SlidingWindowOp>(static_cast<uint32_t>(width_), axis_);
-  return tensor_op;
-}
-
-// ToNumberOperation
-ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {}
-
-Status ToNumberOperation::ValidateParams() {
-  if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) {
-    std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_;
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> ToNumberOperation::Build() {
-  std::shared_ptr<ToNumberOp> tensor_op = std::make_shared<ToNumberOp>(data_type_);
-  return tensor_op;
-}
-
-TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {}
-
-Status TruncateSequencePairOperation::ValidateParams() {
-  if (max_length_ < 0) {
-    std::string err_msg = "TruncateSequencePair : The parameter max_length must be greater than or equal to 0: " +
-                          std::to_string(max_length_);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> TruncateSequencePairOperation::Build() {
-  std::shared_ptr<TruncateSequencePairOp> tensor_op = std::make_shared<TruncateSequencePairOp>(max_length_);
-  return tensor_op;
-}
-
-// UnicodeCharTokenizerOperation
-UnicodeCharTokenizerOperation::UnicodeCharTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}
-
-Status UnicodeCharTokenizerOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() {
-  std::shared_ptr<UnicodeCharTokenizerOp> tensor_op = std::make_shared<UnicodeCharTokenizerOp>(with_offsets_);
-  return tensor_op;
-}
-
-#ifndef _WIN32
-// UnicodeScriptTokenizerOperation
-UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets)
-    : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
-
-Status UnicodeScriptTokenizerOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> UnicodeScriptTokenizerOperation::Build() {
-  std::shared_ptr<UnicodeScriptTokenizerOp> tensor_op =
-    std::make_shared<UnicodeScriptTokenizerOp>(keep_whitespace_, with_offsets_);
-  return tensor_op;
-}
-
-// WhitespaceTokenizerOperation
-WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}
-
-Status WhitespaceTokenizerOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> WhitespaceTokenizerOperation::Build() {
-  std::shared_ptr<WhitespaceTokenizerOp> tensor_op = std::make_shared<WhitespaceTokenizerOp>(with_offsets_);
-  return tensor_op;
-}
-#endif
-
 } // namespace text
 } // namespace dataset
 } // namespace mindspore
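
For illustration only, not part of the diff: the operations removed above keep the same two-step IR contract after the move into text_ir.cc. A minimal sketch, assuming the WhitespaceTokenizer factory declared in the hunk header keeps its behavior:

    // Sketch (assumed usage). The factory validates the IR node and returns
    // nullptr on bad parameters, i.e. the
    // `return op->ValidateParams() ? op : nullptr;` pattern shown above.
    #include "minddata/dataset/include/text.h"

    std::shared_ptr<mindspore::dataset::TensorOp> BuildWhitespaceTokenizer() {
      auto ir = mindspore::dataset::text::WhitespaceTokenizer(/*with_offsets=*/false);
      if (ir == nullptr) {
        return nullptr;  // parameter validation failed
      }
      return ir->Build();  // constructs the runtime WhitespaceTokenizerOp
    }
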
@@ -15,18 +15,6 @@
 */

 #include "minddata/dataset/include/transforms.h"
-#include "minddata/dataset/kernels/ir/validators.h"
-
-// Kernel data headers (in alphabetical order)
-#include "minddata/dataset/kernels/data/compose_op.h"
-#include "minddata/dataset/kernels/data/duplicate_op.h"
-#include "minddata/dataset/kernels/data/one_hot_op.h"
-#include "minddata/dataset/kernels/data/random_apply_op.h"
-#include "minddata/dataset/kernels/data/random_choice_op.h"
-#include "minddata/dataset/kernels/data/type_cast_op.h"
-#ifndef ENABLE_ANDROID
-#include "minddata/dataset/kernels/data/unique_op.h"
-#endif

 namespace mindspore {
 namespace dataset {

@@ -88,122 +76,6 @@ std::shared_ptr<UniqueOperation> Unique()
   return op->ValidateParams() ? op : nullptr;
 }
 #endif
-
-/* ####################################### Validator Functions ############################################ */
-
-/* ####################################### Derived TensorOperation classes ################################# */
-
-// (In alphabetical order)
-
-// ComposeOperation
-ComposeOperation::ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms)
-    : transforms_(transforms) {}
-
-Status ComposeOperation::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateVectorTransforms("Compose", transforms_));
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> ComposeOperation::Build() {
-  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
-  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
-                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
-  return std::make_shared<ComposeOp>(tensor_ops);
-}
-
-// DuplicateOperation
-Status DuplicateOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> DuplicateOperation::Build() { return std::make_shared<DuplicateOp>(); }
-
-// OneHotOperation
-OneHotOperation::OneHotOperation(int32_t num_classes) : num_classes_(num_classes) {}
-
-Status OneHotOperation::ValidateParams() {
-  if (num_classes_ <= 0) {
-    std::string err_msg = "OneHot: Number of classes must be greater than 0, but got: " + std::to_string(num_classes_);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> OneHotOperation::Build() { return std::make_shared<OneHotOp>(num_classes_); }
-
-// PreBuiltOperation
-PreBuiltOperation::PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op) : op_(tensor_op) {}
-
-Status PreBuiltOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> PreBuiltOperation::Build() { return op_; }
-
-std::string PreBuiltOperation::Name() const { return op_ ? op_->Name() : kPreBuiltOperation; }
-
-Status PreBuiltOperation::to_json(nlohmann::json *out_json) {
-  RETURN_IF_NOT_OK(op_->to_json(out_json));
-  return Status::OK();
-}
-
-// RandomApplyOperation
-RandomApplyOperation::RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob)
-    : TensorOperation(true), transforms_(transforms), prob_(prob) {}
-
-Status RandomApplyOperation::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomApply", transforms_));
-  RETURN_IF_NOT_OK(ValidateProbability("RandomApply", prob_));
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> RandomApplyOperation::Build() {
-  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
-  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
-                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
-  return std::make_shared<RandomApplyOp>(prob_, tensor_ops);
-}
-
-// RandomChoiceOperation
-RandomChoiceOperation::RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms)
-    : TensorOperation(true), transforms_(transforms) {}
-
-Status RandomChoiceOperation::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomChoice", transforms_));
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> RandomChoiceOperation::Build() {
-  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
-  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
-                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
-  return std::make_shared<RandomChoiceOp>(tensor_ops);
-}
-
-// TypeCastOperation
-TypeCastOperation::TypeCastOperation(std::string data_type) : data_type_(data_type) {}
-
-Status TypeCastOperation::ValidateParams() {
-  std::vector<std::string> predefine_type = {"bool", "int8", "uint8", "int16", "uint16", "int32", "uint32",
-                                             "int64", "uint64", "float16", "float32", "float64", "string"};
-  auto itr = std::find(predefine_type.begin(), predefine_type.end(), data_type_);
-  if (itr == predefine_type.end()) {
-    std::string err_msg = "TypeCast: Invalid data type: " + data_type_;
-    MS_LOG(ERROR) << "TypeCast: Only supports data type bool, int8, uint8, int16, uint16, int32, uint32, "
-                  << "int64, uint64, float16, float32, float64, string, but got: " << data_type_;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> TypeCastOperation::Build() { return std::make_shared<TypeCastOp>(data_type_); }
-
-#ifndef ENABLE_ANDROID
-// UniqueOperation
-Status UniqueOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> UniqueOperation::Build() { return std::make_shared<UniqueOp>(); }
-#endif
-
 } // namespace transforms
 } // namespace dataset
 } // namespace mindspore
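
For illustration only, not part of the diff: the relocated transforms IR keeps the ValidateParams()/Build() split shown above. A minimal sketch, assuming the header path introduced by this commit:

    #include "minddata/dataset/kernels/ir/data/transforms_ir.h"

    std::shared_ptr<mindspore::dataset::TensorOp> MakeOneHot(int32_t num_classes) {
      auto ir = std::make_shared<mindspore::dataset::transforms::OneHotOperation>(num_classes);
      // ValidateParams() rejects num_classes <= 0 with a syntax-error Status.
      if (ir->ValidateParams().IsError()) {
        return nullptr;
      }
      return ir->Build();  // wraps std::make_shared<OneHotOp>(num_classes_)
    }
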
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.

@@ -21,9 +21,9 @@
 #include "minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.h"

 #include "minddata/dataset/engine/ir/datasetops/map_node.h"
-#include "minddata/dataset/include/transforms.h"
 #include "minddata/dataset/kernels/image/random_crop_and_resize_op.h"
 #include "minddata/dataset/kernels/image/random_crop_decode_resize_op.h"
+#include "minddata/dataset/kernels/ir/data/transforms_ir.h"
 #include "minddata/dataset/kernels/ir/vision/vision_ir.h"

 namespace mindspore {

@@ -27,6 +27,9 @@
 #include "minddata/dataset/include/constants.h"
 #include "minddata/dataset/include/transforms.h"

+// FIXME - This internal IR header will be removed when external API classes are provided
+#include "minddata/dataset/text/ir/kernels/text_ir.h"
+
 namespace mindspore {
 namespace dataset {

@@ -36,24 +39,6 @@ class SentencePieceVocab;
 // Transform operations for text
 namespace text {

-// Char arrays storing name of corresponding classes (in alphabetical order)
-constexpr char kBasicTokenizerOperation[] = "BasicTokenizer";
-constexpr char kBertTokenizerOperation[] = "BertTokenizer";
-constexpr char kCaseFoldOperation[] = "CaseFold";
-constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer";
-constexpr char kLookupOperation[] = "Lookup";
-constexpr char kNgramOperation[] = "Ngram";
-constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8";
-constexpr char kRegexReplaceOperation[] = "RegexReplace";
-constexpr char kRegexTokenizerOperation[] = "RegexTokenizer";
-constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer";
-constexpr char kSlidingWindowOperation[] = "SlidingWindow";
-constexpr char kToNumberOperation[] = "ToNumber";
-constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair";
-constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
-constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer";
-constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer";
-
 // Text Op classes (in alphabetical order)
 #ifndef _WIN32
 class BasicTokenizerOperation;

@@ -255,309 +240,6 @@ std::shared_ptr<UnicodeScriptTokenizerOperation> UnicodeScriptTokenizer(bool kee
 /// \return Shared pointer to the current TensorOperation.
 std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets = false);
 #endif
-
-/* ####################################### Derived TensorOperation classes ################################# */
-
-#ifndef _WIN32
-class BasicTokenizerOperation : public TensorOperation {
- public:
-  BasicTokenizerOperation(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
-                          bool preserve_unused_token, bool with_offsets);
-
-  ~BasicTokenizerOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kBasicTokenizerOperation; }
-
- private:
-  bool lower_case_;
-  bool keep_whitespace_;
-  NormalizeForm normalize_form_;
-  bool preserve_unused_token_;
-  bool with_offsets_;
-};
-
-class BertTokenizerOperation : public TensorOperation {
- public:
-  BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
-                         int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case,
-                         bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
-                         bool with_offsets);
-
-  ~BertTokenizerOperation();
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kBertTokenizerOperation; }
-
- private:
-  std::shared_ptr<Vocab> vocab_;
-  std::string suffix_indicator_;
-  int32_t max_bytes_per_token_;
-  std::string unknown_token_;
-  bool lower_case_;
-  bool keep_whitespace_;
-  NormalizeForm normalize_form_;
-  bool preserve_unused_token_;
-  bool with_offsets_;
-};
-
-class CaseFoldOperation : public TensorOperation {
- public:
-  CaseFoldOperation() = default;
-
-  ~CaseFoldOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kCaseFoldOperation; }
-};
-#endif
-
-class JiebaTokenizerOperation : public TensorOperation {
- public:
-  explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode,
-                                   bool with_offsets);
-
-  ~JiebaTokenizerOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kJiebaTokenizerOperation; }
-
-  Status AddWord(const std::string &word, int64_t freq = 0);
-
- private:
-  std::string hmm_path_;
-  std::string mp_path_;
-  JiebaMode mode_;
-  bool with_offsets_;
-  std::vector<std::pair<std::string, int64_t>> words_list_;
-};
-
-class LookupOperation : public TensorOperation {
- public:
-  explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
-                           const std::string &data_type);
-
-  ~LookupOperation();
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kLookupOperation; }
-
- private:
-  std::shared_ptr<Vocab> vocab_;
-  std::optional<std::string> unknown_token_;
-  int32_t default_id_;
-  std::string data_type_;
-};
-
-class NgramOperation : public TensorOperation {
- public:
-  explicit NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
-                          const std::pair<std::string, int32_t> &right_pad, const std::string &separator);
-
-  ~NgramOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kNgramOperation; }
-
- private:
-  std::vector<int32_t> ngrams_;
-  std::pair<std::string, int32_t> left_pad_;
-  std::pair<std::string, int32_t> right_pad_;
-  std::string separator_;
-};
-
-#ifndef _WIN32
-class NormalizeUTF8Operation : public TensorOperation {
- public:
-  explicit NormalizeUTF8Operation(NormalizeForm normalize_form);
-
-  ~NormalizeUTF8Operation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kNormalizeUTF8Operation; }
-
- private:
-  NormalizeForm normalize_form_;
-};
-
-class RegexReplaceOperation : public TensorOperation {
- public:
-  RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all);
-
-  ~RegexReplaceOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kRegexReplaceOperation; }
-
- private:
-  std::string pattern_;
-  std::string replace_;
-  bool replace_all_;
-};
-
-class RegexTokenizerOperation : public TensorOperation {
- public:
-  explicit RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, bool with_offsets);
-
-  ~RegexTokenizerOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kRegexTokenizerOperation; }
-
- private:
-  std::string delim_pattern_;
-  std::string keep_delim_pattern_;
-  bool with_offsets_;
-};
-#endif
-
-class SentencePieceTokenizerOperation : public TensorOperation {
- public:
-  SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type);
-
-  SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type);
-
-  ~SentencePieceTokenizerOperation();
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kSentencepieceTokenizerOperation; }
-
- private:
-  std::shared_ptr<SentencePieceVocab> vocab_;
-  std::string vocab_path_;
-  SPieceTokenizerLoadType load_type_;
-  SPieceTokenizerOutType out_type_;
-};
-
-class SlidingWindowOperation : public TensorOperation {
- public:
-  explicit SlidingWindowOperation(const int32_t width, const int32_t axis);
-
-  ~SlidingWindowOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kSlidingWindowOperation; }
-
- private:
-  int32_t width_;
-  int32_t axis_;
-};
-
-class ToNumberOperation : public TensorOperation {
- public:
-  explicit ToNumberOperation(std::string data_type);
-
-  ~ToNumberOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kToNumberOperation; }
-
- private:
-  std::string data_type_;
-};
-
-class TruncateSequencePairOperation : public TensorOperation {
- public:
-  explicit TruncateSequencePairOperation(int32_t max_length);
-
-  ~TruncateSequencePairOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kTruncateSequencePairOperation; }
-
- private:
-  int32_t max_length_;
-};
-
-class UnicodeCharTokenizerOperation : public TensorOperation {
- public:
-  explicit UnicodeCharTokenizerOperation(bool with_offsets);
-
-  ~UnicodeCharTokenizerOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kUnicodeCharTokenizerOperation; }
-
- private:
-  bool with_offsets_;
-};
-
-#ifndef _WIN32
-class UnicodeScriptTokenizerOperation : public TensorOperation {
- public:
-  explicit UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets);
-
-  ~UnicodeScriptTokenizerOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kUnicodeScriptTokenizerOperation; }
-
- private:
-  bool keep_whitespace_;
-  bool with_offsets_;
-};
-
-class WhitespaceTokenizerOperation : public TensorOperation {
- public:
-  explicit WhitespaceTokenizerOperation(bool with_offsets);
-
-  ~WhitespaceTokenizerOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kWhitespaceTokenizerOperation; }
-
- private:
-  bool with_offsets_;
-};
-#endif
 } // namespace text
 } // namespace dataset
 } // namespace mindspore
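
For illustration only, not part of the diff: JiebaTokenizerOperation (declared above, now relocated to text_ir.h) exposes AddWord so user words can be queued before Build(). A sketch with placeholder dictionary paths; JiebaMode::kMix is an assumed enum value:

    auto jieba = std::make_shared<mindspore::dataset::text::JiebaTokenizerOperation>(
        "/path/to/hmm_model.utf8", "/path/to/jieba.dict.utf8",  // placeholder paths
        mindspore::dataset::JiebaMode::kMix, /*with_offsets=*/false);
    // Words are buffered in words_list_ and replayed onto the runtime op in Build().
    if (!jieba->AddWord("mindspore", 10).IsError() && !jieba->ValidateParams().IsError()) {
      std::shared_ptr<mindspore::dataset::TensorOp> op = jieba->Build();
    }
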
@@ -25,40 +25,12 @@
 #include "include/api/status.h"
 #include "minddata/dataset/include/constants.h"

-// (TEMPORARY) will be removed when Tensor op ir moved down
-#include "minddata/dataset/kernels/ir/tensor_operation.h"
+// FIXME - This internal IR header will be removed when external API classes are provided
+#include "minddata/dataset/kernels/ir/data/transforms_ir.h"

-#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_
-#define INCLUDE_NLOHMANN_JSON_FWD_HPP_
-namespace nlohmann {
-template <typename T = void, typename SFINAE = void>
-struct adl_serializer;
-template <template <typename U, typename V, typename... Args> class ObjectType = std::map,
-          template <typename U, typename... Args> class ArrayType = std::vector, class StringType = std::string,
-          class BooleanType = bool, class NumberIntegerType = std::int64_t, class NumberUnsignedType = std::uint64_t,
-          class NumberFloatType = double, template <typename U> class AllocatorType = std::allocator,
-          template <typename T, typename SFINAE = void> class JSONSerializer = adl_serializer>
-class basic_json;
-template <typename BasicJsonType>
-class json_pointer;
-using json = basic_json<>;
-} // namespace nlohmann
-#endif // INCLUDE_NLOHMANN_JSON_FWD_HPP_
-
 namespace mindspore {
 namespace dataset {

-// Char arrays storing name of corresponding classes (in alphabetical order)
-constexpr char kComposeOperation[] = "Compose";
-constexpr char kDuplicateOperation[] = "Duplicate";
-constexpr char kOneHotOperation[] = "OneHot";
-constexpr char kPreBuiltOperation[] = "PreBuilt";
-constexpr char kRandomApplyOperation[] = "RandomApply";
-constexpr char kRandomChoiceOperation[] = "RandomChoice";
-constexpr char kRandomSelectSubpolicyOperation[] = "RandomSelectSubpolicy";
-constexpr char kTypeCastOperation[] = "TypeCast";
-constexpr char kUniqueOperation[] = "Unique";
-
 // Transform operations for performing data transformation.
 namespace transforms {

@@ -119,134 +91,6 @@ std::shared_ptr<TypeCastOperation> TypeCast(std::string data_type);
 /// \return Shared pointer to the current TensorOperation.
 std::shared_ptr<UniqueOperation> Unique();
 #endif
-
-/* ####################################### Derived TensorOperation classes ################################# */
-
-class ComposeOperation : public TensorOperation {
- public:
-  explicit ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms);
-
-  ~ComposeOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kComposeOperation; }
-
- private:
-  std::vector<std::shared_ptr<TensorOperation>> transforms_;
-};
-
-class DuplicateOperation : public TensorOperation {
- public:
-  DuplicateOperation() = default;
-
-  ~DuplicateOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kDuplicateOperation; }
-};
-
-class OneHotOperation : public TensorOperation {
- public:
-  explicit OneHotOperation(int32_t num_classes_);
-
-  ~OneHotOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kOneHotOperation; }
-
- private:
-  float num_classes_;
-};
-
-class PreBuiltOperation : public TensorOperation {
- public:
-  explicit PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op);
-
-  ~PreBuiltOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override;
-
-  Status to_json(nlohmann::json *out_json) override;
-
- private:
-  std::shared_ptr<TensorOp> op_;
-};
-
-class RandomApplyOperation : public TensorOperation {
- public:
-  explicit RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob);
-
-  ~RandomApplyOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kRandomApplyOperation; }
-
- private:
-  std::vector<std::shared_ptr<TensorOperation>> transforms_;
-  double prob_;
-};
-
-class RandomChoiceOperation : public TensorOperation {
- public:
-  explicit RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms);
-
-  ~RandomChoiceOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kRandomChoiceOperation; }
-
- private:
-  std::vector<std::shared_ptr<TensorOperation>> transforms_;
-};
-class TypeCastOperation : public TensorOperation {
- public:
-  explicit TypeCastOperation(std::string data_type);
-
-  ~TypeCastOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kTypeCastOperation; }
-
- private:
-  std::string data_type_;
-};
-
-#ifndef ENABLE_ANDROID
-class UniqueOperation : public TensorOperation {
- public:
-  UniqueOperation() = default;
-
-  ~UniqueOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kUniqueOperation; }
-};
-#endif
 } // namespace transforms
 } // namespace dataset
 } // namespace mindspore
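
For illustration only, not part of the diff: with the nlohmann forward declarations dropped from transforms.h, serialization still flows through PreBuiltOperation::to_json (its implementation appears in the transforms.cc hunk above, delegating to the wrapped op). A sketch, assuming a real nlohmann include and an existing runtime op:

    #include <nlohmann/json.hpp>

    // some_op: an existing std::shared_ptr<TensorOp> (assumed to be available).
    mindspore::dataset::transforms::PreBuiltOperation prebuilt(some_op);
    nlohmann::json serialized;
    auto rc = prebuilt.to_json(&serialized);  // forwards to op_->to_json(&serialized)
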
@@ -1,3 +1,4 @@
+add_subdirectory(data)
 add_subdirectory(vision)
 file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)

@@ -0,0 +1,8 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)

set(DATASET_KERNELS_IR_DATA_SRC_FILES
    transforms_ir.cc
    )

add_library(kernels-ir-data OBJECT ${DATASET_KERNELS_IR_DATA_SRC_FILES})
@@ -0,0 +1,155 @@
/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <algorithm>

#include "minddata/dataset/kernels/ir/data/transforms_ir.h"

// Kernel data headers (in alphabetical order)
#include "minddata/dataset/kernels/data/compose_op.h"
#include "minddata/dataset/kernels/data/duplicate_op.h"
#include "minddata/dataset/kernels/data/one_hot_op.h"
#include "minddata/dataset/kernels/data/random_apply_op.h"
#include "minddata/dataset/kernels/data/random_choice_op.h"
#include "minddata/dataset/kernels/data/type_cast_op.h"
#ifndef ENABLE_ANDROID
#include "minddata/dataset/kernels/data/unique_op.h"
#endif

#include "minddata/dataset/kernels/ir/validators.h"

namespace mindspore {
namespace dataset {

// Transform operations for data.
namespace transforms {

/* ####################################### Derived TensorOperation classes ################################# */

// (In alphabetical order)

// ComposeOperation
ComposeOperation::ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms)
    : transforms_(transforms) {}

Status ComposeOperation::ValidateParams() {
  RETURN_IF_NOT_OK(ValidateVectorTransforms("Compose", transforms_));
  return Status::OK();
}

std::shared_ptr<TensorOp> ComposeOperation::Build() {
  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
  return std::make_shared<ComposeOp>(tensor_ops);
}

// DuplicateOperation
Status DuplicateOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> DuplicateOperation::Build() { return std::make_shared<DuplicateOp>(); }

// OneHotOperation
OneHotOperation::OneHotOperation(int32_t num_classes) : num_classes_(num_classes) {}

Status OneHotOperation::ValidateParams() {
  if (num_classes_ <= 0) {
    std::string err_msg = "OneHot: Number of classes must be greater than 0, but got: " + std::to_string(num_classes_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> OneHotOperation::Build() { return std::make_shared<OneHotOp>(num_classes_); }

// PreBuiltOperation
PreBuiltOperation::PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op) : op_(tensor_op) {}

Status PreBuiltOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> PreBuiltOperation::Build() { return op_; }

std::string PreBuiltOperation::Name() const { return op_ ? op_->Name() : kPreBuiltOperation; }

Status PreBuiltOperation::to_json(nlohmann::json *out_json) {
  RETURN_IF_NOT_OK(op_->to_json(out_json));
  return Status::OK();
}

// RandomApplyOperation
RandomApplyOperation::RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob)
    : TensorOperation(true), transforms_(transforms), prob_(prob) {}

Status RandomApplyOperation::ValidateParams() {
  RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomApply", transforms_));
  RETURN_IF_NOT_OK(ValidateProbability("RandomApply", prob_));
  return Status::OK();
}

std::shared_ptr<TensorOp> RandomApplyOperation::Build() {
  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
  return std::make_shared<RandomApplyOp>(prob_, tensor_ops);
}

// RandomChoiceOperation
RandomChoiceOperation::RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms)
    : TensorOperation(true), transforms_(transforms) {}

Status RandomChoiceOperation::ValidateParams() {
  RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomChoice", transforms_));
  return Status::OK();
}

std::shared_ptr<TensorOp> RandomChoiceOperation::Build() {
  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
  return std::make_shared<RandomChoiceOp>(tensor_ops);
}

// TypeCastOperation
TypeCastOperation::TypeCastOperation(std::string data_type) : data_type_(data_type) {}

Status TypeCastOperation::ValidateParams() {
  std::vector<std::string> predefine_type = {"bool", "int8", "uint8", "int16", "uint16", "int32", "uint32",
                                             "int64", "uint64", "float16", "float32", "float64", "string"};
  auto itr = std::find(predefine_type.begin(), predefine_type.end(), data_type_);
  if (itr == predefine_type.end()) {
    std::string err_msg = "TypeCast: Invalid data type: " + data_type_;
    MS_LOG(ERROR) << "TypeCast: Only supports data type bool, int8, uint8, int16, uint16, int32, uint32, "
                  << "int64, uint64, float16, float32, float64, string, but got: " << data_type_;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> TypeCastOperation::Build() { return std::make_shared<TypeCastOp>(data_type_); }

#ifndef ENABLE_ANDROID
// UniqueOperation
Status UniqueOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> UniqueOperation::Build() { return std::make_shared<UniqueOp>(); }
#endif

}  // namespace transforms
}  // namespace dataset
}  // namespace mindspore
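The three composite nodes above (Compose, RandomApply, RandomChoice) share one lowering idiom: map each child IR node through Build() with std::transform, then hand the kernel list to the composite op. A self-contained sketch of just that idiom, with simplified stand-in types (not the real minddata API):

#include <algorithm>
#include <iostream>
#include <iterator>
#include <memory>
#include <vector>

struct TensorOp {  // stand-in runtime kernel
  virtual ~TensorOp() = default;
};

struct TensorOperation {  // stand-in IR node
  virtual ~TensorOperation() = default;
  virtual std::shared_ptr<TensorOp> Build() = 0;
};

struct LeafOperation : TensorOperation {
  std::shared_ptr<TensorOp> Build() override { return std::make_shared<TensorOp>(); }
};

// Same mapping idiom as ComposeOperation::Build() above:
// IR nodes in, runtime kernels out.
std::vector<std::shared_ptr<TensorOp>> Lower(const std::vector<std::shared_ptr<TensorOperation>> &transforms) {
  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
  (void)std::transform(transforms.begin(), transforms.end(), std::back_inserter(tensor_ops),
                       [](const std::shared_ptr<TensorOperation> &op) { return op->Build(); });
  return tensor_ops;
}

int main() {
  std::vector<std::shared_ptr<TensorOperation>> transforms = {std::make_shared<LeafOperation>(),
                                                              std::make_shared<LeafOperation>()};
  std::cout << "lowered " << Lower(transforms).size() << " IR nodes to kernels\n";
  return 0;
}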
@@ -0,0 +1,172 @@
/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IR_DATA_TRANSFORMS_IR_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IR_DATA_TRANSFORMS_IR_H_

#include <map>
#include <memory>
#include <string>
#include <vector>

#include "minddata/dataset/kernels/ir/tensor_operation.h"

namespace mindspore {
namespace dataset {

// Char arrays storing name of corresponding classes (in alphabetical order)
constexpr char kComposeOperation[] = "Compose";
constexpr char kDuplicateOperation[] = "Duplicate";
constexpr char kOneHotOperation[] = "OneHot";
constexpr char kPreBuiltOperation[] = "PreBuilt";
constexpr char kRandomApplyOperation[] = "RandomApply";
constexpr char kRandomChoiceOperation[] = "RandomChoice";
constexpr char kTypeCastOperation[] = "TypeCast";
constexpr char kUniqueOperation[] = "Unique";

// Transform operations for performing data transformation.
namespace transforms {
/* ####################################### Derived TensorOperation classes ################################# */

class ComposeOperation : public TensorOperation {
 public:
  explicit ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms);

  ~ComposeOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kComposeOperation; }

 private:
  std::vector<std::shared_ptr<TensorOperation>> transforms_;
};

class DuplicateOperation : public TensorOperation {
 public:
  DuplicateOperation() = default;

  ~DuplicateOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kDuplicateOperation; }
};

class OneHotOperation : public TensorOperation {
 public:
  explicit OneHotOperation(int32_t num_classes);

  ~OneHotOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kOneHotOperation; }

 private:
  int32_t num_classes_;
};

class PreBuiltOperation : public TensorOperation {
 public:
  explicit PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op);

  ~PreBuiltOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override;

  Status to_json(nlohmann::json *out_json) override;

 private:
  std::shared_ptr<TensorOp> op_;
};

class RandomApplyOperation : public TensorOperation {
 public:
  explicit RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob);

  ~RandomApplyOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kRandomApplyOperation; }

 private:
  std::vector<std::shared_ptr<TensorOperation>> transforms_;
  double prob_;
};

class RandomChoiceOperation : public TensorOperation {
 public:
  explicit RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms);

  ~RandomChoiceOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kRandomChoiceOperation; }

 private:
  std::vector<std::shared_ptr<TensorOperation>> transforms_;
};
class TypeCastOperation : public TensorOperation {
 public:
  explicit TypeCastOperation(std::string data_type);

  ~TypeCastOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kTypeCastOperation; }

 private:
  std::string data_type_;
};

#ifndef ENABLE_ANDROID
class UniqueOperation : public TensorOperation {
 public:
  UniqueOperation() = default;

  ~UniqueOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kUniqueOperation; }
};
#endif
}  // namespace transforms
}  // namespace dataset
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IR_DATA_TRANSFORMS_IR_H_
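TypeCastOperation validates its data_type string against a fixed whitelist before any kernel is built (see ValidateParams() in the .cc above). A self-contained sketch of the same check — the whitelist is copied verbatim from the diff, everything else is simplified:

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Mirrors the predefine_type lookup in TypeCastOperation::ValidateParams().
bool IsValidCastType(const std::string &data_type) {
  static const std::vector<std::string> predefine_type = {
      "bool",  "int8",   "uint8",   "int16",   "uint16",  "int32", "uint32",
      "int64", "uint64", "float16", "float32", "float64", "string"};
  return std::find(predefine_type.begin(), predefine_type.end(), data_type) != predefine_type.end();
}

int main() {
  std::cout << std::boolalpha << IsValidCastType("float32") << '\n';    // true
  std::cout << std::boolalpha << IsValidCastType("complex64") << '\n';  // false
  return 0;
}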
@@ -1,3 +1,4 @@
add_subdirectory(ir)
add_subdirectory(kernels)

file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
@@ -0,0 +1,6 @@
add_subdirectory(kernels)

file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)

add_library(text-ir OBJECT validators.cc)
@@ -0,0 +1,8 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)

set(DATASET_TEXT_IR_KERNELS_SRC_FILES
    text_ir.cc
    )

add_library(text-ir-kernels OBJECT ${DATASET_TEXT_IR_KERNELS_SRC_FILES})
@@ -0,0 +1,436 @@
/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <unistd.h>

#include "minddata/dataset/text/ir/kernels/text_ir.h"

#ifndef _WIN32
#include "minddata/dataset/text/kernels/basic_tokenizer_op.h"
#include "minddata/dataset/text/kernels/bert_tokenizer_op.h"
#include "minddata/dataset/text/kernels/case_fold_op.h"
#endif
#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
#include "minddata/dataset/text/kernels/lookup_op.h"
#include "minddata/dataset/text/kernels/ngram_op.h"
#ifndef _WIN32
#include "minddata/dataset/text/kernels/normalize_utf8_op.h"
#include "minddata/dataset/text/kernels/regex_replace_op.h"
#include "minddata/dataset/text/kernels/regex_tokenizer_op.h"
#endif
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
#include "minddata/dataset/text/kernels/sliding_window_op.h"
#include "minddata/dataset/text/kernels/to_number_op.h"
#include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h"
#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
#ifndef _WIN32
#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
#endif
#include "minddata/dataset/core/data_type.h"
#include "minddata/dataset/util/path.h"

#include "minddata/dataset/text/ir/validators.h"

namespace mindspore {
namespace dataset {

// Transform operations for text.
namespace text {

/* ####################################### Derived TensorOperation classes ################################# */

// (In alphabetical order)

#ifndef _WIN32
// BasicTokenizerOperation
BasicTokenizerOperation::BasicTokenizerOperation(bool lower_case, bool keep_whitespace,
                                                 const NormalizeForm normalize_form, bool preserve_unused_token,
                                                 bool with_offsets)
    : lower_case_(lower_case),
      keep_whitespace_(keep_whitespace),
      normalize_form_(normalize_form),
      preserve_unused_token_(preserve_unused_token),
      with_offsets_(with_offsets) {}

Status BasicTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> BasicTokenizerOperation::Build() {
  std::shared_ptr<BasicTokenizerOp> tensor_op = std::make_shared<BasicTokenizerOp>(
    lower_case_, keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_);
  return tensor_op;
}

// BertTokenizerOperation
BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
                                               int32_t max_bytes_per_token, const std::string &unknown_token,
                                               bool lower_case, bool keep_whitespace,
                                               const NormalizeForm normalize_form, bool preserve_unused_token,
                                               bool with_offsets)
    : vocab_(vocab),
      suffix_indicator_(suffix_indicator),
      max_bytes_per_token_(max_bytes_per_token),
      unknown_token_(unknown_token),
      lower_case_(lower_case),
      keep_whitespace_(keep_whitespace),
      normalize_form_(normalize_form),
      preserve_unused_token_(preserve_unused_token),
      with_offsets_(with_offsets) {}

BertTokenizerOperation::~BertTokenizerOperation() = default;

Status BertTokenizerOperation::ValidateParams() {
  if (vocab_ == nullptr) {
    std::string err_msg = "BertTokenizer: vocab object type is incorrect or null.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  if (max_bytes_per_token_ < 0) {
    std::string err_msg = "BertTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " +
                          std::to_string(max_bytes_per_token_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> BertTokenizerOperation::Build() {
  std::shared_ptr<BertTokenizerOp> tensor_op =
    std::make_shared<BertTokenizerOp>(vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, lower_case_,
                                      keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_);
  return tensor_op;
}

// CaseFoldOperation
Status CaseFoldOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> CaseFoldOperation::Build() {
  std::shared_ptr<CaseFoldOp> tensor_op = std::make_shared<CaseFoldOp>();
  return tensor_op;
}
#endif

// JiebaTokenizerOperation
JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path,
                                                 const JiebaMode &mode, bool with_offsets)
    : hmm_path_(hmm_path), mp_path_(mp_path), mode_(mode), with_offsets_(with_offsets) {}

Status JiebaTokenizerOperation::ValidateParams() {
  if (hmm_path_.empty()) {
    std::string err_msg = "JiebaTokenizer: The dict of HMMSegment in cppjieba is not provided.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  if (mp_path_.empty()) {
    std::string err_msg = "JiebaTokenizer: The dict of MPSegment in cppjieba is not provided.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_));
  RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", mp_path_));
  return Status::OK();
}

std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() {
  std::shared_ptr<JiebaTokenizerOp> tensor_op =
    std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_);
  for (auto &word : words_list_) {
    Status rc = tensor_op->AddWord(word.first, word.second);
    if (rc.IsError()) {
      MS_LOG(ERROR) << rc;
      return {};
    }
  }
  return tensor_op;
}

Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) {
  if (word.empty()) {
    std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  if (freq < 0) {
    std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  words_list_.emplace_back(word, freq);
  return Status::OK();
}

// LookupOperation
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
                                 const std::string &data_type)
    : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}

LookupOperation::~LookupOperation() = default;

Status LookupOperation::ValidateParams() {
  if (vocab_ == nullptr) {
    std::string err_msg = "Lookup: vocab object type is incorrect or null.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  if (unknown_token_ != std::nullopt) {
    default_id_ = vocab_->Lookup(*unknown_token_);
    if (default_id_ == Vocab::kNoTokenExists) {
      std::string err_msg = "Lookup: \"" + *unknown_token_ + "\" doesn't exist in vocab.";
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
  }

  if (!IsTypeNumeric(data_type_)) {
    std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> LookupOperation::Build() {
  std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, DataType(data_type_));
  return tensor_op;
}

// NgramOperation
NgramOperation::NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
                               const std::pair<std::string, int32_t> &right_pad, const std::string &separator)
    : ngrams_(ngrams), left_pad_(left_pad), right_pad_(right_pad), separator_(separator) {}

Status NgramOperation::ValidateParams() {
  if (ngrams_.size() == 0) {
    std::string err_msg = "Ngram : Container cannot be empty.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  } else {
    for (int32_t i = 0; i < ngrams_.size(); ++i) {
      if (ngrams_[i] <= 0) {
        std::string err_msg =
          "Ngram : The value of ngrams vector must be greater than 0: " + std::to_string(ngrams_[i]);
        MS_LOG(ERROR) << err_msg;
        RETURN_STATUS_SYNTAX_ERROR(err_msg);
      }
    }
  }

  if (left_pad_.second < 0) {
    std::string err_msg =
      "Ngram : The second parameter pad_width in left_pad vector must be greater than or equal to 0: " +
      std::to_string(left_pad_.second);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  if (right_pad_.second < 0) {
    std::string err_msg =
      "Ngram : The second parameter pad_width in right_pad vector must be greater than or equal to 0: " +
      std::to_string(right_pad_.second);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  return Status::OK();
}

std::shared_ptr<TensorOp> NgramOperation::Build() {
  int32_t l_len = left_pad_.second;
  int32_t r_len = right_pad_.second;
  std::string l_pad = left_pad_.first;
  std::string r_pad = right_pad_.first;
  std::shared_ptr<NgramOp> tensor_op = std::make_shared<NgramOp>(ngrams_, l_len, r_len, l_pad, r_pad, separator_);
  return tensor_op;
}

#ifndef _WIN32
// NormalizeUTF8Operation
NormalizeUTF8Operation::NormalizeUTF8Operation(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}

Status NormalizeUTF8Operation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> NormalizeUTF8Operation::Build() {
  std::shared_ptr<NormalizeUTF8Op> tensor_op = std::make_shared<NormalizeUTF8Op>(normalize_form_);
  return tensor_op;
}

// RegexReplaceOperation
RegexReplaceOperation::RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all)
    : pattern_(pattern), replace_(replace), replace_all_(replace_all) {}

Status RegexReplaceOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> RegexReplaceOperation::Build() {
  std::shared_ptr<RegexReplaceOp> tensor_op = std::make_shared<RegexReplaceOp>(pattern_, replace_, replace_all_);
  return tensor_op;
}

// RegexTokenizerOperation
RegexTokenizerOperation::RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern,
                                                 bool with_offsets)
    : delim_pattern_(delim_pattern), keep_delim_pattern_(keep_delim_pattern), with_offsets_(with_offsets) {}

Status RegexTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> RegexTokenizerOperation::Build() {
  std::shared_ptr<RegexTokenizerOp> tensor_op =
    std::make_shared<RegexTokenizerOp>(delim_pattern_, keep_delim_pattern_, with_offsets_);
  return tensor_op;
}
#endif

// SentencePieceTokenizerOperation
SentencePieceTokenizerOperation::~SentencePieceTokenizerOperation() = default;

SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
                                                                 SPieceTokenizerOutType out_type)
    : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {}

SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::string &vocab_path,
                                                                 SPieceTokenizerOutType out_type)
    : vocab_(nullptr), vocab_path_(vocab_path), load_type_(SPieceTokenizerLoadType::kFile), out_type_(out_type) {}

Status SentencePieceTokenizerOperation::ValidateParams() {
  if (load_type_ == SPieceTokenizerLoadType::kModel) {
    if (vocab_ == nullptr) {
      std::string err_msg = "SentencePieceTokenizer: vocab object type is incorrect or null.";
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
  } else {
    Path vocab_file(vocab_path_);
    if (!vocab_file.Exists() || vocab_file.IsDirectory()) {
      std::string err_msg = "SentencePieceTokenizer : vocab file: [" + vocab_path_ + "] is invalid or does not exist.";
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
    if (access(vocab_file.toString().c_str(), R_OK) == -1) {
      std::string err_msg = "SentencePieceTokenizer : no access to specified dataset file: " + vocab_path_;
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
  }
  return Status::OK();
}

std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() {
  std::shared_ptr<SentencePieceTokenizerOp> tensor_op;
  if (load_type_ == SPieceTokenizerLoadType::kModel) {
    tensor_op = std::make_shared<SentencePieceTokenizerOp>(vocab_, load_type_, out_type_);
  } else {
    Path vocab_file(vocab_path_);
    std::string model_path = vocab_file.ParentPath();
    std::string model_filename = vocab_file.Basename();
    tensor_op = std::make_shared<SentencePieceTokenizerOp>(model_path, model_filename, load_type_, out_type_);
  }
  return tensor_op;
}

// SlidingWindowOperation
SlidingWindowOperation::SlidingWindowOperation(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {}

Status SlidingWindowOperation::ValidateParams() {
  if (width_ < 1) {
    std::string err_msg =
      "SlidingWindow : The parameter width must be greater than or equal to 1: " + std::to_string(width_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  return Status::OK();
}

std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
  std::shared_ptr<SlidingWindowOp> tensor_op = std::make_shared<SlidingWindowOp>(static_cast<uint32_t>(width_), axis_);
  return tensor_op;
}

// ToNumberOperation
ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {}

Status ToNumberOperation::ValidateParams() {
  if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) {
    std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_;
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> ToNumberOperation::Build() {
  std::shared_ptr<ToNumberOp> tensor_op = std::make_shared<ToNumberOp>(data_type_);
  return tensor_op;
}

TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {}

Status TruncateSequencePairOperation::ValidateParams() {
  if (max_length_ < 0) {
    std::string err_msg = "TruncateSequencePair : The parameter max_length must be greater than or equal to 0: " +
                          std::to_string(max_length_);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

std::shared_ptr<TensorOp> TruncateSequencePairOperation::Build() {
  std::shared_ptr<TruncateSequencePairOp> tensor_op = std::make_shared<TruncateSequencePairOp>(max_length_);
  return tensor_op;
}

// UnicodeCharTokenizerOperation
UnicodeCharTokenizerOperation::UnicodeCharTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}

Status UnicodeCharTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() {
  std::shared_ptr<UnicodeCharTokenizerOp> tensor_op = std::make_shared<UnicodeCharTokenizerOp>(with_offsets_);
  return tensor_op;
}

#ifndef _WIN32
// UnicodeScriptTokenizerOperation
UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets)
    : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}

Status UnicodeScriptTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> UnicodeScriptTokenizerOperation::Build() {
  std::shared_ptr<UnicodeScriptTokenizerOp> tensor_op =
    std::make_shared<UnicodeScriptTokenizerOp>(keep_whitespace_, with_offsets_);
  return tensor_op;
}

// WhitespaceTokenizerOperation
WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}

Status WhitespaceTokenizerOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> WhitespaceTokenizerOperation::Build() {
  std::shared_ptr<WhitespaceTokenizerOp> tensor_op = std::make_shared<WhitespaceTokenizerOp>(with_offsets_);
  return tensor_op;
}
#endif

}  // namespace text
}  // namespace dataset
}  // namespace mindspore
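One detail worth calling out in the file above: LookupOperation resolves the optional unknown_token during validation, so by the time Build() runs, default_id_ already holds either the sentinel or a verified vocab id. A self-contained sketch of that resolution (std::map stands in for the real Vocab; the sentinel value -1 is an assumption mirroring the Vocab::kNoTokenExists error path):

#include <cstdint>
#include <iostream>
#include <map>
#include <optional>
#include <string>

constexpr int32_t kNoTokenExists = -1;  // assumed sentinel, standing in for Vocab::kNoTokenExists

int32_t Lookup(const std::map<std::string, int32_t> &vocab, const std::string &word) {
  auto it = vocab.find(word);
  return it == vocab.end() ? kNoTokenExists : it->second;
}

// Mirrors LookupOperation::ValidateParams(): no unknown_token means OOV words
// keep the sentinel; a provided unknown_token must itself exist in the vocab.
bool ResolveDefaultId(const std::map<std::string, int32_t> &vocab,
                      const std::optional<std::string> &unknown_token, int32_t *default_id) {
  *default_id = kNoTokenExists;
  if (unknown_token != std::nullopt) {
    *default_id = Lookup(vocab, *unknown_token);
    if (*default_id == kNoTokenExists) {
      std::cerr << "Lookup: \"" << *unknown_token << "\" doesn't exist in vocab.\n";
      return false;
    }
  }
  return true;
}

int main() {
  std::map<std::string, int32_t> vocab = {{"<unk>", 0}, {"hello", 1}};
  int32_t default_id = kNoTokenExists;
  if (ResolveDefaultId(vocab, std::optional<std::string>("<unk>"), &default_id)) {
    std::cout << "default_id = " << default_id << '\n';  // 0
  }
  return 0;
}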
@@ -0,0 +1,360 @@
/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_KERNELS_TEXT_IR_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_KERNELS_TEXT_IR_H_

#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#include "minddata/dataset/kernels/ir/tensor_operation.h"

namespace mindspore {
namespace dataset {

class Vocab;
class SentencePieceVocab;

// Transform operations for text
namespace text {

// Char arrays storing name of corresponding classes (in alphabetical order)
constexpr char kBasicTokenizerOperation[] = "BasicTokenizer";
constexpr char kBertTokenizerOperation[] = "BertTokenizer";
constexpr char kCaseFoldOperation[] = "CaseFold";
constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer";
constexpr char kLookupOperation[] = "Lookup";
constexpr char kNgramOperation[] = "Ngram";
constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8";
constexpr char kRegexReplaceOperation[] = "RegexReplace";
constexpr char kRegexTokenizerOperation[] = "RegexTokenizer";
constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer";
constexpr char kSlidingWindowOperation[] = "SlidingWindow";
constexpr char kToNumberOperation[] = "ToNumber";
constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair";
constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer";
constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer";

/* ####################################### Derived TensorOperation classes ################################# */

#ifndef _WIN32
class BasicTokenizerOperation : public TensorOperation {
 public:
  BasicTokenizerOperation(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
                          bool preserve_unused_token, bool with_offsets);

  ~BasicTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kBasicTokenizerOperation; }

 private:
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};

class BertTokenizerOperation : public TensorOperation {
 public:
  BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
                         int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case,
                         bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
                         bool with_offsets);

  ~BertTokenizerOperation();

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kBertTokenizerOperation; }

 private:
  std::shared_ptr<Vocab> vocab_;
  std::string suffix_indicator_;
  int32_t max_bytes_per_token_;
  std::string unknown_token_;
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};

class CaseFoldOperation : public TensorOperation {
 public:
  CaseFoldOperation() = default;

  ~CaseFoldOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kCaseFoldOperation; }
};
#endif

class JiebaTokenizerOperation : public TensorOperation {
 public:
  explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode,
                                   bool with_offsets);

  ~JiebaTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kJiebaTokenizerOperation; }

  Status AddWord(const std::string &word, int64_t freq = 0);

 private:
  std::string hmm_path_;
  std::string mp_path_;
  JiebaMode mode_;
  bool with_offsets_;
  std::vector<std::pair<std::string, int64_t>> words_list_;
};

class LookupOperation : public TensorOperation {
 public:
  explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
                           const std::string &data_type);

  ~LookupOperation();

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kLookupOperation; }

 private:
  std::shared_ptr<Vocab> vocab_;
  std::optional<std::string> unknown_token_;
  int32_t default_id_;
  std::string data_type_;
};

class NgramOperation : public TensorOperation {
 public:
  explicit NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
                          const std::pair<std::string, int32_t> &right_pad, const std::string &separator);

  ~NgramOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kNgramOperation; }

 private:
  std::vector<int32_t> ngrams_;
  std::pair<std::string, int32_t> left_pad_;
  std::pair<std::string, int32_t> right_pad_;
  std::string separator_;
};

#ifndef _WIN32
class NormalizeUTF8Operation : public TensorOperation {
 public:
  explicit NormalizeUTF8Operation(NormalizeForm normalize_form);

  ~NormalizeUTF8Operation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kNormalizeUTF8Operation; }

 private:
  NormalizeForm normalize_form_;
};

class RegexReplaceOperation : public TensorOperation {
 public:
  RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all);

  ~RegexReplaceOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kRegexReplaceOperation; }

 private:
  std::string pattern_;
  std::string replace_;
  bool replace_all_;
};

class RegexTokenizerOperation : public TensorOperation {
 public:
  explicit RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, bool with_offsets);

  ~RegexTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kRegexTokenizerOperation; }

 private:
  std::string delim_pattern_;
  std::string keep_delim_pattern_;
  bool with_offsets_;
};
#endif

class SentencePieceTokenizerOperation : public TensorOperation {
 public:
  SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type);

  SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type);

  ~SentencePieceTokenizerOperation();

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kSentencepieceTokenizerOperation; }

 private:
  std::shared_ptr<SentencePieceVocab> vocab_;
  std::string vocab_path_;
  SPieceTokenizerLoadType load_type_;
  SPieceTokenizerOutType out_type_;
};

class SlidingWindowOperation : public TensorOperation {
 public:
  explicit SlidingWindowOperation(const int32_t width, const int32_t axis);

  ~SlidingWindowOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kSlidingWindowOperation; }

 private:
  int32_t width_;
  int32_t axis_;
};

class ToNumberOperation : public TensorOperation {
 public:
  explicit ToNumberOperation(std::string data_type);

  ~ToNumberOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kToNumberOperation; }

 private:
  std::string data_type_;
};

class TruncateSequencePairOperation : public TensorOperation {
 public:
  explicit TruncateSequencePairOperation(int32_t max_length);

  ~TruncateSequencePairOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kTruncateSequencePairOperation; }

 private:
  int32_t max_length_;
};

class UnicodeCharTokenizerOperation : public TensorOperation {
 public:
  explicit UnicodeCharTokenizerOperation(bool with_offsets);

  ~UnicodeCharTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kUnicodeCharTokenizerOperation; }

 private:
  bool with_offsets_;
};

#ifndef _WIN32
class UnicodeScriptTokenizerOperation : public TensorOperation {
 public:
  explicit UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets);

  ~UnicodeScriptTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kUnicodeScriptTokenizerOperation; }

 private:
  bool keep_whitespace_;
  bool with_offsets_;
};

class WhitespaceTokenizerOperation : public TensorOperation {
 public:
  explicit WhitespaceTokenizerOperation(bool with_offsets);

  ~WhitespaceTokenizerOperation() = default;

  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kWhitespaceTokenizerOperation; }

 private:
  bool with_offsets_;
};
#endif
}  // namespace text
}  // namespace dataset
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_KERNELS_TEXT_IR_H_
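SentencePieceTokenizerOperation (declared above) keeps two constructors that record their origin in load_type_, letting Build() choose between the in-memory vocab path and the file path. A simplified, self-contained sketch of that dual-constructor dispatch; all type names here are stand-ins, not the real SentencePiece API:

#include <iostream>
#include <memory>
#include <string>
#include <utility>

enum class LoadType { kModel, kFile };  // stand-in for SPieceTokenizerLoadType

struct Vocab {};  // stand-in for SentencePieceVocab

class SpTokenizerOperation {
 public:
  explicit SpTokenizerOperation(std::shared_ptr<Vocab> vocab)
      : vocab_(std::move(vocab)), load_type_(LoadType::kModel) {}
  explicit SpTokenizerOperation(std::string vocab_path)
      : vocab_path_(std::move(vocab_path)), load_type_(LoadType::kFile) {}

  // Build() branches on which constructor initialized the node.
  void Build() const {
    if (load_type_ == LoadType::kModel) {
      std::cout << "building from in-memory vocab\n";
    } else {
      std::cout << "building from file: " << vocab_path_ << '\n';
    }
  }

 private:
  std::shared_ptr<Vocab> vocab_;
  std::string vocab_path_;
  LoadType load_type_;
};

int main() {
  SpTokenizerOperation(std::make_shared<Vocab>()).Build();
  SpTokenizerOperation(std::string("m.model")).Build();
  return 0;
}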
@@ -0,0 +1,60 @@
/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "minddata/dataset/text/ir/validators.h"

namespace mindspore {
namespace dataset {
/* ####################################### Validator Functions ############################################ */

// Helper function to validate tokenizer directory parameter
Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file) {
  if (tokenizer_file.empty()) {
    std::string err_msg = tokenizer_name + ": tokenizer_file is not specified.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  Path file(tokenizer_file);
  if (!file.Exists()) {
    std::string err_msg = tokenizer_name + ": tokenizer_file: [" + tokenizer_file + "] is an invalid directory path.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  if (access(tokenizer_file.c_str(), R_OK) == -1) {
    std::string err_msg = tokenizer_name + ": No access to specified tokenizer path: " + tokenizer_file;
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  return Status::OK();
}

// Helper functions to help validate data type passed by user
bool IsTypeNumeric(const std::string &data_type) {
  if (data_type == "int8" || data_type == "uint8" || data_type == "int16" || data_type == "uint16" ||
      data_type == "int32" || data_type == "uint32" || data_type == "int64" || data_type == "uint64" ||
      data_type == "float16" || data_type == "float32" || data_type == "float64")
    return true;
  return false;
}

bool IsTypeBoolean(const std::string &data_type) { return data_type == "bool"; }

bool IsTypeString(const std::string &data_type) { return data_type == "string"; }
}  // namespace dataset
}  // namespace mindspore
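These helpers give the text IR nodes one shared place to classify user-supplied type strings: ToNumber accepts numeric-but-not-bool, Lookup rejects string outputs. A self-contained sketch using the same predicates:

#include <iostream>
#include <string>

// Same classification as text/ir/validators.cc above.
bool IsTypeNumeric(const std::string &t) {
  return t == "int8" || t == "uint8" || t == "int16" || t == "uint16" || t == "int32" || t == "uint32" ||
         t == "int64" || t == "uint64" || t == "float16" || t == "float32" || t == "float64";
}
bool IsTypeBoolean(const std::string &t) { return t == "bool"; }

// Mirrors ToNumberOperation::ValidateParams(): numeric, but never bool.
bool IsValidToNumberType(const std::string &t) { return IsTypeNumeric(t) && !IsTypeBoolean(t); }

int main() {
  std::cout << std::boolalpha << IsValidToNumberType("float32") << '\n';  // true
  std::cout << std::boolalpha << IsValidToNumberType("bool") << '\n';    // false
  return 0;
}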
@@ -0,0 +1,41 @@
/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_VALIDATORS_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_VALIDATORS_H_

#include <string>

#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/util/status.h"

namespace mindspore {
namespace dataset {

// Helper function to validate tokenizer directory parameter
Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file);

// Helper function to validate data type passed by user
bool IsTypeNumeric(const std::string &data_type);

// Helper function to validate data type is boolean
bool IsTypeBoolean(const std::string &data_type);

// Helper function to validate data type is string
bool IsTypeString(const std::string &data_type);
}  // namespace dataset
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_VALIDATORS_H_
@@ -202,6 +202,7 @@ if(BUILD_MINDDATA STREQUAL "full")
        ${MINDDATA_DIR}/kernels/data/type_cast_op.cc
        ${MINDDATA_DIR}/kernels/image/exif_utils.cc
        ${MINDDATA_DIR}/kernels/ir/validators.cc
        ${MINDDATA_DIR}/kernels/ir/data/transforms_ir.cc
        ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc
        ${MINDDATA_DIR}/callback/callback_manager.cc
        ${MINDDATA_DIR}/util/task_manager.cc
@@ -281,6 +282,7 @@ elseif(BUILD_MINDDATA STREQUAL "wrapper")
        ${MINDDATA_DIR}/kernels/data/data_utils.cc
        ${MINDDATA_DIR}/kernels/image/exif_utils.cc
        ${MINDDATA_DIR}/kernels/ir/validators.cc
        ${MINDDATA_DIR}/kernels/ir/data/transforms_ir.cc
        ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/wrapper/MDToDApi.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/wrapper/album_op_android.cc
@@ -393,6 +395,7 @@ elseif(BUILD_MINDDATA STREQUAL "lite")
        ${CMAKE_CURRENT_SOURCE_DIR}/../src/common/log_adapter.cc
        ${CORE_DIR}/utils/ms_utils.cc
        ${MINDDATA_DIR}/kernels/ir/validators.cc
        ${MINDDATA_DIR}/kernels/ir/data/transforms_ir.cc
        ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc
    )