TextOp decoupling

Signed-off-by: alex-yuyue <yue.yu1@huawei.com>
alex-yuyue 2021-01-19 09:09:51 -05:00
parent 4e3abb2434
commit a8675f8227
12 changed files with 670 additions and 306 deletions
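In short, this change decouples the Python text ops from the execution-layer TensorOp bindings: each op now derives from a lightweight TextTensorOperation wrapper and only materializes its C++ IR node (a TensorOperation such as cde.LookupOperation) when parse() is called. Below is a minimal sketch of the new pattern, assuming a MindSpore build where the C++ dataset engine imports as mindspore._c_dataengine; it mirrors the SlidingWindow change in this diff and is illustrative rather than the exact shipped code.

```python
import mindspore._c_dataengine as cde  # pybind module that exposes the *Operation IR nodes


class TextTensorOperation:
    """Python-side wrapper; the C++ IR node is only built when parse() is called."""

    def parse(self):
        raise NotImplementedError("TextTensorOperation has to implement parse method.")


class SlidingWindow(TextTensorOperation):
    """Previously this subclassed cde.SlidingWindowOp (a TensorOp) directly."""

    def __init__(self, width, axis=0):
        # Only store the parameters; nothing is constructed on the C++ side yet.
        self.width = width
        self.axis = axis

    def parse(self):
        # Building the IR node runs ValidateParams() in C++ (see the new bindings below).
        return cde.SlidingWindowOperation(self.width, self.axis)
```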

View File

@ -18,7 +18,7 @@ if(ENABLE_PYTHON)
python/bindings/dataset/kernels/ir/bindings.cc
python/bindings/dataset/kernels/ir/image/bindings.cc
python/bindings/dataset/text/bindings.cc
python/bindings/dataset/text/kernels/bindings.cc
python/bindings/dataset/text/kernels/ir/bindings.cc
python/bindings/mindrecord/include/bindings.cc
python/pybind_conversion.cc
python/pybind_register.cc

View File

@ -1,205 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include "pybind11/stl_bind.h"
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
#include "minddata/dataset/text/kernels/lookup_op.h"
#include "minddata/dataset/text/kernels/ngram_op.h"
#include "minddata/dataset/text/kernels/sliding_window_op.h"
#include "minddata/dataset/text/kernels/to_number_op.h"
#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
#include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
#include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h"
#ifdef ENABLE_ICU4C
#include "minddata/dataset/text/kernels/basic_tokenizer_op.h"
#include "minddata/dataset/text/kernels/bert_tokenizer_op.h"
#include "minddata/dataset/text/kernels/case_fold_op.h"
#include "minddata/dataset/text/kernels/normalize_utf8_op.h"
#include "minddata/dataset/text/kernels/regex_replace_op.h"
#include "minddata/dataset/text/kernels/regex_tokenizer_op.h"
#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
#endif
namespace mindspore {
namespace dataset {
#ifdef ENABLE_ICU4C
PYBIND_REGISTER(BasicTokenizerOp, 1, ([](const py::module *m) {
(void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(*m,
"BasicTokenizerOp")
.def(py::init<const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>());
}));
PYBIND_REGISTER(WhitespaceTokenizerOp, 1, ([](const py::module *m) {
(void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
*m, "WhitespaceTokenizerOp")
.def(py::init<const bool &>());
}));
PYBIND_REGISTER(UnicodeScriptTokenizerOp, 1, ([](const py::module *m) {
(void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
*m, "UnicodeScriptTokenizerOp")
.def(py::init<>())
.def(py::init<const bool &, const bool &>());
}));
PYBIND_REGISTER(
CaseFoldOp, 1, ([](const py::module *m) {
(void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(*m, "CaseFoldOp").def(py::init<>());
}));
PYBIND_REGISTER(NormalizeUTF8Op, 1, ([](const py::module *m) {
(void)py::class_<NormalizeUTF8Op, TensorOp, std::shared_ptr<NormalizeUTF8Op>>(*m, "NormalizeUTF8Op")
.def(py::init<>())
.def(py::init<NormalizeForm>());
}));
PYBIND_REGISTER(RegexReplaceOp, 1, ([](const py::module *m) {
(void)py::class_<RegexReplaceOp, TensorOp, std::shared_ptr<RegexReplaceOp>>(*m, "RegexReplaceOp")
.def(py::init<const std::string &, const std::string &, bool>());
}));
PYBIND_REGISTER(RegexTokenizerOp, 1, ([](const py::module *m) {
(void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(*m,
"RegexTokenizerOp")
.def(py::init<const std::string &, const std::string &, const bool &>());
}));
PYBIND_REGISTER(BertTokenizerOp, 1, ([](const py::module *m) {
(void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(*m, "BertTokenizerOp")
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &,
const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>());
}));
PYBIND_REGISTER(NormalizeForm, 0, ([](const py::module *m) {
(void)py::enum_<NormalizeForm>(*m, "NormalizeForm", py::arithmetic())
.value("DE_NORMALIZE_NONE", NormalizeForm::kNone)
.value("DE_NORMALIZE_NFC", NormalizeForm::kNfc)
.value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc)
.value("DE_NORMALIZE_NFD", NormalizeForm::kNfd)
.value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd)
.export_values();
}));
#endif
PYBIND_REGISTER(JiebaTokenizerOp, 1, ([](const py::module *m) {
(void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m,
"JiebaTokenizerOp")
.def(py::init<const std::string &, const std::string &, const JiebaMode &, const bool &>())
.def("add_word", [](JiebaTokenizerOp &self, const std::string word, int freq) {
THROW_IF_ERROR(self.AddWord(word, freq));
});
}));
PYBIND_REGISTER(UnicodeCharTokenizerOp, 1, ([](const py::module *m) {
(void)py::class_<UnicodeCharTokenizerOp, TensorOp, std::shared_ptr<UnicodeCharTokenizerOp>>(
*m, "UnicodeCharTokenizerOp")
.def(py::init<const bool &>());
}));
PYBIND_REGISTER(LookupOp, 1, ([](const py::module *m) {
(void)py::class_<LookupOp, TensorOp, std::shared_ptr<LookupOp>>(*m, "LookupOp")
.def(py::init([](std::shared_ptr<Vocab> vocab, const py::object &py_word,
const DataType &data_type) {
if (vocab == nullptr) {
THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, "vocab object type is incorrect or null."));
}
if (py_word.is_none()) {
return std::make_shared<LookupOp>(vocab, Vocab::kNoTokenExists, data_type);
}
std::string word = py::reinterpret_borrow<py::str>(py_word);
WordIdType default_id = vocab->Lookup(word);
if (default_id == Vocab::kNoTokenExists) {
THROW_IF_ERROR(Status(StatusCode::kUnexpectedError,
"default unknown token: " + word + " doesn't exist in vocab."));
}
return std::make_shared<LookupOp>(vocab, default_id, data_type);
}));
}));
PYBIND_REGISTER(NgramOp, 1, ([](const py::module *m) {
(void)py::class_<NgramOp, TensorOp, std::shared_ptr<NgramOp>>(*m, "NgramOp")
.def(py::init<const std::vector<int32_t> &, int32_t, int32_t, const std::string &,
const std::string &, const std::string &>());
}));
PYBIND_REGISTER(WordpieceTokenizerOp, 1, ([](const py::module *m) {
(void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
*m, "WordpieceTokenizerOp")
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &,
const bool &>());
}));
PYBIND_REGISTER(SlidingWindowOp, 1, ([](const py::module *m) {
(void)py::class_<SlidingWindowOp, TensorOp, std::shared_ptr<SlidingWindowOp>>(*m, "SlidingWindowOp")
.def(py::init<uint32_t, int32_t>());
}));
PYBIND_REGISTER(
SentencePieceTokenizerOp, 1, ([](const py::module *m) {
(void)py::class_<SentencePieceTokenizerOp, TensorOp, std::shared_ptr<SentencePieceTokenizerOp>>(
*m, "SentencePieceTokenizerOp")
.def(
py::init<std::shared_ptr<SentencePieceVocab> &, const SPieceTokenizerLoadType, const SPieceTokenizerOutType>())
.def(py::init<const std::string &, const std::string &, const SPieceTokenizerLoadType,
const SPieceTokenizerOutType>());
}));
PYBIND_REGISTER(ToNumberOp, 1, ([](const py::module *m) {
(void)py::class_<ToNumberOp, TensorOp, std::shared_ptr<ToNumberOp>>(*m, "ToNumberOp")
.def(py::init<DataType>())
.def(py::init<std::string>());
}));
PYBIND_REGISTER(TruncateSequencePairOp, 1, ([](const py::module *m) {
(void)py::class_<TruncateSequencePairOp, TensorOp, std::shared_ptr<TruncateSequencePairOp>>(
*m, "TruncateSequencePairOp")
.def(py::init<int64_t>());
}));
PYBIND_REGISTER(JiebaMode, 0, ([](const py::module *m) {
(void)py::enum_<JiebaMode>(*m, "JiebaMode", py::arithmetic())
.value("DE_JIEBA_MIX", JiebaMode::kMix)
.value("DE_JIEBA_MP", JiebaMode::kMp)
.value("DE_JIEBA_HMM", JiebaMode::kHmm)
.export_values();
}));
PYBIND_REGISTER(SPieceTokenizerOutType, 0, ([](const py::module *m) {
(void)py::enum_<SPieceTokenizerOutType>(*m, "SPieceTokenizerOutType", py::arithmetic())
.value("DE_SPIECE_TOKENIZER_OUTTYPE_KString", SPieceTokenizerOutType::kString)
.value("DE_SPIECE_TOKENIZER_OUTTYPE_KINT", SPieceTokenizerOutType::kInt)
.export_values();
}));
PYBIND_REGISTER(SPieceTokenizerLoadType, 0, ([](const py::module *m) {
(void)py::enum_<SPieceTokenizerLoadType>(*m, "SPieceTokenizerLoadType", py::arithmetic())
.value("DE_SPIECE_TOKENIZER_LOAD_KFILE", SPieceTokenizerLoadType::kFile)
.value("DE_SPIECE_TOKENIZER_LOAD_KMODEL", SPieceTokenizerLoadType::kModel)
.export_values();
}));
} // namespace dataset
} // namespace mindspore

View File

@ -0,0 +1,267 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include "pybind11/stl_bind.h"
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/include/text.h"
#include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vocab.h"
namespace mindspore {
namespace dataset {
#ifdef ENABLE_ICU4C
PYBIND_REGISTER(
BasicTokenizerOperation, 1, ([](const py::module *m) {
(void)py::class_<text::BasicTokenizerOperation, TensorOperation, std::shared_ptr<text::BasicTokenizerOperation>>(
*m, "BasicTokenizerOperation")
.def(py::init([](bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
bool preserve_unused_token, bool with_offsets) {
auto basic_tokenizer = std::make_shared<text::BasicTokenizerOperation>(
lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets);
THROW_IF_ERROR(basic_tokenizer->ValidateParams());
return basic_tokenizer;
}));
}));
PYBIND_REGISTER(
BertTokenizerOperation, 1, ([](const py::module *m) {
(void)py::class_<text::BertTokenizerOperation, TensorOperation, std::shared_ptr<text::BertTokenizerOperation>>(
*m, "BertTokenizerOperation")
.def(py::init([](const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case,
bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
bool with_offsets) {
auto bert_tokenizer = std::make_shared<text::BertTokenizerOperation>(
vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case, keep_whitespace, normalize_form,
preserve_unused_token, with_offsets);
THROW_IF_ERROR(bert_tokenizer->ValidateParams());
return bert_tokenizer;
}));
}));
PYBIND_REGISTER(CaseFoldOperation, 1, ([](const py::module *m) {
(void)py::class_<text::CaseFoldOperation, TensorOperation, std::shared_ptr<text::CaseFoldOperation>>(
*m, "CaseFoldOperation")
.def(py::init([]() {
auto case_fold = std::make_shared<text::CaseFoldOperation>();
THROW_IF_ERROR(case_fold->ValidateParams());
return case_fold;
}));
}));
PYBIND_REGISTER(
NormalizeUTF8Operation, 1, ([](const py::module *m) {
(void)py::class_<text::NormalizeUTF8Operation, TensorOperation, std::shared_ptr<text::NormalizeUTF8Operation>>(
*m, "NormalizeUTF8Operation")
.def(py::init([](NormalizeForm normalize_form) {
auto normalize_utf8 = std::make_shared<text::NormalizeUTF8Operation>(normalize_form);
THROW_IF_ERROR(normalize_utf8->ValidateParams());
return normalize_utf8;
}));
}));
PYBIND_REGISTER(
RegexReplaceOperation, 1, ([](const py::module *m) {
(void)py::class_<text::RegexReplaceOperation, TensorOperation, std::shared_ptr<text::RegexReplaceOperation>>(
*m, "RegexReplaceOperation")
.def(py::init([](std::string pattern, std::string replace, bool replace_all) {
auto regex_replace = std::make_shared<text::RegexReplaceOperation>(pattern, replace, replace_all);
THROW_IF_ERROR(regex_replace->ValidateParams());
return regex_replace;
}));
}));
PYBIND_REGISTER(
RegexTokenizerOperation, 1, ([](const py::module *m) {
(void)py::class_<text::RegexTokenizerOperation, TensorOperation, std::shared_ptr<text::RegexTokenizerOperation>>(
*m, "RegexTokenizerOperation")
.def(
py::init([](const std::string &delim_pattern, const std::string &keep_delim_pattern, const bool &with_offsets) {
auto regex_tokenizer =
std::make_shared<text::RegexTokenizerOperation>(delim_pattern, keep_delim_pattern, with_offsets);
THROW_IF_ERROR(regex_tokenizer->ValidateParams());
return regex_tokenizer;
}));
}));
PYBIND_REGISTER(UnicodeScriptTokenizerOperation, 1, ([](const py::module *m) {
(void)py::class_<text::UnicodeScriptTokenizerOperation, TensorOperation,
std::shared_ptr<text::UnicodeScriptTokenizerOperation>>(
*m, "UnicodeScriptTokenizerOperation")
.def(py::init([](bool keep_whitespace, bool with_offsets) {
auto unicode_script_tokenizer =
std::make_shared<text::UnicodeScriptTokenizerOperation>(keep_whitespace, with_offsets);
THROW_IF_ERROR(unicode_script_tokenizer->ValidateParams());
return unicode_script_tokenizer;
}));
}));
PYBIND_REGISTER(WhitespaceTokenizerOperation, 1, ([](const py::module *m) {
(void)py::class_<text::WhitespaceTokenizerOperation, TensorOperation,
std::shared_ptr<text::WhitespaceTokenizerOperation>>(*m,
"WhitespaceTokenizerOperation")
.def(py::init([](bool with_offsets) {
auto whitespace_tokenizer = std::make_shared<text::WhitespaceTokenizerOperation>(with_offsets);
THROW_IF_ERROR(whitespace_tokenizer->ValidateParams());
return whitespace_tokenizer;
}));
}));
PYBIND_REGISTER(NormalizeForm, 0, ([](const py::module *m) {
(void)py::enum_<NormalizeForm>(*m, "NormalizeForm", py::arithmetic())
.value("DE_NORMALIZE_NONE", NormalizeForm::kNone)
.value("DE_NORMALIZE_NFC", NormalizeForm::kNfc)
.value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc)
.value("DE_NORMALIZE_NFD", NormalizeForm::kNfd)
.value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd)
.export_values();
}));
#endif
PYBIND_REGISTER(
JiebaTokenizerOperation, 1, ([](const py::module *m) {
(void)py::class_<text::JiebaTokenizerOperation, TensorOperation, std::shared_ptr<text::JiebaTokenizerOperation>>(
*m, "JiebaTokenizerOperation")
.def(
py::init([](const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode, bool with_offsets) {
auto jieba_tokenizer = std::make_shared<text::JiebaTokenizerOperation>(hmm_path, mp_path, mode, with_offsets);
THROW_IF_ERROR(jieba_tokenizer->ValidateParams());
return jieba_tokenizer;
}))
.def("add_word", [](text::JiebaTokenizerOperation &self, const std::string word, int64_t freq) {
THROW_IF_ERROR(self.AddWord(word, freq));
});
}));
PYBIND_REGISTER(LookupOperation, 1, ([](const py::module *m) {
(void)py::class_<text::LookupOperation, TensorOperation, std::shared_ptr<text::LookupOperation>>(
*m, "LookupOperation")
.def(py::init([](const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
const std::string &data_type) {
auto lookup = std::make_shared<text::LookupOperation>(vocab, unknown_token, data_type);
THROW_IF_ERROR(lookup->ValidateParams());
return lookup;
}));
}));
PYBIND_REGISTER(NgramOperation, 1, ([](const py::module *m) {
(void)py::class_<text::NgramOperation, TensorOperation, std::shared_ptr<text::NgramOperation>>(
*m, "NgramOperation")
.def(
py::init([](const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
const std::pair<std::string, int32_t> &right_pad, const std::string &separator) {
auto ngram = std::make_shared<text::NgramOperation>(ngrams, left_pad, right_pad, separator);
THROW_IF_ERROR(ngram->ValidateParams());
return ngram;
}));
}));
PYBIND_REGISTER(
SentencePieceTokenizerOperation, 1, ([](const py::module *m) {
(void)py::class_<text::SentencePieceTokenizerOperation, TensorOperation,
std::shared_ptr<text::SentencePieceTokenizerOperation>>(*m, "SentencePieceTokenizerOperation")
.def(py::init([](const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type) {
auto SentencePieceTokenizer = std::make_shared<text::SentencePieceTokenizerOperation>(vocab, out_type);
THROW_IF_ERROR(SentencePieceTokenizer->ValidateParams());
return SentencePieceTokenizer;
}))
.def(py::init([](const std::string &vocab_path, SPieceTokenizerOutType out_type) {
auto sentence_piece_tokenizer = std::make_shared<text::SentencePieceTokenizerOperation>(vocab_path, out_type);
THROW_IF_ERROR(sentence_piece_tokenizer->ValidateParams());
return sentence_piece_tokenizer;
}));
}));
PYBIND_REGISTER(
SlidingWindowOperation, 1, ([](const py::module *m) {
(void)py::class_<text::SlidingWindowOperation, TensorOperation, std::shared_ptr<text::SlidingWindowOperation>>(
*m, "SlidingWindowOperation")
.def(py::init([](const int32_t width, const int32_t axis) {
auto sliding_window = std::make_shared<text::SlidingWindowOperation>(width, axis);
THROW_IF_ERROR(sliding_window->ValidateParams());
return sliding_window;
}));
}));
PYBIND_REGISTER(ToNumberOperation, 1, ([](const py::module *m) {
(void)py::class_<text::ToNumberOperation, TensorOperation, std::shared_ptr<text::ToNumberOperation>>(
*m, "ToNumberOperation")
.def(py::init([](std::string data_type) {
auto to_number = std::make_shared<text::ToNumberOperation>(data_type);
THROW_IF_ERROR(to_number->ValidateParams());
return to_number;
}));
}));
PYBIND_REGISTER(TruncateSequencePairOperation, 1, ([](const py::module *m) {
(void)py::class_<text::TruncateSequencePairOperation, TensorOperation,
std::shared_ptr<text::TruncateSequencePairOperation>>(
*m, "TruncateSequencePairOperation")
.def(py::init([](int32_t max_length) {
auto truncate_sequence_pair = std::make_shared<text::TruncateSequencePairOperation>(max_length);
THROW_IF_ERROR(truncate_sequence_pair->ValidateParams());
return truncate_sequence_pair;
}));
}));
PYBIND_REGISTER(UnicodeCharTokenizerOperation, 1, ([](const py::module *m) {
(void)py::class_<text::UnicodeCharTokenizerOperation, TensorOperation,
std::shared_ptr<text::UnicodeCharTokenizerOperation>>(
*m, "UnicodeCharTokenizerOperation")
.def(py::init([](bool with_offsets) {
auto unicode_char_tokenizer = std::make_shared<text::UnicodeCharTokenizerOperation>(with_offsets);
THROW_IF_ERROR(unicode_char_tokenizer->ValidateParams());
return unicode_char_tokenizer;
}));
}));
// TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++
PYBIND_REGISTER(WordpieceTokenizerOp, 1, ([](const py::module *m) {
(void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
*m, "WordpieceTokenizerOp")
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &,
const bool &>());
}));
PYBIND_REGISTER(JiebaMode, 0, ([](const py::module *m) {
(void)py::enum_<JiebaMode>(*m, "JiebaMode", py::arithmetic())
.value("DE_JIEBA_MIX", JiebaMode::kMix)
.value("DE_JIEBA_MP", JiebaMode::kMp)
.value("DE_JIEBA_HMM", JiebaMode::kHmm)
.export_values();
}));
PYBIND_REGISTER(SPieceTokenizerLoadType, 0, ([](const py::module *m) {
(void)py::enum_<SPieceTokenizerLoadType>(*m, "SPieceTokenizerLoadType", py::arithmetic())
.value("DE_SPIECE_TOKENIZER_LOAD_KFILE", SPieceTokenizerLoadType::kFile)
.value("DE_SPIECE_TOKENIZER_LOAD_KMODEL", SPieceTokenizerLoadType::kModel)
.export_values();
}));
PYBIND_REGISTER(SPieceTokenizerOutType, 0, ([](const py::module *m) {
(void)py::enum_<SPieceTokenizerOutType>(*m, "SPieceTokenizerOutType", py::arithmetic())
.value("DE_SPIECE_TOKENIZER_OUTTYPE_KString", SPieceTokenizerOutType::kString)
.value("DE_SPIECE_TOKENIZER_OUTTYPE_KINT", SPieceTokenizerOutType::kInt)
.export_values();
}));
} // namespace dataset
} // namespace mindspore
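Note that every py::init lambda in this new file constructs the IR node and immediately runs ValidateParams(), so bad arguments surface as a Python exception at construction time instead of when TensorOperation::Build() runs. A hedged sketch of driving the bindings directly (normally the Python transforms' parse() methods do this; the module name and the exact exception type raised by THROW_IF_ERROR are assumptions):

```python
import mindspore._c_dataengine as cde  # assumed name of the pybind dataset engine module

# ValidateParams() runs inside each binding's constructor.
sliding_window = cde.SlidingWindowOperation(2, 0)         # width=2, axis=0
ngram = cde.NgramOperation([2], ("_", 1), ("_", 1), " ")  # 2-grams padded with "_"

try:
    cde.SlidingWindowOperation(0, 0)  # width must be positive, so this should fail fast
except (RuntimeError, ValueError) as err:
    print(err)
```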

View File

@ -314,9 +314,31 @@ Status JiebaTokenizerOperation::ValidateParams() {
std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() {
std::shared_ptr<JiebaTokenizerOp> tensor_op =
std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_);
for (auto &word : words_list_) {
Status rc = tensor_op->AddWord(word.first, word.second);
if (rc.IsError()) {
MS_LOG(ERROR) << rc;
return {};
}
}
return tensor_op;
}
Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) {
if (word.empty()) {
std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided.";
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
if (freq < 0) {
std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0.";
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
words_list_.emplace_back(word, freq);
return Status::OK();
}
// LookupOperation
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
const std::string &data_type)
@ -330,12 +352,13 @@ Status LookupOperation::ValidateParams() {
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
default_id_ = vocab_->Lookup(unknown_token_);
if (default_id_ == Vocab::kNoTokenExists) {
std::string err_msg = "Lookup: \"" + unknown_token_ + "\" doesn't exist in vocab.";
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
if (!unknown_token_.empty()) {
default_id_ = vocab_->Lookup(unknown_token_);
if (default_id_ == Vocab::kNoTokenExists) {
std::string err_msg = "Lookup: \"" + unknown_token_ + "\" doesn't exist in vocab.";
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
}
if (!IsTypeNumeric(data_type_)) {
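The guard added above is what allows the Python Lookup transform to pass unknown_token=None (converted to an empty string by replace_none): with an empty unknown_token_ the default-id lookup is skipped, so out-of-vocabulary words fail at runtime instead of mapping to a default id. A hedged, user-level illustration (the vocabulary contents are made up):

```python
import mindspore.dataset.text as text
from mindspore import dtype as mstype

vocab = text.Vocab.from_list(["home", "behind", "the", "world", "<unk>"])

# unknown_token=None becomes '' on the C++ side: the new `if (!unknown_token_.empty())`
# branch skips the default-id lookup, so an OOV token is a runtime error.
strict_lookup = text.Lookup(vocab, unknown_token=None, data_type=mstype.int32)

# With an unknown_token present in the vocab, OOV tokens map to its id.
default_lookup = text.Lookup(vocab, unknown_token="<unk>", data_type=mstype.int32)
```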

View File

@ -331,11 +331,14 @@ class JiebaTokenizerOperation : public TensorOperation {
std::string Name() const override { return kJiebaTokenizerOperation; }
Status AddWord(const std::string &word, int64_t freq = 0);
private:
std::string hmm_path_;
std::string mp_path_;
JiebaMode mode_;
bool with_offsets_;
std::vector<std::pair<std::string, int64_t>> words_list_;
};
class LookupOperation : public TensorOperation {

View File

@ -383,3 +383,7 @@ def check_tensor_op(param, param_name):
"""check whether param is a tensor op or a callable Python function"""
if not isinstance(param, cde.TensorOp) and not callable(param) and not getattr(param, 'parse', None):
raise TypeError("{0} is neither a c_transform op (TensorOperation) nor a callable pyfunc.".format(param_name))
def replace_none(value, default):
return value if value is not None else default
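For reference, the moved helper substitutes only None; falsy values such as False, 0, or '' pass through unchanged, which the call sites in the text transforms below rely on:

```python
assert replace_none(None, "") == ""        # None is replaced by the default
assert replace_none(False, True) is False  # falsy but non-None values are preserved
assert replace_none(0, 5) == 0
```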

View File

@ -55,9 +55,10 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che
check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset, \
check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \
check_random_dataset, check_split, check_bucket_batch_by_length, check_cluedataset, check_save, check_csvdataset, \
check_paddeddataset, check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send, replace_none
check_paddeddataset, check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send
from ..core.config import get_callback_timeout, _init_device_info
from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist
from ..core.validator_helpers import replace_none
try:
context = import_module("mindspore.context")
@ -372,7 +373,7 @@ class Dataset:
Args:
condition_name (str): The condition name that is used to toggle sending next row.
num_batch (int): the number of batches without blocking at the start of each epoch.
callback (function): The callback funciton that will be invoked when sync_update is called.
callback (function): The callback function that will be invoked when sync_update is called.
Returns:
SyncWaitDataset, dataset added a blocking condition.
@ -398,7 +399,7 @@ class Dataset:
1. Make a shuffle buffer that contains the first buffer_size rows.
2. Randomly select an element from the shuffle buffer to be the next row
propogated to the child node.
propagated to the child node.
3. Get the next row (if any) from the parent node and put it in the shuffle buffer.
4. Repeat steps 2 and 3 until there are no more rows left in the shuffle buffer.
@ -1718,7 +1719,7 @@ class MappableDataset(SourceDataset):
- The sum of split sizes < K, the difference will be added to the first split.
- The sum of split sizes > K, the difference will be removed from the first large
enough split such that it will have atleast 1 row after removing the difference.
enough split such that it will have at least 1 row after removing the difference.
randomize (bool, optional): Determines whether or not to split the data randomly (default=True).
If True, the data will be randomly split. Otherwise, each split will be created with

View File

@ -1323,7 +1323,3 @@ def check_to_device_send(method):
return method(self, *args, **kwargs)
return new_method
def replace_none(value, default):
return value if value is not None else default

View File

@ -58,9 +58,13 @@ from .validators import check_lookup, check_jieba_add_dict, \
check_wordpiece_tokenizer, check_regex_tokenizer, check_basic_tokenizer, check_ngram, check_pair_truncate, \
check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow
from ..core.datatypes import mstype_to_detype
from ..core.validator_helpers import replace_none
class TextTensorOperation:
def parse(self):
raise NotImplementedError("TextTensorOperation has to implement parse method.")
class Lookup(cde.LookupOp):
class Lookup(TextTensorOperation):
"""
Lookup operator that looks up a word to an id.
@ -82,10 +86,15 @@ class Lookup(cde.LookupOp):
@check_lookup
def __init__(self, vocab, unknown_token=None, data_type=mstype.int32):
super().__init__(vocab, unknown_token, mstype_to_detype(data_type))
self.vocab = vocab
self.unknown_token = replace_none(unknown_token, '')
self.data_type = data_type
def parse(self):
return cde.LookupOperation(self.vocab, self.unknown_token, str(mstype_to_detype(self.data_type)))
class SlidingWindow(cde.SlidingWindowOp):
class SlidingWindow(TextTensorOperation):
"""
TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension axis
is a slice of data starting at the corresponding position, with a specified width.
@ -114,10 +123,14 @@ class SlidingWindow(cde.SlidingWindowOp):
@check_slidingwindow
def __init__(self, width, axis=0):
super().__init__(width, axis)
self.width = width
self.axis = axis
def parse(self):
return cde.SlidingWindowOperation(self.width, self.axis)
class Ngram(cde.NgramOp):
class Ngram(TextTensorOperation):
"""
TensorOp to generate n-gram from a 1-D string Tensor.
@ -145,7 +158,13 @@ class Ngram(cde.NgramOp):
@check_ngram
def __init__(self, n, left_pad=("", 0), right_pad=("", 0), separator=" "):
super().__init__(n, left_pad[1], right_pad[1], left_pad[0], right_pad[0], separator)
self.ngrams = n
self.left_pad = left_pad
self.right_pad = right_pad
self.separator = separator
def parse(self):
return cde.NgramOperation(self.ngrams, self.left_pad, self.right_pad, self.separator)
DE_C_INTER_JIEBA_MODE = {
@ -155,7 +174,7 @@ DE_C_INTER_JIEBA_MODE = {
}
class JiebaTokenizer(cde.JiebaTokenizerOp):
class JiebaTokenizer(TextTensorOperation):
"""
Tokenize Chinese string into words based on dictionary.
@ -196,11 +215,19 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
self.mode = mode
self.__check_path__(hmm_path)
self.hmm_path = hmm_path
self.__check_path__(mp_path)
self.mp_path = mp_path
self.with_offsets = with_offsets
super().__init__(hmm_path, mp_path,
DE_C_INTER_JIEBA_MODE[mode],
self.with_offsets)
self.words = []
def parse(self):
jieba_tokenizer = cde.JiebaTokenizerOperation(self.hmm_path, self.mp_path,
DE_C_INTER_JIEBA_MODE[self.mode],
self.with_offsets)
for word in self.words:
jieba_tokenizer.add_word(word[0], word[1])
return jieba_tokenizer
@check_jieba_add_word
def add_word(self, word, freq=None):
@ -225,9 +252,9 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
"""
if freq is None:
super().add_word(word, 0)
self.words.append((word, 0))
else:
super().add_word(word, freq)
self.words.append((word, freq))
@check_jieba_add_dict
def add_dict(self, user_dict):
@ -308,7 +335,7 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
" jieba mode file {} is not exist.".format(model_path))
class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
class UnicodeCharTokenizer(TextTensorOperation):
"""
Tokenize a scalar tensor of UTF-8 string to Unicode characters.
@ -332,9 +359,12 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
@check_with_offsets
def __init__(self, with_offsets=False):
self.with_offsets = with_offsets
super().__init__(self.with_offsets)
def parse(self):
return cde.UnicodeCharTokenizerOperation(self.with_offsets)
# TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++
class WordpieceTokenizer(cde.WordpieceTokenizerOp):
"""
Tokenize scalar token or 1-D tokens to 1-D subword tokens.
@ -386,7 +416,7 @@ DE_C_INTER_SENTENCEPIECE_OUTTYPE = {
}
class SentencePieceTokenizer(cde.SentencePieceTokenizerOp):
class SentencePieceTokenizer(TextTensorOperation):
"""
Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
@ -404,19 +434,15 @@ class SentencePieceTokenizer(cde.SentencePieceTokenizerOp):
"""
def __init__(self, mode, out_type):
self.mode = mode
self.out_type = out_type
if isinstance(mode, str):
model_path, model_filename = os.path.split(mode)
super().__init__(model_path, model_filename,
DE_C_INTER_SENTENCEPIECE_LOADTYPE[SPieceTokenizerLoadType.FILE],
DE_C_INTER_SENTENCEPIECE_OUTTYPE[out_type])
elif isinstance(mode, cde.SentencePieceVocab):
super().__init__(mode, DE_C_INTER_SENTENCEPIECE_LOADTYPE[SPieceTokenizerLoadType.MODEL],
DE_C_INTER_SENTENCEPIECE_OUTTYPE[out_type])
def parse(self):
return cde.SentencePieceTokenizerOperation(self.mode, DE_C_INTER_SENTENCEPIECE_OUTTYPE[self.out_type])
if platform.system().lower() != 'windows':
class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
class WhitespaceTokenizer(TextTensorOperation):
"""
Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces, such as: ' ', '\\\\t', '\\\\r', '\\\\n'.
@ -444,10 +470,12 @@ if platform.system().lower() != 'windows':
@check_with_offsets
def __init__(self, with_offsets=False):
self.with_offsets = with_offsets
super().__init__(self.with_offsets)
def parse(self):
return cde.WhitespaceTokenizerOperation(self.with_offsets)
class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp):
class UnicodeScriptTokenizer(TextTensorOperation):
"""
Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
@ -475,12 +503,16 @@ if platform.system().lower() != 'windows':
@check_unicode_script_tokenizer
def __init__(self, keep_whitespace=False, with_offsets=False):
keep_whitespace = replace_none(keep_whitespace, False)
with_offsets = replace_none(with_offsets, False)
self.keep_whitespace = keep_whitespace
self.with_offsets = with_offsets
super().__init__(self.keep_whitespace, self.with_offsets)
def parse(self):
return cde.UnicodeScriptTokenizerOperation(self.keep_whitespace, self.with_offsets)
class CaseFold(cde.CaseFoldOp):
class CaseFold(TextTensorOperation):
"""
Apply case fold operation on UTF-8 string tensor.
@ -494,6 +526,9 @@ if platform.system().lower() != 'windows':
>>> data1 = data1.map(operations=case_op)
"""
def parse(self):
return cde.CaseFoldOperation()
DE_C_INTER_NORMALIZE_FORM = {
NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE,
@ -504,7 +539,7 @@ if platform.system().lower() != 'windows':
}
class NormalizeUTF8(cde.NormalizeUTF8Op):
class NormalizeUTF8(TextTensorOperation):
"""
Apply normalize operation on UTF-8 string tensor.
@ -534,11 +569,14 @@ if platform.system().lower() != 'windows':
if not isinstance(normalize_form, NormalizeForm):
raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.")
normalize_form = replace_none(normalize_form, NormalizeForm.NFKC)
self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form]
super().__init__(self.normalize_form)
def parse(self):
return cde.NormalizeUTF8Operation(self.normalize_form)
class RegexReplace(cde.RegexReplaceOp):
class RegexReplace(TextTensorOperation):
"""
Replace UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
@ -566,10 +604,12 @@ if platform.system().lower() != 'windows':
self.pattern = pattern
self.replace = replace
self.replace_all = replace_all
super().__init__(self.pattern, self.replace, self.replace_all)
def parse(self):
return cde.RegexReplaceOperation(self.pattern, self.replace, self.replace_all)
class RegexTokenizer(cde.RegexTokenizerOp):
class RegexTokenizer(TextTensorOperation):
"""
Tokenize a scalar tensor of UTF-8 string by regex expression pattern.
@ -606,10 +646,12 @@ if platform.system().lower() != 'windows':
self.delim_pattern = delim_pattern
self.keep_delim_pattern = keep_delim_pattern
self.with_offsets = with_offsets
super().__init__(self.delim_pattern, self.keep_delim_pattern, self.with_offsets)
def parse(self):
return cde.RegexTokenizerOperation(self.delim_pattern, self.keep_delim_pattern, self.with_offsets)
class BasicTokenizer(cde.BasicTokenizerOp):
class BasicTokenizer(TextTensorOperation):
"""
Tokenize a scalar tensor of UTF-8 string by specific rules.
@ -661,11 +703,13 @@ if platform.system().lower() != 'windows':
self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
self.preserve_unused_token = preserve_unused_token
self.with_offsets = with_offsets
super().__init__(self.lower_case, self.keep_whitespace, self.normalization_form,
self.preserve_unused_token, self.with_offsets)
def parse(self):
return cde.BasicTokenizerOperation(self.lower_case, self.keep_whitespace, self.normalization_form,
self.preserve_unused_token, self.with_offsets)
class BertTokenizer(cde.BertTokenizerOp):
class BertTokenizer(TextTensorOperation):
"""
Tokenizer used for Bert text process.
@ -725,12 +769,14 @@ if platform.system().lower() != 'windows':
self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
self.preserve_unused_token = preserve_unused_token
self.with_offsets = with_offsets
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token,
self.lower_case, self.keep_whitespace, self.normalization_form,
self.preserve_unused_token, self.with_offsets)
def parse(self):
return cde.BertTokenizerOperation(self.vocab, self.suffix_indicator, self.max_bytes_per_token,
self.unknown_token, self.lower_case, self.keep_whitespace,
self.normalization_form, self.preserve_unused_token, self.with_offsets)
class TruncateSequencePair(cde.TruncateSequencePairOp):
class TruncateSequencePair(TextTensorOperation):
"""
Truncate a pair of rank-1 tensors such that the total length is less than max_length.
@ -757,10 +803,13 @@ class TruncateSequencePair(cde.TruncateSequencePairOp):
@check_pair_truncate
def __init__(self, max_length):
super().__init__(max_length)
self.max_length = max_length
def parse(self):
return cde.TruncateSequencePairOperation(self.max_length)
class ToNumber(cde.ToNumberOp):
class ToNumber(TextTensorOperation):
"""
Tensor operation to convert every element of a string tensor to a number.
@ -789,7 +838,9 @@ class ToNumber(cde.ToNumberOp):
def __init__(self, data_type):
data_type = mstype_to_detype(data_type)
self.data_type = str(data_type)
super().__init__(data_type)
def parse(self):
return cde.ToNumberOperation(self.data_type)
class PythonTokenizer:

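Taken together, the user-facing API stays the same; the difference is that each transform is now a plain Python object whose parse() is invoked when the map operation builds the execution tree. A hedged end-to-end sketch (the corpus path is hypothetical):

```python
import mindspore.dataset as ds
import mindspore.dataset.text as text

# The op is a plain TextTensorOperation wrapper; Dataset.map() is expected to call
# its parse() to obtain the C++ IR node when the pipeline is constructed.
dataset = ds.TextFileDataset("/path/to/corpus.txt", shuffle=False)
tokenizer = text.UnicodeCharTokenizer(with_offsets=False)
dataset = dataset.map(operations=tokenizer, input_columns=["text"])

for row in dataset.create_dict_iterator():
    print(row["text"])
```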
View File

@ -81,11 +81,11 @@ def parse_padding(padding):
padding = tuple(padding)
return padding
class TensorOperation:
class ImageTensorOperation:
def parse(self):
raise NotImplementedError("TensorOperation has to implement parse method.")
raise NotImplementedError("ImageTensorOperation has to implement parse method.")
class AutoContrast(TensorOperation):
class AutoContrast(ImageTensorOperation):
"""
Apply automatic contrast on input image.
@ -112,7 +112,7 @@ class AutoContrast(TensorOperation):
return cde.AutoContrastOperation(self.cutoff, self.ignore)
class RandomSharpness(TensorOperation):
class RandomSharpness(ImageTensorOperation):
"""
Adjust the sharpness of the input image by a fixed or random degree. Degree of 0.0 gives a blurred image,
degree of 1.0 gives the original image, and degree of 2.0 gives a sharpened image.
@ -140,7 +140,7 @@ class RandomSharpness(TensorOperation):
return cde.RandomSharpnessOperation(self.degrees)
class Equalize(TensorOperation):
class Equalize(ImageTensorOperation):
"""
Apply histogram equalization on input image.
@ -153,7 +153,7 @@ class Equalize(TensorOperation):
return cde.EqualizeOperation()
class Invert(TensorOperation):
class Invert(ImageTensorOperation):
"""
Apply invert on input image in RGB mode.
@ -166,7 +166,7 @@ class Invert(TensorOperation):
return cde.InvertOperation()
class Decode(TensorOperation):
class Decode(ImageTensorOperation):
"""
Decode the input image in RGB mode.
@ -203,7 +203,7 @@ class Decode(TensorOperation):
return cde.DecodeOperation(self.rgb)
class CutMixBatch(TensorOperation):
class CutMixBatch(ImageTensorOperation):
"""
Apply CutMix transformation on input batch of images and labels.
Note that you need to make labels into one-hot format and batch before calling this function.
@ -235,7 +235,7 @@ class CutMixBatch(TensorOperation):
return cde.CutMixBatchOperation(DE_C_IMAGE_BATCH_FORMAT[self.image_batch_format], self.alpha, self.prob)
class CutOut(TensorOperation):
class CutOut(ImageTensorOperation):
"""
Randomly cut (mask) out a given number of square patches from the input NumPy image array.
@ -258,7 +258,7 @@ class CutOut(TensorOperation):
return cde.CutOutOperation(self.length, self.num_patches)
class MixUpBatch(TensorOperation):
class MixUpBatch(ImageTensorOperation):
"""
Apply MixUp transformation on input batch of images and labels. Each image is multiplied by a random weight (lambda)
and then added to a randomly selected image from the batch multiplied by (1 - lambda). The same formula is also
@ -286,7 +286,7 @@ class MixUpBatch(TensorOperation):
return cde.MixUpBatchOperation(self.alpha)
class Normalize(TensorOperation):
class Normalize(ImageTensorOperation):
"""
Normalize the input image with respect to mean and standard deviation.
@ -333,7 +333,7 @@ class Normalize(TensorOperation):
return cde.NormalizeOperation(self.mean, self.std)
class NormalizePad(TensorOperation):
class NormalizePad(ImageTensorOperation):
"""
Normalize the input image with respect to mean and standard deviation then pad an extra channel with value zero.
@ -380,7 +380,7 @@ class NormalizePad(TensorOperation):
return cde.NormalizePadOperation(self.mean, self.std, self.dtype)
class RandomAffine(TensorOperation):
class RandomAffine(ImageTensorOperation):
"""
Apply Random affine transformation to the input image.
@ -486,7 +486,7 @@ class RandomAffine(TensorOperation):
self.fill_value)
class RandomCrop(TensorOperation):
class RandomCrop(ImageTensorOperation):
"""
Crop the input image at a random location.
@ -551,7 +551,7 @@ class RandomCrop(TensorOperation):
return cde.RandomCropOperation(self.size, self.padding, self.pad_if_needed, self.fill_value, border_type)
class RandomCropWithBBox(TensorOperation):
class RandomCropWithBBox(ImageTensorOperation):
"""
Crop the input image at a random location and adjust bounding boxes accordingly.
@ -615,7 +615,7 @@ class RandomCropWithBBox(TensorOperation):
border_type)
class RandomHorizontalFlip(TensorOperation):
class RandomHorizontalFlip(ImageTensorOperation):
"""
Flip the input image horizontally, randomly with a given probability.
@ -636,7 +636,7 @@ class RandomHorizontalFlip(TensorOperation):
return cde.RandomHorizontalFlipOperation(self.prob)
class RandomHorizontalFlipWithBBox(TensorOperation):
class RandomHorizontalFlipWithBBox(ImageTensorOperation):
"""
Flip the input image horizontally, randomly with a given probability and adjust bounding boxes accordingly.
@ -657,7 +657,7 @@ class RandomHorizontalFlipWithBBox(TensorOperation):
return cde.RandomHorizontalFlipWithBBoxOperation(self.prob)
class RandomPosterize(TensorOperation):
class RandomPosterize(ImageTensorOperation):
"""
Reduce the number of bits for each color channel.
@ -685,7 +685,7 @@ class RandomPosterize(TensorOperation):
return cde.RandomPosterizeOperation(bits)
class RandomVerticalFlip(TensorOperation):
class RandomVerticalFlip(ImageTensorOperation):
"""
Flip the input image vertically, randomly with a given probability.
@ -706,7 +706,7 @@ class RandomVerticalFlip(TensorOperation):
return cde.RandomVerticalFlipOperation(self.prob)
class RandomVerticalFlipWithBBox(TensorOperation):
class RandomVerticalFlipWithBBox(ImageTensorOperation):
"""
Flip the input image vertically, randomly with a given probability and adjust bounding boxes accordingly.
@ -727,7 +727,7 @@ class RandomVerticalFlipWithBBox(TensorOperation):
return cde.RandomVerticalFlipWithBBoxOperation(self.prob)
class BoundingBoxAugment(TensorOperation):
class BoundingBoxAugment(ImageTensorOperation):
"""
Apply a given image transform on a random selection of bounding box regions of a given image.
@ -760,7 +760,7 @@ class BoundingBoxAugment(TensorOperation):
return cde.BoundingBoxAugmentOperation(transform, self.ratio)
class Resize(TensorOperation):
class Resize(ImageTensorOperation):
"""
Resize the input image to the given size.
@ -816,7 +816,7 @@ class Resize(TensorOperation):
return cde.ResizeOperation(self.size, DE_C_INTER_MODE[self.interpolation])
class ResizeWithBBox(TensorOperation):
class ResizeWithBBox(ImageTensorOperation):
"""
Resize the input image to the given size and adjust bounding boxes accordingly.
@ -855,7 +855,7 @@ class ResizeWithBBox(TensorOperation):
return cde.ResizeWithBBoxOperation(size, DE_C_INTER_MODE[self.interpolation])
class RandomResizedCropWithBBox(TensorOperation):
class RandomResizedCropWithBBox(ImageTensorOperation):
"""
Crop the input image to a random size and aspect ratio and adjust bounding boxes accordingly.
@ -904,7 +904,7 @@ class RandomResizedCropWithBBox(TensorOperation):
DE_C_INTER_MODE[self.interpolation], self.max_attempts)
class RandomResizedCrop(TensorOperation):
class RandomResizedCrop(ImageTensorOperation):
"""
Crop the input image to a random size and aspect ratio.
@ -954,7 +954,7 @@ class RandomResizedCrop(TensorOperation):
self.max_attempts)
class CenterCrop(TensorOperation):
class CenterCrop(ImageTensorOperation):
"""
Crops the input image at the center to the given size.
@ -984,7 +984,7 @@ class CenterCrop(TensorOperation):
return cde.CenterCropOperation(self.size)
class RandomColor(TensorOperation):
class RandomColor(ImageTensorOperation):
"""
Adjust the color of the input image by a fixed or random degree.
This operation works only with 3-channel color images.
@ -1008,7 +1008,7 @@ class RandomColor(TensorOperation):
return cde.RandomColorOperation(*self.degrees)
class RandomColorAdjust(TensorOperation):
class RandomColorAdjust(ImageTensorOperation):
"""
Randomly adjust the brightness, contrast, saturation, and hue of the input image.
@ -1060,7 +1060,7 @@ class RandomColorAdjust(TensorOperation):
return cde.RandomColorAdjustOperation(self.brightness, self.contrast, self.saturation, self.hue)
class RandomRotation(TensorOperation):
class RandomRotation(ImageTensorOperation):
"""
Rotate the input image by a random angle.
@ -1116,7 +1116,7 @@ class RandomRotation(TensorOperation):
return cde.RandomRotationOperation(degrees, interpolation, expand, center, fill_value)
class Rescale(TensorOperation):
class Rescale(ImageTensorOperation):
"""
Tensor operation to rescale the input image.
@ -1155,7 +1155,7 @@ class Rescale(TensorOperation):
return cde.RescaleOperation(self.rescale, self.shift)
class RandomResize(TensorOperation):
class RandomResize(ImageTensorOperation):
"""
Tensor operation to resize the input image using a randomly selected interpolation mode.
@ -1187,7 +1187,7 @@ class RandomResize(TensorOperation):
return cde.RandomResizeOperation(size)
class RandomResizeWithBBox(TensorOperation):
class RandomResizeWithBBox(ImageTensorOperation):
"""
Tensor operation to resize the input image using a randomly selected interpolation mode and adjust
bounding boxes accordingly.
@ -1220,7 +1220,7 @@ class RandomResizeWithBBox(TensorOperation):
return cde.RandomResizeWithBBoxOperation(size)
class HWC2CHW(TensorOperation):
class HWC2CHW(ImageTensorOperation):
"""
Transpose the input image; shape (H, W, C) to shape (C, H, W).
@ -1253,7 +1253,7 @@ class HWC2CHW(TensorOperation):
return cde.HwcToChwOperation()
class RandomCropDecodeResize(TensorOperation):
class RandomCropDecodeResize(ImageTensorOperation):
"""
Equivalent to RandomResizedCrop, but crops before decodes.
@ -1305,7 +1305,7 @@ class RandomCropDecodeResize(TensorOperation):
self.max_attempts)
class Pad(TensorOperation):
class Pad(ImageTensorOperation):
"""
Pads the image according to padding parameters.
@ -1370,7 +1370,7 @@ class Pad(TensorOperation):
return img.as_array()
class UniformAugment(TensorOperation):
class UniformAugment(ImageTensorOperation):
"""
Tensor operation to perform randomly selected augmentation.
@ -1407,7 +1407,7 @@ class UniformAugment(TensorOperation):
return cde.UniformAugOperation(transforms, self.num_ops)
class RandomSelectSubpolicy(TensorOperation):
class RandomSelectSubpolicy(ImageTensorOperation):
"""
Choose a random sub-policy from a list to be applied on the input image. A sub-policy is a list of tuples
(op, prob), where op is a TensorOp operation and prob is the probability that this op will be applied. Once
@ -1446,7 +1446,7 @@ class RandomSelectSubpolicy(TensorOperation):
return cde.RandomSelectSubpolicyOperation(policy)
class SoftDvppDecodeResizeJpeg(TensorOperation):
class SoftDvppDecodeResizeJpeg(ImageTensorOperation):
"""
Tensor operation to decode and resize JPEG image using the simulation algorithm of
Ascend series chip DVPP module.
@ -1486,7 +1486,7 @@ class SoftDvppDecodeResizeJpeg(TensorOperation):
return cde.SoftDvppDecodeResizeJpegOperation(self.size)
class SoftDvppDecodeRandomCropResizeJpeg(TensorOperation):
class SoftDvppDecodeRandomCropResizeJpeg(ImageTensorOperation):
"""
Tensor operation to decode, random crop and resize JPEG image using the simulation algorithm of
Ascend series chip DVPP module.
@ -1531,7 +1531,7 @@ class SoftDvppDecodeRandomCropResizeJpeg(TensorOperation):
return cde.SoftDvppDecodeRandomCropResizeJpegOperation(self.size, self.scale, self.ratio, self.max_attempts)
class RandomSolarize(TensorOperation):
class RandomSolarize(ImageTensorOperation):
"""
Invert all pixel values above a threshold.

View File

@ -877,6 +877,229 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail) {
EXPECT_EQ(jieba_tokenizer3, nullptr);
}
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) {
// Testing the parameter AddWord of JiebaTokenizer when the freq is not provided (default 0).
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
std::shared_ptr<Dataset> ds = TextFile({data_file});
EXPECT_NE(ds, nullptr);
// Create jieba_tokenizer operation on ds
std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer =
text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp);
EXPECT_NE(jieba_tokenizer, nullptr);
// Add word with freq not provided (default 0)
jieba_tokenizer->AddWord("男默女泪");
// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"男默女泪", "", "长江大桥"};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected, &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) {
// Testing the parameter AddWord of JiebaTokenizer when the freq is set explicitly to 0.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord1.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
std::shared_ptr<Dataset> ds = TextFile({data_file});
EXPECT_NE(ds, nullptr);
// Create jieba_tokenizer operation on ds
std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer =
text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp);
EXPECT_NE(jieba_tokenizer, nullptr);
// Add word with freq is set explicitly to 0
jieba_tokenizer->AddWord("男默女泪", 0);
// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"男默女泪", "", "长江大桥"};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected, &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) {
// Testing the parameter AddWord of JiebaTokenizer when the freq is 10.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord2.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
std::shared_ptr<Dataset> ds = TextFile({data_file});
EXPECT_NE(ds, nullptr);
// Create jieba_tokenizer operation on ds
std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer =
text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp);
EXPECT_NE(jieba_tokenizer, nullptr);
// Add word with freq 10
jieba_tokenizer->AddWord("男默女泪", 10);
// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"男默女泪", "", "长江大桥"};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected, &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) {
// Testing the parameter AddWord of JiebaTokenizer when the freq is 20000 which affects the result of segmentation.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord3.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
std::shared_ptr<Dataset> ds = TextFile({data_file});
EXPECT_NE(ds, nullptr);
// Create jieba_tokenizer operation on ds
std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer =
text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp);
EXPECT_NE(jieba_tokenizer, nullptr);
// Add word with freq 20000
jieba_tokenizer->AddWord("江大桥", 20000);
// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "", "长江大桥", "", "通车", "仪式"};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected, &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) {
// Testing the incorrect parameter of AddWord in JiebaTokenizer.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWordFail.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Testing the parameter word of AddWord is empty
std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer =
text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp);
EXPECT_NE(jieba_tokenizer, nullptr);
EXPECT_NE(jieba_tokenizer->AddWord("", 10), Status::OK());
// Testing the parameter freq of AddWord is negative
std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer1 =
text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp);
EXPECT_NE(jieba_tokenizer1, nullptr);
EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK());
}
TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) {
// Testing the parameter of SlidingWindow interface when the axis is 0.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess.";

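A Python-side analogue of the AddWord tests above, showing the new deferred behaviour where added words are replayed onto the C++ node only when parse() builds it (the dictionary paths are placeholders and must point at real jieba files, since JiebaTokenizer validates them in __init__):

```python
import mindspore.dataset.text as text
from mindspore.dataset.text import JiebaMode

HMM_PATH = "/path/to/hmm_model.utf8"  # placeholder; the C++ tests use jiebadict/hmm_model.utf8
MP_PATH = "/path/to/jieba.dict.utf8"  # placeholder; the C++ tests use jiebadict/jieba.dict.utf8

tokenizer = text.JiebaTokenizer(HMM_PATH, MP_PATH, mode=JiebaMode.MP)
tokenizer.add_word("男默女泪")         # freq defaults to 0, as in TestJiebaTokenizerAddWord
tokenizer.add_word("江大桥", 20000)    # a large freq changes the segmentation (AddWord3)

# Words are pushed into the C++ node when parse() builds it, mirroring how
# JiebaTokenizerOperation::Build() replays words_list_ onto JiebaTokenizerOp.
jieba_ir = tokenizer.parse()
```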
View File

@ -166,7 +166,8 @@ def test_lookup_cast_type():
assert test_config("unk") == np.dtype("int32")
# test exception, data_type isn't the correct type
assert "tldr is not of type (<class 'mindspore._c_expression.typing.Type'>,)" in test_config("unk", "tldr")
assert "Lookup doesn't support string to string lookup" in test_config("w1", mstype.string)
assert "Lookup does not support a string to string mapping, data_type can only be numeric." in \
test_config("w1", mstype.string)
if __name__ == '__main__':