forked from mindspore-Ecosystem/mindspore
TextOp decoupling
Signed-off-by: alex-yuyue <yue.yu1@huawei.com>
This commit is contained in:
parent 4e3abb2434
commit a8675f8227
@@ -18,7 +18,7 @@ if(ENABLE_PYTHON)
        python/bindings/dataset/kernels/ir/bindings.cc
        python/bindings/dataset/kernels/ir/image/bindings.cc
        python/bindings/dataset/text/bindings.cc
        python/bindings/dataset/text/kernels/bindings.cc
        python/bindings/dataset/text/kernels/ir/bindings.cc
        python/bindings/mindrecord/include/bindings.cc
        python/pybind_conversion.cc
        python/pybind_register.cc
@@ -1,205 +0,0 @@
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "pybind11/pybind11.h"
|
||||
#include "pybind11/stl.h"
|
||||
#include "pybind11/stl_bind.h"
|
||||
#include "minddata/dataset/api/python/pybind_register.h"
|
||||
|
||||
#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
|
||||
#include "minddata/dataset/text/kernels/lookup_op.h"
|
||||
#include "minddata/dataset/text/kernels/ngram_op.h"
|
||||
#include "minddata/dataset/text/kernels/sliding_window_op.h"
|
||||
#include "minddata/dataset/text/kernels/to_number_op.h"
|
||||
#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
|
||||
#include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h"
|
||||
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
|
||||
#include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h"
|
||||
|
||||
#ifdef ENABLE_ICU4C
|
||||
#include "minddata/dataset/text/kernels/basic_tokenizer_op.h"
|
||||
#include "minddata/dataset/text/kernels/bert_tokenizer_op.h"
|
||||
#include "minddata/dataset/text/kernels/case_fold_op.h"
|
||||
#include "minddata/dataset/text/kernels/normalize_utf8_op.h"
|
||||
#include "minddata/dataset/text/kernels/regex_replace_op.h"
|
||||
#include "minddata/dataset/text/kernels/regex_tokenizer_op.h"
|
||||
#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
|
||||
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
|
||||
#endif
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
#ifdef ENABLE_ICU4C
|
||||
|
||||
PYBIND_REGISTER(BasicTokenizerOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(*m,
|
||||
"BasicTokenizerOp")
|
||||
.def(py::init<const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(WhitespaceTokenizerOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
|
||||
*m, "WhitespaceTokenizerOp")
|
||||
.def(py::init<const bool &>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(UnicodeScriptTokenizerOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
|
||||
*m, "UnicodeScriptTokenizerOp")
|
||||
.def(py::init<>())
|
||||
.def(py::init<const bool &, const bool &>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(
|
||||
CaseFoldOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(*m, "CaseFoldOp").def(py::init<>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(NormalizeUTF8Op, 1, ([](const py::module *m) {
|
||||
(void)py::class_<NormalizeUTF8Op, TensorOp, std::shared_ptr<NormalizeUTF8Op>>(*m, "NormalizeUTF8Op")
|
||||
.def(py::init<>())
|
||||
.def(py::init<NormalizeForm>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(RegexReplaceOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<RegexReplaceOp, TensorOp, std::shared_ptr<RegexReplaceOp>>(*m, "RegexReplaceOp")
|
||||
.def(py::init<const std::string &, const std::string &, bool>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(RegexTokenizerOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(*m,
|
||||
"RegexTokenizerOp")
|
||||
.def(py::init<const std::string &, const std::string &, const bool &>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(BertTokenizerOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(*m, "BertTokenizerOp")
|
||||
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &,
|
||||
const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(NormalizeForm, 0, ([](const py::module *m) {
|
||||
(void)py::enum_<NormalizeForm>(*m, "NormalizeForm", py::arithmetic())
|
||||
.value("DE_NORMALIZE_NONE", NormalizeForm::kNone)
|
||||
.value("DE_NORMALIZE_NFC", NormalizeForm::kNfc)
|
||||
.value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc)
|
||||
.value("DE_NORMALIZE_NFD", NormalizeForm::kNfd)
|
||||
.value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd)
|
||||
.export_values();
|
||||
}));
|
||||
|
||||
#endif
|
||||
|
||||
PYBIND_REGISTER(JiebaTokenizerOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m,
|
||||
"JiebaTokenizerOp")
|
||||
.def(py::init<const std::string &, const std::string &, const JiebaMode &, const bool &>())
|
||||
.def("add_word", [](JiebaTokenizerOp &self, const std::string word, int freq) {
|
||||
THROW_IF_ERROR(self.AddWord(word, freq));
|
||||
});
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(UnicodeCharTokenizerOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<UnicodeCharTokenizerOp, TensorOp, std::shared_ptr<UnicodeCharTokenizerOp>>(
|
||||
*m, "UnicodeCharTokenizerOp")
|
||||
.def(py::init<const bool &>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(LookupOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<LookupOp, TensorOp, std::shared_ptr<LookupOp>>(*m, "LookupOp")
|
||||
.def(py::init([](std::shared_ptr<Vocab> vocab, const py::object &py_word,
|
||||
const DataType &data_type) {
|
||||
if (vocab == nullptr) {
|
||||
THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, "vocab object type is incorrect or null."));
|
||||
}
|
||||
if (py_word.is_none()) {
|
||||
return std::make_shared<LookupOp>(vocab, Vocab::kNoTokenExists, data_type);
|
||||
}
|
||||
std::string word = py::reinterpret_borrow<py::str>(py_word);
|
||||
WordIdType default_id = vocab->Lookup(word);
|
||||
if (default_id == Vocab::kNoTokenExists) {
|
||||
THROW_IF_ERROR(Status(StatusCode::kUnexpectedError,
|
||||
"default unknown token: " + word + " doesn't exist in vocab."));
|
||||
}
|
||||
return std::make_shared<LookupOp>(vocab, default_id, data_type);
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(NgramOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<NgramOp, TensorOp, std::shared_ptr<NgramOp>>(*m, "NgramOp")
|
||||
.def(py::init<const std::vector<int32_t> &, int32_t, int32_t, const std::string &,
|
||||
const std::string &, const std::string &>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(WordpieceTokenizerOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
|
||||
*m, "WordpieceTokenizerOp")
|
||||
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &,
|
||||
const bool &>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(SlidingWindowOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<SlidingWindowOp, TensorOp, std::shared_ptr<SlidingWindowOp>>(*m, "SlidingWindowOp")
|
||||
.def(py::init<uint32_t, int32_t>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(
|
||||
SentencePieceTokenizerOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<SentencePieceTokenizerOp, TensorOp, std::shared_ptr<SentencePieceTokenizerOp>>(
|
||||
*m, "SentencePieceTokenizerOp")
|
||||
.def(
|
||||
py::init<std::shared_ptr<SentencePieceVocab> &, const SPieceTokenizerLoadType, const SPieceTokenizerOutType>())
|
||||
.def(py::init<const std::string &, const std::string &, const SPieceTokenizerLoadType,
|
||||
const SPieceTokenizerOutType>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(ToNumberOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<ToNumberOp, TensorOp, std::shared_ptr<ToNumberOp>>(*m, "ToNumberOp")
|
||||
.def(py::init<DataType>())
|
||||
.def(py::init<std::string>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(TruncateSequencePairOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<TruncateSequencePairOp, TensorOp, std::shared_ptr<TruncateSequencePairOp>>(
|
||||
*m, "TruncateSequencePairOp")
|
||||
.def(py::init<int64_t>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(JiebaMode, 0, ([](const py::module *m) {
|
||||
(void)py::enum_<JiebaMode>(*m, "JiebaMode", py::arithmetic())
|
||||
.value("DE_JIEBA_MIX", JiebaMode::kMix)
|
||||
.value("DE_JIEBA_MP", JiebaMode::kMp)
|
||||
.value("DE_JIEBA_HMM", JiebaMode::kHmm)
|
||||
.export_values();
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(SPieceTokenizerOutType, 0, ([](const py::module *m) {
|
||||
(void)py::enum_<SPieceTokenizerOutType>(*m, "SPieceTokenizerOutType", py::arithmetic())
|
||||
.value("DE_SPIECE_TOKENIZER_OUTTYPE_KString", SPieceTokenizerOutType::kString)
|
||||
.value("DE_SPIECE_TOKENIZER_OUTTYPE_KINT", SPieceTokenizerOutType::kInt)
|
||||
.export_values();
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(SPieceTokenizerLoadType, 0, ([](const py::module *m) {
|
||||
(void)py::enum_<SPieceTokenizerLoadType>(*m, "SPieceTokenizerLoadType", py::arithmetic())
|
||||
.value("DE_SPIECE_TOKENIZER_LOAD_KFILE", SPieceTokenizerLoadType::kFile)
|
||||
.value("DE_SPIECE_TOKENIZER_LOAD_KMODEL", SPieceTokenizerLoadType::kModel)
|
||||
.export_values();
|
||||
}));
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@@ -0,0 +1,267 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "pybind11/pybind11.h"
|
||||
#include "pybind11/stl.h"
|
||||
#include "pybind11/stl_bind.h"
|
||||
#include "minddata/dataset/api/python/pybind_register.h"
|
||||
#include "minddata/dataset/include/text.h"
|
||||
#include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h"
|
||||
#include "minddata/dataset/text/sentence_piece_vocab.h"
|
||||
#include "minddata/dataset/text/vocab.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
#ifdef ENABLE_ICU4C
|
||||
|
||||
PYBIND_REGISTER(
|
||||
BasicTokenizerOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::BasicTokenizerOperation, TensorOperation, std::shared_ptr<text::BasicTokenizerOperation>>(
|
||||
*m, "BasicTokenizerOperation")
|
||||
.def(py::init([](bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
|
||||
bool preserve_unused_token, bool with_offsets) {
|
||||
auto basic_tokenizer = std::make_shared<text::BasicTokenizerOperation>(
|
||||
lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets);
|
||||
THROW_IF_ERROR(basic_tokenizer->ValidateParams());
|
||||
return basic_tokenizer;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(
|
||||
BertTokenizerOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::BertTokenizerOperation, TensorOperation, std::shared_ptr<text::BertTokenizerOperation>>(
|
||||
*m, "BertTokenizerOperation")
|
||||
.def(py::init([](const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
|
||||
int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case,
|
||||
bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
|
||||
bool with_offsets) {
|
||||
auto bert_tokenizer = std::make_shared<text::BertTokenizerOperation>(
|
||||
vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case, keep_whitespace, normalize_form,
|
||||
preserve_unused_token, with_offsets);
|
||||
THROW_IF_ERROR(bert_tokenizer->ValidateParams());
|
||||
return bert_tokenizer;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(CaseFoldOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::CaseFoldOperation, TensorOperation, std::shared_ptr<text::CaseFoldOperation>>(
|
||||
*m, "CaseFoldOperation")
|
||||
.def(py::init([]() {
|
||||
auto case_fold = std::make_shared<text::CaseFoldOperation>();
|
||||
THROW_IF_ERROR(case_fold->ValidateParams());
|
||||
return case_fold;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(
|
||||
NormalizeUTF8Operation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::NormalizeUTF8Operation, TensorOperation, std::shared_ptr<text::NormalizeUTF8Operation>>(
|
||||
*m, "NormalizeUTF8Operation")
|
||||
.def(py::init([](NormalizeForm normalize_form) {
|
||||
auto normalize_utf8 = std::make_shared<text::NormalizeUTF8Operation>(normalize_form);
|
||||
THROW_IF_ERROR(normalize_utf8->ValidateParams());
|
||||
return normalize_utf8;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(
|
||||
RegexReplaceOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::RegexReplaceOperation, TensorOperation, std::shared_ptr<text::RegexReplaceOperation>>(
|
||||
*m, "RegexReplaceOperation")
|
||||
.def(py::init([](std::string pattern, std::string replace, bool replace_all) {
|
||||
auto regex_replace = std::make_shared<text::RegexReplaceOperation>(pattern, replace, replace_all);
|
||||
THROW_IF_ERROR(regex_replace->ValidateParams());
|
||||
return regex_replace;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(
|
||||
RegexTokenizerOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::RegexTokenizerOperation, TensorOperation, std::shared_ptr<text::RegexTokenizerOperation>>(
|
||||
*m, "RegexTokenizerOperation")
|
||||
.def(
|
||||
py::init([](const std::string &delim_pattern, const std::string &keep_delim_pattern, const bool &with_offsets) {
|
||||
auto regex_tokenizer =
|
||||
std::make_shared<text::RegexTokenizerOperation>(delim_pattern, keep_delim_pattern, with_offsets);
|
||||
THROW_IF_ERROR(regex_tokenizer->ValidateParams());
|
||||
return regex_tokenizer;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(UnicodeScriptTokenizerOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::UnicodeScriptTokenizerOperation, TensorOperation,
|
||||
std::shared_ptr<text::UnicodeScriptTokenizerOperation>>(
|
||||
*m, "UnicodeScriptTokenizerOperation")
|
||||
.def(py::init([](bool keep_whitespace, bool with_offsets) {
|
||||
auto unicode_script_tokenizer =
|
||||
std::make_shared<text::UnicodeScriptTokenizerOperation>(keep_whitespace, with_offsets);
|
||||
THROW_IF_ERROR(unicode_script_tokenizer->ValidateParams());
|
||||
return unicode_script_tokenizer;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(WhitespaceTokenizerOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::WhitespaceTokenizerOperation, TensorOperation,
|
||||
std::shared_ptr<text::WhitespaceTokenizerOperation>>(*m,
|
||||
"WhitespaceTokenizerOperation")
|
||||
.def(py::init([](bool with_offsets) {
|
||||
auto whitespace_tokenizer = std::make_shared<text::WhitespaceTokenizerOperation>(with_offsets);
|
||||
THROW_IF_ERROR(whitespace_tokenizer->ValidateParams());
|
||||
return whitespace_tokenizer;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(NormalizeForm, 0, ([](const py::module *m) {
|
||||
(void)py::enum_<NormalizeForm>(*m, "NormalizeForm", py::arithmetic())
|
||||
.value("DE_NORMALIZE_NONE", NormalizeForm::kNone)
|
||||
.value("DE_NORMALIZE_NFC", NormalizeForm::kNfc)
|
||||
.value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc)
|
||||
.value("DE_NORMALIZE_NFD", NormalizeForm::kNfd)
|
||||
.value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd)
|
||||
.export_values();
|
||||
}));
|
||||
#endif
|
||||
|
||||
PYBIND_REGISTER(
|
||||
JiebaTokenizerOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::JiebaTokenizerOperation, TensorOperation, std::shared_ptr<text::JiebaTokenizerOperation>>(
|
||||
*m, "JiebaTokenizerOperation")
|
||||
.def(
|
||||
py::init([](const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode, bool with_offsets) {
|
||||
auto jieba_tokenizer = std::make_shared<text::JiebaTokenizerOperation>(hmm_path, mp_path, mode, with_offsets);
|
||||
THROW_IF_ERROR(jieba_tokenizer->ValidateParams());
|
||||
return jieba_tokenizer;
|
||||
}))
|
||||
.def("add_word", [](text::JiebaTokenizerOperation &self, const std::string word, int64_t freq) {
|
||||
THROW_IF_ERROR(self.AddWord(word, freq));
|
||||
});
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(LookupOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::LookupOperation, TensorOperation, std::shared_ptr<text::LookupOperation>>(
|
||||
*m, "LookupOperation")
|
||||
.def(py::init([](const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
|
||||
const std::string &data_type) {
|
||||
auto lookup = std::make_shared<text::LookupOperation>(vocab, unknown_token, data_type);
|
||||
THROW_IF_ERROR(lookup->ValidateParams());
|
||||
return lookup;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(NgramOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::NgramOperation, TensorOperation, std::shared_ptr<text::NgramOperation>>(
|
||||
*m, "NgramOperation")
|
||||
.def(
|
||||
py::init([](const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
|
||||
const std::pair<std::string, int32_t> &right_pad, const std::string &separator) {
|
||||
auto ngram = std::make_shared<text::NgramOperation>(ngrams, left_pad, right_pad, separator);
|
||||
THROW_IF_ERROR(ngram->ValidateParams());
|
||||
return ngram;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(
|
||||
SentencePieceTokenizerOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::SentencePieceTokenizerOperation, TensorOperation,
|
||||
std::shared_ptr<text::SentencePieceTokenizerOperation>>(*m, "SentencePieceTokenizerOperation")
|
||||
.def(py::init([](const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type) {
|
||||
auto SentencePieceTokenizer = std::make_shared<text::SentencePieceTokenizerOperation>(vocab, out_type);
|
||||
THROW_IF_ERROR(SentencePieceTokenizer->ValidateParams());
|
||||
return SentencePieceTokenizer;
|
||||
}))
|
||||
.def(py::init([](const std::string &vocab_path, SPieceTokenizerOutType out_type) {
|
||||
auto sentence_piece_tokenizer = std::make_shared<text::SentencePieceTokenizerOperation>(vocab_path, out_type);
|
||||
THROW_IF_ERROR(sentence_piece_tokenizer->ValidateParams());
|
||||
return sentence_piece_tokenizer;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(
|
||||
SlidingWindowOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::SlidingWindowOperation, TensorOperation, std::shared_ptr<text::SlidingWindowOperation>>(
|
||||
*m, "SlidingWindowOperation")
|
||||
.def(py::init([](const int32_t width, const int32_t axis) {
|
||||
auto sliding_window = std::make_shared<text::SlidingWindowOperation>(width, axis);
|
||||
THROW_IF_ERROR(sliding_window->ValidateParams());
|
||||
return sliding_window;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(ToNumberOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::ToNumberOperation, TensorOperation, std::shared_ptr<text::ToNumberOperation>>(
|
||||
*m, "ToNumberOperation")
|
||||
.def(py::init([](std::string data_type) {
|
||||
auto to_number = std::make_shared<text::ToNumberOperation>(data_type);
|
||||
THROW_IF_ERROR(to_number->ValidateParams());
|
||||
return to_number;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(TruncateSequencePairOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::TruncateSequencePairOperation, TensorOperation,
|
||||
std::shared_ptr<text::TruncateSequencePairOperation>>(
|
||||
*m, "TruncateSequencePairOperation")
|
||||
.def(py::init([](int32_t max_length) {
|
||||
auto truncate_sequence_pair = std::make_shared<text::TruncateSequencePairOperation>(max_length);
|
||||
THROW_IF_ERROR(truncate_sequence_pair->ValidateParams());
|
||||
return truncate_sequence_pair;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(UnicodeCharTokenizerOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<text::UnicodeCharTokenizerOperation, TensorOperation,
|
||||
std::shared_ptr<text::UnicodeCharTokenizerOperation>>(
|
||||
*m, "UnicodeCharTokenizerOperation")
|
||||
.def(py::init([](bool with_offsets) {
|
||||
auto unicode_char_tokenizer = std::make_shared<text::UnicodeCharTokenizerOperation>(with_offsets);
|
||||
THROW_IF_ERROR(unicode_char_tokenizer->ValidateParams());
|
||||
return unicode_char_tokenizer;
|
||||
}));
|
||||
}));
|
||||
|
||||
// TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++
|
||||
PYBIND_REGISTER(WordpieceTokenizerOp, 1, ([](const py::module *m) {
|
||||
(void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
|
||||
*m, "WordpieceTokenizerOp")
|
||||
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &,
|
||||
const bool &>());
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(JiebaMode, 0, ([](const py::module *m) {
|
||||
(void)py::enum_<JiebaMode>(*m, "JiebaMode", py::arithmetic())
|
||||
.value("DE_JIEBA_MIX", JiebaMode::kMix)
|
||||
.value("DE_JIEBA_MP", JiebaMode::kMp)
|
||||
.value("DE_JIEBA_HMM", JiebaMode::kHmm)
|
||||
.export_values();
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(SPieceTokenizerLoadType, 0, ([](const py::module *m) {
|
||||
(void)py::enum_<SPieceTokenizerLoadType>(*m, "SPieceTokenizerLoadType", py::arithmetic())
|
||||
.value("DE_SPIECE_TOKENIZER_LOAD_KFILE", SPieceTokenizerLoadType::kFile)
|
||||
.value("DE_SPIECE_TOKENIZER_LOAD_KMODEL", SPieceTokenizerLoadType::kModel)
|
||||
.export_values();
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(SPieceTokenizerOutType, 0, ([](const py::module *m) {
|
||||
(void)py::enum_<SPieceTokenizerOutType>(*m, "SPieceTokenizerOutType", py::arithmetic())
|
||||
.value("DE_SPIECE_TOKENIZER_OUTTYPE_KString", SPieceTokenizerOutType::kString)
|
||||
.value("DE_SPIECE_TOKENIZER_OUTTYPE_KINT", SPieceTokenizerOutType::kInt)
|
||||
.export_values();
|
||||
}));
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
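The Python transforms updated later in this commit build these IR nodes lazily through a parse() method instead of subclassing the pybind TensorOp classes. A minimal usage sketch, assuming a MindSpore build that includes this change:

import mindspore.dataset.text as text

tokenizer = text.UnicodeCharTokenizer(with_offsets=False)
ir_node = tokenizer.parse()  # constructs the C++ IR node, here cde.UnicodeCharTokenizerOperation(False)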
@@ -314,9 +314,31 @@ Status JiebaTokenizerOperation::ValidateParams() {
std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() {
  std::shared_ptr<JiebaTokenizerOp> tensor_op =
    std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_);
  for (auto &word : words_list_) {
    Status rc = tensor_op->AddWord(word.first, word.second);
    if (rc.IsError()) {
      MS_LOG(ERROR) << rc;
      return {};
    }
  }
  return tensor_op;
}

Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) {
  if (word.empty()) {
    std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  if (freq < 0) {
    std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  words_list_.emplace_back(word, freq);
  return Status::OK();
}

// LookupOperation
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
                                 const std::string &data_type)

@@ -330,12 +352,13 @@ Status LookupOperation::ValidateParams() {
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  default_id_ = vocab_->Lookup(unknown_token_);
  if (default_id_ == Vocab::kNoTokenExists) {
    std::string err_msg = "Lookup: \"" + unknown_token_ + "\" doesn't exist in vocab.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  if (!unknown_token_.empty()) {
    default_id_ = vocab_->Lookup(unknown_token_);
    if (default_id_ == Vocab::kNoTokenExists) {
      std::string err_msg = "Lookup: \"" + unknown_token_ + "\" doesn't exist in vocab.";
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
  }

  if (!IsTypeNumeric(data_type_)) {
@@ -331,11 +331,14 @@ class JiebaTokenizerOperation : public TensorOperation {

  std::string Name() const override { return kJiebaTokenizerOperation; }

  Status AddWord(const std::string &word, int64_t freq = 0);

 private:
  std::string hmm_path_;
  std::string mp_path_;
  JiebaMode mode_;
  bool with_offsets_;
  std::vector<std::pair<std::string, int64_t>> words_list_;
};

class LookupOperation : public TensorOperation {
@@ -383,3 +383,7 @@ def check_tensor_op(param, param_name):
    """check whether param is a tensor op or a callable Python function"""
    if not isinstance(param, cde.TensorOp) and not callable(param) and not getattr(param, 'parse', None):
        raise TypeError("{0} is neither a c_transform op (TensorOperation) nor a callable pyfunc.".format(param_name))


def replace_none(value, default):
    return value if value is not None else default
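For orientation, the updated check above now admits three kinds of argument: a pybind TensorOp, a plain Python callable, or any object that exposes a parse() method (the decoupled transforms below). A hypothetical sketch of the three outcomes, not taken from the diff:

class MyOp:
    def parse(self):
        return None  # any object with a parse() method passes the check

check_tensor_op(MyOp(), "operations")           # exposes parse() -> accepted
check_tensor_op(lambda col: col, "operations")  # plain callable -> accepted
check_tensor_op("not an op", "operations")      # neither -> raises TypeError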
@@ -55,9 +55,10 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che
    check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset, \
    check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \
    check_random_dataset, check_split, check_bucket_batch_by_length, check_cluedataset, check_save, check_csvdataset, \
    check_paddeddataset, check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send, replace_none
    check_paddeddataset, check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send
from ..core.config import get_callback_timeout, _init_device_info
from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist
from ..core.validator_helpers import replace_none

try:
    context = import_module("mindspore.context")
@@ -372,7 +373,7 @@ class Dataset:
        Args:
            condition_name (str): The condition name that is used to toggle sending next row.
            num_batch (int): the number of batches without blocking at the start of each epoch.
            callback (function): The callback funciton that will be invoked when sync_update is called.
            callback (function): The callback function that will be invoked when sync_update is called.

        Returns:
            SyncWaitDataset, dataset added a blocking condition.

@@ -398,7 +399,7 @@ class Dataset:

        1. Make a shuffle buffer that contains the first buffer_size rows.
        2. Randomly select an element from the shuffle buffer to be the next row
           propogated to the child node.
           propagated to the child node.
        3. Get the next row (if any) from the parent node and put it in the shuffle buffer.
        4. Repeat steps 2 and 3 until there are no more rows left in the shuffle buffer.


@@ -1718,7 +1719,7 @@ class MappableDataset(SourceDataset):
                - The sum of split sizes < K, the difference will be added to the first split.

                - The sum of split sizes > K, the difference will be removed from the first large
                  enough split such that it will have atleast 1 row after removing the difference.
                  enough split such that it will have at least 1 row after removing the difference.

            randomize (bool, optional): Determines whether or not to split the data randomly (default=True).
                If True, the data will be randomly split. Otherwise, each split will be created with
@@ -1323,7 +1323,3 @@ def check_to_device_send(method):
        return method(self, *args, **kwargs)

    return new_method


def replace_none(value, default):
    return value if value is not None else default
@@ -58,9 +58,13 @@ from .validators import check_lookup, check_jieba_add_dict, \
    check_wordpiece_tokenizer, check_regex_tokenizer, check_basic_tokenizer, check_ngram, check_pair_truncate, \
    check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow
from ..core.datatypes import mstype_to_detype
from ..core.validator_helpers import replace_none


class TextTensorOperation:
    def parse(self):
        raise NotImplementedError("TextTensorOperation has to implement parse method.")


class Lookup(cde.LookupOp):
class Lookup(TextTensorOperation):
    """
    Lookup operator that looks up a word to an id.

@@ -82,10 +86,15 @@ class Lookup(cde.LookupOp):

    @check_lookup
    def __init__(self, vocab, unknown_token=None, data_type=mstype.int32):
        super().__init__(vocab, unknown_token, mstype_to_detype(data_type))
        self.vocab = vocab
        self.unknown_token = replace_none(unknown_token, '')
        self.data_type = data_type

    def parse(self):
        return cde.LookupOperation(self.vocab, self.unknown_token, str(mstype_to_detype(self.data_type)))
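A hedged illustration of the new Lookup flow (the vocab built here is only a placeholder):

import mindspore.common.dtype as mstype
import mindspore.dataset.text as text

vocab = text.Vocab.from_list(["hello", "world", "<unk>"])
op = text.Lookup(vocab, unknown_token="<unk>", data_type=mstype.int32)
ir_node = op.parse()  # builds cde.LookupOperation; with unknown_token=None an empty string is passed down and the vocab check is skipped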


class SlidingWindow(cde.SlidingWindowOp):
class SlidingWindow(TextTensorOperation):
    """
    TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension axis
    is a slice of data starting at the corresponding position, with a specified width.

@@ -114,10 +123,14 @@ class SlidingWindow(cde.SlidingWindowOp):

    @check_slidingwindow
    def __init__(self, width, axis=0):
        super().__init__(width, axis)
        self.width = width
        self.axis = axis

    def parse(self):
        return cde.SlidingWindowOperation(self.width, self.axis)


class Ngram(cde.NgramOp):
class Ngram(TextTensorOperation):
    """
    TensorOp to generate n-gram from a 1-D string Tensor.

@@ -145,7 +158,13 @@ class Ngram(cde.NgramOp):

    @check_ngram
    def __init__(self, n, left_pad=("", 0), right_pad=("", 0), separator=" "):
        super().__init__(n, left_pad[1], right_pad[1], left_pad[0], right_pad[0], separator)
        self.ngrams = n
        self.left_pad = left_pad
        self.right_pad = right_pad
        self.separator = separator

    def parse(self):
        return cde.NgramOperation(self.ngrams, self.left_pad, self.right_pad, self.separator)


DE_C_INTER_JIEBA_MODE = {

@@ -155,7 +174,7 @@ DE_C_INTER_JIEBA_MODE = {
}


class JiebaTokenizer(cde.JiebaTokenizerOp):
class JiebaTokenizer(TextTensorOperation):
    """
    Tokenize Chinese string into words based on dictionary.

@@ -196,11 +215,19 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):

        self.mode = mode
        self.__check_path__(hmm_path)
        self.hmm_path = hmm_path
        self.__check_path__(mp_path)
        self.mp_path = mp_path
        self.with_offsets = with_offsets
        super().__init__(hmm_path, mp_path,
                         DE_C_INTER_JIEBA_MODE[mode],
                         self.with_offsets)
        self.words = []

    def parse(self):
        jieba_tokenizer = cde.JiebaTokenizerOperation(self.hmm_path, self.mp_path,
                                                      DE_C_INTER_JIEBA_MODE[self.mode],
                                                      self.with_offsets)
        for word in self.words:
            jieba_tokenizer.add_word(word[0], word[1])
        return jieba_tokenizer

    @check_jieba_add_word
    def add_word(self, word, freq=None):

@@ -225,9 +252,9 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
        """

        if freq is None:
            super().add_word(word, 0)
            self.words.append((word, 0))
        else:
            super().add_word(word, freq)
            self.words.append((word, freq))
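Note the behaviour change sketched below: add_word no longer calls into C++ immediately; the words are recorded on the Python object and only replayed when parse() builds the IR node. The file paths are placeholders:

import mindspore.dataset.text as text
from mindspore.dataset.text import JiebaMode

HMM_FILE = "/path/to/hmm_model.utf8"
MP_FILE = "/path/to/jieba.dict.utf8"

jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
jieba_op.add_word("男默女泪")   # appended to self.words only
ir_node = jieba_op.parse()      # add_word is forwarded to the C++ JiebaTokenizerOperation here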

    @check_jieba_add_dict
    def add_dict(self, user_dict):
@ -308,7 +335,7 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
|
|||
" jieba mode file {} is not exist.".format(model_path))
|
||||
|
||||
|
||||
class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
|
||||
class UnicodeCharTokenizer(TextTensorOperation):
|
||||
"""
|
||||
Tokenize a scalar tensor of UTF-8 string to Unicode characters.
|
||||
|
||||
|
@ -332,9 +359,12 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
|
|||
@check_with_offsets
|
||||
def __init__(self, with_offsets=False):
|
||||
self.with_offsets = with_offsets
|
||||
super().__init__(self.with_offsets)
|
||||
|
||||
def parse(self):
|
||||
return cde.UnicodeCharTokenizerOperation(self.with_offsets)
|
||||
|
||||
|
||||
# TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++
|
||||
class WordpieceTokenizer(cde.WordpieceTokenizerOp):
|
||||
"""
|
||||
Tokenize scalar token or 1-D tokens to 1-D subword tokens.
|
||||
|
@ -386,7 +416,7 @@ DE_C_INTER_SENTENCEPIECE_OUTTYPE = {
|
|||
}
|
||||
|
||||
|
||||
class SentencePieceTokenizer(cde.SentencePieceTokenizerOp):
|
||||
class SentencePieceTokenizer(TextTensorOperation):
|
||||
"""
|
||||
Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
|
||||
|
||||
|
@ -404,19 +434,15 @@ class SentencePieceTokenizer(cde.SentencePieceTokenizerOp):
|
|||
"""
|
||||
|
||||
def __init__(self, mode, out_type):
|
||||
self.mode = mode
|
||||
self.out_type = out_type
|
||||
if isinstance(mode, str):
|
||||
model_path, model_filename = os.path.split(mode)
|
||||
super().__init__(model_path, model_filename,
|
||||
DE_C_INTER_SENTENCEPIECE_LOADTYPE[SPieceTokenizerLoadType.FILE],
|
||||
DE_C_INTER_SENTENCEPIECE_OUTTYPE[out_type])
|
||||
elif isinstance(mode, cde.SentencePieceVocab):
|
||||
super().__init__(mode, DE_C_INTER_SENTENCEPIECE_LOADTYPE[SPieceTokenizerLoadType.MODEL],
|
||||
DE_C_INTER_SENTENCEPIECE_OUTTYPE[out_type])
|
||||
|
||||
def parse(self):
|
||||
return cde.SentencePieceTokenizerOperation(self.mode, DE_C_INTER_SENTENCEPIECE_OUTTYPE[self.out_type])
|
||||
|
||||
|
||||
if platform.system().lower() != 'windows':
|
||||
class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
|
||||
class WhitespaceTokenizer(TextTensorOperation):
|
||||
"""
|
||||
Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces, such as: ' ', '\\\\t', '\\\\r', '\\\\n'.
|
||||
|
||||
|
@ -444,10 +470,12 @@ if platform.system().lower() != 'windows':
|
|||
@check_with_offsets
|
||||
def __init__(self, with_offsets=False):
|
||||
self.with_offsets = with_offsets
|
||||
super().__init__(self.with_offsets)
|
||||
|
||||
def parse(self):
|
||||
return cde.WhitespaceTokenizerOperation(self.with_offsets)
|
||||
|
||||
|
||||
class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp):
|
||||
class UnicodeScriptTokenizer(TextTensorOperation):
|
||||
"""
|
||||
Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
|
||||
|
||||
|
@ -475,12 +503,16 @@ if platform.system().lower() != 'windows':
|
|||
|
||||
@check_unicode_script_tokenizer
|
||||
def __init__(self, keep_whitespace=False, with_offsets=False):
|
||||
keep_whitespace = replace_none(keep_whitespace, False)
|
||||
with_offsets = replace_none(with_offsets, False)
|
||||
self.keep_whitespace = keep_whitespace
|
||||
self.with_offsets = with_offsets
|
||||
super().__init__(self.keep_whitespace, self.with_offsets)
|
||||
|
||||
def parse(self):
|
||||
return cde.UnicodeScriptTokenizerOperation(self.keep_whitespace, self.with_offsets)
|
||||
|
||||
|
||||
class CaseFold(cde.CaseFoldOp):
|
||||
class CaseFold(TextTensorOperation):
|
||||
"""
|
||||
Apply case fold operation on UTF-8 string tensor.
|
||||
|
||||
|
@ -494,6 +526,9 @@ if platform.system().lower() != 'windows':
|
|||
>>> data1 = data1.map(operations=case_op)
|
||||
"""
|
||||
|
||||
def parse(self):
|
||||
return cde.CaseFoldOperation()
|
||||
|
||||
|
||||
DE_C_INTER_NORMALIZE_FORM = {
|
||||
NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE,
|
||||
|
@ -504,7 +539,7 @@ if platform.system().lower() != 'windows':
|
|||
}
|
||||
|
||||
|
||||
class NormalizeUTF8(cde.NormalizeUTF8Op):
|
||||
class NormalizeUTF8(TextTensorOperation):
|
||||
"""
|
||||
Apply normalize operation on UTF-8 string tensor.
|
||||
|
||||
|
@ -534,11 +569,14 @@ if platform.system().lower() != 'windows':
|
|||
if not isinstance(normalize_form, NormalizeForm):
|
||||
raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.")
|
||||
|
||||
normalize_form = replace_none(normalize_form, NormalizeForm.NFKC)
|
||||
self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form]
|
||||
super().__init__(self.normalize_form)
|
||||
|
||||
def parse(self):
|
||||
return cde.NormalizeUTF8Operation(self.normalize_form)
|
||||
|
||||
|
||||
class RegexReplace(cde.RegexReplaceOp):
|
||||
class RegexReplace(TextTensorOperation):
|
||||
"""
|
||||
Replace UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
|
||||
|
||||
|
@ -566,10 +604,12 @@ if platform.system().lower() != 'windows':
|
|||
self.pattern = pattern
|
||||
self.replace = replace
|
||||
self.replace_all = replace_all
|
||||
super().__init__(self.pattern, self.replace, self.replace_all)
|
||||
|
||||
def parse(self):
|
||||
return cde.RegexReplaceOperation(self.pattern, self.replace, self.replace_all)
|
||||
|
||||
|
||||
class RegexTokenizer(cde.RegexTokenizerOp):
|
||||
class RegexTokenizer(TextTensorOperation):
|
||||
"""
|
||||
Tokenize a scalar tensor of UTF-8 string by regex expression pattern.
|
||||
|
||||
|
@ -606,10 +646,12 @@ if platform.system().lower() != 'windows':
|
|||
self.delim_pattern = delim_pattern
|
||||
self.keep_delim_pattern = keep_delim_pattern
|
||||
self.with_offsets = with_offsets
|
||||
super().__init__(self.delim_pattern, self.keep_delim_pattern, self.with_offsets)
|
||||
|
||||
def parse(self):
|
||||
return cde.RegexTokenizerOperation(self.delim_pattern, self.keep_delim_pattern, self.with_offsets)
|
||||
|
||||
|
||||
class BasicTokenizer(cde.BasicTokenizerOp):
|
||||
class BasicTokenizer(TextTensorOperation):
|
||||
"""
|
||||
Tokenize a scalar tensor of UTF-8 string by specific rules.
|
||||
|
||||
|
@ -661,11 +703,13 @@ if platform.system().lower() != 'windows':
|
|||
self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
|
||||
self.preserve_unused_token = preserve_unused_token
|
||||
self.with_offsets = with_offsets
|
||||
super().__init__(self.lower_case, self.keep_whitespace, self.normalization_form,
|
||||
self.preserve_unused_token, self.with_offsets)
|
||||
|
||||
def parse(self):
|
||||
return cde.BasicTokenizerOperation(self.lower_case, self.keep_whitespace, self.normalization_form,
|
||||
self.preserve_unused_token, self.with_offsets)
|
||||
|
||||
|
||||
class BertTokenizer(cde.BertTokenizerOp):
|
||||
class BertTokenizer(TextTensorOperation):
|
||||
"""
|
||||
Tokenizer used for Bert text process.
|
||||
|
||||
|
@ -725,12 +769,14 @@ if platform.system().lower() != 'windows':
|
|||
self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
|
||||
self.preserve_unused_token = preserve_unused_token
|
||||
self.with_offsets = with_offsets
|
||||
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token,
|
||||
self.lower_case, self.keep_whitespace, self.normalization_form,
|
||||
self.preserve_unused_token, self.with_offsets)
|
||||
|
||||
def parse(self):
|
||||
return cde.BertTokenizerOperation(self.vocab, self.suffix_indicator, self.max_bytes_per_token,
|
||||
self.unknown_token, self.lower_case, self.keep_whitespace,
|
||||
self.normalization_form, self.preserve_unused_token, self.with_offsets)
|
||||
|
||||
|
||||
class TruncateSequencePair(cde.TruncateSequencePairOp):
|
||||
class TruncateSequencePair(TextTensorOperation):
|
||||
"""
|
||||
Truncate a pair of rank-1 tensors such that the total length is less than max_length.
|
||||
|
||||
|
@ -757,10 +803,13 @@ class TruncateSequencePair(cde.TruncateSequencePairOp):
|
|||
|
||||
@check_pair_truncate
|
||||
def __init__(self, max_length):
|
||||
super().__init__(max_length)
|
||||
self.max_length = max_length
|
||||
|
||||
def parse(self):
|
||||
return cde.TruncateSequencePairOperation(self.max_length)
|
||||
|
||||
|
||||
class ToNumber(cde.ToNumberOp):
|
||||
class ToNumber(TextTensorOperation):
|
||||
"""
|
||||
Tensor operation to convert every element of a string tensor to a number.
|
||||
|
||||
|
@ -789,7 +838,9 @@ class ToNumber(cde.ToNumberOp):
|
|||
def __init__(self, data_type):
|
||||
data_type = mstype_to_detype(data_type)
|
||||
self.data_type = str(data_type)
|
||||
super().__init__(data_type)
|
||||
|
||||
def parse(self):
|
||||
return cde.ToNumberOperation(self.data_type)
|
||||
|
||||
|
||||
class PythonTokenizer:
|
||||
|
|
|
@ -81,11 +81,11 @@ def parse_padding(padding):
|
|||
padding = tuple(padding)
|
||||
return padding
|
||||
|
||||
class TensorOperation:
|
||||
class ImageTensorOperation:
|
||||
def parse(self):
|
||||
raise NotImplementedError("TensorOperation has to implement parse method.")
|
||||
raise NotImplementedError("ImageTensorOperation has to implement parse method.")
|
||||
|
||||
class AutoContrast(TensorOperation):
|
||||
class AutoContrast(ImageTensorOperation):
|
||||
"""
|
||||
Apply automatic contrast on input image.
|
||||
|
||||
|
@ -112,7 +112,7 @@ class AutoContrast(TensorOperation):
|
|||
return cde.AutoContrastOperation(self.cutoff, self.ignore)
|
||||
|
||||
|
||||
class RandomSharpness(TensorOperation):
|
||||
class RandomSharpness(ImageTensorOperation):
|
||||
"""
|
||||
Adjust the sharpness of the input image by a fixed or random degree. Degree of 0.0 gives a blurred image,
|
||||
degree of 1.0 gives the original image, and degree of 2.0 gives a sharpened image.
|
||||
|
@ -140,7 +140,7 @@ class RandomSharpness(TensorOperation):
|
|||
return cde.RandomSharpnessOperation(self.degrees)
|
||||
|
||||
|
||||
class Equalize(TensorOperation):
|
||||
class Equalize(ImageTensorOperation):
|
||||
"""
|
||||
Apply histogram equalization on input image.
|
||||
|
||||
|
@ -153,7 +153,7 @@ class Equalize(TensorOperation):
|
|||
return cde.EqualizeOperation()
|
||||
|
||||
|
||||
class Invert(TensorOperation):
|
||||
class Invert(ImageTensorOperation):
|
||||
"""
|
||||
Apply invert on input image in RGB mode.
|
||||
|
||||
|
@ -166,7 +166,7 @@ class Invert(TensorOperation):
|
|||
return cde.InvertOperation()
|
||||
|
||||
|
||||
class Decode(TensorOperation):
|
||||
class Decode(ImageTensorOperation):
|
||||
"""
|
||||
Decode the input image in RGB mode.
|
||||
|
||||
|
@ -203,7 +203,7 @@ class Decode(TensorOperation):
|
|||
return cde.DecodeOperation(self.rgb)
|
||||
|
||||
|
||||
class CutMixBatch(TensorOperation):
|
||||
class CutMixBatch(ImageTensorOperation):
|
||||
"""
|
||||
Apply CutMix transformation on input batch of images and labels.
|
||||
Note that you need to make labels into one-hot format and batch before calling this function.
|
||||
|
@ -235,7 +235,7 @@ class CutMixBatch(TensorOperation):
|
|||
return cde.CutMixBatchOperation(DE_C_IMAGE_BATCH_FORMAT[self.image_batch_format], self.alpha, self.prob)
|
||||
|
||||
|
||||
class CutOut(TensorOperation):
|
||||
class CutOut(ImageTensorOperation):
|
||||
"""
|
||||
Randomly cut (mask) out a given number of square patches from the input NumPy image array.
|
||||
|
||||
|
@ -258,7 +258,7 @@ class CutOut(TensorOperation):
|
|||
return cde.CutOutOperation(self.length, self.num_patches)
|
||||
|
||||
|
||||
class MixUpBatch(TensorOperation):
|
||||
class MixUpBatch(ImageTensorOperation):
|
||||
"""
|
||||
Apply MixUp transformation on input batch of images and labels. Each image is multiplied by a random weight (lambda)
|
||||
and then added to a randomly selected image from the batch multiplied by (1 - lambda). The same formula is also
|
||||
|
@ -286,7 +286,7 @@ class MixUpBatch(TensorOperation):
|
|||
return cde.MixUpBatchOperation(self.alpha)
|
||||
|
||||
|
||||
class Normalize(TensorOperation):
|
||||
class Normalize(ImageTensorOperation):
|
||||
"""
|
||||
Normalize the input image with respect to mean and standard deviation.
|
||||
|
||||
|
@ -333,7 +333,7 @@ class Normalize(TensorOperation):
|
|||
return cde.NormalizeOperation(self.mean, self.std)
|
||||
|
||||
|
||||
class NormalizePad(TensorOperation):
|
||||
class NormalizePad(ImageTensorOperation):
|
||||
"""
|
||||
Normalize the input image with respect to mean and standard deviation then pad an extra channel with value zero.
|
||||
|
||||
|
@ -380,7 +380,7 @@ class NormalizePad(TensorOperation):
|
|||
return cde.NormalizePadOperation(self.mean, self.std, self.dtype)
|
||||
|
||||
|
||||
class RandomAffine(TensorOperation):
|
||||
class RandomAffine(ImageTensorOperation):
|
||||
"""
|
||||
Apply Random affine transformation to the input image.
|
||||
|
||||
|
@ -486,7 +486,7 @@ class RandomAffine(TensorOperation):
|
|||
self.fill_value)
|
||||
|
||||
|
||||
class RandomCrop(TensorOperation):
|
||||
class RandomCrop(ImageTensorOperation):
|
||||
"""
|
||||
Crop the input image at a random location.
|
||||
|
||||
|
@ -551,7 +551,7 @@ class RandomCrop(TensorOperation):
|
|||
return cde.RandomCropOperation(self.size, self.padding, self.pad_if_needed, self.fill_value, border_type)
|
||||
|
||||
|
||||
class RandomCropWithBBox(TensorOperation):
|
||||
class RandomCropWithBBox(ImageTensorOperation):
|
||||
"""
|
||||
Crop the input image at a random location and adjust bounding boxes accordingly.
|
||||
|
||||
|
@ -615,7 +615,7 @@ class RandomCropWithBBox(TensorOperation):
|
|||
border_type)
|
||||
|
||||
|
||||
class RandomHorizontalFlip(TensorOperation):
|
||||
class RandomHorizontalFlip(ImageTensorOperation):
|
||||
"""
|
||||
Flip the input image horizontally, randomly with a given probability.
|
||||
|
||||
|
@ -636,7 +636,7 @@ class RandomHorizontalFlip(TensorOperation):
|
|||
return cde.RandomHorizontalFlipOperation(self.prob)
|
||||
|
||||
|
||||
class RandomHorizontalFlipWithBBox(TensorOperation):
|
||||
class RandomHorizontalFlipWithBBox(ImageTensorOperation):
|
||||
"""
|
||||
Flip the input image horizontally, randomly with a given probability and adjust bounding boxes accordingly.
|
||||
|
||||
|
@ -657,7 +657,7 @@ class RandomHorizontalFlipWithBBox(TensorOperation):
|
|||
return cde.RandomHorizontalFlipWithBBoxOperation(self.prob)
|
||||
|
||||
|
||||
class RandomPosterize(TensorOperation):
|
||||
class RandomPosterize(ImageTensorOperation):
|
||||
"""
|
||||
Reduce the number of bits for each color channel.
|
||||
|
||||
|
@ -685,7 +685,7 @@ class RandomPosterize(TensorOperation):
|
|||
return cde.RandomPosterizeOperation(bits)
|
||||
|
||||
|
||||
class RandomVerticalFlip(TensorOperation):
|
||||
class RandomVerticalFlip(ImageTensorOperation):
|
||||
"""
|
||||
Flip the input image vertically, randomly with a given probability.
|
||||
|
||||
|
@ -706,7 +706,7 @@ class RandomVerticalFlip(TensorOperation):
|
|||
return cde.RandomVerticalFlipOperation(self.prob)
|
||||
|
||||
|
||||
class RandomVerticalFlipWithBBox(TensorOperation):
|
||||
class RandomVerticalFlipWithBBox(ImageTensorOperation):
|
||||
"""
|
||||
Flip the input image vertically, randomly with a given probability and adjust bounding boxes accordingly.
|
||||
|
||||
|
@ -727,7 +727,7 @@ class RandomVerticalFlipWithBBox(TensorOperation):
|
|||
return cde.RandomVerticalFlipWithBBoxOperation(self.prob)
|
||||
|
||||
|
||||
class BoundingBoxAugment(TensorOperation):
|
||||
class BoundingBoxAugment(ImageTensorOperation):
|
||||
"""
|
||||
Apply a given image transform on a random selection of bounding box regions of a given image.
|
||||
|
||||
|
@ -760,7 +760,7 @@ class BoundingBoxAugment(TensorOperation):
|
|||
return cde.BoundingBoxAugmentOperation(transform, self.ratio)
|
||||
|
||||
|
||||
class Resize(TensorOperation):
|
||||
class Resize(ImageTensorOperation):
|
||||
"""
|
||||
Resize the input image to the given size.
|
||||
|
||||
|
@ -816,7 +816,7 @@ class Resize(TensorOperation):
|
|||
return cde.ResizeOperation(self.size, DE_C_INTER_MODE[self.interpolation])
|
||||
|
||||
|
||||
class ResizeWithBBox(TensorOperation):
|
||||
class ResizeWithBBox(ImageTensorOperation):
|
||||
"""
|
||||
Resize the input image to the given size and adjust bounding boxes accordingly.
|
||||
|
||||
|
@ -855,7 +855,7 @@ class ResizeWithBBox(TensorOperation):
|
|||
return cde.ResizeWithBBoxOperation(size, DE_C_INTER_MODE[self.interpolation])
|
||||
|
||||
|
||||
class RandomResizedCropWithBBox(TensorOperation):
|
||||
class RandomResizedCropWithBBox(ImageTensorOperation):
|
||||
"""
|
||||
Crop the input image to a random size and aspect ratio and adjust bounding boxes accordingly.
|
||||
|
||||
|
@ -904,7 +904,7 @@ class RandomResizedCropWithBBox(TensorOperation):
|
|||
DE_C_INTER_MODE[self.interpolation], self.max_attempts)
|
||||
|
||||
|
||||
class RandomResizedCrop(TensorOperation):
|
||||
class RandomResizedCrop(ImageTensorOperation):
|
||||
"""
|
||||
Crop the input image to a random size and aspect ratio.
|
||||
|
||||
|
@ -954,7 +954,7 @@ class RandomResizedCrop(TensorOperation):
|
|||
self.max_attempts)
|
||||
|
||||
|
||||
class CenterCrop(TensorOperation):
|
||||
class CenterCrop(ImageTensorOperation):
|
||||
"""
|
||||
Crops the input image at the center to the given size.
|
||||
|
||||
|
@ -984,7 +984,7 @@ class CenterCrop(TensorOperation):
|
|||
return cde.CenterCropOperation(self.size)
|
||||
|
||||
|
||||
class RandomColor(TensorOperation):
|
||||
class RandomColor(ImageTensorOperation):
|
||||
"""
|
||||
Adjust the color of the input image by a fixed or random degree.
|
||||
This operation works only with 3-channel color images.
|
||||
|
@ -1008,7 +1008,7 @@ class RandomColor(TensorOperation):
|
|||
return cde.RandomColorOperation(*self.degrees)
|
||||
|
||||
|
||||
class RandomColorAdjust(TensorOperation):
|
||||
class RandomColorAdjust(ImageTensorOperation):
|
||||
"""
|
||||
Randomly adjust the brightness, contrast, saturation, and hue of the input image.
|
||||
|
||||
|
@ -1060,7 +1060,7 @@ class RandomColorAdjust(TensorOperation):
|
|||
return cde.RandomColorAdjustOperation(self.brightness, self.contrast, self.saturation, self.hue)
|
||||
|
||||
|
||||
class RandomRotation(TensorOperation):
|
||||
class RandomRotation(ImageTensorOperation):
|
||||
"""
|
||||
Rotate the input image by a random angle.
|
||||
|
||||
|
@ -1116,7 +1116,7 @@ class RandomRotation(TensorOperation):
|
|||
return cde.RandomRotationOperation(degrees, interpolation, expand, center, fill_value)
|
||||
|
||||
|
||||
class Rescale(TensorOperation):
|
||||
class Rescale(ImageTensorOperation):
|
||||
"""
|
||||
Tensor operation to rescale the input image.
|
||||
|
||||
|
@ -1155,7 +1155,7 @@ class Rescale(TensorOperation):
|
|||
return cde.RescaleOperation(self.rescale, self.shift)
|
||||
|
||||
|
||||
class RandomResize(TensorOperation):
|
||||
class RandomResize(ImageTensorOperation):
|
||||
"""
|
||||
Tensor operation to resize the input image using a randomly selected interpolation mode.
|
||||
|
||||
|
@ -1187,7 +1187,7 @@ class RandomResize(TensorOperation):
|
|||
return cde.RandomResizeOperation(size)
|
||||
|
||||
|
||||
class RandomResizeWithBBox(TensorOperation):
|
||||
class RandomResizeWithBBox(ImageTensorOperation):
|
||||
"""
|
||||
Tensor operation to resize the input image using a randomly selected interpolation mode and adjust
|
||||
bounding boxes accordingly.
|
||||
|
@ -1220,7 +1220,7 @@ class RandomResizeWithBBox(TensorOperation):
|
|||
return cde.RandomResizeWithBBoxOperation(size)
|
||||
|
||||
|
||||
class HWC2CHW(TensorOperation):
|
||||
class HWC2CHW(ImageTensorOperation):
|
||||
"""
|
||||
Transpose the input image; shape (H, W, C) to shape (C, H, W).
|
||||
|
||||
|
@ -1253,7 +1253,7 @@ class HWC2CHW(TensorOperation):
|
|||
return cde.HwcToChwOperation()
|
||||
|
||||
|
||||
class RandomCropDecodeResize(TensorOperation):
|
||||
class RandomCropDecodeResize(ImageTensorOperation):
|
||||
"""
|
||||
Equivalent to RandomResizedCrop, but crops before decodes.
|
||||
|
||||
|
@ -1305,7 +1305,7 @@ class RandomCropDecodeResize(TensorOperation):
|
|||
self.max_attempts)
|
||||
|
||||
|
||||
class Pad(TensorOperation):
|
||||
class Pad(ImageTensorOperation):
|
||||
"""
|
||||
Pads the image according to padding parameters.
|
||||
|
||||
|
@ -1370,7 +1370,7 @@ class Pad(TensorOperation):
|
|||
return img.as_array()
|
||||
|
||||
|
||||
class UniformAugment(TensorOperation):
|
||||
class UniformAugment(ImageTensorOperation):
|
||||
"""
|
||||
Tensor operation to perform randomly selected augmentation.
|
||||
|
||||
|
@ -1407,7 +1407,7 @@ class UniformAugment(TensorOperation):
|
|||
return cde.UniformAugOperation(transforms, self.num_ops)
|
||||
|
||||
|
||||
class RandomSelectSubpolicy(TensorOperation):
|
||||
class RandomSelectSubpolicy(ImageTensorOperation):
|
||||
"""
|
||||
Choose a random sub-policy from a list to be applied on the input image. A sub-policy is a list of tuples
|
||||
(op, prob), where op is a TensorOp operation and prob is the probability that this op will be applied. Once
|
||||
|
@ -1446,7 +1446,7 @@ class RandomSelectSubpolicy(TensorOperation):
|
|||
return cde.RandomSelectSubpolicyOperation(policy)
|
||||
|
||||
|
||||
class SoftDvppDecodeResizeJpeg(TensorOperation):
|
||||
class SoftDvppDecodeResizeJpeg(ImageTensorOperation):
|
||||
"""
|
||||
Tensor operation to decode and resize JPEG image using the simulation algorithm of
|
||||
Ascend series chip DVPP module.
|
||||
|
@ -1486,7 +1486,7 @@ class SoftDvppDecodeResizeJpeg(TensorOperation):
|
|||
return cde.SoftDvppDecodeResizeJpegOperation(self.size)
|
||||
|
||||
|
||||
class SoftDvppDecodeRandomCropResizeJpeg(TensorOperation):
|
||||
class SoftDvppDecodeRandomCropResizeJpeg(ImageTensorOperation):
|
||||
"""
|
||||
Tensor operation to decode, random crop and resize JPEG image using the simulation algorithm of
|
||||
Ascend series chip DVPP module.
|
||||
|
@ -1531,7 +1531,7 @@ class SoftDvppDecodeRandomCropResizeJpeg(TensorOperation):
|
|||
return cde.SoftDvppDecodeRandomCropResizeJpegOperation(self.size, self.scale, self.ratio, self.max_attempts)
|
||||
|
||||
|
||||
class RandomSolarize(TensorOperation):
|
||||
class RandomSolarize(ImageTensorOperation):
|
||||
"""
|
||||
Invert all pixel values above a threshold.
|
||||
|
||||
|
|
|
@@ -877,6 +877,229 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail) {
   EXPECT_EQ(jieba_tokenizer3, nullptr);
 }

+TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) {
+  // Testing the parameter AddWord of JiebaTokenizer when the freq is not provided (default 0).
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
+  std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
+  std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
+  std::shared_ptr<Dataset> ds = TextFile({data_file});
+  EXPECT_NE(ds, nullptr);
+
+  // Create jieba_tokenizer operation on ds
+  std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer =
+    text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp);
+  EXPECT_NE(jieba_tokenizer, nullptr);
+
+  // Add word with freq not provided (default 0)
+  jieba_tokenizer->AddWord("男默女泪");
+
+  // Create Map operation on ds
+  ds = ds->Map({jieba_tokenizer}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateFromVector(expected, &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 1);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) {
+  // Testing the parameter AddWord of JiebaTokenizer when the freq is set explicitly to 0.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord1.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
+  std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
+  std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
+  std::shared_ptr<Dataset> ds = TextFile({data_file});
+  EXPECT_NE(ds, nullptr);
+
+  // Create jieba_tokenizer operation on ds
+  std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer =
+    text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp);
+  EXPECT_NE(jieba_tokenizer, nullptr);
+
+  // Add word with freq is set explicitly to 0
+  jieba_tokenizer->AddWord("男默女泪", 0);
+
+  // Create Map operation on ds
+  ds = ds->Map({jieba_tokenizer}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateFromVector(expected, &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 1);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) {
+  // Testing the parameter AddWord of JiebaTokenizer when the freq is 10.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord2.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
+  std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
+  std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
+  std::shared_ptr<Dataset> ds = TextFile({data_file});
+  EXPECT_NE(ds, nullptr);
+
+  // Create jieba_tokenizer operation on ds
+  std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer =
+    text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp);
+  EXPECT_NE(jieba_tokenizer, nullptr);
+
+  // Add word with freq 10
+  jieba_tokenizer->AddWord("男默女泪", 10);
+
+  // Create Map operation on ds
+  ds = ds->Map({jieba_tokenizer}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateFromVector(expected, &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 1);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) {
+  // Testing the parameter AddWord of JiebaTokenizer when the freq is 20000 which affects the result of segmentation.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord3.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
+  std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
+  std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
+  std::shared_ptr<Dataset> ds = TextFile({data_file});
+  EXPECT_NE(ds, nullptr);
+
+  // Create jieba_tokenizer operation on ds
+  std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer =
+    text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp);
+  EXPECT_NE(jieba_tokenizer, nullptr);
+
+  // Add word with freq 20000
+  jieba_tokenizer->AddWord("江大桥", 20000);
+
+  // Create Map operation on ds
+  ds = ds->Map({jieba_tokenizer}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateFromVector(expected, &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 1);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) {
+  // Testing the incorrect parameter of AddWord in JiebaTokenizer.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWordFail.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
+  std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
+  std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Testing the parameter word of AddWord is empty
+  std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer =
+    text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp);
+  EXPECT_NE(jieba_tokenizer, nullptr);
+  EXPECT_NE(jieba_tokenizer->AddWord("", 10), Status::OK());
+  // Testing the parameter freq of AddWord is negative
+  std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer1 =
+    text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp);
+  EXPECT_NE(jieba_tokenizer1, nullptr);
+  EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK());
+}
+
 TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) {
   // Testing the parameter of SlidingWindow interface when the axis is 0.
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess.";
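Among the AddWord tests above, AddWord3 is the one where the added word changes the output: boosting "江大桥" to a frequency of 20000 makes the MP segmentation produce "市长" / "江大桥" for the first part of the sentence instead of "市" / "长江大桥". A Python-side sketch of the same call follows; it is an editorial illustration assuming the dataset.text JiebaTokenizer API, with placeholder dictionary and corpus paths:

    # Editorial sketch (not part of the diff): AddWord through the Python API.
    import mindspore.dataset as ds
    import mindspore.dataset.text as text
    from mindspore.dataset.text import JiebaMode

    tokenizer = text.JiebaTokenizer("/path/to/hmm_model.utf8", "/path/to/jieba.dict.utf8",
                                    mode=JiebaMode.MP)
    # A large enough freq overrides the dictionary segmentation for this word.
    tokenizer.add_word("江大桥", 20000)

    data = ds.TextFileDataset("/path/to/corpus.txt", shuffle=False)
    data = data.map(operations=tokenizer, input_columns=["text"])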
@@ -166,7 +166,8 @@ def test_lookup_cast_type():
     assert test_config("unk") == np.dtype("int32")
     # test exception, data_type isn't the correct type
     assert "tldr is not of type (<class 'mindspore._c_expression.typing.Type'>,)" in test_config("unk", "tldr")
-    assert "Lookup doesn't support string to string lookup" in test_config("w1", mstype.string)
+    assert "Lookup does not support a string to string mapping, data_type can only be numeric." in \
+        test_config("w1", mstype.string)


 if __name__ == '__main__':
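The reworded error message makes the constraint explicit: Lookup maps strings to numeric ids, so data_type must be a numeric MindSpore type. A minimal sketch of a valid configuration, assuming the dataset.text Vocab/Lookup API; it is not part of the diff and the vocabulary contents are illustrative:

    # Editorial sketch (not part of the diff): Lookup with an explicit numeric data_type.
    import mindspore.dataset.text as text
    import mindspore.common.dtype as mstype

    vocab = text.Vocab.from_list(["w1", "w2", "w3"], special_tokens=["<unk>"], special_first=True)
    lookup = text.Lookup(vocab, unknown_token="<unk>", data_type=mstype.int32)
    # Passing data_type=mstype.string would raise the error asserted in the test above.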