forked from mindspore-Ecosystem/mindspore

add offsets feature to tokenizer

parent 4bdd8e16a2
commit 47060631e5
@ -601,13 +601,14 @@ void bindTensorOps4(py::module *m) {
void bindTokenizerOps(py::module *m) {
|
||||
(void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m, "JiebaTokenizerOp", "")
|
||||
.def(py::init<const std::string, std::string, JiebaMode>(), py::arg("hmm_path"), py::arg("mp_path"),
|
||||
py::arg("mode") = JiebaMode::kMix)
|
||||
.def(py::init<const std::string &, const std::string &, const JiebaMode &, const bool &>(), py::arg("hmm_path"),
|
||||
py::arg("mp_path"), py::arg("mode") = JiebaMode::kMix,
|
||||
py::arg("with_offsets") = JiebaTokenizerOp::kDefWithOffsets)
|
||||
.def("add_word",
|
||||
[](JiebaTokenizerOp &self, const std::string word, int freq) { THROW_IF_ERROR(self.AddWord(word, freq)); });
|
||||
(void)py::class_<UnicodeCharTokenizerOp, TensorOp, std::shared_ptr<UnicodeCharTokenizerOp>>(
|
||||
*m, "UnicodeCharTokenizerOp", "Tokenize a scalar tensor of UTF-8 string to Unicode characters.")
|
||||
.def(py::init<>());
|
||||
.def(py::init<const bool &>(), py::arg("with_offsets") = UnicodeCharTokenizerOp::kDefWithOffsets);
|
||||
(void)py::class_<LookupOp, TensorOp, std::shared_ptr<LookupOp>>(*m, "LookupOp",
|
||||
"Tensor operation to LookUp each word")
|
||||
.def(py::init<std::shared_ptr<Vocab>, WordIdType>(), py::arg("vocab"), py::arg("unknown"))
|
||||
|
@ -619,21 +620,25 @@ void bindTokenizerOps(py::module *m) {
|
|||
py::arg("separator"));
|
||||
(void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
|
||||
*m, "WordpieceTokenizerOp", "Tokenize scalar token or 1-D tokens to subword tokens.")
|
||||
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &>(),
|
||||
py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
|
||||
py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
|
||||
py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken));
|
||||
.def(
|
||||
py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, const bool &>(),
|
||||
py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
|
||||
py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
|
||||
py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
|
||||
py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets);
|
||||
}
|
||||
|
||||
void bindDependIcuTokenizerOps(py::module *m) {
|
||||
#ifdef ENABLE_ICU4C
|
||||
(void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
|
||||
*m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces.")
|
||||
.def(py::init<>());
|
||||
.def(py::init<const bool &>(), py::arg("with_offsets") = WhitespaceTokenizerOp::kDefWithOffsets);
|
||||
(void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
|
||||
*m, "UnicodeScriptTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.")
|
||||
.def(py::init<>())
|
||||
.def(py::init<bool>(), py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace);
|
||||
.def(py::init<const bool &, const bool &>(),
|
||||
py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace,
|
||||
py::arg("with_offsets") = UnicodeScriptTokenizerOp::kDefWithOffsets);
|
||||
(void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(
|
||||
*m, "CaseFoldOp", "Apply case fold operation on utf-8 string tensor")
|
||||
.def(py::init<>());
|
||||
|
@ -647,24 +652,28 @@ void bindDependIcuTokenizerOps(py::module *m) {
|
|||
py::arg("replace_all"));
|
||||
(void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(
|
||||
*m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by regex expression pattern.")
|
||||
.def(py::init<const std::string &, const std::string &>(), py::arg("delim_pattern"), py::arg("keep_delim_pattern"));
|
||||
.def(py::init<const std::string &, const std::string &, const bool &>(), py::arg("delim_pattern"),
|
||||
py::arg("keep_delim_pattern"), py::arg("with_offsets") = RegexTokenizerOp::kDefWithOffsets);
|
||||
(void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(
|
||||
*m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.")
|
||||
.def(py::init<bool, bool, NormalizeForm, bool>(), py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
|
||||
.def(py::init<const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>(),
|
||||
py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
|
||||
py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
|
||||
py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
|
||||
py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
|
||||
py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken,
|
||||
py::arg("with_offsets") = BasicTokenizerOp::kDefWithOffsets);
|
||||
(void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(*m, "BertTokenizerOp",
|
||||
"Tokenizer used for Bert text process.")
|
||||
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, bool, bool,
|
||||
NormalizeForm, bool>(),
|
||||
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, const bool &,
|
||||
const bool &, const NormalizeForm &, const bool &, const bool &>(),
|
||||
py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
|
||||
py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
|
||||
py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
|
||||
py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
|
||||
py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
|
||||
py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
|
||||
py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
|
||||
py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken,
|
||||
py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets);
|
||||
#endif
|
||||
}
@ -27,10 +27,12 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
const bool BasicTokenizerOp::kDefLowerCase = false;
|
||||
const bool BasicTokenizerOp::kDefKeepWhitespace = false;
|
||||
const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone;
|
||||
const bool BasicTokenizerOp::kDefPreserveUnusedToken = true;
|
||||
const bool BasicTokenizerOp::kDefWithOffsets = false;
|
||||
const char BasicTokenizerOp::kCommonPattern[] =
|
||||
"[!-/]"
|
||||
"|[:-@]"
|
||||
|
@ -47,11 +49,14 @@ const char BasicTokenizerOp::kCommonPattern[] =
|
|||
"|[\\x{2F800}-\\x{2FA1F}]";
|
||||
const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|\\[unused\\d+\\]|";
|
||||
const std::unordered_set<std::string> BasicTokenizerOp::kUnusedWords{"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"};
|
||||
BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form,
|
||||
bool preserve_unused_token)
|
||||
|
||||
BasicTokenizerOp::BasicTokenizerOp(const bool &lower_case, const bool &keep_whitespace,
|
||||
const NormalizeForm &normalization_form, const bool &preserve_unused_token,
|
||||
const bool &with_offsets)
|
||||
: lower_case_(lower_case),
|
||||
keep_whitespace_(keep_whitespace),
|
||||
preserve_unused_token_(preserve_unused_token),
|
||||
with_offsets_(with_offsets),
|
||||
case_fold_(std::make_unique<CaseFoldOp>()),
|
||||
nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
|
||||
normalization_form_(normalization_form),
|
||||
|
@ -69,7 +74,7 @@ BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, Normal
|
|||
keep_delim_pattern = kUnusedPattern + keep_delim_pattern;
|
||||
delim_pattern = kUnusedPattern + delim_pattern;
|
||||
}
|
||||
regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern);
|
||||
regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern, with_offsets_);
|
||||
}
|
||||
|
||||
Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::string_view &text,
|
||||
|
@ -135,9 +140,10 @@ Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::shared_ptr<Tensor
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
|
||||
Status BasicTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
|
||||
IO_CHECK_VECTOR(input, output);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
|
||||
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
|
||||
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
|
||||
}
|
||||
std::shared_ptr<Tensor> cur_input;
|
||||
|
@ -145,10 +151,10 @@ Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shar
|
|||
if (lower_case_) {
|
||||
if (!preserve_unused_token_) {
|
||||
// to lower case
|
||||
RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor));
|
||||
RETURN_IF_NOT_OK(case_fold_->Compute(input[0], &processed_tensor));
|
||||
} else {
|
||||
// to lower case except words in kUnusedWords
|
||||
RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input, &processed_tensor));
|
||||
RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input[0], &processed_tensor));
|
||||
}
|
||||
cur_input = processed_tensor;
|
||||
// strip accent characters
|
||||
|
@ -156,12 +162,12 @@ Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shar
|
|||
cur_input = processed_tensor;
|
||||
RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor));
|
||||
} else {
|
||||
RETURN_IF_NOT_OK(common_normalize_->Compute(input, &processed_tensor));
|
||||
RETURN_IF_NOT_OK(common_normalize_->Compute(input[0], &processed_tensor));
|
||||
}
|
||||
// strip control characters
|
||||
cur_input = processed_tensor;
|
||||
RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor));
|
||||
return regex_tokenizer_->Compute(processed_tensor, output);
|
||||
return regex_tokenizer_->Compute(TensorRow(0, {std::move(processed_tensor)}), output);
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
@ -36,15 +36,18 @@ class BasicTokenizerOp : public TensorOp {
|
|||
static const bool kDefKeepWhitespace;
|
||||
static const NormalizeForm kDefNormalizationForm;
|
||||
static const bool kDefPreserveUnusedToken;
|
||||
explicit BasicTokenizerOp(bool lower_case = kDefLowerCase, bool keep_whitespace = kDefKeepWhitespace,
|
||||
NormalizeForm normalization_form = kDefNormalizationForm,
|
||||
bool preserve_unused_token = kDefPreserveUnusedToken);
|
||||
static const bool kDefWithOffsets;
|
||||
|
||||
explicit BasicTokenizerOp(const bool &lower_case = kDefLowerCase, const bool &keep_whitespace = kDefKeepWhitespace,
|
||||
const NormalizeForm &normalization_form = kDefNormalizationForm,
|
||||
const bool &preserve_unused_token = kDefPreserveUnusedToken,
|
||||
const bool &with_offsets = kDefWithOffsets);
|
||||
|
||||
~BasicTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "BasicTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
Status Compute(const TensorRow &input, TensorRow *output) override;
|
||||
|
||||
protected:
|
||||
Status CaseFoldWithoutUnusedWords(const std::string_view &text, const std::unordered_set<std::string> &unused_words,
|
||||
|
@ -55,6 +58,7 @@ class BasicTokenizerOp : public TensorOp {
|
|||
static const char kCommonPattern[];
|
||||
static const char kUnusedPattern[];
|
||||
static const std::unordered_set<std::string> kUnusedWords;
|
||||
bool with_offsets_;
|
||||
bool lower_case_;
|
||||
bool keep_whitespace_;
|
||||
NormalizeForm normalization_form_;
@ -16,9 +16,9 @@
|
|||
#include "dataset/text/kernels/bert_tokenizer_op.h"
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
Status BertTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
std::shared_ptr<Tensor> basic_tensor;
|
||||
Status BertTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
|
||||
IO_CHECK_VECTOR(input, output);
|
||||
TensorRow basic_tensor;
|
||||
RETURN_IF_NOT_OK(basic_tokenizer_.Compute(input, &basic_tensor));
|
||||
RETURN_IF_NOT_OK(wordpiece_tokenizer_.Compute(basic_tensor, output));
|
||||
return Status::OK();
@ -32,18 +32,19 @@ class BertTokenizerOp : public TensorOp {
|
|||
const std::string &suffix_indicator = WordpieceTokenizerOp::kDefSuffixIndicator,
|
||||
const int &max_bytes_per_token = WordpieceTokenizerOp::kDefMaxBytesPerToken,
|
||||
const std::string &unknown_token = WordpieceTokenizerOp::kDefUnknownToken,
|
||||
bool lower_case = BasicTokenizerOp::kDefLowerCase,
|
||||
bool keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
|
||||
NormalizeForm normalization_form = BasicTokenizerOp::kDefNormalizationForm,
|
||||
bool preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken)
|
||||
: wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token),
|
||||
basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token) {}
|
||||
const bool &lower_case = BasicTokenizerOp::kDefLowerCase,
|
||||
const bool &keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
|
||||
const NormalizeForm &normalization_form = BasicTokenizerOp::kDefNormalizationForm,
|
||||
const bool &preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken,
|
||||
const bool &with_offsets = WordpieceTokenizerOp::kDefWithOffsets)
|
||||
: wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets),
|
||||
basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token, with_offsets) {}
|
||||
|
||||
~BertTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "BertTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
Status Compute(const TensorRow &input, TensorRow *output) override;
|
||||
|
||||
private:
|
||||
WordpieceTokenizerOp wordpiece_tokenizer_;
@ -23,35 +23,63 @@
|
|||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::string &dict_path, JiebaMode mode)
|
||||
: jieba_mode_(mode), hmm_model_path_(hmm_path), mp_dict_path_(dict_path) {
|
||||
const bool JiebaTokenizerOp::kDefWithOffsets = false;
|
||||
|
||||
JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::string &dict_path, const JiebaMode &mode,
|
||||
const bool &with_offsets)
|
||||
: jieba_mode_(mode), hmm_model_path_(hmm_path), mp_dict_path_(dict_path), with_offsets_(with_offsets) {
|
||||
jieba_parser_ = std::make_unique<cppjieba::Jieba>(mp_dict_path_, hmm_model_path_, "");
|
||||
}
|
||||
|
||||
Status JiebaTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
Status JiebaTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
|
||||
IO_CHECK_VECTOR(input, output);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
|
||||
RETURN_UNEXPECTED_IF_NULL(jieba_parser_);
|
||||
|
||||
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
|
||||
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
|
||||
RETURN_STATUS_UNEXPECTED("the input tensor should be scalar string tensor");
|
||||
}
|
||||
|
||||
std::string_view sentence_v;
|
||||
RETURN_IF_NOT_OK(input->GetItemAt(&sentence_v, {}));
|
||||
RETURN_IF_NOT_OK(input[0]->GetItemAt(&sentence_v, {}));
|
||||
std::string sentence{sentence_v};
|
||||
std::vector<std::string> words;
|
||||
std::vector<uint32_t> offsets_start, offsets_limit;
|
||||
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
|
||||
if (sentence == "") {
|
||||
words.push_back("");
|
||||
} else {
|
||||
std::vector<cppjieba::Word> tmp;
|
||||
if (jieba_mode_ == JiebaMode::kMp) {
|
||||
jieba_parser_->CutSmall(sentence, words, MAX_WORD_LENGTH);
|
||||
std::unique_ptr<cppjieba::MPSegment> mp_seg = std::make_unique<cppjieba::MPSegment>(jieba_parser_->GetDictTrie());
|
||||
mp_seg->Cut(sentence, tmp, MAX_WORD_LENGTH);
|
||||
} else if (jieba_mode_ == JiebaMode::kHmm) {
|
||||
jieba_parser_->CutHMM(sentence, words);
|
||||
std::unique_ptr<cppjieba::HMMSegment> hmm_seg =
|
||||
std::make_unique<cppjieba::HMMSegment>(jieba_parser_->GetHMMModel());
|
||||
hmm_seg->Cut(sentence, tmp);
|
||||
} else { // Mix
|
||||
jieba_parser_->Cut(sentence, words, true);
|
||||
std::unique_ptr<cppjieba::MixSegment> mix_seg =
|
||||
std::make_unique<cppjieba::MixSegment>(jieba_parser_->GetDictTrie(), jieba_parser_->GetHMMModel());
|
||||
mix_seg->Cut(sentence, tmp, true);
|
||||
}
|
||||
GetStringsFromWords(tmp, words);
|
||||
for (auto item : tmp) {
|
||||
offsets_start.push_back(static_cast<uint32_t>(item.offset));
|
||||
offsets_limit.push_back(static_cast<uint32_t>(item.offset + item.word.length()));
|
||||
}
|
||||
}
|
||||
*output = std::make_shared<Tensor>(words, TensorShape({(dsize_t)words.size()}));
|
||||
token_tensor = std::make_shared<Tensor>(words, TensorShape({(dsize_t)words.size()}));
|
||||
output->push_back(token_tensor);
|
||||
if (with_offsets_) {
|
||||
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
|
||||
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
|
||||
reinterpret_cast<unsigned char *>(&offsets_start[0])));
|
||||
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
|
||||
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
|
||||
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
|
||||
output->push_back(offsets_start_tensor);
|
||||
output->push_back(offsets_limit_tensor);
|
||||
}
|
||||
return Status::OK();
|
||||
}
@ -30,15 +30,19 @@ enum class JiebaMode { kMix = 0, kMp = 1, kHmm = 2 };
|
|||
|
||||
class JiebaTokenizerOp : public TensorOp {
|
||||
public:
|
||||
// deffault constant for Jieba MPSegment algorithm.
|
||||
// default constant for Jieba MPSegment algorithm.
|
||||
static constexpr size_t MAX_WORD_LENGTH = 512;
|
||||
// default constant controlling whether Jieba outputs the offsets tensor.
|
||||
static const bool kDefWithOffsets;
|
||||
// Constructor for JiebaTokenizerOp.
|
||||
// @param hmm_path HMM model file.
|
||||
// @param mp_path MP model file.
|
||||
// @mode tokenization mode [Default "MIX"], "MP" model will tokenize with MPSegment algorithm, "HMM" mode will
|
||||
// tokenize with Hiddel Markov Model Segment algorithm, "MIx" model will tokenize with a mix of MPSegment and
|
||||
// HMMSegment algorithm.
|
||||
JiebaTokenizerOp(const std::string &hmm_path, const std::string &mp_path, JiebaMode mode = JiebaMode::kMix);
|
||||
// @with_offsets whether or not to output the offsets tensor.
|
||||
JiebaTokenizerOp(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode = JiebaMode::kMix,
|
||||
const bool &with_offsets = kDefWithOffsets);
|
||||
~JiebaTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override {
|
||||
|
@ -46,7 +50,7 @@ class JiebaTokenizerOp : public TensorOp {
|
|||
<< mp_dict_path_;
|
||||
}
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
Status Compute(const TensorRow &input, TensorRow *output) override;
|
||||
|
||||
// @word the word to be added to the JiebaTokenizer.
|
||||
// @freq [Default 0] the frequency of the word to be added.
|
||||
|
@ -58,6 +62,7 @@ class JiebaTokenizerOp : public TensorOp {
|
|||
std::string mp_dict_path_;
|
||||
std::unique_ptr<cppjieba::Jieba> jieba_parser_;
|
||||
JiebaMode jieba_mode_;
|
||||
bool with_offsets_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
@ -22,8 +22,11 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
|
||||
icu::UnicodeString *out_unicode) const {
|
||||
|
||||
const bool RegexTokenizerOp::kDefWithOffsets = false;
|
||||
|
||||
Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, const int &start, const int &len,
|
||||
std::string *out_utf8, icu::UnicodeString *out_unicode) const {
|
||||
CHECK_FAIL_RETURN_UNEXPECTED((out_utf8 != nullptr || out_unicode != nullptr), "Wrong input");
|
||||
int total_len = input.length();
|
||||
int end = start + len;
|
||||
|
@ -39,7 +42,9 @@ Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int s
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const {
|
||||
Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens,
|
||||
std::vector<uint32_t> *offsets_start,
|
||||
std::vector<uint32_t> *offsets_limit) const {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
out_tokens->clear();
|
||||
icu::RegexMatcher token_matcher(delim_pattern_, 0, status);
|
||||
|
@ -50,6 +55,7 @@ Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std
|
|||
icu::UnicodeString utext(icu::UnicodeString::fromUTF8(text));
|
||||
token_matcher.reset(utext);
|
||||
|
||||
int text_start_index = 0;
|
||||
int token_start_index = 0;
|
||||
status = U_ZERO_ERROR;
|
||||
while (token_matcher.find(status) && U_SUCCESS(status)) {
|
||||
|
@ -62,41 +68,70 @@ Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std
|
|||
int token_len = deli_start_index - token_start_index;
|
||||
if (token_len > 0) {
|
||||
std::string token;
|
||||
uint32_t token_offset = 0;
|
||||
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, token_len, &token));
|
||||
token_offset = token.length();
|
||||
out_tokens->emplace_back(std::move(token));
|
||||
offsets_start->push_back(static_cast<uint32_t>(text_start_index));
|
||||
offsets_limit->push_back(static_cast<uint32_t>(text_start_index + token_offset));
|
||||
text_start_index += token_offset;
|
||||
}
|
||||
|
||||
int delim_len = deli_end_index - deli_start_index;
|
||||
if (keep_delim_ && delim_len > 0) {
|
||||
if (delim_len > 0) {
|
||||
icu::UnicodeString delim_str;
|
||||
std::string delim_utf8_str;
|
||||
uint32_t delim_str_offset = 0;
|
||||
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, deli_start_index, delim_len, &delim_utf8_str, &delim_str));
|
||||
delim_matcher.reset(delim_str);
|
||||
if (delim_matcher.matches(status) && U_SUCCESS(status)) {
|
||||
delim_str_offset = delim_utf8_str.length();
|
||||
if (keep_delim_ && delim_matcher.matches(status) && U_SUCCESS(status)) {
|
||||
out_tokens->emplace_back(std::move(delim_utf8_str));
|
||||
offsets_start->push_back(static_cast<uint32_t>(text_start_index));
|
||||
offsets_limit->push_back(static_cast<uint32_t>(text_start_index + delim_str_offset));
|
||||
}
|
||||
text_start_index += delim_str_offset;
|
||||
}
|
||||
token_start_index = deli_end_index;
|
||||
}
|
||||
|
||||
if (token_start_index < utext.length()) {
|
||||
std::string temp;
|
||||
uint32_t temp_offset = 0;
|
||||
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, utext.length() - token_start_index, &temp));
|
||||
temp_offset = temp.length();
|
||||
out_tokens->emplace_back(std::move(temp));
|
||||
offsets_start->push_back(static_cast<uint32_t>(text_start_index));
|
||||
offsets_limit->push_back(static_cast<uint32_t>(text_start_index + temp_offset));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status RegexTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
|
||||
Status RegexTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
|
||||
IO_CHECK_VECTOR(input, output);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
|
||||
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
|
||||
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
|
||||
}
|
||||
std::string_view text;
|
||||
RETURN_IF_NOT_OK(input->GetItemAt(&text, {}));
|
||||
std::vector<std::string> tokens;
|
||||
RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens));
|
||||
*output = std::make_shared<Tensor>(std::move(tokens), TensorShape({(dsize_t)tokens.size()}));
|
||||
std::vector<uint32_t> offsets_start;
|
||||
std::vector<uint32_t> offsets_limit;
|
||||
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
|
||||
RETURN_IF_NOT_OK(input[0]->GetItemAt(&text, {}));
|
||||
RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens, &offsets_start, &offsets_limit));
|
||||
token_tensor = std::make_shared<Tensor>(std::move(tokens), TensorShape({(dsize_t)tokens.size()}));
|
||||
output->push_back(token_tensor);
|
||||
if (with_offsets_) {
|
||||
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
|
||||
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
|
||||
reinterpret_cast<unsigned char *>(&offsets_start[0])));
|
||||
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
|
||||
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
|
||||
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
|
||||
output->push_back(offsets_start_tensor);
|
||||
output->push_back(offsets_limit_tensor);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
@ -32,25 +32,31 @@ namespace dataset {
|
|||
|
||||
class RegexTokenizerOp : public TensorOp {
|
||||
public:
|
||||
RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern)
|
||||
static const bool kDefWithOffsets;
|
||||
|
||||
RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern,
|
||||
const bool &with_offsets = kDefWithOffsets)
|
||||
: delim_pattern_(icu::UnicodeString::fromUTF8(delim_pattern)),
|
||||
keep_delim_pattern_(icu::UnicodeString::fromUTF8(keep_delim_pattern)),
|
||||
with_offsets_(with_offsets),
|
||||
keep_delim_(!keep_delim_pattern.empty()) {}
|
||||
|
||||
~RegexTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "RegexTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
Status Compute(const TensorRow &input, TensorRow *output) override;
|
||||
|
||||
protected:
|
||||
Status GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
|
||||
Status GetUnicodeSubstr(const icu::UnicodeString &input, const int &start, const int &len, std::string *out_utf8,
|
||||
icu::UnicodeString *out_unicode = nullptr) const;
|
||||
Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const;
|
||||
Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens,
|
||||
std::vector<uint32_t> *offsets_start, std::vector<uint32_t> *offsets_limit) const;
|
||||
|
||||
private:
|
||||
const icu::UnicodeString delim_pattern_;
|
||||
const icu::UnicodeString keep_delim_pattern_;
|
||||
bool with_offsets_;
|
||||
const bool keep_delim_;
|
||||
};
|
||||
} // namespace dataset
@ -27,26 +27,46 @@ using cppjieba::RuneStrArray;
|
|||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
Status UnicodeCharTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
|
||||
const bool UnicodeCharTokenizerOp::kDefWithOffsets = false;
|
||||
|
||||
Status UnicodeCharTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
|
||||
IO_CHECK_VECTOR(input, output);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
|
||||
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
|
||||
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
|
||||
}
|
||||
std::string_view str;
|
||||
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
|
||||
RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
|
||||
|
||||
RuneStrArray runes;
|
||||
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
|
||||
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
|
||||
}
|
||||
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
|
||||
std::vector<std::string> splits(runes.size());
|
||||
std::vector<uint32_t> offsets_start, offsets_limit;
|
||||
for (size_t i = 0; i < runes.size(); i++) {
|
||||
offsets_start.push_back(runes[i].offset);
|
||||
offsets_limit.push_back(runes[i].offset + runes[i].len);
|
||||
splits[i] = str.substr(runes[i].offset, runes[i].len);
|
||||
}
|
||||
if (splits.empty()) {
|
||||
splits.emplace_back("");
|
||||
offsets_start.push_back(0);
|
||||
offsets_limit.push_back(0);
|
||||
}
|
||||
token_tensor = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
|
||||
output->push_back(token_tensor);
|
||||
if (with_offsets_) {
|
||||
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
|
||||
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
|
||||
reinterpret_cast<unsigned char *>(&offsets_start[0])));
|
||||
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
|
||||
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
|
||||
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
|
||||
output->push_back(offsets_start_tensor);
|
||||
output->push_back(offsets_limit_tensor);
|
||||
}
|
||||
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
@ -26,13 +26,18 @@ namespace dataset {
|
|||
|
||||
class UnicodeCharTokenizerOp : public TensorOp {
|
||||
public:
|
||||
UnicodeCharTokenizerOp() {}
|
||||
static const bool kDefWithOffsets;
|
||||
|
||||
explicit UnicodeCharTokenizerOp(const bool &with_offsets = kDefWithOffsets) : with_offsets_(with_offsets) {}
|
||||
|
||||
~UnicodeCharTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "UnicodeCharTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
Status Compute(const TensorRow &input, TensorRow *output) override;
|
||||
|
||||
private:
|
||||
bool with_offsets_;
|
||||
};
|
||||
|
||||
} // namespace dataset
@ -32,24 +32,28 @@ namespace mindspore {
|
|||
namespace dataset {
|
||||
|
||||
const bool UnicodeScriptTokenizerOp::kDefKeepWhitespace = false;
|
||||
const bool UnicodeScriptTokenizerOp::kDefWithOffsets = false;
|
||||
|
||||
Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
|
||||
Status UnicodeScriptTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
|
||||
IO_CHECK_VECTOR(input, output);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
|
||||
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
|
||||
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
|
||||
}
|
||||
std::string_view str;
|
||||
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
|
||||
RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
|
||||
RuneStrArray runes;
|
||||
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
|
||||
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
|
||||
}
|
||||
|
||||
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
|
||||
UScriptCode last_script = USCRIPT_INVALID_CODE;
|
||||
icu::ErrorCode status;
|
||||
int start = 0;
|
||||
int len = 0;
|
||||
std::vector<std::string> splits;
|
||||
std::vector<uint32_t> offsets_start, offsets_limit;
|
||||
|
||||
bool was_space = false;
|
||||
for (size_t i = 0; i < runes.size(); i++) {
|
||||
|
@ -66,6 +70,8 @@ Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, s
|
|||
if (len > 0 && (script != last_script || is_space != was_space)) {
|
||||
// 3) If keep_whitespace_ is false, all the whitespace characters will be discarded
|
||||
if (keep_whitespace_ || !was_space) {
|
||||
offsets_start.push_back(static_cast<uint32_t>(start));
|
||||
offsets_limit.push_back(static_cast<uint32_t>(start + len));
|
||||
std::string temp(str.substr(start, len));
|
||||
splits.emplace_back(std::move(temp));
|
||||
}
|
||||
|
@ -79,14 +85,29 @@ Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, s
|
|||
}
|
||||
|
||||
if (len > 0 && (keep_whitespace_ || !was_space)) {
|
||||
offsets_start.push_back(static_cast<uint32_t>(start));
|
||||
offsets_limit.push_back(static_cast<uint32_t>(start + len));
|
||||
std::string temp(str.substr(start, len));
|
||||
splits.emplace_back(std::move(temp));
|
||||
}
|
||||
// 4) If the input is empty scalar string, the output will be 1-D empty string.
|
||||
if (splits.empty()) {
|
||||
splits.emplace_back("");
|
||||
offsets_start.push_back(0);
|
||||
offsets_limit.push_back(0);
|
||||
}
|
||||
token_tensor = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
|
||||
output->push_back(token_tensor);
|
||||
if (with_offsets_) {
|
||||
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
|
||||
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
|
||||
reinterpret_cast<unsigned char *>(&offsets_start[0])));
|
||||
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
|
||||
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
|
||||
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
|
||||
output->push_back(offsets_start_tensor);
|
||||
output->push_back(offsets_limit_tensor);
|
||||
}
|
||||
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
|
||||
|
|
|
@ -27,17 +27,21 @@ namespace dataset {
|
|||
class UnicodeScriptTokenizerOp : public TensorOp {
|
||||
public:
|
||||
static const bool kDefKeepWhitespace;
|
||||
static const bool kDefWithOffsets;
|
||||
|
||||
explicit UnicodeScriptTokenizerOp(bool keep_whitespace = kDefKeepWhitespace) : keep_whitespace_(keep_whitespace) {}
|
||||
explicit UnicodeScriptTokenizerOp(const bool &keep_whitespace = kDefKeepWhitespace,
|
||||
const bool &with_offsets = kDefWithOffsets)
|
||||
: keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
|
||||
|
||||
~UnicodeScriptTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "UnicodeScriptTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
Status Compute(const TensorRow &input, TensorRow *output) override;
|
||||
|
||||
private:
|
||||
bool keep_whitespace_; // Whether or not to keep whitespace tokens
|
||||
bool with_offsets_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
@ -30,24 +30,33 @@ using cppjieba::RuneStrArray;
|
|||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
Status WhitespaceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
|
||||
|
||||
const bool WhitespaceTokenizerOp::kDefWithOffsets = false;
|
||||
|
||||
Status WhitespaceTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
|
||||
IO_CHECK_VECTOR(input, output);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
|
||||
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
|
||||
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
|
||||
}
|
||||
std::string_view str;
|
||||
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
|
||||
RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
|
||||
|
||||
RuneStrArray runes;
|
||||
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
|
||||
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
|
||||
}
|
||||
|
||||
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
|
||||
std::vector<uint32_t> offsets_start, offsets_limit;
|
||||
std::vector<std::string> splits;
|
||||
int start = 0;
|
||||
int len = 0;
|
||||
for (size_t i = 0; i < runes.size(); i++) {
|
||||
if (u_isUWhiteSpace(runes[i].rune)) {
|
||||
if (len > 0) {
|
||||
offsets_start.push_back(static_cast<uint32_t>(start));
|
||||
offsets_limit.push_back(static_cast<uint32_t>(start + len));
|
||||
std::string temp(str.substr(start, len));
|
||||
splits.emplace_back(std::move(temp));
|
||||
len = 0;
|
||||
|
@ -60,13 +69,28 @@ Status WhitespaceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std:
|
|||
}
|
||||
}
|
||||
if (len > 0) {
|
||||
offsets_start.push_back(static_cast<uint32_t>(start));
|
||||
offsets_limit.push_back(static_cast<uint32_t>(start + len));
|
||||
std::string temp(str.substr(start, len));
|
||||
splits.emplace_back(std::move(temp));
|
||||
}
|
||||
if (splits.empty()) {
|
||||
splits.emplace_back("");
|
||||
offsets_start.push_back(0);
|
||||
offsets_limit.push_back(0);
|
||||
}
|
||||
token_tensor = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
|
||||
output->push_back(token_tensor);
|
||||
if (with_offsets_) {
|
||||
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
|
||||
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
|
||||
reinterpret_cast<unsigned char *>(&offsets_start[0])));
|
||||
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
|
||||
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
|
||||
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
|
||||
output->push_back(offsets_start_tensor);
|
||||
output->push_back(offsets_limit_tensor);
|
||||
}
|
||||
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
@ -26,13 +26,18 @@ namespace dataset {
|
|||
|
||||
class WhitespaceTokenizerOp : public TensorOp {
|
||||
public:
|
||||
WhitespaceTokenizerOp() {}
|
||||
static const bool kDefWithOffsets;
|
||||
|
||||
explicit WhitespaceTokenizerOp(const bool &with_offsets = kDefWithOffsets) : with_offsets_(with_offsets) {}
|
||||
|
||||
~WhitespaceTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "WhitespaceTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
Status Compute(const TensorRow &input, TensorRow *output) override;
|
||||
|
||||
private:
|
||||
bool with_offsets_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
@ -24,13 +24,16 @@ namespace dataset {
|
|||
const char WordpieceTokenizerOp::kDefSuffixIndicator[] = "##";
|
||||
const int WordpieceTokenizerOp::kDefMaxBytesPerToken = 100;
|
||||
const char WordpieceTokenizerOp::kDefUnknownToken[] = "[UNK]";
|
||||
const bool WordpieceTokenizerOp::kDefWithOffsets = false;
|
||||
|
||||
WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
|
||||
const int &max_bytes_per_token, const std::string &unknown_token)
|
||||
const int &max_bytes_per_token, const std::string &unknown_token,
|
||||
const bool &with_offsets)
|
||||
: vocab_(vocab),
|
||||
suffix_indicator_(suffix_indicator),
|
||||
max_bytes_per_token_(max_bytes_per_token),
|
||||
unknown_token_(unknown_token) {}
|
||||
unknown_token_(unknown_token),
|
||||
with_offsets_(with_offsets) {}
|
||||
|
||||
Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start,
|
||||
bool *out_found, int *out_end) const {
|
||||
|
@ -52,17 +55,22 @@ Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const Ru
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const {
|
||||
Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, const uint32_t &basic_start,
|
||||
std::vector<std::string> *out_tokens, std::vector<uint32_t> *offsets_start,
|
||||
std::vector<uint32_t> *offsets_limit) const {
|
||||
out_tokens->clear();
|
||||
offsets_start->push_back(basic_start);
|
||||
if (unknown_token_.empty()) {
|
||||
out_tokens->emplace_back(input_token);
|
||||
offsets_limit->push_back(basic_start + input_token.length());
|
||||
} else {
|
||||
out_tokens->emplace_back(unknown_token_);
|
||||
offsets_limit->push_back(basic_start + input_token.length());
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int start, const int end,
|
||||
Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int &start, const int &end,
|
||||
std::vector<std::string> *out_tokens) const {
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && end > start && end <= input_token.size(), "Out of range");
|
||||
std::string subword = input_token.substr(start, end - start);
|
||||
|
@ -73,9 +81,19 @@ Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const in
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vector<std::string> *out_tokens) const {
|
||||
Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, const uint32_t &basic_start,
|
||||
std::vector<std::string> *out_tokens, std::vector<uint32_t> *offsets_start,
|
||||
std::vector<uint32_t> *offsets_limit) const {
|
||||
if (input_token.size() > max_bytes_per_token_) {
|
||||
return FoundNoToken(input_token, out_tokens);
|
||||
offsets_start->push_back(basic_start);
|
||||
if (!unknown_token_.empty()) {
|
||||
offsets_limit->push_back(basic_start + unknown_token_.size());
|
||||
out_tokens->emplace_back(unknown_token_);
|
||||
} else {
|
||||
out_tokens->emplace_back(input_token);
|
||||
offsets_limit->push_back(basic_start + input_token.size());
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
RuneStrArray runes;
|
||||
if (!DecodeRunesInString(input_token.data(), input_token.size(), runes)) {
|
||||
|
@ -87,29 +105,52 @@ Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vect
|
|||
RETURN_IF_NOT_OK(LookupWord(input_token, runes, start, &found, &end));
|
||||
if (found) {
|
||||
RETURN_IF_NOT_OK(AddSubword(input_token, start, end, out_tokens));
|
||||
offsets_start->push_back(static_cast<uint32_t>(basic_start + start));
|
||||
offsets_limit->push_back(static_cast<uint32_t>(basic_start + end));
|
||||
start = end;
|
||||
} else {
|
||||
return FoundNoToken(input_token, out_tokens);
|
||||
return FoundNoToken(input_token, basic_start, out_tokens, offsets_start, offsets_limit);
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status WordpieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
if (input->Rank() > 1 || input->type() != DataType::DE_STRING) {
|
||||
Status WordpieceTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
|
||||
IO_CHECK_VECTOR(input, output);
|
||||
if (input[0]->Rank() > 1 || input[0]->type() != DataType::DE_STRING) {
|
||||
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor");
|
||||
}
|
||||
dsize_t count = 0;
|
||||
std::vector<std::string> out_tokens;
|
||||
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
|
||||
std::vector<uint32_t> offsets_start, offsets_limit;
|
||||
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
|
||||
for (auto iter = input[0]->begin<std::string_view>(); iter != input[0]->end<std::string_view>(); iter++) {
|
||||
uint32_t basic_start = 0;
|
||||
std::vector<std::string> temp_tokens;
|
||||
RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &temp_tokens));
|
||||
if (with_offsets_ && input.size() == 3) {
|
||||
RETURN_IF_NOT_OK(input[1]->GetItemAt<uint32_t>(&basic_start, {count, 0}));
|
||||
}
|
||||
RETURN_IF_NOT_OK(GetTokens(std::string(*iter), basic_start, &temp_tokens, &offsets_start, &offsets_limit));
|
||||
out_tokens.insert(out_tokens.end(), temp_tokens.begin(), temp_tokens.end());
|
||||
count++;
|
||||
}
|
||||
if (out_tokens.empty()) {
|
||||
out_tokens.emplace_back("");
|
||||
offsets_start.push_back(0);
|
||||
offsets_limit.push_back(0);
|
||||
}
|
||||
token_tensor = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
|
||||
output->push_back(token_tensor);
|
||||
if (with_offsets_) {
|
||||
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
|
||||
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
|
||||
reinterpret_cast<unsigned char *>(&offsets_start[0])));
|
||||
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
|
||||
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
|
||||
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
|
||||
output->push_back(offsets_start_tensor);
|
||||
output->push_back(offsets_limit_tensor);
|
||||
}
|
||||
*output = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
|
||||
return Status::OK();
|
||||
}
@ -37,27 +37,31 @@ class WordpieceTokenizerOp : public TensorOp {
|
|||
static const char kDefSuffixIndicator[];
|
||||
static const int kDefMaxBytesPerToken;
|
||||
static const char kDefUnknownToken[];
|
||||
static const bool kDefWithOffsets;
|
||||
WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = kDefSuffixIndicator,
|
||||
const int &max_bytes_per_token = kDefMaxBytesPerToken,
|
||||
const std::string &unknown_token = kDefUnknownToken);
|
||||
const std::string &unknown_token = kDefUnknownToken, const bool &with_offsets = kDefWithOffsets);
|
||||
|
||||
~WordpieceTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "WordpieceTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
Status Compute(const TensorRow &input, TensorRow *output) override;
|
||||
|
||||
protected:
|
||||
Status AddSubword(const std::string &input_token, const int start, const int end,
|
||||
Status AddSubword(const std::string &input_token, const int &start, const int &end,
|
||||
std::vector<std::string> *out_token) const;
|
||||
Status FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const;
|
||||
Status FoundNoToken(const std::string &input_token, const uint32_t &basic_start, std::vector<std::string> *out_tokens,
|
||||
std::vector<uint32_t> *offsets_start, std::vector<uint32_t> *offsets_limit) const;
|
||||
Status LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start, bool *out_found,
|
||||
int *out_end) const;
|
||||
Status GetTokens(const std::string &input_token, std::vector<std::string> *out_tokens) const;
|
||||
Status GetTokens(const std::string &input_token, const uint32_t &basic_start, std::vector<std::string> *out_tokens,
|
||||
std::vector<uint32_t> *offsets_start, std::vector<uint32_t> *offsets_limit) const;
|
||||
|
||||
private:
|
||||
const std::shared_ptr<Vocab> vocab_;
|
||||
const std::string suffix_indicator_;
|
||||
const bool with_offsets_;
|
||||
const int max_bytes_per_token_;
|
||||
const std::string unknown_token_;
|
||||
};
@ -52,8 +52,9 @@ import mindspore._c_dataengine as cde
|
|||
|
||||
from .utils import JiebaMode, NormalizeForm, to_str
|
||||
from .validators import check_lookup, check_jieba_add_dict, \
|
||||
check_jieba_add_word, check_jieba_init, check_ngram, check_pair_truncate, \
|
||||
check_to_number, check_python_tokenizer
|
||||
check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer,\
|
||||
check_wordpiece_tokenizer, check_regex_tokenizer, check_basic_tokenizer, check_ngram, check_pair_truncate,\
|
||||
check_to_number, check_bert_tokenizer, check_python_tokenizer
|
||||
from ..core.datatypes import mstype_to_detype
|
||||
|
||||
|
||||
|
@ -125,15 +126,31 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
|
|||
- JiebaMode.MP, tokenize with MPSegment algorithm.
|
||||
- JiebaMode.HMM, tokenize with Hidden Markov Model Segment algorithm.
|
||||
- JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm.
|
||||
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
|
||||
|
||||
Examples:
|
||||
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
|
||||
>>> tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=False)
|
||||
>>> data = data.map(operations=tokenizer_op)
|
||||
>>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
|
||||
>>> # ["offsets_limit", dtype=uint32]}
|
||||
>>> tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
|
||||
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
|
||||
"""
|
||||
|
||||
@check_jieba_init
|
||||
def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX):
|
||||
def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX, with_offsets=False):
|
||||
if not isinstance(mode, JiebaMode):
|
||||
raise TypeError("Wrong input type for mode, should be JiebaMode.")
|
||||
|
||||
self.mode = mode
|
||||
self.__check_path__(hmm_path)
|
||||
self.__check_path__(mp_path)
|
||||
self.with_offsets = with_offsets
|
||||
super().__init__(hmm_path, mp_path,
|
||||
DE_C_INTER_JIEBA_MODE[mode])
|
||||
DE_C_INTER_JIEBA_MODE[mode],
|
||||
self.with_offsets)
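# Illustrative usage sketch, following the docstring example above: with
# with_offsets=True the op emits "token", "offsets_start" and "offsets_limit",
# where the offsets are uint32 values that bracket each token in the original
# UTF-8 text (byte positions, per the Compute implementation). HMM_FILE, MP_FILE
# and a dataset `data` with a "text" column are assumed to exist.
tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
                columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
for row in data.create_dict_iterator():
    for token, start, limit in zip(row["token"], row["offsets_start"], row["offsets_limit"]):
        print(token, int(start), int(limit))  # each (start, limit) pair spans `token` in the source bytes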
@check_jieba_add_word
|
||||
def add_word(self, word, freq=None):
|
||||
|
@ -226,8 +243,26 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
|
|||
class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
|
||||
"""
|
||||
Tokenize a scalar tensor of UTF-8 string to Unicode characters.
|
||||
|
||||
Args:
|
||||
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
|
||||
|
||||
Examples:
|
||||
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
|
||||
>>> tokenizer_op = text.UnicodeCharTokenizer()
|
||||
>>> dataset = dataset.map(operations=tokenizer_op)
|
||||
>>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
|
||||
>>> # ["offsets_limit", dtype=uint32]}
|
||||
>>> tokenizer_op = text.UnicodeCharTokenizer(True)
|
||||
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
|
||||
"""
|
||||
|
||||
@check_with_offsets
|
||||
def __init__(self, with_offsets=False):
|
||||
self.with_offsets = with_offsets
|
||||
super().__init__(self.with_offsets)
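# Minimal sketch of the expected offsets semantics, based on the Compute
# implementation above, which records the byte offset and byte length of each
# UTF-8 character; a dataset `data` with a "text" column is assumed.
#   "a€b"  ->  token         = ["a", "€", "b"]
#              offsets_start = [0, 1, 4]      # '€' occupies 3 bytes in UTF-8
#              offsets_limit = [1, 4, 5]
tokenizer_op = text.UnicodeCharTokenizer(with_offsets=True)
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
                columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)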
class WordpieceTokenizer(cde.WordpieceTokenizerOp):
|
||||
"""
|
||||
|
@ -239,22 +274,58 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
|
|||
max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default=100).
|
||||
unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is an empty string,
|
||||
return the token directly, else return 'unknown_token'(default='[UNK]').
|
||||
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
|
||||
|
||||
Examples:
|
||||
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
|
||||
>>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
|
||||
>>> max_bytes_per_token=100, with_offsets=False)
|
||||
>>> dataset = dataset.map(operations=tokenizer_op)
|
||||
>>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
|
||||
>>> # ["offsets_limit", dtype=uint32]}
|
||||
>>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
|
||||
>>> max_bytes_per_token=100, with_offsets=True)
|
||||
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
|
||||
"""
|
||||
|
||||
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]'):
|
||||
@check_wordpiece_tokenizer
|
||||
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100,
|
||||
unknown_token='[UNK]', with_offsets=False):
|
||||
self.vocab = vocab
|
||||
self.suffix_indicator = suffix_indicator
|
||||
self.max_bytes_per_token = max_bytes_per_token
|
||||
self.unknown_token = unknown_token
|
||||
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token)
|
||||
self.with_offsets = with_offsets
|
||||
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token,
|
||||
self.unknown_token, self.with_offsets)
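# Minimal sketch of WordpieceTokenizer offsets, based on GetTokens above: when the
# op receives a plain token column, basic_start is 0 and each offset pair is a byte
# range inside the input token. A vocab containing "un" and "##affable" is assumed.
#   "unaffable"  ->  token         = ["un", "##affable"]
#                    offsets_start = [0, 2]
#                    offsets_limit = [2, 9]
tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]', with_offsets=True)
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
                columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)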
if platform.system().lower() != 'windows':
|
||||
class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
|
||||
"""
|
||||
Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces(such as: ' ', '\\\\t', '\\\\r', '\\\\n').
|
||||
|
||||
Args:
|
||||
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
|
||||
|
||||
Examples:
|
||||
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
|
||||
>>> tokenizer_op = text.WhitespaceTokenizer()
|
||||
>>> dataset = dataset.map(operations=tokenizer_op)
|
||||
>>> # If with_offsets=True, then output three columns {["token", dtype=str],
|
||||
>>> # ["offsets_start", dtype=uint32],
|
||||
>>> # ["offsets_limit", dtype=uint32]}
|
||||
>>> tokenizer_op = text.WhitespaceTokenizer(True)
|
||||
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
|
||||
"""
|
||||
|
||||
@check_with_offsets
|
||||
def __init__(self, with_offsets=False):
|
||||
self.with_offsets = with_offsets
|
||||
super().__init__(self.with_offsets)
|
||||
|
||||
|
||||
class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp):
|
||||
"""
|
||||
|
@ -262,11 +333,25 @@ if platform.system().lower() != 'windows':
|
|||
|
||||
Args:
|
||||
keep_whitespace (bool, optional): Whether or not to emit whitespace tokens (default=False).
|
||||
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
|
||||
|
||||
Examples:
|
||||
>>> # If with_offsets=False, the default output is one column {["text", dtype=str]}
|
||||
>>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False)
|
||||
>>> dataset = dataset.map(operations=tokenizer_op)
|
||||
>>> # If with_offsets=True, the output is three columns {["token", dtype=str],
|
||||
>>> # ["offsets_start", dtype=uint32],
|
||||
>>> # ["offsets_limit", dtype=uint32]}
|
||||
>>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
|
||||
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
|
||||
"""
|
||||
|
||||
def __init__(self, keep_whitespace=False):
|
||||
@check_unicode_script_tokenizer
|
||||
def __init__(self, keep_whitespace=False, with_offsets=False):
|
||||
self.keep_whitespace = keep_whitespace
|
||||
super().__init__(self.keep_whitespace)
|
||||
self.with_offsets = with_offsets
|
||||
super().__init__(self.keep_whitespace, self.with_offsets)
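A minimal sketch of how keep_whitespace changes the output, based on the C++ test case updated in this commit:

keep_op = text.UnicodeScriptTokenizer(keep_whitespace=True)
skip_op = text.UnicodeScriptTokenizer(keep_whitespace=False)
# For "Welcome to China. \n 中国\t北京":
#   keep_whitespace=True  -> ['Welcome', ' ', 'to', ' ', 'China', '.', ' \n ', '中国', '\t', '北京']
#   keep_whitespace=False -> ['Welcome', 'to', 'China', '.', '中国', '北京']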
|
||||
|
||||
|
||||
class CaseFold(cde.CaseFoldOp):
|
||||
|
@ -302,6 +387,9 @@ if platform.system().lower() != 'windows':
|
|||
"""
|
||||
|
||||
def __init__(self, normalize_form=NormalizeForm.NFKC):
|
||||
if not isinstance(normalize_form, NormalizeForm):
|
||||
raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.")
|
||||
|
||||
self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form]
|
||||
super().__init__(self.normalize_form)
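For completeness, a minimal sketch of this op, assuming it is the NormalizeUTF8 wrapper that the tokenizer docstrings refer to (the dataset is illustrative):

normalize_op = text.NormalizeUTF8(normalize_form=text.utils.NormalizeForm.NFKC)
dataset = dataset.map(operations=normalize_op)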
|
||||
|
||||
|
@ -338,12 +426,26 @@ if platform.system().lower() != 'windows':
|
|||
keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
|
||||
if it can also be matched by 'keep_delim_pattern'. The default value is an empty string (''),
|
||||
in which case delimiters will not be kept as output tokens (default='').
|
||||
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
|
||||
|
||||
Examples:
|
||||
>>> # If with_offsets=False, the default output is one column {["text", dtype=str]}
|
||||
>>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=False)
|
||||
>>> dataset = dataset.map(operations=tokenizer_op)
|
||||
>>> # If with_offsets=True, the output is three columns {["token", dtype=str],
|
||||
>>> # ["offsets_start", dtype=uint32],
|
||||
>>> # ["offsets_limit", dtype=uint32]}
|
||||
>>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
|
||||
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
|
||||
"""
|
||||
|
||||
def __init__(self, delim_pattern, keep_delim_pattern=''):
|
||||
@check_regex_tokenizer
|
||||
def __init__(self, delim_pattern, keep_delim_pattern='', with_offsets=False):
|
||||
self.delim_pattern = delim_pattern
|
||||
self.keep_delim_pattern = keep_delim_pattern
|
||||
super().__init__(self.delim_pattern, self.keep_delim_pattern)
|
||||
self.with_offsets = with_offsets
|
||||
super().__init__(self.delim_pattern, self.keep_delim_pattern, self.with_offsets)
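A minimal sketch with illustrative patterns (not taken from this change):

# Split on runs of whitespace; keep_delim_pattern also matches them, so the
# whitespace runs are emitted as tokens instead of being dropped.
tokenizer_op = text.RegexTokenizer(delim_pattern="\\s+", keep_delim_pattern="\\s+")
dataset = dataset.map(operations=tokenizer_op)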
|
||||
|
||||
|
||||
class BasicTokenizer(cde.BasicTokenizerOp):
|
||||
|
@ -359,16 +461,41 @@ if platform.system().lower() != 'windows':
|
|||
only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE').
|
||||
preserve_unused_token (bool, optional): If True, do not split special tokens like
|
||||
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True).
|
||||
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
|
||||
|
||||
Examples:
|
||||
>>> # If with_offsets=False, the default output is one column {["text", dtype=str]}
|
||||
>>> tokenizer_op = text.BasicTokenizer(lower_case=False,
|
||||
>>> keep_whitespace=False,
|
||||
>>> normalization_form=NormalizeForm.NONE,
|
||||
>>> preserve_unused_token=True,
|
||||
>>> with_offsets=False)
|
||||
>>> dataset = dataset.map(operations=tokenizer_op)
|
||||
>>> # If with_offsets=True, the output is three columns {["token", dtype=str],
|
||||
>>> # ["offsets_start", dtype=uint32],
|
||||
>>> # ["offsets_limit", dtype=uint32]}
|
||||
>>> tokenizer_op = text.BasicTokenizer(lower_case=False,
|
||||
>>> keep_whitespace=False,
|
||||
>>> normalization_form=NormalizeForm.NONE,
|
||||
>>> preserve_unused_token=True,
|
||||
>>> with_offsets=True)
|
||||
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
|
||||
"""
|
||||
|
||||
def __init__(self, lower_case=False, keep_whitespace=False,
|
||||
normalization_form=NormalizeForm.NONE, preserve_unused_token=True):
|
||||
@check_basic_tokenizer
|
||||
def __init__(self, lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
|
||||
preserve_unused_token=True, with_offsets=False):
|
||||
if not isinstance(normalization_form, NormalizeForm):
|
||||
raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.")
|
||||
|
||||
self.lower_case = lower_case
|
||||
self.keep_whitespace = keep_whitespace
|
||||
self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
|
||||
self.preserve_unused_token = preserve_unused_token
|
||||
super().__init__(self.lower_case, self.keep_whitespace,
|
||||
self.normalization_form, self.preserve_unused_token)
|
||||
self.with_offsets = with_offsets
|
||||
super().__init__(self.lower_case, self.keep_whitespace, self.normalization_form,
|
||||
self.preserve_unused_token, self.with_offsets)
|
||||
|
||||
|
||||
class BertTokenizer(cde.BertTokenizerOp):
|
||||
|
@ -389,11 +516,33 @@ if platform.system().lower() != 'windows':
|
|||
only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE').
|
||||
preserve_unused_token (bool, optional): If True, do not split special tokens like
|
||||
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True).
|
||||
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
|
||||
|
||||
Examples:
|
||||
>>> # If with_offsets=False, the default output is one column {["text", dtype=str]}
|
||||
>>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
|
||||
>>> unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
|
||||
>>> normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
|
||||
>>> with_offsets=False)
|
||||
>>> dataset = dataset.map(operations=tokenizer_op)
|
||||
>>> # If with_offsets=True, the output is three columns {["token", dtype=str],
|
||||
>>> # ["offsets_start", dtype=uint32],
|
||||
>>> # ["offsets_limit", dtype=uint32]}
|
||||
>>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
|
||||
>>> unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
|
||||
>>> normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
|
||||
>>> with_offsets=True)
|
||||
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
|
||||
"""
|
||||
|
||||
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100,
|
||||
unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
|
||||
normalization_form=NormalizeForm.NONE, preserve_unused_token=True):
|
||||
@check_bert_tokenizer
|
||||
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]',
|
||||
lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
|
||||
preserve_unused_token=True, with_offsets=False):
|
||||
if not isinstance(normalization_form, NormalizeForm):
|
||||
raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.")
|
||||
|
||||
self.vocab = vocab
|
||||
self.suffix_indicator = suffix_indicator
|
||||
self.max_bytes_per_token = max_bytes_per_token
|
||||
|
@ -402,8 +551,10 @@ if platform.system().lower() != 'windows':
|
|||
self.keep_whitespace = keep_whitespace
|
||||
self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
|
||||
self.preserve_unused_token = preserve_unused_token
|
||||
self.with_offsets = with_offsets
|
||||
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token,
|
||||
self.lower_case, self.keep_whitespace, self.normalization_form, self.preserve_unused_token)
|
||||
self.lower_case, self.keep_whitespace, self.normalization_form,
|
||||
self.preserve_unused_token, self.with_offsets)
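Note how offsets behave for subwords: in the updated BertTokenizer test data later in this commit, 'mak' and '##ing' from the word 'making' map to byte ranges [5, 8) and [8, 11) of the original line, so the '##' prefix itself never appears in the source text. A minimal sketch, assuming `vocab` is an existing Vocab object:

tokenizer_op = text.BertTokenizer(vocab=vocab, lower_case=True, with_offsets=True)
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
                columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)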
|
||||
|
||||
|
||||
class TruncateSequencePair(cde.TruncateSequencePairOp):
|
||||
|
|
|
@ -25,7 +25,6 @@ from mindspore._c_expression import typing
|
|||
from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_uint32, check_positive, \
|
||||
INT32_MAX, check_value
|
||||
|
||||
|
||||
def check_unique_list_of_words(words, arg_name):
|
||||
"""Check that words is a list and each element is a str without any duplication"""
|
||||
|
||||
|
@ -116,11 +115,22 @@ def check_from_dict(method):
|
|||
|
||||
|
||||
def check_jieba_init(method):
|
||||
"""Wrapper method to check the parameters of jieba add word."""
|
||||
"""Wrapper method to check the parameters of jieba init."""
|
||||
|
||||
@wraps(method)
|
||||
def new_method(self, *args, **kwargs):
|
||||
parse_user_args(method, *args, **kwargs)
|
||||
[hmm_path, mp_path, _, with_offsets], _ = parse_user_args(method, *args, **kwargs)
|
||||
|
||||
if hmm_path is None:
|
||||
raise ValueError("The dict of HMMSegment in cppjieba is not provided.")
|
||||
if not isinstance(hmm_path, str):
|
||||
raise TypeError("Wrong input type for hmm_path, should be string.")
|
||||
if mp_path is None:
|
||||
raise ValueError("The dict of MPSegment in cppjieba is not provided.")
|
||||
if not isinstance(mp_path, str):
|
||||
raise TypeError("Wrong input type for mp_path, should be string.")
|
||||
if not isinstance(with_offsets, bool):
|
||||
raise TypeError("Wrong input type for with_offsets, should be boolean.")
|
||||
return method(self, *args, **kwargs)
|
||||
|
||||
return new_method
|
||||
|
@ -152,6 +162,128 @@ def check_jieba_add_dict(method):
|
|||
return new_method
|
||||
|
||||
|
||||
def check_with_offsets(method):
|
||||
"""Wrapper method to check with_offsets when it is the only parameter."""
|
||||
|
||||
@wraps(method)
|
||||
def new_method(self, *args, **kwargs):
|
||||
[with_offsets], _ = parse_user_args(method, *args, **kwargs)
|
||||
if not isinstance(with_offsets, bool):
|
||||
raise TypeError("Wrong input type for with_offsets, should be boolean.")
|
||||
return method(self, *args, **kwargs)
|
||||
|
||||
return new_method
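In effect, every tokenizer wrapped by one of these checkers rejects a non-boolean with_offsets in Python before it reaches the C++ layer; an illustrative sketch:

text.WhitespaceTokenizer(with_offsets="yes")   # raises TypeError("Wrong input type for with_offsets, should be boolean.")
text.WhitespaceTokenizer(with_offsets=True)    # passes validation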
|
||||
|
||||
|
||||
def check_unicode_script_tokenizer(method):
|
||||
"""Wrapper method to check the parameter of UnicodeScriptTokenizer."""
|
||||
|
||||
@wraps(method)
|
||||
def new_method(self, *args, **kwargs):
|
||||
[keep_whitespace, with_offsets], _ = parse_user_args(method, *args, **kwargs)
|
||||
if not isinstance(keep_whitespace, bool):
|
||||
raise TypeError("Wrong input type for keep_whitespace, should be boolean.")
|
||||
if not isinstance(with_offsets, bool):
|
||||
raise TypeError("Wrong input type for with_offsets, should be boolean.")
|
||||
return method(self, *args, **kwargs)
|
||||
|
||||
return new_method
|
||||
|
||||
|
||||
def check_wordpiece_tokenizer(method):
|
||||
"""Wrapper method to check the parameter of WordpieceTokenizer."""
|
||||
|
||||
@wraps(method)
|
||||
def new_method(self, *args, **kwargs):
|
||||
[vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets], _ =\
|
||||
parse_user_args(method, *args, **kwargs)
|
||||
if vocab is None:
|
||||
raise ValueError("vocab is not provided.")
|
||||
if not isinstance(vocab, cde.Vocab):
|
||||
raise TypeError("Wrong input type for vocab, should be Vocab object.")
|
||||
if not isinstance(suffix_indicator, str):
|
||||
raise TypeError("Wrong input type for suffix_indicator, should be string.")
|
||||
if not isinstance(unknown_token, str):
|
||||
raise TypeError("Wrong input type for unknown_token, should be string.")
|
||||
if not isinstance(with_offsets, bool):
|
||||
raise TypeError("Wrong input type for with_offsets, should be boolean.")
|
||||
check_uint32(max_bytes_per_token)
|
||||
return method(self, *args, **kwargs)
|
||||
|
||||
return new_method
|
||||
|
||||
|
||||
def check_regex_tokenizer(method):
|
||||
"""Wrapper method to check the parameter of RegexTokenizer."""
|
||||
|
||||
@wraps(method)
|
||||
def new_method(self, *args, **kwargs):
|
||||
[delim_pattern, keep_delim_pattern, with_offsets], _ = parse_user_args(method, *args, **kwargs)
|
||||
if delim_pattern is None:
|
||||
raise ValueError("delim_pattern is not provided.")
|
||||
if not isinstance(delim_pattern, str):
|
||||
raise TypeError("Wrong input type for delim_pattern, should be string.")
|
||||
if not isinstance(keep_delim_pattern, str):
|
||||
raise TypeError("Wrong input type for keep_delim_pattern, should be string.")
|
||||
if not isinstance(with_offsets, bool):
|
||||
raise TypeError("Wrong input type for with_offsets, should be boolean.")
|
||||
return method(self, *args, **kwargs)
|
||||
|
||||
return new_method
|
||||
|
||||
|
||||
def check_basic_tokenizer(method):
|
||||
"""Wrapper method to check the parameters of BasicTokenizer."""
|
||||
|
||||
@wraps(method)
|
||||
def new_method(self, *args, **kwargs):
|
||||
[lower_case, keep_whitespace, _, preserve_unused, with_offsets], _ =\
|
||||
parse_user_args(method, *args, **kwargs)
|
||||
if not isinstance(lower_case, bool):
|
||||
raise TypeError("Wrong input type for lower_case, should be boolean.")
|
||||
if not isinstance(keep_whitespace, bool):
|
||||
raise TypeError("Wrong input type for keep_whitespace, should be boolean.")
|
||||
if not isinstance(preserve_unused, bool):
|
||||
raise TypeError("Wrong input type for preserve_unused_token, should be boolean.")
|
||||
if not isinstance(with_offsets, bool):
|
||||
raise TypeError("Wrong input type for with_offsets, should be boolean.")
|
||||
return method(self, *args, **kwargs)
|
||||
|
||||
return new_method
|
||||
|
||||
|
||||
def check_bert_tokenizer(method):
|
||||
"""Wrapper method to check the parameter of BertTokenizer."""
|
||||
|
||||
@wraps(method)
|
||||
def new_method(self, *args, **kwargs):
|
||||
[vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case, keep_whitespace, _,
|
||||
preserve_unused_token, with_offsets], _ = parse_user_args(method, *args, **kwargs)
|
||||
if vocab is None:
|
||||
raise ValueError("vacab is not provided.")
|
||||
if not isinstance(vocab, cde.Vocab):
|
||||
raise TypeError("Wrong input type for vocab, should be Vocab object.")
|
||||
if not isinstance(suffix_indicator, str):
|
||||
raise TypeError("Wrong input type for suffix_indicator, should be string.")
|
||||
if not isinstance(max_bytes_per_token, int):
|
||||
raise TypeError("Wrong input type for max_bytes_per_token, should be int.")
|
||||
check_uint32(max_bytes_per_token)
|
||||
|
||||
if not isinstance(unknown_token, str):
|
||||
raise TypeError("Wrong input type for unknown_token, should be string.")
|
||||
if not isinstance(lower_case, bool):
|
||||
raise TypeError("Wrong input type for lower_case, should be boolean.")
|
||||
if not isinstance(keep_whitespace, bool):
|
||||
raise TypeError("Wrong input type for keep_whitespace, should be boolean.")
|
||||
if not isinstance(preserve_unused_token, bool):
|
||||
raise TypeError("Wrong input type for preserve_unused_token, should be boolean.")
|
||||
if not isinstance(with_offsets, bool):
|
||||
raise TypeError("Wrong input type for with_offsets, should be boolean.")
|
||||
return method(self, *args, **kwargs)
|
||||
|
||||
return new_method
|
||||
|
||||
|
||||
def check_from_dataset(method):
|
||||
"""A wrapper that wrap a parameter checker to the original function."""
|
||||
|
||||
|
|
|
@ -39,21 +39,22 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opFuntions) {
|
|||
std::string dataset_path = datasets_root_path_ + "/jiebadict";
|
||||
std::string hmm_path = dataset_path + "/hmm_model.utf8";
|
||||
std::string mp_path = dataset_path + "/jieba.dict.utf8";
|
||||
std::shared_ptr<Tensor> output_tensor;
|
||||
TensorRow input, output;
|
||||
std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path));
|
||||
|
||||
std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("今天天气太好了我们一起去外面玩吧");
|
||||
Status s = op->Compute(input_tensor, &output_tensor);
|
||||
input.push_back(input_tensor);
|
||||
Status s = op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output_tensor->Rank(), 1);
|
||||
EXPECT_EQ(output_tensor->Size(), 7);
|
||||
CheckEqual(output_tensor, {0}, "今天天气");
|
||||
CheckEqual(output_tensor, {1}, "太好了");
|
||||
CheckEqual(output_tensor, {2}, "我们");
|
||||
CheckEqual(output_tensor, {3}, "一起");
|
||||
CheckEqual(output_tensor, {4}, "去");
|
||||
CheckEqual(output_tensor, {5}, "外面");
|
||||
CheckEqual(output_tensor, {6}, "玩吧");
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
EXPECT_EQ(output[0]->Size(), 7);
|
||||
CheckEqual(output[0], {0}, "今天天气");
|
||||
CheckEqual(output[0], {1}, "太好了");
|
||||
CheckEqual(output[0], {2}, "我们");
|
||||
CheckEqual(output[0], {3}, "一起");
|
||||
CheckEqual(output[0], {4}, "去");
|
||||
CheckEqual(output[0], {5}, "外面");
|
||||
CheckEqual(output[0], {6}, "玩吧");
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opAdd) {
|
||||
|
@ -61,16 +62,17 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opAdd) {
|
|||
std::string dataset_path = datasets_root_path_ + "/jiebadict";
|
||||
std::string hmm_path = dataset_path + "/hmm_model.utf8";
|
||||
std::string mp_path = dataset_path + "/jieba.dict.utf8";
|
||||
std::shared_ptr<Tensor> output_tensor;
|
||||
TensorRow input, output;
|
||||
std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path));
|
||||
|
||||
op->AddWord("男默女泪");
|
||||
std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("男默女泪");
|
||||
Status s = op->Compute(input_tensor, &output_tensor);
|
||||
input.push_back(input_tensor);
|
||||
Status s = op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output_tensor->Rank(), 1);
|
||||
EXPECT_EQ(output_tensor->Size(), 1);
|
||||
CheckEqual(output_tensor, {0}, "男默女泪");
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
EXPECT_EQ(output[0]->Size(), 1);
|
||||
CheckEqual(output[0], {0}, "男默女泪");
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opEmpty) {
|
||||
|
@ -78,14 +80,15 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opEmpty) {
|
|||
std::string dataset_path = datasets_root_path_ + "/jiebadict";
|
||||
std::string hmm_path = dataset_path + "/hmm_model.utf8";
|
||||
std::string mp_path = dataset_path + "/jieba.dict.utf8";
|
||||
std::shared_ptr<Tensor> output_tensor;
|
||||
TensorRow input, output;
|
||||
std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path));
|
||||
|
||||
op->AddWord("男默女泪");
|
||||
std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("");
|
||||
Status s = op->Compute(input_tensor, &output_tensor);
|
||||
input.push_back(input_tensor);
|
||||
Status s = op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output_tensor->Rank(), 1);
|
||||
EXPECT_EQ(output_tensor->Size(), 1);
|
||||
CheckEqual(output_tensor, {0}, "");
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
EXPECT_EQ(output[0]->Size(), 1);
|
||||
CheckEqual(output[0], {0}, "");
|
||||
}
|
|
@ -45,227 +45,245 @@ class MindDataTestTokenizerOp : public UT::Common {
|
|||
|
||||
TEST_F(MindDataTestTokenizerOp, TestUnicodeCharTokenizerOp) {
|
||||
MS_LOG(INFO) << "Doing TestUnicodeCharTokenizerOp.";
|
||||
std::unique_ptr<UnicodeCharTokenizerOp> op(new UnicodeCharTokenizerOp());
|
||||
std::unique_ptr<UnicodeCharTokenizerOp> op(new UnicodeCharTokenizerOp(true));
|
||||
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Hello World!");
|
||||
std::shared_ptr<Tensor> output;
|
||||
Status s = op->Compute(input, &output);
|
||||
TensorRow output;
|
||||
Status s = op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 12);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
|
||||
CheckEqual(output, {0}, "H");
|
||||
CheckEqual(output, {1}, "e");
|
||||
CheckEqual(output, {2}, "l");
|
||||
CheckEqual(output, {3}, "l");
|
||||
CheckEqual(output, {4}, "o");
|
||||
CheckEqual(output, {5}, " ");
|
||||
CheckEqual(output, {6}, "W");
|
||||
CheckEqual(output, {7}, "o");
|
||||
CheckEqual(output, {8}, "r");
|
||||
CheckEqual(output, {9}, "l");
|
||||
CheckEqual(output, {10}, "d");
|
||||
CheckEqual(output, {11}, "!");
|
||||
EXPECT_EQ(output[0]->Size(), 12);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "H");
|
||||
CheckEqual(output[0], {1}, "e");
|
||||
CheckEqual(output[0], {2}, "l");
|
||||
CheckEqual(output[0], {3}, "l");
|
||||
CheckEqual(output[0], {4}, "o");
|
||||
CheckEqual(output[0], {5}, " ");
|
||||
CheckEqual(output[0], {6}, "W");
|
||||
CheckEqual(output[0], {7}, "o");
|
||||
CheckEqual(output[0], {8}, "r");
|
||||
CheckEqual(output[0], {9}, "l");
|
||||
CheckEqual(output[0], {10}, "d");
|
||||
CheckEqual(output[0], {11}, "!");
|
||||
|
||||
input = std::make_shared<Tensor>("中国 你好!");
|
||||
s = op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 6);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor2: " << output->ToString();
|
||||
CheckEqual(output, {0}, "中");
|
||||
CheckEqual(output, {1}, "国");
|
||||
CheckEqual(output, {2}, " ");
|
||||
CheckEqual(output, {3}, "你");
|
||||
CheckEqual(output, {4}, "好");
|
||||
CheckEqual(output, {5}, "!");
|
||||
EXPECT_EQ(output[0]->Size(), 6);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "中");
|
||||
CheckEqual(output[0], {1}, "国");
|
||||
CheckEqual(output[0], {2}, " ");
|
||||
CheckEqual(output[0], {3}, "你");
|
||||
CheckEqual(output[0], {4}, "好");
|
||||
CheckEqual(output[0], {5}, "!");
|
||||
|
||||
input = std::make_shared<Tensor>("中");
|
||||
s = op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor3: " << output->ToString();
|
||||
CheckEqual(output, {0}, "中");
|
||||
EXPECT_EQ(output[0]->Size(), 1);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "中");
|
||||
|
||||
input = std::make_shared<Tensor>("H");
|
||||
s = op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor4: " << output->ToString();
|
||||
CheckEqual(output, {0}, "H");
|
||||
EXPECT_EQ(output[0]->Size(), 1);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "H");
|
||||
|
||||
input = std::make_shared<Tensor>(" ");
|
||||
s = op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 2);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor5: " << output->ToString();
|
||||
CheckEqual(output, {0}, " ");
|
||||
CheckEqual(output, {1}, " ");
|
||||
EXPECT_EQ(output[0]->Size(), 2);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, " ");
|
||||
CheckEqual(output[0], {1}, " ");
|
||||
|
||||
input = std::make_shared<Tensor>("");
|
||||
s = op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor6: " << output->ToString();
|
||||
CheckEqual(output, {0}, "");
|
||||
EXPECT_EQ(output[0]->Size(), 1);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "");
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestTokenizerOp, TestWhitespaceTokenizerOp) {
|
||||
MS_LOG(INFO) << "Doing TestWhitespaceTokenizerOp.";
|
||||
std::unique_ptr<WhitespaceTokenizerOp> op(new WhitespaceTokenizerOp());
|
||||
std::unique_ptr<WhitespaceTokenizerOp> op(new WhitespaceTokenizerOp(true));
|
||||
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China.");
|
||||
std::shared_ptr<Tensor> output;
|
||||
Status s = op->Compute(input, &output);
|
||||
TensorRow output;
|
||||
Status s = op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 3);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
|
||||
CheckEqual(output, {0}, "Welcome");
|
||||
CheckEqual(output, {1}, "to");
|
||||
CheckEqual(output, {2}, "China.");
|
||||
EXPECT_EQ(output[0]->Size(), 3);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "Welcome");
|
||||
CheckEqual(output[0], {1}, "to");
|
||||
CheckEqual(output[0], {2}, "China.");
|
||||
|
||||
input = std::make_shared<Tensor>(" hello");
|
||||
s = op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor2: " << output->ToString();
|
||||
CheckEqual(output, {0}, "hello");
|
||||
EXPECT_EQ(output[0]->Size(), 1);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "hello");
|
||||
|
||||
input = std::make_shared<Tensor>("hello");
|
||||
s = op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor3: " << output->ToString();
|
||||
CheckEqual(output, {0}, "hello");
|
||||
EXPECT_EQ(output[0]->Size(), 1);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "hello");
|
||||
|
||||
input = std::make_shared<Tensor>("hello ");
|
||||
s = op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor4: " << output->ToString();
|
||||
CheckEqual(output, {0}, "hello");
|
||||
EXPECT_EQ(output[0]->Size(), 1);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "hello");
|
||||
|
||||
input = std::make_shared<Tensor>(" ");
|
||||
s = op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor5: " << output->ToString();
|
||||
CheckEqual(output, {0}, "");
|
||||
EXPECT_EQ(output[0]->Size(), 1);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "");
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestTokenizerOp, TestUnicodeScriptTokenizer) {
|
||||
MS_LOG(INFO) << "Doing TestUnicodeScriptTokenizer.";
|
||||
std::unique_ptr<UnicodeScriptTokenizerOp> keep_whitespace_op(new UnicodeScriptTokenizerOp(true));
|
||||
std::unique_ptr<UnicodeScriptTokenizerOp> skip_whitespace_op(new UnicodeScriptTokenizerOp(false));
|
||||
std::unique_ptr<UnicodeScriptTokenizerOp> keep_whitespace_op(new UnicodeScriptTokenizerOp(true, true));
|
||||
std::unique_ptr<UnicodeScriptTokenizerOp> skip_whitespace_op(new UnicodeScriptTokenizerOp(false, true));
|
||||
|
||||
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
|
||||
std::shared_ptr<Tensor> output;
|
||||
Status s = keep_whitespace_op->Compute(input, &output);
|
||||
TensorRow output;
|
||||
Status s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 10);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
|
||||
CheckEqual(output, {0}, "Welcome");
|
||||
CheckEqual(output, {1}, " ");
|
||||
CheckEqual(output, {2}, "to");
|
||||
CheckEqual(output, {3}, " ");
|
||||
CheckEqual(output, {4}, "China");
|
||||
CheckEqual(output, {5}, ".");
|
||||
CheckEqual(output, {6}, " \n ");
|
||||
CheckEqual(output, {7}, "中国");
|
||||
CheckEqual(output, {8}, "\t");
|
||||
CheckEqual(output, {9}, "北京");
|
||||
s = skip_whitespace_op->Compute(input, &output);
|
||||
EXPECT_EQ(output[0]->Size(), 10);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "Welcome");
|
||||
CheckEqual(output[0], {1}, " ");
|
||||
CheckEqual(output[0], {2}, "to");
|
||||
CheckEqual(output[0], {3}, " ");
|
||||
CheckEqual(output[0], {4}, "China");
|
||||
CheckEqual(output[0], {5}, ".");
|
||||
CheckEqual(output[0], {6}, " \n ");
|
||||
CheckEqual(output[0], {7}, "中国");
|
||||
CheckEqual(output[0], {8}, "\t");
|
||||
CheckEqual(output[0], {9}, "北京");
|
||||
output.clear();
|
||||
s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 6);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor2: " << output->ToString();
|
||||
CheckEqual(output, {0}, "Welcome");
|
||||
CheckEqual(output, {1}, "to");
|
||||
CheckEqual(output, {2}, "China");
|
||||
CheckEqual(output, {3}, ".");
|
||||
CheckEqual(output, {4}, "中国");
|
||||
CheckEqual(output, {5}, "北京");
|
||||
EXPECT_EQ(output[0]->Size(), 6);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "Welcome");
|
||||
CheckEqual(output[0], {1}, "to");
|
||||
CheckEqual(output[0], {2}, "China");
|
||||
CheckEqual(output[0], {3}, ".");
|
||||
CheckEqual(output[0], {4}, "中国");
|
||||
CheckEqual(output[0], {5}, "北京");
|
||||
|
||||
input = std::make_shared<Tensor>(" Welcome to 中国. ");
|
||||
s = skip_whitespace_op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 4);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor3: " << output->ToString();
|
||||
CheckEqual(output, {0}, "Welcome");
|
||||
CheckEqual(output, {1}, "to");
|
||||
CheckEqual(output, {2}, "中国");
|
||||
CheckEqual(output, {3}, ".");
|
||||
s = keep_whitespace_op->Compute(input, &output);
|
||||
EXPECT_EQ(output[0]->Size(), 4);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "Welcome");
|
||||
CheckEqual(output[0], {1}, "to");
|
||||
CheckEqual(output[0], {2}, "中国");
|
||||
CheckEqual(output[0], {3}, ".");
|
||||
output.clear();
|
||||
s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 8);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor4: " << output->ToString();
|
||||
CheckEqual(output, {0}, " ");
|
||||
CheckEqual(output, {1}, "Welcome");
|
||||
CheckEqual(output, {2}, " ");
|
||||
CheckEqual(output, {3}, "to");
|
||||
CheckEqual(output, {4}, " ");
|
||||
CheckEqual(output, {5}, "中国");
|
||||
CheckEqual(output, {6}, ".");
|
||||
CheckEqual(output, {7}, " ");
|
||||
EXPECT_EQ(output[0]->Size(), 8);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, " ");
|
||||
CheckEqual(output[0], {1}, "Welcome");
|
||||
CheckEqual(output[0], {2}, " ");
|
||||
CheckEqual(output[0], {3}, "to");
|
||||
CheckEqual(output[0], {4}, " ");
|
||||
CheckEqual(output[0], {5}, "中国");
|
||||
CheckEqual(output[0], {6}, ".");
|
||||
CheckEqual(output[0], {7}, " ");
|
||||
|
||||
input = std::make_shared<Tensor>("Hello");
|
||||
s = keep_whitespace_op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor5: " << output->ToString();
|
||||
CheckEqual(output, {0}, "Hello");
|
||||
EXPECT_EQ(output[0]->Size(), 1);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "Hello");
|
||||
|
||||
input = std::make_shared<Tensor>("H");
|
||||
s = keep_whitespace_op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor6: " << output->ToString();
|
||||
CheckEqual(output, {0}, "H");
|
||||
EXPECT_EQ(output[0]->Size(), 1);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "H");
|
||||
|
||||
input = std::make_shared<Tensor>("");
|
||||
s = keep_whitespace_op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor7: " << output->ToString();
|
||||
CheckEqual(output, {0}, "");
|
||||
EXPECT_EQ(output[0]->Size(), 1);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor7: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "");
|
||||
|
||||
input = std::make_shared<Tensor>("Hello中国Hello世界");
|
||||
s = keep_whitespace_op->Compute(input, &output); EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 4);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor8: " << output->ToString();
|
||||
CheckEqual(output, {0}, "Hello");
|
||||
CheckEqual(output, {1}, "中国");
|
||||
CheckEqual(output, {2}, "Hello");
|
||||
CheckEqual(output, {3}, "世界");
|
||||
output.clear();
|
||||
s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output[0]->Size(), 4);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor8: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "Hello");
|
||||
CheckEqual(output[0], {1}, "中国");
|
||||
CheckEqual(output[0], {2}, "Hello");
|
||||
CheckEqual(output[0], {3}, "世界");
|
||||
|
||||
input = std::make_shared<Tensor>(" ");
|
||||
s = keep_whitespace_op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor10: " << output->ToString();
|
||||
CheckEqual(output, {0}, " ");
|
||||
EXPECT_EQ(output[0]->Size(), 1);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor10: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, " ");
|
||||
input = std::make_shared<Tensor>(" ");
|
||||
s = skip_whitespace_op->Compute(input, &output);
|
||||
output.clear();
|
||||
s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor11: " << output->ToString();
|
||||
CheckEqual(output, {0}, "");
|
||||
EXPECT_EQ(output[0]->Size(), 1);
|
||||
EXPECT_EQ(output[0]->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor11: " << output[0]->ToString();
|
||||
CheckEqual(output[0], {0}, "");
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestTokenizerOp, TestCaseFold) {
|
||||
|
@ -321,10 +339,10 @@ TEST_F(MindDataTestTokenizerOp, TestRegexReplace) {
|
|||
|
||||
TEST_F(MindDataTestTokenizerOp, TestRegexTokenizer) {
|
||||
MS_LOG(INFO) << "Doing TestRegexTokenizerOp.";
|
||||
std::unique_ptr<RegexTokenizerOp> regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", ""));
|
||||
std::unique_ptr<RegexTokenizerOp> regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", "", true));
|
||||
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
|
||||
std::shared_ptr<Tensor> output;
|
||||
Status s = regex_tokenizer_op->Compute(input, &output);
|
||||
TensorRow output;
|
||||
Status s = regex_tokenizer_op->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
}
|
||||
|
||||
|
@ -332,9 +350,10 @@ TEST_F(MindDataTestTokenizerOp, TestBasicTokenizer) {
|
|||
MS_LOG(INFO) << "Doing TestBasicTokenizer.";
|
||||
// bool lower_case, bool keep_whitespace,
|
||||
// NormalizeForm normalization_form, bool preserve_unused_token, bool with_offsets
|
||||
std::unique_ptr<BasicTokenizerOp> basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false));
|
||||
std::unique_ptr<BasicTokenizerOp> basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false,
|
||||
true));
|
||||
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. 中国\t北京");
|
||||
std::shared_ptr<Tensor> output;
|
||||
Status s = basic_tokenizer->Compute(input, &output);
|
||||
TensorRow output;
|
||||
Status s = basic_tokenizer->Compute(TensorRow(0, {input}), &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
}
|
|
@ -1,83 +0,0 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
Testing BasicTokenizer op in DE
|
||||
"""
|
||||
import numpy as np
|
||||
import mindspore.dataset as ds
|
||||
from mindspore import log as logger
|
||||
import mindspore.dataset.text as nlp
|
||||
|
||||
BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt"
|
||||
|
||||
test_paras = [
|
||||
dict(
|
||||
first=1,
|
||||
last=6,
|
||||
expected_tokens=
|
||||
[['Welcome', 'to', 'Beijing', '北', '京', '欢', '迎', '您'],
|
||||
['長', '風', '破', '浪', '會', '有', '時', ',', '直', '掛', '雲', '帆', '濟', '滄', '海'],
|
||||
['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'],
|
||||
['明', '朝', '(', '1368', '—', '1644', '年', ')', '和', '清', '朝',
|
||||
'(', '1644', '—', '1911', '年', ')', ',', '是', '中', '国', '封',
|
||||
'建', '王', '朝', '史', '上', '最', '后', '两', '个', '朝', '代'],
|
||||
['明', '代', '(', '1368', '-', '1644', ')', 'と', '清', '代',
|
||||
'(', '1644', '-', '1911', ')', 'は', '、', '中', '国', 'の', '封',
|
||||
'建', '王', '朝', 'の', '歴', '史', 'における', '最', '後', 'の2つの', '王', '朝', 'でした'],
|
||||
['명나라', '(', '1368', '-', '1644', ')', '와', '청나라', '(', '1644', '-', '1911', ')', '는',
|
||||
'중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']]
|
||||
),
|
||||
dict(
|
||||
first=7,
|
||||
last=7,
|
||||
expected_tokens=[['this', 'is', 'a', 'funky', 'string']],
|
||||
lower_case=True
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def check_basic_tokenizer(first, last, expected_tokens, lower_case=False, keep_whitespace=False,
|
||||
normalization_form=nlp.utils.NormalizeForm.NONE, preserve_unused_token=False):
|
||||
dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
|
||||
if first > 1:
|
||||
dataset = dataset.skip(first - 1)
|
||||
if last >= first:
|
||||
dataset = dataset.take(last - first + 1)
|
||||
|
||||
basic_tokenizer = nlp.BasicTokenizer(lower_case=lower_case,
|
||||
keep_whitespace=keep_whitespace,
|
||||
normalization_form=normalization_form,
|
||||
preserve_unused_token=preserve_unused_token)
|
||||
|
||||
dataset = dataset.map(operations=basic_tokenizer)
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text'])
|
||||
logger.info("Out:", text)
|
||||
logger.info("Exp:", expected_tokens[count])
|
||||
np.testing.assert_array_equal(text, expected_tokens[count])
|
||||
count = count + 1
|
||||
|
||||
|
||||
def test_basic_tokenizer():
|
||||
"""
|
||||
Test BasicTokenizer
|
||||
"""
|
||||
for paras in test_paras:
|
||||
check_basic_tokenizer(**paras)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_basic_tokenizer()
|
|
@ -1,238 +0,0 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
import numpy as np
|
||||
import mindspore.dataset as ds
|
||||
from mindspore.dataset.text import JiebaTokenizer
|
||||
from mindspore.dataset.text import JiebaMode, to_str
|
||||
|
||||
DATA_FILE = "../data/dataset/testJiebaDataset/3.txt"
|
||||
DATA_ALL_FILE = "../data/dataset/testJiebaDataset/*"
|
||||
|
||||
HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8"
|
||||
MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8"
|
||||
|
||||
|
||||
def test_jieba_1():
|
||||
"""Test jieba tokenizer with MP mode"""
|
||||
data = ds.TextFileDataset(DATA_FILE)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
|
||||
ret = []
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_1_1():
|
||||
"""Test jieba tokenizer with HMM mode"""
|
||||
data = ds.TextFileDataset(DATA_FILE)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_1_2():
|
||||
"""Test jieba tokenizer with HMM MIX"""
|
||||
data = ds.TextFileDataset(DATA_FILE)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_2():
|
||||
"""Test add_word"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
jieba_op.add_word("男默女泪")
|
||||
expect = ['男默女泪', '市', '长江大桥']
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=2)
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_2_1():
|
||||
"""Test add_word with freq"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
jieba_op.add_word("男默女泪", 10)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=2)
|
||||
expect = ['男默女泪', '市', '长江大桥']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_2_2():
|
||||
"""Test add_word with invalid None Input"""
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
try:
|
||||
jieba_op.add_word(None)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
|
||||
def test_jieba_2_3():
|
||||
"""Test add_word with freq, the value of freq affects the result of segmentation"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
jieba_op.add_word("江大桥", 20000)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=2)
|
||||
expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_3():
|
||||
"""Test add_dict with dict"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
|
||||
user_dict = {
|
||||
"男默女泪": 10
|
||||
}
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
jieba_op.add_dict(user_dict)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['男默女泪', '市', '长江大桥']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_3_1():
|
||||
"""Test add_dict with dict"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
|
||||
user_dict = {
|
||||
"男默女泪": 10,
|
||||
"江大桥": 20000
|
||||
}
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
jieba_op.add_dict(user_dict)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['男默女泪', '市长', '江大桥']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_4():
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
|
||||
DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"
|
||||
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
jieba_op.add_dict(DICT_FILE)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_4_1():
|
||||
"""Test add dict with invalid file path"""
|
||||
DICT_FILE = ""
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
try:
|
||||
jieba_op.add_dict(DICT_FILE)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
|
||||
def test_jieba_5():
|
||||
"""Test add dict with file path"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
|
||||
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
jieba_op.add_word("江大桥", 20000)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def gen():
|
||||
text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S')
|
||||
yield (text,)
|
||||
|
||||
|
||||
def pytoken_op(input_data):
|
||||
te = str(to_str(input_data))
|
||||
tokens = []
|
||||
tokens.append(te[:5].encode("UTF8"))
|
||||
tokens.append(te[5:10].encode("UTF8"))
|
||||
tokens.append(te[10:].encode("UTF8"))
|
||||
return np.array(tokens, dtype='S')
|
||||
|
||||
|
||||
def test_jieba_6():
|
||||
data = ds.GeneratorDataset(gen, column_names=["text"])
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=pytoken_op, num_parallel_workers=1)
|
||||
expect = ['今天天气太', '好了我们一', '起去外面玩吧']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_jieba_1()
|
||||
test_jieba_1_1()
|
||||
test_jieba_1_2()
|
||||
test_jieba_2()
|
||||
test_jieba_2_1()
|
||||
test_jieba_2_2()
|
||||
test_jieba_3()
|
||||
test_jieba_3_1()
|
||||
test_jieba_4()
|
||||
test_jieba_4_1()
|
||||
test_jieba_5()
|
||||
test_jieba_5()
|
||||
test_jieba_6()
|
|
@ -0,0 +1,138 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
Testing BasicTokenizer op in DE
|
||||
"""
|
||||
import numpy as np
|
||||
import mindspore.dataset as ds
|
||||
from mindspore import log as logger
|
||||
import mindspore.dataset.text as text
|
||||
|
||||
BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt"
|
||||
|
||||
test_paras = [
|
||||
dict(
|
||||
first=1,
|
||||
last=6,
|
||||
expected_tokens=
|
||||
[['Welcome', 'to', 'Beijing', '北', '京', '欢', '迎', '您'],
|
||||
['長', '風', '破', '浪', '會', '有', '時', ',', '直', '掛', '雲', '帆', '濟', '滄', '海'],
|
||||
['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'],
|
||||
['明', '朝', '(', '1368', '—', '1644', '年', ')', '和', '清', '朝',
|
||||
'(', '1644', '—', '1911', '年', ')', ',', '是', '中', '国', '封',
|
||||
'建', '王', '朝', '史', '上', '最', '后', '两', '个', '朝', '代'],
|
||||
['明', '代', '(', '1368', '-', '1644', ')', 'と', '清', '代',
|
||||
'(', '1644', '-', '1911', ')', 'は', '、', '中', '国', 'の', '封',
|
||||
'建', '王', '朝', 'の', '歴', '史', 'における', '最', '後', 'の2つの', '王', '朝', 'でした'],
|
||||
['명나라', '(', '1368', '-', '1644', ')', '와', '청나라', '(', '1644', '-', '1911', ')', '는',
|
||||
'중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']],
|
||||
expected_offsets_start=[[0, 8, 11, 18, 21, 24, 27, 30],
|
||||
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42],
|
||||
[0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37],
|
||||
[0, 3, 6, 9, 13, 16, 20, 23, 26, 29, 32, 35, 38, 42, 45, 49,
|
||||
52, 55, 58, 61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100],
|
||||
[0, 3, 6, 9, 13, 14, 18, 21, 24, 27, 30, 33, 37, 38, 42, 45, 48, 51,
|
||||
54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 93, 96, 99, 109, 112, 115],
|
||||
[0, 10, 11, 15, 16, 20, 21, 25, 35, 36, 40, 41, 45, 46, 50, 57, 64, 74, 87, 97, 101]],
|
||||
expected_offsets_limit=[[7, 10, 18, 21, 24, 27, 30, 33],
|
||||
[3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45],
|
||||
[4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40],
|
||||
[3, 6, 9, 13, 16, 20, 23, 26, 29, 32, 35, 38, 42, 45, 49, 52, 55, 58,
|
||||
61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 103],
|
||||
[3, 6, 9, 13, 14, 18, 21, 24, 27, 30, 33, 37, 38, 42, 45, 48, 51, 54,
|
||||
57, 60, 63, 66, 69, 72, 75, 78, 81, 93, 96, 99, 109, 112, 115, 124],
|
||||
[9, 11, 15, 16, 20, 21, 24, 34, 36, 40, 41, 45, 46, 49, 56, 63, 73, 86, 96, 100, 113]]
|
||||
),
|
||||
dict(
|
||||
first=7,
|
||||
last=7,
|
||||
expected_tokens=[['this', 'is', 'a', 'funky', 'string']],
|
||||
expected_offsets_start=[[0, 5, 8, 10, 16]],
|
||||
expected_offsets_limit=[[4, 7, 9, 15, 22]],
|
||||
lower_case=True
|
||||
),
|
||||
]
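The expected offsets above are byte indices into each UTF-8 encoded input line. A minimal sanity check for the first line (its text is inferred from the expected tokens, not read from basic_tokenizer.txt):

line1 = "Welcome to Beijing北京欢迎您".encode("utf-8")
assert line1[0:7] == b"Welcome"                 # offsets_start=0, offsets_limit=7
assert line1[18:21].decode("utf-8") == "北"      # offsets_start=18, offsets_limit=21 (3-byte character)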
|
||||
|
||||
|
||||
def check_basic_tokenizer_default(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit,
|
||||
lower_case=False, keep_whitespace=False,
|
||||
normalization_form=text.utils.NormalizeForm.NONE, preserve_unused_token=False):
|
||||
dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
|
||||
if first > 1:
|
||||
dataset = dataset.skip(first - 1)
|
||||
if last >= first:
|
||||
dataset = dataset.take(last - first + 1)
|
||||
|
||||
basic_tokenizer = text.BasicTokenizer(lower_case=lower_case,
|
||||
keep_whitespace=keep_whitespace,
|
||||
normalization_form=normalization_form,
|
||||
preserve_unused_token=preserve_unused_token)
|
||||
|
||||
dataset = dataset.map(operations=basic_tokenizer)
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['text'])
|
||||
logger.info("Out:", token)
|
||||
logger.info("Exp:", expected_tokens[count])
|
||||
np.testing.assert_array_equal(token, expected_tokens[count])
|
||||
count = count + 1
|
||||
|
||||
|
||||
def check_basic_tokenizer_with_offsets(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit,
                                       lower_case=False, keep_whitespace=False,
                                       normalization_form=text.utils.NormalizeForm.NONE, preserve_unused_token=False):
    dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)

    basic_tokenizer = text.BasicTokenizer(lower_case=lower_case,
                                          keep_whitespace=keep_whitespace,
                                          normalization_form=normalization_form,
                                          preserve_unused_token=preserve_unused_token,
                                          with_offsets=True)

    dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'], operations=basic_tokenizer)
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token'])
        logger.info("Out:", token)
        logger.info("Exp:", expected_tokens[count])
        np.testing.assert_array_equal(token, expected_tokens[count])
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count = count + 1

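
# Note (added for clarity, not part of the original commit): with with_offsets=True the
# tokenizer produces three output tensors per row, which is why the map() call above has to
# rename the single 'text' input column to 'token', 'offsets_start' and 'offsets_limit' and
# pin the column order.
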
def test_basic_tokenizer_with_offsets():
    """
    Test BasicTokenizer when with_offsets=True
    """
    for paras in test_paras:
        check_basic_tokenizer_with_offsets(**paras)


def test_basic_tokenizer_default():
    """
    Test BasicTokenizer when with_offsets=False
    """
    for paras in test_paras:
        check_basic_tokenizer_default(**paras)


if __name__ == '__main__':
    test_basic_tokenizer_default()
    test_basic_tokenizer_with_offsets()
@@ -18,7 +18,7 @@ Testing BertTokenizer op in DE
|
|||
import numpy as np
|
||||
import mindspore.dataset as ds
|
||||
from mindspore import log as logger
|
||||
import mindspore.dataset.text as nlp
|
||||
import mindspore.dataset.text as text
|
||||
|
||||
BERT_TOKENIZER_FILE = "../data/dataset/testTokenizerData/bert_tokenizer.txt"
|
||||
|
||||
|
@ -39,6 +39,14 @@ test_paras = [
|
|||
['疑', '是', '地', '上', '霜'],
|
||||
['举', '头', '望', '明', '月'],
|
||||
['低', '头', '思', '故', '乡']],
|
||||
expected_offsets_start=[[0, 3, 6, 9, 12],
|
||||
[0, 3, 6, 9, 12],
|
||||
[0, 3, 6, 9, 12],
|
||||
[0, 3, 6, 9, 12]],
|
||||
expected_offsets_limit=[[3, 6, 9, 12, 15],
|
||||
[3, 6, 9, 12, 15],
|
||||
[3, 6, 9, 12, 15],
|
||||
[3, 6, 9, 12, 15]],
|
||||
vocab_list=vocab_bert
|
||||
),
|
||||
# test english text
|
||||
|
@ -46,6 +54,8 @@ test_paras = [
|
|||
first=5,
|
||||
last=5,
|
||||
expect_str=[['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
|
||||
expected_offsets_start=[[0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]],
|
||||
expected_offsets_limit=[[1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]],
|
||||
lower_case=True,
|
||||
vocab_list=vocab_bert
|
||||
),
|
||||
|
@ -53,6 +63,8 @@ test_paras = [
|
|||
first=5,
|
||||
last=5,
|
||||
expect_str=[['I', "am", 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
|
||||
expected_offsets_start=[[0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]],
|
||||
expected_offsets_limit=[[1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]],
|
||||
lower_case=False,
|
||||
vocab_list=vocab_bert
|
||||
),
|
||||
|
@ -63,7 +75,9 @@ test_paras = [
|
|||
expect_str=[
|
||||
['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'],
|
||||
['繁', '體', '字']],
|
||||
normalization_form=nlp.utils.NormalizeForm.NFKC,
|
||||
expected_offsets_start=[[0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37], [0, 3, 6]],
|
||||
expected_offsets_limit=[[4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40], [3, 6, 9]],
|
||||
normalization_form=text.utils.NormalizeForm.NFKC,
|
||||
vocab_list=vocab_bert
|
||||
),
|
||||
# test preserved tokens
|
||||
|
@ -79,6 +93,8 @@ test_paras = [
|
|||
['[unused1]'],
|
||||
['[unused10]']
|
||||
],
|
||||
expected_offsets_start=[[0, 7], [0, 7], [0, 7], [0, 7], [0, 7], [0], [0]],
|
||||
expected_offsets_limit=[[6, 12], [6, 12], [6, 12], [6, 12], [6, 13], [9], [10]],
|
||||
lower_case=False,
|
||||
vocab_list=vocab_bert,
|
||||
preserve_unused_token=True,
|
||||
|
@ -95,6 +111,8 @@ test_paras = [
|
|||
['[unused1]'],
|
||||
['[unused10]']
|
||||
],
|
||||
expected_offsets_start=[[0, 7], [0, 7], [0, 7], [0, 7], [0, 7], [0], [0]],
|
||||
expected_offsets_limit=[[6, 12], [6, 12], [6, 12], [6, 12], [6, 13], [9], [10]],
|
||||
lower_case=True,
|
||||
vocab_list=vocab_bert,
|
||||
preserve_unused_token=True,
|
||||
|
@ -104,6 +122,8 @@ test_paras = [
|
|||
first=15,
|
||||
last=15,
|
||||
expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']],
|
||||
expected_offsets_start=[[0, 2, 3, 4, 5, 7, 8, 10, 11, 12]],
|
||||
expected_offsets_limit=[[2, 3, 4, 5, 7, 8, 10, 11, 12, 14]],
|
||||
preserve_unused_token=True,
|
||||
vocab_list=vocab_bert
|
||||
),
|
||||
|
@ -112,6 +132,8 @@ test_paras = [
|
|||
first=8,
|
||||
last=8,
|
||||
expect_str=[['[UNK]', ' ', '[CLS]']],
|
||||
expected_offsets_start=[[0, 6, 7]],
|
||||
expected_offsets_limit=[[6, 7, 12]],
|
||||
lower_case=False,
|
||||
vocab_list=vocab_bert,
|
||||
preserve_unused_token=True,
|
||||
|
@ -121,6 +143,8 @@ test_paras = [
|
|||
first=8,
|
||||
last=8,
|
||||
expect_str=[['unused', ' ', '[CLS]']],
|
||||
expected_offsets_start=[[0, 6, 7]],
|
||||
expected_offsets_limit=[[6, 7, 12]],
|
||||
lower_case=False,
|
||||
vocab_list=vocab_bert,
|
||||
preserve_unused_token=True,
|
||||
|
@ -131,6 +155,8 @@ test_paras = [
|
|||
first=8,
|
||||
last=8,
|
||||
expect_str=[['unused', ' ', '[', 'CLS', ']']],
|
||||
expected_offsets_start=[[0, 6, 7, 8, 11]],
|
||||
expected_offsets_limit=[[6, 7, 8, 11, 12]],
|
||||
lower_case=False,
|
||||
vocab_list=vocab_bert,
|
||||
preserve_unused_token=False,
|
||||
|
@ -140,20 +166,20 @@ test_paras = [
|
|||
]
|
||||
|
||||
|
||||
def check_bert_tokenizer(first, last, expect_str,
|
||||
vocab_list,
|
||||
suffix_indicator='##',
|
||||
max_bytes_per_token=100, unknown_token='[UNK]',
|
||||
lower_case=False, keep_whitespace=False,
|
||||
normalization_form=nlp.utils.NormalizeForm.NONE,
|
||||
preserve_unused_token=False):
|
||||
def check_bert_tokenizer_default(first, last, expect_str,
|
||||
expected_offsets_start, expected_offsets_limit,
|
||||
vocab_list, suffix_indicator='##',
|
||||
max_bytes_per_token=100, unknown_token='[UNK]',
|
||||
lower_case=False, keep_whitespace=False,
|
||||
normalization_form=text.utils.NormalizeForm.NONE,
|
||||
preserve_unused_token=False):
|
||||
dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
|
||||
if first > 1:
|
||||
dataset = dataset.skip(first - 1)
|
||||
if last >= first:
|
||||
dataset = dataset.take(last - first + 1)
|
||||
vocab = nlp.Vocab.from_list(vocab_list)
|
||||
tokenizer_op = nlp.BertTokenizer(
|
||||
vocab = text.Vocab.from_list(vocab_list)
|
||||
tokenizer_op = text.BertTokenizer(
|
||||
vocab=vocab, suffix_indicator=suffix_indicator,
|
||||
max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token,
|
||||
lower_case=lower_case, keep_whitespace=keep_whitespace,
|
||||
|
@ -162,20 +188,59 @@ def check_bert_tokenizer(first, last, expect_str,
|
|||
dataset = dataset.map(operations=tokenizer_op)
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text'])
|
||||
logger.info("Out:", text)
|
||||
token = text.to_str(i['text'])
|
||||
logger.info("Out:", token)
|
||||
logger.info("Exp:", expect_str[count])
|
||||
np.testing.assert_array_equal(text, expect_str[count])
|
||||
np.testing.assert_array_equal(token, expect_str[count])
|
||||
count = count + 1
|
||||
|
||||
|
||||
def test_bert_tokenizer():
|
||||
def check_bert_tokenizer_with_offsets(first, last, expect_str,
|
||||
expected_offsets_start, expected_offsets_limit,
|
||||
vocab_list, suffix_indicator='##',
|
||||
max_bytes_per_token=100, unknown_token='[UNK]',
|
||||
lower_case=False, keep_whitespace=False,
|
||||
normalization_form=text.utils.NormalizeForm.NONE,
|
||||
preserve_unused_token=False):
|
||||
dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
|
||||
if first > 1:
|
||||
dataset = dataset.skip(first - 1)
|
||||
if last >= first:
|
||||
dataset = dataset.take(last - first + 1)
|
||||
vocab = text.Vocab.from_list(vocab_list)
|
||||
tokenizer_op = text.BertTokenizer(
|
||||
vocab=vocab, suffix_indicator=suffix_indicator, max_bytes_per_token=max_bytes_per_token,
|
||||
unknown_token=unknown_token, lower_case=lower_case, keep_whitespace=keep_whitespace,
|
||||
normalization_form=normalization_form, preserve_unused_token=preserve_unused_token, with_offsets=True)
|
||||
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
|
||||
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op)
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['token'])
|
||||
logger.info("Out:", token)
|
||||
logger.info("Exp:", expect_str[count])
|
||||
np.testing.assert_array_equal(token, expect_str[count])
|
||||
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
|
||||
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
|
||||
count = count + 1
|
||||
|
||||
|
||||
def test_bert_tokenizer_default():
|
||||
"""
|
||||
Test WordpieceTokenizer
|
||||
Test WordpieceTokenizer when with_offsets=False
|
||||
"""
|
||||
for paras in test_paras:
|
||||
check_bert_tokenizer(**paras)
|
||||
check_bert_tokenizer_default(**paras)
|
||||
|
||||
|
||||
def test_bert_tokenizer_with_offsets():
|
||||
"""
|
||||
Test WordpieceTokenizer when with_offsets=True
|
||||
"""
|
||||
for paras in test_paras:
|
||||
check_bert_tokenizer_with_offsets(**paras)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_bert_tokenizer()
|
||||
test_bert_tokenizer_default()
|
||||
test_bert_tokenizer_with_offsets()
|
|
@@ -0,0 +1,471 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
import numpy as np
|
||||
import mindspore.dataset as ds
|
||||
from mindspore.dataset.text import JiebaTokenizer
|
||||
from mindspore.dataset.text import JiebaMode, to_str
|
||||
|
||||
DATA_FILE = "../data/dataset/testJiebaDataset/3.txt"
|
||||
DATA_ALL_FILE = "../data/dataset/testJiebaDataset/*"
|
||||
|
||||
HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8"
|
||||
MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8"
|
||||
|
||||
|
||||
def test_jieba_1():
|
||||
"""Test jieba tokenizer with MP mode"""
|
||||
data = ds.TextFileDataset(DATA_FILE)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
|
||||
ret = []
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_1_1():
|
||||
"""Test jieba tokenizer with HMM mode"""
|
||||
data = ds.TextFileDataset(DATA_FILE)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_1_2():
|
||||
"""Test jieba tokenizer with HMM MIX"""
|
||||
data = ds.TextFileDataset(DATA_FILE)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_2():
|
||||
"""Test add_word"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
jieba_op.add_word("男默女泪")
|
||||
expect = ['男默女泪', '市', '长江大桥']
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=2)
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_2_1():
|
||||
"""Test add_word with freq"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
jieba_op.add_word("男默女泪", 10)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=2)
|
||||
expect = ['男默女泪', '市', '长江大桥']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_2_2():
|
||||
"""Test add_word with invalid None Input"""
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
try:
|
||||
jieba_op.add_word(None)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
|
||||
def test_jieba_2_3():
|
||||
"""Test add_word with freq, the value of freq affects the result of segmentation"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
jieba_op.add_word("江大桥", 20000)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=2)
|
||||
expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_3():
|
||||
"""Test add_dict with dict"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
|
||||
user_dict = {
|
||||
"男默女泪": 10
|
||||
}
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
jieba_op.add_dict(user_dict)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['男默女泪', '市', '长江大桥']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_3_1():
|
||||
"""Test add_dict with dict"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
|
||||
user_dict = {
|
||||
"男默女泪": 10,
|
||||
"江大桥": 20000
|
||||
}
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
jieba_op.add_dict(user_dict)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['男默女泪', '市长', '江大桥']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_4():
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
|
||||
DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"
|
||||
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
jieba_op.add_dict(DICT_FILE)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_4_1():
|
||||
"""Test add dict with invalid file path"""
|
||||
DICT_FILE = ""
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
try:
|
||||
jieba_op.add_dict(DICT_FILE)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
|
||||
def test_jieba_5():
|
||||
"""Test add dict with file path"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
|
||||
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
|
||||
jieba_op.add_word("江大桥", 20000)
|
||||
data = data.map(input_columns=["text"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["text"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
|
||||
|
||||
def test_jieba_with_offsets_1():
    """Test jieba tokenizer with MP mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
                    columns_order=["token", "offsets_start", "offsets_limit"],
                    operations=jieba_op, num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
    ret = []
    for i in data.create_dict_iterator():
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]

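
# Note (added for clarity, not part of the original commit): the offsets are byte positions
# in the UTF-8 encoded sentence, so every CJK character advances them by 3. The first token
# above, '今天天气', therefore spans bytes [0, 12), and the final limit equals the byte length
# of the whole line:
#
#   assert len('今天天气'.encode('utf-8')) == 12
#   assert len('今天天气太好了我们一起去外面玩吧'.encode('utf-8')) == 48
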
def test_jieba_with_offsets_1_1():
|
||||
"""Test jieba tokenizer with HMM mode"""
|
||||
data = ds.TextFileDataset(DATA_FILE)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM, with_offsets=True)
|
||||
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
columns_order=["token", "offsets_start", "offsets_limit"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
|
||||
expected_offsets_start = [0, 6, 12, 15, 18, 21, 27, 33, 36, 42, 45]
|
||||
expected_offsets_limit = [6, 12, 15, 18, 21, 27, 33, 36, 42, 45, 48]
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["token"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
for index, item in enumerate(i["offsets_start"]):
|
||||
assert item == expected_offsets_start[index]
|
||||
for index, item in enumerate(i["offsets_limit"]):
|
||||
assert item == expected_offsets_limit[index]
|
||||
|
||||
|
||||
def test_jieba_with_offsets_1_2():
|
||||
"""Test jieba tokenizer with HMM MIX"""
|
||||
data = ds.TextFileDataset(DATA_FILE)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX, with_offsets=True)
|
||||
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
columns_order=["token", "offsets_start", "offsets_limit"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
|
||||
expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
|
||||
expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["token"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
for index, item in enumerate(i["offsets_start"]):
|
||||
assert item == expected_offsets_start[index]
|
||||
for index, item in enumerate(i["offsets_limit"]):
|
||||
assert item == expected_offsets_limit[index]
|
||||
|
||||
|
||||
def test_jieba_with_offsets_2():
|
||||
"""Test add_word"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
|
||||
jieba_op.add_word("男默女泪")
|
||||
expect = ['男默女泪', '市', '长江大桥']
|
||||
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
columns_order=["token", "offsets_start", "offsets_limit"],
|
||||
operations=jieba_op, num_parallel_workers=2)
|
||||
expected_offsets_start = [0, 12, 15]
|
||||
expected_offsets_limit = [12, 15, 27]
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["token"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
for index, item in enumerate(i["offsets_start"]):
|
||||
assert item == expected_offsets_start[index]
|
||||
for index, item in enumerate(i["offsets_limit"]):
|
||||
assert item == expected_offsets_limit[index]
|
||||
|
||||
|
||||
def test_jieba_with_offsets_2_1():
|
||||
"""Test add_word with freq"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
|
||||
jieba_op.add_word("男默女泪", 10)
|
||||
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
columns_order=["token", "offsets_start", "offsets_limit"],
|
||||
operations=jieba_op, num_parallel_workers=2)
|
||||
expect = ['男默女泪', '市', '长江大桥']
|
||||
expected_offsets_start = [0, 12, 15]
|
||||
expected_offsets_limit = [12, 15, 27]
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["token"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
for index, item in enumerate(i["offsets_start"]):
|
||||
assert item == expected_offsets_start[index]
|
||||
for index, item in enumerate(i["offsets_limit"]):
|
||||
assert item == expected_offsets_limit[index]
|
||||
|
||||
|
||||
def test_jieba_with_offsets_2_2():
|
||||
"""Test add_word with freq, the value of freq affects the result of segmentation"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
|
||||
jieba_op.add_word("江大桥", 20000)
|
||||
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
columns_order=["token", "offsets_start", "offsets_limit"],
|
||||
operations=jieba_op, num_parallel_workers=2)
|
||||
expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
|
||||
expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
|
||||
expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["token"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
for index, item in enumerate(i["offsets_start"]):
|
||||
assert item == expected_offsets_start[index]
|
||||
for index, item in enumerate(i["offsets_limit"]):
|
||||
assert item == expected_offsets_limit[index]
|
||||
|
||||
|
||||
def test_jieba_with_offsets_3():
|
||||
"""Test add_dict with dict"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
|
||||
user_dict = {
|
||||
"男默女泪": 10
|
||||
}
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
|
||||
jieba_op.add_dict(user_dict)
|
||||
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
columns_order=["token", "offsets_start", "offsets_limit"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['男默女泪', '市', '长江大桥']
|
||||
expected_offsets_start = [0, 12, 15]
|
||||
expected_offsets_limit = [12, 15, 27]
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["token"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
for index, item in enumerate(i["offsets_start"]):
|
||||
assert item == expected_offsets_start[index]
|
||||
for index, item in enumerate(i["offsets_limit"]):
|
||||
assert item == expected_offsets_limit[index]
|
||||
|
||||
|
||||
def test_jieba_with_offsets_3_1():
|
||||
"""Test add_dict with dict"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
|
||||
user_dict = {
|
||||
"男默女泪": 10,
|
||||
"江大桥": 20000
|
||||
}
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
|
||||
jieba_op.add_dict(user_dict)
|
||||
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
columns_order=["token", "offsets_start", "offsets_limit"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['男默女泪', '市长', '江大桥']
|
||||
expected_offsets_start = [0, 12, 18]
|
||||
expected_offsets_limit = [12, 18, 27]
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["token"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
for index, item in enumerate(i["offsets_start"]):
|
||||
assert item == expected_offsets_start[index]
|
||||
for index, item in enumerate(i["offsets_limit"]):
|
||||
assert item == expected_offsets_limit[index]
|
||||
|
||||
|
||||
def test_jieba_with_offsets_4():
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
|
||||
DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"
|
||||
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
|
||||
jieba_op.add_dict(DICT_FILE)
|
||||
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
columns_order=["token", "offsets_start", "offsets_limit"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
|
||||
expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
|
||||
expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["token"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
for index, item in enumerate(i["offsets_start"]):
|
||||
assert item == expected_offsets_start[index]
|
||||
for index, item in enumerate(i["offsets_limit"]):
|
||||
assert item == expected_offsets_limit[index]
|
||||
|
||||
|
||||
def test_jieba_with_offsets_5():
|
||||
"""Test add dict with file path"""
|
||||
DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
|
||||
|
||||
data = ds.TextFileDataset(DATA_FILE4)
|
||||
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
|
||||
jieba_op.add_word("江大桥", 20000)
|
||||
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
|
||||
columns_order=["token", "offsets_start", "offsets_limit"],
|
||||
operations=jieba_op, num_parallel_workers=1)
|
||||
expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
|
||||
expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
|
||||
expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
|
||||
for i in data.create_dict_iterator():
|
||||
ret = to_str(i["token"])
|
||||
for index, item in enumerate(ret):
|
||||
assert item == expect[index]
|
||||
for index, item in enumerate(i["offsets_start"]):
|
||||
assert item == expected_offsets_start[index]
|
||||
for index, item in enumerate(i["offsets_limit"]):
|
||||
assert item == expected_offsets_limit[index]
|
||||
|
||||
def gen():
    text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S')
    yield (text,)


def pytoken_op(input_data):
    te = str(to_str(input_data))
    tokens = []
    tokens.append(te[:5].encode("UTF8"))
    tokens.append(te[5:10].encode("UTF8"))
    tokens.append(te[10:].encode("UTF8"))
    return np.array(tokens, dtype='S')


def test_jieba_6():
    """Test mapping a user-defined Python tokenizer over the dataset"""
    data = ds.GeneratorDataset(gen, column_names=["text"])
    data = data.map(input_columns=["text"],
                    operations=pytoken_op, num_parallel_workers=1)
    expect = ['今天天气太', '好了我们一', '起去外面玩吧']
    for i in data.create_dict_iterator():
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]

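
# Note (added for clarity, not part of the original commit): pytoken_op is a plain Python
# callable applied through map(), so the split tested above is positional rather than
# dictionary based; it cuts the decoded string after the 5th and 10th characters, which is
# why the expected "tokens" are fixed-width chunks instead of jieba segments:
#
#   s = "今天天气太好了我们一起去外面玩吧"
#   assert [s[:5], s[5:10], s[10:]] == ['今天天气太', '好了我们一', '起去外面玩吧']
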
if __name__ == "__main__":
    test_jieba_1()
    test_jieba_1_1()
    test_jieba_1_2()
    test_jieba_2()
    test_jieba_2_1()
    test_jieba_2_2()
    test_jieba_3()
    test_jieba_3_1()
    test_jieba_4()
    test_jieba_4_1()
    test_jieba_5()
    test_jieba_6()
    test_jieba_with_offsets_1()
    test_jieba_with_offsets_1_1()
    test_jieba_with_offsets_1_2()
    test_jieba_with_offsets_2()
    test_jieba_with_offsets_2_1()
    test_jieba_with_offsets_2_2()
    test_jieba_with_offsets_3()
    test_jieba_with_offsets_3_1()
    test_jieba_with_offsets_4()
    test_jieba_with_offsets_5()
@@ -0,0 +1,380 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
Testing UnicodeCharTokenizer op in DE
|
||||
"""
|
||||
import numpy as np
|
||||
import mindspore.dataset as ds
|
||||
from mindspore import log as logger
|
||||
import mindspore.dataset.text as text
|
||||
|
||||
DATA_FILE = "../data/dataset/testTokenizerData/1.txt"
|
||||
NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt"
|
||||
REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt"
|
||||
REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt"
|
||||
|
||||
|
||||
def split_by_unicode_char(input_strs):
|
||||
"""
|
||||
Split utf-8 strings to unicode characters
|
||||
"""
|
||||
out = []
|
||||
for s in input_strs:
|
||||
out.append([c for c in s])
|
||||
return out
|
||||
|
||||
|
||||
def test_unicode_char_tokenizer_default():
|
||||
"""
|
||||
Test UnicodeCharTokenizer
|
||||
"""
|
||||
input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", " ")
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
tokenizer = text.UnicodeCharTokenizer()
|
||||
dataset = dataset.map(operations=tokenizer)
|
||||
tokens = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['text']).tolist()
|
||||
tokens.append(token)
|
||||
logger.info("The out tokens is : {}".format(tokens))
|
||||
assert split_by_unicode_char(input_strs) == tokens
|
||||
|
||||
|
||||
def test_unicode_char_tokenizer_with_offsets():
    """
    Test UnicodeCharTokenizer
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer(with_offsets=True)
    dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
                              [0, 3, 6, 9, 12, 15], [0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16], [0, 1]]
    expected_offsets_limit = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                              [3, 6, 9, 12, 15, 18], [3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17], [1, 2]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is : {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens

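
# Illustrative helper (added for clarity, not part of the original commit): for
# UnicodeCharTokenizer the expected offsets can be derived directly from the input string,
# since each character spans len(c.encode('utf-8')) bytes (1 for ASCII, 3 for the CJK
# characters above).
def _char_byte_offsets(s):
    """Return the (start, limit) UTF-8 byte range of every character in s."""
    start, ranges = 0, []
    for c in s:
        width = len(c.encode('utf-8'))
        ranges.append((start, start + width))
        start += width
    return ranges
# e.g. _char_byte_offsets("北京欢迎您!") == [(0, 3), (3, 6), (6, 9), (9, 12), (12, 15), (15, 18)]
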
def test_whitespace_tokenizer_default():
|
||||
"""
|
||||
Test WhitespaceTokenizer
|
||||
"""
|
||||
whitespace_strs = [["Welcome", "to", "Beijing!"],
|
||||
["北京欢迎您!"],
|
||||
["我喜欢English!"],
|
||||
[""]]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
tokenizer = text.WhitespaceTokenizer()
|
||||
dataset = dataset.map(operations=tokenizer)
|
||||
tokens = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['text']).tolist()
|
||||
tokens.append(token)
|
||||
logger.info("The out tokens is : {}".format(tokens))
|
||||
assert whitespace_strs == tokens
|
||||
|
||||
|
||||
def test_whitespace_tokenizer_with_offsets():
|
||||
"""
|
||||
Test WhitespaceTokenizer
|
||||
"""
|
||||
whitespace_strs = [["Welcome", "to", "Beijing!"],
|
||||
["北京欢迎您!"],
|
||||
["我喜欢English!"],
|
||||
[""]]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
tokenizer = text.WhitespaceTokenizer(with_offsets=True)
|
||||
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
|
||||
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer)
|
||||
tokens = []
|
||||
expected_offsets_start = [[0, 8, 11], [0], [0], [0]]
|
||||
expected_offsets_limit = [[7, 10, 19], [18], [17], [0]]
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['token']).tolist()
|
||||
tokens.append(token)
|
||||
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
|
||||
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
|
||||
count += 1
|
||||
|
||||
logger.info("The out tokens is : {}".format(tokens))
|
||||
assert whitespace_strs == tokens
|
||||
|
||||
|
||||
def test_unicode_script_tokenizer_default():
|
||||
"""
|
||||
Test UnicodeScriptTokenizer when para keep_whitespace=False
|
||||
"""
|
||||
unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
|
||||
["北京欢迎您", "!"],
|
||||
["我喜欢", "English", "!"],
|
||||
[""]]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False)
|
||||
dataset = dataset.map(operations=tokenizer)
|
||||
|
||||
tokens = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['text']).tolist()
|
||||
tokens.append(token)
|
||||
logger.info("The out tokens is : {}".format(tokens))
|
||||
assert unicode_script_strs == tokens
|
||||
|
||||
|
||||
def test_unicode_script_tokenizer_default2():
|
||||
"""
|
||||
Test UnicodeScriptTokenizer when para keep_whitespace=True
|
||||
"""
|
||||
unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
|
||||
["北京欢迎您", "!"],
|
||||
["我喜欢", "English", "!"],
|
||||
[" "]]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True)
|
||||
dataset = dataset.map(operations=tokenizer)
|
||||
tokens = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['text']).tolist()
|
||||
tokens.append(token)
|
||||
logger.info("The out tokens is :", tokens)
|
||||
assert unicode_script_strs2 == tokens
|
||||
|
||||
|
||||
def test_unicode_script_tokenizer_with_offsets():
|
||||
"""
|
||||
Test UnicodeScriptTokenizer when para keep_whitespace=False and with_offsets=True
|
||||
"""
|
||||
unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
|
||||
["北京欢迎您", "!"],
|
||||
["我喜欢", "English", "!"],
|
||||
[""]]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False, with_offsets=True)
|
||||
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
|
||||
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer)
|
||||
tokens = []
|
||||
expected_offsets_start = [[0, 8, 11, 18], [0, 15], [0, 9, 16], [0]]
|
||||
expected_offsets_limit = [[7, 10, 18, 19], [15, 18], [9, 16, 17], [0]]
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['token']).tolist()
|
||||
tokens.append(token)
|
||||
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
|
||||
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
|
||||
count += 1
|
||||
logger.info("The out tokens is : {}".format(tokens))
|
||||
assert unicode_script_strs == tokens
|
||||
|
||||
|
||||
def test_unicode_script_tokenizer_with_offsets2():
|
||||
"""
|
||||
Test UnicodeScriptTokenizer when para keep_whitespace=True and with_offsets=True
|
||||
"""
|
||||
unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
|
||||
["北京欢迎您", "!"],
|
||||
["我喜欢", "English", "!"],
|
||||
[" "]]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
|
||||
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
|
||||
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer)
|
||||
tokens = []
|
||||
expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]]
|
||||
expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17], [2]]
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['token']).tolist()
|
||||
tokens.append(token)
|
||||
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
|
||||
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
|
||||
count += 1
|
||||
logger.info("The out tokens is :", tokens)
|
||||
assert unicode_script_strs2 == tokens
|
||||
|
||||
|
||||
def test_case_fold():
|
||||
"""
|
||||
Test CaseFold
|
||||
"""
|
||||
expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
op = text.CaseFold()
|
||||
dataset = dataset.map(operations=op)
|
||||
|
||||
lower_strs = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['text']).tolist()
|
||||
lower_strs.append(token)
|
||||
assert lower_strs == expect_strs
|
||||
|
||||
|
||||
def test_normalize_utf8():
|
||||
"""
|
||||
Test NormalizeUTF8
|
||||
"""
|
||||
|
||||
def normalize(normalize_form):
|
||||
dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False)
|
||||
normalize = text.NormalizeUTF8(normalize_form=normalize_form)
|
||||
dataset = dataset.map(operations=normalize)
|
||||
out_bytes = []
|
||||
out_texts = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
out_bytes.append(i['text'])
|
||||
out_texts.append(text.to_str(i['text']).tolist())
|
||||
logger.info("The out bytes is : ", out_bytes)
|
||||
logger.info("The out texts is: ", out_texts)
|
||||
return out_bytes
|
||||
|
||||
expect_normlize_data = [
|
||||
# NFC
|
||||
[b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
|
||||
b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'],
|
||||
# NFKC
|
||||
[b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
|
||||
b'fi', b'25', b'\xe1\xb9\xa9'],
|
||||
# NFD
|
||||
[b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
|
||||
b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'],
|
||||
# NFKD
|
||||
[b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
|
||||
b'fi', b'25', b's\xcc\xa3\xcc\x87']
|
||||
]
|
||||
assert normalize(text.utils.NormalizeForm.NFC) == expect_normlize_data[0]
|
||||
assert normalize(text.utils.NormalizeForm.NFKC) == expect_normlize_data[1]
|
||||
assert normalize(text.utils.NormalizeForm.NFD) == expect_normlize_data[2]
|
||||
assert normalize(text.utils.NormalizeForm.NFKD) == expect_normlize_data[3]
|
||||
|
||||
|
||||
def test_regex_replace():
|
||||
"""
|
||||
Test RegexReplace
|
||||
"""
|
||||
|
||||
def regex_replace(first, last, expect_str, pattern, replace):
|
||||
dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False)
|
||||
if first > 1:
|
||||
dataset = dataset.skip(first - 1)
|
||||
if last >= first:
|
||||
dataset = dataset.take(last - first + 1)
|
||||
replace_op = text.RegexReplace(pattern, replace)
|
||||
dataset = dataset.map(operations=replace_op)
|
||||
out_text = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['text']).tolist()
|
||||
out_text.append(token)
|
||||
logger.info("Out:", out_text)
|
||||
logger.info("Exp:", expect_str)
|
||||
assert expect_str == out_text
|
||||
|
||||
regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_')
|
||||
regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "")
|
||||
regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "")
|
||||
regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "")
|
||||
|
||||
|
||||
def test_regex_tokenizer_default():
|
||||
"""
|
||||
Test RegexTokenizer
|
||||
"""
|
||||
|
||||
def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern):
|
||||
dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
|
||||
if first > 1:
|
||||
dataset = dataset.skip(first - 1)
|
||||
if last >= first:
|
||||
dataset = dataset.take(last - first + 1)
|
||||
tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern)
|
||||
dataset = dataset.map(operations=tokenizer_op)
|
||||
out_text = []
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['text']).tolist()
|
||||
np.testing.assert_array_equal(token, expect_str[count])
|
||||
count += 1
|
||||
out_text.append(token)
|
||||
logger.info("Out:", out_text)
|
||||
logger.info("Exp:", expect_str)
|
||||
|
||||
regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "")
|
||||
regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+")
|
||||
regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}")
|
||||
regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
|
||||
regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "")
|
||||
regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "")
|
||||
|
||||
|
||||
def test_regex_tokenizer_with_offsets():
|
||||
"""
|
||||
Test RegexTokenizer
|
||||
"""
|
||||
|
||||
def regex_tokenizer(first, last, expect_str, expected_offsets_start, expected_offsets_limit, delim_pattern,
|
||||
keep_delim_pattern):
|
||||
dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
|
||||
if first > 1:
|
||||
dataset = dataset.skip(first - 1)
|
||||
if last >= first:
|
||||
dataset = dataset.take(last - first + 1)
|
||||
tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
|
||||
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
|
||||
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op)
|
||||
out_text = []
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['token']).tolist()
|
||||
np.testing.assert_array_equal(token, expect_str[count])
|
||||
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
|
||||
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
|
||||
count += 1
|
||||
out_text.append(token)
|
||||
logger.info("Out:", out_text)
|
||||
logger.info("Exp:", expect_str)
|
||||
|
||||
regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], [[0, 8, 11]], [[7, 10, 20]], "\\s+", "")
|
||||
regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], [[0, 7, 8, 10, 11]], [[7, 8, 10, 11, 20]],
|
||||
"\\s+", "\\s+")
|
||||
regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], [[0, 3, 6, 9, 12, 15]],
|
||||
[[3, 6, 9, 12, 15, 35]], r"\p{Han}", r"\p{Han}")
|
||||
regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], [[0, 2, 6, 8]], [[2, 6, 8, 13]],
|
||||
r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
|
||||
regex_tokenizer(3, 3, [['12', '36']], [[0, 6]], [[2, 8]], r"[\p{P}|\p{S}]+", "")
|
||||
regex_tokenizer(3, 3, [['¥+', '¥=?']], [[2, 8]], [[6, 13]], r"[\p{N}]+", "")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_unicode_char_tokenizer_default()
|
||||
test_unicode_char_tokenizer_with_offsets()
|
||||
test_whitespace_tokenizer_default()
|
||||
test_whitespace_tokenizer_with_offsets()
|
||||
test_unicode_script_tokenizer_default()
|
||||
test_unicode_script_tokenizer_default2()
|
||||
test_unicode_script_tokenizer_with_offsets()
|
||||
test_unicode_script_tokenizer_with_offsets2()
|
||||
test_case_fold()
|
||||
test_normalize_utf8()
|
||||
test_regex_replace()
|
||||
test_regex_tokenizer_default()
|
||||
test_regex_tokenizer_with_offsets()
|
|
@@ -0,0 +1,160 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
Testing WordpieceTokenizer op in DE
|
||||
"""
|
||||
import numpy as np
|
||||
import mindspore.dataset as ds
|
||||
from mindspore import log as logger
|
||||
import mindspore.dataset.text as text
|
||||
|
||||
WORDPIECE_TOKENIZER_FILE = "../data/dataset/testTokenizerData/wordpiece_tokenizer.txt"
|
||||
|
||||
vocab_english = [
|
||||
"book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"
|
||||
]
|
||||
|
||||
vocab_chinese = [
|
||||
"我", '最', '喜', '欢', '的', '书', '是', '霍', '乱', '时', '期', '爱', '情'
|
||||
]
|
||||
|
||||
vocab_mix = vocab_chinese + vocab_english
|
||||
|
||||
test_paras = [
|
||||
dict(
|
||||
first=1,
|
||||
last=10,
|
||||
expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
|
||||
['era'], ['[UNK]']],
|
||||
expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0]],
|
||||
expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4]],
|
||||
vocab_list=vocab_english
|
||||
),
|
||||
dict(
|
||||
first=1,
|
||||
last=10,
|
||||
expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
|
||||
['era'], ['what']],
|
||||
expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0]],
|
||||
expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4]],
|
||||
vocab_list=vocab_english,
|
||||
unknown_token=""
|
||||
),
|
||||
dict(
|
||||
first=1,
|
||||
last=10,
|
||||
expect_str=[['my'], ['[UNK]'], ['book'], ['is'], ['love'], ['[UNK]'], ['the'], ['[UNK]'], ['era'], ['[UNK]']],
|
||||
expected_offsets_start=[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
|
||||
expected_offsets_limit=[[2], [5], [4], [2], [4], [5], [3], [5], [3], [4]],
|
||||
vocab_list=vocab_english,
|
||||
max_bytes_per_token=4
|
||||
),
|
||||
dict(
|
||||
first=11,
|
||||
last=25,
|
||||
expect_str=[['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
|
||||
['[UNK]']],
|
||||
expected_offsets_start=[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
|
||||
expected_offsets_limit=[[3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3]],
|
||||
vocab_list=vocab_chinese,
|
||||
),
|
||||
dict(
|
||||
first=25,
|
||||
last=25,
|
||||
expect_str=[['您']],
|
||||
expected_offsets_start=[[0]],
|
||||
expected_offsets_limit=[[3]],
|
||||
vocab_list=vocab_chinese,
|
||||
unknown_token=""
|
||||
),
|
||||
dict(
|
||||
first=1,
|
||||
last=25,
|
||||
expect_str=[
|
||||
['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], ['era'],
|
||||
['[UNK]'],
|
||||
['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
|
||||
['[UNK]']],
|
||||
expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0],
|
||||
[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
|
||||
expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4],
|
||||
[3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3]],
|
||||
vocab_list=vocab_mix,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def check_wordpiece_tokenizer_default(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
|
||||
vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
|
||||
dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
|
||||
if first > 1:
|
||||
dataset = dataset.skip(first - 1)
|
||||
if last >= first:
|
||||
dataset = dataset.take(last - first + 1)
|
||||
vocab = text.Vocab.from_list(vocab_list)
|
||||
tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token,
|
||||
max_bytes_per_token=max_bytes_per_token)
|
||||
dataset = dataset.map(operations=tokenizer_op)
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['text'])
|
||||
logger.info("Out:", token)
|
||||
logger.info("Exp:", expect_str[count])
|
||||
np.testing.assert_array_equal(token, expect_str[count])
|
||||
count = count + 1
|
||||
|
||||
|
||||
def check_wordpiece_tokenizer_with_offsets(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
|
||||
vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
|
||||
dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
|
||||
if first > 1:
|
||||
dataset = dataset.skip(first - 1)
|
||||
if last >= first:
|
||||
dataset = dataset.take(last - first + 1)
|
||||
vocab = text.Vocab.from_list(vocab_list)
|
||||
tokenizer_op = text.WordpieceTokenizer(vocab=vocab, with_offsets=True, unknown_token=unknown_token,
|
||||
max_bytes_per_token=max_bytes_per_token)
|
||||
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
|
||||
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op)
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
token = text.to_str(i['token'])
|
||||
logger.info("Out:", token)
|
||||
logger.info("Exp:", expect_str[count])
|
||||
np.testing.assert_array_equal(token, expect_str[count])
|
||||
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
|
||||
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
|
||||
count = count + 1
|
||||
|
||||
|
||||
def test_wordpiece_tokenizer_default():
    """
    Test WordpieceTokenizer when with_offsets=False
    """
    for paras in test_paras:
        check_wordpiece_tokenizer_default(**paras)


def test_wordpiece_tokenizer_with_offsets():
    """
    Test WordpieceTokenizer when with_offsets=True
    """
    for paras in test_paras:
        check_wordpiece_tokenizer_with_offsets(**paras)


if __name__ == '__main__':
    test_wordpiece_tokenizer_default()
    test_wordpiece_tokenizer_with_offsets()
@@ -1,233 +0,0 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
Testing UnicodeCharTokenizer op in DE
|
||||
"""
|
||||
import numpy as np
|
||||
import mindspore.dataset as ds
|
||||
from mindspore import log as logger
|
||||
import mindspore.dataset.text as nlp
|
||||
|
||||
DATA_FILE = "../data/dataset/testTokenizerData/1.txt"
|
||||
NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt"
|
||||
REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt"
|
||||
REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt"
|
||||
|
||||
|
||||
def split_by_unicode_char(input_strs):
|
||||
"""
|
||||
Split utf-8 strings to unicode characters
|
||||
"""
|
||||
out = []
|
||||
for s in input_strs:
|
||||
out.append([c for c in s])
|
||||
return out
|
||||
|
||||
|
||||
def test_unicode_char_tokenizer():
|
||||
"""
|
||||
Test UnicodeCharTokenizer
|
||||
"""
|
||||
input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", " ")
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
tokenizer = nlp.UnicodeCharTokenizer()
|
||||
dataset = dataset.map(operations=tokenizer)
|
||||
tokens = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text']).tolist()
|
||||
tokens.append(text)
|
||||
logger.info("The out tokens is : {}".format(tokens))
|
||||
assert split_by_unicode_char(input_strs) == tokens
|
||||
|
||||
|
||||
def test_whitespace_tokenizer():
|
||||
"""
|
||||
Test WhitespaceTokenizer
|
||||
"""
|
||||
whitespace_strs = [["Welcome", "to", "Beijing!"],
|
||||
["北京欢迎您!"],
|
||||
["我喜欢English!"],
|
||||
[""]]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
tokenizer = nlp.WhitespaceTokenizer()
|
||||
dataset = dataset.map(operations=tokenizer)
|
||||
tokens = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text']).tolist()
|
||||
tokens.append(text)
|
||||
logger.info("The out tokens is : {}".format(tokens))
|
||||
assert whitespace_strs == tokens
|
||||
|
||||
|
||||
def test_unicode_script_tokenizer():
|
||||
"""
|
||||
Test UnicodeScriptTokenizer when para keep_whitespace=False
|
||||
"""
|
||||
unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
|
||||
["北京欢迎您", "!"],
|
||||
["我喜欢", "English", "!"],
|
||||
[""]]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=False)
|
||||
dataset = dataset.map(operations=tokenizer)
|
||||
|
||||
tokens = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text']).tolist()
|
||||
tokens.append(text)
|
||||
logger.info("The out tokens is : {}".format(tokens))
|
||||
assert unicode_script_strs == tokens
|
||||
|
||||
|
||||
def test_unicode_script_tokenizer2():
|
||||
"""
|
||||
Test UnicodeScriptTokenizer when para keep_whitespace=True
|
||||
"""
|
||||
unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
|
||||
["北京欢迎您", "!"],
|
||||
["我喜欢", "English", "!"],
|
||||
[" "]]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=True)
|
||||
dataset = dataset.map(operations=tokenizer)
|
||||
tokens = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text']).tolist()
|
||||
tokens.append(text)
|
||||
logger.info("The out tokens is :", tokens)
|
||||
assert unicode_script_strs2 == tokens
|
||||
|
||||
|
||||
def test_case_fold():
|
||||
"""
|
||||
Test CaseFold
|
||||
"""
|
||||
expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
op = nlp.CaseFold()
|
||||
dataset = dataset.map(operations=op)
|
||||
|
||||
lower_strs = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text']).tolist()
|
||||
lower_strs.append(text)
|
||||
assert lower_strs == expect_strs
|
||||
|
||||
|
||||
def test_normalize_utf8():
    """
    Test NormalizeUTF8
    """

    def normalize(normalize_form):
        dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False)
        normalize_op = nlp.NormalizeUTF8(normalize_form=normalize_form)
        dataset = dataset.map(operations=normalize_op)
        out_bytes = []
        out_texts = []
        for i in dataset.create_dict_iterator():
            out_bytes.append(i['text'])
            out_texts.append(nlp.to_str(i['text']).tolist())
        logger.info("The output bytes are: {}".format(out_bytes))
        logger.info("The output texts are: {}".format(out_texts))
        return out_bytes

    expect_normalize_data = [
        # NFC
        [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'],
        # NFKC
        [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'fi', b'25', b'\xe1\xb9\xa9'],
        # NFD
        [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'],
        # NFKD
        [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'fi', b'25', b's\xcc\xa3\xcc\x87']
    ]
    assert normalize(nlp.utils.NormalizeForm.NFC) == expect_normalize_data[0]
    assert normalize(nlp.utils.NormalizeForm.NFKC) == expect_normalize_data[1]
    assert normalize(nlp.utils.NormalizeForm.NFD) == expect_normalize_data[2]
    assert normalize(nlp.utils.NormalizeForm.NFKD) == expect_normalize_data[3]


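# The expected byte sequences above can be cross-checked with Python's
# unicodedata module, which implements the same Unicode normalization forms;
# this reference helper is an assumption for illustration, not part of the
# test's data pipeline.
def normalize_reference(text, form):
    # form is one of "NFC", "NFKC", "NFD", "NFKD", e.g.
    # normalize_reference("ṩ", "NFD") == b's\xcc\xa3\xcc\x87'
    import unicodedata
    return unicodedata.normalize(form, text).encode('utf-8')

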
def test_regex_replace():
    """
    Test RegexReplace
    """

    def regex_replace(first, last, expect_str, pattern, replace):
        dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        replace_op = nlp.RegexReplace(pattern, replace)
        dataset = dataset.map(operations=replace_op)
        out_text = []
        for i in dataset.create_dict_iterator():
            text = nlp.to_str(i['text']).tolist()
            out_text.append(text)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))
        assert expect_str == out_text

    regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_')
    regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "")
    regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "")
    regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "")


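# The patterns above use ICU Unicode property classes (\p{...}), which the
# built-in re module does not understand; the third-party "regex" package
# does, so a rough cross-check is possible when it is installed. This helper
# is an illustration and is not what nlp.RegexReplace uses internally.
def regex_replace_reference(text, pattern, replace):
    import regex  # third-party: pip install regex
    # e.g. regex_replace_reference("Hello World", r"\p{Ll}", "_") == "H____ W____"
    return regex.sub(pattern, replace, text)

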
def test_regex_tokenizer():
    """
    Test RegexTokenizer
    """

    def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern):
        dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        tokenizer_op = nlp.RegexTokenizer(delim_pattern, keep_delim_pattern)
        dataset = dataset.map(operations=tokenizer_op)
        out_text = []
        count = 0
        for i in dataset.create_dict_iterator():
            text = nlp.to_str(i['text']).tolist()
            np.testing.assert_array_equal(text, expect_str[count])
            count += 1
            out_text.append(text)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))

    regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "")
    regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+")
    regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}")
    regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
    regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "")
    regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "")


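# Rough plain-Python reference for the delim/keep_delim behaviour exercised
# above, again assuming the third-party "regex" package is available: split on
# delim_pattern, and keep a delimiter as its own token only when it fully
# matches keep_delim_pattern. Illustration only, not the op's implementation,
# and it assumes delim_pattern contains no capturing groups of its own.
def regex_tokenize_reference(text, delim_pattern, keep_delim_pattern):
    import regex  # third-party: pip install regex
    pieces = regex.split("({})".format(delim_pattern), text)
    out = []
    for idx, piece in enumerate(pieces):
        if piece == "":
            continue
        is_delim = (idx % 2 == 1)
        if not is_delim or (keep_delim_pattern and regex.fullmatch(keep_delim_pattern, piece)):
            out.append(piece)
    # e.g. regex_tokenize_reference("Welcome to Shenzhen!", r"\s+", r"\s+")
    #      == ['Welcome', ' ', 'to', ' ', 'Shenzhen!']
    return out

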
if __name__ == '__main__':
    test_unicode_char_tokenizer()
    test_whitespace_tokenizer()
    test_unicode_script_tokenizer()
    test_unicode_script_tokenizer2()
    test_case_fold()
    test_normalize_utf8()
    test_regex_replace()
    test_regex_tokenizer()
@@ -1,113 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing WordpieceTokenizer op in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as nlp

WORDPIECE_TOKENIZER_FILE = "../data/dataset/testTokenizerData/wordpiece_tokenizer.txt"

vocab_english = [
    "book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"
]

vocab_chinese = [
    "我", '最', '喜', '欢', '的', '书', '是', '霍', '乱', '时', '期', '爱', '情'
]

vocab_mix = vocab_chinese + vocab_english

test_paras = [
    dict(
        first=1,
        last=10,
        expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
                    ['era'], ['[UNK]']],
        vocab_list=vocab_english
    ),
    dict(
        first=1,
        last=10,
        expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
                    ['era'], ['what']],
        vocab_list=vocab_english,
        unknown_token=""
    ),
    dict(
        first=1,
        last=10,
        expect_str=[['my'], ['[UNK]'], ['book'], ['is'], ['love'], ['[UNK]'], ['the'], ['[UNK]'], ['era'], ['[UNK]']],
        vocab_list=vocab_english,
        max_bytes_per_token=4
    ),
    dict(
        first=11,
        last=25,
        expect_str=[['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
                    ['[UNK]']],
        vocab_list=vocab_chinese,
    ),
    dict(
        first=25,
        last=25,
        expect_str=[['您']],
        vocab_list=vocab_chinese,
        unknown_token=""
    ),
    dict(
        first=1,
        last=25,
        expect_str=[
            ['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], ['era'],
            ['[UNK]'],
            ['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
            ['[UNK]']],
        vocab_list=vocab_mix,
    ),
]


def check_wordpiece_tokenizer(first, last, expect_str, vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
    dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = nlp.Vocab.from_list(vocab_list)
    tokenizer_op = nlp.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token,
                                          max_bytes_per_token=max_bytes_per_token)
    dataset = dataset.map(operations=tokenizer_op)
    count = 0
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text'])
        logger.info("Out: {}".format(text))
        logger.info("Exp: {}".format(expect_str[count]))
        np.testing.assert_array_equal(text, expect_str[count])
        count = count + 1


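# A compact greedy longest-match-first sketch of the WordPiece scheme the op
# exercises, for a single pre-split token; written to mirror the expectations
# in test_paras (including unknown_token="" meaning "emit the raw token"),
# as an illustration rather than the op's actual C++ implementation.
def wordpiece_reference(token, vocab, unknown_token='[UNK]', max_bytes_per_token=100, suffix_indicator='##'):
    if len(token.encode('utf-8')) > max_bytes_per_token:
        return [unknown_token] if unknown_token else [token]
    pieces = []
    start = 0
    while start < len(token):
        end = len(token)
        match = None
        while start < end:
            piece = token[start:end]
            if start > 0:
                piece = suffix_indicator + piece
            if piece in vocab:
                match = piece
                break
            end -= 1
        if match is None:
            return [unknown_token] if unknown_token else [token]
        pieces.append(match)
        start = end
    return pieces
# e.g. wordpiece_reference("favorite", set(vocab_english)) == ['favor', '##ite']

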
def test_wordpiece_tokenizer():
    """
    Test WordpieceTokenizer
    """
    for paras in test_paras:
        check_wordpiece_tokenizer(**paras)


if __name__ == '__main__':
    test_wordpiece_tokenizer()