forked from mindspore-Ecosystem/mindspore
!13364 Add WordpieceTokenizer and AddDict to Jieba
From: @alexyuyue
Commit: 0cfd8b7c9e
@@ -134,6 +134,15 @@ inline std::vector<std::pair<std::string, std::vector<int32_t>>> ClassIndexCharT
return ret;
}

inline std::vector<std::pair<std::vector<char>, int64_t>> PairStringInt64ToPairCharInt64(
const std::vector<std::pair<std::string, int64_t>> &s) {
std::vector<std::pair<std::vector<char>, int64_t>> ret;
std::transform(s.begin(), s.end(), std::back_inserter(ret), [](auto str) {
return std::pair<std::vector<char>, int64_t>(std::vector<char>(str.first.begin(), str.first.end()), str.second);
});
return ret;
}

template <class T>
inline std::map<std::vector<char>, T> PadInfoStringToChar(const std::map<std::string, T> &s_pad_info) {
std::map<std::vector<char>, T> ret;
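The helper added above converts std::string-based word-frequency pairs into std::vector<char>-based pairs so that std::string itself does not cross the library boundary (a common way to sidestep C++ ABI mismatches between compilers/STLs). A minimal, self-contained sketch of the same conversion idea, with an invented function name and sample data for illustration only:

// Standalone sketch of the string <-> char-vector conversion used by helpers such as
// PairStringInt64ToPairCharInt64 above (illustrative; not the real MindSpore header).
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <string>
#include <utility>
#include <vector>

// Convert each {std::string, int64_t} pair into a {std::vector<char>, int64_t} pair.
std::vector<std::pair<std::vector<char>, int64_t>> ToCharPairs(
    const std::vector<std::pair<std::string, int64_t>> &s) {
  std::vector<std::pair<std::vector<char>, int64_t>> ret;
  std::transform(s.begin(), s.end(), std::back_inserter(ret), [](const auto &p) {
    return std::pair<std::vector<char>, int64_t>(std::vector<char>(p.first.begin(), p.first.end()), p.second);
  });
  return ret;
}

int main() {
  std::vector<std::pair<std::string, int64_t>> user_dict = {{"word1", 10}, {"word2", 0}};
  for (const auto &p : ToCharPairs(user_dict)) {
    // Rebuild a std::string only on the caller's side of the boundary.
    std::cout << std::string(p.first.begin(), p.first.end()) << " " << p.second << "\n";
  }
  return 0;
}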
@@ -232,12 +232,17 @@ PYBIND_REGISTER(UnicodeCharTokenizerOperation, 1, ([](const py::module *m) {
}));
}));

// TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++
PYBIND_REGISTER(WordpieceTokenizerOp, 1, ([](const py::module *m) {
(void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
*m, "WordpieceTokenizerOp")
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &,
const bool &>());
PYBIND_REGISTER(WordpieceTokenizerOperation, 1, ([](const py::module *m) {
(void)py::class_<text::WordpieceTokenizerOperation, TensorOperation,
std::shared_ptr<text::WordpieceTokenizerOperation>>(*m,
"WordpieceTokenizerOperation")
.def(py::init([](const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
int32_t max_bytes_per_token, const std::string &unknown_token, bool with_offsets) {
auto wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizerOperation>(
vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets);
THROW_IF_ERROR(wordpiece_tokenizer->ValidateParams());
return wordpiece_tokenizer;
}));
}));

PYBIND_REGISTER(JiebaMode, 0, ([](const py::module *m) {
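The new binding uses pybind11's py::init factory-lambda overload so that ValidateParams runs (and can throw) while the Python object is being constructed. A toy, self-contained sketch of that pattern; the class, module name, and error handling below are invented for illustration and are not MindSpore code:

// Toy pybind11 module showing the "construct shared_ptr, validate, then return" pattern
// mirrored by the WordpieceTokenizerOperation binding above (all names here are made up).
#include <memory>
#include <stdexcept>
#include <string>

#include <pybind11/pybind11.h>

namespace py = pybind11;

struct ToyTokenizerOperation {
  ToyTokenizerOperation(std::string unknown_token, int max_bytes)
      : unknown_token_(std::move(unknown_token)), max_bytes_(max_bytes) {}
  bool ValidateParams() const { return max_bytes_ >= 0; }  // false when parameters are unusable
  std::string unknown_token_;
  int max_bytes_;
};

PYBIND11_MODULE(toy_text, m) {
  (void)py::class_<ToyTokenizerOperation, std::shared_ptr<ToyTokenizerOperation>>(m, "ToyTokenizerOperation")
      .def(py::init([](const std::string &unknown_token, int max_bytes) {
        auto op = std::make_shared<ToyTokenizerOperation>(unknown_token, max_bytes);
        // Fail fast at construction time, analogous to THROW_IF_ERROR(...->ValidateParams()).
        if (!op->ValidateParams()) {
          throw std::invalid_argument("max_bytes must be >= 0, got " + std::to_string(max_bytes));
        }
        return op;
      }));
}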
@@ -15,6 +15,8 @@
*/

#include <unistd.h>
#include <fstream>
#include <regex>

#include "minddata/dataset/include/text.h"

@@ -131,7 +133,7 @@ std::shared_ptr<TensorOperation> JiebaTokenizer::Parse() {
return jieba_tokenizer;
}

Status JiebaTokenizer::AddWord(const std::string &word, int64_t freq) {
Status JiebaTokenizer::AddWordChar(const std::vector<char> &word, int64_t freq) {
if (word.empty()) {
std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided.";
MS_LOG(ERROR) << err_msg;

@@ -142,7 +144,59 @@ Status JiebaTokenizer::AddWord(const std::string &word, int64_t freq) {
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
data_->words_list_.emplace_back(word, freq);
data_->words_list_.emplace_back(CharToString(word), freq);
return Status::OK();
}

Status JiebaTokenizer::AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict) {
for (auto &word_freq_pair : user_dict) {
RETURN_IF_NOT_OK(AddWordChar(word_freq_pair.first, word_freq_pair.second));
}
return Status::OK();
}

Status JiebaTokenizer::AddDictChar(const std::vector<char> &file_path) {
std::vector<std::pair<std::string, int64_t>> user_dict;
RETURN_IF_NOT_OK(ParserFile(CharToString(file_path), &user_dict));
RETURN_IF_NOT_OK(AddDictChar(PairStringInt64ToPairCharInt64(user_dict)));
return Status::OK();
}

Status JiebaTokenizer::ParserFile(const std::string &file_path,
std::vector<std::pair<std::string, int64_t>> *const user_dict) {
std::ifstream ifs(file_path);
if (!ifs) {
std::string err_msg = "JiebaTokenizer : Fail to load dictionary from the input file, check the file path.";
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}

std::string line;
while (std::getline(ifs, line)) {
if (line.empty()) {
continue;
}
std::regex regex("^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$");
std::smatch tokens;
std::regex_match(line, tokens, regex);
if (std::regex_match(line, tokens, regex)) {
if (tokens.size() == 2) {
user_dict->emplace_back(tokens.str(1), 0);
} else if (tokens.size() == 3) {
user_dict->emplace_back(tokens.str(1), strtoll(tokens.str(2).c_str(), NULL, 0));
} else {
continue;
}
} else {
continue;
}
}
MS_LOG(INFO) << "JiebaTokenizer::AddDict: The size of user input dictionary is: " << user_dict->size();
MS_LOG(INFO) << "Valid rows in input dictionary (Maximum of first 10 rows are shown.):";
for (std::size_t i = 0; i != user_dict->size(); ++i) {
if (i >= 10) break;
MS_LOG(INFO) << user_dict->at(i).first << " " << user_dict->at(i).second;
}
return Status::OK();
}
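To see what the dictionary-file pattern ^\s*([^\s*]+?)\s*([0-9]+)?\s*$ used in ParserFile accepts, here is a small standalone C++ sketch that applies the same expression to a few sample lines; the sample data and the main() wrapper are illustrative only:

// Demo of the "<word>" or "<word> <freq>" line format accepted by the parser above;
// leading/trailing whitespace is ignored and rows that do not match are skipped.
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
  const std::regex pattern("^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$");
  const std::vector<std::string> lines = {"word1 10", "  word2  ", "bad row with spaces", ""};
  for (const auto &line : lines) {
    std::smatch tokens;
    if (line.empty() || !std::regex_match(line, tokens, pattern)) {
      std::cout << "skipped: \"" << line << "\"\n";  // invalid rows are silently ignored
      continue;
    }
    // Group 2 (the frequency) is optional; when absent, the frequency defaults to 0.
    int64_t freq = tokens[2].matched ? strtoll(tokens.str(2).c_str(), nullptr, 0) : 0;
    std::cout << tokens.str(1) << " -> " << freq << "\n";
  }
  return 0;
}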
@@ -310,6 +364,32 @@ std::shared_ptr<TensorOperation> UnicodeCharTokenizer::Parse() {
return std::make_shared<UnicodeCharTokenizerOperation>(data_->with_offsets_);
}

// WordpieceTokenizer
struct WordpieceTokenizer::Data {
Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
const std::vector<char> &unknown_token, bool with_offsets)
: vocab_(vocab),
suffix_indicator_(CharToString(suffix_indicator)),
max_bytes_per_token_(max_bytes_per_token),
unknown_token_(CharToString(unknown_token)),
with_offsets_(with_offsets) {}
std::shared_ptr<Vocab> vocab_;
std::string suffix_indicator_;
int32_t max_bytes_per_token_;
std::string unknown_token_;
bool with_offsets_;
};

WordpieceTokenizer::WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
int32_t max_bytes_per_token, const std::vector<char> &unknown_token,
bool with_offsets)
: data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets)) {}

std::shared_ptr<TensorOperation> WordpieceTokenizer::Parse() {
return std::make_shared<WordpieceTokenizerOperation>(
data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->with_offsets_);
}

#ifndef _WIN32
// UnicodeScriptTokenizer
struct UnicodeScriptTokenizer::Data {
@@ -52,7 +52,7 @@ class BasicTokenizer final : public TensorTransform {
/// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
/// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]',
/// '[MASK]' (default=true).
/// \param[in] with_offsets If or not output offsets of tokens (default=false).
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit BasicTokenizer(bool lower_case = false, bool keep_whitespace = false,
const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true,
bool with_offsets = false);

@@ -88,7 +88,7 @@ class BertTokenizer final : public TensorTransform {
/// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
/// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]',
/// '[MASK]' (default=true).
/// \param[in] with_offsets If or not output offsets of tokens (default=false).
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##",
int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]",
bool lower_case = false, bool keep_whitespace = false,

@@ -145,7 +145,7 @@ class JiebaTokenizer final : public TensorTransform {
/// - JiebaMode.kMP, tokenize with MPSegment algorithm.
/// - JiebaMode.kHMM, tokenize with Hiddel Markov Model Segment algorithm.
/// - JiebaMode.kMIX, tokenize with a mix of MPSegment and HMMSegment algorithm.
/// \param[in] with_offsets If or not output offsets of tokens (default=false).
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
const JiebaMode &mode = JiebaMode::kMix, bool with_offsets = false)
: JiebaTokenizer(StringToChar(hmm_path), StringToChar(mp_path), mode, with_offsets) {}

@@ -156,7 +156,24 @@ class JiebaTokenizer final : public TensorTransform {
/// \brief Destructor
~JiebaTokenizer() = default;

Status AddWord(const std::string &word, int64_t freq = 0);
/// \brief Add user defined word to JiebaTokenizer's dictionary.
/// \param[in] word The word to be added to the JiebaTokenizer instance.
/// The added word will not be written into the built-in dictionary on disk.
/// \param[in] freq The frequency of the word to be added. The higher the frequency,
/// the better chance the word will be tokenized (default=None, use default frequency).
Status AddWord(const std::string &word, int64_t freq = 0) { return AddWordChar(StringToChar(word), freq); }

/// \brief Add user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary.
/// \param[in] user_dict Vector of word-freq pairs to be added to JiebaTokenizer's dictionary.
Status AddDict(const std::vector<std::pair<std::string, int64_t>> &user_dict) {
return AddDictChar(PairStringInt64ToPairCharInt64(user_dict));
}

/// \brief Add user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary from a file.
/// Only valid word-freq pairs in user provided file will be added into the dictionary.
/// Rows containing invalid input will be ignored, no error nor warning Status is returned.
/// \param[in] file_path Path to the dictionary which includes user defined word-freq pairs.
Status AddDict(const std::string &file_path) { return AddDictChar(StringToChar(file_path)); }

protected:
/// \brief Function to convert TensorTransform object into a TensorOperation object.

@@ -164,6 +181,20 @@ class JiebaTokenizer final : public TensorTransform {
std::shared_ptr<TensorOperation> Parse() override;

private:
/// \brief Parser user defined word by file.
/// \param[in] file_path Path to the user defined file.
/// \param[in] user_dict Vector of word-freq pairs extracted from the user provided file.
Status ParserFile(const std::string &file_path, std::vector<std::pair<std::string, int64_t>> *const user_dict);

/// \brief Used to translate all API string to vector of char and back
Status AddWordChar(const std::vector<char> &word, int64_t freq = 0);

/// \brief Used to translate all API string to vector of char and back
Status AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict);

/// \brief Used to translate all API string to vector of char and back
Status AddDictChar(const std::vector<char> &file_path);

struct Data;
std::shared_ptr<Data> data_;
};
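A short usage sketch of the new AddWord/AddDict public API declared above, condensed from the C++ pipeline tests further down in this diff; the include path and namespaces are taken from that test code, and the file paths are placeholders:

// Sketch: extend a JiebaTokenizer's dictionary via the new AddWord/AddDict overloads
// (paths are placeholders; includes/namespaces assumed from the tests below).
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "minddata/dataset/include/text.h"

using namespace mindspore::dataset;  // the pipeline tests below use the same unqualified names

Status AddUserWords(const std::shared_ptr<text::JiebaTokenizer> &jieba_tokenizer) {
  // Single word; the optional frequency biases the segmenter towards keeping it as one token.
  Status rc = jieba_tokenizer->AddWord("江大桥", 20000);
  if (rc != Status::OK()) {
    return rc;
  }

  // Whole dictionary at once, as in-memory word-freq pairs ...
  std::vector<std::pair<std::string, int64_t>> user_dict = {{"男默女泪", 0}, {"江大桥", 20000}};
  rc = jieba_tokenizer->AddDict(user_dict);
  if (rc != Status::OK()) {
    return rc;
  }

  // ... or from a "<word> [freq]" text file; rows that do not parse are silently skipped.
  return jieba_tokenizer->AddDict("/path/to/user_dict.txt");
}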
@@ -292,7 +323,7 @@ class RegexTokenizer final : public TensorTransform {
/// \param[in] keep_delim_pattern The string matched by 'delim_pattern' can be kept as a token if it can be
/// matched by 'keep_delim_pattern'. The default value is an empty string ("")
/// which means that delimiters will not be kept as an output token (default="").
/// \param[in] with_offsets If or not output offsets of tokens (default=false).
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "", bool with_offsets = false)
: RegexTokenizer(StringToChar(delim_pattern), StringToChar(keep_delim_pattern), with_offsets) {}

@@ -416,7 +447,7 @@ class TruncateSequencePair final : public TensorTransform {
class UnicodeCharTokenizer final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] with_offsets If or not output offsets of tokens (default=false).
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit UnicodeCharTokenizer(bool with_offsets = false);

/// \brief Destructor

@@ -432,13 +463,45 @@ class UnicodeCharTokenizer final : public TensorTransform {
std::shared_ptr<Data> data_;
};

/// \brief Tokenize scalar token or 1-D tokens to 1-D subword tokens.
class WordpieceTokenizer final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] vocab A Vocab object.
/// \param[in] suffix_indicator Used to show that the subword is the last part of a word (default='##').
/// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
/// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty
/// string, else return the string specified (default='[UNK]').
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##",
int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]",
bool with_offsets = false)
: WordpieceTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token),
with_offsets) {}

explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool with_offsets);

/// \brief Destructor
~WordpieceTokenizer() = default;

protected:
/// \brief Function to convert TensorTransform object into a TensorOperation object.
/// \return Shared pointer to TensorOperation object.
std::shared_ptr<TensorOperation> Parse() override;

private:
struct Data;
std::shared_ptr<Data> data_;
};

#ifndef _WIN32
/// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
class UnicodeScriptTokenizer final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] keep_whitespace If or not emit whitespace tokens (default=false).
/// \param[in] with_offsets If or not output offsets of tokens (default=false).
/// \param[in] keep_whitespace Whether or not emit whitespace tokens (default=false).
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit UnicodeScriptTokenizer(bool keep_whitespace = false, bool with_offsets = false);

/// \brief Destructor

@@ -458,7 +521,7 @@ class UnicodeScriptTokenizer final : public TensorTransform {
class WhitespaceTokenizer final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] with_offsets If or not output offsets of tokens (default=false).
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit WhitespaceTokenizer(bool with_offsets = false);

/// \brief Destructor
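A brief usage sketch of the new C++ WordpieceTokenizer declared above, condensed from TestWordpieceTokenizerSuccess1 later in this diff; the include paths, namespaces, and data file are assumed from that test code rather than verified against a build:

// Sketch: run the new WordpieceTokenizer over a TextFile dataset, as in the pipeline tests below.
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "minddata/dataset/include/datasets.h"  // path assumed from the C++ API test sources
#include "minddata/dataset/include/text.h"

using namespace mindspore::dataset;  // matches the using-directive in the tests

void RunWordpieceExample(const std::string &data_file) {
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);

  // Small in-memory vocab; "##" marks subword continuations.
  std::vector<std::string> vocab_list = {"book", "cholera", "era", "favor", "##ite", "my",
                                         "is", "love", "dur", "##ing", "the"};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_list, {}, true, &vocab);
  if (s != Status::OK()) {
    return;
  }

  // Defaults: suffix_indicator="##", max_bytes_per_token=100, unknown_token="[UNK]", with_offsets=false.
  std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab);
  ds = ds->Map({wordpiece_tokenizer}, {"text"});

  // Iterate; "favorite" comes back as {"favor", "##ite"}, out-of-vocab words as {"[UNK]"}.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  std::unordered_map<std::string, mindspore::MSTensor> row;
  iter->GetNextRow(&row);
  while (row.size() != 0) {
    iter->GetNextRow(&row);
  }
  iter->Stop();
}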
@@ -36,6 +36,7 @@
#include "minddata/dataset/text/kernels/to_number_op.h"
#include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h"
#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
#include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h"
#ifndef _WIN32
#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"

@@ -396,6 +397,39 @@ std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() {
return tensor_op;
}

// WordpieceTokenizerOperation
WordpieceTokenizerOperation::WordpieceTokenizerOperation(const std::shared_ptr<Vocab> &vocab,
const std::string &suffix_indicator,
int32_t max_bytes_per_token, const std::string &unknown_token,
bool with_offsets)
: vocab_(vocab),
suffix_indicator_(suffix_indicator),
max_bytes_per_token_(max_bytes_per_token),
unknown_token_(unknown_token),
with_offsets_(with_offsets) {}

Status WordpieceTokenizerOperation::ValidateParams() {
if (vocab_ == nullptr) {
std::string err_msg = "WordpieceTokenizer: vocab object type is incorrect or null.";
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
if (max_bytes_per_token_ < 0) {
std::string err_msg =
"WordpieceTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " +
std::to_string(max_bytes_per_token_);
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
return Status::OK();
}

std::shared_ptr<TensorOp> WordpieceTokenizerOperation::Build() {
std::shared_ptr<WordpieceTokenizerOp> tensor_op = std::make_shared<WordpieceTokenizerOp>(
vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, with_offsets_);
return tensor_op;
}

#ifndef _WIN32
// UnicodeScriptTokenizerOperation
UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets)
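For orientation, the layering used here is: the public text::WordpieceTokenizer (a TensorTransform) parses into a WordpieceTokenizerOperation (IR node), which validates its parameters and then builds the runtime WordpieceTokenizerOp kernel. A toy, self-contained sketch of that three-layer pattern; every name below is invented for illustration and only the control flow mirrors the code above:

// Toy three-layer sketch: Transform -> IR Operation (ValidateParams/Build) -> runtime Op.
#include <iostream>
#include <memory>

struct ToyOp {  // runtime kernel, produced by Build()
  explicit ToyOp(int max_bytes) : max_bytes_(max_bytes) {}
  int max_bytes_;
};

struct ToyOperation {  // IR node: holds parameters, validates them, then builds the kernel
  explicit ToyOperation(int max_bytes) : max_bytes_(max_bytes) {}
  bool ValidateParams() const { return max_bytes_ >= 0; }
  std::shared_ptr<ToyOp> Build() const { return std::make_shared<ToyOp>(max_bytes_); }
  int max_bytes_;
};

struct ToyTransform {  // public API object; Parse() runs when the pipeline is compiled
  explicit ToyTransform(int max_bytes) : max_bytes_(max_bytes) {}
  std::shared_ptr<ToyOperation> Parse() const { return std::make_shared<ToyOperation>(max_bytes_); }
  int max_bytes_;
};

int main() {
  ToyTransform transform(-1);             // user-facing object; nothing is validated yet
  auto op = transform.Parse();            // IR node
  if (!op->ValidateParams()) {            // validation happens at tree-build time, which is why the
    std::cout << "invalid parameters\n";  // Fail1/Fail2 tests below see a null iterator from CreateIterator()
    return 1;
  }
  auto kernel = op->Build();              // runtime op actually used by the pipeline
  std::cout << "built kernel with max_bytes=" << kernel->max_bytes_ << "\n";
  return 0;
}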
@@ -49,6 +49,7 @@ constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair";
constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer";
constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer";
constexpr char kWordpieceTokenizerOperation[] = "WordpieceTokenizer";

/* ####################################### Derived TensorOperation classes ################################# */

@@ -318,6 +319,28 @@ class UnicodeCharTokenizerOperation : public TensorOperation {
bool with_offsets_;
};

class WordpieceTokenizerOperation : public TensorOperation {
public:
explicit WordpieceTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
int32_t max_bytes_per_token, const std::string &unknown_token,
bool with_offsets);

~WordpieceTokenizerOperation() = default;

std::shared_ptr<TensorOp> Build() override;

Status ValidateParams() override;

std::string Name() const override { return kWordpieceTokenizerOperation; }

private:
std::shared_ptr<Vocab> vocab_;
std::string suffix_indicator_;
int32_t max_bytes_per_token_;
std::string unknown_token_;
bool with_offsets_;
};

#ifndef _WIN32
class UnicodeScriptTokenizerOperation : public TensorOperation {
public:
@@ -2207,7 +2207,7 @@ def _pyfunc_worker_init(pyfunc_list):
# All exceptions will be raised to main processes
def _pyfunc_worker_exec(index, *args):
"""
Internal function for call certain pyfunc in python process.
Internal function for call certain pyfunc in Python process.
"""
# Some threads in multiprocess.pool can't process sigint signal,
# and will occur hang problem, so ctrl+c will pass to parent process.

@@ -2352,7 +2352,7 @@ class MapDataset(Dataset):

# Pass #1, look for Python callables and build list
for op in self.operations:
# our c transforms is now callable and should not be run in python multithreading
# our c transforms is now callable and should not be run in Python multithreading
if callable(op) and str(op).find("c_transform") < 0:
callable_list.append(op)

@@ -2373,7 +2373,7 @@ class MapDataset(Dataset):
with _LOCK:
_OP_PROCESS.update(process_id)
for op in self.operations:
# our c transforms is now callable and should not be run in python multithreading
# our c transforms is now callable and should not be run in Python multithreading
if callable(op) and str(op).find("c_transform") < 0:
# Wrap Python callable into _PythonCallable
iter_specific_operations.append(_PythonCallable(op, idx, self.process_pool))
@@ -610,7 +610,7 @@ class SubsetSampler(BuiltinSampler):
Samples the elements from a sequence of indices.

Args:
indices (Any iterable python object but string): A sequence of indices.
indices (Any iterable Python object but string): A sequence of indices.
num_samples (int, optional): Number of elements to sample (default=None, all elements).

Examples:
@@ -102,7 +102,7 @@ class JiebaTokenizer(TextTensorOperation):
- JiebaMode.MP, tokenize with MPSegment algorithm.
- JiebaMode.HMM, tokenize with Hiddel Markov Model Segment algorithm.
- JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm.
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

Examples:
>>> from mindspore.dataset.text import JiebaMode

@@ -186,6 +186,9 @@ class JiebaTokenizer(TextTensorOperation):
word2 None
word3 freq3

Only valid word-freq pairs in user provided file will be added into the dictionary.
Rows containing invalid input will be ignored. No error nor warning Status is returned.

Examples:
>>> from mindspore.dataset.text import JiebaMode
>>> jieba_hmm_file = "/path/to/jieba/hmm/file"

@@ -221,16 +224,16 @@ class JiebaTokenizer(TextTensorOperation):
"user dict file {} is not exist.".format(file_path))
real_file_path = os.path.realpath(file_path)
file_dict = open(real_file_path)
data_re = re.compile('^(.+?)( [0-9]+)?$', re.U)
data_re = re.compile('^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$', re.U)
words_list = []
for item in file_dict:
data = item.strip()
if not isinstance(data, str):
data = self.__decode(data)
words = data_re.match(data).groups()
if len(words) != 2:
raise ValueError(
"user dict file {} format error.".format(real_file_path))
tmp = data_re.match(data)
if not tmp:
continue
words = tmp.groups()
words_list.append(words)
file_dict.close()
return words_list

@@ -452,7 +455,7 @@ class UnicodeCharTokenizer(TextTensorOperation):
Tokenize a scalar tensor of UTF-8 string to Unicode characters.

Args:
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

Examples:
>>> # If with_offsets=False, default output one column {["text", dtype=str]}

@@ -474,8 +477,7 @@ class UnicodeCharTokenizer(TextTensorOperation):
return cde.UnicodeCharTokenizerOperation(self.with_offsets)


# TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++
class WordpieceTokenizer(cde.WordpieceTokenizerOp):
class WordpieceTokenizer(TextTensorOperation):
"""
Tokenize scalar token or 1-D tokens to 1-D subword tokens.

@@ -485,7 +487,7 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is empty string,
return the token directly, else return 'unknown_token' (default='[UNK]').
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

Examples:
>>> vocab_list = ["book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"]

@@ -511,8 +513,10 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
self.max_bytes_per_token = max_bytes_per_token
self.unknown_token = unknown_token
self.with_offsets = with_offsets
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token,
self.unknown_token, self.with_offsets)

def parse(self):
return cde.WordpieceTokenizerOperation(self.vocab, self.suffix_indicator, self.max_bytes_per_token,
self.unknown_token, self.with_offsets)


class PythonTokenizer:

@@ -572,7 +576,7 @@ if platform.system().lower() != 'windows':
only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
preserve_unused_token (bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

Examples:
>>> from mindspore.dataset.text import NormalizeForm

@@ -638,7 +642,7 @@ if platform.system().lower() != 'windows':
only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
preserve_unused_token (bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

Examples:
>>> from mindspore.dataset.text import NormalizeForm

@@ -793,7 +797,7 @@ if platform.system().lower() != 'windows':
keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
if it can be matched by 'keep_delim_pattern'. The default value is an empty str ('')
which means that delimiters will not be kept as an output token (default='').
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

Examples:
>>> # If with_offsets=False, default output one column {["text", dtype=str]}

@@ -829,8 +833,8 @@ if platform.system().lower() != 'windows':
UnicodeScriptTokenizer is not supported on Windows platform yet.

Args:
keep_whitespace (bool, optional): If or not emit whitespace tokens (default=False).
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
keep_whitespace (bool, optional): Whether or not emit whitespace tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

Examples:
>>> # If with_offsets=False, default output one column {["text", dtype=str]}

@@ -865,7 +869,7 @@ if platform.system().lower() != 'windows':
WhitespaceTokenizer is not supported on Windows platform yet.

Args:
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

Examples:
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
@@ -1048,7 +1048,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) {
EXPECT_NE(jieba_tokenizer, nullptr);

// Add word with freq not provided (default 0)
jieba_tokenizer->AddWord("男默女泪");
ASSERT_OK(jieba_tokenizer->AddWord("男默女泪"));

// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});

@@ -1100,7 +1100,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) {
EXPECT_NE(jieba_tokenizer, nullptr);

// Add word with freq is set explicitly to 0
jieba_tokenizer->AddWord("男默女泪", 0);
ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 0));

// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});

@@ -1152,7 +1152,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) {
EXPECT_NE(jieba_tokenizer, nullptr);

// Add word with freq 10
jieba_tokenizer->AddWord("男默女泪", 10);
ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 10));

// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});

@@ -1204,7 +1204,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) {
EXPECT_NE(jieba_tokenizer, nullptr);

// Add word with freq 20000
jieba_tokenizer->AddWord("江大桥", 20000);
ASSERT_OK(jieba_tokenizer->AddWord("江大桥", 20000));

// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});

@@ -1262,6 +1262,115 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) {
EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK());
}

TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDict) {
// Testing AddDict of JiebaTokenizer when the input is a vector of word-freq pair.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDict.";

// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
std::shared_ptr<Dataset> ds = TextFile({data_file});
EXPECT_NE(ds, nullptr);

// Create jieba_tokenizer operation on ds
std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
EXPECT_NE(jieba_tokenizer, nullptr);

// Add word with freq 20000
std::vector<std::pair<std::string, int64_t>> user_dict = {{"江大桥", 20000}};
ASSERT_OK(jieba_tokenizer->AddDict(user_dict));

// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);

// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);

// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);

std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"};
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));

uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["text"];
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
iter->GetNextRow(&row);
i++;
}

EXPECT_EQ(i, 1);

// Manually terminate the pipeline
iter->Stop();
}

TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDictFromFile) {
// Testing AddDict of JiebaTokenizer when the input is a path to dict.
// Test error scenario for AddDict: invalid path
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDictFromFile.";

// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
std::shared_ptr<Dataset> ds = TextFile({data_file});
EXPECT_NE(ds, nullptr);

// Create jieba_tokenizer operation on ds
std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
EXPECT_NE(jieba_tokenizer, nullptr);

// Load dict from txt file
std::string user_dict_path = datasets_root_path_ + "/testJiebaDataset/user_dict.txt";
std::string invalid_path = datasets_root_path_ + "/testJiebaDataset/invalid_path.txt";
EXPECT_ERROR(jieba_tokenizer->AddDict(invalid_path));
ASSERT_OK(jieba_tokenizer->AddDict(user_dict_path));

// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);

// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);

// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);

std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));

uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["text"];
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
iter->GetNextRow(&row);
i++;
}

EXPECT_EQ(i, 1);

// Manually terminate the pipeline
iter->Stop();
}

TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) {
// Testing the parameter of SlidingWindow interface when the axis is 0.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess.";

@@ -2662,6 +2771,421 @@ TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess1) {
iter->Stop();
}

std::vector<std::string> vocab_english = {"book", "cholera", "era", "favor", "##ite", "my",
"is", "love", "dur", "##ing", "the"};

std::vector<std::string> vocab_chinese = {"我", "最", "喜", "欢", "的", "书", "是", "霍", "乱", "时", "期", "爱", "情"};

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess1.";
// Test WordpieceTokenizer with default parameters on English vocab

// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);

// Create Take operation on ds
ds = ds->Take(10);
EXPECT_NE(ds, nullptr);

// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());

// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab);
EXPECT_NE(wordpiece_tokenizer, nullptr);

// Create Map operation on ds
ds = ds->Map({wordpiece_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);

// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);

// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);

std::vector<std::vector<std::string>> expected = {
{"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};

uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["text"];
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
iter->GetNextRow(&row);
i++;
}

EXPECT_EQ(i, 10);

// Manually terminate the pipeline
iter->Stop();
}

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess2) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess2.";
// Test WordpieceTokenizer with empty unknown_token

// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);

// Create Take operation on ds
ds = ds->Take(10);
EXPECT_NE(ds, nullptr);

// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());

// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer =
std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "", false);
EXPECT_NE(wordpiece_tokenizer, nullptr);

// Create Map operation on ds
ds = ds->Map({wordpiece_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);

// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);

// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);

std::vector<std::vector<std::string>> expected = {
{"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"what"}};

uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["text"];
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
iter->GetNextRow(&row);
i++;
}

EXPECT_EQ(i, 10);

// Manually terminate the pipeline
iter->Stop();
}

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess3) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess3.";
// Test WordpieceTokenizer with non-default max_bytes_per_token

// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);

// Create Take operation on ds
ds = ds->Take(10);
EXPECT_NE(ds, nullptr);

// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());

// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer =
std::make_shared<text::WordpieceTokenizer>(vocab, "##", 4, "[UNK]", false);
EXPECT_NE(wordpiece_tokenizer, nullptr);

// Create Map operation on ds
ds = ds->Map({wordpiece_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);

// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);

// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);

std::vector<std::vector<std::string>> expected = {{"my"}, {"[UNK]"}, {"book"}, {"is"}, {"love"},
{"[UNK]"}, {"the"}, {"[UNK]"}, {"era"}, {"[UNK]"}};

uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["text"];
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
iter->GetNextRow(&row);
i++;
}

EXPECT_EQ(i, 10);

// Manually terminate the pipeline
iter->Stop();
}

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess4) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess4.";
// Test WordpieceTokenizer with default parameters on Chinese vocab

// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);

// Create Skip operation on ds
ds = ds->Skip(10);
EXPECT_NE(ds, nullptr);

// Create Take operation on ds
ds = ds->Take(15);
EXPECT_NE(ds, nullptr);

// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(vocab_chinese, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());

// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer =
std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", false);
EXPECT_NE(wordpiece_tokenizer, nullptr);

// Create Map operation on ds
ds = ds->Map({wordpiece_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);

// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);

// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);

std::vector<std::vector<std::string>> expected = {{"我"}, {"最"}, {"喜"}, {"欢"}, {"的"}, {"书"}, {"是"}, {"霍"},
{"乱"}, {"时"}, {"期"}, {"的"}, {"爱"}, {"情"}, {"[UNK]"}};

uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["text"];
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
iter->GetNextRow(&row);
i++;
}

EXPECT_EQ(i, 15);

// Manually terminate the pipeline
iter->Stop();
}

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess5) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess5.";
// Test WordpieceTokenizer with with_offsets true

// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);

// Create Take operation on ds
ds = ds->Take(10);
EXPECT_NE(ds, nullptr);

// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());

// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer =
std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", true);
EXPECT_NE(wordpiece_tokenizer, nullptr);

// Create Map operation on ds
ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
EXPECT_NE(ds, nullptr);

// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);

// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);

std::vector<std::vector<std::string>> expected = {
{"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};
std::vector<std::vector<uint32_t>> expected_offsets_start = {{0}, {0, 5}, {0}, {0}, {0}, {0, 3}, {0}, {0}, {0}, {0}};
std::vector<std::vector<uint32_t>> expected_offsets_limit = {{2}, {5, 8}, {4}, {2}, {4}, {3, 6}, {3}, {7}, {3}, {4}};

uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["token"];
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
EXPECT_MSTENSOR_EQ(txt, expected_tensor);

auto start = row["offsets_start"];
std::shared_ptr<Tensor> de_expected_start_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], &de_expected_start_tensor));
mindspore::MSTensor expected_start_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_start_tensor));
EXPECT_MSTENSOR_EQ(start, expected_start_tensor);

auto limit = row["offsets_limit"];
std::shared_ptr<Tensor> de_expected_limit_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], &de_expected_limit_tensor));
mindspore::MSTensor expected_limit_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_limit_tensor));
EXPECT_MSTENSOR_EQ(limit, expected_limit_tensor);
iter->GetNextRow(&row);
i++;
}

EXPECT_EQ(i, 10);

// Manually terminate the pipeline
iter->Stop();
}

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess6) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess6.";
// Test WordpieceTokenizer with max_bytes_per_token equals to 0

// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);

// Create Take operation on ds
ds = ds->Take(10);
EXPECT_NE(ds, nullptr);

// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());

// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer =
std::make_shared<text::WordpieceTokenizer>(vocab, "##", 0, "[UNK]", true);
EXPECT_NE(wordpiece_tokenizer, nullptr);

// Create Map operation on ds
ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
EXPECT_NE(ds, nullptr);

// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);

// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);

std::vector<std::vector<std::string>> expected = {{"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"},
{"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}};

uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["token"];
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
iter->GetNextRow(&row);
i++;
}

EXPECT_EQ(i, 10);

// Manually terminate the pipeline
iter->Stop();
}

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail1.";
// Test WordpieceTokenizer with nullptr vocab

// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);

// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(nullptr);
EXPECT_NE(wordpiece_tokenizer, nullptr);

// Create a Map operation on ds
ds = ds->Map({wordpiece_tokenizer});
EXPECT_NE(ds, nullptr);

std::shared_ptr<Iterator> iter = ds->CreateIterator();
// Expect failure: invalid WordpieceTokenizer input with nullptr vocab
EXPECT_EQ(iter, nullptr);
}

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail2) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail2.";
// Test WordpieceTokenizer with negative max_bytes_per_token

// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);

// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());

// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab, "##", -1);
EXPECT_NE(wordpiece_tokenizer, nullptr);

// Create a Map operation on ds
ds = ds->Map({wordpiece_tokenizer});
EXPECT_NE(ds, nullptr);

std::shared_ptr<Iterator> iter = ds->CreateIterator();
// Expect failure: invalid WordpieceTokenizer input with nullptr vocab
EXPECT_EQ(iter, nullptr);
}

TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) {
// Testing the parameter of UnicodeScriptTokenizer interface when the with_offsets and the keep_whitespace is default.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess.";