!13364 Add WordpieceTokenizer and AddDict to Jieba

From: @alexyuyue
Reviewed-by: 
Signed-off-by:
mindspore-ci-bot 2021-04-06 05:04:13 +08:00 committed by Gitee
commit 0cfd8b7c9e
10 changed files with 785 additions and 43 deletions

View File

@ -134,6 +134,15 @@ inline std::vector<std::pair<std::string, std::vector<int32_t>>> ClassIndexCharT
return ret;
}
inline std::vector<std::pair<std::vector<char>, int64_t>> PairStringInt64ToPairCharInt64(
const std::vector<std::pair<std::string, int64_t>> &s) {
std::vector<std::pair<std::vector<char>, int64_t>> ret;
std::transform(s.begin(), s.end(), std::back_inserter(ret), [](auto str) {
return std::pair<std::vector<char>, int64_t>(std::vector<char>(str.first.begin(), str.first.end()), str.second);
});
return ret;
}
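
For reference, a minimal sketch of how this new conversion helper might be used: each word's UTF-8 bytes are copied into a std::vector<char> and the frequency is passed through unchanged. Illustrative only; it assumes the header above is included and its namespace is in scope, and the function and variable names below are invented for the example.

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

void PairConversionSketch() {
  std::vector<std::pair<std::string, int64_t>> words = {{"江大桥", 20000}, {"男默女泪", 10}};
  // Each std::string becomes a std::vector<char> holding the same UTF-8 bytes;
  // the int64_t frequency is copied as-is.
  auto converted = PairStringInt64ToPairCharInt64(words);
  // converted[0].second == 20000, converted[1].second == 10
}
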
template <class T>
inline std::map<std::vector<char>, T> PadInfoStringToChar(const std::map<std::string, T> &s_pad_info) {
std::map<std::vector<char>, T> ret;

View File

@ -232,12 +232,17 @@ PYBIND_REGISTER(UnicodeCharTokenizerOperation, 1, ([](const py::module *m) {
}));
}));
// TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++
PYBIND_REGISTER(WordpieceTokenizerOp, 1, ([](const py::module *m) {
(void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
*m, "WordpieceTokenizerOp")
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &,
const bool &>());
PYBIND_REGISTER(WordpieceTokenizerOperation, 1, ([](const py::module *m) {
(void)py::class_<text::WordpieceTokenizerOperation, TensorOperation,
std::shared_ptr<text::WordpieceTokenizerOperation>>(*m,
"WordpieceTokenizerOperation")
.def(py::init([](const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
int32_t max_bytes_per_token, const std::string &unknown_token, bool with_offsets) {
auto wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizerOperation>(
vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets);
THROW_IF_ERROR(wordpiece_tokenizer->ValidateParams());
return wordpiece_tokenizer;
}));
}));
PYBIND_REGISTER(JiebaMode, 0, ([](const py::module *m) {

View File

@ -15,6 +15,8 @@
*/
#include <unistd.h>
#include <fstream>
#include <regex>
#include "minddata/dataset/include/text.h"
@ -131,7 +133,7 @@ std::shared_ptr<TensorOperation> JiebaTokenizer::Parse() {
return jieba_tokenizer;
}
Status JiebaTokenizer::AddWord(const std::string &word, int64_t freq) {
Status JiebaTokenizer::AddWordChar(const std::vector<char> &word, int64_t freq) {
if (word.empty()) {
std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided.";
MS_LOG(ERROR) << err_msg;
@ -142,7 +144,59 @@ Status JiebaTokenizer::AddWord(const std::string &word, int64_t freq) {
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
data_->words_list_.emplace_back(word, freq);
data_->words_list_.emplace_back(CharToString(word), freq);
return Status::OK();
}
Status JiebaTokenizer::AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict) {
for (auto &word_freq_pair : user_dict) {
RETURN_IF_NOT_OK(AddWordChar(word_freq_pair.first, word_freq_pair.second));
}
return Status::OK();
}
Status JiebaTokenizer::AddDictChar(const std::vector<char> &file_path) {
std::vector<std::pair<std::string, int64_t>> user_dict;
RETURN_IF_NOT_OK(ParserFile(CharToString(file_path), &user_dict));
RETURN_IF_NOT_OK(AddDictChar(PairStringInt64ToPairCharInt64(user_dict)));
return Status::OK();
}
Status JiebaTokenizer::ParserFile(const std::string &file_path,
std::vector<std::pair<std::string, int64_t>> *const user_dict) {
std::ifstream ifs(file_path);
if (!ifs) {
std::string err_msg = "JiebaTokenizer : Fail to load dictionary from the input file, check the file path.";
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
std::string line;
while (std::getline(ifs, line)) {
if (line.empty()) {
continue;
}
std::regex regex("^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$");
std::smatch tokens;
std::regex_match(line, tokens, regex);
if (std::regex_match(line, tokens, regex)) {
if (tokens.size() == 2) {
user_dict->emplace_back(tokens.str(1), 0);
} else if (tokens.size() == 3) {
user_dict->emplace_back(tokens.str(1), strtoll(tokens.str(2).c_str(), NULL, 0));
} else {
continue;
}
} else {
continue;
}
}
MS_LOG(INFO) << "JiebaTokenizer::AddDict: The size of user input dictionary is: " << user_dict->size();
MS_LOG(INFO) << "Valid rows in input dictionary (Maximum of first 10 rows are shown.):";
for (std::size_t i = 0; i != user_dict->size(); ++i) {
if (i >= 10) break;
MS_LOG(INFO) << user_dict->at(i).first << " " << user_dict->at(i).second;
}
return Status::OK();
}
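
As a quick illustration of what ParserFile accepts, the standalone sketch below runs the same regular expression over a few sample dictionary lines: a word optionally followed by a frequency is kept (a missing frequency is treated as 0), while rows that do not match are skipped. The sample lines are invented for the example.

#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
  std::regex regex("^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$");
  std::vector<std::string> lines = {"深度学习 3", "  江大桥   20000  ", "男默女泪", "two words 5"};
  for (const std::string &line : lines) {
    std::smatch tokens;
    if (std::regex_match(line, tokens, regex)) {
      // Group 1 is the word; group 2 is the optional frequency (empty means "use 0").
      std::cout << "word=" << tokens.str(1) << " freq=" << (tokens.str(2).empty() ? "0" : tokens.str(2)) << "\n";
    } else {
      // Rows with embedded spaces (or other non-matching content) are ignored.
      std::cout << "skipped: " << line << "\n";
    }
  }
  return 0;
}
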
@ -310,6 +364,32 @@ std::shared_ptr<TensorOperation> UnicodeCharTokenizer::Parse() {
return std::make_shared<UnicodeCharTokenizerOperation>(data_->with_offsets_);
}
// WordpieceTokenizer
struct WordpieceTokenizer::Data {
Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
const std::vector<char> &unknown_token, bool with_offsets)
: vocab_(vocab),
suffix_indicator_(CharToString(suffix_indicator)),
max_bytes_per_token_(max_bytes_per_token),
unknown_token_(CharToString(unknown_token)),
with_offsets_(with_offsets) {}
std::shared_ptr<Vocab> vocab_;
std::string suffix_indicator_;
int32_t max_bytes_per_token_;
std::string unknown_token_;
bool with_offsets_;
};
WordpieceTokenizer::WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
int32_t max_bytes_per_token, const std::vector<char> &unknown_token,
bool with_offsets)
: data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets)) {}
std::shared_ptr<TensorOperation> WordpieceTokenizer::Parse() {
return std::make_shared<WordpieceTokenizerOperation>(
data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->with_offsets_);
}
#ifndef _WIN32
// UnicodeScriptTokenizer
struct UnicodeScriptTokenizer::Data {

View File

@ -52,7 +52,7 @@ class BasicTokenizer final : public TensorTransform {
/// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
/// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]',
/// '[MASK]' (default=true).
/// \param[in] with_offsets If or not output offsets of tokens (default=false).
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit BasicTokenizer(bool lower_case = false, bool keep_whitespace = false,
const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true,
bool with_offsets = false);
@ -88,7 +88,7 @@ class BertTokenizer final : public TensorTransform {
/// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
/// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]',
/// '[MASK]' (default=true).
/// \param[in] with_offsets If or not output offsets of tokens (default=false).
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##",
int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]",
bool lower_case = false, bool keep_whitespace = false,
@ -145,7 +145,7 @@ class JiebaTokenizer final : public TensorTransform {
/// - JiebaMode.kMP, tokenize with MPSegment algorithm.
/// - JiebaMode.kHMM, tokenize with Hidden Markov Model Segment algorithm.
/// - JiebaMode.kMIX, tokenize with a mix of MPSegment and HMMSegment algorithm.
/// \param[in] with_offsets If or not output offsets of tokens (default=false).
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
const JiebaMode &mode = JiebaMode::kMix, bool with_offsets = false)
: JiebaTokenizer(StringToChar(hmm_path), StringToChar(mp_path), mode, with_offsets) {}
@ -156,7 +156,24 @@ class JiebaTokenizer final : public TensorTransform {
/// \brief Destructor
~JiebaTokenizer() = default;
Status AddWord(const std::string &word, int64_t freq = 0);
/// \brief Add user defined word to JiebaTokenizer's dictionary.
/// \param[in] word The word to be added to the JiebaTokenizer instance.
/// The added word will not be written into the built-in dictionary on disk.
/// \param[in] freq The frequency of the word to be added. The higher the frequency,
/// the better chance the word will be tokenized (default=None, use default frequency).
Status AddWord(const std::string &word, int64_t freq = 0) { return AddWordChar(StringToChar(word), freq); }
/// \brief Add user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary.
/// \param[in] user_dict Vector of word-freq pairs to be added to JiebaTokenizer's dictionary.
Status AddDict(const std::vector<std::pair<std::string, int64_t>> &user_dict) {
return AddDictChar(PairStringInt64ToPairCharInt64(user_dict));
}
/// \brief Add user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary from a file.
/// Only valid word-freq pairs in user provided file will be added into the dictionary.
/// Rows containing invalid input will be ignored, no error nor warning Status is returned.
/// \param[in] file_path Path to the dictionary which includes user defined word-freq pairs.
Status AddDict(const std::string &file_path) { return AddDictChar(StringToChar(file_path)); }
protected:
/// \brief Function to convert TensorTransform object into a TensorOperation object.
@ -164,6 +181,20 @@ class JiebaTokenizer final : public TensorTransform {
std::shared_ptr<TensorOperation> Parse() override;
private:
/// \brief Parser user defined word by file.
/// \param[in] file_path Path to the user defined file.
/// \param[in] user_dict Vector of word-freq pairs extracted from the user provided file.
Status ParserFile(const std::string &file_path, std::vector<std::pair<std::string, int64_t>> *const user_dict);
/// \brief Used to translate all API string to vector of char and back
Status AddWordChar(const std::vector<char> &word, int64_t freq = 0);
/// \brief Used to translate all API string to vector of char and back
Status AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict);
/// \brief Used to translate all API string to vector of char and back
Status AddDictChar(const std::vector<char> &file_path);
struct Data;
std::shared_ptr<Data> data_;
};
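
Taken together, the new AddWord/AddDict surface can be exercised roughly as in the sketch below. This is illustrative only: the dictionary paths are placeholders, and the include and using-directive follow the conventions of the C++ pipeline tests added later in this change.

#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "minddata/dataset/include/text.h"

using namespace mindspore::dataset;  // namespace usage as in the pipeline tests

void JiebaAddDictSketch() {
  std::string hmm_path = "/path/to/hmm_model.utf8";   // placeholder
  std::string mp_path = "/path/to/jieba.dict.utf8";   // placeholder
  auto jieba_tokenizer = std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);

  // Add a single word with the default frequency of 0.
  auto rc = jieba_tokenizer->AddWord("男默女泪");
  // Add a batch of word-freq pairs.
  std::vector<std::pair<std::string, int64_t>> user_dict = {{"江大桥", 20000}};
  rc = jieba_tokenizer->AddDict(user_dict);
  // Or load word-freq pairs from a file; rows that do not parse are skipped silently.
  rc = jieba_tokenizer->AddDict("/path/to/user_dict.txt");
}
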
@ -292,7 +323,7 @@ class RegexTokenizer final : public TensorTransform {
/// \param[in] keep_delim_pattern The string matched by 'delim_pattern' can be kept as a token if it can be
/// matched by 'keep_delim_pattern'. The default value is an empty string ("")
/// which means that delimiters will not be kept as an output token (default="").
/// \param[in] with_offsets If or not output offsets of tokens (default=false).
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "", bool with_offsets = false)
: RegexTokenizer(StringToChar(delim_pattern), StringToChar(keep_delim_pattern), with_offsets) {}
@ -416,7 +447,7 @@ class TruncateSequencePair final : public TensorTransform {
class UnicodeCharTokenizer final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] with_offsets If or not output offsets of tokens (default=false).
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit UnicodeCharTokenizer(bool with_offsets = false);
/// \brief Destructor
@ -432,13 +463,45 @@ class UnicodeCharTokenizer final : public TensorTransform {
std::shared_ptr<Data> data_;
};
/// \brief Tokenize scalar token or 1-D tokens to 1-D subword tokens.
class WordpieceTokenizer final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] vocab A Vocab object.
/// \param[in] suffix_indicator Used to show that the subword is the last part of a word (default='##').
/// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
/// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty
/// string, else return the string specified (default='[UNK]').
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##",
int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]",
bool with_offsets = false)
: WordpieceTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token),
with_offsets) {}
explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool with_offsets);
/// \brief Destructor
~WordpieceTokenizer() = default;
protected:
/// \brief Function to convert TensorTransform object into a TensorOperation object.
/// \return Shared pointer to TensorOperation object.
std::shared_ptr<TensorOperation> Parse() override;
private:
struct Data;
std::shared_ptr<Data> data_;
};
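
A minimal usage sketch for the new C++ WordpieceTokenizer transform, mirroring the pipeline tests added later in this change. The includes and using-directive are assumed to match the test file, and the vocabulary list is the one used in those tests; by default suffix_indicator is "##", max_bytes_per_token is 100, unknown_token is "[UNK]", and with_offsets is false.

#include <memory>
#include <string>
#include <vector>
#include "minddata/dataset/include/text.h"

using namespace mindspore::dataset;  // namespace usage as in the pipeline tests

std::shared_ptr<TensorTransform> MakeWordpieceTokenizerSketch() {
  std::vector<std::string> vocab_list = {"book", "cholera", "era", "favor", "##ite", "my",
                                         "is",   "love",    "dur", "##ing", "the"};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  // Build a vocab from the in-memory word list, exactly as the tests do.
  Status s = Vocab::BuildFromVector(vocab_list, {}, true, &vocab);
  if (s != Status::OK()) {
    return nullptr;
  }
  // Default parameters; the returned transform is then passed to Dataset::Map({...}, {"text"}).
  return std::make_shared<text::WordpieceTokenizer>(vocab);
}
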
#ifndef _WIN32
/// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
class UnicodeScriptTokenizer final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] keep_whitespace If or not emit whitespace tokens (default=false).
/// \param[in] with_offsets If or not output offsets of tokens (default=false).
/// \param[in] keep_whitespace Whether or not emit whitespace tokens (default=false).
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit UnicodeScriptTokenizer(bool keep_whitespace = false, bool with_offsets = false);
/// \brief Destructor
@ -458,7 +521,7 @@ class UnicodeScriptTokenizer final : public TensorTransform {
class WhitespaceTokenizer final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] with_offsets If or not output offsets of tokens (default=false).
/// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
explicit WhitespaceTokenizer(bool with_offsets = false);
/// \brief Destructor

View File

@ -36,6 +36,7 @@
#include "minddata/dataset/text/kernels/to_number_op.h"
#include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h"
#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
#include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h"
#ifndef _WIN32
#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
@ -396,6 +397,39 @@ std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() {
return tensor_op;
}
// WordpieceTokenizerOperation
WordpieceTokenizerOperation::WordpieceTokenizerOperation(const std::shared_ptr<Vocab> &vocab,
const std::string &suffix_indicator,
int32_t max_bytes_per_token, const std::string &unknown_token,
bool with_offsets)
: vocab_(vocab),
suffix_indicator_(suffix_indicator),
max_bytes_per_token_(max_bytes_per_token),
unknown_token_(unknown_token),
with_offsets_(with_offsets) {}
Status WordpieceTokenizerOperation::ValidateParams() {
if (vocab_ == nullptr) {
std::string err_msg = "WordpieceTokenizer: vocab object type is incorrect or null.";
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
if (max_bytes_per_token_ < 0) {
std::string err_msg =
"WordpieceTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " +
std::to_string(max_bytes_per_token_);
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
return Status::OK();
}
std::shared_ptr<TensorOp> WordpieceTokenizerOperation::Build() {
std::shared_ptr<WordpieceTokenizerOp> tensor_op = std::make_shared<WordpieceTokenizerOp>(
vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, with_offsets_);
return tensor_op;
}
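
For orientation, the sketch below shows how this IR node is typically driven, mirroring the Python binding registration earlier in this change: construct the operation, validate it, then build the underlying kernel. The helper name and its vocab parameter are invented, and the snippet assumes the same headers and namespace qualification as the surrounding file.

std::shared_ptr<TensorOp> BuildWordpieceKernelSketch(const std::shared_ptr<Vocab> &vocab) {
  auto op = std::make_shared<text::WordpieceTokenizerOperation>(vocab, "##", 100, "[UNK]", false);
  // ValidateParams() rejects a null vocab and a negative max_bytes_per_token.
  if (op->ValidateParams() != Status::OK()) {
    return nullptr;
  }
  // Build() wraps the validated parameters into the existing WordpieceTokenizerOp kernel.
  return op->Build();
}
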
#ifndef _WIN32
// UnicodeScriptTokenizerOperation
UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets)

View File

@ -49,6 +49,7 @@ constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair";
constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer";
constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer";
constexpr char kWordpieceTokenizerOperation[] = "WordpieceTokenizer";
/* ####################################### Derived TensorOperation classes ################################# */
@ -318,6 +319,28 @@ class UnicodeCharTokenizerOperation : public TensorOperation {
bool with_offsets_;
};
class WordpieceTokenizerOperation : public TensorOperation {
public:
explicit WordpieceTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
int32_t max_bytes_per_token, const std::string &unknown_token,
bool with_offsets);
~WordpieceTokenizerOperation() = default;
std::shared_ptr<TensorOp> Build() override;
Status ValidateParams() override;
std::string Name() const override { return kWordpieceTokenizerOperation; }
private:
std::shared_ptr<Vocab> vocab_;
std::string suffix_indicator_;
int32_t max_bytes_per_token_;
std::string unknown_token_;
bool with_offsets_;
};
#ifndef _WIN32
class UnicodeScriptTokenizerOperation : public TensorOperation {
public:

View File

@ -2207,7 +2207,7 @@ def _pyfunc_worker_init(pyfunc_list):
# All exceptions will be raised to main processes
def _pyfunc_worker_exec(index, *args):
"""
Internal function for call certain pyfunc in python process.
Internal function for call certain pyfunc in Python process.
"""
# Some threads in multiprocess.pool can't process sigint signal,
# and will occur hang problem, so ctrl+c will pass to parent process.
@ -2352,7 +2352,7 @@ class MapDataset(Dataset):
# Pass #1, look for Python callables and build list
for op in self.operations:
# our c transforms is now callable and should not be run in python multithreading
# our c transforms is now callable and should not be run in Python multithreading
if callable(op) and str(op).find("c_transform") < 0:
callable_list.append(op)
@ -2373,7 +2373,7 @@ class MapDataset(Dataset):
with _LOCK:
_OP_PROCESS.update(process_id)
for op in self.operations:
# our c transforms is now callable and should not be run in python multithreading
# our c transforms is now callable and should not be run in Python multithreading
if callable(op) and str(op).find("c_transform") < 0:
# Wrap Python callable into _PythonCallable
iter_specific_operations.append(_PythonCallable(op, idx, self.process_pool))

View File

@ -610,7 +610,7 @@ class SubsetSampler(BuiltinSampler):
Samples the elements from a sequence of indices.
Args:
indices (Any iterable python object but string): A sequence of indices.
indices (Any iterable Python object but string): A sequence of indices.
num_samples (int, optional): Number of elements to sample (default=None, all elements).
Examples:

View File

@ -102,7 +102,7 @@ class JiebaTokenizer(TextTensorOperation):
- JiebaMode.MP, tokenize with MPSegment algorithm.
- JiebaMode.HMM, tokenize with Hidden Markov Model Segment algorithm.
- JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm.
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).
Examples:
>>> from mindspore.dataset.text import JiebaMode
@ -186,6 +186,9 @@ class JiebaTokenizer(TextTensorOperation):
word2 None
word3 freq3
Only valid word-freq pairs in user provided file will be added into the dictionary.
Rows containing invalid input will be ignored. No error nor warning Status is returned.
Examples:
>>> from mindspore.dataset.text import JiebaMode
>>> jieba_hmm_file = "/path/to/jieba/hmm/file"
@ -221,16 +224,16 @@ class JiebaTokenizer(TextTensorOperation):
"user dict file {} is not exist.".format(file_path))
real_file_path = os.path.realpath(file_path)
file_dict = open(real_file_path)
data_re = re.compile('^(.+?)( [0-9]+)?$', re.U)
data_re = re.compile('^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$', re.U)
words_list = []
for item in file_dict:
data = item.strip()
if not isinstance(data, str):
data = self.__decode(data)
words = data_re.match(data).groups()
if len(words) != 2:
raise ValueError(
"user dict file {} format error.".format(real_file_path))
tmp = data_re.match(data)
if not tmp:
continue
words = tmp.groups()
words_list.append(words)
file_dict.close()
return words_list
@ -452,7 +455,7 @@ class UnicodeCharTokenizer(TextTensorOperation):
Tokenize a scalar tensor of UTF-8 string to Unicode characters.
Args:
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).
Examples:
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
@ -474,8 +477,7 @@ class UnicodeCharTokenizer(TextTensorOperation):
return cde.UnicodeCharTokenizerOperation(self.with_offsets)
# TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++
class WordpieceTokenizer(cde.WordpieceTokenizerOp):
class WordpieceTokenizer(TextTensorOperation):
"""
Tokenize scalar token or 1-D tokens to 1-D subword tokens.
@ -485,7 +487,7 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is empty string,
return the token directly, else return 'unknown_token' (default='[UNK]').
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).
Examples:
>>> vocab_list = ["book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"]
@ -511,8 +513,10 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
self.max_bytes_per_token = max_bytes_per_token
self.unknown_token = unknown_token
self.with_offsets = with_offsets
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token,
self.unknown_token, self.with_offsets)
def parse(self):
return cde.WordpieceTokenizerOperation(self.vocab, self.suffix_indicator, self.max_bytes_per_token,
self.unknown_token, self.with_offsets)
class PythonTokenizer:
@ -572,7 +576,7 @@ if platform.system().lower() != 'windows':
only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
preserve_unused_token (bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).
Examples:
>>> from mindspore.dataset.text import NormalizeForm
@ -638,7 +642,7 @@ if platform.system().lower() != 'windows':
only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
preserve_unused_token (bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).
Examples:
>>> from mindspore.dataset.text import NormalizeForm
@ -793,7 +797,7 @@ if platform.system().lower() != 'windows':
keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
if it can be matched by 'keep_delim_pattern'. The default value is an empty str ('')
which means that delimiters will not be kept as an output token (default='').
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).
Examples:
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
@ -829,8 +833,8 @@ if platform.system().lower() != 'windows':
UnicodeScriptTokenizer is not supported on Windows platform yet.
Args:
keep_whitespace (bool, optional): If or not emit whitespace tokens (default=False).
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
keep_whitespace (bool, optional): Whether or not emit whitespace tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).
Examples:
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
@ -865,7 +869,7 @@ if platform.system().lower() != 'windows':
WhitespaceTokenizer is not supported on Windows platform yet.
Args:
with_offsets (bool, optional): If or not output offsets of tokens (default=False).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).
Examples:
>>> # If with_offsets=False, default output one column {["text", dtype=str]}

View File

@ -1048,7 +1048,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) {
EXPECT_NE(jieba_tokenizer, nullptr);
// Add word with freq not provided (default 0)
jieba_tokenizer->AddWord("男默女泪");
ASSERT_OK(jieba_tokenizer->AddWord("男默女泪"));
// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});
@ -1100,7 +1100,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) {
EXPECT_NE(jieba_tokenizer, nullptr);
// Add word with freq is set explicitly to 0
jieba_tokenizer->AddWord("男默女泪", 0);
ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 0));
// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});
@ -1152,7 +1152,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) {
EXPECT_NE(jieba_tokenizer, nullptr);
// Add word with freq 10
jieba_tokenizer->AddWord("男默女泪", 10);
ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 10));
// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});
@ -1204,7 +1204,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) {
EXPECT_NE(jieba_tokenizer, nullptr);
// Add word with freq 20000
jieba_tokenizer->AddWord("江大桥", 20000);
ASSERT_OK(jieba_tokenizer->AddWord("江大桥", 20000));
// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});
@ -1262,6 +1262,115 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) {
EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK());
}
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDict) {
// Testing AddDict of JiebaTokenizer when the input is a vector of word-freq pair.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDict.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
std::shared_ptr<Dataset> ds = TextFile({data_file});
EXPECT_NE(ds, nullptr);
// Create jieba_tokenizer operation on ds
std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
EXPECT_NE(jieba_tokenizer, nullptr);
// Add word with freq 20000
std::vector<std::pair<std::string, int64_t>> user_dict = {{"江大桥", 20000}};
ASSERT_OK(jieba_tokenizer->AddDict(user_dict));
// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "", "长江大桥", "", "通车", "仪式"};
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["text"];
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDictFromFile) {
// Testing AddDict of JiebaTokenizer when the input is a path to dict.
// Test error scenario for AddDict: invalid path
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDictFromFile.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
std::shared_ptr<Dataset> ds = TextFile({data_file});
EXPECT_NE(ds, nullptr);
// Create jieba_tokenizer operation on ds
std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
EXPECT_NE(jieba_tokenizer, nullptr);
// Load dict from txt file
std::string user_dict_path = datasets_root_path_ + "/testJiebaDataset/user_dict.txt";
std::string invalid_path = datasets_root_path_ + "/testJiebaDataset/invalid_path.txt";
EXPECT_ERROR(jieba_tokenizer->AddDict(invalid_path));
ASSERT_OK(jieba_tokenizer->AddDict(user_dict_path));
// Create Map operation on ds
ds = ds->Map({jieba_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "", "外面", "玩吧"};
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["text"];
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) {
// Testing the parameter of SlidingWindow interface when the axis is 0.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess.";
@ -2662,6 +2771,421 @@ TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess1) {
iter->Stop();
}
std::vector<std::string> vocab_english = {"book", "cholera", "era", "favor", "##ite", "my",
"is", "love", "dur", "##ing", "the"};
std::vector<std::string> vocab_chinese = {"", "", "", "", "", "", "", "", "", "", "", "", ""};
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess1.";
// Test WordpieceTokenizer with default parameters on English vocab
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(10);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab);
EXPECT_NE(wordpiece_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({wordpiece_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);
std::vector<std::vector<std::string>> expected = {
{"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};
uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["text"];
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 10);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess2) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess2.";
// Test WordpieceTokenizer with empty unknown_token
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(10);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer =
std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "", false);
EXPECT_NE(wordpiece_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({wordpiece_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);
std::vector<std::vector<std::string>> expected = {
{"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"what"}};
uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["text"];
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 10);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess3) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess3.";
// Test WordpieceTokenizer with non-default max_bytes_per_token
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(10);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer =
std::make_shared<text::WordpieceTokenizer>(vocab, "##", 4, "[UNK]", false);
EXPECT_NE(wordpiece_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({wordpiece_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);
std::vector<std::vector<std::string>> expected = {{"my"}, {"[UNK]"}, {"book"}, {"is"}, {"love"},
{"[UNK]"}, {"the"}, {"[UNK]"}, {"era"}, {"[UNK]"}};
uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["text"];
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 10);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess4) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess4.";
// Test WordpieceTokenizer with default parameters on Chinese vocab
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(10);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(15);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(vocab_chinese, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer =
std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", false);
EXPECT_NE(wordpiece_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({wordpiece_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);
std::vector<std::vector<std::string>> expected = {{""}, {""}, {""}, {""}, {""}, {""}, {""}, {""},
{""}, {""}, {""}, {""}, {""}, {""}, {"[UNK]"}};
uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["text"];
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 15);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess5) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess5.";
// Test WordpieceTokenizer with with_offsets true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(10);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer =
std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", true);
EXPECT_NE(wordpiece_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);
std::vector<std::vector<std::string>> expected = {
{"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};
std::vector<std::vector<uint32_t>> expected_offsets_start = {{0}, {0, 5}, {0}, {0}, {0}, {0, 3}, {0}, {0}, {0}, {0}};
std::vector<std::vector<uint32_t>> expected_offsets_limit = {{2}, {5, 8}, {4}, {2}, {4}, {3, 6}, {3}, {7}, {3}, {4}};
uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["token"];
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
auto start = row["offsets_start"];
std::shared_ptr<Tensor> de_expected_start_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], &de_expected_start_tensor));
mindspore::MSTensor expected_start_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_start_tensor));
EXPECT_MSTENSOR_EQ(start, expected_start_tensor);
auto limit = row["offsets_limit"];
std::shared_ptr<Tensor> de_expected_limit_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], &de_expected_limit_tensor));
mindspore::MSTensor expected_limit_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_limit_tensor));
EXPECT_MSTENSOR_EQ(limit, expected_limit_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 10);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess6) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess6.";
// Test WordpieceTokenizer with max_bytes_per_token equals to 0
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(10);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer =
std::make_shared<text::WordpieceTokenizer>(vocab, "##", 0, "[UNK]", true);
EXPECT_NE(wordpiece_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
iter->GetNextRow(&row);
std::vector<std::vector<std::string>> expected = {{"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"},
{"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}};
uint64_t i = 0;
while (row.size() != 0) {
auto txt = row["token"];
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
mindspore::MSTensor expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
EXPECT_MSTENSOR_EQ(txt, expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 10);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail1.";
// Test WordpieceTokenizer with nullptr vocab
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(nullptr);
EXPECT_NE(wordpiece_tokenizer, nullptr);
// Create a Map operation on ds
ds = ds->Map({wordpiece_tokenizer});
EXPECT_NE(ds, nullptr);
std::shared_ptr<Iterator> iter = ds->CreateIterator();
// Expect failure: invalid WordpieceTokenizer input with nullptr vocab
EXPECT_EQ(iter, nullptr);
}
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail2) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail2.";
// Test WordpieceTokenizer with negative max_bytes_per_token
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create WordpieceTokenizer operation on ds
std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab, "##", -1);
EXPECT_NE(wordpiece_tokenizer, nullptr);
// Create a Map operation on ds
ds = ds->Map({wordpiece_tokenizer});
EXPECT_NE(ds, nullptr);
std::shared_ptr<Iterator> iter = ds->CreateIterator();
// Expect failure: invalid WordpieceTokenizer input with negative max_bytes_per_token
EXPECT_EQ(iter, nullptr);
}
TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) {
// Testing the parameter of UnicodeScriptTokenizer interface when the with_offsets and the keep_whitespace is default.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess.";