!9530 add BasicTokenizer and BertTokenizer C++ API

From: @tiancixiao
Reviewed-by: @mikef, @nsyca
Signed-off-by: @nsyca
Committed by mindspore-ci-bot on 2020-12-08 04:09:40 +08:00 via Gitee
commit 6b5626634c
3 changed files with 795 additions and 0 deletions

@@ -18,6 +18,8 @@
#include "minddata/dataset/include/text.h"
#ifndef _WIN32
#include "minddata/dataset/text/kernels/basic_tokenizer_op.h"
#include "minddata/dataset/text/kernels/bert_tokenizer_op.h"
#include "minddata/dataset/text/kernels/case_fold_op.h"
#endif
#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
@@ -47,6 +49,27 @@ namespace text {
// (In alphabetical order)
#ifndef _WIN32
std::shared_ptr<BasicTokenizerOperation> BasicTokenizer(bool lower_case, bool keep_whitespace,
const NormalizeForm normalize_form, bool preserve_unused_token,
bool with_offsets) {
auto op = std::make_shared<BasicTokenizerOperation>(lower_case, keep_whitespace, normalize_form,
preserve_unused_token, with_offsets);
return op->ValidateParams() ? op : nullptr;
}
std::shared_ptr<BertTokenizerOperation> BertTokenizer(const std::shared_ptr<Vocab> &vocab,
const std::string &suffix_indicator, int32_t max_bytes_per_token,
const std::string &unknown_token, bool lower_case,
bool keep_whitespace, const NormalizeForm normalize_form,
bool preserve_unused_token, bool with_offsets) {
auto op =
std::make_shared<BertTokenizerOperation>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case,
keep_whitespace, normalize_form, preserve_unused_token, with_offsets);
return op->ValidateParams() ? op : nullptr;
}
std::shared_ptr<CaseFoldOperation> CaseFold() {
auto op = std::make_shared<CaseFoldOperation>();
@@ -168,6 +191,64 @@ Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::s
// (In alphabetical order)
#ifndef _WIN32
// BasicTokenizerOperation
BasicTokenizerOperation::BasicTokenizerOperation(bool lower_case, bool keep_whitespace,
const NormalizeForm normalize_form, bool preserve_unused_token,
bool with_offsets)
: lower_case_(lower_case),
keep_whitespace_(keep_whitespace),
normalize_form_(normalize_form),
preserve_unused_token_(preserve_unused_token),
with_offsets_(with_offsets) {}
Status BasicTokenizerOperation::ValidateParams() { return Status::OK(); }
std::shared_ptr<TensorOp> BasicTokenizerOperation::Build() {
std::shared_ptr<BasicTokenizerOp> tensor_op = std::make_shared<BasicTokenizerOp>(
lower_case_, keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_);
return tensor_op;
}
// BertTokenizerOperation
BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
int32_t max_bytes_per_token, const std::string &unknown_token,
bool lower_case, bool keep_whitespace,
const NormalizeForm normalize_form, bool preserve_unused_token,
bool with_offsets)
: vocab_(vocab),
suffix_indicator_(suffix_indicator),
max_bytes_per_token_(max_bytes_per_token),
unknown_token_(unknown_token),
lower_case_(lower_case),
keep_whitespace_(keep_whitespace),
normalize_form_(normalize_form),
preserve_unused_token_(preserve_unused_token),
with_offsets_(with_offsets) {}
Status BertTokenizerOperation::ValidateParams() {
if (vocab_ == nullptr) {
std::string err_msg = "BertTokenizer: vocab object type is incorrect or null.";
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
if (max_bytes_per_token_ < 0) {
std::string err_msg = "BertTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " +
std::to_string(max_bytes_per_token_);
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
return Status::OK();
}
std::shared_ptr<TensorOp> BertTokenizerOperation::Build() {
std::shared_ptr<BertTokenizerOp> tensor_op =
std::make_shared<BertTokenizerOp>(vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, lower_case_,
keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_);
return tensor_op;
}
// CaseFoldOperation
Status CaseFoldOperation::ValidateParams() { return Status::OK(); }
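
Note: the factories above share a validate-or-return-nullptr convention: ValidateParams() runs eagerly at construction time, and a null shared pointer signals invalid arguments. A minimal caller-side sketch (the error handling is illustrative and not part of this patch; vocab is assumed to be built as in the tests below):

auto tokenizer = text::BertTokenizer(vocab, "##", -1);  // invalid: negative max_bytes_per_token
if (tokenizer == nullptr) {
  // ValidateParams() rejected the arguments; bail out before building the pipeline.
  MS_LOG(ERROR) << "BertTokenizer parameters failed validation.";
  return;
}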

@@ -38,6 +38,8 @@ namespace dataset {
namespace text {
// Char arrays storing name of corresponding classes (in alphabetical order)
constexpr char kBasicTokenizerOperation[] = "BasicTokenizer";
constexpr char kBertTokenizerOperation[] = "BertTokenizer";
constexpr char kCaseFoldOperation[] = "CaseFold";
constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer";
constexpr char kLookupOperation[] = "Lookup";
@@ -53,6 +55,8 @@ constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer";
// Text Op classes (in alphabetical order)
#ifndef _WIN32
class BasicTokenizerOperation;
class BertTokenizerOperation;
class CaseFoldOperation;
#endif
class JiebaTokenizerOperation;
@@ -72,6 +76,47 @@ class WhitespaceTokenizerOperation;
#endif
#ifndef _WIN32
/// \brief Tokenize a scalar tensor of UTF-8 string by specific rules.
/// \notes BasicTokenizer is not supported on the Windows platform yet.
/// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations on the input
/// text to fold it to lower case and strip accent characters. If false, only apply the NormalizeUTF8
/// ('normalization_form' mode) operation on the input text (default=false).
/// \param[in] keep_whitespace If true, whitespace will be kept in the output tokens (default=false).
/// \param[in] normalize_form Used to specify the normalization form. This is only effective when 'lower_case' is
/// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
/// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and
/// '[MASK]' (default=true).
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<BasicTokenizerOperation> BasicTokenizer(bool lower_case = false, bool keep_whitespace = false,
const NormalizeForm normalize_form = NormalizeForm::kNone,
bool preserve_unused_token = true, bool with_offsets = false);
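
A minimal usage sketch with the pipeline API exercised by the tests below; the corpus path here is hypothetical:

// Read a text file and tokenize each line with the default settings
// (lower_case=false, keep_whitespace=false, NormalizeForm::kNone,
// preserve_unused_token=true, with_offsets=false).
std::shared_ptr<Dataset> ds = TextFile({"/path/to/corpus.txt"}, 0, ShuffleMode::kFalse);
std::shared_ptr<TensorOperation> basic_tokenizer = text::BasicTokenizer();
ds = ds->Map({basic_tokenizer}, {"text"});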
/// \brief Tokenizer used for Bert text processing.
/// \notes BertTokenizer is not supported on the Windows platform yet.
/// \param[in] vocab A Vocab object.
/// \param[in] suffix_indicator Used to show that the subword is the last part of a word (default='##').
/// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
/// \param[in] unknown_token When a token cannot be found: if 'unknown_token' is an empty string, return the token
/// directly; otherwise return the specified string (default='[UNK]').
/// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations on the input
/// text to fold it to lower case and strip accent characters. If false, only apply the NormalizeUTF8
/// ('normalization_form' mode) operation on the input text (default=false).
/// \param[in] keep_whitespace If true, whitespace will be kept in the output tokens (default=false).
/// \param[in] normalize_form Used to specify the normalization form. This is only effective when 'lower_case' is
/// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
/// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and
/// '[MASK]' (default=true).
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<BertTokenizerOperation> BertTokenizer(const std::shared_ptr<Vocab> &vocab,
const std::string &suffix_indicator = "##",
int32_t max_bytes_per_token = 100,
const std::string &unknown_token = "[UNK]",
bool lower_case = false, bool keep_whitespace = false,
const NormalizeForm normalize_form = NormalizeForm::kNone,
bool preserve_unused_token = true, bool with_offsets = false);
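
A similar sketch for BertTokenizer, reusing the dataset ds from the sketch above and building the vocabulary in memory with Vocab::BuildFromVector, as the tests below do; the word list here is hypothetical:

// Build a small in-memory vocabulary; '##'-prefixed entries are subwords.
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector({"i", "am", "work", "##ing"}, {}, true, &vocab);
std::shared_ptr<TensorOperation> bert_tokenizer = text::BertTokenizer(vocab);
if (bert_tokenizer != nullptr) {
  ds = ds->Map({bert_tokenizer}, {"text"});
}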
/// \brief Apply case fold operation on UTF-8 string tensor.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<CaseFoldOperation> CaseFold();
@@ -193,6 +238,54 @@ std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offs
/* ####################################### Derived TensorOperation classes ################################# */
#ifndef _WIN32
class BasicTokenizerOperation : public TensorOperation {
public:
BasicTokenizerOperation(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
bool preserve_unused_token, bool with_offsets);
~BasicTokenizerOperation() = default;
std::shared_ptr<TensorOp> Build() override;
Status ValidateParams() override;
std::string Name() const override { return kBasicTokenizerOperation; }
private:
bool lower_case_;
bool keep_whitespace_;
NormalizeForm normalize_form_;
bool preserve_unused_token_;
bool with_offsets_;
};
class BertTokenizerOperation : public TensorOperation {
public:
BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case,
bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
bool with_offsets);
~BertTokenizerOperation() = default;
std::shared_ptr<TensorOp> Build() override;
Status ValidateParams() override;
std::string Name() const override { return kBertTokenizerOperation; }
private:
std::shared_ptr<Vocab> vocab_;
std::string suffix_indicator_;
int32_t max_bytes_per_token_;
std::string unknown_token_;
bool lower_case_;
bool keep_whitespace_;
NormalizeForm normalize_form_;
bool preserve_unused_token_;
bool with_offsets_;
};
class CaseFoldOperation : public TensorOperation {
public:
CaseFoldOperation() = default;

@@ -34,6 +34,627 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
protected:
};
TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess1.";
// Test BasicTokenizer with default parameters
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(6);
EXPECT_NE(ds, nullptr);
// Create BasicTokenizer operation on ds
std::shared_ptr<TensorOperation> basic_tokenizer = text::BasicTokenizer();
EXPECT_NE(basic_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({basic_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::vector<std::string>> expected = {
{"Welcome", "to", "Beijing", "", "", "", "", ""},
{"", "", "", "", "", "", "", "", "", "", "", "", "", "", ""},
{"😀", "", "", "😃", "", "", "😄", "", "", "😁", "", ""},
{"", "", "", "1368", "", "1644", "", "", "", "", "", "", "1644", "", "1911", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""},
{"", "", "", "1368", "-", "1644", "", "", "", "", "", "1644",
"-", "1911", "", "", "", "", "", "", "", "", "", "",
"", "", "", "における", "", "", "の2つの", "", "", "でした"},
{"명나라", "(", "1368", "-", "1644", ")", "", "청나라", "(", "1644", "-",
"1911", ")", "", "중국", "봉건", "왕조의", "역사에서", "마지막", "", "왕조였다"}};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected[i], &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 6);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess2) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess2.";
// Test BasicTokenizer with lower_case true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(6);
EXPECT_NE(ds, nullptr);
// Create BasicTokenizer operation on ds
std::shared_ptr<TensorOperation> basic_tokenizer = text::BasicTokenizer(true);
EXPECT_NE(basic_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({basic_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"this", "is", "a", "funky", "string"};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected, &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess3) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess3.";
// Test BasicTokenizer with with_offsets true and lower_case true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(6);
EXPECT_NE(ds, nullptr);
// Create BasicTokenizer operation on ds
std::shared_ptr<TensorOperation> basic_tokenizer =
text::BasicTokenizer(true, false, NormalizeForm::kNone, true, true);
EXPECT_NE(basic_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({basic_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected_tokens = {"this", "is", "a", "funky", "string"};
std::vector<uint32_t> expected_offsets_start = {0, 5, 8, 10, 16};
std::vector<uint32_t> expected_offsets_limit = {4, 7, 9, 15, 22};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["token"];
std::shared_ptr<Tensor> expected_token_tensor;
Tensor::CreateFromVector(expected_tokens, &expected_token_tensor);
EXPECT_EQ(*ind, *expected_token_tensor);
auto start = row["offsets_start"];
std::shared_ptr<Tensor> expected_start_tensor;
Tensor::CreateFromVector(expected_offsets_start, &expected_start_tensor);
EXPECT_EQ(*start, *expected_start_tensor);
auto limit = row["offsets_limit"];
std::shared_ptr<Tensor> expected_limit_tensor;
Tensor::CreateFromVector(expected_offsets_limit, &expected_limit_tensor);
EXPECT_EQ(*limit, *expected_limit_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
std::vector<std::string> list = {
"", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "",
"", "i", "am", "mak", "make", "small", "mistake", "##s", "during", "work", "##ing", "hour",
"😀", "😃", "😄", "😁", "+", "/", "-", "=", "12", "28", "40", "16",
" ", "I", "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]", "[unused1]", "[unused10]"};
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess1.";
// Test BertTokenizer with default parameters
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(4);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer = text::BertTokenizer(vocab);
EXPECT_NE(bert_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({bert_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::vector<std::string>> expected = {{"", "", "", "", ""},
{"", "", "", "", ""},
{"", "", "", "", ""},
{"", "", "", "", ""}};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected[i], &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 4);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess2) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess2.";
// Test BertTokenizer with lower_case true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(4);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(1);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer = text::BertTokenizer(vocab, "##", 100, "[UNK]", true);
EXPECT_NE(bert_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({bert_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"i", "am", "mak", "##ing", "small", "mistake",
"##s", "during", "work", "##ing", "hour", "##s"};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected, &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess3) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess3.";
// Test BertTokenizer with normalization_form NFC
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(5);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(2);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer =
text::BertTokenizer(vocab, "##", 100, "[UNK]", false, false, NormalizeForm::kNfc);
EXPECT_NE(bert_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({bert_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::vector<std::string>> expected = {
{"😀", "", "", "😃", "", "", "😄", "", "", "😁", "", ""}, {"", "", ""}};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected[i], &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 2);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess4) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess4.";
// Test BertTokenizer with keep_whitespace true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(7);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(1);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer = text::BertTokenizer(vocab, "##", 100, "[UNK]", false, true);
EXPECT_NE(bert_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({bert_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"[UNK]", " ", "[CLS]"};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected, &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess5) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess5.";
// Test BertTokenizer with unknown_token empty and keep_whitespace true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(7);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(1);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer = text::BertTokenizer(vocab, "##", 100, "", false, true);
EXPECT_NE(bert_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({bert_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"unused", " ", "[CLS]"};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected, &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess6) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess6.";
// Test BertTokenizer with preserve_unused_token false, unknown_token empty and keep_whitespace true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(7);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(1);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer =
text::BertTokenizer(vocab, "##", 100, "", false, true, NormalizeForm::kNone, false);
EXPECT_NE(bert_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({bert_tokenizer}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected = {"unused", " ", "[", "CLS", "]"};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["text"];
std::shared_ptr<Tensor> expected_tensor;
Tensor::CreateFromVector(expected, &expected_tensor);
EXPECT_EQ(*ind, *expected_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess7) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess7.";
// Test BertTokenizer with with_offsets true and lower_case true
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create Skip operation on ds
ds = ds->Skip(4);
EXPECT_NE(ds, nullptr);
// Create Take operation on ds
ds = ds->Take(1);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer =
text::BertTokenizer(vocab, "##", 100, "[UNK]", true, false, NormalizeForm::kNone, true, true);
EXPECT_NE(bert_tokenizer, nullptr);
// Create Map operation on ds
ds = ds->Map({bert_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
std::vector<std::string> expected_tokens = {"i", "am", "mak", "##ing", "small", "mistake",
"##s", "during", "work", "##ing", "hour", "##s"};
std::vector<uint32_t> expected_offsets_start = {0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46};
std::vector<uint32_t> expected_offsets_limit = {1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47};
uint64_t i = 0;
while (row.size() != 0) {
auto ind = row["token"];
std::shared_ptr<Tensor> expected_token_tensor;
Tensor::CreateFromVector(expected_tokens, &expected_token_tensor);
EXPECT_EQ(*ind, *expected_token_tensor);
auto start = row["offsets_start"];
std::shared_ptr<Tensor> expected_start_tensor;
Tensor::CreateFromVector(expected_offsets_start, &expected_start_tensor);
EXPECT_EQ(*start, *expected_start_tensor);
auto limit = row["offsets_limit"];
std::shared_ptr<Tensor> expected_limit_tensor;
Tensor::CreateFromVector(expected_offsets_limit, &expected_limit_tensor);
EXPECT_EQ(*limit, *expected_limit_tensor);
iter->GetNextRow(&row);
i++;
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestBertTokenizerFail1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail1.";
// Test BertTokenizer with nullptr vocab
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer = text::BertTokenizer(nullptr);
// Expect failure: invalid BertTokenizer input with nullptr vocab
EXPECT_EQ(bert_tokenizer, nullptr);
}
TEST_F(MindDataTestPipeline, TestBertTokenizerFail2) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail2.";
// Test BertTokenizer with negative max_bytes_per_token
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
// Create a vocab from vector
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Create BertTokenizer operation on ds
std::shared_ptr<TensorOperation> bert_tokenizer = text::BertTokenizer(vocab, "##", -1);
// Expect failure: invalid BertTokenizer input with negative max_bytes_per_token
EXPECT_EQ(bert_tokenizer, nullptr);
}
TEST_F(MindDataTestPipeline, TestCaseFoldSuccess) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaseFoldSuccess.";