!9524 Add RegexReplace and RegexTokenizer C++ API
From: @shenwei41
Commit: 921930a3ae
@@ -25,6 +25,8 @@
 #include "minddata/dataset/text/kernels/ngram_op.h"
 #ifndef _WIN32
 #include "minddata/dataset/text/kernels/normalize_utf8_op.h"
+#include "minddata/dataset/text/kernels/regex_replace_op.h"
+#include "minddata/dataset/text/kernels/regex_tokenizer_op.h"
 #endif
 #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
 #include "minddata/dataset/text/kernels/sliding_window_op.h"
@@ -80,6 +82,19 @@ std::shared_ptr<NormalizeUTF8Operation> NormalizeUTF8(NormalizeForm normalize_form
   return op->ValidateParams() ? op : nullptr;
 }
 
+std::shared_ptr<RegexReplaceOperation> RegexReplace(std::string pattern, std::string replace, bool replace_all) {
+  auto op = std::make_shared<RegexReplaceOperation>(pattern, replace, replace_all);
+
+  return op->ValidateParams() ? op : nullptr;
+}
+
+std::shared_ptr<RegexTokenizerOperation> RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern,
+                                                        bool with_offsets) {
+  auto op = std::make_shared<RegexTokenizerOperation>(delim_pattern, keep_delim_pattern, with_offsets);
+
+  return op->ValidateParams() ? op : nullptr;
+}
 #endif
 
 std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
@@ -276,6 +291,30 @@ std::shared_ptr<TensorOp> NormalizeUTF8Operation::Build() {
   std::shared_ptr<NormalizeUTF8Op> tensor_op = std::make_shared<NormalizeUTF8Op>(normalize_form_);
   return tensor_op;
 }
+
+// RegexReplaceOperation
+RegexReplaceOperation::RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all)
+    : pattern_(pattern), replace_(replace), replace_all_(replace_all) {}
+
+Status RegexReplaceOperation::ValidateParams() { return Status::OK(); }
+
+std::shared_ptr<TensorOp> RegexReplaceOperation::Build() {
+  std::shared_ptr<RegexReplaceOp> tensor_op = std::make_shared<RegexReplaceOp>(pattern_, replace_, replace_all_);
+  return tensor_op;
+}
+
+// RegexTokenizerOperation
+RegexTokenizerOperation::RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern,
+                                                 bool with_offsets)
+    : delim_pattern_(delim_pattern), keep_delim_pattern_(keep_delim_pattern), with_offsets_(with_offsets) {}
+
+Status RegexTokenizerOperation::ValidateParams() { return Status::OK(); }
+
+std::shared_ptr<TensorOp> RegexTokenizerOperation::Build() {
+  std::shared_ptr<RegexTokenizerOp> tensor_op =
+      std::make_shared<RegexTokenizerOp>(delim_pattern_, keep_delim_pattern_, with_offsets_);
+  return tensor_op;
+}
 #endif
 
 // SentencePieceTokenizerOperation
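For orientation, a minimal usage sketch of the factory functions and IR nodes added above. The nullptr-on-invalid-params behavior comes from the diff; the exact public header path is an assumption, not part of this commit:

#include <memory>
#include "minddata/dataset/include/text.h"  // assumed public API header

using namespace mindspore::dataset;

int main() {
  // RegexReplace/RegexTokenizer build an IR node, run ValidateParams(),
  // and hand back nullptr if validation fails.
  std::shared_ptr<TensorOperation> replace_op = text::RegexReplace("\\s+", "_", true);
  std::shared_ptr<TensorOperation> tokenize_op = text::RegexTokenizer("\\s+", "\\s+", false);
  // Build() (invoked later by the pipeline) creates the runtime kernels,
  // RegexReplaceOp and RegexTokenizerOp.
  return (replace_op != nullptr && tokenize_op != nullptr) ? 0 : 1;
}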
@@ -309,10 +309,10 @@ void DatasetNode::PrintNode(std::ostream &out, int *level) const {
   }
 }
 
-// Add a node as a child, node's parent needs to be nullptr
+// Add a node as a child, node's parent needs to be empty
 // this function will allow child to be a nullptr, in which case it will simply skip
 void DatasetNode::AddChild(std::shared_ptr<DatasetNode> child) {
-  if (child != nullptr && !child->parent_.size()) {
+  if (child != nullptr && child->parent_.empty()) {
     children_.push_back(child);
     child->parent_.push_back(this);
   } else if (child != nullptr) {
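The tightened condition swaps !child->parent_.size() for child->parent_.empty(), which reads directly as "has no parent yet" and matches the updated comment. A standard-library-only illustration of the equivalence:

#include <cassert>
#include <vector>

int main() {
  std::vector<int> parent_;  // stands in for DatasetNode::parent_
  assert(parent_.empty() == (parent_.size() == 0));  // the two checks agree
  parent_.push_back(1);                              // adopt a parent
  assert(!parent_.empty());
  return 0;
}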
@@ -43,6 +43,8 @@ constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer";
 constexpr char kLookupOperation[] = "Lookup";
 constexpr char kNgramOperation[] = "Ngram";
 constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8";
+constexpr char kRegexReplaceOperation[] = "RegexReplace";
+constexpr char kRegexTokenizerOperation[] = "RegexTokenizer";
 constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer";
 constexpr char kSlidingWindowOperation[] = "SlidingWindow";
 constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
@@ -58,6 +60,8 @@ class LookupOperation;
 class NgramOperation;
 #ifndef _WIN32
 class NormalizeUTF8Operation;
+class RegexReplaceOperation;
+class RegexTokenizerOperation;
 #endif
 class SentencePieceTokenizerOperation;
 class SlidingWindowOperation;
@@ -125,6 +129,24 @@ std::shared_ptr<NgramOperation> Ngram(const std::vector<int32_t> &ngrams,
 /// - NormalizeForm.NFKD, normalize with Normalization Form KD.
 /// \return Shared pointer to the current TensorOperation.
 std::shared_ptr<NormalizeUTF8Operation> NormalizeUTF8(NormalizeForm normalize_form = NormalizeForm::kNfkc);
 
+/// \brief Replace a UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
+/// \param[in] pattern The regex pattern to match.
+/// \param[in] replace The string to replace each matched element with.
+/// \param[in] replace_all Whether to replace all matched elements. If false, only the first matched element
+///     is replaced; if true, all matched elements are replaced (default=true).
+/// \return Shared pointer to the current TensorOperation.
+std::shared_ptr<RegexReplaceOperation> RegexReplace(std::string pattern, std::string replace, bool replace_all = true);
+
+/// \brief Tokenize a scalar tensor of UTF-8 string by regex pattern.
+/// \param[in] delim_pattern The regex pattern of delimiters.
+/// \param[in] keep_delim_pattern A string matched by 'delim_pattern' is kept as a token if it can also be
+///     matched by 'keep_delim_pattern'. The default is an empty string (""), meaning delimiters are not
+///     kept as output tokens (default="").
+/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
+/// \return Shared pointer to the current TensorOperation.
+std::shared_ptr<RegexTokenizerOperation> RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "",
+                                                        bool with_offsets = false);
 #endif
 
 /// \brief Tokenize a scalar token or 1-D tokens to tokens by sentencepiece.
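A short sketch of how the defaulted parameters declared above behave (header path assumed, as before):

#include <memory>
#include "minddata/dataset/include/text.h"  // assumed public API header

using namespace mindspore::dataset;

int main() {
  // replace_all defaults to true: every match of "\\s+" is replaced.
  std::shared_ptr<TensorOperation> rr = text::RegexReplace("\\s+", "_");
  // keep_delim_pattern defaults to "" (delimiters are dropped) and
  // with_offsets defaults to false (no offset columns are produced).
  std::shared_ptr<TensorOperation> rt = text::RegexTokenizer("\\s+");
  return (rr != nullptr && rt != nullptr) ? 0 : 1;
}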
@@ -261,6 +283,42 @@ class NormalizeUTF8Operation : public TensorOperation {
  private:
   NormalizeForm normalize_form_;
 };
 
+class RegexReplaceOperation : public TensorOperation {
+ public:
+  RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all);
+
+  ~RegexReplaceOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;
+
+  Status ValidateParams() override;
+
+  std::string Name() const override { return kRegexReplaceOperation; }
+
+ private:
+  std::string pattern_;
+  std::string replace_;
+  bool replace_all_;
+};
+
+class RegexTokenizerOperation : public TensorOperation {
+ public:
+  explicit RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, bool with_offsets);
+
+  ~RegexTokenizerOperation() = default;
+
+  std::shared_ptr<TensorOp> Build() override;
+
+  Status ValidateParams() override;
+
+  std::string Name() const override { return kRegexTokenizerOperation; }
+
+ private:
+  std::string delim_pattern_;
+  std::string keep_delim_pattern_;
+  bool with_offsets_;
+};
 #endif
 
 class SentencePieceTokenizerOperation : public TensorOperation {
@@ -479,7 +479,7 @@ if platform.system().lower() != 'windows':
 
     class CaseFold(cde.CaseFoldOp):
         """
-        Apply case fold operation on utf-8 string tensor.
+        Apply case fold operation on UTF-8 string tensor.
 
         Note:
             CaseFold is not supported on Windows platform yet.
@@ -502,7 +502,7 @@ if platform.system().lower() != 'windows':
 
     class NormalizeUTF8(cde.NormalizeUTF8Op):
         """
-        Apply normalize operation on utf-8 string tensor.
+        Apply normalize operation on UTF-8 string tensor.
 
         Note:
             NormalizeUTF8 is not supported on Windows platform yet.
@@ -536,7 +536,7 @@ if platform.system().lower() != 'windows':
 
     class RegexReplace(cde.RegexReplaceOp):
         """
-        Replace utf-8 string tensor with 'replace' according to regular expression 'pattern'.
+        Replace UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
 
         See http://userguide.icu-project.org/strings/regexp for supported regex patterns.
 
@@ -691,6 +691,215 @@ TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success3) {
   iter->Stop();
 }
 
+TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess) {
+  // Testing the RegexReplace interface with replace_all set to true.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create a regex_replace operation on ds
+  std::shared_ptr<TensorOperation> regex_replace = text::RegexReplace("\\s+", "_", true);
+  EXPECT_NE(regex_replace, nullptr);
+
+  // Create a Map operation on ds
+  ds = ds->Map({regex_replace}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset.
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate over the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::string> expected = {"Hello_World", "Let's_Go", "1:hello", "2:world",
+                                       "31:beijing", "Welcome_to_China!", "_我_不想_长大_", "Welcome_to_Shenzhen!"};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateScalar(expected[i], &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 8);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess1) {
+  // Testing the RegexReplace interface with replace_all set to false.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess1.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create a regex_replace operation on ds
+  std::shared_ptr<TensorOperation> regex_replace = text::RegexReplace("\\s+", "_", false);
+  EXPECT_NE(regex_replace, nullptr);
+
+  // Create a Map operation on ds
+  ds = ds->Map({regex_replace}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset.
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate over the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::string> expected = {"Hello_World", "Let's_Go", "1:hello", "2:world",
+                                       "31:beijing", "Welcome_to China!", "_我 不想 长大 ", "Welcome_to Shenzhen!"};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    Tensor::CreateScalar(expected[i], &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 8);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
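The two tests above differ only in replace_all: with true, every whitespace run is replaced; with false, only the first one per row is. The op is backed by the ICU regex engine (per the ICU documentation link in the Python docstring above), but the same contrast can be shown with std::regex alone:

#include <cassert>
#include <regex>
#include <string>

int main() {
  std::regex ws("\\s+");
  std::string input = "Welcome to China!";
  // replace_all == true: every whitespace run becomes "_".
  assert(std::regex_replace(input, ws, "_") == "Welcome_to_China!");
  // replace_all == false: only the first match is replaced.
  assert(std::regex_replace(input, ws, "_",
                            std::regex_constants::format_first_only) == "Welcome_to China!");
  return 0;
}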
+
+TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess) {
+  // Testing the RegexTokenizer interface with with_offsets set to false.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create a regex_tokenizer operation on ds
+  std::shared_ptr<TensorOperation> regex_tokenizer = text::RegexTokenizer("\\s+", "\\s+", false);
+  EXPECT_NE(regex_tokenizer, nullptr);
+
+  // Create a Map operation on ds
+  ds = ds->Map({regex_tokenizer}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset.
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate over the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::vector<std::string>> expected = {{"Hello", " ", "World"},
+                                                    {"Let's", " ", "Go"},
+                                                    {"1:hello"},
+                                                    {"2:world"},
+                                                    {"31:beijing"},
+                                                    {"Welcome", " ", "to", " ", "China!"},
+                                                    {" ", "我", " ", "不想", " ", "长大", " "},
+                                                    {"Welcome", " ", "to", " ", "Shenzhen!"}};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    std::shared_ptr<Tensor> expected_tensor;
+    int x = expected[i].size();
+    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
+    EXPECT_EQ(*ind, *expected_tensor);
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 8);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
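Because keep_delim_pattern here equals delim_pattern, every whitespace delimiter is itself emitted as a token, which is why " " appears between words in the expected rows. A rough emulation of that behavior with std::regex (illustration only; the op itself does not use std::regex):

#include <cassert>
#include <regex>
#include <string>
#include <vector>

int main() {
  std::string s = "Hello World";
  std::regex delim("\\s+");
  // Submatch -1 yields the text between matches and 0 yields the match
  // itself, mimicking RegexTokenizer("\\s+", "\\s+").
  std::sregex_token_iterator it(s.begin(), s.end(), delim, {-1, 0}), end;
  std::vector<std::string> tokens(it, end);
  assert((tokens == std::vector<std::string>{"Hello", " ", "World"}));
  return 0;
}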
+
+TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess1) {
+  // Testing the RegexTokenizer interface with with_offsets set to true.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess1.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create a regex_tokenizer operation on ds
+  std::shared_ptr<TensorOperation> regex_tokenizer = text::RegexTokenizer("\\s+", "\\s+", true);
+  EXPECT_NE(regex_tokenizer, nullptr);
+
+  // Create a Map operation on ds
+  ds = ds->Map({regex_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
+               {"token", "offsets_start", "offsets_limit"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset.
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate over the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  std::vector<std::vector<std::string>> expected = {{"Hello", " ", "World"},
+                                                    {"Let's", " ", "Go"},
+                                                    {"1:hello"},
+                                                    {"2:world"},
+                                                    {"31:beijing"},
+                                                    {"Welcome", " ", "to", " ", "China!"},
+                                                    {" ", "我", " ", "不想", " ", "长大", " "},
+                                                    {"Welcome", " ", "to", " ", "Shenzhen!"}};
+
+  std::vector<std::vector<uint32_t>> expected_offsets_start = {
+    {0, 5, 6}, {0, 5, 6}, {0}, {0}, {0}, {0, 7, 8, 10, 11}, {0, 2, 5, 6, 12, 14, 20}, {0, 7, 8, 10, 11}};
+  std::vector<std::vector<uint32_t>> expected_offsets_limit = {
+    {5, 6, 11}, {5, 6, 8}, {7}, {7}, {10}, {7, 8, 10, 11, 17}, {2, 5, 6, 12, 14, 20, 21}, {7, 8, 10, 11, 20}};
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    auto ind = row["offsets_start"];
+    auto ind1 = row["offsets_limit"];
+    auto token = row["token"];
+    std::shared_ptr<Tensor> expected_tensor;
+    std::shared_ptr<Tensor> expected_tensor_offsets_start;
+    std::shared_ptr<Tensor> expected_tensor_offsets_limit;
+    int x = expected[i].size();
+    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
+    Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &expected_tensor_offsets_start);
+    Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &expected_tensor_offsets_limit);
+    EXPECT_EQ(*ind, *expected_tensor_offsets_start);
+    EXPECT_EQ(*ind1, *expected_tensor_offsets_limit);
+    EXPECT_EQ(*token, *expected_tensor);
+
+    iter->GetNextRow(&row);
+    i++;
+  }
+
+  EXPECT_EQ(i, 8);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
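The offsets asserted above are byte offsets into the UTF-8 input, not character indices: the token "我" starting at byte 2 ends at byte 5 because a CJK character occupies three bytes in UTF-8. A standard-library check of the byte widths involved:

#include <cassert>
#include <string>

int main() {
  assert(std::string("我").size() == 3);     // one CJK character: 3 bytes
  assert(std::string("不想").size() == 6);   // two CJK characters: 6 bytes
  assert(std::string("Hello").size() == 5);  // ASCII: 1 byte per character
  return 0;
}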
 
 TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess) {
   // Testing the parameter of the UnicodeCharTokenizer interface when with_offsets is the default.
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess.";