!2390 change output of WordpieceTokenizer and BertTokenizer to 1-D string tensors

Merge pull request !2390 from qianlong21st/wordpiece_tokenizer_1D
Authored by mindspore-ci-bot on 2020-06-20 14:21:56 +08:00, committed by Gitee
commit 5276db8fd0
4 changed files with 25 additions and 68 deletions
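A minimal before/after sketch at the Python API level (not taken from this commit): it assumes the mindspore.dataset API at this revision (TextFileDataset, text.Vocab.from_list, text.BertTokenizer), a placeholder input file "corpus.txt", and an illustrative vocabulary.

import mindspore.dataset as ds
import mindspore.dataset.text as nlp

# "corpus.txt" is a placeholder path; the vocab entries are illustrative only.
vocab = nlp.Vocab.from_list(["i", "am", "mak", "##ing", "small", "mistake",
                             "##s", "during", "work", "hour", "[UNK]"])
tokenizer = nlp.BertTokenizer(vocab, lower_case=True)

data = ds.TextFileDataset("corpus.txt", shuffle=False)
data = data.map(input_columns=["text"], operations=tokenizer)

for row in data.create_dict_iterator():
    # Previously: a padded 2-D string array, e.g. [['mak', '##ing'], ['hour', '<pad>']].
    # After this change: a flat 1-D string array, e.g. ['i', 'am', 'mak', '##ing', ...].
    print(row["text"])

The same flattening applies to WordpieceTokenizer when it is mapped over scalar or 1-D token columns.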

View File

@@ -32,23 +32,6 @@ WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab,
max_bytes_per_token_(max_bytes_per_token),
unknown_token_(unknown_token) {}
void WordpieceTokenizerOp::PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
std::vector<std::string> *out_padded_tokens, int *out_cols) const {
int rows = tokens.size();
int max_cols = 0;
for (int i = 0; i < rows; i++) {
max_cols = std::max(max_cols, static_cast<int>(tokens[i].size()));
}
out_padded_tokens->resize(rows * max_cols, padded_str);
for (int i = 0; i < rows; i++) {
int index = i * max_cols;
for (int j = 0; j < tokens[i].size(); j++) {
(*out_padded_tokens)[index++] = tokens[i][j];
}
}
*out_cols = max_cols;
}
Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start,
bool *out_found, int *out_end) const {
CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && start < input_token.size(), "Out of range");
@@ -117,20 +100,16 @@ Status WordpieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::
if (input->Rank() > 1 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor");
}
std::vector<std::vector<std::string>> out_tokens(input->Size());
int i = 0;
std::vector<std::string> out_tokens;
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &out_tokens[i++]));
std::vector<std::string> temp_tokens;
RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &temp_tokens));
out_tokens.insert(out_tokens.end(), temp_tokens.begin(), temp_tokens.end());
}
std::vector<std::string> padded_tokens;
int cols = 0;
PadTokens(out_tokens, "<pad>", &padded_tokens, &cols);
std::vector<dsize_t> shapes;
if (input->Rank() == 1) {
shapes.push_back(out_tokens.size());
if (out_tokens.empty()) {
out_tokens.emplace_back("");
}
shapes.push_back(cols);
*output = std::make_shared<Tensor>(std::move(padded_tokens), TensorShape(shapes));
*output = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
return Status::OK();
}

View File

@@ -48,8 +48,6 @@ class WordpieceTokenizerOp : public TensorOp {
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
protected:
void PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
std::vector<std::string> *out_padded_tokens, int *out_cols) const;
Status AddSubword(const std::string &input_token, const int start, const int end,
std::vector<std::string> *out_token) const;
Status FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const;

View File

@@ -188,7 +188,7 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
class WordpieceTokenizer(cde.WordpieceTokenizerOp):
"""
Tokenize scalar token or 1-D tokens to subword tokens.
Tokenize scalar token or 1-D tokens to 1-D subword tokens.
Args:
vocab(Vocab): a Vocab object.

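A short sketch of the 1-D-token input case mentioned in the docstring above, under the same assumptions as the earlier sketch (placeholder file path, illustrative vocabulary); WhitespaceTokenizer supplies the 1-D token input that WordpieceTokenizer now turns into a single 1-D subword output.

import mindspore.dataset as ds
import mindspore.dataset.text as nlp

vocab = nlp.Vocab.from_list(["mak", "##ing", "hour", "##s", "[UNK]"])  # illustrative vocab
data = ds.TextFileDataset("corpus.txt", shuffle=False)                 # placeholder path
# WhitespaceTokenizer emits a 1-D tensor of tokens per line; WordpieceTokenizer
# now produces one flat 1-D tensor of subword tokens per sample, with no padding.
data = data.map(input_columns=["text"], operations=nlp.WhitespaceTokenizer())
data = data.map(input_columns=["text"], operations=nlp.WordpieceTokenizer(vocab))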
View File

@@ -35,38 +35,24 @@ test_paras = [
dict(
first=1,
last=4,
expect_str=[[[''], [''], [''], [''], ['']],
[[''], [''], [''], [''], ['']],
[[''], [''], [''], [''], ['']],
[[''], [''], [''], [''], ['']]],
expect_str=[['', '', '', '', ''],
['', '', '', '', ''],
['', '', '', '', ''],
['', '', '', '', '']],
vocab_list=vocab_bert
),
# test english text
dict(
first=5,
last=5,
expect_str=[[['i', pad],
["am", pad],
['mak', '##ing'],
['small', pad],
['mistake', '##s'],
['during', pad],
['work', '##ing'],
['hour', '##s']]],
expect_str=[['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
lower_case=True,
vocab_list=vocab_bert
),
dict(
first=5,
last=5,
expect_str=[[['I', pad],
["am", pad],
['mak', '##ing'],
['small', pad],
['mistake', '##s'],
['during', pad],
['work', '##ing'],
['hour', '##s']]],
expect_str=[['I', "am", 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
lower_case=False,
vocab_list=vocab_bert
),
@@ -75,8 +61,8 @@ test_paras = [
first=6,
last=7,
expect_str=[
[['😀'], [''], [''], ['😃'], [''], [''], ['😄'], [''], [''], ['😁'], [''], ['']],
[[''], [''], ['']]],
['😀', '', '', '😃', '', '', '😄', '', '', '😁', '', ''],
['', '', '']],
normalization_form=nlp.utils.NormalizeForm.NFKC,
vocab_list=vocab_bert
),
@@ -85,11 +71,11 @@ test_paras = [
first=8,
last=12,
expect_str=[
[['[UNK]'], ['[CLS]']],
[['[UNK]'], ['[SEP]']],
[['[UNK]'], ['[UNK]']],
[['[UNK]'], ['[PAD]']],
[['[UNK]'], ['[MASK]']],
['[UNK]', '[CLS]'],
['[UNK]', '[SEP]'],
['[UNK]', '[UNK]'],
['[UNK]', '[PAD]'],
['[UNK]', '[MASK]'],
],
lower_case=False,
vocab_list=vocab_bert,
@@ -99,7 +85,7 @@ test_paras = [
dict(
first=13,
last=13,
expect_str=[[['12'], ['+'], ['/'], ['-'], ['28'], ['='], ['40'], ['/'], ['-'], ['16']]],
expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']],
preserve_unused_token=True,
vocab_list=vocab_bert
),
@@ -107,9 +93,7 @@ test_paras = [
dict(
first=8,
last=8,
expect_str=[
[['[UNK]'], [' '], ['[CLS]']],
],
expect_str=[['[UNK]', ' ', '[CLS]']],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=True,
@@ -118,9 +102,7 @@ test_paras = [
dict(
first=8,
last=8,
expect_str=[
[['unused'], [' '], ['[CLS]']],
],
expect_str=[['unused', ' ', '[CLS]']],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=True,
@@ -130,9 +112,7 @@ test_paras = [
dict(
first=8,
last=8,
expect_str=[
[['unused'], [' '], ['['], ['CLS'], [']']],
],
expect_str=[['unused', ' ', '[', 'CLS', ']']],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=False,