forked from mindspore-Ecosystem/mindspore
!2390 change output of WordpieceTokenizer and BertTokenizer to 1-D string tensors
Merge pull request !2390 from qianlong21st/wordpiece_tokenizer_1D
This commit is contained in: commit 5276db8fd0
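In short: WordpieceTokenizer (and BertTokenizer, which builds on it) used to pad each word's subwords to a common width and return a 2-D string tensor; with this change the subwords of all input tokens are concatenated and returned as a single 1-D string tensor. A minimal before/after sketch in plain Python, with the words and subword splits made up only for illustration (they mirror the test data further down):

# Hypothetical input: a 1-D tensor of whitespace-split words.
words = ["i", "am", "making", "mistakes"]

# Old behaviour: one padded row per word -> 2-D output (rows x max_subwords).
old_output = [["i",       "<pad>"],
              ["am",      "<pad>"],
              ["mak",     "##ing"],
              ["mistake", "##s"]]

# New behaviour: subwords concatenated in order -> 1-D output.
new_output = ["i", "am", "mak", "##ing", "mistake", "##s"]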
@@ -32,23 +32,6 @@ WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab,
       max_bytes_per_token_(max_bytes_per_token),
       unknown_token_(unknown_token) {}
 
-void WordpieceTokenizerOp::PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
-                                     std::vector<std::string> *out_padded_tokens, int *out_cols) const {
-  int rows = tokens.size();
-  int max_cols = 0;
-  for (int i = 0; i < rows; i++) {
-    max_cols = std::max(max_cols, static_cast<int>(tokens[i].size()));
-  }
-  out_padded_tokens->resize(rows * max_cols, padded_str);
-  for (int i = 0; i < rows; i++) {
-    int index = i * max_cols;
-    for (int j = 0; j < tokens[i].size(); j++) {
-      (*out_padded_tokens)[index++] = tokens[i][j];
-    }
-  }
-  *out_cols = max_cols;
-}
-
 Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start,
                                         bool *out_found, int *out_end) const {
   CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && start < input_token.size(), "Out of range");
@@ -117,20 +100,16 @@ Status WordpieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::
   if (input->Rank() > 1 || input->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor");
   }
-  std::vector<std::vector<std::string>> out_tokens(input->Size());
-  int i = 0;
+  std::vector<std::string> out_tokens;
   for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
-    RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &out_tokens[i++]));
+    std::vector<std::string> temp_tokens;
+    RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &temp_tokens));
+    out_tokens.insert(out_tokens.end(), temp_tokens.begin(), temp_tokens.end());
   }
-  std::vector<std::string> padded_tokens;
-  int cols = 0;
-  PadTokens(out_tokens, "<pad>", &padded_tokens, &cols);
-  std::vector<dsize_t> shapes;
-  if (input->Rank() == 1) {
-    shapes.push_back(out_tokens.size());
+  if (out_tokens.empty()) {
+    out_tokens.emplace_back("");
   }
-  shapes.push_back(cols);
-  *output = std::make_shared<Tensor>(std::move(padded_tokens), TensorShape(shapes));
+  *output = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
   return Status::OK();
 }
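For reference, here is a minimal Python sketch of the flattening the rewritten Compute() performs; get_subwords stands in for the C++ GetTokens() helper and is a hypothetical name, not part of the commit:

def tokenize_to_1d(words, get_subwords):
    """Mirror the new Compute(): append every word's subwords to one flat
    list instead of padding per-word rows to a common length."""
    out_tokens = []
    for word in words:
        out_tokens.extend(get_subwords(word))  # per-word wordpiece lookup
    if not out_tokens:
        out_tokens.append("")  # like the C++, keep the output tensor non-empty
    return out_tokens  # 1-D, length == total number of subwords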
@@ -48,8 +48,6 @@ class WordpieceTokenizerOp : public TensorOp {
   Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
 
  protected:
-  void PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
-                 std::vector<std::string> *out_padded_tokens, int *out_cols) const;
   Status AddSubword(const std::string &input_token, const int start, const int end,
                     std::vector<std::string> *out_token) const;
   Status FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const;
@@ -188,7 +188,7 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
 
 class WordpieceTokenizer(cde.WordpieceTokenizerOp):
     """
-    Tokenize scalar token or 1-D tokens to subword tokens.
+    Tokenize scalar token or 1-D tokens to 1-D subword tokens.
 
     Args:
         vocab(Vocab): a Vocab object.
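In the Python API the change is visible directly from map(): each row now comes back as one flat array of subwords. A rough usage sketch, assuming the mindspore.dataset / mindspore.dataset.text API of this period; the vocabulary and sample words are invented for illustration:

import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text

vocab = text.Vocab.from_list(["i", "am", "mak", "##ing", "mistake", "##s", "[UNK]"])
tokenizer = text.WordpieceTokenizer(vocab=vocab, unknown_token="[UNK]")

def gen():
    # One sample: a 1-D array of already whitespace-split words.
    yield (np.array(["i", "am", "making", "mistakes"], dtype=str),)

dataset = ds.GeneratorDataset(gen, column_names=["text"])
dataset = dataset.map(operations=tokenizer, input_columns=["text"])
for row in dataset.create_dict_iterator():
    print(row["text"])  # 1-D array, e.g. ['i' 'am' 'mak' '##ing' 'mistake' '##s']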
@@ -35,38 +35,24 @@ test_paras = [
     dict(
         first=1,
         last=4,
-        expect_str=[[['床'], ['前'], ['明'], ['月'], ['光']],
-                    [['疑'], ['是'], ['地'], ['上'], ['霜']],
-                    [['举'], ['头'], ['望'], ['明'], ['月']],
-                    [['低'], ['头'], ['思'], ['故'], ['乡']]],
+        expect_str=[['床', '前', '明', '月', '光'],
+                    ['疑', '是', '地', '上', '霜'],
+                    ['举', '头', '望', '明', '月'],
+                    ['低', '头', '思', '故', '乡']],
         vocab_list=vocab_bert
     ),
     # test english text
     dict(
         first=5,
         last=5,
-        expect_str=[[['i', pad],
-                     ["am", pad],
-                     ['mak', '##ing'],
-                     ['small', pad],
-                     ['mistake', '##s'],
-                     ['during', pad],
-                     ['work', '##ing'],
-                     ['hour', '##s']]],
+        expect_str=[['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
         lower_case=True,
         vocab_list=vocab_bert
     ),
     dict(
         first=5,
         last=5,
-        expect_str=[[['I', pad],
-                     ["am", pad],
-                     ['mak', '##ing'],
-                     ['small', pad],
-                     ['mistake', '##s'],
-                     ['during', pad],
-                     ['work', '##ing'],
-                     ['hour', '##s']]],
+        expect_str=[['I', "am", 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
         lower_case=False,
         vocab_list=vocab_bert
     ),
@@ -75,8 +61,8 @@ test_paras = [
         first=6,
         last=7,
         expect_str=[
-            [['😀'], ['嘿'], ['嘿'], ['😃'], ['哈'], ['哈'], ['😄'], ['大'], ['笑'], ['😁'], ['嘻'], ['嘻']],
-            [['繁'], ['體'], ['字']]],
+            ['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'],
+            ['繁', '體', '字']],
         normalization_form=nlp.utils.NormalizeForm.NFKC,
         vocab_list=vocab_bert
     ),
@@ -85,11 +71,11 @@ test_paras = [
         first=8,
         last=12,
         expect_str=[
-            [['[UNK]'], ['[CLS]']],
-            [['[UNK]'], ['[SEP]']],
-            [['[UNK]'], ['[UNK]']],
-            [['[UNK]'], ['[PAD]']],
-            [['[UNK]'], ['[MASK]']],
+            ['[UNK]', '[CLS]'],
+            ['[UNK]', '[SEP]'],
+            ['[UNK]', '[UNK]'],
+            ['[UNK]', '[PAD]'],
+            ['[UNK]', '[MASK]'],
         ],
         lower_case=False,
         vocab_list=vocab_bert,
@@ -99,7 +85,7 @@ test_paras = [
     dict(
         first=13,
         last=13,
-        expect_str=[[['12'], ['+'], ['/'], ['-'], ['28'], ['='], ['40'], ['/'], ['-'], ['16']]],
+        expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']],
         preserve_unused_token=True,
         vocab_list=vocab_bert
     ),
@@ -107,9 +93,7 @@ test_paras = [
     dict(
         first=8,
         last=8,
-        expect_str=[
-            [['[UNK]'], [' '], ['[CLS]']],
-        ],
+        expect_str=[['[UNK]', ' ', '[CLS]']],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=True,
@@ -118,9 +102,7 @@ test_paras = [
     dict(
         first=8,
         last=8,
-        expect_str=[
-            [['unused'], [' '], ['[CLS]']],
-        ],
+        expect_str=[['unused', ' ', '[CLS]']],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=True,
@@ -130,9 +112,7 @@ test_paras = [
     dict(
         first=8,
         last=8,
-        expect_str=[
-            [['unused'], [' '], ['['], ['CLS'], [']']],
-        ],
+        expect_str=[['unused', ' ', '[', 'CLS', ']']],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=False,
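With the flattened expectations above, a check only needs to compare each output row against one flat list. A hedged sketch of that kind of check; check_rows and the "text" column name are stand-ins, not the actual helpers used by this test file:

import numpy as np

def check_rows(dataset, expect_str):
    # Hypothetical checker: every row should now be a flat 1-D string array
    # matching one entry of expect_str, with no inner per-word padding.
    for row, expected in zip(dataset.create_dict_iterator(), expect_str):
        np.testing.assert_array_equal(row["text"], np.array(expected, dtype=str))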