forked from mindspore-Ecosystem/mindspore
!8263 [MD] Update the example of BertTokenizer
Merge pull request !8263 from xiefangqi/xfq_fix_bert_comment
This commit is contained in: commit d79bcc923e
@@ -348,12 +348,12 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
     >>> import mindspore.dataset.text as text
     >>>
     >>> # If with_offsets=False, default output one column {["text", dtype=str]}
-    >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=['UNK'],
+    >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
     >>>                                        max_bytes_per_token=100, with_offsets=False)
     >>> data1 = data1.map(operations=tokenizer_op)
     >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
     >>> #                                                  ["offsets_limit", dtype=uint32]}
-    >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=['UNK'],
+    >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
     >>>                                        max_bytes_per_token=100, with_offsets=True)
     >>> data2 = data2.map(operations=tokenizer_op,
     >>>                   input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
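Note on this hunk: unknown_token=['UNK'] (a list) is replaced by the string '[UNK]', since unknown_token names the vocabulary entry substituted for out-of-vocabulary word pieces. Below is a minimal runnable sketch of the corrected usage; the vocabulary contents and the NumpySlicesDataset input are illustrative choices, not part of the commit.

import mindspore.dataset as ds
import mindspore.dataset.text as text

# Illustrative word-piece vocabulary (not from the commit); '[UNK]' must be
# present because unknown_token names a vocabulary entry, not a token id.
vocab = text.Vocab.from_list(["my", "favor", "##ite", "book", "[UNK]"])

# Illustrative input pipeline: one word per row in a "text" column.
data1 = ds.NumpySlicesDataset(["my", "favorite", "book"], column_names=["text"], shuffle=False)

# with_offsets=False: the "text" column is replaced in place by the word pieces.
tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
                                       max_bytes_per_token=100, with_offsets=False)
data1 = data1.map(operations=tokenizer_op)

# with_offsets=True: the op emits three columns, which must be named on the map call.
data2 = ds.NumpySlicesDataset(["my", "favorite", "book"], column_names=["text"], shuffle=False)
tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
                                       max_bytes_per_token=100, with_offsets=True)
data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
                  output_columns=["token", "offsets_start", "offsets_limit"],
                  column_order=["token", "offsets_start", "offsets_limit"])

for row in data2.create_dict_iterator():
    print(row["token"], row["offsets_start"], row["offsets_limit"])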
@@ -607,19 +607,19 @@ if platform.system().lower() != 'windows':
     >>>
     >>> # If with_offsets=False, default output one column {["text", dtype=str]}
     >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
-    >>>                                    keep_whitespace=False,
-    >>>                                    normalization_form=NormalizeForm.NONE,
-    >>>                                    preserve_unused_token=True,
-    >>>                                    with_offsets=False)
+    >>>                                    keep_whitespace=False,
+    >>>                                    normalization_form=NormalizeForm.NONE,
+    >>>                                    preserve_unused_token=True,
+    >>>                                    with_offsets=False)
     >>> data1 = data1.map(operations=tokenizer_op)
     >>> # If with_offsets=True, then output three columns {["token", dtype=str],
     >>> #                                                  ["offsets_start", dtype=uint32],
     >>> #                                                  ["offsets_limit", dtype=uint32]}
     >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
-    >>>                                    keep_whitespace=False,
-    >>>                                    normalization_form=NormalizeForm.NONE,
-    >>>                                    preserve_unused_token=True,
-    >>>                                    with_offsets=True)
+    >>>                                    keep_whitespace=False,
+    >>>                                    normalization_form=NormalizeForm.NONE,
+    >>>                                    preserve_unused_token=True,
+    >>>                                    with_offsets=True)
     >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
     >>>                   output_columns=["token", "offsets_start", "offsets_limit"],
     >>>                   column_order=["token", "offsets_start", "offsets_limit"])
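For reference, a self-contained BasicTokenizer sketch matching the example above. It assumes a non-Windows platform (the example sits under the `if platform.system().lower() != 'windows':` guard) and that NormalizeForm is importable from mindspore.dataset.text; the input sentence is illustrative.

import mindspore.dataset as ds
import mindspore.dataset.text as text
from mindspore.dataset.text import NormalizeForm  # assumed import path for the enum

# Illustrative input (not from the commit); BasicTokenizer is only built
# on non-Windows platforms.
data1 = ds.NumpySlicesDataset(["Welcome to Beijing!"], column_names=["text"], shuffle=False)

# with_offsets=False: the output stays a single column of tokens.
tokenizer_op = text.BasicTokenizer(lower_case=False,
                                   keep_whitespace=False,
                                   normalization_form=NormalizeForm.NONE,
                                   preserve_unused_token=True,
                                   with_offsets=False)
data1 = data1.map(operations=tokenizer_op)

# with_offsets=True: declare the three output columns on the map call.
data2 = ds.NumpySlicesDataset(["Welcome to Beijing!"], column_names=["text"], shuffle=False)
tokenizer_op = text.BasicTokenizer(lower_case=False,
                                   keep_whitespace=False,
                                   normalization_form=NormalizeForm.NONE,
                                   preserve_unused_token=True,
                                   with_offsets=True)
data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
                  output_columns=["token", "offsets_start", "offsets_limit"],
                  column_order=["token", "offsets_start", "offsets_limit"])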
@@ -665,17 +665,17 @@ if platform.system().lower() != 'windows':
     >>>
     >>> # If with_offsets=False, default output one column {["text", dtype=str]}
     >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
-    >>>                                   unknown_token=100, lower_case=False, keep_whitespace=False,
-    >>>                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
-    >>>                                   with_offsets=False)
+    >>>                                   unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
+    >>>                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
+    >>>                                   with_offsets=False)
     >>> data1 = data1.map(operations=tokenizer_op)
     >>> # If with_offsets=True, then output three columns {["token", dtype=str],
     >>> #                                                  ["offsets_start", dtype=uint32],
     >>> #                                                  ["offsets_limit", dtype=uint32]}
     >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
-    >>>                                   unknown_token=100, lower_case=False, keep_whitespace=False,
-    >>>                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
-    >>>                                   with_offsets=True)
+    >>>                                   unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
+    >>>                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
+    >>>                                   with_offsets=True)
     >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
     >>>                   output_columns=["token", "offsets_start", "offsets_limit"],
     >>>                   column_order=["token", "offsets_start", "offsets_limit"])
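This hunk is the fix the commit title refers to: BertTokenizer's unknown_token expects a vocabulary string such as '[UNK]', not an integer id, so unknown_token=100 becomes unknown_token='[UNK]'. A minimal sketch of the corrected example follows; the vocabulary, the input sentence, and the NumpySlicesDataset source are illustrative, and a non-Windows platform plus NormalizeForm being importable from mindspore.dataset.text are assumed.

import mindspore.dataset as ds
import mindspore.dataset.text as text
from mindspore.dataset.text import NormalizeForm  # assumed import path for the enum

# Illustrative vocabulary and sentence (not from the commit); '[UNK]' must be
# in the vocabulary because unknown_token is a string entry, not an integer id.
vocab = text.Vocab.from_list(["i", "am", "mak", "##ing", "small", "mistake", "##s", "[UNK]"])
data1 = ds.NumpySlicesDataset(["i am making small mistakes"], column_names=["text"], shuffle=False)

# with_offsets=False: a single column of word pieces.
tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
                                  unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
                                  normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
                                  with_offsets=False)
data1 = data1.map(operations=tokenizer_op)

# with_offsets=True: tokens plus byte offsets, renamed on the map call.
data2 = ds.NumpySlicesDataset(["i am making small mistakes"], column_names=["text"], shuffle=False)
tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
                                  unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
                                  normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
                                  with_offsets=True)
data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
                  output_columns=["token", "offsets_start", "offsets_limit"],
                  column_order=["token", "offsets_start", "offsets_limit"])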