From a027834087017be5c8244c863ebbe4cb130daf1e Mon Sep 17 00:00:00 2001
From: xiefangqi
Date: Thu, 5 Nov 2020 16:29:23 +0800
Subject: [PATCH] fix bert comments error

---
 mindspore/dataset/text/transforms.py | 32 ++++++++++++++--------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py
index 7ff283933d7..7c1990b5c43 100644
--- a/mindspore/dataset/text/transforms.py
+++ b/mindspore/dataset/text/transforms.py
@@ -348,12 +348,12 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
         >>> import mindspore.dataset.text as text
         >>>
         >>> # If with_offsets=False, default output one column {["text", dtype=str]}
-        >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=['UNK'],
+        >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
         >>>                                        max_bytes_per_token=100, with_offsets=False)
         >>> data1 = data1.map(operations=tokenizer_op)
         >>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
         >>> # ["offsets_limit", dtype=uint32]}
-        >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=['UNK'],
+        >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
         >>>                                        max_bytes_per_token=100, with_offsets=True)
         >>> data2 = data2.map(operations=tokenizer_op,
         >>>                   input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
@@ -607,19 +607,19 @@ if platform.system().lower() != 'windows':
             >>>
             >>> # If with_offsets=False, default output one column {["text", dtype=str]}
             >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
-            >>>                                   keep_whitespace=False,
-            >>>                                   normalization_form=NormalizeForm.NONE,
-            >>>                                   preserve_unused_token=True,
-            >>>                                   with_offsets=False)
+            >>>                                    keep_whitespace=False,
+            >>>                                    normalization_form=NormalizeForm.NONE,
+            >>>                                    preserve_unused_token=True,
+            >>>                                    with_offsets=False)
             >>> data1 = data1.map(operations=tokenizer_op)
             >>> # If with_offsets=False, then output three columns {["token", dtype=str],
             >>> #                                                   ["offsets_start", dtype=uint32],
             >>> #                                                   ["offsets_limit", dtype=uint32]}
             >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
-            >>>                                   keep_whitespace=False,
-            >>>                                   normalization_form=NormalizeForm.NONE,
-            >>>                                   preserve_unused_token=True,
-            >>>                                   with_offsets=True)
+            >>>                                    keep_whitespace=False,
+            >>>                                    normalization_form=NormalizeForm.NONE,
+            >>>                                    preserve_unused_token=True,
+            >>>                                    with_offsets=True)
             >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
             >>>                   output_columns=["token", "offsets_start", "offsets_limit"],
             >>>                   column_order=["token", "offsets_start", "offsets_limit"])
@@ -665,17 +665,17 @@ if platform.system().lower() != 'windows':
             >>>
             >>> # If with_offsets=False, default output one column {["text", dtype=str]}
             >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
-            >>>                                   unknown_token=100, lower_case=False, keep_whitespace=False,
-            >>>                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
-            >>>                                   with_offsets=False)
+            >>>                                   unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
+            >>>                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
+            >>>                                   with_offsets=False)
             >>> data1 = data1.map(operations=tokenizer_op)
             >>> # If with_offsets=False, then output three columns {["token", dtype=str],
             >>> #                                                   ["offsets_start", dtype=uint32],
             >>> #                                                   ["offsets_limit", dtype=uint32]}
             >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
-            >>>                                   unknown_token=100, lower_case=False, keep_whitespace=False,
-            >>>                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
-            >>>                                   with_offsets=True)
+            >>>                                   unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
+            >>>                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
+            >>>                                   with_offsets=True)
             >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
             >>>                   output_columns=["token", "offsets_start", "offsets_limit"],
             >>>                   column_order=["token", "offsets_start", "offsets_limit"])
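
For reference, a minimal sketch of the corrected BertTokenizer call in context, so the fixed unknown_token='[UNK]' value can be exercised end to end. The vocabulary contents, the input file 'corpus.txt', and the Vocab.from_list()/TextFileDataset() setup are illustrative assumptions and are not taken from this diff; note that BertTokenizer is only available on non-Windows platforms, matching the "if platform.system().lower() != 'windows':" guard visible in the hunk headers above.

# Minimal usage sketch (assumed setup, not part of the patch).
# The vocabulary list and the file 'corpus.txt' are placeholders.
import mindspore.dataset as ds
import mindspore.dataset.text as text
from mindspore.dataset.text import NormalizeForm

# Wordpiece vocabulary; '[UNK]' must be present so unknown words can map to it.
vocab = text.Vocab.from_list(["my", "favorite", "book", "is", "love", "##ing",
                              "during", "the", "winter", "[UNK]", "[CLS]", "[SEP]"])

# One sentence per line in a plain-text file (placeholder path).
data = ds.TextFileDataset("corpus.txt", shuffle=False)

# unknown_token takes the string '[UNK]' (the value this patch fixes),
# not a list and not a vocabulary id.
tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
                                  unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
                                  normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
                                  with_offsets=True)

# With with_offsets=True the op emits three columns: token, offsets_start, offsets_limit.
data = data.map(operations=tokenizer_op, input_columns=["text"],
                output_columns=["token", "offsets_start", "offsets_limit"],
                column_order=["token", "offsets_start", "offsets_limit"])

for row in data.create_dict_iterator(output_numpy=True):
    print(row["token"], row["offsets_start"], row["offsets_limit"])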