forked from mindspore-Ecosystem/mindspore
!2433 add comment for dataset.text
Merge pull request !2433 from qianlong21st/add_text_comment
This commit is contained in:
commit 3654f0f9ac
@@ -11,9 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
mindspore.dataset.text
This module supports text processing for NLP. It includes two parts:
transforms and utils. transforms is a high performance
NLP text processing module which is developed with icu4c and cppjieba.
utils provides some general methods for NLP text processing.
"""
import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \
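As a quick orientation for the split described in this docstring, here is a minimal sketch of how the two parts fit together: transforms operators go into dataset.map, while utils helpers such as to_str are called directly in Python. The file path and the 'text' column name are illustrative assumptions mirroring the example added to transforms.py below.

>>> import mindspore.dataset as ds
>>> import mindspore.dataset.text as text
>>> dataset = ds.TextFileDataset("path/to/text_file_path", shuffle=False)
>>> dataset = dataset.map(operations=[text.UnicodeCharTokenizer()])  # transforms part
>>> for row in dataset.create_dict_iterator():
>>>     print(text.to_str(row['text']))                              # utils part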
@@ -12,9 +12,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
c transforms for all text related operators
"""
The module text.transforms is inherited from _c_dataengine
which is implemented based on icu4c and cppjieba in C++.
It is a high performance module to process NLP text.
Users can use Vocab to build their own dictionary,
use appropriate tokenizers to split sentences into different tokens,
and use Lookup to find the index of tokens in Vocab.

.. Note::
    Constructor's arguments for every class in this module must be saved into the
    class attributes (self.xxx) to support save() and load().

Examples:
    >>> import mindspore.dataset as ds
    >>> import mindspore.dataset.text as text
    >>> dataset_file = "path/to/text_file_path"
    >>> # sentences as line data saved in a file
    >>> dataset = ds.TextFileDataset(dataset_file, shuffle=False)
    >>> # tokenize sentences into unicode characters
    >>> tokenizer = text.UnicodeCharTokenizer()
    >>> # load vocabulary from a list
    >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
    >>> # lookup is an operation for mapping tokens to ids
    >>> lookup = text.Lookup(vocab)
    >>> dataset = dataset.map(operations=[tokenizer, lookup])
    >>> for i in dataset.create_dict_iterator():
    >>>     print(i)
    >>> # if a text line in dataset_file is:
    >>> # 深圳欢迎您
    >>> # then the output will be:
    >>> # {'text': array([0, 1, 2, 3, 4], dtype=int32)}
"""
import os
import re
import platform
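The .. Note:: block about storing constructor arguments on self refers to a pattern like the following. This is a hypothetical sketch only; the class and argument names are illustrative and not taken from the diff.

# Hypothetical sketch of the convention described in the .. Note:: block:
# every constructor argument is mirrored into a self.xxx attribute so the
# operator can be serialized by save() and rebuilt by load().
class MyTextOp:
    def __init__(self, vocab, unknown_token='[UNK]'):
        self.vocab = vocab                  # saved into a class attribute
        self.unknown_token = unknown_token  # saved into a class attribute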
@@ -203,8 +231,8 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):

    Args:
        vocab (Vocab): a Vocab object.
        suffix_indicator (str, optional): Used to show that the subword is the last part of a word(default '##').
        max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default 100).
        suffix_indicator (str, optional): Used to show that the subword is the last part of a word(default='##').
        max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default=100).
        unknown_token (str, optional): When the token can not be found: if 'unknown_token' is an empty string,
            return the token directly, else return 'unknown_token'(default='[UNK]').
    """
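A hedged usage sketch for the arguments above; the vocabulary content is illustrative. WordPiece splits a word into subwords found in the vocab, marking non-initial pieces with the suffix indicator, and falls back to unknown_token when no split is possible.

>>> import mindspore.dataset.text as text
>>> vocab = text.Vocab.from_list(['my', 'favor', '##ite', '[UNK]'])
>>> tokenizer = text.WordpieceTokenizer(vocab, suffix_indicator='##',
...                                     max_bytes_per_token=100, unknown_token='[UNK]')
>>> # applied via dataset.map, an input token "favorite" would come out
>>> # roughly as ["favor", "##ite"], and a token not covered by the vocab
>>> # would come out as "[UNK]"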
@@ -299,7 +327,7 @@ if platform.system().lower() != 'windows':
                The original string will be split by matched elements.
            keep_delim_pattern(str, optional): The string matched by 'delim_pattern' can be kept as a token
                if it can be matched by 'keep_delim_pattern'. And the default value is empty str(''),
                in this situation, delimiters will not kept as a output token.
                in this situation, delimiters will not be kept as an output token(default='').
        """

        def __init__(self, delim_pattern, keep_delim_pattern=''):
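The class name sits outside this hunk, but the parameters match the regex-based tokenizer; a hedged sketch of the delimiter-keeping behaviour (available on non-Windows platforms only, per the surrounding platform check):

>>> import mindspore.dataset.text as text
>>> # split on whitespace and drop the delimiters
>>> tokenizer = text.RegexTokenizer(delim_pattern='\\s+')
>>> # split on whitespace but keep each matched delimiter as a token,
>>> # because it is also matched by keep_delim_pattern
>>> tokenizer_keep = text.RegexTokenizer(delim_pattern='\\s+', keep_delim_pattern='\\s+')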
@@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Some basic function for text
The module text.utils provides some general methods for NLP text processing.
For example, you can use Vocab to build a dictionary,
and use to_bytes and to_str to encode and decode strings into a specified format.
"""
from enum import IntEnum
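A hedged sketch of the encode/decode helpers mentioned above; the input array is illustrative. to_str decodes a numpy array of byte strings into unicode strings, and to_bytes does the reverse.

>>> import numpy as np
>>> import mindspore.dataset.text as text
>>> arr = np.array([b'hello', b'world'])
>>> strings = text.to_str(arr, encoding='utf8')     # array(['hello', 'world'], dtype='<U5')
>>> back = text.to_bytes(strings, encoding='utf8')  # back to a bytes array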
@@ -52,12 +54,12 @@ class Vocab(cde.Vocab):
                min_frequency/max_frequency can be None, which corresponds to 0/total_words separately
                (default=None, all words are included).
            top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
                taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None,
                taken. top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None,
                all words are included).
            special_tokens(list, optional): a list of strings, each one is a special token. for example
                special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
            special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
                is specified and special_first is set to None, special_tokens will be prepended. (default=None).
                is specified and special_first is set to None, special_tokens will be prepended (default=None).

        Returns:
            Vocab, Vocab object built from dataset.
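A hedged sketch of from_dataset with the arguments documented above; the file path, column name and thresholds are illustrative assumptions.

>>> import mindspore.dataset as ds
>>> import mindspore.dataset.text as text
>>> dataset = ds.TextFileDataset("path/to/text_file_path", shuffle=False)
>>> dataset = dataset.map(operations=text.UnicodeCharTokenizer())
>>> # keep the 100 most frequent tokens seen at least twice,
>>> # and prepend the special tokens to the front of the vocab
>>> vocab = text.Vocab.from_dataset(dataset, columns=["text"], freq_range=(2, None),
...                                 top_k=100, special_tokens=["<pad>", "<unk>"],
...                                 special_first=True)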
@@ -81,7 +83,7 @@ class Vocab(cde.Vocab):
            special_tokens(list, optional): a list of strings, each one is a special token. for example
                special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
            special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
                is specified and special_first is set to None, special_tokens will be prepended. (default=None).
                is specified and special_first is set to None, special_tokens will be prepended (default=None).
        """

        return super().from_list(word_list, special_tokens, special_first)
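A hedged sketch of special_tokens and special_first with from_list; the ids shown in the comments follow from whether the special tokens occupy the lowest or the highest indices.

>>> import mindspore.dataset.text as text
>>> # special tokens prepended: <pad>=0, <unk>=1, 深=2, 圳=3, ...
>>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'],
...                              special_tokens=["<pad>", "<unk>"], special_first=True)
>>> # special tokens appended instead: 深=0, ..., 您=4, <pad>=5, <unk>=6
>>> vocab_tail = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'],
...                                   special_tokens=["<pad>", "<unk>"], special_first=False)
>>> lookup = text.Lookup(vocab)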
@@ -101,7 +103,7 @@ class Vocab(cde.Vocab):
                special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
            special_first (bool, optional): whether special_tokens will be prepended/appended to vocab.
                If special_tokens is specified and special_first is set to None,
                special_tokens will be prepended. (default=None).
                special_tokens will be prepended (default=None).
        """

        return super().from_file(file_path, delimiter, vocab_size, special_tokens, special_first)
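A hedged sketch of from_file, matching the arguments forwarded to the C++ layer above; the vocab file path and its comma-delimited layout are assumptions.

>>> import mindspore.dataset.text as text
>>> # each line of vocab.txt holds one word, optionally followed by extra
>>> # comma-separated fields which the delimiter argument strips off
>>> vocab = text.Vocab.from_file("path/to/vocab.txt", delimiter=",",
...                              vocab_size=10000,
...                              special_tokens=["<unk>"], special_first=True)
>>> lookup = text.Lookup(vocab)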
@@ -157,12 +159,14 @@ def to_bytes(array, encoding='utf8'):


class JiebaMode(IntEnum):
    """An enumeration for JiebaTokenizer, effective enumeration types are MIX, MP, HMM."""
    MIX = 0
    MP = 1
    HMM = 2


class NormalizeForm(IntEnum):
    """An enumeration for NormalizeUTF8, effective enumeration types are NONE, NFC, NFKC, NFD, NFKD."""
    NONE = 0
    NFC = 1
    NFKC = 2
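The diff is cut off here, so the remaining NFD/NFKD members are not shown. A hedged sketch of how the two enums are consumed; the dictionary file paths passed to JiebaTokenizer are placeholders.

>>> import mindspore.dataset.text as text
>>> from mindspore.dataset.text import JiebaMode, NormalizeForm
>>> # choose the jieba segmentation algorithm through JiebaMode
>>> jieba_op = text.JiebaTokenizer("path/to/hmm_model.utf8", "path/to/jieba.dict.utf8",
...                                mode=JiebaMode.MP)
>>> # choose the unicode normalization form through NormalizeForm (non-Windows only)
>>> normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFKC)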