!7050 dataset API docstring: Update/add text examples
Merge pull request !7050 from cathwong/ckw_api_text_examples
Commit c1b9efe8e6
@@ -159,7 +159,7 @@ def get_monitor_sampling_interval():
 Get the default interval of performance monitor sampling.

 Returns:
-    Interval: interval (in milliseconds) for performance monitor sampling.
+    Int, interval (in milliseconds) for performance monitor sampling.
 """
 return _config.get_monitor_sampling_interval()
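As context for the docstring above, a minimal sketch of reading and adjusting this setting through the public config module; the `ds.config` access path and the value 100 are illustrative assumptions, not part of the diff:

    import mindspore.dataset as ds

    # Read the current monitor sampling interval in milliseconds (an int).
    print(ds.config.get_monitor_sampling_interval())

    # Optionally change it; the getter should then return the new value.
    ds.config.set_monitor_sampling_interval(100)
    print(ds.config.get_monitor_sampling_interval())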
@@ -28,16 +28,16 @@ Examples:
 >>> import mindspore.dataset.text as text
 >>>
 >>> dataset_file = "path/to/text_file_path"
->>> # sentences as line data saved in a file
->>> dataset = ds.TextFileDataset(dataset_file, shuffle=False)
->>> # tokenize sentence to unicode characters
+>>> # Create a dataset for text sentences saved as line data in a file
+>>> data1 = ds.TextFileDataset(dataset_file, shuffle=False)
+>>> # Tokenize sentences to unicode characters
 >>> tokenizer = text.UnicodeCharTokenizer()
->>> # load vocabulary form list
+>>> # Load vocabulary from list
 >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
->>> # lookup is an operation for mapping tokens to ids
+>>> # Use Lookup operator to map tokens to ids
 >>> lookup = text.Lookup(vocab)
->>> dataset = dataset.map(operations=[tokenizer, lookup])
->>> for i in dataset.create_dict_iterator():
+>>> data1 = data1.map(operations=[tokenizer, lookup])
+>>> for i in data1.create_dict_iterator():
 >>> print(i)
 >>> # if text line in dataset_file is:
 >>> # 深圳欢迎您
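For reference, a self-contained sketch of what the updated module-level example computes, assuming a hypothetical one-line text file containing the sentence from the example; token ids follow the order of the list passed to `Vocab.from_list`:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # Hypothetical path; the file is assumed to contain the single line 深圳欢迎您.
    dataset_file = "path/to/text_file_path"

    data1 = ds.TextFileDataset(dataset_file, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer()           # split each line into unicode characters
    vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
    lookup = text.Lookup(vocab)                       # map each character to its vocabulary id
    data1 = data1.map(operations=[tokenizer, lookup])

    for row in data1.create_dict_iterator():
        # With ids assigned in list order, the line 深圳欢迎您 maps to [0, 1, 2, 3, 4].
        print(row)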
@@ -69,6 +69,15 @@ class Lookup(cde.LookupOp):
 unknown_token (str, optional): Word used for lookup if the word being looked up is out-of-vocabulary (OOV).
 If unknown_token is OOV, a runtime error will be thrown (default=None).
 data_type (mindspore.dtype, optional): mindspore.dtype that lookup maps string to (default=mstype.int32)
+
+Examples:
+>>> import mindspore.dataset.text as text
+>>>
+>>> # Load vocabulary from list
+>>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
+>>> # Use Lookup operator to map tokens to ids
+>>> lookup = text.Lookup(vocab, "<unk>")
+>>> data1 = data1.map(operations=[lookup])
 """

 @check_lookup
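A short sketch illustrating the two parameters documented above, assuming they can be passed by the keyword names used in the docstring; the "<unk>" token and the int64 override are illustrative choices:

    import mindspore.common.dtype as mstype
    import mindspore.dataset.text as text

    # Include "<unk>" in the vocabulary so it is a valid fallback for OOV words.
    vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您', '<unk>'])

    # Tokens not in the vocabulary are mapped to the id of "<unk>";
    # data_type switches the output ids from the default int32 to int64.
    lookup = text.Lookup(vocab, unknown_token="<unk>", data_type=mstype.int64)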
@@ -86,12 +95,14 @@ class SlidingWindow(cde.SlidingWindowOp):
 axis (int, optional): The axis along which the sliding window is computed (default=0).

 Examples:
+>>> import mindspore.dataset.text as text
+>>>
 >>> # Data before
 >>> # | col1 |
 >>> # +-------------+
 >>> # | [1,2,3,4,5] |
 >>> # +-------------+
->>> data = data.map(operations=SlidingWindow(3, 0))
+>>> data1 = data1.map(operations=text.SlidingWindow(3, 0))
 >>> # Data after
 >>> # | col1 |
 >>> # +-------------+
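To make the before/after tables above concrete, a minimal sketch of the same op on a tiny in-memory column (the column name and data mirror the docstring; `NumpySlicesDataset` is used only to build the input):

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    data1 = ds.NumpySlicesDataset([[1, 2, 3, 4, 5]], column_names=["col1"], shuffle=False)
    # Width-3 windows along axis 0: [1,2,3,4,5] -> [[1,2,3], [2,3,4], [3,4,5]]
    data1 = data1.map(operations=text.SlidingWindow(3, 0), input_columns=["col1"])

    for row in data1.create_dict_iterator():
        print(row["col1"])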
@@ -125,6 +136,11 @@ class Ngram(cde.NgramOp):
 separator (str, optional): symbol used to join strings together. For example. if 2-gram is
 ["mindspore", "amazing"] with separator="-", the result would be ["mindspore-amazing"]
 (default=None, which means whitespace is used).
+
+Examples:
+>>> import mindspore.dataset.text as text
+>>>
+>>> data1 = data1.map(operations=text.Ngram(3, separator=" "))
 """

 @check_ngram
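A brief sketch of the separator behavior described above, reusing the docstring's own worked example; the commented map call assumes an existing tokenized column named "text":

    import mindspore.dataset.text as text

    # 2-grams joined with "-": ["mindspore", "amazing"] -> ["mindspore-amazing"]
    ngram_op = text.Ngram(2, separator="-")

    # Applied to an existing tokenized string column (data1 is assumed to exist):
    # data1 = data1.map(operations=ngram_op, input_columns=["text"])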
@@ -157,15 +173,17 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
 with_offsets (bool, optional): If or not output offsets of tokens (default=False).

 Examples:
+>>> import mindspore.dataset.text as text
+>>>
 >>> # If with_offsets=False, default output one column {["text", dtype=str]}
->>> tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=False)
->>> data = data.map(operations=tokenizer_op)
+>>> tokenizer_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=False)
+>>> data1 = data1.map(operations=tokenizer_op)
 >>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
 >>> # ["offsets_limit", dtype=uint32]}
->>> tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
->>> data = data.map(operations=tokenizer_op, input_columns=["text"],
->>> output_columns=["token", "offsets_start", "offsets_limit"],
->>> column_order=["token", "offsets_start", "offsets_limit"])
+>>> tokenizer_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
+>>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
+>>> output_columns=["token", "offsets_start", "offsets_limit"],
+>>> column_order=["token", "offsets_start", "offsets_limit"])
 """

 @check_jieba_init
@@ -191,6 +209,16 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
 The added word will not be written into the built-in dictionary on disk.
 freq (int, optional): The frequency of the word to be added. The higher the frequency,
 the better chance the word will be tokenized (default=None, use default frequency).
+
+Examples:
+>>> import mindspore.dataset.text as text
+>>>
+>>> jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP)
+>>> with open(VOCAB_FILE, 'r') as f:
+>>>     for line in f:
+>>>         word = line.split(',')[0]
+>>>         jieba_op.add_word(word)
+>>> data1 = data1.map(operations=jieba_op, input_columns=["text"])
 """

 if freq is None:
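Besides the file-driven loop above, a single word can be added directly with an explicit frequency; a hedged sketch, where HMM_FILE and MP_FILE stand for the jieba dictionary paths as in the docstring and the word and frequency are illustrative:

    import mindspore.dataset.text as text

    jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP)
    # A higher freq makes it more likely that the word is kept as a single token.
    jieba_op.add_word("男默女泪", freq=10)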
@@ -213,6 +241,14 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
 word1 freq1
 word2
 word3 freq3
+
+Examples:
+>>> import mindspore.dataset.text as text
+>>>
+>>> user_dict = {"男默女泪": 10}
+>>> jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
+>>> jieba_op.add_dict(user_dict)
+>>> data1 = data1.map(operations=jieba_op, input_columns=["text"])
 """

 if isinstance(user_dict, str):
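Per the `isinstance(user_dict, str)` branch above, `add_dict` also accepts the path of a user dictionary file in the `word [freq]` format shown; a hedged sketch with a hypothetical file path:

    import mindspore.dataset.text as text

    jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP)
    # Each line of the file is "word freq" or just "word", as described above.
    jieba_op.add_dict("/path/to/user_dict.txt")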
@@ -277,15 +313,17 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
 with_offsets (bool, optional): If or not output offsets of tokens (default=False).

 Examples:
+>>> import mindspore.dataset.text as text
+>>>
 >>> # If with_offsets=False, default output one column {["text", dtype=str]}
 >>> tokenizer_op = text.UnicodeCharTokenizer()
->>> dataset = dataset.map(operations=tokenizer_op)
+>>> data1 = data1.map(operations=tokenizer_op)
 >>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
 >>> # ["offsets_limit", dtype=uint32]}
 >>> tokenizer_op = text.UnicodeCharTokenizer(True)
->>> data = data.map(operations=tokenizer_op, input_columns=["text"],
->>> output_columns=["token", "offsets_start", "offsets_limit"],
->>> column_order=["token", "offsets_start", "offsets_limit"])
+>>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
+>>> output_columns=["token", "offsets_start", "offsets_limit"],
+>>> column_order=["token", "offsets_start", "offsets_limit"])
 """

 @check_with_offsets
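The with_offsets pattern recurs for most tokenizers below; a hedged, self-contained sketch of the three-column form, assuming `NumpySlicesDataset` accepts a Python string column (the sample sentence is illustrative):

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    data2 = ds.NumpySlicesDataset(["深圳欢迎您"], column_names=["text"], shuffle=False)
    tokenizer_op = text.UnicodeCharTokenizer(True)  # with_offsets=True

    # One input column becomes three output columns: the tokens plus the
    # offsets where each token starts and ends in the original string.
    data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
                      output_columns=["token", "offsets_start", "offsets_limit"],
                      column_order=["token", "offsets_start", "offsets_limit"])

    for row in data2.create_dict_iterator():
        print(row["token"], row["offsets_start"], row["offsets_limit"])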
@@ -307,17 +345,19 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
 with_offsets (bool, optional): If or not output offsets of tokens (default=False).

 Examples:
+>>> import mindspore.dataset.text as text
+>>>
 >>> # If with_offsets=False, default output one column {["text", dtype=str]}
 >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=['UNK'],
 >>> max_bytes_per_token=100, with_offsets=False)
->>> dataset = dataset.map(operations=tokenizer_op)
+>>> data1 = data1.map(operations=tokenizer_op)
 >>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
 >>> # ["offsets_limit", dtype=uint32]}
 >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=['UNK'],
 >>> max_bytes_per_token=100, with_offsets=True)
->>> data = data.map(operations=tokenizer_op,
->>> input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
->>> column_order=["token", "offsets_start", "offsets_limit"])
+>>> data2 = data2.map(operations=tokenizer_op,
+>>> input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
+>>> column_order=["token", "offsets_start", "offsets_limit"])
 """

 @check_wordpiece_tokenizer
@@ -351,6 +391,13 @@ class SentencePieceTokenizer(cde.SentencePieceTokenizerOp):
 mode (Union[str, SentencePieceVocab]): If the input parameter is a file, then it is of type string.
 If the input parameter is a SentencePieceVocab object, then it is of type SentencePieceVocab.
 out_type (Union[str, int]): The type of output.
+
+Examples:
+>>> import mindspore.dataset.text as text
+>>>
+>>> vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+>>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
+>>> data1 = data1.map(operations=tokenizer)
 """

 def __init__(self, mode, out_type):
@@ -374,16 +421,18 @@ if platform.system().lower() != 'windows':
 with_offsets (bool, optional): If or not output offsets of tokens (default=False).

 Examples:
+>>> import mindspore.dataset.text as text
+>>>
 >>> # If with_offsets=False, default output one column {["text", dtype=str]}
 >>> tokenizer_op = text.WhitespaceTokenizer()
->>> dataset = dataset.map(operations=tokenizer_op)
+>>> data1 = data1.map(operations=tokenizer_op)
 >>> # If with_offsets=False, then output three columns {["token", dtype=str],
 >>> # ["offsets_start", dtype=uint32],
 >>> # ["offsets_limit", dtype=uint32]}
 >>> tokenizer_op = text.WhitespaceTokenizer(True)
->>> data = data.map(operations=tokenizer_op, input_columns=["text"],
->>> output_columns=["token", "offsets_start", "offsets_limit"],
->>> column_order=["token", "offsets_start", "offsets_limit"])
+>>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
+>>> output_columns=["token", "offsets_start", "offsets_limit"],
+>>> column_order=["token", "offsets_start", "offsets_limit"])
 """

 @check_with_offsets
@@ -401,16 +450,18 @@ if platform.system().lower() != 'windows':
 with_offsets (bool, optional): If or not output offsets of tokens (default=False).

 Examples:
+>>> import mindspore.dataset.text as text
+>>>
 >>> # If with_offsets=False, default output one column {["text", dtype=str]}
 >>> tokenizer_op = text.UnicodeScriptTokenizerOp(keep_whitespace=True, with_offsets=False)
->>> dataset = dataset.map(operations=tokenizer_op)
+>>> data1 = data1.map(operations=tokenizer_op)
 >>> # If with_offsets=False, then output three columns {["token", dtype=str],
 >>> # ["offsets_start", dtype=uint32],
 >>> # ["offsets_limit", dtype=uint32]}
 >>> tokenizer_op = text.UnicodeScriptTokenizerOp(keep_whitespace=True, with_offsets=True)
->>> data = data.map(operations=tokenizer_op, input_columns=["text"],
->>> output_columns=["token", "offsets_start", "offsets_limit"],
->>> column_order=["token", "offsets_start", "offsets_limit"])
+>>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
+>>> output_columns=["token", "offsets_start", "offsets_limit"],
+>>> column_order=["token", "offsets_start", "offsets_limit"])
 """

 @check_unicode_script_tokenizer
@@ -423,6 +474,12 @@ if platform.system().lower() != 'windows':
 class CaseFold(cde.CaseFoldOp):
 """
 Apply case fold operation on utf-8 string tensor.
+
+Examples:
+>>> import mindspore.dataset.text as text
+>>>
+>>> case_op = text.CaseFold()
+>>> data1 = data1.map(operations=case_op)
 """
@@ -434,7 +491,6 @@ if platform.system().lower() != 'windows':
 NormalizeForm.NFKD: cde.NormalizeForm.DE_NORMALIZE_NFKD
 }

-
 class NormalizeUTF8(cde.NormalizeUTF8Op):
 """
 Apply normalize operation on utf-8 string tensor.
@@ -450,6 +506,12 @@ if platform.system().lower() != 'windows':
 - NormalizeForm.NFKC, normalize with Normalization Form KC.
 - NormalizeForm.NFD, normalize with Normalization Form D.
 - NormalizeForm.NFKD, normalize with Normalization Form KD.
+
+Examples:
+>>> import mindspore.dataset.text as text
+>>>
+>>> normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFC)
+>>> data1 = data1.map(operations=normalize_op)
 """

 def __init__(self, normalize_form=NormalizeForm.NFKC):
@@ -471,6 +533,14 @@ if platform.system().lower() != 'windows':
 replace (str): the string to replace matched element.
 replace_all (bool, optional): If False, only replace first matched element;
 if True, replace all matched elements (default=True).
+
+Examples:
+>>> import mindspore.dataset.text as text
+>>>
+>>> pattern = 'Canada'
+>>> replace = 'China'
+>>> replace_op = text.RegexReplace(pattern, replace)
+>>> data1 = data1.map(operations=replace_op)
 """

 def __init__(self, pattern, replace, replace_all=True):
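A brief sketch of the replace_all flag documented above, based on the constructor signature shown in this hunk; the strings and the commented map call are illustrative:

    import mindspore.dataset.text as text

    # replace_all=True (the default) rewrites every match in the string;
    # replace_all=False rewrites only the first match.
    replace_first_op = text.RegexReplace("Canada", "China", replace_all=False)

    # Applied to an existing string column (data1 is assumed to exist):
    # data1 = data1.map(operations=replace_first_op)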
@@ -495,16 +565,18 @@ if platform.system().lower() != 'windows':
 with_offsets (bool, optional): If or not output offsets of tokens (default=False).

 Examples:
+>>> import mindspore.dataset.text as text
+>>>
 >>> # If with_offsets=False, default output one column {["text", dtype=str]}
 >>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=False)
->>> dataset = dataset.map(operations=tokenizer_op)
+>>> data1 = data1.map(operations=tokenizer_op)
 >>> # If with_offsets=False, then output three columns {["token", dtype=str],
 >>> # ["offsets_start", dtype=uint32],
 >>> # ["offsets_limit", dtype=uint32]}
 >>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
->>> data = data.map(operations=tokenizer_op, input_columns=["text"],
->>> output_columns=["token", "offsets_start", "offsets_limit"],
->>> column_order=["token", "offsets_start", "offsets_limit"])
+>>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
+>>> output_columns=["token", "offsets_start", "offsets_limit"],
+>>> column_order=["token", "offsets_start", "offsets_limit"])
 """

 @check_regex_tokenizer
@@ -531,13 +603,15 @@ if platform.system().lower() != 'windows':
 with_offsets (bool, optional): If or not output offsets of tokens (default=False).

 Examples:
+>>> import mindspore.dataset.text as text
+>>>
 >>> # If with_offsets=False, default output one column {["text", dtype=str]}
 >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
 >>> keep_whitespace=False,
 >>> normalization_form=NormalizeForm.NONE,
 >>> preserve_unused_token=True,
 >>> with_offsets=False)
->>> dataset = dataset.map(operations=tokenizer_op)
+>>> data1 = data1.map(operations=tokenizer_op)
 >>> # If with_offsets=False, then output three columns {["token", dtype=str],
 >>> # ["offsets_start", dtype=uint32],
 >>> # ["offsets_limit", dtype=uint32]}
@@ -546,9 +620,9 @@ if platform.system().lower() != 'windows':
 >>> normalization_form=NormalizeForm.NONE,
 >>> preserve_unused_token=True,
 >>> with_offsets=True)
->>> data = data.map(operations=tokenizer_op, input_columns=["text"],
->>> output_columns=["token", "offsets_start", "offsets_limit"],
->>> column_order=["token", "offsets_start", "offsets_limit"])
+>>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
+>>> output_columns=["token", "offsets_start", "offsets_limit"],
+>>> column_order=["token", "offsets_start", "offsets_limit"])
 """

 @check_basic_tokenizer
@@ -587,12 +661,14 @@ if platform.system().lower() != 'windows':
 with_offsets (bool, optional): If or not output offsets of tokens (default=False).

 Examples:
+>>> import mindspore.dataset.text as text
+>>>
 >>> # If with_offsets=False, default output one column {["text", dtype=str]}
 >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
 >>> unknown_token=100, lower_case=False, keep_whitespace=False,
 >>> normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
 >>> with_offsets=False)
->>> dataset = dataset.map(operations=tokenizer_op)
+>>> data1 = data1.map(operations=tokenizer_op)
 >>> # If with_offsets=False, then output three columns {["token", dtype=str],
 >>> # ["offsets_start", dtype=uint32],
 >>> # ["offsets_limit", dtype=uint32]}
@@ -600,9 +676,9 @@ if platform.system().lower() != 'windows':
 >>> unknown_token=100, lower_case=False, keep_whitespace=False,
 >>> normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
 >>> with_offsets=True)
->>> data = data.map(operations=tokenizer_op, input_columns=["text"],
->>> output_columns=["token", "offsets_start", "offsets_limit"],
->>> column_order=["token", "offsets_start", "offsets_limit"])
+>>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
+>>> output_columns=["token", "offsets_start", "offsets_limit"],
+>>> column_order=["token", "offsets_start", "offsets_limit"])
 """

 @check_bert_tokenizer
@@ -636,12 +712,14 @@ class TruncateSequencePair(cde.TruncateSequencePairOp):
 max_length (int): Maximum length required.

 Examples:
+>>> import mindspore.dataset.text as text
+>>>
 >>> # Data before
 >>> # | col1 | col2 |
 >>> # +---------+---------|
 >>> # | [1,2,3] | [4,5] |
 >>> # +---------+---------+
->>> data = data.map(operations=TruncateSequencePair(4))
+>>> data1 = data1.map(operations=text.TruncateSequencePair(4))
 >>> # Data after
 >>> # | col1 | col2 |
 >>> # +---------+---------+
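The "Data after" table is cut off by the hunk boundary above; as a hedged reminder of the behavior, truncating the pair [1,2,3] / [4,5] to a combined length of 4 drops elements from the longer column first, leaving [1,2] and [4,5]. A minimal sketch with illustrative in-memory data:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    data1 = ds.NumpySlicesDataset(([[1, 2, 3]], [[4, 5]]),
                                  column_names=["col1", "col2"], shuffle=False)
    # Combined length capped at 4: col1 [1,2,3] -> [1,2], col2 [4,5] stays [4,5].
    data1 = data1.map(operations=text.TruncateSequencePair(4),
                      input_columns=["col1", "col2"])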
@@ -670,6 +748,13 @@ class ToNumber(cde.ToNumberOp):

 Raises:
 RuntimeError: If strings are invalid to cast, or are out of range after being casted.
+
+Examples:
+>>> import mindspore.dataset.text as text
+>>> import mindspore.common.dtype as mstype
+>>>
+>>> to_number_op = text.ToNumber(mstype.int8)
+>>> data1 = data1.map(operations=to_number_op)
 """

 @check_to_number
@@ -687,9 +772,11 @@ class PythonTokenizer:
 tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens.

 Examples:
+>>> import mindspore.dataset.text as text
+>>>
 >>> def my_tokenizer(line):
 >>>     return line.split()
->>> data = data.map(operations=PythonTokenizer(my_tokenizer))
+>>> data1 = data1.map(operations=text.PythonTokenizer(my_tokenizer))
 """

 @check_python_tokenizer
@@ -83,6 +83,7 @@ class TypeCast(cde.TypeCastOp):

 Examples:
+>>> import mindspore.dataset.transforms.c_transforms as c_transforms
 >>> import mindspore.common.dtype as mstype
 >>>
 >>> type_cast_op = c_transforms.TypeCast(mstype.int32)
 """
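A small hedged sketch applying the cast inside a pipeline; the dataset, values, and column name are illustrative:

    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as c_transforms
    import mindspore.common.dtype as mstype

    data1 = ds.NumpySlicesDataset([[1.5, 2.5, 3.5]], column_names=["col1"], shuffle=False)
    # Cast the float column to int32.
    data1 = data1.map(operations=c_transforms.TypeCast(mstype.int32), input_columns=["col1"])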
@@ -77,7 +77,7 @@ class Compose:
 >>>
 >>> dataset_dir = "path/to/imagefolder_directory"
 >>> # create a dataset that reads all files in dataset_dir with 8 threads
->>> dataset = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8)
+>>> data1 = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8)
 >>> # create a list of transformations to be applied to the image data
 >>> transform = py_transforms.Compose([py_vision.Decode(),
 >>> py_vision.RandomHorizontalFlip(0.5),
@@ -85,7 +85,7 @@ class Compose:
 >>> py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)),
 >>> py_vision.RandomErasing()])
 >>> # apply the transform to the dataset through dataset.map()
->>> dataset = dataset.map(operations=transform, input_columns="image")
+>>> data1 = data1.map(operations=transform, input_columns="image")
 >>>
 >>> # Compose is also be invoked implicitly, by just passing in a list of ops
 >>> # the above example then becomes:
@@ -96,7 +96,7 @@ class Compose:
 >>> py_vision.RandomErasing()]
 >>>
 >>> # apply the transform to the dataset through dataset.map()
->>> dataset = dataset.map(operations=transform_list, input_columns="image")
+>>> data2 = data2.map(operations=transform_list, input_columns="image")
 >>>
 >>> # Certain C++ and Python ops can be combined, but not all of them
 >>> # An example of combined operations
@@ -104,20 +104,20 @@ class Compose:
 >>> import mindspore.dataset.transforms.c_transforms as c_transforms
 >>> import mindspore.dataset.vision.c_transforms as c_vision
 >>>
->>> data = ds.NumpySlicesDataset(arr, column_names=["cols"], shuffle=False)
+>>> data3 = ds.NumpySlicesDataset(arr, column_names=["cols"], shuffle=False)
 >>> transformed_list = [py_transforms.OneHotOp(2), c_transforms.Mask(c_transforms.Relational.EQ, 1)]
->>> data = data.map(operations=transformed_list, input_columns=["cols"])
+>>> data3 = data3.map(operations=transformed_list, input_columns=["cols"])
 >>>
 >>> # Here is an example of mixing vision ops
 >>> data_dir = "/path/to/imagefolder_directory"
->>> data1 = ds.ImageFolderDataset(dataset_dir=data_dir, shuffle=False)
+>>> data4 = ds.ImageFolderDataset(dataset_dir=data_dir, shuffle=False)
 >>> input_columns = ["column_names"]
 >>> op_list=[c_vision.Decode(),
 >>> c_vision.Resize((224, 244)),
 >>> py_vision.ToPIL(),
 >>> np.array, # need to convert PIL image to a NumPy array to pass it to C++ operation
 >>> c_vision.Resize((24, 24))]
->>> data1 = data1.map(operations=op_list, input_columns=input_columns)
+>>> data4 = data4.map(operations=op_list, input_columns=input_columns)
 """

 @check_compose_list