forked from mindspore-Ecosystem/mindspore
!10821 #I28T16[improve examples in dataset API document to make it okay to run(part1)]
From: @ming__blue Reviewed-by: Signed-off-by:
This commit is contained in:
commit
620b2f4256
|
@ -88,15 +88,8 @@ def zip(datasets):
|
|||
TypeError: If datasets is not a tuple.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_dir1 = "path/to/imagefolder_directory1"
|
||||
>>> dataset_dir2 = "path/to/imagefolder_directory2"
|
||||
>>> ds1 = ds.ImageFolderDataset(dataset_dir1, num_parallel_workers=8)
|
||||
>>> ds2 = ds.ImageFolderDataset(dataset_dir2, num_parallel_workers=8)
|
||||
>>>
|
||||
>>> # Create a dataset which is the combination of ds1 and ds2
|
||||
>>> data = ds.zip((ds1, ds2))
|
||||
>>> # Create a dataset which is the combination of dataset_1 and dataset_2
|
||||
>>> dataset = ds.zip((dataset_1, dataset_2))
|
||||
"""
|
||||
if len(datasets) <= 1:
|
||||
raise ValueError(
|
||||
|
@ -319,28 +312,27 @@ class Dataset:
|
|||
BucketBatchByLengthDataset, dataset bucketed and batched by length.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object.
|
||||
>>>
|
||||
>>> # Create a dataset where every 100 rows is combined into a batch
|
||||
>>> # and drops the last incomplete batch if there is one.
|
||||
>>> import numpy as np
|
||||
>>> def generate_2_columns(n):
|
||||
... for i in range(n):
|
||||
... yield (np.array([i]), np.array([j for j in range(i + 1)]))
|
||||
>>> column_names = ["col1", "col2"]
|
||||
>>> dataset = ds.GeneratorDataset(generate_2_columns(202), column_names)
|
||||
>>> bucket_boundaries = [5, 10]
|
||||
>>> bucket_batch_sizes = [5, 1, 1]
|
||||
>>> element_length_function = (lambda col1, col2: max(len(col1), len(col2)))
|
||||
>>>
|
||||
>>> # Will pad col1 to shape [2, bucket_boundaries[i]] where i is the
|
||||
>>> # index of the bucket that is currently being batched.
|
||||
>>> # Will pad col2 to a shape where each dimension is the longest in all
|
||||
>>> # the elements currently being batched.
|
||||
>>> pad_info = {"col1", ([2, None], -1)}
|
||||
>>> pad_info = {"col1": ([2, None], -1)}
|
||||
>>> pad_to_bucket_boundary = True
|
||||
>>>
|
||||
>>> data = data.bucket_batch_by_length(column_names, bucket_boundaries,
|
||||
>>> bucket_batch_sizes,
|
||||
>>> element_length_function, pad_info,
|
||||
>>> pad_to_bucket_boundary)
|
||||
>>> dataset = dataset.bucket_batch_by_length(column_names, bucket_boundaries,
|
||||
... bucket_batch_sizes,
|
||||
... element_length_function, pad_info,
|
||||
... pad_to_bucket_boundary)
|
||||
"""
|
||||
return BucketBatchByLengthDataset(self, column_names, bucket_boundaries, bucket_batch_sizes,
|
||||
element_length_function, pad_info,
|
||||
|
@ -397,26 +389,21 @@ class Dataset:
|
|||
BatchDataset, dataset batched.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object.
|
||||
>>>
|
||||
>>> # Create a dataset where every 100 rows is combined into a batch
|
||||
>>> # and drops the last incomplete batch if there is one.
|
||||
>>> data = data.batch(100, True)
|
||||
>>>
|
||||
>>> dataset = dataset.batch(100, True)
|
||||
>>> # resize image according to its batch number, if it's 5-th batch, resize to (5^2, 5^2) = (25, 25)
|
||||
>>> def np_resize(col, batchInfo):
|
||||
>>> output = col.copy()
|
||||
>>> s = (batchInfo.get_batch_num() + 1) ** 2
|
||||
>>> index = 0
|
||||
>>> for c in col:
|
||||
>>> img = Image.fromarray(c.astype('uint8')).convert('RGB')
|
||||
>>> img = img.resize((s, s), Image.ANTIALIAS)
|
||||
>>> output[index] = np.array(img)
|
||||
>>> index += 1
|
||||
>>> return (output,)
|
||||
>>> data = data.batch(batch_size=8, input_columns=["image"], per_batch_map=np_resize)
|
||||
... output = col.copy()
|
||||
... s = (batchInfo.get_batch_num() + 1) ** 2
|
||||
... index = 0
|
||||
... for c in col:
|
||||
... img = Image.fromarray(c.astype('uint8')).convert('RGB')
|
||||
... img = img.resize((s, s), Image.ANTIALIAS)
|
||||
... output[index] = np.array(img)
|
||||
... index += 1
|
||||
... return (output,)
|
||||
>>> dataset = dataset.batch(batch_size=8, input_columns=["image"], per_batch_map=np_resize)
|
||||
"""
|
||||
return BatchDataset(self, batch_size, drop_remainder, num_parallel_workers, per_batch_map, input_columns,
|
||||
output_columns, column_order, pad_info, python_multiprocessing)
|
||||
|
@ -438,13 +425,34 @@ class Dataset:
|
|||
RuntimeError: If condition name already exists.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>> import numpy as np
|
||||
>>> def gen():
|
||||
... for i in range(100):
|
||||
... yield (np.array(i),)
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object.
|
||||
>>> data = data.sync_wait("callback1")
|
||||
>>> data = data.batch(batch_size)
|
||||
>>> for batch_data in data.create_dict_iterator():
|
||||
>>> data = data.sync_update("callback1")
|
||||
>>> class Augment:
|
||||
... def __init__(self, loss):
|
||||
... self.loss = loss
|
||||
...
|
||||
... def preprocess(self, input_):
|
||||
... return input_
|
||||
...
|
||||
... def update(self, data):
|
||||
... self.loss = data["loss"]
|
||||
>>>
|
||||
>>> batch_size = 4
|
||||
>>> dataset = ds.GeneratorDataset(gen, column_names=["input"])
|
||||
>>>
|
||||
>>> aug = Augment(0)
|
||||
>>> dataset = dataset.sync_wait(condition_name="policy", callback=aug.update)
|
||||
>>> dataset = dataset.map(operations=[aug.preprocess], input_columns=["input"])
|
||||
>>> dataset = dataset.batch(batch_size)
|
||||
>>> count = 0
|
||||
>>> for data in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
||||
... assert data["input"][0] == count
|
||||
... count += batch_size
|
||||
... data = {"loss": count}
|
||||
... dataset.sync_update(condition_name="policy", data=data)
|
||||
"""
|
||||
return SyncWaitDataset(self, condition_name, num_batch, callback)
|
||||
|
||||
|
@ -474,14 +482,11 @@ class Dataset:
|
|||
RuntimeError: If exist sync operators before shuffle.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object.
|
||||
>>> # dataset is an instance of Dataset object.
|
||||
>>> # Optionally set the seed for the first epoch
|
||||
>>> ds.config.set_seed(58)
|
||||
>>>
|
||||
>>> # Create a shuffled dataset using a shuffle buffer of size 4
|
||||
>>> data = data.shuffle(4)
|
||||
>>> dataset = dataset.shuffle(4)
|
||||
"""
|
||||
return ShuffleDataset(self, buffer_size)
|
||||
|
||||
|
@ -500,17 +505,14 @@ class Dataset:
|
|||
Dataset, dataset applied by the function.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>> import mindspore.dataset.text as text
|
||||
>>>
|
||||
>>> # Declare a function which returns a Dataset object
|
||||
>>> def flat_map_func(x):
|
||||
>>> data_dir = text.to_str(x[0])
|
||||
>>> d = ds.ImageFolderDataset(data_dir)
|
||||
>>> return d
|
||||
>>> # data is an instance of a Dataset object.
|
||||
>>> data = ds.TextFileDataset(DATA_FILE)
|
||||
>>> data = data.flat_map(flat_map_func)
|
||||
... image_folder_dataset_dir = text.to_str(x[0])
|
||||
... d = ds.ImageFolderDataset(image_folder_dataset_dir)
|
||||
... return d
|
||||
>>> # dataset is an instance of a Dataset object.
|
||||
>>> dataset = ds.TextFileDataset(text_file_dataset_dir)
|
||||
>>> dataset = dataset.flat_map(flat_map_func)
|
||||
|
||||
Raises:
|
||||
TypeError: If `func` is not a function.
|
||||
|
@ -584,13 +586,9 @@ class Dataset:
|
|||
MapDataset, dataset after mapping operation.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>> import mindspore.dataset.vision.c_transforms as c_transforms
|
||||
>>>
|
||||
>>> # data is an instance of Dataset which has 2 columns, "image" and "label".
|
||||
>>> # dataset is an instance of Dataset which has 2 columns, "image" and "label".
|
||||
>>> # ds_pyfunc is an instance of Dataset which has 3 columns, "col0", "col1", and "col2".
|
||||
>>> # Each column is a 2D array of integers.
|
||||
>>>
|
||||
>>> # Set the global configuration value for num_parallel_workers to be 2.
|
||||
>>> # Operations which use this configuration value will use 2 worker threads,
|
||||
>>> # unless otherwise specified in the operator's constructor.
|
||||
|
@ -599,8 +597,8 @@ class Dataset:
|
|||
>>> ds.config.set_num_parallel_workers(2)
|
||||
>>>
|
||||
>>> # Define two operations, where each operation accepts 1 input column and outputs 1 column.
|
||||
>>> decode_op = c_transforms.Decode(rgb_format=True)
|
||||
>>> random_jitter_op = c_transforms.RandomColorAdjust((0.8, 0.8), (1, 1), (1, 1), (0, 0))
|
||||
>>> decode_op = c_vision.Decode(rgb_format=True)
|
||||
>>> random_jitter_op = c_vision.RandomColorAdjust((0.8, 0.8), (1, 1), (1, 1), (0, 0))
|
||||
>>>
|
||||
>>> # 1) Simple map example
|
||||
>>>
|
||||
|
@ -610,31 +608,31 @@ class Dataset:
|
|||
>>> # Apply decode_op on column "image". This column will be replaced by the outputted
|
||||
>>> # column of decode_op. Since column_order is not provided, both columns "image"
|
||||
>>> # and "label" will be propagated to the child node in their original order.
|
||||
>>> ds_decoded = data.map(operations, input_columns)
|
||||
>>> dataset = dataset.map(operations, input_columns)
|
||||
>>>
|
||||
>>> # Rename column "image" to "decoded_image".
|
||||
>>> output_columns = ["decoded_image"]
|
||||
>>> ds_decoded = data.map(operations, input_columns, output_columns)
|
||||
>>> dataset = dataset.map(operations, input_columns, output_columns)
|
||||
>>>
|
||||
>>> # Specify the order of the columns.
|
||||
>>> column_order ["label", "image"]
|
||||
>>> ds_decoded = data.map(operations, input_columns, None, column_order)
|
||||
>>> dataset = dataset.map(operations, input_columns, None, column_order)
|
||||
>>>
|
||||
>>> # Rename column "image" to "decoded_image" and also specify the order of the columns.
|
||||
>>> column_order ["label", "decoded_image"]
|
||||
>>> output_columns = ["decoded_image"]
|
||||
>>> ds_decoded = data.map(operations, input_columns, output_columns, column_order)
|
||||
>>> dataset = dataset.map(operations, input_columns, output_columns, column_order)
|
||||
>>>
|
||||
>>> # Rename column "image" to "decoded_image" and keep only this column.
|
||||
>>> column_order ["decoded_image"]
|
||||
>>> output_columns = ["decoded_image"]
|
||||
>>> ds_decoded = data.map(operations, input_columns, output_columns, column_order)
|
||||
>>> dataset = dataset.map(operations, input_columns, output_columns, column_order)
|
||||
>>>
|
||||
>>> # A simple example using pyfunc: Renaming columns and specifying column order
|
||||
>>> # work in the same way as the previous examples.
|
||||
>>> input_columns = ["col0"]
|
||||
>>> operations = [(lambda x: x + 1)]
|
||||
>>> ds_mapped = ds_pyfunc.map(operations, input_columns)
|
||||
>>> dataset = dataset.map(operations, input_columns)
|
||||
>>>
|
||||
>>> # 2) Map example with more than one operation
|
||||
>>>
|
||||
|
@ -651,20 +649,20 @@ class Dataset:
|
|||
>>> # the column outputted by random_jitter_op (the very last operation). All other
|
||||
>>> # columns are unchanged. Since column_order is not specified, the order of the
|
||||
>>> # columns will remain the same.
|
||||
>>> ds_mapped = data.map(operations, input_columns)
|
||||
>>> dataset = dataset.map(operations, input_columns)
|
||||
>>>
|
||||
>>> # Create a dataset that is identical to ds_mapped, except the column "image"
|
||||
>>> # that is outputted by random_jitter_op is renamed to "image_transformed".
|
||||
>>> # Specifying column order works in the same way as examples in 1).
|
||||
>>> output_columns = ["image_transformed"]
|
||||
>>> ds_mapped_and_renamed = data.map(operation, input_columns, output_columns)
|
||||
>>> dataset = dataset.map(operation, input_columns, output_columns)
|
||||
>>>
|
||||
>>> # Multiple operations using pyfunc: Renaming columns and specifying column order
|
||||
>>> # work in the same way as examples in 1).
|
||||
>>> input_columns = ["col0"]
|
||||
>>> operations = [(lambda x: x + x), (lambda x: x - 1)]
|
||||
>>> output_columns = ["col0_mapped"]
|
||||
>>> ds_mapped = ds_pyfunc.map(operations, input_columns, output_columns)
|
||||
>>> dataset = dataset.map(operations, input_columns, output_columns)
|
||||
>>>
|
||||
>>> # 3) Example where number of input columns is not equal to number of output columns
|
||||
>>>
|
||||
|
@ -687,11 +685,11 @@ class Dataset:
|
|||
>>>
|
||||
>>> # Propagate all columns to the child node in this order:
|
||||
>>> column_order = ["col0", "col2", "mod2", "mod3", "mod5", "mod7", "col1"]
|
||||
>>> ds_mapped = ds_pyfunc.map(operations, input_columns, output_columns, column_order)
|
||||
>>> dataset = dataset.map(operations, input_columns, output_columns, column_order)
|
||||
>>>
|
||||
>>> # Propagate some columns to the child node in this order:
|
||||
>>> column_order = ["mod7", "mod3", "col1"]
|
||||
>>> ds_mapped = ds_pyfunc.map(operations, input_columns, output_columns, column_order)
|
||||
>>> dataset = dataset.map(operations, input_columns, output_columns, column_order)
|
||||
"""
|
||||
|
||||
return MapDataset(self, operations, input_columns, output_columns, column_order, num_parallel_workers,
|
||||
|
@ -716,10 +714,9 @@ class Dataset:
|
|||
FilterDataset, dataset filtered.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>> # generator data(0 ~ 63)
|
||||
>>> # filter the data that greater than or equal to 11
|
||||
>>> dataset_f = dataset.filter(predicate=lambda data: data < 11, input_columns = ["data"])
|
||||
>>> dataset = dataset.filter(predicate=lambda data: data < 11, input_columns = ["data"])
|
||||
"""
|
||||
return FilterDataset(self, predicate, input_columns, num_parallel_workers)
|
||||
|
||||
|
@ -742,22 +739,20 @@ class Dataset:
|
|||
RepeatDataset, dataset repeated.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object.
|
||||
>>> # dataset is an instance of Dataset object.
|
||||
>>>
|
||||
>>> # Create a dataset where the dataset is repeated for 50 epochs
|
||||
>>> repeated = data.repeat(50)
|
||||
>>> dataset = dataset.repeat(50)
|
||||
>>>
|
||||
>>> # Create a dataset where each epoch is shuffled individually
|
||||
>>> shuffled_and_repeated = data.shuffle(10)
|
||||
>>> shuffled_and_repeated = shuffled_and_repeated.repeat(50)
|
||||
>>> dataset = dataset.shuffle(10)
|
||||
>>> dataset = dataset.repeat(50)
|
||||
>>>
|
||||
>>> # Create a dataset where the dataset is first repeated for
|
||||
>>> # 50 epochs before shuffling. The shuffle operator will treat
|
||||
>>> # the entire 50 epochs as one big dataset.
|
||||
>>> repeat_and_shuffle = data.repeat(50)
|
||||
>>> repeat_and_shuffle = repeat_and_shuffle.shuffle(10)
|
||||
>>> dataset = dataset.repeat(50)
|
||||
>>> dataset = dataset.shuffle(10)
|
||||
"""
|
||||
return RepeatDataset(self, count)
|
||||
|
||||
|
@ -773,11 +768,9 @@ class Dataset:
|
|||
SkipDataset, dataset skipped.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object.
|
||||
>>> # dataset is an instance of Dataset object.
|
||||
>>> # Create a dataset which skips first 3 elements from data
|
||||
>>> data = data.skip(3)
|
||||
>>> dataset = dataset.skip(3)
|
||||
"""
|
||||
return SkipDataset(self, count)
|
||||
|
||||
|
@ -799,11 +792,9 @@ class Dataset:
|
|||
TakeDataset, dataset taken.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object.
|
||||
>>> # dataset is an instance of Dataset object.
|
||||
>>> # Create a dataset where the dataset includes 50 elements.
|
||||
>>> data = data.take(50)
|
||||
>>> dataset = dataset.take(50)
|
||||
"""
|
||||
return TakeDataset(self, count)
|
||||
|
||||
|
@ -911,14 +902,10 @@ class Dataset:
|
|||
tuple(Dataset), a tuple of datasets that have been split.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_files = "/path/to/text_file/*"
|
||||
>>>
|
||||
>>> # TextFileDataset is not a mappable dataset, so this non-optimized split will be called.
|
||||
>>> # Since many datasets have shuffle on by default, set shuffle to False if split will be called!
|
||||
>>> data = ds.TextFileDataset(dataset_files, shuffle=False)
|
||||
>>> train, test = data.split([0.9, 0.1])
|
||||
>>> dataset = ds.TextFileDataset(text_file_dataset_dir, shuffle=False)
|
||||
>>> train_dataset, test_dataset = dataset.split([0.9, 0.1])
|
||||
"""
|
||||
if self.is_shuffled():
|
||||
logger.warning("Dataset is shuffled before split.")
|
||||
|
@ -960,11 +947,8 @@ class Dataset:
|
|||
ZipDataset, dataset zipped.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # ds1 and ds2 are instances of Dataset object
|
||||
>>> # Create a dataset which is the combination of ds1 and ds2
|
||||
>>> data = ds1.zip(ds2)
|
||||
>>> # Create a dataset which is the combination of dataset and dataset_1
|
||||
>>> dataset = dataset.zip(dataset_1)
|
||||
"""
|
||||
if isinstance(datasets, tuple):
|
||||
datasets = (self, *datasets)
|
||||
|
@ -990,14 +974,10 @@ class Dataset:
|
|||
ConcatDataset, dataset concatenated.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # ds1 and ds2 are instances of Dataset object
|
||||
>>>
|
||||
>>> # Create a dataset by concatenating ds1 and ds2 with "+" operator
|
||||
>>> data1 = ds1 + ds2
|
||||
>>> # Create a dataset by concatenating ds1 and ds2 with concat operation
|
||||
>>> data1 = ds1.concat(ds2)
|
||||
>>> # Create a dataset by concatenating dataset_1 and dataset_2 with "+" operator
|
||||
>>> dataset = dataset_1 + dataset_2
|
||||
>>> # Create a dataset by concatenating dataset_1 and dataset_2 with concat operation
|
||||
>>> dataset = dataset_1.concat(dataset_2)
|
||||
"""
|
||||
if isinstance(datasets, Dataset):
|
||||
datasets = [self] + [datasets]
|
||||
|
@ -1020,16 +1000,14 @@ class Dataset:
|
|||
RenameDataset, dataset renamed.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object.
|
||||
>>> # dataset is an instance of Dataset object.
|
||||
>>> input_columns = ["input_col1", "input_col2", "input_col3"]
|
||||
>>> output_columns = ["output_col1", "output_col2", "output_col3"]
|
||||
>>>
|
||||
>>> # Create a dataset where input_col1 is renamed to output_col1, and
|
||||
>>> # input_col2 is renamed to output_col2, and input_col3 is renamed
|
||||
>>> # to output_col3.
|
||||
>>> data = data.rename(input_columns=input_columns, output_columns=output_columns)
|
||||
>>> dataset = dataset.rename(input_columns=input_columns, output_columns=output_columns)
|
||||
"""
|
||||
|
||||
return RenameDataset(self, input_columns, output_columns)
|
||||
|
@ -1049,14 +1027,12 @@ class Dataset:
|
|||
ProjectDataset, dataset projected.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object
|
||||
>>> # dataset is an instance of Dataset object
|
||||
>>> columns_to_project = ["column3", "column1", "column2"]
|
||||
>>>
|
||||
>>> # Create a dataset that consists of column3, column1, column2
|
||||
>>> # in that order, regardless of the original order of columns.
|
||||
>>> data = data.project(columns=columns_to_project)
|
||||
>>> dataset = dataset.project(columns=columns_to_project)
|
||||
"""
|
||||
|
||||
return ProjectDataset(self, columns)
|
||||
|
@ -1084,11 +1060,17 @@ class Dataset:
|
|||
Vocab, vocab built from the dataset.
|
||||
|
||||
Example:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object
|
||||
>>> data = data.build_vocab(columns=["column3", "column1", "column2"], freq_range=(1, 10), top_k=5,
|
||||
>>> special_tokens=["<pad>", "<unk>"], special_first=True)
|
||||
>>> def gen_corpus():
|
||||
... # key: word, value: number of occurrences, reason for using letters is so their order is apparent
|
||||
... corpus = {"Z": 4, "Y": 4, "X": 4, "W": 3, "U": 3, "V": 2, "T": 1}
|
||||
... for k, v in corpus.items():
|
||||
... yield (np.array([k] * v, dtype='S'),)
|
||||
>>> column_names = ["column1","column2","column3"]
|
||||
>>> dataset = ds.GeneratorDataset(gen_corpus, column_names)
|
||||
>>> dataset = dataset.build_vocab(columns=["column3", "column1", "column2"],
|
||||
... freq_range=(1, 10), top_k=5,
|
||||
... special_tokens=["<pad>", "<unk>"],
|
||||
... special_first=True,vocab='vocab')
|
||||
|
||||
"""
|
||||
vocab = cde.Vocab()
|
||||
|
@ -1143,13 +1125,19 @@ class Dataset:
|
|||
SentencePieceVocab, vocab built from the dataset.
|
||||
|
||||
Example:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object
|
||||
>>> data = data.build_sentencepiece_vocab(columns=["column3", "column1", "column2"], vocab_size=5000,
|
||||
>>> character_coverage=0.9995, model_type=SentencePieceModel.Unigram,
|
||||
>>> params={})
|
||||
|
||||
>>> from mindspore.dataset.text import SentencePieceModel
|
||||
>>> def gen_corpus():
|
||||
... # key: word, value: number of occurrences, reason for using letters is so their order is apparent
|
||||
... corpus = {"Z": 4, "Y": 4, "X": 4, "W": 3, "U": 3, "V": 2, "T": 1}
|
||||
... for k, v in corpus.items():
|
||||
... yield (np.array([k] * v, dtype='S'),)
|
||||
>>> column_names = ["column1","column2","column3"]
|
||||
>>> dataset = ds.GeneratorDataset(gen_corpus, column_names)
|
||||
>>> dataset = dataset.build_sentencepiece_vocab(columns=["column3", "column1", "column2"],
|
||||
... vocab_size=5000,
|
||||
... character_coverage=0.9995,
|
||||
... model_type=SentencePieceModel.Unigram,
|
||||
... params={},vocab='vocab')
|
||||
"""
|
||||
vocab = cde.SentencePieceVocab()
|
||||
|
||||
|
@ -1184,17 +1172,15 @@ class Dataset:
|
|||
Dataset, dataset applied by the function.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object
|
||||
>>> # dataset is an instance of Dataset object
|
||||
>>>
|
||||
>>> # Declare an apply_func function which returns a Dataset object
|
||||
>>> def apply_func(ds):
|
||||
>>> ds = ds.batch(2)
|
||||
>>> return ds
|
||||
>>> def apply_func(data):
|
||||
... data = data.batch(2)
|
||||
... return data
|
||||
>>>
|
||||
>>> # Use apply to call apply_func
|
||||
>>> data = data.apply(apply_func)
|
||||
>>> dataset = dataset.apply(apply_func)
|
||||
|
||||
Raises:
|
||||
TypeError: If apply_func is not a function.
|
||||
|
@ -1356,16 +1342,14 @@ class Dataset:
|
|||
TupleIterator, tuple iterator over the dataset.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object
|
||||
>>> # dataset is an instance of Dataset object
|
||||
>>>
|
||||
>>> # Create an iterator
|
||||
>>> # The columns in the data obtained by the iterator will not be changed.
|
||||
>>> iterator = data.create_tuple_iterator()
|
||||
>>> # The columns in the dataset obtained by the iterator will not be changed.
|
||||
>>> iterator = dataset.create_tuple_iterator()
|
||||
>>> for item in iterator:
|
||||
>>> # convert the returned tuple to a list and print
|
||||
>>> print(list(item))
|
||||
... # convert the returned tuple to a list and print
|
||||
... print(list(item))
|
||||
"""
|
||||
if output_numpy is None:
|
||||
output_numpy = False
|
||||
|
@ -1391,16 +1375,14 @@ class Dataset:
|
|||
DictIterator, dictionary iterator over the dataset.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object
|
||||
>>> # dataset is an instance of Dataset object
|
||||
>>>
|
||||
>>> # create an iterator
|
||||
>>> # The columns in the data obtained by the iterator might be changed.
|
||||
>>> iterator = data.create_dict_iterator()
|
||||
>>> iterator = dataset.create_dict_iterator()
|
||||
>>> for item in iterator:
|
||||
>>> # print the data in column1
|
||||
>>> print(item["column1"])
|
||||
... # print the data in column1
|
||||
... print(item["column1"])
|
||||
"""
|
||||
if output_numpy is None:
|
||||
output_numpy = False
|
||||
|
@ -1422,11 +1404,9 @@ class Dataset:
|
|||
tuple, tuple of the input index information.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # data is an instance of Dataset object
|
||||
>>> data = ds.NumpySlicesDataset([1, 2, 3], column_names=["col_1"])
|
||||
>>> print(data.input_indexs())
|
||||
>>> # dataset is an instance of Dataset object
|
||||
>>> dataset = ds.NumpySlicesDataset([1, 2, 3], column_names=["col_1"])
|
||||
>>> print(dataset.input_indexs)
|
||||
"""
|
||||
if self._input_indexs != ():
|
||||
return self._input_indexs
|
||||
|
@ -1718,15 +1698,12 @@ class MappableDataset(SourceDataset):
|
|||
new_sampler (Sampler): The sampler to use for the current dataset.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_dir = "/path/to/imagefolder_directory"
|
||||
>>> # Note: A SequentialSampler is created by default
|
||||
>>> data = ds.ImageFolderDataset(dataset_dir)
|
||||
>>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir)
|
||||
>>>
|
||||
>>> # Use a DistributedSampler instead of the SequentialSampler
|
||||
>>> new_sampler = ds.DistributedSampler(10, 2)
|
||||
>>> data.use_sampler(new_sampler)
|
||||
>>> dataset.use_sampler(new_sampler)
|
||||
"""
|
||||
if new_sampler is None:
|
||||
raise TypeError("Input sampler can not be None.")
|
||||
|
@ -1804,21 +1781,17 @@ class MappableDataset(SourceDataset):
|
|||
tuple(Dataset), a tuple of datasets that have been split.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_dir = "/path/to/imagefolder_directory"
|
||||
>>>
|
||||
>>> # Since many datasets have shuffle on by default, set shuffle to False if split will be called!
|
||||
>>> data = ds.ImageFolderDataset(dataset_dir, shuffle=False)
|
||||
>>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir, shuffle=False)
|
||||
>>>
|
||||
>>> # Set the seed, and tell split to use this seed when randomizing.
|
||||
>>> # This is needed because sharding will be done later
|
||||
>>> ds.config.set_seed(58)
|
||||
>>> train, test = data.split([0.9, 0.1])
|
||||
>>> train_dataset, test_dataset = dataset.split([0.9, 0.1])
|
||||
>>>
|
||||
>>> # To shard the train dataset, use a DistributedSampler
|
||||
>>> train_sampler = ds.DistributedSampler(10, 2)
|
||||
>>> train.use_sampler(train_sampler)
|
||||
>>> train_dataset.use_sampler(train_sampler)
|
||||
"""
|
||||
if self.is_shuffled():
|
||||
logger.warning("Dataset is shuffled before split.")
|
||||
|
@ -3062,20 +3035,17 @@ class ImageFolderDataset(MappableDataset):
|
|||
ValueError: If shard_id is invalid (< 0 or >= num_shards).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # Set path to the imagefolder directory.
|
||||
>>> # This directory needs to contain sub-directories which contain the images
|
||||
>>> dataset_dir = "/path/to/imagefolder_directory"
|
||||
>>>
|
||||
>>> # 1) Read all samples (image files) in dataset_dir with 8 threads
|
||||
>>> imagefolder_dataset = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8)
|
||||
>>> # 1) Read all samples (image files) in image_folder_dataset_dir with 8 threads
|
||||
>>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir,
|
||||
... num_parallel_workers=8)
|
||||
>>>
|
||||
>>> # 2) Read all samples (image files) from folder cat and folder dog with label 0 and 1
|
||||
>>> imagefolder_dataset = ds.ImageFolderDataset(dataset_dir, class_indexing={"cat":0, "dog":1})
|
||||
>>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir,
|
||||
... class_indexing={"cat":0, "dog":1})
|
||||
>>>
|
||||
>>> # 3) Read all samples (image files) in dataset_dir with extensions .JPEG and .png (case sensitive)
|
||||
>>> imagefolder_dataset = ds.ImageFolderDataset(dataset_dir, extensions=[".JPEG", ".png"])
|
||||
>>> # 3) Read all samples (image files) in image_folder_dataset_dir with extensions .JPEG and .png (case sensitive)
|
||||
>>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir,
|
||||
... extensions=[".JPEG", ".png"])
|
||||
"""
|
||||
|
||||
@check_imagefolderdataset
|
||||
|
@ -3195,9 +3165,8 @@ class MnistDataset(MappableDataset):
|
|||
(default=None, expected order behavior shown in the table).
|
||||
sampler (Sampler, optional): Object used to choose samples from the
|
||||
dataset (default=None, expected order behavior shown in the table).
|
||||
num_shards (int, optional): Number of shards that the dataset will be divided
|
||||
into (default=None). When this argument is specified, 'num_samples' reflects
|
||||
the max sample number of per shard.
|
||||
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
|
||||
When this argument is specified, 'num_samples' reflects the max sample number of per shard.
|
||||
shard_id (int, optional): The shard ID within num_shards (default=None). This
|
||||
argument can only be specified when num_shards is also specified.
|
||||
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
|
||||
|
@ -3211,11 +3180,8 @@ class MnistDataset(MappableDataset):
|
|||
ValueError: If shard_id is invalid (< 0 or >= num_shards).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_dir = "/path/to/mnist_folder"
|
||||
>>> # Read 3 samples from MNIST dataset
|
||||
>>> mnist_dataset = ds.MnistDataset(dataset_dir=dataset_dir, num_samples=3)
|
||||
>>> dataset = ds.MnistDataset(dataset_dir=mnist_dataset_dir, num_samples=3)
|
||||
>>> # Note: In mnist_dataset dataset, each dictionary has keys "image" and "label"
|
||||
"""
|
||||
|
||||
|
@ -3718,33 +3684,31 @@ class GeneratorDataset(MappableDataset):
|
|||
option could be beneficial if the Python operation is computational heavy (default=True).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # 1) Multidimensional generator function as callable input
|
||||
>>> def GeneratorMD():
|
||||
>>> for i in range(64):
|
||||
>>> yield (np.array([[i, i + 1], [i + 2, i + 3]]),)
|
||||
... for i in range(64):
|
||||
... yield (np.array([[i, i + 1], [i + 2, i + 3]]),)
|
||||
>>> # Create multi_dimension_generator_dataset with GeneratorMD and column name "multi_dimensional_data"
|
||||
>>> multi_dimension_generator_dataset = ds.GeneratorDataset(GeneratorMD, ["multi_dimensional_data"])
|
||||
>>>
|
||||
>>> # 2) Multi-column generator function as callable input
|
||||
>>> def GeneratorMC(maxid = 64):
|
||||
>>> for i in range(maxid):
|
||||
>>> yield (np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]]))
|
||||
... for i in range(maxid):
|
||||
... yield (np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]]))
|
||||
>>> # Create multi_column_generator_dataset with GeneratorMC and column names "col1" and "col2"
|
||||
>>> multi_column_generator_dataset = ds.GeneratorDataset(GeneratorMC, ["col1", "col2"])
|
||||
>>>
|
||||
>>> # 3) Iterable dataset as iterable input
|
||||
>>> class MyIterable():
|
||||
>>> def __iter__(self):
|
||||
>>> return # User implementation
|
||||
... def __iter__(self):
|
||||
... return # User implementation
|
||||
>>> # Create iterable_generator_dataset with MyIterable object
|
||||
>>> iterable_generator_dataset = ds.GeneratorDataset(MyIterable(), ["col1"])
|
||||
>>>
|
||||
>>> # 4) Random accessible dataset as random accessible input
|
||||
>>> class MyRA():
|
||||
>>> def __getitem__(self, index):
|
||||
>>> return # User implementation
|
||||
... def __getitem__(self, index):
|
||||
... return # User implementation
|
||||
>>> # Create ra_generator_dataset with MyRA object
|
||||
>>> ra_generator_dataset = ds.GeneratorDataset(MyRA(), ["col1"])
|
||||
>>> # List/Dict/Tuple is also random accessible
|
||||
|
@ -3882,22 +3846,21 @@ class TFRecordDataset(SourceDataset):
|
|||
(default=None which means no cache is used).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>> import mindspore.common.dtype as mstype
|
||||
>>>
|
||||
>>> dataset_files = ["/path/to/1", "/path/to/2"] # contains 1 or multiple tf data files
|
||||
>>> tfrecord_dataset_dir = ["/path/to/tfrecord_dataset_file"] # contains 1 or multiple tf data files
|
||||
>>>
|
||||
>>> # 1) Get all rows from dataset_files with no explicit schema
|
||||
>>> # 1) Get all rows from tfrecord_dataset_dir with no explicit schema
|
||||
>>> # The meta-data in the first row will be used as a schema.
|
||||
>>> tfdataset = ds.TFRecordDataset(dataset_files=dataset_files)
|
||||
>>> dataset = ds.TFRecordDataset(dataset_files=tfrecord_dataset_dir)
|
||||
>>>
|
||||
>>> # 2) Get all rows from dataset_files with user-defined schema
|
||||
>>> schema = ds.Schema()
|
||||
>>> # 2) Get all rows from tfrecord_dataset_dir with user-defined schema
|
||||
>>> schema = ds.Schema("/path/to/tfrecord_schema_file")
|
||||
>>> schema.add_column('col_1d', de_type=mindspore.int64, shape=[2])
|
||||
>>> tfdataset = ds.TFRecordDataset(dataset_files=dataset_files, schema=schema)
|
||||
>>> dataset = ds.TFRecordDataset(dataset_files=tfrecord_dataset_dir, schema=schema)
|
||||
>>>
|
||||
>>> # 3) Get all rows from dataset_files with schema file "./schema.json"
|
||||
>>> tfdataset = ds.TFRecordDataset(dataset_files=dataset_files, schema="./schema.json")
|
||||
>>> # 3) Get all rows from tfrecord_dataset_dir with schema file "./schema.json"
|
||||
>>> dataset = ds.TFRecordDataset(dataset_files=tfrecord_dataset_dir, schema="./schema.json")
|
||||
"""
|
||||
|
||||
def parse(self, children=None):
|
||||
|
@ -4075,16 +4038,12 @@ class ManifestDataset(MappableDataset):
|
|||
ValueError: If shard_id is invalid (< 0 or >= num_shards).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_file = "/path/to/manifest_file.manifest"
|
||||
>>>
|
||||
>>> # 1) Read all samples specified in manifest_file dataset with 8 threads for training
|
||||
>>> manifest_dataset = ds.ManifestDataset(dataset_file, usage="train", num_parallel_workers=8)
|
||||
>>> # 1) Read all samples specified in manifest_dataset_dir dataset with 8 threads for training
|
||||
>>> dataset = ds.ManifestDataset(manifest_dataset_dir, usage="train", num_parallel_workers=8)
|
||||
>>>
|
||||
>>> # 2) Read samples (specified in manifest_file.manifest) for shard 0
|
||||
>>> # in a 2-way distributed training setup
|
||||
>>> manifest_dataset = ds.ManifestDataset(dataset_file, num_shards=2, shard_id=0)
|
||||
>>> dataset = ds.ManifestDataset(manifest_dataset_dir, num_shards=2, shard_id=0)
|
||||
|
||||
"""
|
||||
|
||||
|
@ -4239,18 +4198,14 @@ class Cifar10Dataset(MappableDataset):
|
|||
ValueError: If shard_id is invalid (< 0 or >= num_shards).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_dir = "/path/to/cifar10_dataset_directory"
|
||||
>>>
|
||||
>>> # 1) Get all samples from CIFAR10 dataset in sequence
|
||||
>>> dataset = ds.Cifar10Dataset(dataset_dir=dataset_dir, shuffle=False)
|
||||
>>> dataset = ds.Cifar10Dataset(dataset_dir=cifar10_dataset_dir, shuffle=False)
|
||||
>>>
|
||||
>>> # 2) Randomly select 350 samples from CIFAR10 dataset
|
||||
>>> dataset = ds.Cifar10Dataset(dataset_dir=dataset_dir, num_samples=350, shuffle=True)
|
||||
>>> dataset = ds.Cifar10Dataset(dataset_dir=cifar10_dataset_dir, num_samples=350, shuffle=True)
|
||||
>>>
|
||||
>>> # 3) Get samples from CIFAR10 dataset for shard 0 in a 2-way distributed training
|
||||
>>> dataset = ds.Cifar10Dataset(dataset_dir=dataset_dir, num_shards=2, shard_id=0)
|
||||
>>> dataset = ds.Cifar10Dataset(dataset_dir=cifar10_dataset_dir, num_shards=2, shard_id=0)
|
||||
>>>
|
||||
>>> # In CIFAR10 dataset, each dictionary has keys "image" and "label"
|
||||
"""
|
||||
|
@ -4381,15 +4336,11 @@ class Cifar100Dataset(MappableDataset):
|
|||
ValueError: If shard_id is invalid (< 0 or >= num_shards).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_dir = "/path/to/cifar100_dataset_directory"
|
||||
>>>
|
||||
>>> # 1) Get all samples from CIFAR100 dataset in sequence
|
||||
>>> cifar100_dataset = ds.Cifar100Dataset(dataset_dir=dataset_dir, shuffle=False)
|
||||
>>> dataset = ds.Cifar100Dataset(dataset_dir=cifar100_dataset_dir, shuffle=False)
|
||||
>>>
|
||||
>>> # 2) Randomly select 350 samples from CIFAR100 dataset
|
||||
>>> cifar100_dataset = ds.Cifar100Dataset(dataset_dir=dataset_dir, num_samples=350, shuffle=True)
|
||||
>>> dataset = ds.Cifar100Dataset(dataset_dir=cifar100_dataset_dir, num_samples=350, shuffle=True)
|
||||
>>>
|
||||
>>> # In CIFAR100 dataset, each dictionary has 3 keys: "image", "fine_label" and "coarse_label"
|
||||
"""
|
||||
|
@ -4544,12 +4495,11 @@ class Schema:
|
|||
RuntimeError: If schema file failed to load.
|
||||
|
||||
Example:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>> import mindspore.common.dtype as mstype
|
||||
>>>
|
||||
>>> # Create schema; specify column name, mindspore.dtype and shape of the column
|
||||
>>> schema = ds.Schema()
|
||||
>>> schema.add_column('col1', de_type=mindspore.int64, shape=[2])
|
||||
>>> schema.add_column('col1', de_type=mstype.int64, shape=[2])
|
||||
"""
|
||||
|
||||
@check_schema
|
||||
|
@ -4733,21 +4683,17 @@ class VOCDataset(MappableDataset):
|
|||
ValueError: If shard_id is invalid (< 0 or >= num_shards).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_dir = "/path/to/voc_dataset_directory"
|
||||
>>>
|
||||
>>> # 1) Read VOC data for segmentatation training
|
||||
>>> voc_dataset = ds.VOCDataset(dataset_dir, task="Segmentation", usage="train")
|
||||
>>> dataset = ds.VOCDataset(voc_dataset_dir, task="Segmentation", usage="train")
|
||||
>>>
|
||||
>>> # 2) Read VOC data for detection training
|
||||
>>> voc_dataset = ds.VOCDataset(dataset_dir, task="Detection", usage="train")
|
||||
>>> dataset = ds.VOCDataset(voc_dataset_dir, task="Detection", usage="train")
|
||||
>>>
|
||||
>>> # 3) Read all VOC dataset samples in dataset_dir with 8 threads in random order
|
||||
>>> voc_dataset = ds.VOCDataset(dataset_dir, task="Detection", usage="train", num_parallel_workers=8)
|
||||
>>> # 3) Read all VOC dataset samples in voc_dataset_dir with 8 threads in random order
|
||||
>>> dataset = ds.VOCDataset(voc_dataset_dir, task="Detection", usage="train", num_parallel_workers=8)
|
||||
>>>
|
||||
>>> # 4) Read then decode all VOC dataset samples in dataset_dir in sequence
|
||||
>>> voc_dataset = ds.VOCDataset(dataset_dir, task="Detection", usage="train", decode=True, shuffle=False)
|
||||
>>> # 4) Read then decode all VOC dataset samples in voc_dataset_dir in sequence
|
||||
>>> dataset = ds.VOCDataset(voc_dataset_dir, task="Detection", usage="train", decode=True, shuffle=False)
|
||||
>>>
|
||||
>>> # In VOC dataset, if task='Segmentation', each dictionary has keys "image" and "target"
|
||||
>>> # In VOC dataset, if task='Detection', each dictionary has keys "image" and "annotation"
|
||||
|
@ -4928,22 +4874,17 @@ class CocoDataset(MappableDataset):
|
|||
ValueError: If shard_id is invalid (< 0 or >= num_shards).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_dir = "/path/to/coco_dataset_directory/image_folder"
|
||||
>>> annotation_file = "/path/to/coco_dataset_directory/annotation_folder/annotation.json"
|
||||
>>>
|
||||
>>> # 1) Read COCO data for Detection task
|
||||
>>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Detection')
|
||||
>>> dataset = ds.CocoDataset(coco_dataset_dir, annotation_file=coco_annotation_file, task='Detection')
|
||||
>>>
|
||||
>>> # 2) Read COCO data for Stuff task
|
||||
>>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Stuff')
|
||||
>>> dataset = ds.CocoDataset(coco_dataset_dir, annotation_file=coco_annotation_file, task='Stuff')
|
||||
>>>
|
||||
>>> # 3) Read COCO data for Panoptic task
|
||||
>>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Panoptic')
|
||||
>>> dataset = ds.CocoDataset(coco_dataset_dir, annotation_file=coco_annotation_file, task='Panoptic')
|
||||
>>>
|
||||
>>> # 4) Read COCO data for Keypoint task
|
||||
>>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Keypoint')
|
||||
>>> dataset = ds.CocoDataset(coco_dataset_dir, annotation_file=coco_annotation_file, task='Keypoint')
|
||||
>>>
|
||||
>>> # In COCO dataset, each dictionary has keys "image" and "annotation"
|
||||
"""
|
||||
|
@ -5071,10 +5012,7 @@ class CelebADataset(MappableDataset):
|
|||
(default=None which means no cache is used).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_dir = "/path/to/celeba_directory"
|
||||
>>> dataset = ds.CelebADataset(dataset_dir=dataset_dir, usage='train')
|
||||
>>> dataset = ds.CelebADataset(dataset_dir=celeba_dataset_dir, usage='train')
|
||||
"""
|
||||
|
||||
def parse(self, children=None):
|
||||
|
@ -5185,10 +5123,8 @@ class CLUEDataset(SourceDataset):
|
|||
(default=None which means no cache is used).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_files = ["/path/to/1", "/path/to/2"] # contains 1 or multiple text files
|
||||
>>> dataset = ds.CLUEDataset(dataset_files=dataset_files, task='AFQMC', usage='train')
|
||||
>>> clue_dataset_dir = ["/path/to/clue_dataset_file"] # contains 1 or multiple text files
|
||||
>>> dataset = ds.CLUEDataset(dataset_files=clue_dataset_dir, task='AFQMC', usage='train')
|
||||
"""
|
||||
|
||||
def parse(self, children=None):
|
||||
|
@ -5421,10 +5357,8 @@ class CSVDataset(SourceDataset):
|
|||
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_files = ["/path/to/1", "/path/to/2"] # contains 1 or multiple text files
|
||||
>>> dataset = ds.CSVDataset(dataset_files=dataset_files, column_names=['col1', 'col2', 'col3', 'col4'])
|
||||
>>> csv_dataset_dir = ["/path/to/csv_dataset_file"]
|
||||
>>> dataset = ds.CSVDataset(dataset_files=csv_dataset_dir, column_names=['col1', 'col2', 'col3', 'col4'])
|
||||
"""
|
||||
|
||||
def parse(self, children=None):
|
||||
|
@ -5528,10 +5462,8 @@ class TextFileDataset(SourceDataset):
|
|||
(default=None which means no cache is used).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_files = ["/path/to/1", "/path/to/2"] # contains 1 or multiple text files
|
||||
>>> dataset = ds.TextFileDataset(dataset_files=dataset_files)
|
||||
>>> # contains 1 or multiple text files
|
||||
>>> dataset = ds.TextFileDataset(dataset_files=text_file_dataset_dir)
|
||||
"""
|
||||
|
||||
def parse(self, children=None):
|
||||
|
@ -5725,24 +5657,22 @@ class NumpySlicesDataset(GeneratorDataset):
|
|||
when num_shards is also specified. Random accessible input is required.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> # 1) Input data can be a list
|
||||
>>> data = [1, 2, 3]
|
||||
>>> dataset1 = ds.NumpySlicesDataset(data, column_names=["column_1"])
|
||||
>>> dataset = ds.NumpySlicesDataset(data, column_names=["column_1"])
|
||||
>>>
|
||||
>>> # 2) Input data can be a dictionary, and column_names will be its keys
|
||||
>>> data = {"a": [1, 2], "b": [3, 4]}
|
||||
>>> dataset2 = ds.NumpySlicesDataset(data)
|
||||
>>> dataset = ds.NumpySlicesDataset(data)
|
||||
>>>
|
||||
>>> # 3) Input data can be a tuple of lists (or NumPy arrays), each tuple element refers to data in each column
|
||||
>>> data = ([1, 2], [3, 4], [5, 6])
|
||||
>>> dataset3 = ds.NumpySlicesDataset(data, column_names=["column_1", "column_2", "column_3"])
|
||||
>>> dataset = ds.NumpySlicesDataset(data, column_names=["column_1", "column_2", "column_3"])
|
||||
>>>
|
||||
>>> # 4) Load data from CSV file
|
||||
>>> import pandas as pd
|
||||
>>> df = pd.read_csv("file.csv")
|
||||
>>> dataset4 = ds.NumpySlicesDataset(dict(df), shuffle=False)
|
||||
>>> df = pd.read_csv(csv_dataset_dir)
|
||||
>>> dataset = ds.NumpySlicesDataset(dict(df), shuffle=False)
|
||||
"""
|
||||
|
||||
@check_numpyslicesdataset
|
||||
|
@ -5787,9 +5717,9 @@ class PaddedDataset(GeneratorDataset):
|
|||
ValueError: If the padded_samples is empty.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>> data1 = [{'image': np.zeros(1, np.uint8)}, {'image': np.zeros(2, np.uint8)}]
|
||||
>>> ds1 = ds.PaddedDataset(data1)
|
||||
>>> import numpy as np
|
||||
>>> data = [{'image': np.zeros(1, np.uint8)}, {'image': np.zeros(2, np.uint8)}]
|
||||
>>> dataset = ds.PaddedDataset(data)
|
||||
"""
|
||||
|
||||
@check_paddeddataset
|
||||
|
|
|
@ -72,11 +72,9 @@ class GraphData:
|
|||
the server automatically exits (default=True).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> data_graph = ds.GraphData('dataset_file', 2)
|
||||
>>> nodes = data_graph.get_all_nodes(0)
|
||||
>>> features = data_graph.get_node_feature(nodes, [1])
|
||||
>>> graph_dataset = ds.GraphData(graph_dataset_dir, 2)
|
||||
>>> nodes = graph_dataset.get_all_nodes(0)
|
||||
>>> features = graph_dataset.get_node_feature(nodes, [1])
|
||||
"""
|
||||
|
||||
@check_gnn_graphdata
|
||||
|
@ -116,10 +114,7 @@ class GraphData:
|
|||
numpy.ndarray, array of nodes.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> data_graph = ds.GraphData('dataset_file', 2)
|
||||
>>> nodes = data_graph.get_all_nodes(0)
|
||||
>>> nodes = graph_dataset.get_all_nodes(0)
|
||||
|
||||
Raises:
|
||||
TypeError: If `node_type` is not integer.
|
||||
|
@ -140,10 +135,7 @@ class GraphData:
|
|||
numpy.ndarray, array of edges.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> data_graph = ds.GraphData('dataset_file', 2)
|
||||
>>> nodes = data_graph.get_all_edges(0)
|
||||
>>> edges = graph_dataset.get_all_edges(0)
|
||||
|
||||
Raises:
|
||||
TypeError: If `edge_type` is not integer.
|
||||
|
@ -183,11 +175,8 @@ class GraphData:
|
|||
numpy.ndarray, array of neighbors.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> data_graph = ds.GraphData('dataset_file', 2)
|
||||
>>> nodes = data_graph.get_all_nodes(0)
|
||||
>>> neighbors = data_graph.get_all_neighbors(nodes, 0)
|
||||
>>> nodes = graph_dataset.get_all_nodes(0)
|
||||
>>> neighbors = graph_dataset.get_all_neighbors(nodes, 0)
|
||||
|
||||
Raises:
|
||||
TypeError: If `node_list` is not list or ndarray.
|
||||
|
@ -222,11 +211,8 @@ class GraphData:
|
|||
numpy.ndarray, array of neighbors.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> data_graph = ds.GraphData('dataset_file', 2)
|
||||
>>> nodes = data_graph.get_all_nodes(0)
|
||||
>>> neighbors = data_graph.get_sampled_neighbors(nodes, [2, 2], [0, 0])
|
||||
>>> nodes = graph_dataset.get_all_nodes(0)
|
||||
>>> neighbors = graph_dataset.get_sampled_neighbors(nodes, [2, 2], [0, 0])
|
||||
|
||||
Raises:
|
||||
TypeError: If `node_list` is not list or ndarray.
|
||||
|
@ -254,11 +240,8 @@ class GraphData:
|
|||
numpy.ndarray, array of neighbors.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> data_graph = ds.GraphData('dataset_file', 2)
|
||||
>>> nodes = data_graph.get_all_nodes(0)
|
||||
>>> neg_neighbors = data_graph.get_neg_sampled_neighbors(nodes, 5, 0)
|
||||
>>> nodes = graph_dataset.get_all_nodes(0)
|
||||
>>> neg_neighbors = graph_dataset.get_neg_sampled_neighbors(nodes, 5, 0)
|
||||
|
||||
Raises:
|
||||
TypeError: If `node_list` is not list or ndarray.
|
||||
|
@ -283,11 +266,8 @@ class GraphData:
|
|||
numpy.ndarray, array of features.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> data_graph = ds.GraphData('dataset_file', 2)
|
||||
>>> nodes = data_graph.get_all_nodes(0)
|
||||
>>> features = data_graph.get_node_feature(nodes, [1])
|
||||
>>> nodes = graph_dataset.get_all_nodes(0)
|
||||
>>> features = graph_dataset.get_node_feature(nodes, [1])
|
||||
|
||||
Raises:
|
||||
TypeError: If `node_list` is not list or ndarray.
|
||||
|
@ -315,11 +295,8 @@ class GraphData:
|
|||
numpy.ndarray, array of features.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> data_graph = ds.GraphData('dataset_file', 2)
|
||||
>>> edges = data_graph.get_all_edges(0)
|
||||
>>> features = data_graph.get_edge_feature(edges, [1])
|
||||
>>> edges = graph_dataset.get_all_edges(0)
|
||||
>>> features = graph_dataset.get_edge_feature(edges, [1])
|
||||
|
||||
Raises:
|
||||
TypeError: If `edge_list` is not list or ndarray.
|
||||
|
@ -370,10 +347,7 @@ class GraphData:
|
|||
numpy.ndarray, array of nodes.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> data_graph = ds.GraphData('dataset_file', 2)
|
||||
>>> nodes = data_graph.random_walk([1,2], [1,2,1,2,1])
|
||||
>>> nodes = graph_dataset.random_walk([1,2], [1,2,1,2,1])
|
||||
|
||||
Raises:
|
||||
TypeError: If `target_nodes` is not list or ndarray.
|
||||
|
|
|
@ -321,13 +321,11 @@ class DistributedSampler(BuiltinSampler):
|
|||
should be no more than num_shards.
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_dir = "path/to/imagefolder_directory"
|
||||
>>>
|
||||
>>> # creates a distributed sampler with 10 shards in total. This shard is shard 5.
|
||||
>>> sampler = ds.DistributedSampler(10, 5)
|
||||
>>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler)
|
||||
>>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir,
|
||||
... num_parallel_workers=8,
|
||||
... sampler=sampler)
|
||||
|
||||
Raises:
|
||||
ValueError: If num_shards is not positive.
|
||||
|
@ -403,13 +401,11 @@ class PKSampler(BuiltinSampler):
|
|||
num_samples (int, optional): The number of samples to draw (default=None, all elements).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_dir = "path/to/imagefolder_directory"
|
||||
>>>
|
||||
>>> # creates a PKSampler that will get 3 samples from every class.
|
||||
>>> sampler = ds.PKSampler(3)
|
||||
>>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler)
|
||||
>>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir,
|
||||
... num_parallel_workers=8,
|
||||
... sampler=sampler)
|
||||
|
||||
Raises:
|
||||
ValueError: If num_val is not positive.
|
||||
|
@ -472,13 +468,11 @@ class RandomSampler(BuiltinSampler):
|
|||
num_samples (int, optional): Number of elements to sample (default=None, all elements).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_dir = "path/to/imagefolder_directory"
|
||||
>>>
|
||||
>>> # creates a RandomSampler
|
||||
>>> sampler = ds.RandomSampler()
|
||||
>>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler)
|
||||
>>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir,
|
||||
... num_parallel_workers=8,
|
||||
... sampler=sampler)
|
||||
|
||||
Raises:
|
||||
ValueError: If replacement is not boolean.
|
||||
|
@ -528,13 +522,11 @@ class SequentialSampler(BuiltinSampler):
|
|||
num_samples (int, optional): Number of elements to sample (default=None, all elements).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_dir = "path/to/imagefolder_directory"
|
||||
>>>
|
||||
>>> # creates a SequentialSampler
|
||||
>>> sampler = ds.SequentialSampler()
|
||||
>>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler)
|
||||
>>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir,
|
||||
... num_parallel_workers=8,
|
||||
... sampler=sampler)
|
||||
"""
|
||||
|
||||
def __init__(self, start_index=None, num_samples=None):
|
||||
|
@ -579,15 +571,13 @@ class SubsetSampler(BuiltinSampler):
|
|||
num_samples (int, optional): Number of elements to sample (default=None, all elements).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>> indices = [0, 1, 2, 3, 4, 5]
|
||||
>>>
|
||||
>>> dataset_dir = "path/to/imagefolder_directory"
|
||||
>>>
|
||||
>>> indices = [0, 1, 2, 3, 7, 88, 119]
|
||||
>>>
|
||||
>>> # creates a SubsetSampler, will sample from the provided indices
|
||||
>>> sampler = ds.SubsetSampler(indices)
|
||||
>>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler)
|
||||
>>> # creates a SubsetRandomSampler, will sample from the provided indices
|
||||
>>> sampler = ds.SubsetRandomSampler(indices)
|
||||
>>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir,
|
||||
... num_parallel_workers=8,
|
||||
... sampler=sampler)
|
||||
"""
|
||||
|
||||
def __init__(self, indices, num_samples=None):
|
||||
|
@ -679,15 +669,13 @@ class WeightedRandomSampler(BuiltinSampler):
|
|||
replacement (bool): If True, put the sample ID back for the next draw (default=True).
|
||||
|
||||
Examples:
|
||||
>>> import mindspore.dataset as ds
|
||||
>>>
|
||||
>>> dataset_dir = "path/to/imagefolder_directory"
|
||||
>>>
|
||||
>>> weights = [0.9, 0.01, 0.4, 0.8, 0.1, 0.1, 0.3]
|
||||
>>>
|
||||
>>> # creates a WeightedRandomSampler that will sample 4 elements without replacement
|
||||
>>> sampler = ds.WeightedRandomSampler(weights, 4)
|
||||
>>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler)
|
||||
>>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir,
|
||||
... num_parallel_workers=8,
|
||||
... sampler=sampler)
|
||||
|
||||
Raises:
|
||||
ValueError: If num_samples is not positive.
|
||||
|
|
|
@ -40,16 +40,13 @@ def serialize(dataset, json_filepath=""):
OSError: Cannot open a file.

Examples:
>>> import mindspore.dataset as ds
>>> import mindspore.dataset.transforms.c_transforms as C
>>> DATA_DIR = "../../data/testMnistData"
>>> data = ds.MnistDataset(DATA_DIR, 100)
>>> one_hot_encode = C.OneHot(10) # num_classes is input argument
>>> data = data.map(operations=one_hot_encode, input_columns="label")
>>> data = data.batch(batch_size=10, drop_remainder=True)
>>>
>>> ds.engine.serialize(data, json_filepath="mnist_dataset_pipeline.json") # serialize it to json file
>>> serialized_data = ds.engine.serialize(data) # serialize it to Python dict
>>> dataset = ds.MnistDataset(mnist_dataset_dir, 100)
>>> one_hot_encode = c_transforms.OneHot(10) # num_classes is input argument
>>> dataset = dataset.map(operations=one_hot_encode, input_columns="label")
>>> dataset = dataset.batch(batch_size=10, drop_remainder=True)
>>> # serialize it to json file
>>> ds.engine.serialize(dataset, json_filepath="/path/to/mnist_dataset_pipeline.json")
>>> serialized_data = ds.engine.serialize(dataset) # serialize it to Python dict
"""
return dataset.to_json(json_filepath)

@ -69,20 +66,16 @@ def deserialize(input_dict=None, json_filepath=None):
OSError: Cannot open a file.

Examples:
>>> import mindspore.dataset as ds
>>> import mindspore.dataset.transforms.c_transforms as C
>>> DATA_DIR = "../../data/testMnistData"
>>> data = ds.MnistDataset(DATA_DIR, 100)
>>> one_hot_encode = C.OneHot(10) # num_classes is input argument
>>> data = data.map(operations=one_hot_encode, input_columns="label")
>>> data = data.batch(batch_size=10, drop_remainder=True)
>>>
>>> dataset = ds.MnistDataset(mnist_dataset_dir, 100)
>>> one_hot_encode = c_transforms.OneHot(10) # num_classes is input argument
>>> dataset = dataset.map(operations=one_hot_encode, input_columns="label")
>>> dataset = dataset.batch(batch_size=10, drop_remainder=True)
>>> # Use case 1: to/from json file
>>> ds.engine.serialize(data, json_filepath="mnist_dataset_pipeline.json")
>>> data = ds.engine.deserialize(json_filepath="mnist_dataset_pipeline.json")
>>> ds.engine.serialize(dataset, json_filepath="/path/to/mnist_dataset_pipeline.json")
>>> dataset = ds.engine.deserialize(json_filepath="/path/to/mnist_dataset_pipeline.json")
>>> # Use case 2: to/from Python dictionary
>>> serialized_data = ds.engine.serialize(data)
>>> data = ds.engine.deserialize(input_dict=serialized_data)
>>> serialized_data = ds.engine.serialize(dataset)
>>> dataset = ds.engine.deserialize(input_dict=serialized_data)

"""
data = None
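Putting the two examples above together, a hedged end-to-end sketch of serializing a small pipeline and rebuilding it; the MNIST directory path is a placeholder:

import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as c_transforms

mnist_dataset_dir = "/path/to/mnist_dataset_directory"  # placeholder path
dataset = ds.MnistDataset(mnist_dataset_dir, num_samples=100)
dataset = dataset.map(operations=c_transforms.OneHot(10), input_columns="label")
dataset = dataset.batch(batch_size=10, drop_remainder=True)

# Round trip through a Python dict; the rebuilt pipeline is equivalent to the original.
serialized_data = ds.engine.serialize(dataset)
dataset_restored = ds.engine.deserialize(input_dict=serialized_data)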
@ -24,21 +24,18 @@ and use Lookup to find the index of tokens in Vocab.
class attributes (self.xxx) to support save() and load().

Examples:
>>> import mindspore.dataset as ds
>>> import mindspore.dataset.text as text
>>>
>>> dataset_file = "path/to/text_file_path"
>>> text_file_dataset_dir = "/path/to/text_file_dataset_file"
>>> # Create a dataset for text sentences saved as line data in a file
>>> data1 = ds.TextFileDataset(dataset_file, shuffle=False)
>>> text_file_dataset = ds.TextFileDataset(text_file_dataset_dir, shuffle=False)
>>> # Tokenize sentences to unicode characters
>>> tokenizer = text.UnicodeCharTokenizer()
>>> # Load vocabulary from list
>>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
>>> # Use Lookup operator to map tokens to ids
>>> lookup = text.Lookup(vocab)
>>> data1 = data1.map(operations=[tokenizer, lookup])
>>> for i in data1.create_dict_iterator():
>>> print(i)
>>> text_file_dataset = text_file_dataset.map(operations=[tokenizer, lookup])
>>> for i in text_file_dataset.create_dict_iterator():
... print(i)
>>> # if text line in dataset_file is:
>>> # 深圳欢迎您
>>> # then the output will be:
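Because the example above needs a text file on disk, here is a hedged, fully in-memory variant of the same tokenize-then-lookup flow; the sample sentence and column name are illustrative only:

import mindspore.dataset as ds
import mindspore.dataset.text as text

# One in-memory sentence instead of a text file.
text_file_dataset = ds.NumpySlicesDataset(["深圳欢迎您"], column_names=["text"], shuffle=False)
tokenizer = text.UnicodeCharTokenizer()
vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
lookup = text.Lookup(vocab)
text_file_dataset = text_file_dataset.map(operations=[tokenizer, lookup], input_columns=["text"])
for row in text_file_dataset.create_dict_iterator():
    print(row["text"])  # ids of the five characters, e.g. [0 1 2 3 4]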
@ -132,17 +129,18 @@ class JiebaTokenizer(TextTensorOperation):
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> from mindspore.dataset.text import JiebaMode
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
>>> tokenizer_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=False)
>>> data1 = data1.map(operations=tokenizer_op)
>>> jieba_hmm_file = "/path/to/jieba/hmm/file"
>>> jieba_mp_file = "/path/to/jieba/mp/file"
>>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=False)
>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
>>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
>>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
>>> output_columns=["token", "offsets_start", "offsets_limit"],
>>> column_order=["token", "offsets_start", "offsets_limit"])
... # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=True)
>>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
... output_columns=["token", "offsets_start", "offsets_limit"],
... column_order=["token", "offsets_start", "offsets_limit"])
"""

@check_jieba_init

@ -178,14 +176,16 @@ class JiebaTokenizer(TextTensorOperation):
the better chance the word will be tokenized (default=None, use default frequency).

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP)
>>> with open(VOCAB_FILE, 'r') as f:
>>> from mindspore.dataset.text import JiebaMode
>>> jieba_hmm_file = "/path/to/jieba/hmm/file"
>>> jieba_mp_file = "/path/to/jieba/mp/file"
>>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=text.JiebaMode.MP)
>>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
>>> with open(sentence_piece_vocab_file, 'r') as f:
>>> for line in f:
>>> word = line.split(',')[0]
>>> jieba_op.add_word(word)
>>> data1 = data1.map(operations=jieba_op, input_columns=["text"])
... word = line.split(',')[0]
... jieba_op.add_word(word)
>>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"])
"""

if freq is None:

@ -210,12 +210,13 @@ class JiebaTokenizer(TextTensorOperation):
word3 freq3

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> from mindspore.dataset.text import JiebaMode
>>> jieba_hmm_file = "/path/to/jieba/hmm/file"
>>> jieba_mp_file = "/path/to/jieba/mp/file"
>>> user_dict = {"男默女泪": 10}
>>> jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
>>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP)
>>> jieba_op.add_dict(user_dict)
>>> data1 = data1.map(operations=jieba_op, input_columns=["text"])
>>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"])
"""

if isinstance(user_dict, str):

@ -283,13 +284,11 @@ class Lookup(TextTensorOperation):
data_type (mindspore.dtype, optional): The data type that Lookup maps strings to (default=mstype.int32).

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> # Load vocabulary from list
>>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
>>> # Use Lookup operator to map tokens to ids
>>> lookup = text.Lookup(vocab)
>>> data1 = data1.map(operations=[lookup])
>>> text_file_dataset = text_file_dataset.map(operations=[lookup])
"""

@check_lookup

@ -323,9 +322,7 @@ class Ngram(TextTensorOperation):
(default=None, which means whitespace is used).

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> data1 = data1.map(operations=text.Ngram(3, separator=" "))
>>> text_file_dataset = text_file_dataset.map(operations=text.Ngram(3, separator=""))
"""

@check_ngram
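A hedged, self-contained illustration of what the Ngram op above produces; the tokens and column name are invented for the sketch:

import mindspore.dataset as ds
import mindspore.dataset.text as text

# One row holding three tokens; Ngram(3) joins them into a single 3-gram.
data = [["WildRose Country", "Canada's Ocean Playground", "Land of Living Skies"]]
dataset = ds.NumpySlicesDataset(data, column_names=["col"], shuffle=False)
dataset = dataset.map(operations=text.Ngram(3, separator=" "))
for row in dataset.create_dict_iterator():
    print(row["col"])  # the three tokens joined by the separator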
@ -349,11 +346,12 @@ class SentencePieceTokenizer(TextTensorOperation):
out_type (Union[str, int]): The type of output.

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
>>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType
>>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
>>> vocab = text.SentencePieceVocab.from_file([sentence_piece_vocab_file], 5000, 0.9995,
... SentencePieceModel.UNIGRAM, {})
>>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
>>> data1 = data1.map(operations=tokenizer)
>>> text_file_dataset = text_file_dataset.map(operations=tokenizer)
"""

def __init__(self, mode, out_type):

@ -390,7 +388,6 @@ class SlidingWindow(TextTensorOperation):
>>> # | [3,4,5]] |
>>> # +--------------+
"""

@check_slidingwindow
def __init__(self, width, axis=0):
self.width = width

@ -418,11 +415,11 @@ class ToNumber(TextTensorOperation):
RuntimeError: If strings are invalid to cast, or are out of range after being cast.

Examples:
>>> import mindspore.dataset.text as text
>>> import mindspore.common.dtype as mstype
>>>
>>> data = [["1", "2", "3"]]
>>> dataset = ds.NumpySlicesDataset(data)
>>> to_number_op = text.ToNumber(mstype.int8)
>>> data1 = data1.map(operations=to_number_op)
>>> dataset = dataset.map(operations=to_number_op)
"""

@check_to_number

@ -514,15 +511,15 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
>>>
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
>>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
>>> max_bytes_per_token=100, with_offsets=False)
... max_bytes_per_token=100, with_offsets=False)
>>> data1 = data1.map(operations=tokenizer_op)
>>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
>>> max_bytes_per_token=100, with_offsets=True)
... max_bytes_per_token=100, with_offsets=True)
>>> data2 = data2.map(operations=tokenizer_op,
>>> input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
>>> column_order=["token", "offsets_start", "offsets_limit"])
... input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
... column_order=["token", "offsets_start", "offsets_limit"])
"""

@check_wordpiece_tokenizer

@ -545,11 +542,9 @@ class PythonTokenizer:
tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens.

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> def my_tokenizer(line):
>>> return line.split()
>>> data1 = data1.map(operations=text.PythonTokenizer(my_tokenizer))
... return line.split()
>>> text_file_dataset = text_file_dataset.map(operations=text.PythonTokenizer(my_tokenizer))
"""

@check_python_tokenizer
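A hedged, self-contained version of the PythonTokenizer example above, run over an in-memory sentence (the sentence and column name are illustrative):

import mindspore.dataset as ds
import mindspore.dataset.text as text

def my_tokenizer(line):
    # Plain whitespace split; any callable taking a str and returning a list of str works here.
    return line.split()

dataset = ds.NumpySlicesDataset(["Welcome to Shenzhen"], column_names=["text"], shuffle=False)
dataset = dataset.map(operations=text.PythonTokenizer(my_tokenizer))
for row in dataset.create_dict_iterator():
    print(row["text"])  # ['Welcome', 'to', 'Shenzhen']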
@ -590,26 +585,27 @@ if platform.system().lower() != 'windows':
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
>>> tokenizer_op = text.BasicTokenizer(lower_case=False,
>>> keep_whitespace=False,
>>> normalization_form=NormalizeForm.NONE,
>>> preserve_unused_token=True,
>>> with_offsets=False)
>>> data1 = data1.map(operations=tokenizer_op)
... keep_whitespace=False,
... normalization_form=NormalizeForm.NONE,
... preserve_unused_token=True,
... with_offsets=False)
>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
>>> # If with_offsets=True, then output three columns {["token", dtype=str],
>>> # ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.BasicTokenizer(lower_case=False,
>>> keep_whitespace=False,
>>> normalization_form=NormalizeForm.NONE,
>>> preserve_unused_token=True,
>>> with_offsets=True)
>>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
>>> output_columns=["token", "offsets_start", "offsets_limit"],
>>> column_order=["token", "offsets_start", "offsets_limit"])
... keep_whitespace=False,
... normalization_form=NormalizeForm.NONE,
... preserve_unused_token=True,
... with_offsets=True)
>>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
... output_columns=["token", "offsets_start",
... "offsets_limit"],
... column_order=["token", "offsets_start",
... "offsets_limit"])

"""

@check_basic_tokenizer

@ -653,24 +649,32 @@ if platform.system().lower() != 'windows':
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> from mindspore.dataset.text import NormalizeForm
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
>>> vocab_list = ["床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头", "望", "低",
... "思", "故", "乡","繁", "體", "字", "嘿", "哈", "大", "笑", "嘻", "i", "am", "mak",
... "make", "small", "mistake", "##s", "during", "work", "##ing", "hour", "😀", "😃",
... "😄", "😁", "+", "/", "-", "=", "12", "28", "40", "16", " ", "I", "[CLS]", "[SEP]",
... "[UNK]", "[PAD]", "[MASK]", "[unused1]", "[unused10]"]
>>> vocab = text.Vocab.from_list(vocab_list)
>>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
>>> unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
>>> normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
>>> with_offsets=False)
>>> data1 = data1.map(operations=tokenizer_op)
... unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
... normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
... with_offsets=False)
>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
>>> # If with_offsets=True, then output three columns {["token", dtype=str],
>>> # ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
>>> unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
>>> normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
>>> with_offsets=True)
>>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
>>> output_columns=["token", "offsets_start", "offsets_limit"],
>>> column_order=["token", "offsets_start", "offsets_limit"])
... unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
... normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
... with_offsets=True)
>>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
... output_columns=["token", "offsets_start",
... "offsets_limit"],
... column_order=["token", "offsets_start",
... "offsets_limit"])

"""

@check_bert_tokenizer

@ -704,10 +708,8 @@ if platform.system().lower() != 'windows':
CaseFold is not supported on Windows platform yet.

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> case_op = text.CaseFold()
>>> data1 = data1.map(operations=case_op)
>>> text_file_dataset = text_file_dataset.map(operations=case_op)
"""

def parse(self):

@ -734,10 +736,9 @@ if platform.system().lower() != 'windows':
- NormalizeForm.NFKD, normalize with Normalization Form KD.

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> from mindspore.dataset.text import NormalizeForm
>>> normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFC)
>>> data1 = data1.map(operations=normalize_op)
>>> text_file_dataset = text_file_dataset.map(operations=normalize_op)
"""

def __init__(self, normalize_form=NormalizeForm.NFKC):

@ -767,12 +768,10 @@ if platform.system().lower() != 'windows':
if True, replace all matched elements (default=True).

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> pattern = 'Canada'
>>> replace = 'China'
>>> replace_op = text.RegexReplace(pattern, replace)
>>> data1 = data1.map(operations=replace_op)
>>> text_file_dataset = text_file_dataset.map(operations=replace_op)
"""

def __init__(self, pattern, replace, replace_all=True):

@ -802,18 +801,19 @@ if platform.system().lower() != 'windows':
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
>>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=False)
>>> data1 = data1.map(operations=tokenizer_op)
>>> delim_pattern = r"[ |,]"
>>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=False)
>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
>>> # If with_offsets=True, then output three columns {["token", dtype=str],
>>> # ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
>>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
>>> output_columns=["token", "offsets_start", "offsets_limit"],
>>> column_order=["token", "offsets_start", "offsets_limit"])
>>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=True)
>>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
... output_columns=["token", "offsets_start",
... "offsets_limit"],
... column_order=["token", "offsets_start",
... "offsets_limit"])
"""

@check_regex_tokenizer
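A hedged, self-contained sketch of the RegexTokenizer behaviour described above, splitting on spaces while keeping the delimiters (non-Windows platforms only; the sentence and column name are illustrative):

import mindspore.dataset as ds
import mindspore.dataset.text as text

dataset = ds.NumpySlicesDataset(["Welcome to Shenzhen"], column_names=["text"], shuffle=False)
# keep_delim_pattern keeps every matched delimiter as its own token.
tokenizer_op = text.RegexTokenizer(delim_pattern=" ", keep_delim_pattern=" ", with_offsets=False)
dataset = dataset.map(operations=tokenizer_op)
for row in dataset.create_dict_iterator():
    print(row["text"])  # ['Welcome', ' ', 'to', ' ', 'Shenzhen']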
@ -838,18 +838,19 @@ if platform.system().lower() != 'windows':
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
>>> tokenizer_op = text.UnicodeScriptTokenizerOp(keep_whitespace=True, with_offsets=False)
>>> data1 = data1.map(operations=tokenizer_op)
>>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False)
>>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
>>> # If with_offsets=True, then output three columns {["token", dtype=str],
>>> # ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.UnicodeScriptTokenizerOp(keep_whitespace=True, with_offsets=True)
>>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
>>> output_columns=["token", "offsets_start", "offsets_limit"],
>>> column_order=["token", "offsets_start", "offsets_limit"])
>>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
>>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
... output_columns=["token", "offsets_start",
... "offsets_limit"],
... column_order=["token", "offsets_start",
... "offsets_limit"])

"""

@check_unicode_script_tokenizer

@ -874,8 +875,6 @@ if platform.system().lower() != 'windows':
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

Examples:
>>> import mindspore.dataset.text as text
>>>
>>> # If with_offsets=False, default output one column {["text", dtype=str]}
>>> tokenizer_op = text.WhitespaceTokenizer()
>>> data1 = data1.map(operations=tokenizer_op)

@ -46,14 +46,8 @@ class OneHot(cde.OneHotOp):
RuntimeError: If feature size is bigger than num_classes.

Examples:
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>> import mindspore.dataset.vision.c_transforms as c_vision
>>>
>>> onehot_op = c_transforms.OneHot(num_classes=10)
>>> data1 = data1.map(operations=onehot_op, input_columns=["label"])
>>> mixup_batch_op = c_vision.MixUpBatch(alpha=0.8)
>>> data1 = data1.batch(4)
>>> data1 = data1.map(operations=mixup_batch_op, input_columns=["image", "label"])
>>> mnist_dataset = mnist_dataset.map(operations=onehot_op, input_columns=["label"])
"""

@check_num_classes
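A hedged, runnable sketch of the OneHot op above on a few integer labels held in memory; the labels and column name are illustrative:

import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as c_transforms

# Three scalar labels; OneHot expands each into a length-10 vector.
dataset = ds.NumpySlicesDataset([0, 3, 9], column_names=["label"], shuffle=False)
dataset = dataset.map(operations=c_transforms.OneHot(num_classes=10), input_columns=["label"])
for row in dataset.create_dict_iterator():
    print(row["label"])  # e.g. [1 0 0 0 0 0 0 0 0 0] for label 0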
@ -72,9 +66,15 @@ class Fill(cde.FillOp):
to fill created tensor with.

Examples:
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>>
>>> import numpy as np
>>> from mindspore.dataset import GeneratorDataset
>>> # Generate 1d int numpy array from 0 - 63
>>> def generator_1d():
...     for i in range(64):
...         yield (np.array([i]),)
>>> generator_dataset = GeneratorDataset(generator_1d, column_names='col')
>>> fill_op = c_transforms.Fill(3)
>>> generator_dataset = generator_dataset.map(operations=fill_op)
"""

@check_fill_value

@ -90,10 +90,16 @@ class TypeCast(cde.TypeCastOp):
data_type (mindspore.dtype): mindspore.dtype to be cast to.

Examples:
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>> import numpy as np
>>> import mindspore.common.dtype as mstype
>>>
>>> from mindspore.dataset import GeneratorDataset
>>> # Generate 1d int numpy array from 0 - 63
>>> def generator_1d():
...     for i in range(64):
...         yield (np.array([i]),)
>>> generator_dataset = GeneratorDataset(generator_1d, column_names='col')
>>> type_cast_op = c_transforms.TypeCast(mstype.int32)
>>> generator_dataset = generator_dataset.map(operations=type_cast_op)
"""

@check_de_type

@ -149,14 +155,15 @@ class Slice(cde.SliceOp):
5. :py:obj:`Ellipsis`: Slice the whole dimension. Similar to `:` in Python indexing.

Examples:
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>>
>>> # Data before
>>> # | col |
>>> # +---------+
>>> # | [1,2,3] |
>>> # +---------+
>>> data1 = data1.map(operations=c_transforms.Slice(slice(1,3))) # slice indices 1 and 2 only
>>> data = [[1, 2, 3]]
>>> numpy_slices_dataset = ds.NumpySlicesDataset(data, ["col"])
>>> # slice indices 1 and 2 only
>>> numpy_slices_dataset = numpy_slices_dataset.map(operations=c_transforms.Slice(slice(1,3)))
>>> # Data after
>>> # | col |
>>> # +---------+
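To complement the slice-object example above, a hedged sketch using an index list instead (one of the other index forms listed); the data and column name are illustrative:

import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as c_transforms

# Same [1, 2, 3] row as above; pick elements 0 and 2 by an explicit index list.
dataset = ds.NumpySlicesDataset([[1, 2, 3]], ["col"], shuffle=False)
dataset = dataset.map(operations=c_transforms.Slice([0, 2]))
for row in dataset.create_dict_iterator():
    print(row["col"])  # [1 3]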
@ -200,16 +207,17 @@ class Mask(cde.MaskOp):
dtype (mindspore.dtype, optional): Type of the generated mask (default=bool).

Examples:
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>>
>>> from mindspore.dataset.transforms.c_transforms import Relational
>>> # Data before
>>> # | col1 |
>>> # | col |
>>> # +---------+
>>> # | [1,2,3] |
>>> # +---------+
>>> data1 = data1.map(operations=c_transforms.Mask(Relational.EQ, 2))
>>> data = [[1, 2, 3]]
>>> numpy_slices_dataset = ds.NumpySlicesDataset(data, ["col"])
>>> numpy_slices_dataset = numpy_slices_dataset.map(operations=c_transforms.Mask(Relational.EQ, 2))
>>> # Data after
>>> # | col1 |
>>> # | col |
>>> # +--------------------+
>>> # | [False,True,False] |
>>> # +--------------------+

@ -233,14 +241,15 @@ class PadEnd(cde.PadEndOp):
string in case of tensors of strings.

Examples:
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>>
>>> # Data before
>>> # | col |
>>> # +---------+
>>> # | [1,2,3] |
>>> # +---------+
>>> data1 = data1.map(operations=c_transforms.PadEnd(pad_shape=[4], pad_value=10))
>>> data = [[1, 2, 3]]
>>> numpy_slices_dataset = ds.NumpySlicesDataset(data, ["col"])
>>> numpy_slices_dataset = numpy_slices_dataset.map(operations=c_transforms.PadEnd(pad_shape=[4],
... pad_value=10))
>>> # Data after
>>> # | col |
>>> # +------------+

@ -265,12 +274,14 @@ class Concatenate(cde.ConcatenateOp):
append (numpy.array, optional): NumPy array to be appended to the already concatenated tensors (Default=None).

Examples:
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>>
>>> import numpy as np
>>> # concatenate string
>>> prepend_tensor = np.array(["dw", "df"], dtype='S')
>>> append_tensor = np.array(["dwsdf", "df"], dtype='S')
>>> concatenate_op = c_transforms.Concatenate(0, prepend_tensor, append_tensor)
>>> data = [["This","is","a","string"]]
>>> dataset = ds.NumpySlicesDataset(data)
>>> dataset = dataset.map(operations=concatenate_op)
"""

@check_concat_type
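For completeness, a hedged version of the Concatenate example above with the iteration added so the prepended and appended strings are visible; the values mirror the docstring and the column name is illustrative:

import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as c_transforms

prepend_tensor = np.array(["dw", "df"], dtype='S')
append_tensor = np.array(["dwsdf", "df"], dtype='S')
concatenate_op = c_transforms.Concatenate(0, prepend_tensor, append_tensor)
dataset = ds.NumpySlicesDataset([["This", "is", "a", "string"]], column_names=["col"], shuffle=False)
dataset = dataset.map(operations=concatenate_op)
for row in dataset.create_dict_iterator():
    print(row["col"])  # ['dw' 'df' 'This' 'is' 'a' 'string' 'dwsdf' 'df']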
@ -287,15 +298,17 @@ class Duplicate(cde.DuplicateOp):
Duplicate the input tensor to a new output tensor. The input tensor is carried over to the output list.

Examples:
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>>
>>> # Data before
>>> # | x |
>>> # +---------+
>>> # | [1,2,3] |
>>> # +---------+
>>> data1 = data1.map(operations=c_transforms.Duplicate(), input_columns=["x"],
>>> output_columns=["x", "y"], column_order=["x", "y"])
>>> data = [[1,2,3]]
>>> numpy_slices_dataset = ds.NumpySlicesDataset(data, ["x"])
>>> numpy_slices_dataset = numpy_slices_dataset.map(operations=c_transforms.Duplicate(),
... input_columns=["x"],
... output_columns=["x", "y"],
... column_order=["x", "y"])
>>> # Data after
>>> # | x | y |
>>> # +---------+---------+

@ -319,15 +332,17 @@ class Unique(cde.UniqueOp):
Call batch op before calling this function.

Examples:
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>>
>>> # Data before
>>> # | x |
>>> # +--------------------+
>>> # | [[0,1,2], [1,2,3]] |
>>> # +--------------------+
>>> data1 = data1.map(operations=c_transforms.Unique(), input_columns=["x"],
>>> output_columns=["x", "y", "z"], column_order=["x", "y", "z"])
>>> data = [[[0,1,2], [1,2,3]]]
>>> dataset = ds.NumpySlicesDataset(data, ["x"])
>>> dataset = dataset.map(operations=c_transforms.Unique(),
... input_columns=["x"],
... output_columns=["x", "y", "z"],
... column_order=["x", "y", "z"])
>>> # Data after
>>> # | x | y |z |
>>> # +---------+-----------------+---------+
@ -343,11 +358,8 @@ class Compose():
transforms (list): List of transformations to be applied.

Examples:
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>> import mindspore.dataset.vision.c_transforms as c_vision
>>>
>>> compose = c_transforms.Compose([c_vision.Decode(), c_vision.RandomCrop(512)])
>>> data1 = data1.map(operations=compose)
>>> image_folder_dataset = image_folder_dataset.map(operations=compose)
"""

@check_random_transform_ops

@ -372,11 +384,8 @@ class RandomApply():
prob (float, optional): The probability to apply the transformation list (default=0.5).

Examples:
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>> import mindspore.dataset.vision.c_transforms as c_vision
>>>
>>> rand_apply = c_transforms.RandomApply([c_vision.RandomCrop(512)])
>>> data1 = data1.map(operations=rand_apply)
>>> image_folder_dataset = image_folder_dataset.map(operations=rand_apply)
"""

@check_random_transform_ops

@ -402,11 +411,8 @@ class RandomChoice():
transforms (list): List of transformations to be chosen from to apply.

Examples:
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>> import mindspore.dataset.vision.c_transforms as c_vision
>>>
>>> rand_choice = c_transforms.RandomChoice([c_vision.CenterCrop(50), c_vision.RandomCrop(512)])
>>> data1 = data1.map(operations=rand_choice)
>>> image_folder_dataset = image_folder_dataset.map(operations=rand_choice)
"""

@check_random_transform_ops

@ -31,11 +31,9 @@ class OneHotOp:
(Default=0.0 means no smoothing is applied.)

Examples:
>>> import mindspore.dataset.transforms.py_transforms as py_transforms
>>>
>>> transforms_list = [py_transforms.OneHotOp(num_classes=10, smoothing_rate=0.1)]
>>> transform = py_transforms.Compose(transforms_list)
>>> data1 = data1.map(input_columns=["label"], operations=transform())
>>> mnist_dataset = mnist_dataset.map(input_columns=["label"], operations=transform)
"""

@check_one_hot_op
@ -71,53 +69,44 @@ class Compose:
transforms (list): List of transformations to be applied.

Examples:
>>> import mindspore.dataset as ds
>>> import mindspore.dataset.vision.py_transforms as py_vision
>>> import mindspore.dataset.transforms.py_transforms as py_transforms
>>>
>>> dataset_dir = "path/to/imagefolder_directory"
>>> image_folder_dataset_dir = "/path/to/image_folder_dataset_directory"
>>> # create a dataset that reads all files in dataset_dir with 8 threads
>>> data1 = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8)
>>> image_folder_dataset = ds.ImageFolderDataset(image_folder_dataset_dir, num_parallel_workers=8)
>>> # create a list of transformations to be applied to the image data
>>> transform = py_transforms.Compose([py_vision.Decode(),
>>> py_vision.RandomHorizontalFlip(0.5),
>>> py_vision.ToTensor(),
>>> py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)),
>>> py_vision.RandomErasing()])
>>> # apply the transform to the dataset through dataset.map()
>>> data1 = data1.map(operations=transform, input_columns="image")
... py_vision.RandomHorizontalFlip(0.5),
... py_vision.ToTensor(),
... py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)),
... py_vision.RandomErasing()])
>>> # apply the transform to the dataset through dataset.map function
>>> image_folder_dataset = image_folder_dataset.map(operations=transform, input_columns=["image"])
>>>
>>> # Compose can also be invoked implicitly, by just passing in a list of ops
>>> # the above example then becomes:
>>> transform_list = [py_vision.Decode(),
>>> py_vision.RandomHorizontalFlip(0.5),
>>> py_vision.ToTensor(),
>>> py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)),
>>> py_vision.RandomErasing()]
... py_vision.RandomHorizontalFlip(0.5),
... py_vision.ToTensor(),
... py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)),
... py_vision.RandomErasing()]
>>>
>>> # apply the transform to the dataset through dataset.map()
>>> data2 = data2.map(operations=transform_list, input_columns="image")
>>> image_folder_dataset_1 = image_folder_dataset_1.map(operations=transform_list, input_columns=["image"])
>>>
>>> # Certain C++ and Python ops can be combined, but not all of them
>>> # An example of combined operations
>>> import mindspore.dataset as ds
>>> import mindspore.dataset.transforms.c_transforms as c_transforms
>>> import mindspore.dataset.vision.c_transforms as c_vision
>>>
>>> data3 = ds.NumpySlicesDataset(arr, column_names=["cols"], shuffle=False)
>>> arr = [0, 1]
>>> dataset = ds.NumpySlicesDataset(arr, column_names=["cols"], shuffle=False)
>>> transformed_list = [py_transforms.OneHotOp(2), c_transforms.Mask(c_transforms.Relational.EQ, 1)]
>>> data3 = data3.map(operations=transformed_list, input_columns=["cols"])
>>> dataset = dataset.map(operations=transformed_list, input_columns=["cols"])
>>>
>>> # Here is an example of mixing vision ops
>>> data_dir = "/path/to/imagefolder_directory"
>>> data4 = ds.ImageFolderDataset(dataset_dir=data_dir, shuffle=False)
>>> input_columns = ["column_names"]
>>> import numpy as np
>>> op_list = [c_vision.Decode(),
>>> c_vision.Resize((224, 244)),
>>> py_vision.ToPIL(),
>>> np.array, # need to convert PIL image to a NumPy array to pass it to C++ operation
>>> c_vision.Resize((24, 24))]
>>> data4 = data4.map(operations=op_list, input_columns=input_columns)
... c_vision.Resize((224, 244)),
... py_vision.ToPIL(),
... np.array, # need to convert PIL image to a NumPy array to pass it to C++ operation
... c_vision.Resize((24, 24))]
>>> image_folder_dataset = image_folder_dataset.map(operations=op_list, input_columns=["image"])
"""

@check_compose_list
@ -144,12 +133,14 @@ class RandomApply:
prob (float, optional): The probability to apply the transformation list (default=0.5).

Examples:
>>> import mindspore.dataset.vision.py_transforms as py_vision
>>> from mindspore.dataset.transforms.py_transforms import Compose, RandomApply
>>>
>>> Compose([py_vision.Decode(),
>>> py_vision.RandomApply(transforms_list, prob=0.6),
>>> py_vision.ToTensor()])
>>> transform_list = [py_vision.RandomHorizontalFlip(0.5),
... py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)),
... py_vision.RandomErasing()]
>>> transforms = Compose([py_vision.Decode(),
... RandomApply(transform_list, prob=0.6),
... py_vision.ToTensor()])
>>> image_folder_dataset = image_folder_dataset.map(operations=transforms, input_columns=["image"])
"""

@check_random_apply

@ -178,12 +169,14 @@ class RandomChoice:
transforms (list): List of transformations to be chosen from to apply.

Examples:
>>> import mindspore.dataset.vision.py_transforms as py_vision
>>> from mindspore.dataset.transforms.py_transforms import Compose, RandomChoice
>>>
>>> Compose([py_vision.Decode(),
>>> RandomChoice(transforms_list),
>>> py_vision.ToTensor()])
>>> transform_list = [py_vision.RandomHorizontalFlip(0.5),
... py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)),
... py_vision.RandomErasing()]
>>> transforms = Compose([py_vision.Decode(),
... RandomChoice(transform_list),
... py_vision.ToTensor()])
>>> image_folder_dataset = image_folder_dataset.map(operations=transforms, input_columns=["image"])
"""

@check_transforms_list

@ -211,12 +204,14 @@ class RandomOrder:
transforms (list): List of the transformations to apply.

Examples:
>>> import mindspore.dataset.vision.py_transforms as py_vision
>>> from mindspore.dataset.transforms.py_transforms import Compose, RandomOrder
>>>
>>> Compose([py_vision.Decode(),
>>> py_vision.RandomOrder(transforms_list),
>>> py_vision.ToTensor()])
>>> transform_list = [py_vision.RandomHorizontalFlip(0.5),
... py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)),
... py_vision.RandomErasing()]
>>> transforms = Compose([py_vision.Decode(),
... RandomOrder(transform_list),
... py_vision.ToTensor()])
>>> image_folder_dataset = image_folder_dataset.map(operations=transforms, input_columns=["image"])
"""

@check_transforms_list