diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py
index aa895e70b83..956f67e8c80 100644
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@@ -266,9 +266,9 @@ class Dataset:
                 be dropped and not propagated to the child node.
             num_parallel_workers (int, optional): Number of workers to process the Dataset in parallel (default=None).
             per_batch_map (callable, optional): Per batch map callable. A callable which takes
-                (list[Tensor], list[Tensor], ..., BatchInfo) as input parameters. Each list[Tensor] represent a batch of
-                Tensors on a given column. The number of lists should match with number of entries in input_columns. The
-                last parameter of the callable should always be a BatchInfo object.
+                (list[Tensor], list[Tensor], ..., BatchInfo) as input parameters. Each list[Tensor] represents a batch
+                of Tensors on a given column. The number of lists should match the number of entries in input_columns.
+                The last parameter of the callable should always be a BatchInfo object.
             input_columns (list[str], optional): List of names of the input columns. The size of the list should match
                 with signature of per_batch_map callable.
             pad_info (dict, optional): Whether to perform padding on selected columns. pad_info={"col1":([224,224],0)}
@@ -2734,7 +2734,7 @@ class MnistDataset(MappableDataset):
         num_samples (int, optional): The number of images to be included in the dataset
             (default=None, all images).
         num_parallel_workers (int, optional): Number of workers to read the data
-            (default=value, set in the config).
+            (default=None, set in the config).
         shuffle (bool, optional): Whether or not to perform shuffle on the dataset
             (default=None, expected order behavior shown in the table).
         sampler (Sampler, optional): Object used to choose samples from the
@@ -2811,7 +2811,7 @@ class MnistDataset(MappableDataset):

 class MindDataset(MappableDataset):
     """
-    A source dataset that reads from shard files and database.
+    A source dataset that reads MindRecord files.

     Args:
         dataset_file (Union[str, list[str]]): One of file names or file list in dataset.
@@ -3138,7 +3138,7 @@ class _GeneratorWorker(multiprocessing.Process):

 class GeneratorDataset(MappableDataset):
     """
-    A source dataset that generate data from python by invoking python data source each epoch.
+    A source dataset that generates data from Python by invoking a Python data source each epoch.

     This dataset can take in a sampler. sampler and shuffle are mutually exclusive. Table
     below shows what input args are allowed and their expected behavior.
@@ -3351,7 +3351,7 @@ class TFRecordDataset(SourceDataset):

     Args:
         dataset_files (Union[str, list[str]]): String or list of files to be read or glob strings to search for a
-            pattern of files. The list will be sorted in a lexicographical order.
+            pattern of files. The list will be sorted in lexicographical order.
         schema (Union[str, Schema], optional): Path to the json schema file or schema object (default=None).
             If the schema is not provided, the meta data from the TFData file is considered the schema.
         columns_list (list[str], optional): List of columns to be read (default=None, read all columns)
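Reviewer note, not part of the patch: a minimal sketch of the per_batch_map contract the reworded docstring describes. The toy generator, the "data" column name, and the doubling function are assumptions for illustration only:

    import numpy as np
    import mindspore.dataset as ds

    # A toy generator; the single "data" column is our choice, not the patch's.
    def gen():
        for i in range(8):
            yield (np.array([i], dtype=np.int32),)

    # Receives one list[Tensor] per entry in input_columns plus a trailing
    # BatchInfo, and returns the transformed batches as a tuple of lists.
    def double_batch(col, batch_info):
        return ([2 * row for row in col],)

    data = ds.GeneratorDataset(gen, column_names=["data"])
    data = data.batch(batch_size=2, input_columns=["data"], per_batch_map=double_batch)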
@@ -3534,7 +3534,7 @@ class ManifestDataset(MappableDataset):

     Args:
         dataset_file (str): File to be read.
-        usage (str, optional): Need train, eval or inference data (default="train").
+        usage (str, optional): Acceptable usages include "train", "eval" and "inference" (default="train").
         num_samples (int, optional): The number of images to be included in the dataset
             (default=None, all images).
         num_parallel_workers (int, optional): Number of workers to read the data
diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py
index ec1120e4d36..09fbba90219 100644
--- a/mindspore/dataset/text/transforms.py
+++ b/mindspore/dataset/text/transforms.py
@@ -72,6 +72,7 @@ class Lookup(cde.LookupOp):
     def __init__(self, vocab, unknown_token=None):
         super().__init__(vocab, unknown_token)

+
 class SlidingWindow(cde.SlidingWindowOp):
     """
     TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension axis
@@ -101,6 +102,7 @@ class SlidingWindow(cde.SlidingWindowOp):
     def __init__(self, width, axis=0):
         super().__init__(width=width, axis=axis)

+
 class Ngram(cde.NgramOp):
     """
     TensorOp to generate n-gram from a 1-D string Tensor.
@@ -511,8 +513,8 @@ if platform.system().lower() != 'windows':
             on input text to make the text to lower case and strip accents characters; If False, only apply
             NormalizeUTF8('normalization_form' mode) operation on input text(default=False).
         keep_whitespace(bool, optional): If True, the whitespace will be kept in out tokens(default=False).
-        normalization_form(NormalizeForm, optional): Used to specify a specific normlaize mode,
-            only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE').
+        normalization_form(NormalizeForm, optional): Used to specify a specific normalization mode,
+            only effective when 'lower_case' is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
         preserve_unused_token(bool, optional): If True, do not split special tokens like '[CLS]', '[SEP]',
             '[UNK]', '[PAD]', '[MASK]'(default=True).
         with_offsets (bool, optional): If or not output offsets of tokens (default=False).
diff --git a/mindspore/dataset/text/utils.py b/mindspore/dataset/text/utils.py
index c6bfe4f14c9..22328ad5432 100644
--- a/mindspore/dataset/text/utils.py
+++ b/mindspore/dataset/text/utils.py
@@ -132,12 +132,13 @@ class Vocab(cde.Vocab):
         Build a vocab object from a dict.

         Args:
-            word_dict (dict): dict contains word, id pairs where word should be str and id int. id is recommended to
-                start from 0 and be continuous. ValueError will be raised if id is negative.
+            word_dict (dict): dict contains word and id pairs, where word should be str and id should be int. id is
+                recommended to start from 0 and be continuous. ValueError will be raised if id is negative.
         """

         return super().from_dict(word_dict)

+
 class SentencePieceVocab(cde.SentencePieceVocab):
     """
     SentencePiece obiect that is used to segmentate words
@@ -151,9 +152,9 @@ class SentencePieceVocab(cde.SentencePieceVocab):

         Args:
             dataset(Dataset): Dataset to build sentencepiece.
             col_names(list): The list of the col name.
-            vocab_size(int): Vocabulary size, the type of uint32_t.
+            vocab_size(int): Vocabulary size.
             character_coverage(float): Amount of characters covered by the model, good defaults are: 0.9995 for
-                languages. with rich character set like Japanse or Chinese and 1.0 for other languages with small
+                languages with rich character sets like Japanese or Chinese, and 1.0 for other languages with small
                 character set.
             model_type(SentencePieceModel): Choose from unigram (default), bpe, char, or word. The input sentence
                 must be pretokenized when using word type.
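Reviewer note, not part of the patch: the corrected from_dict wording in action, paired with the Lookup op touched earlier in this patch. The words and ids below are made up:

    import mindspore.dataset.text as text

    # ids must be non-negative; starting at 0 and staying contiguous is recommended.
    vocab = text.Vocab.from_dict({"home": 0, "behind": 1, "the": 2, "world": 3, "<unk>": 4})

    # Lookup maps each token to its id; unknown tokens resolve to unknown_token's id.
    lookup = text.Lookup(vocab, unknown_token="<unk>")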
@@ -261,6 +262,7 @@ class NormalizeForm(IntEnum):
     NFD = 3
     NFKD = 4

+
 class SentencePieceModel(IntEnum):
     """An enumeration for SentencePieceModel, effective enumeration types are UNIGRAM, BPE, CHAR, WORD."""
     UNIGRAM = 0
@@ -275,11 +277,13 @@ DE_C_INTER_SENTENCEPIECE_MODE = {
     SentencePieceModel.WORD: cde.SentencePieceModel.DE_SENTENCE_PIECE_WORD
 }

+
 class SPieceTokenizerOutType(IntEnum):
     """An enumeration for SPieceTokenizerOutType, effective enumeration types are STRING, INT."""
     STRING = 0
     INT = 1

+
 class SPieceTokenizerLoadType(IntEnum):
     """An enumeration for SPieceTokenizerLoadType, effective enumeration types are FILE, MODEL."""
     FILE = 0
diff --git a/mindspore/dataset/transforms/c_transforms.py b/mindspore/dataset/transforms/c_transforms.py
index d00387a36e1..f4b7cde1faa 100644
--- a/mindspore/dataset/transforms/c_transforms.py
+++ b/mindspore/dataset/transforms/c_transforms.py
@@ -204,7 +204,7 @@ class Concatenate(cde.ConcatenateOp):
     Tensor operation that concatenates all columns into a single tensor.

     Args:
-        axis (int, optional): axis to concatenate the tensors along (Default=0).
+        axis (int, optional): Concatenate the tensors along the given axis (default=0).
         prepend (numpy.array, optional): numpy array to be prepended to the already concatenated tensors (Default=None).
         append (numpy.array, optional): numpy array to be appended to the already concatenated tensors (Default=None).
     """
diff --git a/mindspore/dataset/transforms/vision/c_transforms.py b/mindspore/dataset/transforms/vision/c_transforms.py
index 25d291a21fc..d5a2cd23c87 100644
--- a/mindspore/dataset/transforms/vision/c_transforms.py
+++ b/mindspore/dataset/transforms/vision/c_transforms.py
@@ -188,8 +188,8 @@ class Normalize(cde.NormalizeOp):
     Normalize the input image with respect to mean and standard deviation.

     Args:
-        mean (sequence): List or tuple of mean values for each channel, w.r.t channel order.
-        std (sequence): List or tuple of standard deviations for each channel, w.r.t. channel order.
+        mean (sequence): List or tuple of mean values for each channel, with respect to channel order.
+        std (sequence): List or tuple of standard deviations for each channel, with respect to channel order.
     """

     @check_normalize_c
diff --git a/mindspore/mindrecord/filereader.py b/mindspore/mindrecord/filereader.py
index ba48fb8cc5e..c97bbd687d0 100644
--- a/mindspore/mindrecord/filereader.py
+++ b/mindspore/mindrecord/filereader.py
@@ -23,6 +23,7 @@ from .common.exceptions import ParamValueError, ParamTypeError

 __all__ = ['FileReader']

+
 class FileReader:
     """
     Class to read MindRecord File series.
@@ -31,7 +32,7 @@ class FileReader:
         file_name (str, list[str]): One of MindRecord File or file list.
         num_consumer(int, optional): Number of consumer threads which load data to memory (default=4).
             It should not be smaller than 1 or larger than the number of CPU.
-        columns (list[str], optional): List of fields which correspond data would be read (default=None).
+        columns (list[str], optional): List of fields whose corresponding data would be read (default=None).
         operator(int, optional): Reserved parameter for operators (default=None).

     Raises:
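Reviewer note, not part of the patch: a usage sketch for the retouched Normalize docstring. The mean/std values are placeholders, not recommendations; they are per-channel and follow the channel order of the decoded image:

    import mindspore.dataset.transforms.vision.c_transforms as c_vision

    # Per-channel mean and std, in the channel order of the decoded image.
    normalize = c_vision.Normalize(mean=[121.0, 115.0, 100.0], std=[70.0, 68.0, 71.0])

    # Typically chained after decode, e.g.:
    # data = data.map(input_columns=["image"], operations=[c_vision.Decode(), normalize])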
diff --git a/mindspore/mindrecord/filewriter.py b/mindspore/mindrecord/filewriter.py
index f8b5448c239..fd3a3f188f5 100644
--- a/mindspore/mindrecord/filewriter.py
+++ b/mindspore/mindrecord/filewriter.py
@@ -275,7 +275,7 @@ class FileWriter:

     def commit(self):
         """
-        Flush data to disk and generate the correspond db files.
+        Flush data to disk and generate the corresponding db files.

         Returns:
             MSRStatus, SUCCESS or FAILED.
diff --git a/mindspore/mindrecord/tools/imagenet_to_mr.py b/mindspore/mindrecord/tools/imagenet_to_mr.py
index 59695c87340..9c1326ee6dd 100644
--- a/mindspore/mindrecord/tools/imagenet_to_mr.py
+++ b/mindspore/mindrecord/tools/imagenet_to_mr.py
@@ -25,12 +25,13 @@ from ..shardutils import check_filename

 __all__ = ['ImageNetToMR']

+
 class ImageNetToMR:
     """
     Class is for transformation from imagenet to MindRecord.

     Args:
-        map_file (str): the map file which indicate label.
+        map_file (str): the map file which indicates labels.
         the map file content should like this:

         .. code-block::
diff --git a/mindspore/mindrecord/tools/mnist_to_mr.py b/mindspore/mindrecord/tools/mnist_to_mr.py
index 046788535dc..3de6fb94125 100644
--- a/mindspore/mindrecord/tools/mnist_to_mr.py
+++ b/mindspore/mindrecord/tools/mnist_to_mr.py
@@ -37,7 +37,7 @@ class MnistToMR:
     Class is for transformation from Mnist to MindRecord.

     Args:
-        source (str): directory which contain t10k-images-idx3-ubyte.gz,
+        source (str): directory which contains t10k-images-idx3-ubyte.gz,
             train-images-idx3-ubyte.gz, t10k-labels-idx1-ubyte.gz,
             train-labels-idx1-ubyte.gz.
         destination (str): the MindRecord file directory to transform into.
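Reviewer note, not part of the patch: a round trip through the mindrecord APIs touched above, ending in the commit() whose docstring was corrected. The file name, schema, and records are illustrative only:

    from mindspore.mindrecord import FileWriter, FileReader

    writer = FileWriter(file_name="toy.mindrecord", shard_num=1)
    writer.add_schema({"label": {"type": "int32"}}, "toy schema")
    writer.write_raw_data([{"label": 0}, {"label": 1}])
    writer.commit()  # flush data and generate the corresponding db files

    reader = FileReader(file_name="toy.mindrecord")
    for index, fields in enumerate(reader.get_next()):
        print(index, fields)
    reader.close()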