forked from mindspore-Ecosystem/mindspore
!19765 Fix docs issues
Merge pull request !19765 from luoyang/code_docs_python-doc
commit f86644358e
@@ -117,7 +117,6 @@ class BertTokenizer final : public TensorTransform {
 };
 
 /// \brief Apply case fold operation on UTF-8 string tensors.
-/// \return Shared pointer to the current TensorOperation.
 class CaseFold final : public TensorTransform {
  public:
   /// \brief Constructor.
@@ -142,7 +141,8 @@ class JiebaTokenizer final : public TensorTransform {
   ///    official website of cppjieba (https://github.com/yanyiwu/cppjieba).
   /// \param[in] mp_path Dictionary file is used by the MPSegment algorithm. The dictionary can be obtained on the
   ///    official website of cppjieba (https://github.com/yanyiwu/cppjieba).
-  /// \param[in] mode Valid values can be any of JiebaMode.MP, JiebaMode.HMM and JiebaMode.MIX (default=JiebaMode.MIX).
+  /// \param[in] mode Valid values can be any of JiebaMode.kMP, JiebaMode.kHMM and JiebaMode.kMIX
+  ///    (default=JiebaMode.kMIX).
   ///   - JiebaMode.kMP, tokenizes with MPSegment algorithm.
   ///   - JiebaMode.kHMM, tokenizes with Hidden Markov Model Segment algorithm.
   ///   - JiebaMode.kMIX, tokenizes with a mix of MPSegment and HMMSegment algorithms.
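For orientation, the same three modes are exposed through the Python text API; a minimal hedged sketch (not part of this diff, dictionary paths are placeholders for the cppjieba files mentioned above):

import mindspore.dataset.text as text
from mindspore.dataset.text import JiebaMode

# Placeholder paths; the HMM and MP dictionaries come from the cppjieba project.
HMM_FILE = "/path/to/hmm_model.utf8"
MP_FILE = "/path/to/jieba.dict.utf8"

# MIX (the default) combines the MPSegment and HMMSegment algorithms.
tokenizer = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX)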
@@ -248,7 +248,7 @@ class Ngram final : public TensorTransform {
   /// \param[in] left_pad {"pad_token", pad_width}. Padding performed on left side of the sequence. pad_width will
   ///    be capped at n-1. left_pad=("_",2) would pad the left side of the sequence with "__" (default={"", 0}}).
   /// \param[in] right_pad {"pad_token", pad_width}. Padding performed on right side of the sequence.pad_width will
-  ///    be capped at n-1. right_pad=("-":2) would pad the right side of the sequence with "--" (default={"", 0}}).
+  ///    be capped at n-1. right_pad=("-",2) would pad the right side of the sequence with "--" (default={"", 0}}).
   /// \param[in] separator Symbol used to join strings together (default=" ").
   explicit Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad = {"", 0},
                  const std::pair<std::string, int32_t> &right_pad = {"", 0}, const std::string &separator = " ")
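For readers mapping this onto the Python API, a short hedged sketch of the equivalent mindspore.dataset.text.Ngram call (argument values are illustrative only, not taken from this commit):

import mindspore.dataset.text as text

# Build 2-grams; pad one "_" token on the left and one "-" token on the right,
# and join the tokens of each n-gram with a single space.
ngram_op = text.Ngram([2], left_pad=("_", 1), right_pad=("-", 1), separator=" ")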
@@ -276,14 +276,13 @@ class NormalizeUTF8 final : public TensorTransform {
  public:
   /// \brief Constructor.
   /// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone,NormalizeForm::kNfc,
-  ///    NormalizeForm::kNfkc,
-  ///    NormalizeForm::kNfd, NormalizeForm::kNfkd](default=NormalizeForm::kNfkc).
+  ///    NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd](default=NormalizeForm::kNfkc).
   ///    See http://unicode.org/reports/tr15/ for details.
-  ///   - NormalizeForm.NONE, remain the input string tensor unchanged.
-  ///   - NormalizeForm.NFC, normalizes with Normalization Form C.
-  ///   - NormalizeForm.NFKC, normalizes with Normalization Form KC.
-  ///   - NormalizeForm.NFD, normalizes with Normalization Form D.
-  ///   - NormalizeForm.NFKD, normalizes with Normalization Form KD.
+  ///   - NormalizeForm.kNone, remain the input string tensor unchanged.
+  ///   - NormalizeForm.kNfc, normalizes with Normalization Form C.
+  ///   - NormalizeForm.kNfkc, normalizes with Normalization Form KC.
+  ///   - NormalizeForm.kNfd, normalizes with Normalization Form D.
+  ///   - NormalizeForm.kNfkd, normalizes with Normalization Form KD.
   explicit NormalizeUTF8(NormalizeForm normalize_form = NormalizeForm::kNfkc);
 
   /// \brief Destructor
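In the Python API the same forms are spelled as NormalizeForm members; a brief sketch, with NFKC shown because it matches the documented default above:

import mindspore.dataset.text as text
from mindspore.dataset.text import NormalizeForm

# NFKC corresponds to NormalizeForm::kNfkc, the default in the C++ signature above.
normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFKC)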
@@ -79,7 +79,6 @@ def set_seed(seed):
     If the seed is set, the generated random number will be fixed, this helps to
     produce deterministic results.
 
-
     Note:
         This set_seed function sets the seed in the Python random library and numpy.random library
        for deterministic Python augmentations using randomness. This set_seed function should
@@ -113,6 +112,11 @@ def get_seed():
 
     Returns:
         int, random number seed.
+
+    Examples:
+        >>> # Get the global configuration of seed.
+        >>> # If set_seed() is never called before, the default value(std::mt19937::default_seed) will be returned.
+        >>> seed = ds.config.get_seed()
     """
     return _config.get_seed()
 
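Taken together with set_seed, a minimal sketch of how the seed configuration is typically exercised (the value 1234 is arbitrary, not from this commit):

import mindspore.dataset as ds

ds.config.set_seed(1234)             # fix shuffling and random augmentations for reproducibility
assert ds.config.get_seed() == 1234  # reads back the global seed set above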
@@ -147,6 +151,11 @@ def get_prefetch_size():
 
     Returns:
         int, total number of rows to be prefetched.
+
+    Examples:
+        >>> # Get the global configuration of prefetch size.
+        >>> # If set_prefetch_size() is never called before, the default value(20) will be returned.
+        >>> prefetch_size = ds.config.get_prefetch_size()
     """
     return _config.get_op_connector_size()
 
@@ -174,12 +183,17 @@ def set_num_parallel_workers(num):
 
 def get_num_parallel_workers():
     """
-    Get the default number of parallel workers.
+    Get the global configuration of number of parallel workers.
+    This is the DEFAULT num_parallel_workers value used for each operation, it is not related
+    to AutoNumWorker feature.
 
     Returns:
         int, number of parallel workers to be used as a default for each operation.
+
+    Examples:
+        >>> # Get the global configuration of parallel workers.
+        >>> # If set_num_parallel_workers() is never called before, the default value(8) will be returned.
+        >>> num_parallel_workers = ds.config.get_num_parallel_workers()
     """
     return _config.get_num_parallel_workers()
 
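A hedged sketch of how the global default interacts with a single operation (8 and 4 are illustrative values; the map override shown in the comment is standard Dataset.map usage, not part of this diff):

import mindspore.dataset as ds

ds.config.set_num_parallel_workers(8)           # default used when an op does not set its own value
workers = ds.config.get_num_parallel_workers()

# An individual op can still override the global default, e.g.
# dataset = dataset.map(operations=my_op, num_parallel_workers=4)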
@@ -206,11 +220,15 @@ def set_numa_enable(numa_enable):
 
 def get_numa_enable():
     """
-    Get the default state of numa enabled.
+    Get the state of numa to indicate enabled/disabled.
+    This is the DEFAULT numa enabled value used for the all process.
 
     Returns:
         bool, the default state of numa enabled.
+
+    Examples:
+        >>> # Get the global configuration of numa.
+        >>> numa_state = ds.config.get_numa_enable()
     """
     return _config.get_numa_enable()
 
@@ -236,10 +254,15 @@ def set_monitor_sampling_interval(interval):
 
 def get_monitor_sampling_interval():
     """
-    Get the default interval of performance monitor sampling.
+    Get the global configuration of sampling interval of performance monitor.
 
     Returns:
         int, interval (in milliseconds) for performance monitor sampling.
+
+    Examples:
+        >>> # Get the global configuration of monitor sampling interval.
+        >>> # If set_monitor_sampling_interval() is never called before, the default value(1000) will be returned.
+        >>> ds.config.get_monitor_sampling_interval()
     """
     return _config.get_monitor_sampling_interval()
 
@@ -299,9 +322,10 @@ def get_auto_num_workers():
     Get the setting (turned on or off) automatic number of workers.
 
     Returns:
-        bool, whether auto num worker feature is turned on.
+        bool, whether auto number worker feature is turned on.
 
     Examples:
+        >>> # Get the global configuration of auto number worker feature.
         >>> num_workers = ds.config.get_auto_num_workers()
     """
     return _config.get_auto_num_workers()
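A short sketch of turning the feature on before reading it back; it assumes the matching setter in the same config module:

import mindspore.dataset as ds

ds.config.set_auto_num_workers(True)        # let the pipeline pick per-op worker counts automatically
enabled = ds.config.get_auto_num_workers()  # True after the call above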
|
@ -334,6 +358,11 @@ def get_callback_timeout():
|
|||
|
||||
Returns:
|
||||
int, Timeout (in seconds) to be used to end the wait in DSWaitedCallback in case of a deadlock.
|
||||
|
||||
Examples:
|
||||
>>> # Get the global configuration of callback timeout.
|
||||
>>> # If set_callback_timeout() is never called before, the default value(60) will be returned.
|
||||
>>> ds.config.get_callback_timeout()
|
||||
"""
|
||||
return _config.get_callback_timeout()
|
||||
|
||||
|
@@ -394,6 +423,10 @@ def get_enable_shared_mem():
 
     Returns:
         bool, the state of shared mem enabled variable (default=True).
+
+    Examples:
+        >>> # Get the flag of shared memory feature.
+        >>> shared_mem_flag = ds.config.get_enable_shared_mem()
     """
     return _config.get_enable_shared_mem()
 
@@ -410,12 +443,14 @@ def set_enable_shared_mem(enable):
         TypeError: If enable is not a boolean data type.
 
+    Examples:
+        >>> # Enable shared memory feature to improve the performance of Python multiprocessing.
+        >>> ds.config.set_enable_shared_mem(True)
     """
     if not isinstance(enable, bool):
         raise TypeError("enable must be of type bool.")
     _config.set_enable_shared_mem(enable)
 
 
 def set_sending_batches(batch_num):
     """
     Set the default sending batches when training with sink_mode=True in Ascend device.
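A hedged sketch of where the shared-memory flag matters: it affects operations that run Python callables in worker processes (python_multiprocessing=True). Paths and the identity transform are placeholders, not part of this diff:

import mindspore.dataset as ds

def _identity(x):
    # Trivial per-row transform; stands in for a real Python augmentation.
    return x

ds.config.set_enable_shared_mem(True)   # pass large rows between worker processes via shared memory

dataset = ds.ImageFolderDataset("/path/to/images")
dataset = dataset.map(operations=[_identity], input_columns=["image"],
                      num_parallel_workers=4, python_multiprocessing=True)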
@@ -334,7 +334,7 @@ class Dataset:
         Serialize a pipeline into JSON string and dump into file if filename is provided.
 
         Args:
-            filename (str): filename of json file to be saved as
+            filename (str): filename of JSON file to be saved as.
 
         Returns:
             str, JSON string of the pipeline.
@@ -1511,7 +1511,7 @@ class Dataset:
 
     def get_col_names(self):
         """
-        Renturn the names of the columns in dataset.
+        Return the names of the columns in dataset.
 
         Returns:
             list, list of column names in the dataset.
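A small sketch exercising the two Dataset methods touched above (the MNIST path is a placeholder):

import mindspore.dataset as ds

dataset = ds.MnistDataset("/path/to/mnist")
print(dataset.get_col_names())                      # e.g. ['image', 'label']
json_str = dataset.to_json("mnist_pipeline.json")   # returns the JSON string and writes the file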
@@ -1582,7 +1582,7 @@ class Dataset:
 
     def dynamic_min_max_shapes(self):
         """
-        Get minimum and maximum data length of dynamic source data, for graph compilation of ME.
+        Get minimum and maximum data length of dynamic source data, for dynamic graph compilation.
 
         Returns:
             lists, min_shapes, max_shapes of source data.
@@ -2187,7 +2187,7 @@ class BatchDataset(Dataset):
             self.per_batch_map = _PythonCallable(self.per_batch_map, idx, self.process_pool, arg_q_list, res_q_list)
             self.hook = _ExceptHookHandler()
             atexit.register(_mp_pool_exit_preprocess)
-            # If python version greater than 3.8, we need to close ThreadPool in atexit for unclean pool teardown.
+            # If Python version greater than 3.8, we need to close ThreadPool in atexit for unclean pool teardown.
             if sys.version_info >= (3, 8):
                 atexit.register(self.process_pool.close)
             else:
@@ -2682,7 +2682,7 @@ class MapDataset(Dataset):
             self.operations = iter_specific_operations
             self.hook = _ExceptHookHandler()
             atexit.register(_mp_pool_exit_preprocess)
-            # If python version greater than 3.8, we need to close ThreadPool in atexit for unclean pool teardown.
+            # If Python version greater than 3.8, we need to close ThreadPool in atexit for unclean pool teardown.
             if sys.version_info >= (3, 8):
                 atexit.register(self.process_pool.close)
 
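The comment fixed in the two hunks above describes a general pattern; a standalone hedged sketch of it, using only the standard library and independent of the MindSpore internals:

import atexit
import sys
from multiprocessing.pool import ThreadPool

pool = ThreadPool(4)

if sys.version_info >= (3, 8):
    # On 3.8+ close the pool explicitly at interpreter exit to avoid an unclean teardown.
    atexit.register(pool.close)
else:
    # Older interpreters are handled differently; terminating at exit is one option.
    atexit.register(pool.terminate)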
@@ -3002,7 +3002,7 @@ class TransferDataset(Dataset):
         input_dataset (Dataset): Input Dataset to be transferred.
         send_epoch_end (bool, optional): Whether to send end of sequence to device or not (default=True).
         create_data_info_queue (bool, optional): Whether to create queue which stores
-            types and shapes of data or not(default=False).
+            types and shapes of data or not (default=False).
 
     Raises:
         TypeError: If device_type is empty.
@@ -4798,12 +4798,12 @@ class VOCDataset(MappableDataset):
         title = {The Pascal Visual Object Classes (VOC) Challenge},
         journal = {International Journal of Computer Vision},
         volume = {88},
-        year = {2010},
+        year = {2012},
         number = {2},
         month = {jun},
         pages = {303--338},
         biburl = {http://host.robots.ox.ac.uk/pascal/VOC/pubs/everingham10.html#bibtex},
-        howpublished = {http://host.robots.ox.ac.uk/pascal/VOC/voc{year}/index.html}
+        howpublished = {http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html}
         }
         """
 
@@ -4959,10 +4959,11 @@ class CocoDataset(MappableDataset):
 
     About COCO dataset:
 
-    COCO is a large-scale object detection, segmentation, and captioning dataset.
-    It contains 91 common object categories with 82 of them having more than 5,000
-    labeled instances. In contrast to the popular ImageNet dataset, COCO has fewer
-    categories but more instances per category.
+    COCO(Microsoft Common Objects in Context) is a large-scale object detection, segmentation, and captioning dataset
+    with several features: Object segmentation, Recognition in context, Superpixel stuff segmentation,
+    330K images (>200K labeled), 1.5 million object instances, 80 object categories, 91 stuff categories,
+    5 captions per image, 250,000 people with keypoints. In contrast to the popular ImageNet dataset, COCO has fewer
+    categories but more instances in per category.
 
     You can unzip the original COCO-2017 dataset files into this directory structure and read by MindSpore's API.
 
@@ -5304,7 +5305,7 @@ class CLUEDataset(SourceDataset):
 
     About CLUE dataset:
 
-    CLUE, a Chinese Language Understanding Evaluation benchmark. It contains eight different
+    CLUE, a Chinese Language Understanding Evaluation benchmark. It contains multiple
     tasks, including single-sentence classification, sentence pair classification, and machine
     reading comprehension.
 
@@ -14,7 +14,7 @@
 # ==============================================================================
 """
 This dataset module creates an internal queue class to more optimally pass data
-between multiple processes in python. It has same API as multiprocessing.queue
+between multiple processes in Python. It has same API as multiprocessing.queue
 but it will pass large data through shared memory.
 """
 
|
|
@ -121,29 +121,29 @@ class BuiltinSampler:
|
|||
self.child_sampler = sampler
|
||||
|
||||
def get_child(self):
|
||||
""" add a child sampler. """
|
||||
""" Get the child sampler. """
|
||||
return self.child_sampler
|
||||
|
||||
def parse_child(self):
|
||||
"""Parse the child sampler."""
|
||||
""" Parse the child sampler. """
|
||||
c_child_sampler = None
|
||||
if self.child_sampler is not None:
|
||||
c_child_sampler = self.child_sampler.parse()
|
||||
return c_child_sampler
|
||||
|
||||
def parse_child_for_minddataset(self):
|
||||
"""Parse the child sampler for MindRecord."""
|
||||
""" Parse the child sampler for MindRecord. """
|
||||
c_child_sampler = None
|
||||
if self.child_sampler is not None:
|
||||
c_child_sampler = self.child_sampler.parse_for_minddataset()
|
||||
return c_child_sampler
|
||||
|
||||
def is_shuffled(self):
|
||||
""" not implemented """
|
||||
""" Not implemented. """
|
||||
raise NotImplementedError("Sampler must implement is_shuffled.")
|
||||
|
||||
def is_sharded(self):
|
||||
""" not implemented """
|
||||
""" Not implemented. """
|
||||
raise NotImplementedError("Sampler must implement is_sharded.")
|
||||
|
||||
def get_num_samples(self):
|
||||
|
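The get_child/parse_child pair above supports sampler chaining; a hedged sketch of how a user wires this up through the public add_child method (sample counts are illustrative):

import mindspore.dataset as ds

parent = ds.SequentialSampler(start_index=0, num_samples=100)
child = ds.RandomSampler(replacement=False)
parent.add_child(child)                 # the parent draws from the child's output
assert parent.get_child() is child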
@@ -313,8 +313,10 @@ class DistributedSampler(BuiltinSampler):
         shard_id (int): Shard ID of the current shard, which should within the range of [0, num_shards-1].
         shuffle (bool, optional): If True, the indices are shuffled, otherwise it will not be shuffled(default=True).
         num_samples (int, optional): The number of samples to draw (default=None, which means sample all elements).
-        offset(int, optional): The starting shard ID where the elements in the dataset are sent to (default=-1), which
-            should be no more than num_shards.
+        offset(int, optional): The starting shard ID where the elements in the dataset are sent to, which
+            should be no more than num_shards. This parameter is only valid when a ConcatDataset takes
+            a DistributedSampler as its sampler. It will affect the number of samples of per shard
+            (default=-1, which means each shard has same number of samples).
 
     Examples:
         >>> # creates a distributed sampler with 10 shards in total. This shard is shard 5.
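A hedged sketch of the ConcatDataset case the new offset wording describes (dataset paths and shard numbers are placeholders, and the behavior summarized in the comment follows the docstring above):

import mindspore.dataset as ds

data_a = ds.ImageFolderDataset("/path/to/part_a")
data_b = ds.ImageFolderDataset("/path/to/part_b")
concatenated = data_a + data_b          # '+' builds a ConcatDataset

# offset=1: dispatch rows of the concatenated dataset starting from shard 1,
# which changes how many samples each of the 4 shards receives.
sampler = ds.DistributedSampler(num_shards=4, shard_id=0, shuffle=False, offset=1)
concatenated.use_sampler(sampler)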
@@ -329,9 +331,9 @@ class DistributedSampler(BuiltinSampler):
         TypeError: If shuffle is not a boolean value.
         TypeError: If num_samples is not an integer value.
         TypeError: If offset is not an integer value.
+        ValueError: If num_samples is a negative value.
         RuntimeError: If num_shards is not a positive value.
         RuntimeError: If shard_id is smaller than 0 or equal to num_shards or larger than num_shards.
-        RuntimeError: If num_samples is a negative value.
         RuntimeError: If offset is greater than num_shards.
     """
 
@@ -411,7 +413,7 @@ class PKSampler(BuiltinSampler):
         num_class (int, optional): Number of classes to sample (default=None, sample all classes).
             The parameter does not supported to specify currently.
         shuffle (bool, optional): If True, the class IDs are shuffled, otherwise it will not be
-            shuffled(default=False).
+            shuffled (default=False).
         class_column (str, optional): Name of column with class labels for MindDataset (default='label').
         num_samples (int, optional): The number of samples to draw (default=None, which means sample all elements).
 
@@ -423,13 +425,12 @@ class PKSampler(BuiltinSampler):
         ...                              sampler=sampler)
 
     Raises:
-        TypeError: If num_val is not a positive value.
         TypeError: If shuffle is not a boolean value.
         TypeError: If class_column is not a str value.
         TypeError: If num_samples is not an integer value.
         NotImplementedError: If num_class is not None.
-        RuntimeError: If num_samples is a negative value.
+        RuntimeError: If num_val is not a positive value.
+        ValueError: If num_samples is a negative value.
     """
 
     def __init__(self, num_val, num_class=None, shuffle=False, class_column='label', num_samples=None):
@@ -508,7 +509,7 @@ class RandomSampler(BuiltinSampler):
     Raises:
         TypeError: If replacement is not a boolean value.
         TypeError: If num_samples is not an integer value.
-        RuntimeError: If num_samples is a negative value.
+        ValueError: If num_samples is a negative value.
     """
 
     def __init__(self, replacement=False, num_samples=None):
@@ -573,7 +574,7 @@ class SequentialSampler(BuiltinSampler):
         TypeError: If start_index is not an integer value.
         TypeError: If num_samples is not an integer value.
         RuntimeError: If start_index is a negative value.
-        RuntimeError: If num_samples is a negative value.
+        ValueError: If num_samples is a negative value.
     """
 
     def __init__(self, start_index=None, num_samples=None):
@@ -641,7 +642,7 @@ class SubsetSampler(BuiltinSampler):
     Raises:
         TypeError: If type of indices element is not a number.
         TypeError: If num_samples is not an integer value.
-        RuntimeError: If num_samples is a negative value.
+        ValueError: If num_samples is a negative value.
     """
 
     def __init__(self, indices, num_samples=None):
@@ -713,7 +714,7 @@ class SubsetRandomSampler(SubsetSampler):
     Samples the elements randomly from a sequence of indices.
 
     Args:
-        indices (Any iterable python object but string): A sequence of indices.
+        indices (Any iterable Python object but string): A sequence of indices.
         num_samples (int, optional): Number of elements to sample (default=None, which means sample all elements).
 
     Examples:
@@ -726,7 +727,7 @@ class SubsetRandomSampler(SubsetSampler):
     Raises:
         TypeError: If type of indices element is not a number.
         TypeError: If num_samples is not an integer value.
-        RuntimeError: If num_samples is a negative value.
+        ValueError: If num_samples is a negative value.
     """
 
     def parse(self):
@@ -806,7 +807,7 @@ class WeightedRandomSampler(BuiltinSampler):
         TypeError: If num_samples is not an integer value.
         TypeError: If replacement is not a boolean value.
         RuntimeError: If weights is empty or all zero.
-        RuntimeError: If num_samples is a negative value.
+        ValueError: If num_samples is a negative value.
     """
 
     def __init__(self, weights, num_samples=None, replacement=True):
@@ -27,15 +27,15 @@ from ..vision.utils import Inter, Border, ImageBatchFormat
 
 def serialize(dataset, json_filepath=""):
     """
-    Serialize dataset pipeline into a json file.
+    Serialize dataset pipeline into a JSON file.
 
     Note:
-        Currently some python objects are not supported to be serialized.
-        For python function serialization of map operator, de.serialize will only return its function name.
+        Currently some Python objects are not supported to be serialized.
+        For Python function serialization of map operator, de.serialize will only return its function name.
 
     Args:
         dataset (Dataset): The starting node.
-        json_filepath (str): The filepath where a serialized json file will be generated.
+        json_filepath (str): The filepath where a serialized JSON file will be generated.
 
     Returns:
         Dict, The dictionary contains the serialized dataset graph.
@@ -48,7 +48,7 @@ def serialize(dataset, json_filepath=""):
         >>> one_hot_encode = c_transforms.OneHot(10) # num_classes is input argument
         >>> dataset = dataset.map(operation=one_hot_encode, input_column_names="label")
         >>> dataset = dataset.batch(batch_size=10, drop_remainder=True)
-        >>> # serialize it to json file
+        >>> # serialize it to JSON file
         >>> ds.engine.serialize(dataset, json_filepath="/path/to/mnist_dataset_pipeline.json")
         >>> serialized_data = ds.engine.serialize(dataset) # serialize it to Python dict
         """
@@ -57,27 +57,27 @@ def serialize(dataset, json_filepath=""):
 
 def deserialize(input_dict=None, json_filepath=None):
     """
-    Construct a de pipeline from a json file produced by de.serialize().
+    Construct a de pipeline from a JSON file produced by de.serialize().
 
     Note:
-        Currently python function deserialization of map operator are not supported.
+        Currently Python function deserialization of map operator are not supported.
 
     Args:
         input_dict (dict): A Python dictionary containing a serialized dataset graph.
-        json_filepath (str): A path to the json file.
+        json_filepath (str): A path to the JSON file.
 
     Returns:
         de.Dataset or None if error occurs.
 
     Raises:
-        OSError: Can not open the json file.
+        OSError: Can not open the JSON file.
 
     Examples:
         >>> dataset = ds.MnistDataset(mnist_dataset_dir, 100)
         >>> one_hot_encode = c_transforms.OneHot(10) # num_classes is input argument
         >>> dataset = dataset.map(operation=one_hot_encode, input_column_names="label")
         >>> dataset = dataset.batch(batch_size=10, drop_remainder=True)
-        >>> # Use case 1: to/from json file
+        >>> # Use case 1: to/from JSON file
         >>> ds.engine.serialize(dataset, json_filepath="/path/to/mnist_dataset_pipeline.json")
         >>> dataset = ds.engine.deserialize(json_filepath="/path/to/mnist_dataset_pipeline.json")
         >>> # Use case 2: to/from Python dictionary
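The docstring's "Use case 2" is cut off by this hunk; a hedged sketch of the dictionary round-trip it refers to (paths are placeholders, and the map keyword names follow the current Python API rather than the docstring snippet above):

import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as c_transforms

dataset = ds.MnistDataset("/path/to/mnist", num_samples=100)
dataset = dataset.map(operations=c_transforms.OneHot(10), input_columns="label")
dataset = dataset.batch(batch_size=10, drop_remainder=True)

serialized = ds.engine.serialize(dataset)             # to a Python dictionary
restored = ds.engine.deserialize(input_dict=serialized)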
@@ -113,8 +113,15 @@ def show(dataset, indentation=2):
 
     Args:
         dataset (Dataset): The starting node.
-        indentation (int, optional): The indentation used by the json print.
+        indentation (int, optional): The indentation used by the JSON print.
             Do not indent if indentation is None.
 
+    Examples:
+        >>> dataset = ds.MnistDataset(mnist_dataset_dir, 100)
+        >>> one_hot_encode = c_transforms.OneHot(10)
+        >>> dataset = dataset.map(operation=one_hot_encode, input_column_names="label")
+        >>> dataset = dataset.batch(batch_size=10, drop_remainder=True)
+        >>> ds.show(dataset)
     """
 
     pipeline = dataset.to_json()
@@ -128,13 +135,21 @@ def compare(pipeline1, pipeline2):
     Args:
         pipeline1 (Dataset): a dataset pipeline.
         pipeline2 (Dataset): a dataset pipeline.
 
     Returns:
         Whether pipeline1 is equal to pipeline2.
 
+    Examples:
+        >>> pipeline1 = ds.MnistDataset(mnist_dataset_dir, 100)
+        >>> pipeline2 = ds.Cifar10Dataset(cifar_dataset_dir, 100)
+        >>> ds.compare(pipeline1, pipeline2)
     """
 
     return pipeline1.to_json() == pipeline2.to_json()
 
 
 def construct_pipeline(node):
-    """Construct the Python Dataset objects by following the dictionary deserialized from json file."""
+    """Construct the Python Dataset objects by following the dictionary deserialized from JSON file."""
     op_type = node.get('op_type')
     if not op_type:
         raise ValueError("op_type field in the json file can't be None.")