update mindrecord api doc

This commit is contained in:
liyong 2021-05-25 20:00:29 +08:00
parent ea3d92c2ec
commit 784f88db80
9 changed files with 79 additions and 60 deletions

View File

@ -13,13 +13,13 @@
# limitations under the License.
# ==============================================================================
"""
Introduction of mindrecord:
Introduction of MindRecord.
Mindrecord is a module to implement reading, writing, search and
converting for MindSpore format dataset. Users could load(modify)
mindrecord data through FileReader(FileWriter). Users could also
convert other format datasets to mindrecord data through
corresponding sub-module.
MindRecord is a module to implement reading, writing, searching and
converting for MindSpore format dataset. Users could use the FileWriter
API to generate MindRecord data and use the MindDataset API to load
MindRecord data. Users could also convert other format datasets to
MindRecord data through corresponding sub-module.
"""
from .filewriter import FileWriter

View File

@ -1,4 +1,4 @@
# Copyright 2019 Huawei Technologies Co., Ltd
# Copyright 2019-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""
This module is to read data from mindrecord.
This module is to read data from MindRecord.
"""
from .shardreader import ShardReader
from .shardheader import ShardHeader
@ -26,17 +26,22 @@ __all__ = ['FileReader']
class FileReader:
"""
Class to read MindRecord File series.
Class to read MindRecord files.
Note:
If `file_name` is a filename string, it tries to load all MindRecord files generated \
in a conversion, and throws an exception if a MindRecord file is missing.
If `file_name` is a filename list, only the MindRecord files in the list are loaded.
Args:
file_name (str, list[str]): One of MindRecord File or a file list.
num_consumer(int, optional): Number of consumer threads which load data to memory (default=4).
It should not be smaller than 1 or larger than the number of CPUs.
columns (list[str], optional): A list of fields where corresponding data would be read (default=None).
operator(int, optional): Reserved parameter for operators (default=None).
file_name (str, list[str]): One of MindRecord file or a file list.
num_consumer(int, optional): Number of reader workers which load data. Default: 4.
It should not be smaller than 1 or larger than the number of processor cores.
columns (list[str], optional): A list of fields where corresponding data would be read. Default: None.
operator(int, optional): Reserved parameter for operators. Default: None.
Raises:
ParamValueError: If file_name, num_consumer or columns is invalid.
ParamValueError: If `file_name`, `num_consumer` or `columns` is invalid.
"""
@check_parameter
@ -58,7 +63,7 @@ class FileReader:
Yield a batch of data according to columns at a time.
Yields:
dictionary: keys are the same as columns.
Dict: a batch whose keys are the same as columns.
Raises:
MRMUnsupportedSchemaError: If schema is invalid.

View File

@ -1,4 +1,4 @@
# Copyright 2019 Huawei Technologies Co., Ltd
# Copyright 2019-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -33,14 +33,14 @@ __all__ = ['FileWriter']
class FileWriter:
"""
Class to write user defined raw data into MindRecord File series.
Class to write user defined raw data into MindRecord files.
Note:
The mindrecord file may fail to be read if the file name is modified.
Args:
file_name (str): File name of MindRecord File.
shard_num (int, optional): The Number of MindRecord File (default=1).
file_name (str): File name of MindRecord file.
shard_num (int, optional): The number of MindRecord files. Default: 1.
It should be between [1, 1000].
Raises:
@ -86,12 +86,12 @@ class FileWriter:
file_name (str): String of MindRecord file name.
Returns:
FileWriter, file writer for the opened MindRecord file.
FileWriter, file writer object for the opened MindRecord file.
Raises:
ParamValueError: If file_name is invalid.
FileNameError: If path contains invalid characters.
MRMOpenError: If failed to open MindRecord File.
MRMOpenError: If failed to open MindRecord file.
MRMOpenForAppendError: If failed to open file for appending data.
"""
check_filename(file_name)
@ -113,11 +113,11 @@ class FileWriter:
def add_schema(self, content, desc=None):
"""
Return a schema id if schema is added successfully, or raise an exception.
The schema is added to describe the raw data to be written.
Args:
content (dict): Dictionary of user defined schema.
desc (str, optional): String of schema description (default=None).
content (dict): Dictionary of schema content.
desc (str, optional): String of schema description. Default: None.
Returns:
int, schema id.
@ -137,8 +137,13 @@ class FileWriter:
"""
Select index fields from schema to accelerate reading.
Note:
The index fields should be primitive type. e.g. int/float/str.
If the function is not called, the fields of the primitive type
in schema are set as indexes by default.
Args:
index_fields (list[str]): Fields would be set as index which should be primitive type.
index_fields (list[str]): Fields from the schema.
Returns:
MSRStatus, SUCCESS or FAILED.
@ -207,28 +212,37 @@ class FileWriter:
def open_and_set_header(self):
"""
Open writer and set header.
Open writer and set header. The function is only used for parallel \
writing and is called before the `write_raw_data`.
Returns:
MSRStatus, SUCCESS or FAILED.
Raises:
MRMOpenError: If failed to open MindRecord file.
MRMSetHeaderError: If failed to set header.
"""
if not self._writer.is_open:
self._writer.open(self._paths)
ret = self._writer.open(self._paths)
if not self._writer.get_shard_header():
self._writer.set_shard_header(self._header)
return self._writer.set_shard_header(self._header)
return ret
def write_raw_data(self, raw_data, parallel_writer=False):
"""
Write raw data and generate sequential pair of MindRecord File and \
validate data based on predefined schema by default.
Convert raw data into a series of consecutive MindRecord \
files after the raw data is verified against the schema.
Args:
raw_data (list[dict]): List of raw data.
parallel_writer (bool, optional): Load data parallel if it equals to True (default=False).
parallel_writer (bool, optional): Write raw data in parallel if it equals to True. Default: False.
Returns:
MSRStatus, SUCCESS or FAILED.
Raises:
ParamTypeError: If index field is invalid.
MRMOpenError: If failed to open MindRecord File.
MRMOpenError: If failed to open MindRecord file.
MRMValidateDataError: If data does not match blob fields.
MRMSetHeaderError: If failed to set header.
MRMWriteDatasetError: If failed to write dataset.
@ -248,8 +262,8 @@ class FileWriter:
def set_header_size(self, header_size):
"""
Set the size of header which contains shard information, schema information, \
page meta information, etc. The larger the header, the more training data \
a single Mindrecord file can store.
page meta information, etc. The larger a header, the more data \
the MindRecord file can store.
Args:
header_size (int): Size of header, between 16KB and 128MB.
@ -265,9 +279,9 @@ class FileWriter:
def set_page_size(self, page_size):
"""
Set the size of page which mainly refers to the block to store training data, \
and the training data will be split into raw page and blob page in mindrecord. \
The larger the page, the more training data a single page can store.
Set the size of page that represents the area where data is stored, \
and the areas are divided into two types: raw page and blob page. \
The larger a page, the more data the page can store.
Args:
page_size (int): Size of page, between 32KB and 256MB.
@ -282,13 +296,13 @@ class FileWriter:
def commit(self):
"""
Flush data to disk and generate the corresponding database files.
Flush data in memory to disk and generate the corresponding database files.
Returns:
MSRStatus, SUCCESS or FAILED.
Raises:
MRMOpenError: If failed to open MindRecord File.
MRMOpenError: If failed to open MindRecord file.
MRMSetHeaderError: If failed to set header.
MRMIndexGeneratorError: If failed to create index generator.
MRMGenerateIndexError: If failed to write to database.

View File

@ -1,4 +1,4 @@
# Copyright 2019 Huawei Technologies Co., Ltd
# Copyright 2019-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""
This module is to support reading page from mindrecord.
This module is to support reading page from MindRecord.
"""
from mindspore import log as logger
@ -26,12 +26,12 @@ __all__ = ['MindPage']
class MindPage:
"""
Class to read MindRecord File series in pagination.
Class to read MindRecord files in pagination.
Args:
file_name (str): One of MindRecord File or a file list.
num_consumer(int, optional): The number of consumer threads which load data to memory (default=4).
It should not be smaller than 1 or larger than the number of CPUs.
file_name (str): One MindRecord file or a file list.
num_consumer(int, optional): The number of reader workers which load data. Default: 4.
It should not be smaller than 1 or larger than the number of processor cores.
Raises:
ParamValueError: If `file_name`, `num_consumer` or columns is invalid.

View File

@ -73,7 +73,7 @@ class Cifar10ToMR:
Execute transformation from cifar10 to MindRecord.
Args:
fields (list[str], optional): A list of index fields, e.g.["label"] (default=None).
fields (list[str], optional): A list of index fields. Default: None.
Returns:
MSRStatus, whether cifar10 is successfully transformed to MindRecord.

View File

@ -37,8 +37,8 @@ class CsvToMR:
Args:
source (str): the file path of csv.
destination (str): the MindRecord file path to transform into.
columns_list(list[str], optional): A list of columns to be read(default=None).
partition_number (int, optional): partition size (default=1).
columns_list(list[str], optional): A list of columns to be read. Default: None.
partition_number (int, optional): partition size. Default: 1.
Raises:
ValueError: If `source`, `destination`, `partition_number` is invalid.

View File

@ -42,7 +42,7 @@ class ImageNetToMR:
image_dir (str): image directory contains n02119789, n02100735, n02110185 and n02096294 directory.
destination (str): the MindRecord file path to transform into.
partition_number (int, optional): partition size (default=1).
partition_number (int, optional): partition size. Default: 1.
Raises:
ValueError: If `map_file`, `image_dir` or `destination` is invalid.

View File

@ -42,7 +42,7 @@ class MnistToMR:
train-images-idx3-ubyte.gz, t10k-labels-idx1-ubyte.gz
and train-labels-idx1-ubyte.gz.
destination (str): the MindRecord file directory to transform into.
partition_number (int, optional): partition size (default=1).
partition_number (int, optional): partition size. Default: 1.
Raises:
ValueError: If `source`, `destination`, `partition_number` is invalid.

View File

@ -70,17 +70,17 @@ class TFRecordToMR:
Args:
source (str): the TFRecord file to be transformed.
destination (str): the MindRecord file path to transform into.
feature_dict (dict): a dictionary that states the feature type, e.g.
feature_dict = {"xxxx": tf.io.FixedLenFeature([], tf.string), \
"yyyy": tf.io.FixedLenFeature([], tf.int64)}
**Follow case which uses VarLenFeature is not supported.**
feature_dict = {"context": {"xxxx": tf.io.FixedLenFeature([], tf.string), \
"yyyy": tf.io.VarLenFeature(tf.int64)}, \
"sequence": {"zzzz": tf.io.FixedLenSequenceFeature([], tf.float32)}}
feature_dict (dict): a dictionary that states the feature type.
bytes_fields (list, optional): the bytes fields which are in `feature_dict` and can be images bytes.
Examples:
>>> feature_dict = {"xxxx": tf.io.FixedLenFeature([], tf.string),
... "yyyy": tf.io.FixedLenFeature([], tf.int64)}
>>> # Follow case which uses VarLenFeature is not supported.
>>> feature_dict = {"context": {"xxxx": tf.io.FixedLenFeature([], tf.string),
... "yyyy": tf.io.VarLenFeature(tf.int64)},
... "sequence": {"zzzz": tf.io.FixedLenSequenceFeature([], tf.float32)}}
Raises:
ValueError: If parameter is invalid.
Exception: when tensorflow module is not found or version is not correct.