update mindrecord api doc

This commit is contained in:
liyong 2021-05-25 20:00:29 +08:00
parent ea3d92c2ec
commit 784f88db80
9 changed files with 79 additions and 60 deletions

View File

@ -13,13 +13,13 @@
# limitations under the License.
# ==============================================================================
"""
Introduction of mindrecord:
Introduction of MindRecord.
Mindrecord is a module to implement reading, writing, search and
converting for MindSpore format dataset. Users could load(modify)
mindrecord data through FileReader(FileWriter). Users could also
convert other format datasets to mindrecord data through
corresponding sub-module.
MindRecord is a module to implement reading, writing, searching and
converting for MindSpore format dataset. Users could use the FileWriter
API to generate MindRecord data and use the MindDataset API to load
MindRecord data. Users could also convert other format datasets to
MindRecord data through corresponding sub-module.
"""
from .filewriter import FileWriter

View File

@ -1,4 +1,4 @@
# Copyright 2019 Huawei Technologies Co., Ltd
# Copyright 2019-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""
This module is to read data from mindrecord.
This module is to read data from MindRecord.
"""
from .shardreader import ShardReader
from .shardheader import ShardHeader
@ -26,17 +26,22 @@ __all__ = ['FileReader']
class FileReader:
"""
Class to read MindRecord File series.
Class to read MindRecord files.
Note:
If `file_name` is a filename string, it tries to load all MindRecord files generated \
in a conversion, and throws an exception if a MindRecord file is missing.
If `file_name` is a filename list, only the MindRecord files in the list are loaded.
Args:
file_name (str, list[str]): One of MindRecord File or a file list.
num_consumer(int, optional): Number of consumer threads which load data to memory (default=4).
It should not be smaller than 1 or larger than the number of CPUs.
columns (list[str], optional): A list of fields where corresponding data would be read (default=None).
operator(int, optional): Reserved parameter for operators (default=None).
file_name (str, list[str]): One of MindRecord file or a file list.
num_consumer(int, optional): Number of reader workers which load data. Default: 4.
It should not be smaller than 1 or larger than the number of processor cores.
columns (list[str], optional): A list of fields where corresponding data would be read. Default: None.
operator(int, optional): Reserved parameter for operators. Default: None.
Raises:
ParamValueError: If file_name, num_consumer or columns is invalid.
ParamValueError: If `file_name`, `num_consumer` or `columns` is invalid.
"""
@check_parameter
@ -58,7 +63,7 @@ class FileReader:
Yield a batch of data according to columns at a time.
Yields:
dictionary: keys are the same as columns.
Dict: a batch whose keys are the same as columns.
Raises:
MRMUnsupportedSchemaError: If schema is invalid.

View File

@ -1,4 +1,4 @@
# Copyright 2019 Huawei Technologies Co., Ltd
# Copyright 2019-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -33,14 +33,14 @@ __all__ = ['FileWriter']
class FileWriter:
"""
Class to write user defined raw data into MindRecord File series.
Class to write user defined raw data into MindRecord files.
Note:
The mindrecord file may fail to be read if the file name is modified.
Args:
file_name (str): File name of MindRecord File.
shard_num (int, optional): The Number of MindRecord File (default=1).
file_name (str): File name of MindRecord file.
shard_num (int, optional): The number of MindRecord files. Default: 1.
It should be between [1, 1000].
Raises:
@ -86,12 +86,12 @@ class FileWriter:
file_name (str): String of MindRecord file name.
Returns:
FileWriter, file writer for the opened MindRecord file.
FileWriter, file writer object for the opened MindRecord file.
Raises:
ParamValueError: If file_name is invalid.
FileNameError: If path contains invalid characters.
MRMOpenError: If failed to open MindRecord File.
MRMOpenError: If failed to open MindRecord file.
MRMOpenForAppendError: If failed to open file for appending data.
"""
check_filename(file_name)
@ -113,11 +113,11 @@ class FileWriter:
def add_schema(self, content, desc=None):
"""
Return a schema id if schema is added successfully, or raise an exception.
The schema is added to describe the raw data to be written.
Args:
content (dict): Dictionary of user defined schema.
desc (str, optional): String of schema description (default=None).
content (dict): Dictionary of schema content.
desc (str, optional): String of schema description. Default: None.
Returns:
int, schema id.
@ -137,8 +137,13 @@ class FileWriter:
"""
Select index fields from schema to accelerate reading.
Note:
The index fields should be primitive type. e.g. int/float/str.
If the function is not called, the fields of the primitive type
in schema are set as indexes by default.
Args:
index_fields (list[str]): Fields would be set as index which should be primitive type.
index_fields (list[str]): Fields from the schema.
Returns:
MSRStatus, SUCCESS or FAILED.
@ -207,28 +212,37 @@ class FileWriter:
def open_and_set_header(self):
"""
Open writer and set header.
Open writer and set header. The function is only used for parallel \
writing and is called before the `write_raw_data`.
Returns:
MSRStatus, SUCCESS or FAILED.
Raises:
MRMOpenError: If failed to open MindRecord file.
MRMSetHeaderError: If failed to set header.
"""
if not self._writer.is_open:
self._writer.open(self._paths)
ret = self._writer.open(self._paths)
if not self._writer.get_shard_header():
self._writer.set_shard_header(self._header)
return self._writer.set_shard_header(self._header)
return ret
def write_raw_data(self, raw_data, parallel_writer=False):
"""
Write raw data and generate sequential pair of MindRecord File and \
validate data based on predefined schema by default.
Convert raw data into a series of consecutive MindRecord \
files after the raw data is verified against the schema.
Args:
raw_data (list[dict]): List of raw data.
parallel_writer (bool, optional): Load data parallel if it equals to True (default=False).
parallel_writer (bool, optional): Write raw data in parallel if it equals to True. Default: False.
Returns:
MSRStatus, SUCCESS or FAILED.
Raises:
ParamTypeError: If index field is invalid.
MRMOpenError: If failed to open MindRecord File.
MRMOpenError: If failed to open MindRecord file.
MRMValidateDataError: If data does not match blob fields.
MRMSetHeaderError: If failed to set header.
MRMWriteDatasetError: If failed to write dataset.
@ -248,8 +262,8 @@ class FileWriter:
def set_header_size(self, header_size):
"""
Set the size of header which contains shard information, schema information, \
page meta information, etc. The larger the header, the more training data \
a single Mindrecord file can store.
page meta information, etc. The larger a header, the more data \
the MindRecord file can store.
Args:
header_size (int): Size of header, between 16KB and 128MB.
@ -265,9 +279,9 @@ class FileWriter:
def set_page_size(self, page_size):
"""
Set the size of page which mainly refers to the block to store training data, \
and the training data will be split into raw page and blob page in mindrecord. \
The larger the page, the more training data a single page can store.
Set the size of page that represents the area where data is stored, \
and the areas are divided into two types: raw page and blob page. \
The larger a page, the more data the page can store.
Args:
page_size (int): Size of page, between 32KB and 256MB.
@ -282,13 +296,13 @@ class FileWriter:
def commit(self):
"""
Flush data to disk and generate the corresponding database files.
Flush data in memory to disk and generate the corresponding database files.
Returns:
MSRStatus, SUCCESS or FAILED.
Raises:
MRMOpenError: If failed to open MindRecord File.
MRMOpenError: If failed to open MindRecord file.
MRMSetHeaderError: If failed to set header.
MRMIndexGeneratorError: If failed to create index generator.
MRMGenerateIndexError: If failed to write to database.

View File

@ -1,4 +1,4 @@
# Copyright 2019 Huawei Technologies Co., Ltd
# Copyright 2019-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""
This module is to support reading page from mindrecord.
This module is to support reading page from MindRecord.
"""
from mindspore import log as logger
@ -26,12 +26,12 @@ __all__ = ['MindPage']
class MindPage:
"""
Class to read MindRecord File series in pagination.
Class to read MindRecord files in pagination.
Args:
file_name (str): One of MindRecord File or a file list.
num_consumer(int, optional): The number of consumer threads which load data to memory (default=4).
It should not be smaller than 1 or larger than the number of CPUs.
file_name (str): One MindRecord file or a file list.
num_consumer(int, optional): The number of reader workers which load data. Default: 4.
It should not be smaller than 1 or larger than the number of processor cores.
Raises:
ParamValueError: If `file_name`, `num_consumer` or columns is invalid.

View File

@ -73,7 +73,7 @@ class Cifar10ToMR:
Execute transformation from cifar10 to MindRecord.
Args:
fields (list[str], optional): A list of index fields, e.g.["label"] (default=None).
fields (list[str], optional): A list of index fields. Default: None.
Returns:
MSRStatus, whether cifar10 is successfully transformed to MindRecord.

View File

@ -37,8 +37,8 @@ class CsvToMR:
Args:
source (str): the file path of csv.
destination (str): the MindRecord file path to transform into.
columns_list(list[str], optional): A list of columns to be read(default=None).
partition_number (int, optional): partition size (default=1).
columns_list(list[str], optional): A list of columns to be read. Default: None.
partition_number (int, optional): partition size. Default: 1.
Raises:
ValueError: If `source`, `destination`, `partition_number` is invalid.

View File

@ -42,7 +42,7 @@ class ImageNetToMR:
image_dir (str): image directory contains n02119789, n02100735, n02110185 and n02096294 directory.
destination (str): the MindRecord file path to transform into.
partition_number (int, optional): partition size (default=1).
partition_number (int, optional): partition size. Default: 1.
Raises:
ValueError: If `map_file`, `image_dir` or `destination` is invalid.

View File

@ -42,7 +42,7 @@ class MnistToMR:
train-images-idx3-ubyte.gz, t10k-labels-idx1-ubyte.gz
and train-labels-idx1-ubyte.gz.
destination (str): the MindRecord file directory to transform into.
partition_number (int, optional): partition size (default=1).
partition_number (int, optional): partition size. Default: 1.
Raises:
ValueError: If `source`, `destination`, `partition_number` is invalid.

View File

@ -70,17 +70,17 @@ class TFRecordToMR:
Args:
source (str): the TFRecord file to be transformed.
destination (str): the MindRecord file path to transform into.
feature_dict (dict): a dictionary that states the feature type, e.g.
feature_dict = {"xxxx": tf.io.FixedLenFeature([], tf.string), \
"yyyy": tf.io.FixedLenFeature([], tf.int64)}
**Follow case which uses VarLenFeature is not supported.**
feature_dict = {"context": {"xxxx": tf.io.FixedLenFeature([], tf.string), \
"yyyy": tf.io.VarLenFeature(tf.int64)}, \
"sequence": {"zzzz": tf.io.FixedLenSequenceFeature([], tf.float32)}}
feature_dict (dict): a dictionary that states the feature type.
bytes_fields (list, optional): the bytes fields which are in `feature_dict` and can be images bytes.
Examples:
>>> feature_dict = {"xxxx": tf.io.FixedLenFeature([], tf.string),
... "yyyy": tf.io.FixedLenFeature([], tf.int64)}
>>> # Follow case which uses VarLenFeature is not supported.
>>> feature_dict = {"context": {"xxxx": tf.io.FixedLenFeature([], tf.string),
... "yyyy": tf.io.VarLenFeature(tf.int64)},
... "sequence": {"zzzz": tf.io.FixedLenSequenceFeature([], tf.float32)}}
Raises:
ValueError: If parameter is invalid.
Exception: when tensorflow module is not found or version is not correct.