forked from mindspore-Ecosystem/mindspore

update mindrecord api doc

parent ea3d92c2ec
commit 784f88db80
@@ -13,13 +13,13 @@
 # limitations under the License.
 # ==============================================================================
 """
-Introduction of mindrecord:
+Introduction of MindRecord.

-Mindrecord is a module to implement reading, writing, search and
-converting for MindSpore format dataset. Users could load(modify)
-mindrecord data through FileReader(FileWriter). Users could also
-convert other format datasets to mindrecord data through
-corresponding sub-module.
+MindRecord is a module to implement reading, writing, searching and
+converting for MindSpore format dataset. Users could use the FileWriter
+API to generate MindRecord data and use the MindDataset API to load
+MindRecord data. Users could also convert other format datasets to
+mindrecord data through corresponding sub-module.
 """

 from .filewriter import FileWriter
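As a rough usage sketch of the workflow the hunk above describes (not part of the diff; file names, schema fields and the MindDataset parameter name are illustrative and may differ between MindSpore versions):

    from mindspore.mindrecord import FileWriter
    import mindspore.dataset as ds

    # Generate MindRecord data with the FileWriter API.
    writer = FileWriter(file_name="demo.mindrecord", shard_num=1)
    writer.add_schema({"label": {"type": "int32"}, "data": {"type": "bytes"}}, "demo schema")
    writer.write_raw_data([{"label": 0, "data": b"\x01\x02"}])
    writer.commit()

    # Load the generated MindRecord data with the MindDataset API.
    data_set = ds.MindDataset(dataset_file="demo.mindrecord")
    for item in data_set.create_dict_iterator():
        print(item["label"])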
@@ -1,4 +1,4 @@
-# Copyright 2019 Huawei Technologies Co., Ltd
+# Copyright 2019-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 """
-This module is to read data from mindrecord.
+This module is to read data from MindRecord.
 """
 from .shardreader import ShardReader
 from .shardheader import ShardHeader
@@ -26,17 +26,22 @@ __all__ = ['FileReader']

 class FileReader:
     """
-    Class to read MindRecord File series.
+    Class to read MindRecord files.

+    Note:
+        If `file_name` is a filename string, it tries to load all MindRecord files generated \
+        in a conversion, and throws an exception if a MindRecord file is missing.
+        If `file_name` is a filename list, only the MindRecord files in the list are loaded.
+
     Args:
-        file_name (str, list[str]): One of MindRecord File or a file list.
-        num_consumer(int, optional): Number of consumer threads which load data to memory (default=4).
-            It should not be smaller than 1 or larger than the number of CPUs.
-        columns (list[str], optional): A list of fields where corresponding data would be read (default=None).
-        operator(int, optional): Reserved parameter for operators (default=None).
+        file_name (str, list[str]): One of MindRecord file or a file list.
+        num_consumer(int, optional): Number of reader workers which load data. Default: 4.
+            It should not be smaller than 1 or larger than the number of processor cores.
+        columns (list[str], optional): A list of fields where corresponding data would be read. Default: None.
+        operator(int, optional): Reserved parameter for operators. Default: None.

     Raises:
-        ParamValueError: If file_name, num_consumer or columns is invalid.
+        ParamValueError: If `file_name`, `num_consumer` or `columns` is invalid.
     """

     @check_parameter
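A construction sketch for the FileReader described above (file name and column list are illustrative):

    from mindspore.mindrecord import FileReader

    # Read only the "label" field, using 4 reader workers.
    reader = FileReader(file_name="demo.mindrecord", num_consumer=4, columns=["label"])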
@@ -58,7 +63,7 @@ class FileReader:
         Yield a batch of data according to columns at a time.

         Yields:
-            dictionary: keys are the same as columns.
+            Dict: a batch whose keys are the same as columns.

         Raises:
             MRMUnsupportedSchemaError: If schema is invalid.
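Continuing the sketch, iterating with `get_next` on the reader created above (the close call assumes the reader is no longer needed):

    for item in reader.get_next():
        print(item["label"])
    reader.close()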
@@ -1,4 +1,4 @@
-# Copyright 2019 Huawei Technologies Co., Ltd
+# Copyright 2019-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,14 +33,14 @@ __all__ = ['FileWriter']

 class FileWriter:
     """
-    Class to write user defined raw data into MindRecord File series.
+    Class to write user defined raw data into MindRecord files.

     Note:
         The mindrecord file may fail to be read if the file name is modified.

     Args:
-        file_name (str): File name of MindRecord File.
-        shard_num (int, optional): The Number of MindRecord File (default=1).
+        file_name (str): File name of MindRecord file.
+        shard_num (int, optional): The number of MindRecord files. Default: 1.
             It should be between [1, 1000].

     Raises:
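A construction sketch for the FileWriter described above (the file name is illustrative); with `shard_num` greater than 1 the data is spread across that many MindRecord files:

    from mindspore.mindrecord import FileWriter

    # Shard files are typically named demo.mindrecord0 ... demo.mindrecord3.
    writer = FileWriter(file_name="demo.mindrecord", shard_num=4)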
@@ -86,12 +86,12 @@ class FileWriter:
             file_name (str): String of MindRecord file name.

         Returns:
-            FileWriter, file writer for the opened MindRecord file.
+            FileWriter, file writer object for the opened MindRecord file.

         Raises:
             ParamValueError: If file_name is invalid.
             FileNameError: If path contains invalid characters.
-            MRMOpenError: If failed to open MindRecord File.
+            MRMOpenError: If failed to open MindRecord file.
             MRMOpenForAppendError: If failed to open file for appending data.
         """
         check_filename(file_name)
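A sketch of appending with `open_for_append` (assuming demo.mindrecord was written earlier and `more_data` is a list of dicts matching its schema):

    writer = FileWriter.open_for_append("demo.mindrecord")
    writer.write_raw_data(more_data)
    writer.commit()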
@@ -113,11 +113,11 @@ class FileWriter:

     def add_schema(self, content, desc=None):
         """
-        Return a schema id if schema is added successfully, or raise an exception.
+        The schema is added to describe the raw data to be written.

         Args:
-            content (dict): Dictionary of user defined schema.
-            desc (str, optional): String of schema description (default=None).
+            content (dict): Dictionary of schema content.
+            desc (str, optional): String of schema description. Default: None.

         Returns:
             int, schema id.
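A sketch of `add_schema` with an illustrative schema dictionary (field names and types are examples only, reusing the writer created above):

    schema = {"file_name": {"type": "string"},
              "label": {"type": "int32"},
              "data": {"type": "bytes"}}
    writer.add_schema(schema, "example image schema")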
@@ -137,8 +137,13 @@ class FileWriter:
         """
         Select index fields from schema to accelerate reading.

+        Note:
+            The index fields should be primitive type. e.g. int/float/str.
+            If the function is not called, the fields of the primitive type
+            in schema are set as indexes by default.
+
         Args:
-            index_fields (list[str]): Fields would be set as index which should be primitive type.
+            index_fields (list[str]): fields from schema.

         Returns:
             MSRStatus, SUCCESS or FAILED.
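A sketch of `add_index`, selecting primitive-type fields from the schema above as indexes:

    writer.add_index(["file_name", "label"])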
@@ -207,28 +212,37 @@ class FileWriter:

     def open_and_set_header(self):
         """
-        Open writer and set header.
+        Open writer and set header. The function is only used for parallel \
+        writing and is called before the `write_raw_data`.
+
+        Returns:
+            MSRStatus, SUCCESS or FAILED.
+
+        Raises:
+            MRMOpenError: If failed to open MindRecord file.
+            MRMSetHeaderError: If failed to set header.
         """
         if not self._writer.is_open:
-            self._writer.open(self._paths)
+            ret = self._writer.open(self._paths)
         if not self._writer.get_shard_header():
-            self._writer.set_shard_header(self._header)
+            return self._writer.set_shard_header(self._header)
+        return ret

     def write_raw_data(self, raw_data, parallel_writer=False):
         """
-        Write raw data and generate sequential pair of MindRecord File and \
-        validate data based on predefined schema by default.
+        Convert raw data into a series of consecutive MindRecord \
+        files after the raw data is verified against the schema.

         Args:
             raw_data (list[dict]): List of raw data.
-            parallel_writer (bool, optional): Load data parallel if it equals to True (default=False).
+            parallel_writer (bool, optional): Write raw data in parallel if it equals to True. Default: False.

         Returns:
             MSRStatus, SUCCESS or FAILED.

         Raises:
             ParamTypeError: If index field is invalid.
-            MRMOpenError: If failed to open MindRecord File.
+            MRMOpenError: If failed to open MindRecord file.
             MRMValidateDataError: If data does not match blob fields.
             MRMSetHeaderError: If failed to set header.
             MRMWriteDatasetError: If failed to write dataset.
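A sketch of the parallel-writing flow described above (data_list is an illustrative list of dicts): `open_and_set_header` is called first, then `write_raw_data` with `parallel_writer=True`:

    writer.open_and_set_header()
    writer.write_raw_data(data_list, parallel_writer=True)
    writer.commit()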
@@ -248,8 +262,8 @@ class FileWriter:
     def set_header_size(self, header_size):
         """
         Set the size of header which contains shard information, schema information, \
-        page meta information, etc. The larger the header, the more training data \
-        a single Mindrecord file can store.
+        page meta information, etc. The larger a header, the more data \
+        the MindRecord file can store.

         Args:
             header_size (int): Size of header, between 16KB and 128MB.
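A sketch of `set_header_size`, called on the writer before any data is written (the value is illustrative and stays within the 16KB-128MB range):

    writer.set_header_size(1 << 25)  # 32 MB header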
@@ -265,9 +279,9 @@ class FileWriter:

     def set_page_size(self, page_size):
         """
-        Set the size of page which mainly refers to the block to store training data, \
-        and the training data will be split into raw page and blob page in mindrecord. \
-        The larger the page, the more training data a single page can store.
+        Set the size of page that represents the area where data is stored, \
+        and the areas are divided into two types: raw page and blob page. \
+        The larger a page, the more data the page can store.

         Args:
             page_size (int): Size of page, between 32KB and 256MB.
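Similarly, a sketch of `set_page_size` (the value is illustrative and stays within the 32KB-256MB range):

    writer.set_page_size(1 << 26)  # 64 MB page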
@@ -282,13 +296,13 @@ class FileWriter:

     def commit(self):
         """
-        Flush data to disk and generate the corresponding database files.
+        Flush data in memory to disk and generate the corresponding database files.

         Returns:
             MSRStatus, SUCCESS or FAILED.

         Raises:
-            MRMOpenError: If failed to open MindRecord File.
+            MRMOpenError: If failed to open MindRecord file.
             MRMSetHeaderError: If failed to set header.
             MRMIndexGeneratorError: If failed to create index generator.
             MRMGenerateIndexError: If failed to write to database.
@@ -1,4 +1,4 @@
-# Copyright 2019 Huawei Technologies Co., Ltd
+# Copyright 2019-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 """
-This module is to support reading page from mindrecord.
+This module is to support reading page from MindRecord.
 """

 from mindspore import log as logger
@@ -26,12 +26,12 @@ __all__ = ['MindPage']

 class MindPage:
     """
-    Class to read MindRecord File series in pagination.
+    Class to read MindRecord files in pagination.

     Args:
-        file_name (str): One of MindRecord File or a file list.
-        num_consumer(int, optional): The number of consumer threads which load data to memory (default=4).
-            It should not be smaller than 1 or larger than the number of CPUs.
+        file_name (str): One of MindRecord files or a file list.
+        num_consumer(int, optional): The number of reader workers which load data. Default: 4.
+            It should not be smaller than 1 or larger than the number of processor cores.

     Raises:
         ParamValueError: If `file_name`, `num_consumer` or columns is invalid.
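A usage sketch for MindPage; the category-reading methods shown here (get_category_fields, set_category_field, read_at_page_by_id) are assumptions not covered by this diff:

    from mindspore.mindrecord import MindPage

    pager = MindPage("demo.mindrecord", num_consumer=4)
    print(pager.get_category_fields())         # candidate index fields (assumed API)
    pager.set_category_field("label")          # group rows by the "label" field (assumed API)
    rows = pager.read_at_page_by_id(0, 0, 16)  # category 0, page 0, 16 rows (assumed API)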
@@ -73,7 +73,7 @@ class Cifar10ToMR:
         Execute transformation from cifar10 to MindRecord.

         Args:
-            fields (list[str], optional): A list of index fields, e.g.["label"] (default=None).
+            fields (list[str], optional): A list of index fields. Default: None.

         Returns:
             MSRStatus, whether cifar10 is successfully transformed to MindRecord.
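A sketch of the Cifar10ToMR conversion whose transform method is documented above (paths are illustrative):

    from mindspore.mindrecord import Cifar10ToMR

    cifar10_transformer = Cifar10ToMR("./cifar-10-batches-py", "./cifar10.mindrecord")
    cifar10_transformer.transform(fields=["label"])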
@@ -37,8 +37,8 @@ class CsvToMR:
     Args:
         source (str): the file path of csv.
         destination (str): the MindRecord file path to transform into.
-        columns_list(list[str], optional): A list of columns to be read(default=None).
-        partition_number (int, optional): partition size (default=1).
+        columns_list(list[str], optional): A list of columns to be read. Default: None.
+        partition_number (int, optional): partition size. Default: 1.

     Raises:
         ValueError: If `source`, `destination`, `partition_number` is invalid.
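A sketch of CsvToMR (paths and column names are illustrative):

    from mindspore.mindrecord import CsvToMR

    csv_transformer = CsvToMR("./data.csv", "./csv.mindrecord", columns_list=["id", "label"], partition_number=1)
    csv_transformer.transform()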
@@ -42,7 +42,7 @@ class ImageNetToMR:

         image_dir (str): image directory contains n02119789, n02100735, n02110185 and n02096294 directory.
         destination (str): the MindRecord file path to transform into.
-        partition_number (int, optional): partition size (default=1).
+        partition_number (int, optional): partition size. Default: 1.

     Raises:
         ValueError: If `map_file`, `image_dir` or `destination` is invalid.
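A sketch of ImageNetToMR (the map file pairs image names with labels; all paths are illustrative):

    from mindspore.mindrecord import ImageNetToMR

    imagenet_transformer = ImageNetToMR("./labels_map.txt", "./imagenet/train", "./imagenet.mindrecord", partition_number=8)
    imagenet_transformer.transform()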
@@ -42,7 +42,7 @@ class MnistToMR:
             train-images-idx3-ubyte.gz, t10k-labels-idx1-ubyte.gz
             and train-labels-idx1-ubyte.gz.
         destination (str): the MindRecord file directory to transform into.
-        partition_number (int, optional): partition size (default=1).
+        partition_number (int, optional): partition size. Default: 1.

     Raises:
         ValueError: If `source`, `destination`, `partition_number` is invalid.
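A sketch of MnistToMR (the source directory holds the gzipped MNIST files listed above; paths are illustrative):

    from mindspore.mindrecord import MnistToMR

    mnist_transformer = MnistToMR("./mnist_data", "./mnist.mindrecord", partition_number=1)
    mnist_transformer.transform()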
@@ -70,17 +70,17 @@ class TFRecordToMR:
     Args:
         source (str): the TFRecord file to be transformed.
         destination (str): the MindRecord file path to transform into.
-        feature_dict (dict): a dictionary that states the feature type, e.g.
-            feature_dict = {"xxxx": tf.io.FixedLenFeature([], tf.string), \
-                            "yyyy": tf.io.FixedLenFeature([], tf.int64)}
-
-            **Follow case which uses VarLenFeature is not supported.**
-
-            feature_dict = {"context": {"xxxx": tf.io.FixedLenFeature([], tf.string), \
-                                        "yyyy": tf.io.VarLenFeature(tf.int64)}, \
-                            "sequence": {"zzzz": tf.io.FixedLenSequenceFeature([], tf.float32)}}
+        feature_dict (dict): a dictionary that states the feature type,
         bytes_fields (list, optional): the bytes fields which are in `feature_dict` and can be images bytes.

+    Examples:
+        >>> feature_dict = {"xxxx": tf.io.FixedLenFeature([], tf.string),
+        ...                 "yyyy": tf.io.FixedLenFeature([], tf.int64)}
+        >>> # Follow case which uses VarLenFeature is not supported.
+        >>> feature_dict = {"context": {"xxxx": tf.io.FixedLenFeature([], tf.string),
+        ...                             "yyyy": tf.io.VarLenFeature(tf.int64)},
+        ...                 "sequence": {"zzzz": tf.io.FixedLenSequenceFeature([], tf.float32)}}
+
     Raises:
         ValueError: If parameter is invalid.
         Exception: when tensorflow module is not found or version is not correct.
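A sketch of TFRecordToMR with a feature dictionary like the one in the Examples above (paths and field names are illustrative; tensorflow must be installed):

    import tensorflow as tf
    from mindspore.mindrecord import TFRecordToMR

    feature_dict = {"image": tf.io.FixedLenFeature([], tf.string),
                    "label": tf.io.FixedLenFeature([], tf.int64)}
    tfrecord_transformer = TFRecordToMR("./data.tfrecord", "./tfrecord.mindrecord", feature_dict, bytes_fields=["image"])
    tfrecord_transformer.transform()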