!3318 [MD] add pydoc for save ops

Merge pull request !3318 from liyong126/fix_save_doc
This commit is contained in:
mindspore-ci-bot 2020-07-23 10:40:24 +08:00 committed by Gitee
commit 7b54fd8304
2 changed files with 55 additions and 2 deletions

View File

@ -410,6 +410,7 @@ Status DEPipeline::SaveDataset(const std::vector<std::string> &file_names, const
std::vector<std::string> index_fields; std::vector<std::string> index_fields;
s = FetchMetaFromTensorRow(column_name_id_map, row, &mr_json, &index_fields); s = FetchMetaFromTensorRow(column_name_id_map, row, &mr_json, &index_fields);
RETURN_IF_NOT_OK(s); RETURN_IF_NOT_OK(s);
MS_LOG(DEBUG) << "Schema of saved mindrecord: " << mr_json.dump();
if (mindrecord::SUCCESS != if (mindrecord::SUCCESS !=
mindrecord::ShardHeader::initialize(&mr_header, mr_json, index_fields, blob_fields, mr_schema_id)) { mindrecord::ShardHeader::initialize(&mr_header, mr_json, index_fields, blob_fields, mr_schema_id)) {
RETURN_STATUS_UNEXPECTED("Error: failed to initialize ShardHeader."); RETURN_STATUS_UNEXPECTED("Error: failed to initialize ShardHeader.");
@ -569,6 +570,7 @@ Status DEPipeline::FetchMetaFromTensorRow(const std::unordered_map<std::string,
if (column_name_id_map.empty()) { if (column_name_id_map.empty()) {
RETURN_STATUS_UNEXPECTED("Error: column not found."); RETURN_STATUS_UNEXPECTED("Error: column not found.");
} }
json dataset_schema;
for (auto &col : column_name_id_map) { for (auto &col : column_name_id_map) {
auto idx = col.second; auto idx = col.second;
auto column_name = col.first; auto column_name = col.first;
@ -580,6 +582,7 @@ Status DEPipeline::FetchMetaFromTensorRow(const std::unordered_map<std::string,
auto shapes = column_shape.AsVector(); auto shapes = column_shape.AsVector();
std::vector<int> mr_shape(shapes.begin(), shapes.end()); std::vector<int> mr_shape(shapes.begin(), shapes.end());
std::string el = column_type.ToString(); std::string el = column_type.ToString();
dataset_schema[column_name] = el;
if (mindrecord::kTypesMap.find(el) == mindrecord::kTypesMap.end()) { if (mindrecord::kTypesMap.find(el) == mindrecord::kTypesMap.end()) {
std::string err_msg("Error: can not support data type: " + el); std::string err_msg("Error: can not support data type: " + el);
RETURN_STATUS_UNEXPECTED(err_msg); RETURN_STATUS_UNEXPECTED(err_msg);
@ -605,6 +608,7 @@ Status DEPipeline::FetchMetaFromTensorRow(const std::unordered_map<std::string,
if (mr_type == "bytes" || !mr_shape.empty()) continue; if (mr_type == "bytes" || !mr_shape.empty()) continue;
index_fields->emplace_back(column_name); // candidate of index fields index_fields->emplace_back(column_name); // candidate of index fields
} }
MS_LOG(DEBUG) << "Schema of dataset: " << dataset_schema.dump();
return Status::OK(); return Status::OK();
} }
Status DEPipeline::BuildMindrecordSamplerChain(const py::handle &handle, Status DEPipeline::BuildMindrecordSamplerChain(const py::handle &handle,

View File

@ -1042,12 +1042,61 @@ class Dataset:
""" """
Save the dynamic data processed by dataset pipeline as common dataset format, support: mindrecord. Save the dynamic data processed by dataset pipeline as common dataset format, support: mindrecord.
Implicit type casting is performed when saving data as mindrecord. The table below shows how each type is cast.
.. list-table:: Implicit Type Casting of Saving as mindrecord
:widths: 25 25 50
:header-rows: 1
* - type in 'dataset'
- type in 'mindrecord'
- detail
* - DE_BOOL
- None
- Not supported
* - DE_INT8
- int32
-
* - DE_UINT8
- bytes(1D uint8)
- Drop dimension
* - DE_INT16
- int32
-
* - DE_UINT16
- int32
-
* - DE_INT32
- int32
-
* - DE_UINT32
- int64
-
* - DE_INT64
- int64
-
* - DE_UINT64
- None
- Not supported
* - DE_FLOAT16
- float32
-
* - DE_FLOAT32
- float32
-
* - DE_FLOAT64
- float64
-
* - DE_STRING
- string
- Multi-dimensional DE_STRING is not supported
Note: Note:
1. To save the samples in order, set the dataset's shuffle to False and num_files to 1. 1. To save the samples in order, set the dataset's shuffle to False and num_files to 1.
2. Before calling this function, do not use the batch or repeat operators, or data augmentation operators 2. Before calling this function, do not use the batch or repeat operators, or data augmentation operators
with random attributes in the map operator. with random attributes in the map operator.
3. Mindreocrd do not support np.uint64, multi-dimensional np.uint8(drop dimension) and 3. Mindrecord does not support DE_UINT64, multi-dimensional DE_UINT8(drop dimension) and
multi-dimensional string. multi-dimensional DE_STRING.
Args: Args:
file_name (str): Path to dataset file. file_name (str): Path to dataset file.