!47949 [MD] Fix Bug Printing Issue And Reclarify TFRecord Docs
Merge pull request !47949 from davidanugraha/bug_tfrecord_fix
Commit: acdb30b7d4
@ -12,8 +12,9 @@ mindspore.dataset.TFRecordDataset
     - **columns_list** (list[str], optional) - Data columns to be read from the TFRecord files. Default: None, read all columns.
     - **num_samples** (int, optional) - Number of samples to be read from the dataset. Default: None, read all samples.

-      - If `num_samples` is None and the numRows field (defined by the parameter `schema`) does not exist, read the full dataset;
-      - If `num_samples` is None and the numRows field (defined by the parameter `schema`) is greater than 0, read numRows rows;
+      - If `num_samples` is None and the numRows field (defined by the parameter `schema`) does not exist, read the full dataset.
+      - If `compression_type` is not None, `num_samples` is None, and the numRows field (defined by the parameter `schema`) is greater than 0, read the full dataset.
+      - If `compression_type` is None, `num_samples` is None, and the numRows field (defined by the parameter `schema`) is greater than 0, read numRows rows.
       - If both `num_samples` and the numRows field (defined by the parameter `schema`) are greater than 0, only `num_samples` takes effect and that many rows are read.
       - If `compression_type` is not None and `num_samples` is provided, then `num_samples` is the number of rows to be read per shard from the compressed files.

     It is strongly recommended to provide `num_samples` when `compression_type` is "GZIP" or "ZLIB" to avoid performance degradation.
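To make these rules concrete, here is a minimal usage sketch of the documented behaviour; the file names, column names, and shard settings are illustrative assumptions, not part of this commit:

import mindspore.dataset as ds

# Compressed shards: `num_samples` is strongly recommended for "GZIP"/"ZLIB"
# and is interpreted per shard (here 2 shards x 100 rows = 200 rows across shards).
dataset = ds.TFRecordDataset(
    dataset_files=["data_0.tfrecord.gz", "data_1.tfrecord.gz"],
    columns_list=["image", "label"],  # omit to read all columns
    num_samples=100,                  # rows read per shard for compressed files
    num_shards=2,
    shard_id=0,
    compression_type="GZIP",
)

# Uncompressed files with num_samples=None and no numRows in a schema:
# the full dataset is read.
full = ds.TFRecordDataset(dataset_files=["data_0.tfrecord"])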
@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2022 Huawei Technologies Co., Ltd
+ * Copyright 2020-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@ -240,7 +240,7 @@ class DATASET_API Dataset : public std::enable_shared_from_this<Dataset> {
 ///    corresponds to the value to pad with. If a column is not specified, then that column will be padded to the
 ///    longest in the current batch, and 0 will be used as the padding value. Any unspecified dimensions will be
 ///    padded to the longest in the current batch, unless if pad_to_bucket_boundary is true. If no padding is
-///    wanted, set pad_info to None (default=empty dictionary).
+///    wanted, set pad_info to empty map (default=empty map).
 /// \param[in] pad_to_bucket_boundary If true, will pad each unspecified dimension in pad_info to the
 ///    bucket_boundary minus 1. If there are any elements that fall into the last bucket,
 ///    an error will occur (default=false).
@ -777,7 +777,7 @@ class DATASET_API BucketBatchByLengthDataset : public Dataset {
 ///    corresponds to the value to pad with. If a column is not specified, then that column will be padded to the
 ///    longest in the current batch, and 0 will be used as the padding value. Any unspecified dimensions will be
 ///    padded to the longest in the current batch, unless if pad_to_bucket_boundary is true. If no padding is
-///    wanted, set pad_info to None (default=empty dictionary).
+///    wanted, set pad_info to empty map (default=empty map).
 /// \param[in] pad_to_bucket_boundary If true, will pad each unspecified dimension in pad_info to the
 ///    bucket_boundary minus 1. If there are any elements that fall into the last bucket,
 ///    an error will occur (default=false).
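Both hunks correct the same pad_info sentence, so a short Python sketch of that parameter may help; the generator, column name, and bucket sizes below are assumptions for illustration only:

import numpy as np
import mindspore.dataset as ds

def gen():
    # three rows of different lengths
    for n in (3, 5, 8):
        yield (np.arange(n, dtype=np.int32),)

data = ds.GeneratorDataset(gen, column_names=["text"])

# Bucket rows by length, padding "text" to length 10 with value 0.
# In the C++ API, "no padding" is expressed with an empty map (the fix above),
# while the Python API uses pad_info=None.
data = data.bucket_batch_by_length(
    column_names=["text"],
    bucket_boundaries=[6],      # two buckets: length < 6 and length >= 6
    bucket_batch_sizes=[2, 2],  # one batch size per bucket
    pad_info={"text": ([10], 0)},
)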
@ -5729,10 +5729,13 @@ class DATASET_API TFRecordDataset : public Dataset {
 /// \param[in] columns_list List of columns to be read. (Default = {}, read all columns).
 /// \param[in] num_samples The number of samples to be included in the dataset.
 ///     (Default = 0 means all samples).
-///     If `num_samples` is 0 and numRows(parsed from schema) does not exist, read the full dataset;
-///     If `num_samples` is 0 and numRows(parsed from schema) is greater than 0, read numRows rows;
+///     If `num_samples` is 0 and numRows(parsed from `schema` ) does not exist, read the full dataset.
+///     If `compression_type` is not "" and `num_samples` is 0, and numRows(parsed from `schema` ) is
+///     greater than 0, read the full dataset.
+///     If `compression_type` is "", `num_samples` is 0, and numRows(parsed from `schema` ) is
+///     greater than 0, read numRows rows.
 ///     If both `num_samples` and numRows(parsed from schema) are greater than 0, read `num_samples` rows.
-///     If `compression_type` is not None and `num_samples` is provided, then `num_samples` will be
+///     If `compression_type` is not "" and `num_samples` is provided, then `num_samples` will be
 ///     interpreted as number of rows to be read per shard from the compressed files.
 ///     It is highly recommended to provide `num_samples` when `compression_type` is "GZIP" or "ZLIB"
 ///     to avoid performance degradation.
@ -5756,8 +5759,6 @@ class DATASET_API TFRecordDataset : public Dataset {
 ///     "" - No compression is used.
 ///     "GZIP" - GZIP compression is used.
 ///     "ZLIB" - ZLIB compression is used.
-///     This will automatically get equal rows for all shards and thus cannot have the case
-///     where `num_samples` is None.
 TFRecordDataset(const std::vector<std::vector<char>> &dataset_files, const std::vector<char> &schema,
                 const std::vector<std::vector<char>> &columns_list, int64_t num_samples, ShuffleMode shuffle,
                 int32_t num_shards, int32_t shard_id, bool shard_equal_rows,
@ -5771,10 +5772,13 @@ class DATASET_API TFRecordDataset : public Dataset {
 /// \param[in] columns_list List of columns to be read (Default = {}, read all columns).
 /// \param[in] num_samples The number of samples to be included in the dataset
 ///     (Default = 0 means all samples).
-///     If num_samples is 0 and numRows(parsed from schema) does not exist, read the full dataset;
-///     If num_samples is 0 and numRows(parsed from schema) is greater than 0, read numRows rows;
+///     If `num_samples` is 0 and numRows(parsed from `schema` ) does not exist, read the full dataset.
+///     If `compression_type` is not "", `num_samples` is 0, and numRows(parsed from `schema` ) is
+///     greater than 0, read the full dataset.
+///     If `compression_type` is "", `num_samples` is 0, and numRows(parsed from `schema` ) is
+///     greater than 0, read numRows rows.
 ///     If both num_samples and numRows(parsed from schema) are greater than 0, read num_samples rows.
-///     If `compression_type` is not None and `num_samples` is provided, then `num_samples` will be
+///     If `compression_type` is not "" and `num_samples` is provided, then `num_samples` will be
 ///     interpreted as number of rows to be read per shard from the compressed files.
 ///     It is highly recommended to provide `num_samples` when `compression_type` is "GZIP" or "ZLIB"
 ///     to avoid performance degradation.
@ -5798,9 +5802,6 @@ class DATASET_API TFRecordDataset : public Dataset {
 ///     "" - No compression is used.
 ///     "GZIP" - GZIP compression is used.
 ///     "ZLIB" - ZLIB compression is used.
-///     This will automatically get equal rows for all shards and thus cannot have the case
-///     where `num_samples` is None.
-
 TFRecordDataset(const std::vector<std::vector<char>> &dataset_files, const std::shared_ptr<SchemaObj> &schema,
                 const std::vector<std::vector<char>> &columns_list, int64_t num_samples, ShuffleMode shuffle,
                 int32_t num_shards, int32_t shard_id, bool shard_equal_rows,
@ -5818,10 +5819,13 @@ class DATASET_API TFRecordDataset : public Dataset {
 /// \param[in] columns_list List of columns to be read (Default = {}, read all columns).
 /// \param[in] num_samples The number of samples to be included in the dataset
 ///     (Default = 0 means all samples).
-///     If num_samples is 0 and numRows(parsed from schema) does not exist, read the full dataset;
-///     If num_samples is 0 and numRows(parsed from schema) is greater than 0, read numRows rows;
+///     If `num_samples` is 0 and numRows(parsed from `schema` ) does not exist, read the full dataset.
+///     If `compression_type` is not "", `num_samples` is 0, and numRows(parsed from `schema` ) is
+///     greater than 0, read the full dataset.
+///     If `compression_type` is "", `num_samples` is 0, and numRows(parsed from `schema` ) is
+///     greater than 0, read numRows rows.
 ///     If both num_samples and numRows(parsed from schema) are greater than 0, read num_samples rows.
-///     If `compression_type` is not None and `num_samples` is provided, then `num_samples` will be
+///     If `compression_type` is not "" and `num_samples` is provided, then `num_samples` will be
 ///     interpreted as number of rows to be read per shard from the compressed files.
 ///     It is highly recommended to provide `num_samples` when `compression_type` is "GZIP" or "ZLIB"
 ///     to avoid performance degradation.
@ -5845,8 +5849,6 @@ class DATASET_API TFRecordDataset : public Dataset {
 ///     "" - No compression is used.
 ///     "GZIP" - GZIP compression is used.
 ///     "ZLIB" - ZLIB compression is used.
-///     This will automatically get equal rows for all shards and thus cannot have the case
-///     where `num_samples` is None.
 /// \return Shared pointer to the TFRecordDataset.
 /// \par Example
 /// \code
@ -1,4 +1,4 @@
-# Copyright 2019-2022 Huawei Technologies Co., Ltd
+# Copyright 2019-2023 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -253,8 +253,11 @@ class TFRecordDataset(SourceDataset, UnionBaseDataset):
             Default: None.
         columns_list (list[str], optional): List of columns to be read. Default: None, read all columns.
         num_samples (int, optional): The number of samples (rows) to be included in the dataset. Default: None.
-            If `num_samples` is None and numRows(parsed from `schema` ) does not exist, read the full dataset;
-            If `num_samples` is None and numRows(parsed from `schema` ) is greater than 0, read numRows rows;
+            If `num_samples` is None and numRows(parsed from `schema` ) does not exist, read the full dataset.
+            If `compression_type` is not None, `num_samples` is None, and numRows(parsed from `schema` ) is
+            greater than 0, read the full dataset.
+            If `compression_type` is None, `num_samples` is None, and numRows(parsed from `schema` ) is
+            greater than 0, read numRows rows.
             If both `num_samples` and numRows(parsed from `schema` ) are greater than 0, read `num_samples` rows.
             If `compression_type` is not None and `num_samples` is provided, then `num_samples` will be
             interpreted as the number of rows to be read per shard from the compressed files.
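As a sketch of how numRows enters the picture, the snippet below writes a schema file carrying the numRows field the docstring refers to; the schema contents, file names, and row counts are hypothetical:

import json
import mindspore.dataset as ds

schema_json = {
    "datasetType": "TF",
    "numRows": 3,  # the numRows field parsed from `schema`
    "columns": {"label": {"type": "int64", "rank": 1}},
}
with open("schema.json", "w") as f:
    json.dump(schema_json, f)

# num_samples=None, compression_type=None, numRows=3 -> read numRows (3) rows.
d1 = ds.TFRecordDataset(["data.tfrecord"], schema="schema.json")

# Both num_samples and numRows greater than 0 -> num_samples wins (5 rows).
d2 = ds.TFRecordDataset(["data.tfrecord"], schema="schema.json", num_samples=5)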
@ -1,4 +1,4 @@
-# Copyright 2019-2022 Huawei Technologies Co., Ltd
+# Copyright 2019-2023 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -587,8 +587,8 @@ def check_tfrecorddataset(method):

         compression_type = param_dict.get('compression_type')
         if compression_type is not None and compression_type not in ['', 'ZLIB', 'GZIP']:
-            raise ValueError("Input compression_type can only be either '' (no compression), 'ZLIB', or 'GZIP', \
-                but got '" + str(compression_type) + "'.")
+            raise ValueError("Input compression_type can only be either '' (no compression), 'ZLIB', or 'GZIP', " +
+                             "but got '" + str(compression_type) + "'.")
         if compression_type is not None and compression_type in ['ZLIB', 'GZIP'] and \
                 param_dict.get('num_samples') is not None:
             if param_dict.get('num_shards') is not None and ((isinstance(dataset_files, str) and \
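The "Bug Printing Issue" in the commit title is visible in the removed raise: a backslash line continuation inside a string literal splices the next source line into the message, indentation included. A standalone demonstration (the 'LZ4' value is made up):

old = "can only be either '' (no compression), 'ZLIB', or 'GZIP', \
                but got 'LZ4'."
new = ("can only be either '' (no compression), 'ZLIB', or 'GZIP', " +
       "but got 'LZ4'.")
print(repr(old))  # run of stray spaces before "but got" from the indentation
print(repr(new))  # clean single-line message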