!47949 [MD] Fix Bug Printing Issue And Reclarify TFRecord Docs
Merge pull request !47949 from davidanugraha/bug_tfrecord_fix
Commit: acdb30b7d4
@ -12,8 +12,9 @@ mindspore.dataset.TFRecordDataset
     - **columns_list** (list[str], optional) - Data columns to be read from the TFRecord files. Default: None, read all columns.
     - **num_samples** (int, optional) - Number of samples to be read from the dataset. Default: None, read all samples.

-      - If `num_samples` is None and the numRows field (defined by the parameter `schema`) does not exist, read the full dataset;
-      - If `num_samples` is None and the numRows field (defined by the parameter `schema`) is greater than 0, read numRows rows;
+      - If `num_samples` is None and the numRows field (defined by the parameter `schema`) does not exist, read the full dataset.
+      - If `compression_type` is not None, `num_samples` is None, and the numRows field (defined by the parameter `schema`) is greater than 0, read the full dataset.
+      - If `compression_type` is None, `num_samples` is None, and the numRows field (defined by the parameter `schema`) is greater than 0, read numRows rows.
       - If both `num_samples` and the numRows field (defined by the parameter `schema`) are greater than 0, only `num_samples` takes effect and that many rows are read.
       - If `compression_type` is not None and `num_samples` is provided, then `num_samples` is the number of rows to be read per shard from the compressed files.

     It is strongly recommended to provide `num_samples` when `compression_type` is "GZIP" or "ZLIB" to avoid performance degradation.
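To make these rules concrete, here is a minimal usage sketch of the documented behaviour; the file names, column names, and shard settings are illustrative assumptions, not part of this commit:

import mindspore.dataset as ds

# Compressed shards: `num_samples` is strongly recommended for "GZIP"/"ZLIB"
# and is interpreted per shard (here 2 shards x 100 rows = 200 rows across shards).
dataset = ds.TFRecordDataset(
    dataset_files=["data_0.tfrecord.gz", "data_1.tfrecord.gz"],
    columns_list=["image", "label"],  # omit to read all columns
    num_samples=100,                  # rows read per shard for compressed files
    num_shards=2,
    shard_id=0,
    compression_type="GZIP",
)

# Uncompressed files with num_samples=None and no numRows in a schema:
# the full dataset is read.
full = ds.TFRecordDataset(dataset_files=["data_0.tfrecord"])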
@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2022 Huawei Technologies Co., Ltd
+ * Copyright 2020-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@ -240,7 +240,7 @@ class DATASET_API Dataset : public std::enable_shared_from_this<Dataset> {
 ///    corresponds to the value to pad with. If a column is not specified, then that column will be padded to the
 ///    longest in the current batch, and 0 will be used as the padding value. Any unspecified dimensions will be
 ///    padded to the longest in the current batch, unless if pad_to_bucket_boundary is true. If no padding is
-///    wanted, set pad_info to None (default=empty dictionary).
+///    wanted, set pad_info to empty map (default=empty map).
 /// \param[in] pad_to_bucket_boundary If true, will pad each unspecified dimension in pad_info to the
 ///    bucket_boundary minus 1. If there are any elements that fall into the last bucket,
 ///    an error will occur (default=false).
@ -777,7 +777,7 @@ class DATASET_API BucketBatchByLengthDataset : public Dataset {
 ///    corresponds to the value to pad with. If a column is not specified, then that column will be padded to the
 ///    longest in the current batch, and 0 will be used as the padding value. Any unspecified dimensions will be
 ///    padded to the longest in the current batch, unless if pad_to_bucket_boundary is true. If no padding is
-///    wanted, set pad_info to None (default=empty dictionary).
+///    wanted, set pad_info to empty map (default=empty map).
 /// \param[in] pad_to_bucket_boundary If true, will pad each unspecified dimension in pad_info to the
 ///    bucket_boundary minus 1. If there are any elements that fall into the last bucket,
 ///    an error will occur (default=false).
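Both hunks correct the same pad_info sentence, so a short Python sketch of that parameter may help; the generator, column name, and bucket sizes below are assumptions for illustration only:

import numpy as np
import mindspore.dataset as ds

def gen():
    # three rows of different lengths
    for n in (3, 5, 8):
        yield (np.arange(n, dtype=np.int32),)

data = ds.GeneratorDataset(gen, column_names=["text"])

# Bucket rows by length, padding "text" to length 10 with value 0.
# In the C++ API, "no padding" is expressed with an empty map (the fix above),
# while the Python API uses pad_info=None.
data = data.bucket_batch_by_length(
    column_names=["text"],
    bucket_boundaries=[6],      # two buckets: length < 6 and length >= 6
    bucket_batch_sizes=[2, 2],  # one batch size per bucket
    pad_info={"text": ([10], 0)},
)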
@ -5729,10 +5729,13 @@ class DATASET_API TFRecordDataset : public Dataset {
 /// \param[in] columns_list List of columns to be read. (Default = {}, read all columns).
 /// \param[in] num_samples The number of samples to be included in the dataset.
 ///     (Default = 0 means all samples).
-///     If `num_samples` is 0 and numRows(parsed from schema) does not exist, read the full dataset;
-///     If `num_samples` is 0 and numRows(parsed from schema) is greater than 0, read numRows rows;
+///     If `num_samples` is 0 and numRows(parsed from `schema` ) does not exist, read the full dataset.
+///     If `compression_type` is not "" and `num_samples` is 0, and numRows(parsed from `schema` ) is
+///     greater than 0, read the full dataset.
+///     If `compression_type` is "", `num_samples` is 0, and numRows(parsed from `schema` ) is
+///     greater than 0, read numRows rows.
 ///     If both `num_samples` and numRows(parsed from schema) are greater than 0, read `num_samples` rows.
-///     If `compression_type` is not None and `num_samples` is provided, then `num_samples` will be
+///     If `compression_type` is not "" and `num_samples` is provided, then `num_samples` will be
 ///     interpreted as number of rows to be read per shard from the compressed files.
 ///     It is highly recommended to provide `num_samples` when `compression_type` is "GZIP" or "ZLIB"
 ///     to avoid performance degradation.
@ -5756,8 +5759,6 @@ class DATASET_API TFRecordDataset : public Dataset {
 ///     "" - No compression is used.
 ///     "GZIP" - GZIP compression is used.
 ///     "ZLIB" - ZLIB compression is used.
-///     This will automatically get equal rows for all shards and thus cannot have the case
-///     where `num_samples` is None.
 TFRecordDataset(const std::vector<std::vector<char>> &dataset_files, const std::vector<char> &schema,
                 const std::vector<std::vector<char>> &columns_list, int64_t num_samples, ShuffleMode shuffle,
                 int32_t num_shards, int32_t shard_id, bool shard_equal_rows,
@ -5771,10 +5772,13 @@ class DATASET_API TFRecordDataset : public Dataset {
 /// \param[in] columns_list List of columns to be read (Default = {}, read all columns).
 /// \param[in] num_samples The number of samples to be included in the dataset
 ///     (Default = 0 means all samples).
-///     If num_samples is 0 and numRows(parsed from schema) does not exist, read the full dataset;
-///     If num_samples is 0 and numRows(parsed from schema) is greater than 0, read numRows rows;
+///     If `num_samples` is 0 and numRows(parsed from `schema` ) does not exist, read the full dataset.
+///     If `compression_type` is not "", `num_samples` is 0, and numRows(parsed from `schema` ) is
+///     greater than 0, read the full dataset.
+///     If `compression_type` is "", `num_samples` is 0, and numRows(parsed from `schema` ) is
+///     greater than 0, read numRows rows.
 ///     If both num_samples and numRows(parsed from schema) are greater than 0, read num_samples rows.
-///     If `compression_type` is not None and `num_samples` is provided, then `num_samples` will be
+///     If `compression_type` is not "" and `num_samples` is provided, then `num_samples` will be
 ///     interpreted as number of rows to be read per shard from the compressed files.
 ///     It is highly recommended to provide `num_samples` when `compression_type` is "GZIP" or "ZLIB"
 ///     to avoid performance degradation.
@ -5798,9 +5802,6 @@ class DATASET_API TFRecordDataset : public Dataset {
 ///     "" - No compression is used.
 ///     "GZIP" - GZIP compression is used.
 ///     "ZLIB" - ZLIB compression is used.
-///     This will automatically get equal rows for all shards and thus cannot have the case
-///     where `num_samples` is None.
-
 TFRecordDataset(const std::vector<std::vector<char>> &dataset_files, const std::shared_ptr<SchemaObj> &schema,
                 const std::vector<std::vector<char>> &columns_list, int64_t num_samples, ShuffleMode shuffle,
                 int32_t num_shards, int32_t shard_id, bool shard_equal_rows,
@ -5818,10 +5819,13 @@ class DATASET_API TFRecordDataset : public Dataset {
 /// \param[in] columns_list List of columns to be read (Default = {}, read all columns).
 /// \param[in] num_samples The number of samples to be included in the dataset
 ///     (Default = 0 means all samples).
-///     If num_samples is 0 and numRows(parsed from schema) does not exist, read the full dataset;
-///     If num_samples is 0 and numRows(parsed from schema) is greater than 0, read numRows rows;
+///     If `num_samples` is 0 and numRows(parsed from `schema` ) does not exist, read the full dataset.
+///     If `compression_type` is not "", `num_samples` is 0, and numRows(parsed from `schema` ) is
+///     greater than 0, read the full dataset.
+///     If `compression_type` is "", `num_samples` is 0, and numRows(parsed from `schema` ) is
+///     greater than 0, read numRows rows.
 ///     If both num_samples and numRows(parsed from schema) are greater than 0, read num_samples rows.
-///     If `compression_type` is not None and `num_samples` is provided, then `num_samples` will be
+///     If `compression_type` is not "" and `num_samples` is provided, then `num_samples` will be
 ///     interpreted as number of rows to be read per shard from the compressed files.
 ///     It is highly recommended to provide `num_samples` when `compression_type` is "GZIP" or "ZLIB"
 ///     to avoid performance degradation.
@ -5845,8 +5849,6 @@ class DATASET_API TFRecordDataset : public Dataset {
 ///     "" - No compression is used.
 ///     "GZIP" - GZIP compression is used.
 ///     "ZLIB" - ZLIB compression is used.
-///     This will automatically get equal rows for all shards and thus cannot have the case
-///     where `num_samples` is None.
 /// \return Shared pointer to the TFRecordDataset.
 /// \par Example
 /// \code
@ -1,4 +1,4 @@
-# Copyright 2019-2022 Huawei Technologies Co., Ltd
+# Copyright 2019-2023 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -253,8 +253,11 @@ class TFRecordDataset(SourceDataset, UnionBaseDataset):
             Default: None.
         columns_list (list[str], optional): List of columns to be read. Default: None, read all columns.
         num_samples (int, optional): The number of samples (rows) to be included in the dataset. Default: None.
-            If `num_samples` is None and numRows(parsed from `schema` ) does not exist, read the full dataset;
-            If `num_samples` is None and numRows(parsed from `schema` ) is greater than 0, read numRows rows;
+            If `num_samples` is None and numRows(parsed from `schema` ) does not exist, read the full dataset.
+            If `compression_type` is not None, `num_samples` is None, and numRows(parsed from `schema` ) is
+            greater than 0, read the full dataset.
+            If `compression_type` is None, `num_samples` is None, and numRows(parsed from `schema` ) is
+            greater than 0, read numRows rows.
             If both `num_samples` and numRows(parsed from `schema` ) are greater than 0, read `num_samples` rows.
             If `compression_type` is not None and `num_samples` is provided, then `num_samples` will be
             interpreted as the number of rows to be read per shard from the compressed files.
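As a sketch of how numRows enters the picture, the snippet below writes a schema file carrying the numRows field the docstring refers to; the schema contents, file names, and row counts are hypothetical:

import json
import mindspore.dataset as ds

schema_json = {
    "datasetType": "TF",
    "numRows": 3,  # the numRows field parsed from `schema`
    "columns": {"label": {"type": "int64", "rank": 1}},
}
with open("schema.json", "w") as f:
    json.dump(schema_json, f)

# num_samples=None, compression_type=None, numRows=3 -> read numRows (3) rows.
d1 = ds.TFRecordDataset(["data.tfrecord"], schema="schema.json")

# Both num_samples and numRows greater than 0 -> num_samples wins (5 rows).
d2 = ds.TFRecordDataset(["data.tfrecord"], schema="schema.json", num_samples=5)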
@ -1,4 +1,4 @@
-# Copyright 2019-2022 Huawei Technologies Co., Ltd
+# Copyright 2019-2023 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -587,8 +587,8 @@ def check_tfrecorddataset(method):

         compression_type = param_dict.get('compression_type')
         if compression_type is not None and compression_type not in ['', 'ZLIB', 'GZIP']:
-            raise ValueError("Input compression_type can only be either '' (no compression), 'ZLIB', or 'GZIP', \
-                but got '" + str(compression_type) + "'.")
+            raise ValueError("Input compression_type can only be either '' (no compression), 'ZLIB', or 'GZIP', " +
+                             "but got '" + str(compression_type) + "'.")
         if compression_type is not None and compression_type in ['ZLIB', 'GZIP'] and \
                 param_dict.get('num_samples') is not None:
             if param_dict.get('num_shards') is not None and ((isinstance(dataset_files, str) and \
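The "Bug Printing Issue" in the commit title is visible in the removed raise: a backslash line continuation inside a string literal splices the next source line into the message, indentation included. A standalone demonstration (the 'LZ4' value is made up):

old = "can only be either '' (no compression), 'ZLIB', or 'GZIP', \
                but got 'LZ4'."
new = ("can only be either '' (no compression), 'ZLIB', or 'GZIP', " +
       "but got 'LZ4'.")
print(repr(old))  # run of stray spaces before "but got" from the indentation
print(repr(new))  # clean single-line message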