Improve error log messages and API documentation comments

This commit is contained in:
ms_yan 2021-07-09 19:36:36 +08:00
parent da9957ef58
commit a58e64b27c
5 changed files with 32 additions and 10 deletions

View File

@ -205,7 +205,7 @@ Status BatchOp::BatchRows(const std::unique_ptr<TensorQTable> *src, TensorRow *d
first_shape.Print(shape1);
old_tensor->shape().Print(shape2);
RETURN_STATUS_UNEXPECTED(
"Invalid data, expect same shape for each data row, but got inconsistent data shapes in column " +
"Invalid data, batch operation expects the same shape for each data row, but got inconsistent shape in column " +
std::to_string(i) + " expected shape for this column is:" + shape1.str() + ", got shape:" + shape2.str());
}
}

View File

@ -101,7 +101,7 @@ Status GeneratorOp::PyRowToTensorRow(py::object py_data, TensorRow *tensor_row)
"Invalid data, Generator should return same number of NumPy arrays as specified in column_names, the size of"
" column_names is:" +
std::to_string(column_names_.size()) +
"and number of returned NumPy array is:" + std::to_string(py_row.size()));
" and number of returned NumPy array is:" + std::to_string(py_row.size()));
}
// Iterate over two containers simultaneously for memory copy
for (int i = 0; i < py_row.size(); ++i) {

View File

@ -776,14 +776,11 @@ class Dataset:
@check_repeat
def repeat(self, count=None):
"""
Repeat this dataset N times where N = count. Repeat infinitely if the count is None or -1.
Repeat this dataset `count` times. Repeat infinitely if the count is None or -1.
Note:
The order of using repeat and batch reflects the number of batches. It is recommended that
the repeat operation be used after the batch operation.
If dataset_sink_mode is False, the repeat operation is invalid.
If dataset_sink_mode is True, repeat count must be equal to the epoch of training. Otherwise,
errors could occur since the amount of data is not the amount training requires.
Args:
count (int): Number of times the dataset is going to be repeated (default=None).

View File

@ -334,6 +334,9 @@ class SentencePieceTokenizer(TextTensorOperation):
out_type (SPieceTokenizerOutType): The type of output, it can be any of [SPieceTokenizerOutType.STRING,
SPieceTokenizerOutType.INT].
- SPieceTokenizerOutType.STRING, means output type of SentencePiece Tokenizer is string.
- SPieceTokenizerOutType.INT, means output type of SentencePiece Tokenizer is int.
Examples:
>>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType
>>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
@ -573,8 +576,16 @@ if platform.system().lower() != 'windows':
on input text to fold the text to lower case and strip accents characters. If False, only apply
NormalizeUTF8 operation with the specified mode on input text (default=False).
keep_whitespace (bool, optional): If True, the whitespace will be kept in output tokens (default=False).
normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode. This is
only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
normalization_form (NormalizeForm, optional): Used to specify the normalization mode
(default=NormalizeForm.NONE). This is only effective when `lower_case` is False. It can be any of
[NormalizeForm.NONE, NormalizeForm.NFC, NormalizeForm.NFKC, NormalizeForm.NFD, NormalizeForm.NFKD].
- NormalizeForm.NONE, do nothing for input string tensor.
- NormalizeForm.NFC, normalize with Normalization Form C.
- NormalizeForm.NFKC, normalize with Normalization Form KC.
- NormalizeForm.NFD, normalize with Normalization Form D.
- NormalizeForm.NFKD, normalize with Normalization Form KD.
preserve_unused_token (bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

View File

@ -152,7 +152,14 @@ class SentencePieceVocab(cde.SentencePieceVocab):
character set.
model_type(SentencePieceModel): It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
sentence must be pre-tokenized when using word type.
sentence must be pre-tokenized when using SentencePieceModel.WORD type.
- SentencePieceModel.UNIGRAM, Unigram Language Model means the next word in the sentence is assumed to
be independent of the previous words generated by the model.
- SentencePieceModel.BPE, refers to byte pair encoding algorithm, which replaces the most frequent pair
of bytes in a sentence with a single, unused byte.
- SentencePieceModel.CHAR, refers to character-based SentencePiece model type.
- SentencePieceModel.WORD, refers to word-based SentencePiece model type.
params(dict): A dictionary with no incoming parameters.
@ -177,7 +184,14 @@ class SentencePieceVocab(cde.SentencePieceVocab):
character set.
model_type(SentencePieceModel): It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
sentence must be pre-tokenized when using word type.
sentence must be pre-tokenized when using SentencePieceModel.WORD type.
- SentencePieceModel.UNIGRAM, Unigram Language Model means the next word in the sentence is assumed to
be independent of the previous words generated by the model.
- SentencePieceModel.BPE, refers to byte pair encoding algorithm, which replaces the most frequent pair
of bytes in a sentence with a single, unused byte.
- SentencePieceModel.CHAR, refers to character-based SentencePiece model type.
- SentencePieceModel.WORD, refers to word-based SentencePiece model type.
params(dict): A dictionary with no incoming parameters(The parameters are derived from SentencePiece
library).