Improve error log messages and API documentation comments

This commit is contained in:
ms_yan 2021-07-09 19:36:36 +08:00
parent da9957ef58
commit a58e64b27c
5 changed files with 32 additions and 10 deletions

View File

@ -205,7 +205,7 @@ Status BatchOp::BatchRows(const std::unique_ptr<TensorQTable> *src, TensorRow *d
first_shape.Print(shape1);
old_tensor->shape().Print(shape2);
RETURN_STATUS_UNEXPECTED(
"Invalid data, expect same shape for each data row, but got inconsistent data shapes in column " +
"Invalid data, batch operation expects the same shape for each data row, but got inconsistent shape in column " +
std::to_string(i) + " expected shape for this column is:" + shape1.str() + ", got shape:" + shape2.str());
}
}

View File

@ -101,7 +101,7 @@ Status GeneratorOp::PyRowToTensorRow(py::object py_data, TensorRow *tensor_row)
"Invalid data, Generator should return same number of NumPy arrays as specified in column_names, the size of"
" column_names is:" +
std::to_string(column_names_.size()) +
"and number of returned NumPy array is:" + std::to_string(py_row.size()));
" and number of returned NumPy array is:" + std::to_string(py_row.size()));
}
// Iterate over two containers simultaneously for memory copy
for (int i = 0; i < py_row.size(); ++i) {

View File

@ -776,14 +776,11 @@ class Dataset:
@check_repeat
def repeat(self, count=None):
"""
Repeat this dataset N times where N = count. Repeat infinitely if the count is None or -1.
Repeat this dataset `count` times. Repeat infinitely if the count is None or -1.
Note:
The order of using repeat and batch reflects the number of batches. It is recommended that
the repeat operation be used after the batch operation.
If dataset_sink_mode is False, the repeat operation is invalid.
If dataset_sink_mode is True, repeat count must be equal to the epoch of training. Otherwise,
errors could occur since the amount of data is not the amount training requires.
Args:
count (int): Number of times the dataset is going to be repeated (default=None).

View File

@ -334,6 +334,9 @@ class SentencePieceTokenizer(TextTensorOperation):
out_type (SPieceTokenizerOutType): The type of output, it can be any of [SPieceTokenizerOutType.STRING,
SPieceTokenizerOutType.INT].
- SPieceTokenizerOutType.STRING, means output type of SentencePiece Tokenizer is string.
- SPieceTokenizerOutType.INT, means output type of SentencePiece Tokenizer is int.
Examples:
>>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType
>>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
@ -573,8 +576,16 @@ if platform.system().lower() != 'windows':
on input text to fold the text to lower case and strip accents characters. If False, only apply
NormalizeUTF8 operation with the specified mode on input text (default=False).
keep_whitespace (bool, optional): If True, the whitespace will be kept in output tokens (default=False).
normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode. This is
only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
normalization_form (NormalizeForm, optional): Used to specify the normalization mode
(default=NormalizeForm.NONE). This is only effective when `lower_case` is False. It can be any of
[NormalizeForm.NONE, NormalizeForm.NFC, NormalizeForm.NFKC, NormalizeForm.NFD, NormalizeForm.NFKD].
- NormalizeForm.NONE, do nothing for input string tensor.
- NormalizeForm.NFC, normalize with Normalization Form C.
- NormalizeForm.NFKC, normalize with Normalization Form KC.
- NormalizeForm.NFD, normalize with Normalization Form D.
- NormalizeForm.NFKD, normalize with Normalization Form KD.
preserve_unused_token (bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

View File

@ -152,7 +152,14 @@ class SentencePieceVocab(cde.SentencePieceVocab):
character set.
model_type(SentencePieceModel): It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
sentence must be pre-tokenized when using word type.
sentence must be pre-tokenized when using SentencePieceModel.WORD type.
- SentencePieceModel.UNIGRAM, Unigram Language Model means the next word in the sentence is assumed to
be independent of the previous words generated by the model.
- SentencePieceModel.BPE, refers to byte pair encoding algorithm, which replaces the most frequent pair
of bytes in a sentence with a single, unused byte.
- SentencePieceModel.CHAR, refers to character-based SentencePiece model type.
- SentencePieceModel.WORD, refers to word-based SentencePiece model type.
params(dict): A dictionary with no incoming parameters.
@ -177,7 +184,14 @@ class SentencePieceVocab(cde.SentencePieceVocab):
character set.
model_type(SentencePieceModel): It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
sentence must be pre-tokenized when using word type.
sentence must be pre-tokenized when using SentencePieceModel.WORD type.
- SentencePieceModel.UNIGRAM, Unigram Language Model means the next word in the sentence is assumed to
be independent of the previous words generated by the model.
- SentencePieceModel.BPE, refers to byte pair encoding algorithm, which replaces the most frequent pair
of bytes in a sentence with a single, unused byte.
- SentencePieceModel.CHAR, refers to character-based SentencePiece model type.
- SentencePieceModel.WORD, refers to word-based SentencePiece model type.
params(dict): A dictionary with no incoming parameters(The parameters are derived from SentencePiece
library).