modify log and API comment
This commit is contained in:
parent
da9957ef58
commit
a58e64b27c
|
@ -205,7 +205,7 @@ Status BatchOp::BatchRows(const std::unique_ptr<TensorQTable> *src, TensorRow *d
|
|||
first_shape.Print(shape1);
|
||||
old_tensor->shape().Print(shape2);
|
||||
RETURN_STATUS_UNEXPECTED(
|
||||
"Invalid data, expect same shape for each data row, but got inconsistent data shapes in column " +
|
||||
"Invalid data, batch operation expect same shape for each data row, but got inconsistent shape in column " +
|
||||
std::to_string(i) + " expected shape for this column is:" + shape1.str() + ", got shape:" + shape2.str());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -101,7 +101,7 @@ Status GeneratorOp::PyRowToTensorRow(py::object py_data, TensorRow *tensor_row)
|
|||
"Invalid data, Generator should return same number of NumPy arrays as specified in column_names, the size of"
|
||||
" column_names is:" +
|
||||
std::to_string(column_names_.size()) +
|
||||
"and number of returned NumPy array is:" + std::to_string(py_row.size()));
|
||||
" and number of returned NumPy array is:" + std::to_string(py_row.size()));
|
||||
}
|
||||
// Iterate over two containers simultaneously for memory copy
|
||||
for (int i = 0; i < py_row.size(); ++i) {
|
||||
|
|
|
@ -776,14 +776,11 @@ class Dataset:
|
|||
@check_repeat
|
||||
def repeat(self, count=None):
|
||||
"""
|
||||
Repeat this dataset N times where N = count. Repeat infinitely if the count is None or -1.
|
||||
Repeat this dataset `count` times. Repeat infinitely if the count is None or -1.
|
||||
|
||||
Note:
|
||||
The order of using repeat and batch reflects the number of batches. It is recommended that
|
||||
the repeat operation be used after the batch operation.
|
||||
If dataset_sink_mode is False, the repeat operation is invalid.
|
||||
If dataset_sink_mode is True, repeat count must be equal to the epoch of training. Otherwise,
|
||||
errors could occur since the amount of data is not the amount training requires.
|
||||
|
||||
Args:
|
||||
count (int): Number of times the dataset is going to be repeated (default=None).
|
||||
|
|
|
@ -334,6 +334,9 @@ class SentencePieceTokenizer(TextTensorOperation):
|
|||
out_type (SPieceTokenizerOutType): The type of output, it can be any of [SPieceTokenizerOutType.STRING,
|
||||
SPieceTokenizerOutType.INT].
|
||||
|
||||
- SPieceTokenizerOutType.STRING, means output type of SentencePiece Tokenizer is string.
|
||||
- SPieceTokenizerOutType.INT, means output type of SentencePiece Tokenizer is int.
|
||||
|
||||
Examples:
|
||||
>>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType
|
||||
>>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
|
||||
|
@ -573,8 +576,16 @@ if platform.system().lower() != 'windows':
|
|||
on input text to fold the text to lower case and strip accents characters. If False, only apply
|
||||
NormalizeUTF8 operation with the specified mode on input text (default=False).
|
||||
keep_whitespace (bool, optional): If True, the whitespace will be kept in output tokens (default=False).
|
||||
normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode. This is
|
||||
only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
|
||||
normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode
|
||||
(default=NormalizeForm.NONE). This is only effective when `lower_case` is False. It can be any of
|
||||
[NormalizeForm.NONE, NormalizeForm.NFC, NormalizeForm.NFKC, NormalizeForm.NFD, NormalizeForm.NFKD].
|
||||
|
||||
- NormalizeForm.NONE, do nothing for input string tensor.
|
||||
- NormalizeForm.NFC, normalize with Normalization Form C.
|
||||
- NormalizeForm.NFKC, normalize with Normalization Form KC.
|
||||
- NormalizeForm.NFD, normalize with Normalization Form D.
|
||||
- NormalizeForm.NFKD, normalize with Normalization Form KD.
|
||||
|
||||
preserve_unused_token (bool, optional): If True, do not split special tokens like
|
||||
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
|
||||
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
|
||||
|
|
|
@ -152,7 +152,14 @@ class SentencePieceVocab(cde.SentencePieceVocab):
|
|||
character set.
|
||||
model_type(SentencePieceModel): It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
|
||||
SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
|
||||
sentence must be pre-tokenized when using word type.
|
||||
sentence must be pre-tokenized when using SentencePieceModel.WORD type.
|
||||
|
||||
- SentencePieceModel.UNIGRAM, Unigram Language Model means the next word in the sentence is assumed to
|
||||
be independent of the previous words generated by the model.
|
||||
- SentencePieceModel.BPE, refers to byte pair encoding algorithm, which replaces the most frequent pair
|
||||
of bytes in a sentence with a single, unused byte.
|
||||
- SentencePieceModel.CHAR, refers to char-based SentencePiece model type.
|
||||
- SentencePieceModel.WORD, refers to word-based SentencePiece model type.
|
||||
|
||||
params(dict): A dictionary with no incoming parameters.
|
||||
|
||||
|
@ -177,7 +184,14 @@ class SentencePieceVocab(cde.SentencePieceVocab):
|
|||
character set.
|
||||
model_type(SentencePieceModel): It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
|
||||
SentencePieceModel.CHAR, SentencePieceModel.WORD], default is SentencePieceModel.UNIGRAM. The input
|
||||
sentence must be pre-tokenized when using word type.
|
||||
sentence must be pre-tokenized when using SentencePieceModel.WORD type.
|
||||
|
||||
- SentencePieceModel.UNIGRAM, Unigram Language Model means the next word in the sentence is assumed to
|
||||
be independent of the previous words generated by the model.
|
||||
- SentencePieceModel.BPE, refers to byte pair encoding algorithm, which replaces the most frequent pair
|
||||
of bytes in a sentence with a single, unused byte.
|
||||
- SentencePieceModel.CHAR, refers to char-based SentencePiece model type.
|
||||
- SentencePieceModel.WORD, refers to word-based SentencePiece model type.
|
||||
|
||||
params(dict): A dictionary with no incoming parameters (The parameters are derived from SentencePiece
|
||||
library).
|
||||
|
|
Loading…
Reference in New Issue