diff --git a/docs/api/api_python/dataset/dataset_method/operation/mindspore.dataset.TextBaseDataset.build_sentencepiece_vocab.rst b/docs/api/api_python/dataset/dataset_method/operation/mindspore.dataset.TextBaseDataset.build_sentencepiece_vocab.rst
index 479180f919c..4e96d993820 100644
--- a/docs/api/api_python/dataset/dataset_method/operation/mindspore.dataset.TextBaseDataset.build_sentencepiece_vocab.rst
+++ b/docs/api/api_python/dataset/dataset_method/operation/mindspore.dataset.TextBaseDataset.build_sentencepiece_vocab.rst
@@ -6,6 +6,8 @@ mindspore.dataset.TextBaseDataset.build_sentencepiece_vocab
     Iterate over the source dataset to fetch data and build a SentencePiece vocabulary.
     The source dataset must be a text dataset.
 
+    .. note:: mindspore.dataset.Dataset.build_sentencepiece_vocab is deprecated from version 2.0. Use mindspore.dataset.text.SentencePieceVocab.from_dataset instead.
+
     Parameters:
         - **columns** (list[str]) - The input columns of the `build_sentencepiece_vocab` operation; data from these columns is used to build the vocabulary.
        - **vocab_size** (int) - The size of the vocabulary.
diff --git a/docs/api/api_python/dataset/dataset_method/operation/mindspore.dataset.TextBaseDataset.build_vocab.rst b/docs/api/api_python/dataset/dataset_method/operation/mindspore.dataset.TextBaseDataset.build_vocab.rst
index 1ec51610e27..e46eb08cba9 100644
--- a/docs/api/api_python/dataset/dataset_method/operation/mindspore.dataset.TextBaseDataset.build_vocab.rst
+++ b/docs/api/api_python/dataset/dataset_method/operation/mindspore.dataset.TextBaseDataset.build_vocab.rst
@@ -8,6 +8,8 @@ mindspore.dataset.TextBaseDataset.build_vocab
 
     Collect all the unique words in the dataset, and return a vocabulary containing the `top_k` most frequent words (if `top_k` is specified).
 
+    .. note:: mindspore.dataset.Dataset.build_vocab is deprecated from version 2.0. Use mindspore.dataset.text.Vocab.from_dataset instead.
+
     Parameters:
         - **columns** (Union[str, list[str]]) - The input columns of the `build_vocab` operation; data from these columns is used to build the vocabulary.
         - **freq_range** (tuple[int]) - A tuple of integers (min_frequency, max_frequency) specifying the word frequency range; words whose frequency falls within this range are kept.
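Taken together, the two notes above redirect users from the deprecated Dataset methods to the vocab classes in mindspore.dataset.text. A minimal migration sketch for build_vocab, assuming a text dataset with a "text" column (the corpus path and the argument values are illustrative):

    import mindspore.dataset as ds
    from mindspore.dataset.text import Vocab

    # Any text dataset works as the source; TextFileDataset is just an example.
    dataset = ds.TextFileDataset("/path/to/corpus.txt", shuffle=False)

    # Before (deprecated since 2.0):
    #   vocab = dataset.build_vocab(columns=["text"], freq_range=(1, 10), top_k=5,
    #                               special_tokens=["<pad>", "<unk>"], special_first=True)
    # After:
    vocab = Vocab.from_dataset(dataset, columns=["text"], freq_range=(1, 10), top_k=5,
                               special_tokens=["<pad>", "<unk>"], special_first=True)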
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/batch_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/batch_op.cc
index 8b6ab10a619..e469dcd495b 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/batch_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/batch_op.cc
@@ -303,7 +303,7 @@ Status BatchOp::WorkerEntry(int32_t workerId) {
     RETURN_IF_NOT_OK(worker_out_queues_[workerId]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagEOF)));
   } else if (table_pair.second.ctrl_ == batchCtrl::kWait) {
     RETURN_IF_NOT_OK(worker_out_queues_[workerId]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagWait)));
-    TaskManager::FindMe()->Wait();  // wait for auto tune update workers successful
+    RETURN_IF_NOT_OK(TaskManager::FindMe()->Wait());  // wait for auto tune update workers successful
     TaskManager::FindMe()->Clear();
   } else if (table_pair.second.ctrl_ == batchCtrl::kNoCtrl) {
     TensorRow new_row;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/map_op/map_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/map_op/map_op.cc
index 1a0e44227fd..29d99bbc937 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/map_op/map_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/map_op/map_op.cc
@@ -214,7 +214,7 @@ Status MapOp::WorkerEntry(int32_t worker_id) {
       }
       RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(std::move(in_row)));
       if (in_row.wait()) {
-        TaskManager::FindMe()->Wait();  // wait for auto tune update workers successful
+        RETURN_IF_NOT_OK(TaskManager::FindMe()->Wait());  // wait for auto tune update workers successful
         TaskManager::FindMe()->Clear();
       }
     } else {
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/parallel_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/parallel_op.h
index 67d3a0e13b0..c5b69c83860 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/parallel_op.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/parallel_op.h
@@ -368,7 +368,7 @@ class ParallelOp : public DatasetOp {
       // If num_worker signals are received, wakes up the main thread
       if (++num_workers_paused_ == num_workers_) {
         wait_for_workers_post_.Set();
-        wait_for_collector_.Wait();
+        RETURN_IF_NOT_OK(wait_for_collector_.Wait());
         wait_for_collector_.Clear();
         num_rows = 0;
       }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cifar_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cifar_op.cc
index f1a9752c517..42e05510f25 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cifar_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cifar_op.cc
@@ -58,7 +58,7 @@ Status CifarOp::LoadTensorRow(row_id_type index, TensorRow *trow) {
   std::shared_ptr<Tensor> label;
   std::shared_ptr<Tensor> fine_label;
   std::shared_ptr<Tensor> ori_image = cifar_image_label_pairs_[index].first;
   std::shared_ptr<Tensor> copy_image;
-  uint64_t path_index = std::ceil(index / kCifarBlockImageNum);
+  uint64_t path_index = static_cast<uint64_t>(std::ceil(index / kCifarBlockImageNum));
   RETURN_IF_NOT_OK(Tensor::CreateFromTensor(ori_image, &copy_image));
   RETURN_IF_NOT_OK(Tensor::CreateScalar(cifar_image_label_pairs_[index].second[0], &label));
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mappable_leaf_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mappable_leaf_op.cc
index b406351b11c..58234d12e5f 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mappable_leaf_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mappable_leaf_op.cc
@@ -135,7 +135,7 @@ Status MappableLeafOp::WorkerEntry(int32_t worker_id) {
   while (io_block != nullptr) {
     if (io_block->wait()) {
       RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagWait)));
-      TaskManager::FindMe()->Wait();  // wait for auto tune update workers successful
+      RETURN_IF_NOT_OK(TaskManager::FindMe()->Wait());  // wait for auto tune update workers successful
       TaskManager::FindMe()->Clear();
     } else if (io_block->eoe()) {
       RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagEOE)));
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mindrecord_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mindrecord_op.cc
index 54eca6aee9a..c0dd31695c9 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mindrecord_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mindrecord_op.cc
@@ -148,7 +148,7 @@ Status MindRecordOp::WorkerEntry(int32_t worker_id) {
   while (io_block != nullptr) {
     if (io_block->wait()) {
       RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagWait)));
-      TaskManager::FindMe()->Wait();  // wait for auto tune update workers successful
+      RETURN_IF_NOT_OK(TaskManager::FindMe()->Wait());  // wait for auto tune update workers successful
       TaskManager::FindMe()->Clear();
     } else if (io_block->eoe()) {
       RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagEOE)));
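All of the Wait() changes above share one intent: Wait() returns a Status that was previously discarded, so a failed or interrupted pause died silently inside the worker, and RETURN_IF_NOT_OK now propagates it to the caller. A rough Python analogue of the fixed control flow; Status and wait_post are illustrative stand-ins, not MindSpore APIs:

    class Status:
        """Minimal stand-in for the C++ Status type (hypothetical)."""
        def __init__(self, ok=True, msg=""):
            self._ok, self.msg = ok, msg

        def is_ok(self):
            return self._ok

    def worker_pause(wait_post):
        """Mirror of the fixed flow: propagate a failed wait instead of ignoring it."""
        status = wait_post.wait()  # blocks until auto-tune finishes updating workers
        if not status.is_ok():     # the RETURN_IF_NOT_OK(...) equivalent
            return status
        wait_post.clear()          # reset the post for the next pause/resume cycle
        return Status()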
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/speech_commands_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/speech_commands_op.cc
index 169d51b929b..a272ee189ce 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/speech_commands_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/speech_commands_op.cc
@@ -180,7 +180,7 @@ Status SpeechCommandsOp::GetFileInfo(const std::string &file_path, std::string *
   std::smatch result;
   {
     std::unique_lock<std::mutex> _lock(mux_);
-    regex_match(filename, result, std::regex("(.*)_nohash_(\\d+)\\.wav"));
+    (void)regex_match(filename, result, std::regex("(.*)_nohash_(\\d+)\\.wav"));
   }
   CHECK_FAIL_RETURN_UNEXPECTED(!(result[0] == "" || result[1] == ""),
                                "Invalid file name, failed to get file info: " + filename);
diff --git a/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc b/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc
index 1268e6e2f27..6c5ff9f85a7 100644
--- a/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc
+++ b/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc
@@ -1104,7 +1104,7 @@ Status ShardWriter::SerializeRawData(std::map<uint64_t, std::vector<json>> &raw_
     thread_num = kThreadNumber;
   }
   // Set the number of samples processed by each thread
-  int group_num = ceil(row_count * 1.0 / thread_num);
+  int group_num = static_cast<int>(ceil(row_count * 1.0 / thread_num));
   std::vector<std::thread> thread_set(thread_num);
   int work_thread_num = 0;
   for (uint32_t x = 0; x < thread_num; ++x) {
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/max_pool_with_argmax_cpu_kernel.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/max_pool_with_argmax_cpu_kernel.cc
index e8d908569c6..01fba5ecf5b 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/max_pool_with_argmax_cpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/max_pool_with_argmax_cpu_kernel.cc
@@ -151,7 +151,9 @@ bool MaxPoolWithArgmaxCpuKernelMod::LaunchKernel(const std::vector<AddressPtr>
   auto *mask = reinterpret_cast<int32_t *>(outputs.at(kIndex1)->addr);
   MS_EXCEPTION_IF_NULL(mask);
-  int cWeight, hWeight, wWeight;
+  int cWeight;
+  int hWeight;
+  int wWeight;
   if (data_format_ == Format::NHWC) {
     cWeight = 1;
     wWeight = channel_ * cWeight;
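The datasets.py change below applies a standard deprecation pattern: the public methods keep their signatures but immediately raise with migration guidance, while renamed private twins (_build_vocab, _build_sentencepiece_vocab) keep the real implementation for the sanctioned from_dataset entry points. A condensed sketch of the shape of the change:

    class TextBaseDataset:
        """Condensed sketch; the real class lives in datasets.py below."""

        def build_vocab(self, columns, freq_range, top_k, special_tokens, special_first):
            # Public surface: fail fast with migration guidance.
            raise NotImplementedError(
                "mindspore.dataset.Dataset.build_vocab is deprecated from version 2.0 "
                "and will be removed in a future version. "
                "Use mindspore.dataset.text.Vocab.from_dataset instead.")

        def _build_vocab(self, columns, freq_range, top_k, special_tokens, special_first):
            # Private twin: keeps the implementation, reachable only through
            # mindspore.dataset.text.Vocab.from_dataset (see utils.py below).
            ...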
" + "Use mindspore.dataset.text.Vocab.from_dataset instead.") + + def build_sentencepiece_vocab(self, columns, vocab_size, character_coverage, model_type, params): + """ + Function to create a SentencePieceVocab from source dataset. + Desired source dataset is a text type dataset. + + Note: + mindspore.dataset.Dataset.build_sentencepiece_vocab is deprecated from version 2.0 + and will be removed in a future version. Use mindspore.dataset.text.SentencePieceVocab.from_dataset instead. + + Args: + columns(list[str]): Column names to get words from. + vocab_size(int): Vocabulary size. + character_coverage(float): Percentage of characters covered by the model, must be between + 0.98 and 1.0 Good defaults are: 0.9995 for languages with rich character sets like + Japanese or Chinese character sets, and 1.0 for other languages with small character sets + like English or Latin. + model_type(SentencePieceModel): Model type. Choose from unigram (default), bpe, char, or word. + The input sentence must be pretokenized when using word type. + params(dict): Any extra optional parameters of sentencepiece library according to your raw data + + Returns: + SentencePieceVocab, vocab built from the dataset. + + Examples: + >>> from mindspore.dataset.text import SentencePieceModel + >>> + >>> # You can construct any text dataset as source, take TextFileDataset as example. + >>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False) + >>> dataset = dataset.build_sentencepiece_vocab(["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {}) + """ + raise NotImplementedError("mindspore.dataset.Dataset.build_sentencepiece_vocab is deprecated from version 2.0 " + "and will be removed in a future version. " + "Use mindspore.dataset.text.SentencePieceVocab.from_dataset instead.") + + def _build_vocab(self, columns, freq_range, top_k, special_tokens, special_first): + """ + Function to create a Vocab from source dataset. + Desired source dataset is a text type dataset. + + Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab + which contains top_k most frequent words (if top_k is specified). + + Args: + columns(Union[str, list[str]]): Column names to get words from. + freq_range(tuple[int]): A tuple of integers (min_frequency, max_frequency). Words within the frequency + range will be stored. + Naturally 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency + can be set to default, which corresponds to 0/total_words separately. + top_k(int): Number of words to be built into vocab. top_k most frequent words are + taken. The top_k is taken after freq_range. If not enough top_k, all words will be taken + special_tokens(list[str]): A list of strings, each one is a special token. + special_first(bool): Whether special_tokens will be prepended/appended to vocab, If special_tokens + is specified and special_first is set to default, special_tokens will be prepended. + + Returns: + Vocab, vocab built from the dataset. + """ vocab = cde.Vocab() columns = replace_none(columns, []) if not isinstance(columns, list): @@ -1981,7 +2047,7 @@ class TextBaseDataset(Dataset): return vocab - def build_sentencepiece_vocab(self, columns, vocab_size, character_coverage, model_type, params): + def _build_sentencepiece_vocab(self, columns, vocab_size, character_coverage, model_type, params): """ Function to create a SentencePieceVocab from source dataset. Desired source dataset is a text type dataset. 
diff --git a/mindspore/python/mindspore/dataset/text/utils.py b/mindspore/python/mindspore/dataset/text/utils.py
index e39a2ef3ef0..e2ec242b85b 100644
--- a/mindspore/python/mindspore/dataset/text/utils.py
+++ b/mindspore/python/mindspore/dataset/text/utils.py
@@ -246,9 +246,10 @@ class SentencePieceVocab:
         """
 
         sentence_piece_vocab = cls()
-        sentence_piece_vocab.c_sentence_piece_vocab = dataset.build_sentencepiece_vocab(col_names, vocab_size,
-                                                                                        character_coverage,
-                                                                                        model_type, params)
+        # pylint: disable=protected-access
+        sentence_piece_vocab.c_sentence_piece_vocab = dataset._build_sentencepiece_vocab(col_names, vocab_size,
+                                                                                         character_coverage,
+                                                                                         model_type, params)
         return sentence_piece_vocab
 
     @classmethod
@@ -428,7 +429,8 @@ class Vocab:
         """
 
         vocab = cls()
-        vocab.c_vocab = dataset.build_vocab(columns, freq_range, top_k, special_tokens, special_first)
+        # pylint: disable=protected-access
+        vocab.c_vocab = dataset._build_vocab(columns, freq_range, top_k, special_tokens, special_first)
         return vocab
 
     @classmethod