Fix code clean

This commit is contained in:
shenwei41 2023-01-29 15:21:36 +08:00
parent 98143eb165
commit 3e46cf2d05
13 changed files with 88 additions and 21 deletions

View File

@ -6,6 +6,8 @@ mindspore.dataset.TextBaseDataset.build_sentencepiece_vocab
迭代源数据集对象获取数据并构建SentencePiece词汇表。
源数据集要求的是文本类数据集。
.. note:: mindspore.dataset.Dataset.build_sentencepiece_vocab 从2.0版本开始弃用。请使用mindspore.dataset.text.SentencePieceVocab.from_dataset代替。
参数:
- **columns** (list[str]) - 指定 `build_sentencepiece_vocab` 操作的输入列,会从该列获取数据构造词汇表。
- **vocab_size** (int) - 词汇表的容量。

View File

@ -8,6 +8,8 @@ mindspore.dataset.TextBaseDataset.build_vocab
收集数据集中所有的不重复单词。返回 `top_k` 个最常见的单词组成的词汇表(如果指定了 `top_k` )。
.. note:: mindspore.dataset.Dataset.build_vocab 从2.0版本开始弃用。请使用mindspore.dataset.text.Vocab.from_dataset代替。
参数:
- **columns** (Union[str, list[str]]) - 指定 `build_vocab` 操作的输入列,会从该列获取数据构造词汇表。
- **freq_range** (tuple[int]) - 由(min_frequency, max_frequency)组成的整数元组,代表词汇出现的频率范围,在这个频率范围的词汇会被保存下来。

View File

@ -263,7 +263,7 @@ Status BatchOp::WorkerEntry(int32_t workerId) {
RETURN_IF_NOT_OK(worker_out_queues_[workerId]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagEOF)));
} else if (table_pair.second.ctrl_ == batchCtrl::kWait) {
RETURN_IF_NOT_OK(worker_out_queues_[workerId]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagWait)));
TaskManager::FindMe()->Wait(); // wait for auto tune update workers successful
RETURN_IF_NOT_OK(TaskManager::FindMe()->Wait()); // wait for auto tune update workers successful
TaskManager::FindMe()->Clear();
} else if (table_pair.second.ctrl_ == batchCtrl::kNoCtrl) {
TensorRow new_row;

View File

@ -214,7 +214,7 @@ Status MapOp::WorkerEntry(int32_t worker_id) {
}
RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(std::move(in_row)));
if (in_row.wait()) {
TaskManager::FindMe()->Wait(); // wait for auto tune update workers successful
RETURN_IF_NOT_OK(TaskManager::FindMe()->Wait()); // wait for auto tune update workers successful
TaskManager::FindMe()->Clear();
}
} else {

View File

@ -368,7 +368,7 @@ class ParallelOp : public DatasetOp {
// If num_worker signals are received, wakes up the main thread
if (++num_workers_paused_ == num_workers_) {
wait_for_workers_post_.Set();
wait_for_collector_.Wait();
RETURN_IF_NOT_OK(wait_for_collector_.Wait());
wait_for_collector_.Clear();
num_rows = 0;
}

View File

@ -58,7 +58,7 @@ Status CifarOp::LoadTensorRow(row_id_type index, TensorRow *trow) {
std::shared_ptr<Tensor> fine_label;
std::shared_ptr<Tensor> ori_image = cifar_image_label_pairs_[index].first;
std::shared_ptr<Tensor> copy_image;
uint64_t path_index = std::ceil(index / kCifarBlockImageNum);
uint64_t path_index = static_cast<uint64_t>(std::ceil(index / kCifarBlockImageNum));
RETURN_IF_NOT_OK(Tensor::CreateFromTensor(ori_image, &copy_image));
RETURN_IF_NOT_OK(Tensor::CreateScalar(cifar_image_label_pairs_[index].second[0], &label));

View File

@ -135,7 +135,7 @@ Status MappableLeafOp::WorkerEntry(int32_t worker_id) {
while (io_block != nullptr) {
if (io_block->wait()) {
RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagWait)));
TaskManager::FindMe()->Wait(); // wait for auto tune update workers successful
RETURN_IF_NOT_OK(TaskManager::FindMe()->Wait()); // wait for auto tune update workers successful
TaskManager::FindMe()->Clear();
} else if (io_block->eoe()) {
RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagEOE)));

View File

@ -148,7 +148,7 @@ Status MindRecordOp::WorkerEntry(int32_t worker_id) {
while (io_block != nullptr) {
if (io_block->wait()) {
RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagWait)));
TaskManager::FindMe()->Wait(); // wait for auto tune update workers successful
RETURN_IF_NOT_OK(TaskManager::FindMe()->Wait()); // wait for auto tune update workers successful
TaskManager::FindMe()->Clear();
} else if (io_block->eoe()) {
RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagEOE)));

View File

@ -180,7 +180,7 @@ Status SpeechCommandsOp::GetFileInfo(const std::string &file_path, std::string *
std::smatch result;
{
std::unique_lock<std::mutex> _lock(mux_);
regex_match(filename, result, std::regex("(.*)_nohash_(\\d+)\\.wav"));
(void)regex_match(filename, result, std::regex("(.*)_nohash_(\\d+)\\.wav"));
}
CHECK_FAIL_RETURN_UNEXPECTED(!(result[0] == "" || result[1] == ""),
"Invalid file name, failed to get file info: " + filename);

View File

@ -1104,7 +1104,7 @@ Status ShardWriter::SerializeRawData(std::map<uint64_t, std::vector<json>> &raw_
thread_num = kThreadNumber;
}
// Set the number of samples processed by each thread
int group_num = ceil(row_count * 1.0 / thread_num);
int group_num = static_cast<int>(ceil(row_count * 1.0 / thread_num));
std::vector<std::thread> thread_set(thread_num);
int work_thread_num = 0;
for (uint32_t x = 0; x < thread_num; ++x) {

View File

@ -151,7 +151,9 @@ bool MaxPoolWithArgmaxCpuKernelMod::LaunchKernel(const std::vector<kernel::Addre
MS_EXCEPTION_IF_NULL(output);
auto *mask = reinterpret_cast<int32_t *>(outputs.at(kIndex1)->addr);
MS_EXCEPTION_IF_NULL(mask);
int cWeight, hWeight, wWeight;
int cWeight;
int hWeight;
int wWeight;
if (data_format_ == Format::NHWC) {
cWeight = 1;
wWeight = channel_ * cWeight;

View File

@ -1900,6 +1900,7 @@ class VisionBaseDataset(Dataset):
raise NotImplementedError("Dataset has to implement parse method.")
# pylint: disable=abstract-method
class TextBaseDataset(Dataset):
"""
Abstract class to represent a text source dataset which produces content to the data pipeline.
@ -1919,6 +1920,10 @@ class TextBaseDataset(Dataset):
Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
which contains top_k most frequent words (if top_k is specified).
Note:
mindspore.dataset.Dataset.build_vocab is deprecated from version 2.0
and will be removed in a future version. Use mindspore.dataset.text.Vocab.from_dataset instead.
Args:
columns(Union[str, list[str]]): Column names to get words from.
freq_range(tuple[int]): A tuple of integers (min_frequency, max_frequency). Words within the frequency
@ -1950,6 +1955,67 @@ class TextBaseDataset(Dataset):
... special_first=True)
"""
raise NotImplementedError("mindspore.dataset.Dataset.build_vocab is deprecated from version 2.0 "
"and will be removed in a future version. "
"Use mindspore.dataset.text.Vocab.from_dataset instead.")
def build_sentencepiece_vocab(self, columns, vocab_size, character_coverage, model_type, params):
"""
Function to create a SentencePieceVocab from source dataset.
Desired source dataset is a text type dataset.
Note:
mindspore.dataset.Dataset.build_sentencepiece_vocab is deprecated from version 2.0
and will be removed in a future version. Use mindspore.dataset.text.SentencePieceVocab.from_dataset instead.
Args:
columns(list[str]): Column names to get words from.
vocab_size(int): Vocabulary size.
character_coverage(float): Percentage of characters covered by the model, must be between
0.98 and 1.0. Good defaults are: 0.9995 for languages with rich character sets like
Japanese or Chinese character sets, and 1.0 for other languages with small character sets
like English or Latin.
model_type(SentencePieceModel): Model type. Choose from unigram (default), bpe, char, or word.
The input sentence must be pretokenized when using word type.
params(dict): Any extra optional parameters of the sentencepiece library according to your raw data.
Returns:
SentencePieceVocab, vocab built from the dataset.
Examples:
>>> from mindspore.dataset.text import SentencePieceModel
>>>
>>> # You can construct any text dataset as source, take TextFileDataset as example.
>>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False)
>>> dataset = dataset.build_sentencepiece_vocab(["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
"""
raise NotImplementedError("mindspore.dataset.Dataset.build_sentencepiece_vocab is deprecated from version 2.0 "
"and will be removed in a future version. "
"Use mindspore.dataset.text.SentencePieceVocab.from_dataset instead.")
def _build_vocab(self, columns, freq_range, top_k, special_tokens, special_first):
"""
Function to create a Vocab from source dataset.
Desired source dataset is a text type dataset.
Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
which contains top_k most frequent words (if top_k is specified).
Args:
columns(Union[str, list[str]]): Column names to get words from.
freq_range(tuple[int]): A tuple of integers (min_frequency, max_frequency). Words within the frequency
range will be stored.
Naturally 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency
can be set to default, which corresponds to 0/total_words separately.
top_k(int): Number of words to be built into vocab. top_k most frequent words are
taken. The top_k is taken after freq_range. If there are fewer than top_k words, all words will be taken.
special_tokens(list[str]): A list of strings, each one is a special token.
special_first(bool): Whether special_tokens will be prepended/appended to vocab. If special_tokens
is specified and special_first is set to default, special_tokens will be prepended.
Returns:
Vocab, vocab built from the dataset.
"""
vocab = cde.Vocab()
columns = replace_none(columns, [])
if not isinstance(columns, list):
@ -1981,7 +2047,7 @@ class TextBaseDataset(Dataset):
return vocab
def build_sentencepiece_vocab(self, columns, vocab_size, character_coverage, model_type, params):
def _build_sentencepiece_vocab(self, columns, vocab_size, character_coverage, model_type, params):
"""
Function to create a SentencePieceVocab from source dataset.
Desired source dataset is a text type dataset.
@ -1999,13 +2065,6 @@ class TextBaseDataset(Dataset):
Returns:
SentencePieceVocab, vocab built from the dataset.
Examples:
>>> from mindspore.dataset.text import SentencePieceModel
>>>
>>> # You can construct any text dataset as source, take TextFileDataset as example.
>>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False)
>>> dataset = dataset.build_sentencepiece_vocab(["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
"""
if not isinstance(model_type, SentencePieceModel):
raise TypeError("Argument model_type with value {0} is not of type SentencePieceModel, but got {1}." \

View File

@ -246,9 +246,10 @@ class SentencePieceVocab:
"""
sentence_piece_vocab = cls()
sentence_piece_vocab.c_sentence_piece_vocab = dataset.build_sentencepiece_vocab(col_names, vocab_size,
character_coverage,
model_type, params)
# pylint: disable=protected-access
sentence_piece_vocab.c_sentence_piece_vocab = dataset._build_sentencepiece_vocab(col_names, vocab_size,
character_coverage,
model_type, params)
return sentence_piece_vocab
@classmethod
@ -428,7 +429,8 @@ class Vocab:
"""
vocab = cls()
vocab.c_vocab = dataset.build_vocab(columns, freq_range, top_k, special_tokens, special_first)
# pylint: disable=protected-access
vocab.c_vocab = dataset._build_vocab(columns, freq_range, top_k, special_tokens, special_first)
return vocab
@classmethod