forked from mindspore-Ecosystem/mindspore
!48153 Fix code clean
Merge pull request !48153 from shenwei41/fix_waring
This commit is contained in: 91be7a2d31
@@ -6,6 +6,8 @@ mindspore.dataset.TextBaseDataset.build_sentencepiece_vocab
 Iterate over the source dataset to fetch data and build a SentencePiece vocabulary.
 The source dataset must be a text dataset.
 
+.. note:: mindspore.dataset.Dataset.build_sentencepiece_vocab is deprecated from version 2.0. Use mindspore.dataset.text.SentencePieceVocab.from_dataset instead.
+
 Parameters:
 - **columns** (list[str]) - Input columns of the `build_sentencepiece_vocab` operation; data is taken from these columns to build the vocabulary.
 - **vocab_size** (int) - Vocabulary size.
@@ -8,6 +8,8 @@ mindspore.dataset.TextBaseDataset.build_vocab
 Collect all the unique words in the dataset and return a vocabulary made up of the `top_k` most frequent words (if `top_k` is specified).
 
+.. note:: mindspore.dataset.Dataset.build_vocab is deprecated from version 2.0. Use mindspore.dataset.text.Vocab.from_dataset instead.
+
 Parameters:
 - **columns** (Union[str, list[str]]) - Input columns of the `build_vocab` operation; data is taken from these columns to build the vocabulary.
 - **freq_range** (tuple[int]) - A tuple of integers (min_frequency, max_frequency) specifying the frequency range; words whose frequency falls within this range are kept.
@@ -303,7 +303,7 @@ Status BatchOp::WorkerEntry(int32_t workerId) {
       RETURN_IF_NOT_OK(worker_out_queues_[workerId]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagEOF)));
     } else if (table_pair.second.ctrl_ == batchCtrl::kWait) {
       RETURN_IF_NOT_OK(worker_out_queues_[workerId]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagWait)));
-      TaskManager::FindMe()->Wait();  // wait for auto tune update workers successful
+      RETURN_IF_NOT_OK(TaskManager::FindMe()->Wait());  // wait for auto tune update workers successful
       TaskManager::FindMe()->Clear();
     } else if (table_pair.second.ctrl_ == batchCtrl::kNoCtrl) {
       TensorRow new_row;
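This hunk, and the MapOp, ParallelOp, MappableLeafOp and MindRecordOp hunks below, all apply the same warning fix: `Wait()` returns a `Status` that was previously discarded, and wrapping the call in `RETURN_IF_NOT_OK` propagates a failed or interrupted wait to the caller instead of silently dropping it. Below is a minimal, self-contained sketch of the pattern; the `Status` struct, `Wait()` and the macro are stand-ins, not MindSpore's actual definitions.

// Minimal sketch of the pattern (Status, Wait() and the macro here are
// stand-ins, not MindSpore's actual definitions).
#include <iostream>
#include <string>

struct Status {
  bool ok;
  std::string msg;
  bool IsOk() const { return ok; }
};

// Same idea as the RETURN_IF_NOT_OK macro used in the hunks: evaluate the
// expression once and, if the Status is not OK, return it to the caller.
#define RETURN_IF_NOT_OK(expr)   \
  do {                           \
    Status _s = (expr);          \
    if (!_s.IsOk()) {            \
      return _s;                 \
    }                            \
  } while (false)

Status Wait() {  // stand-in for TaskManager::FindMe()->Wait()
  return Status{false, "task interrupted while waiting"};
}

Status WorkerEntry() {
  // Before the fix: Wait();  -- the returned Status was silently dropped.
  RETURN_IF_NOT_OK(Wait());  // After the fix: a failed wait is propagated.
  return Status{true, ""};
}

int main() {
  Status s = WorkerEntry();
  std::cout << (s.IsOk() ? std::string("ok") : "error: " + s.msg) << std::endl;
  return 0;
}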
@@ -214,7 +214,7 @@ Status MapOp::WorkerEntry(int32_t worker_id) {
       }
       RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(std::move(in_row)));
       if (in_row.wait()) {
-        TaskManager::FindMe()->Wait();  // wait for auto tune update workers successful
+        RETURN_IF_NOT_OK(TaskManager::FindMe()->Wait());  // wait for auto tune update workers successful
         TaskManager::FindMe()->Clear();
       }
     } else {
@@ -368,7 +368,7 @@ class ParallelOp : public DatasetOp {
     // If num_worker signals are received, wakes up the main thread
     if (++num_workers_paused_ == num_workers_) {
       wait_for_workers_post_.Set();
-      wait_for_collector_.Wait();
+      RETURN_IF_NOT_OK(wait_for_collector_.Wait());
       wait_for_collector_.Clear();
       num_rows = 0;
     }
@@ -58,7 +58,7 @@ Status CifarOp::LoadTensorRow(row_id_type index, TensorRow *trow) {
   std::shared_ptr<Tensor> fine_label;
   std::shared_ptr<Tensor> ori_image = cifar_image_label_pairs_[index].first;
   std::shared_ptr<Tensor> copy_image;
-  uint64_t path_index = std::ceil(index / kCifarBlockImageNum);
+  uint64_t path_index = static_cast<uint64_t>(std::ceil(index / kCifarBlockImageNum));
   RETURN_IF_NOT_OK(Tensor::CreateFromTensor(ori_image, &copy_image));
   RETURN_IF_NOT_OK(Tensor::CreateScalar(cifar_image_label_pairs_[index].second[0], &label));
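The point of this change (and of the similar one in ShardWriter further down) is that `std::ceil` returns `double`, so assigning it straight to an unsigned integer is an implicit narrowing conversion that compilers and linters flag; the `static_cast` keeps the value the same and makes the narrowing explicit. A small sketch under assumed values; the constant and variable names are illustrative, not CifarOp's actual members.

// Hedged sketch (illustrative names, not CifarOp's real members): std::ceil
// returns double, so assigning it to an integer narrows implicitly and many
// compilers warn. The fix makes that narrowing explicit.
#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  constexpr uint32_t kBlockImageNum = 5;  // stand-in for kCifarBlockImageNum
  uint64_t index = 12;

  // Implicit double -> uint64_t conversion; works, but warns under
  // -Wconversion style checks. Note that index / kBlockImageNum is already
  // integer division, so ceil() only converts the truncated quotient.
  uint64_t before = std::ceil(index / kBlockImageNum);

  // Explicit cast documents the intent and silences the warning without
  // changing the computed value.
  uint64_t after = static_cast<uint64_t>(std::ceil(index / kBlockImageNum));

  std::cout << before << " " << after << std::endl;  // prints "2 2"
  return 0;
}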
@@ -135,7 +135,7 @@ Status MappableLeafOp::WorkerEntry(int32_t worker_id) {
   while (io_block != nullptr) {
     if (io_block->wait()) {
       RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagWait)));
-      TaskManager::FindMe()->Wait();  // wait for auto tune update workers successful
+      RETURN_IF_NOT_OK(TaskManager::FindMe()->Wait());  // wait for auto tune update workers successful
      TaskManager::FindMe()->Clear();
    } else if (io_block->eoe()) {
      RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagEOE)));
@@ -148,7 +148,7 @@ Status MindRecordOp::WorkerEntry(int32_t worker_id) {
   while (io_block != nullptr) {
     if (io_block->wait()) {
      RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagWait)));
-      TaskManager::FindMe()->Wait();  // wait for auto tune update workers successful
+      RETURN_IF_NOT_OK(TaskManager::FindMe()->Wait());  // wait for auto tune update workers successful
      TaskManager::FindMe()->Clear();
    } else if (io_block->eoe()) {
      RETURN_IF_NOT_OK(worker_out_queues_[worker_id]->EmplaceBack(TensorRow(TensorRow::TensorRowFlags::kFlagEOE)));
@@ -180,7 +180,7 @@ Status SpeechCommandsOp::GetFileInfo(const std::string &file_path, std::string *
   std::smatch result;
   {
     std::unique_lock<std::mutex> _lock(mux_);
-    regex_match(filename, result, std::regex("(.*)_nohash_(\\d+)\\.wav"));
+    (void)regex_match(filename, result, std::regex("(.*)_nohash_(\\d+)\\.wav"));
   }
   CHECK_FAIL_RETURN_UNEXPECTED(!(result[0] == "" || result[1] == ""),
                                "Invalid file name, failed to get file info: " + filename);
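Casting a call to `(void)` is the usual way to tell the compiler and static analyzers that a return value is ignored on purpose: `std::regex_match` returns `bool`, and the capture groups are validated right afterwards by `CHECK_FAIL_RETURN_UNEXPECTED`, so the boolean result itself is not needed. A self-contained sketch of the idiom follows; the file name and the validation code below are illustrative, not the operator's real logic.

// Sketch of the "(void)" discard idiom; the filename and the check here are
// examples, not taken verbatim from SpeechCommandsOp.
#include <iostream>
#include <regex>
#include <string>

int main() {
  const std::string filename = "bed_nohash_0.wav";
  std::smatch result;

  // regex_match returns bool; casting to void states that we deliberately
  // ignore it because the capture groups are validated afterwards.
  (void)std::regex_match(filename, result, std::regex("(.*)_nohash_(\\d+)\\.wav"));

  // Validation step playing the role of CHECK_FAIL_RETURN_UNEXPECTED.
  if (result[0] == "" || result[1] == "") {
    std::cerr << "Invalid file name, failed to get file info: " << filename << std::endl;
    return 1;
  }
  std::cout << "label: " << result[1] << std::endl;  // prints "label: bed"
  return 0;
}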
@@ -1104,7 +1104,7 @@ Status ShardWriter::SerializeRawData(std::map<uint64_t, std::vector<json>> &raw_
     thread_num = kThreadNumber;
   }
   // Set the number of samples processed by each thread
-  int group_num = ceil(row_count * 1.0 / thread_num);
+  int group_num = static_cast<int>(ceil(row_count * 1.0 / thread_num));
   std::vector<std::thread> thread_set(thread_num);
   int work_thread_num = 0;
   for (uint32_t x = 0; x < thread_num; ++x) {
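Same explicit-narrowing fix as in CifarOp, but here the expression also shows why the `* 1.0` is there: it promotes the division to floating point so `ceil` rounds the true quotient up when the samples do not divide evenly among threads. A short sketch with made-up sample and thread counts:

// Hedged sketch of the group-size computation above; kThreadNumber and the
// sample count are made-up values, not ShardWriter's.
#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  uint32_t thread_num = 8;   // stand-in for kThreadNumber
  uint64_t row_count = 100;  // number of samples to serialize

  // "* 1.0" promotes the division to floating point, so ceil() rounds the
  // real quotient up (100 / 8 -> 12.5 -> 13) instead of acting on an already
  // truncated integer quotient. The static_cast makes the double -> int
  // narrowing explicit, which is what this warning fix is about.
  int group_num = static_cast<int>(std::ceil(row_count * 1.0 / thread_num));

  std::cout << "samples per thread: " << group_num << std::endl;  // prints 13
  return 0;
}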
@@ -151,7 +151,9 @@ bool MaxPoolWithArgmaxCpuKernelMod::LaunchKernel(const std::vector<kernel::Addre
   MS_EXCEPTION_IF_NULL(output);
   auto *mask = reinterpret_cast<int32_t *>(outputs.at(kIndex1)->addr);
   MS_EXCEPTION_IF_NULL(mask);
-  int cWeight, hWeight, wWeight;
+  int cWeight;
+  int hWeight;
+  int wWeight;
   if (data_format_ == Format::NHWC) {
     cWeight = 1;
     wWeight = channel_ * cWeight;
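Splitting `int cWeight, hWeight, wWeight;` into one declaration per statement is a readability/lint cleanup and does not change behavior. The sketch below also illustrates, under the assumption that these values act as per-dimension strides (only the NHWC branch is visible in the hunk), how such weights are typically used to flatten an index; the shape numbers are made up.

// Hedged sketch (not the kernel's full logic): the weights act as strides,
// with NHWC keeping channels contiguous (channel stride 1).
#include <iostream>

int main() {
  const int channel = 3;  // illustrative NHWC shape: channels innermost
  const int width = 5;

  // One declaration per statement, as in the fixed kernel code.
  int cWeight;
  int hWeight;
  int wWeight;

  // NHWC layout: channel stride 1, then width, then height.
  cWeight = 1;
  wWeight = channel * cWeight;  // 3
  hWeight = width * wWeight;    // 15

  // Flat offset of element (h = 2, w = 3, c = 1) under these strides.
  int offset = 2 * hWeight + 3 * wWeight + 1 * cWeight;
  std::cout << offset << std::endl;  // prints 40
  return 0;
}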
@@ -1900,6 +1900,7 @@ class VisionBaseDataset(Dataset):
         raise NotImplementedError("Dataset has to implement parse method.")
 
 
+# pylint: disable=abstract-method
 class TextBaseDataset(Dataset):
     """
     Abstract class to represent a text source dataset which produces content to the data pipeline.
@@ -1919,6 +1920,10 @@ class TextBaseDataset(Dataset):
         Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
         which contains top_k most frequent words (if top_k is specified).
 
+        Note:
+            mindspore.dataset.Dataset.build_vocab is deprecated from version 2.0
+            and will be removed in a future version. Use mindspore.dataset.text.Vocab.from_dataset instead.
+
         Args:
             columns(Union[str, list[str]]): Column names to get words from.
             freq_range(tuple[int]): A tuple of integers (min_frequency, max_frequency). Words within the frequency
@@ -1950,6 +1955,67 @@ class TextBaseDataset(Dataset):
             ... special_first=True)
 
         """
+        raise NotImplementedError("mindspore.dataset.Dataset.build_vocab is deprecated from version 2.0 "
+                                  "and will be removed in a future version. "
+                                  "Use mindspore.dataset.text.Vocab.from_dataset instead.")
+
+    def build_sentencepiece_vocab(self, columns, vocab_size, character_coverage, model_type, params):
+        """
+        Function to create a SentencePieceVocab from source dataset.
+        Desired source dataset is a text type dataset.
+
+        Note:
+            mindspore.dataset.Dataset.build_sentencepiece_vocab is deprecated from version 2.0
+            and will be removed in a future version. Use mindspore.dataset.text.SentencePieceVocab.from_dataset instead.
+
+        Args:
+            columns(list[str]): Column names to get words from.
+            vocab_size(int): Vocabulary size.
+            character_coverage(float): Percentage of characters covered by the model, must be between
+                0.98 and 1.0 Good defaults are: 0.9995 for languages with rich character sets like
+                Japanese or Chinese character sets, and 1.0 for other languages with small character sets
+                like English or Latin.
+            model_type(SentencePieceModel): Model type. Choose from unigram (default), bpe, char, or word.
+                The input sentence must be pretokenized when using word type.
+            params(dict): Any extra optional parameters of sentencepiece library according to your raw data
+
+        Returns:
+            SentencePieceVocab, vocab built from the dataset.
+
+        Examples:
+            >>> from mindspore.dataset.text import SentencePieceModel
+            >>>
+            >>> # You can construct any text dataset as source, take TextFileDataset as example.
+            >>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False)
+            >>> dataset = dataset.build_sentencepiece_vocab(["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+        """
+        raise NotImplementedError("mindspore.dataset.Dataset.build_sentencepiece_vocab is deprecated from version 2.0 "
+                                  "and will be removed in a future version. "
+                                  "Use mindspore.dataset.text.SentencePieceVocab.from_dataset instead.")
+
+    def _build_vocab(self, columns, freq_range, top_k, special_tokens, special_first):
+        """
+        Function to create a Vocab from source dataset.
+        Desired source dataset is a text type dataset.
+
+        Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
+        which contains top_k most frequent words (if top_k is specified).
+
+        Args:
+            columns(Union[str, list[str]]): Column names to get words from.
+            freq_range(tuple[int]): A tuple of integers (min_frequency, max_frequency). Words within the frequency
+                range will be stored.
+                Naturally 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency
+                can be set to default, which corresponds to 0/total_words separately.
+            top_k(int): Number of words to be built into vocab. top_k most frequent words are
+                taken. The top_k is taken after freq_range. If not enough top_k, all words will be taken
+            special_tokens(list[str]): A list of strings, each one is a special token.
+            special_first(bool): Whether special_tokens will be prepended/appended to vocab, If special_tokens
+                is specified and special_first is set to default, special_tokens will be prepended.
+
+        Returns:
+            Vocab, vocab built from the dataset.
+        """
         vocab = cde.Vocab()
         columns = replace_none(columns, [])
         if not isinstance(columns, list):
@@ -1981,7 +2047,7 @@ class TextBaseDataset(Dataset):
 
         return vocab
 
-    def build_sentencepiece_vocab(self, columns, vocab_size, character_coverage, model_type, params):
+    def _build_sentencepiece_vocab(self, columns, vocab_size, character_coverage, model_type, params):
         """
         Function to create a SentencePieceVocab from source dataset.
         Desired source dataset is a text type dataset.
@@ -1999,13 +2065,6 @@ class TextBaseDataset(Dataset):
 
         Returns:
             SentencePieceVocab, vocab built from the dataset.
-
-        Examples:
-            >>> from mindspore.dataset.text import SentencePieceModel
-            >>>
-            >>> # You can construct any text dataset as source, take TextFileDataset as example.
-            >>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False)
-            >>> dataset = dataset.build_sentencepiece_vocab(["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
         """
         if not isinstance(model_type, SentencePieceModel):
             raise TypeError("Argument model_type with value {0} is not of type SentencePieceModel, but got {1}." \
@@ -246,9 +246,10 @@ class SentencePieceVocab:
         """
 
         sentence_piece_vocab = cls()
-        sentence_piece_vocab.c_sentence_piece_vocab = dataset.build_sentencepiece_vocab(col_names, vocab_size,
-                                                                                        character_coverage,
-                                                                                        model_type, params)
+        # pylint: disable=protected-access
+        sentence_piece_vocab.c_sentence_piece_vocab = dataset._build_sentencepiece_vocab(col_names, vocab_size,
+                                                                                         character_coverage,
+                                                                                         model_type, params)
         return sentence_piece_vocab
 
     @classmethod
@@ -428,7 +429,8 @@ class Vocab:
         """
 
         vocab = cls()
-        vocab.c_vocab = dataset.build_vocab(columns, freq_range, top_k, special_tokens, special_first)
+        # pylint: disable=protected-access
+        vocab.c_vocab = dataset._build_vocab(columns, freq_range, top_k, special_tokens, special_first)
         return vocab
 
     @classmethod