diff --git a/mindspore/ccsrc/minddata/dataset/engine/cache/cache_server.cc b/mindspore/ccsrc/minddata/dataset/engine/cache/cache_server.cc
index 5a98314f9ab..23fc7b7da7a 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/cache/cache_server.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/cache/cache_server.cc
@@ -1281,7 +1281,8 @@ int32_t CacheServer::Builder::AdjustNumWorkers(int32_t num_workers) {
   num_workers = std::max(num_numa_nodes, num_workers);
   // But also it shouldn't be too many more than the hardware concurrency
   int32_t num_cpus = hw_info_->GetCpuCount();
-  num_workers = std::min(2 * num_cpus, num_workers);
+  constexpr int32_t kThreadsPerCore = 2;
+  num_workers = std::min(kThreadsPerCore * num_cpus, num_workers);
   // Round up num_workers to a multiple of numa nodes.
   auto remainder = num_workers % num_numa_nodes;
   if (remainder > 0) num_workers += (num_numa_nodes - remainder);
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.cc
index 6242938df6c..5818768af5e 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.cc
@@ -63,13 +63,13 @@ Status BucketBatchByLengthOp::Builder::SanityCheck() const {
   return Status::OK();
 }
 
-Status BucketBatchByLengthOp::Builder::Build(std::shared_ptr<BucketBatchByLengthOp> *new_bucket_batch_by_length_op) {
+Status BucketBatchByLengthOp::Builder::Build(std::shared_ptr<BucketBatchByLengthOp> *bucket_batch_by_length_op) {
   RETURN_IF_NOT_OK(SanityCheck());
 
   // insert 0 for the first bucket
   (void)builder_bucket_boundaries_.insert(builder_bucket_boundaries_.begin(), 0);
-  *new_bucket_batch_by_length_op = std::make_shared<BucketBatchByLengthOp>(
+  *bucket_batch_by_length_op = std::make_shared<BucketBatchByLengthOp>(
     builder_length_dependent_columns_, builder_bucket_boundaries_, builder_bucket_batch_sizes_,
     builder_element_length_function_, builder_pad_info_, builder_pad_to_bucket_boundary_, builder_drop_remainder_,
     builder_op_connector_size_);
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc
index cb96bdc93c4..e4ae12a6648 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc
@@ -41,7 +41,7 @@ BuildVocabOp::BuildVocabOp(std::shared_ptr<Vocab> vocab, std::vector<std::string
   ...>(num_workers * op_conn_size);
   collector_queue_ =
-    std::make_unique<Queue<std::unique_ptr<std::unordered_map<std::string, int64_t>>>>(num_workers * op_conn_size);
+    std::make_unique<Queue<std::unique_ptr<std::unordered_map<std::string, int64_t>>>>((num_workers * op_conn_size));
 }
 
 Status BuildVocabOp::WorkerEntry(int32_t worker_id) {
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/parallel_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/parallel_op.cc
index fdb69ffb0a7..76aeda6c284 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/parallel_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/parallel_op.cc
@@ -35,8 +35,9 @@ ParallelOp::ParallelOp(int32_t num_workers, int32_t op_connector_size, std::shar
       epoch_sync_flag_(false) {
   // reduce excessive memory usage with high parallelism
   // when num_workers > 4, reduce op_connector_size to have similar total size if there were only 4 workers
-  if (num_workers_ > 4) {
-    oc_queue_size_ = std::max(1, op_connector_size * 4 / num_workers_);
+  constexpr int32_t worker_limit = 4;
+  if (num_workers_ > worker_limit) {
+    oc_queue_size_ = std::max(1, op_connector_size * worker_limit / num_workers_);
   }
 }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/celeba_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/celeba_op.cc
index 20c7f49789d..dd17b98b1a9 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/celeba_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/celeba_op.cc
@@ -193,11 +193,15 @@ bool CelebAOp::CheckDatasetTypeValid() {
     return false;
   }
   // train:0, valid=1, test=2
-  if (usage_ == "train" && (type == 0)) {
+  constexpr int32_t train_type = 0;
+  constexpr int32_t valid_type = 1;
+  constexpr int32_t test_type = 2;
+
+  if (usage_ == "train" && (type == train_type)) {
     return true;
-  } else if (usage_ == "valid" && (type == 1)) {
+  } else if (usage_ == "valid" && (type == valid_type)) {
     return true;
-  } else if (usage_ == "test" && (type == 2)) {
+  } else if (usage_ == "test" && (type == test_type)) {
     return true;
   }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cifar_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cifar_op.cc
index e7b87b90d01..a6ae0dcda96 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cifar_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cifar_op.cc
@@ -205,7 +205,9 @@ Status CifarOp::ReadCifar10BlockData() {
 Status CifarOp::ReadCifar100BlockData() {
   // CIFAR 100 has 2 bin files. train.bin (60K imgs) 153,700KB and test.bin (30,740KB) (10K imgs)
   // each img has two labels. Each row then is 32 * 32 *5 + 2 = 3,074 Bytes
-  uint32_t num_cifar100_records = 0;  // test:10000, train:50000
+  uint32_t num_cifar100_records = 0;  // test:10000, train:50000
+  constexpr uint32_t num_cifar100_test_records = 10000;
+  constexpr uint32_t num_cifar100_train_records = 50000;
   uint32_t block_size = (kCifarImageSize + 2) * kCifarBlockImageNum;  // about 2M
   std::vector<unsigned char> image_data(block_size * sizeof(unsigned char), 0);
   for (auto &file : cifar_files_) {
@@ -220,9 +222,9 @@ Status CifarOp::ReadCifar100BlockData() {
     if (usage_ == "test" && file_name.find("test") == std::string::npos) continue;
 
     if (file_name.find("test") != std::string::npos) {
-      num_cifar100_records = 10000;
+      num_cifar100_records = num_cifar100_test_records;
     } else if (file_name.find("train") != std::string::npos) {
-      num_cifar100_records = 50000;
+      num_cifar100_records = num_cifar100_train_records;
     } else {
      RETURN_STATUS_UNEXPECTED("Invalid file, Cifar100 train/test file not found in: " + file_name);
     }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/execution_tree.cc b/mindspore/ccsrc/minddata/dataset/engine/execution_tree.cc
index 75558c2c753..4d74a154a38 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/execution_tree.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/execution_tree.cc
@@ -165,10 +165,8 @@ Status ExecutionTree::Launch() {
     std::string err_msg = "Invalid thread number.";
     RETURN_STATUS_UNEXPECTED(err_msg);
   }
-  if (thread_num > 8)
-    cv::setNumThreads(8);
-  else
-    cv::setNumThreads(thread_num);
+  constexpr int32_t max_cv_threads_cnt = 8;
+  cv::setNumThreads(thread_num > max_cv_threads_cnt ? max_cv_threads_cnt : thread_num);
 #endif
 
   // Tree must be built and prepared before it can be launched!
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc
index 453204cd16b..8ad5310ad66 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc
@@ -109,7 +109,7 @@ Status CSVNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) {
         std::make_shared<CsvOp::Record<float>>(CsvOp::FLOAT, std::dynamic_pointer_cast<CsvRecord<float>>(v)->value));
     } else if (v->type == CsvType::STRING) {
       column_default_list.push_back(std::make_shared<CsvOp::Record<std::string>>(
-        CsvOp::STRING, std::dynamic_pointer_cast<CsvRecord<std::string>>(v)->value));
+        CsvOp::STRING, (std::dynamic_pointer_cast<CsvRecord<std::string>>(v))->value));
     }
   }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.cc
index 405284f653e..bb3fce5fdb3 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.cc
@@ -81,7 +81,8 @@ void MindDataNode::Print(std::ostream &out) const { out << Name() + "(file:" + d
 
 Status MindDataNode::ValidateParams() {
   RETURN_IF_NOT_OK(DatasetNode::ValidateParams());
-  if (!search_for_pattern_ && dataset_files_.size() > 4096) {
+  constexpr size_t max_len = 4096;
+  if (!search_for_pattern_ && dataset_files_.size() > max_len) {
     std::string err_msg =
       "MindDataNode: length of dataset_file must be less than or equal to 4096, dataset_file length: " +
       std::to_string(dataset_file_.size());
diff --git a/mindspore/ccsrc/minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.cc b/mindspore/ccsrc/minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.cc
index e934dc4d2e9..e211f03b228 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.cc
@@ -40,8 +40,9 @@ Status TensorOpFusionPass::Visit(std::shared_ptr<MapNode> node, bool *const modi
                          [](auto op, const std::string &nm) { return op->Name() == nm; });
   if (itr != ops.end()) {
     MS_LOG(WARNING) << "Fusing pre-build Decode and RandomCropResize into one pre-build.";
-    auto op = dynamic_cast<RandomCropAndResizeOp *>((*(itr + 1))->Build().get());
-    (*itr) = std::make_shared<transforms::PreBuiltOperation>(std::make_shared<RandomCropDecodeResizeOp>(*op));
+    auto fused_op = dynamic_cast<RandomCropAndResizeOp *>((*(itr + 1))->Build().get());
+    RETURN_UNEXPECTED_IF_NULL(fused_op);
+    (*itr) = std::make_shared<transforms::PreBuiltOperation>(std::make_shared<RandomCropDecodeResizeOp>(*fused_op));
     ops.erase(itr + 1);
     node->setOperations(ops);
     *modified = true;
@@ -55,10 +56,10 @@ Status TensorOpFusionPass::Visit(std::shared_ptr<MapNode> node, bool *const modi
 
   // return here if no pattern is found
   RETURN_OK_IF_TRUE(itr == ops.end());
-  auto *op = dynamic_cast<vision::RandomResizedCropOperation *>((itr + 1)->get());
-  RETURN_UNEXPECTED_IF_NULL(op);
+  auto *fused_ir = dynamic_cast<vision::RandomResizedCropOperation *>((itr + 1)->get());
+  RETURN_UNEXPECTED_IF_NULL(fused_ir);
   // fuse the two ops
-  (*itr) = std::make_shared<vision::RandomCropDecodeResizeOperation>(*op);
+  (*itr) = std::make_shared<vision::RandomCropDecodeResizeOperation>(*fused_ir);
   ops.erase(itr + 1);
   node->setOperations(ops);
   *modified = true;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/opt/post/auto_worker_pass.cc b/mindspore/ccsrc/minddata/dataset/engine/opt/post/auto_worker_pass.cc
index 3c70bc7f6c0..ead6bd4d69f 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/opt/post/auto_worker_pass.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/opt/post/auto_worker_pass.cc
@@ -47,7 +47,8 @@ Status AutoWorkerPass::RunOnTree(std::shared_ptr<DatasetNode> root_ir, bool *con
   float max_weight = 0;
   for (const auto &p : pass.weight_profile_) max_weight = std::max(max_weight, p.second);
   RETURN_IF_NOT_OK(pass.Run(root_ir, modified));
-  if (pass.parallel_ops_.size() > 3) {
+  constexpr size_t max_num_ops = 3;
+  if (pass.parallel_ops_.size() > max_num_ops) {
     MS_LOG(WARNING) << "AutoNumWorker right now is only suitable for simple dataset pipelines that has at most, 1 leaf "
                     << "1 batch and 1 map. AutoNumWorker may not be optimal for usage on complex pipelines.";
   }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.cc b/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.cc
index b261357336a..62ae0bdf5f1 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.cc
@@ -360,15 +360,16 @@ Status OperatorCpu::Analyze(std::string *name, double *utilization, std::string
 
   // Only analyze the middle half of the samples
   // Starting and ending may be impacted by startup or ending pipeline activities
-  int start_analyze = total_samples / 4;
-  int end_analyze = total_samples - start_analyze;
+  constexpr int64_t sample_sections = 4;
+  int64 start_analyze = total_samples / sample_sections;
+  int64 end_analyze = total_samples - start_analyze;
   double op_util = 0;
   *utilization = 0;
 
   // start loop from 0 was as don't want to analyze op -1
   for (auto op_id = 0; op_id < id_count_; op_id++) {
-    int sum = 0;
-    int index = op_id + 1;
+    int64 sum = 0;
+    int64 index = op_id + 1;
     for (int i = start_analyze; i < end_analyze; i++) {
       sum += cpu_op_util_[i][index].user_utilization_;
       sum += cpu_op_util_[i][index].sys_utilization_;
@@ -517,11 +518,12 @@ Status ProcessCpu::Analyze(std::string *name, double *utilization, std::string *
   name->clear();
   name->append("process_info");
   int total_samples = process_util_.size();
-  int sum = 0;
+  int64 sum = 0;
   // Only analyze the middle half of the samples
   // Starting and ending may be impacted by startup or ending pipeline activities
-  int start_analyze = total_samples / 4;
-  int end_analyze = total_samples - start_analyze;
+  constexpr int64_t sample_sections = 4;
+  int64 start_analyze = total_samples / sample_sections;
+  int64 end_analyze = total_samples - start_analyze;
 
   for (int i = start_analyze; i < end_analyze; i++) {
     sum += process_util_[i].user_utilization_;
@@ -614,7 +616,8 @@ Status CpuSampling::SaveSamplingItervalToFile() {
 Status CpuSampling::Analyze() {
   std::string name;
   double utilization = 0;
-
+  constexpr double total_cpu_thold = 90;
+  constexpr double op_cpu_thold = 80;
   // Keep track of specific information returned by differentn CPU sampling types
   double total_utilization = 0;
   double max_op_utilization = 0;
@@ -633,7 +636,7 @@ Status CpuSampling::Analyze() {
       detailed_op_cpu_message = extra_message;
     }
   }
-  if ((total_utilization < 90) && (max_op_utilization > 80)) {
+  if ((total_utilization < total_cpu_thold) && (max_op_utilization > op_cpu_thold)) {
     MS_LOG(WARNING) << "Operator " << max_op_name << " is using " << max_op_utilization << "% CPU per thread. "
                     << "This operator may benefit from increasing num_parallel_workers."
<< "Full Operator CPU utiliization for all operators: " << detailed_op_cpu_message << std::endl; diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/center_crop_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/center_crop_op.cc index 70529817dab..03663279942 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/image/center_crop_op.cc +++ b/mindspore/ccsrc/minddata/dataset/kernels/image/center_crop_op.cc @@ -43,7 +43,8 @@ Status CenterCropOp::Compute(const std::shared_ptr &input, std::shared_p int32_t left = crop_wid_ - input->shape()[1]; std::shared_ptr pad_image; - CHECK_FAIL_RETURN_UNEXPECTED((top < input->shape()[0] * 3 && left < input->shape()[1] * 3), + constexpr int64_t pad_limit = 3; + CHECK_FAIL_RETURN_UNEXPECTED((top < input->shape()[0] * pad_limit && left < input->shape()[1] * pad_limit), "CenterCrop: CenterCropOp padding size is more than 3 times the original size."); if (top > 0 && left > 0) { // padding only diff --git a/mindspore/ccsrc/minddata/dataset/kernels/ir/validators.cc b/mindspore/ccsrc/minddata/dataset/kernels/ir/validators.cc index 6d6f31aca93..668337777c2 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/ir/validators.cc +++ b/mindspore/ccsrc/minddata/dataset/kernels/ir/validators.cc @@ -60,7 +60,8 @@ Status ValidateVectorColorAttribute(const std::string &op_name, const std::strin for (auto &attr_val : attr) { RETURN_IF_NOT_OK(ValidateScalar(op_name, attr_name, attr_val, range, false, false)); } - if (attr.size() == 2 && (attr[0] > attr[1])) { + constexpr size_t attr_size_two = 2; + if (attr.size() == attr_size_two && (attr[0] > attr[1])) { std::string err_msg = op_name + ":" + attr_name + " lower bound must be less or equal to upper bound, got lb: " + std::to_string(attr[0]) + ", ub: " + std::to_string(attr[1]); diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/ngram_op.cc b/mindspore/ccsrc/minddata/dataset/text/kernels/ngram_op.cc index 5f19dee6730..0e5884ff6b9 100644 --- a/mindspore/ccsrc/minddata/dataset/text/kernels/ngram_op.cc +++ b/mindspore/ccsrc/minddata/dataset/text/kernels/ngram_op.cc @@ -44,7 +44,7 @@ Status NgramOp::Compute(const std::shared_ptr &input, std::shared_ptrshape().NumOfElements()); str_buffer.reserve(l_pad_with_sp_.size() * l_len_ + r_pad_with_sp_.size() * r_len_ + input->SizeInBytes()); offsets.push_back(str_buffer.size()); // insert 0 as the starting pos - for (int i = 0; i < l_len_; i++) offsets.push_back((str_buffer += l_pad_with_sp_).size()); + for (int l_i = 0; l_i < l_len_; l_i++) offsets.push_back((str_buffer += l_pad_with_sp_).size()); for (auto itr = input->begin(); itr != input->end(); itr++) { str_buffer += (*itr); @@ -52,7 +52,7 @@ Status NgramOp::Compute(const std::shared_ptr &input, std::shared_ptr 0, "Ngram: ngrams needs to be a positive number.\n"); @@ -63,8 +63,8 @@ Status NgramOp::Compute(const std::shared_ptr &input, std::shared_ptr= 0, "Ngram: get offsets failed."); - for (int i = start_ind; i < end_ind - n; i++) { - res.emplace_back(str_buffer.substr(offsets[i], offsets[i + n] - offsets[i] - separator_.size())); + for (int ind = start_ind; ind < end_ind - n; ind++) { + res.emplace_back(str_buffer.substr(offsets[ind], offsets[ind + n] - offsets[ind] - separator_.size())); } } } diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py index 829b5cbe69b..ddc0ff475c2 100644 --- a/mindspore/dataset/engine/datasets.py +++ b/mindspore/dataset/engine/datasets.py @@ -53,12 +53,12 @@ from .iterators import DictIterator, TupleIterator, DummyIterator, check_iterato 
     ITERATORS_LIST, _unset_iterator_cleanup
 from .queue import _SharedQueue
 from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
-    check_rename, check_numpyslicesdataset, check_device_send, \
-    check_take, check_project, check_imagefolderdataset, check_mnist_cifar_dataset, check_manifestdataset, \
-    check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset, \
-    check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \
-    check_random_dataset, check_split, check_bucket_batch_by_length, check_cluedataset, check_save, check_csvdataset, \
-    check_paddeddataset, check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send
+    check_rename, check_numpyslicesdataset, check_device_send, check_take, check_project, check_imagefolderdataset, \
+    check_mnist_cifar_dataset, check_manifestdataset, check_tfrecorddataset, check_vocdataset, check_cocodataset, \
+    check_celebadataset, check_minddataset, check_generatordataset, check_sync_wait, check_zip_dataset, \
+    check_add_column, check_textfiledataset, check_concat, check_random_dataset, check_split, \
+    check_bucket_batch_by_length, check_cluedataset, check_save, check_csvdataset, check_paddeddataset, \
+    check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send
 from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \
     get_prefetch_size, get_dynamic_columns
 from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist
@@ -88,7 +88,7 @@ def shuffle_to_shuffle_mode(shuffle):
         if shuffle is None or shuffle:
             shuffle_mode = cde.ShuffleMode.GLOBAL  # Global shuffle
         else:
-            shuffle_mode = cde.ShuffleMode.FALSE # No shuffle
+            shuffle_mode = cde.ShuffleMode.FALSE  # No shuffle
     else:
         shuffle_mode = ShuffleToShuffleMode[shuffle]
     return shuffle_mode
@@ -156,6 +156,7 @@ def _get_operator_process():
             fetched_all = fetched_all and item_full
     return op_process, fetched_all
 
+
 def _set_dataset_permissions(file_name, num_files):
     """
     set saved dataset files' permissions to 600
@@ -174,6 +175,7 @@ def _set_dataset_permissions(file_name, num_files):
         if os.path.exists(index_file):
             os.chmod(index_file, stat.S_IRUSR | stat.S_IWUSR)
 
+
 class Dataset:
     """
     Abstract class to represent a dataset in DataEngine's data pipeline.
@@ -1593,7 +1595,7 @@ class Dataset:
             for col in data.keys():
                 if col in dynamic_columns:
                     shape_mismatch = "dynamic column [" + col + "] with shape " + str(dynamic_columns[col]) + \
-                                     " does not match dataset column [" + col + "] with shape " + str(list(data[col].shape))
+                        " does not match dataset column [" + col + "] with shape " + str(list(data[col].shape))
                     if data[col].ndim != len(dynamic_columns[col]):
                         raise RuntimeError(shape_mismatch)
                     for dim in range(len(dynamic_columns[col])):
@@ -1850,6 +1852,7 @@ class MappableDataset(SourceDataset):
         self.sampler = samplers.select_sampler(num_samples, sampler, shuffle, num_shards, shard_id)
 
     def add_sampler(self, new_sampler):
+        """ add a sampler """
         # note: By adding a sampler, the sampled IDs will flow to new_sampler
         # after first passing through the current samplers attached to this dataset.
         self.dataset_size = None
@@ -2365,7 +2368,6 @@ def _pyfunc_worker_init(pyfunc_list, args_queue, ret_queue):
     _RET_QUEUE = ret_queue
 
 
-
 # Pyfunc worker execution function
 # All exceptions will be raised to main processes
 def _pyfunc_worker_exec(index, qid, *args):
@@ -2388,6 +2390,7 @@ def _pyfunc_worker_exec(index, qid, *args):
     ## not using shared memory for passing arguments, call function directly
     return _GLOBAL_PYFUNC_LIST[index](*args)
 
+
 # PythonCallable wrapper for multiprocess pyfunc
 class _PythonCallable:
     """
@@ -3416,12 +3419,11 @@ class SamplerFn:
         self.eof = threading.Event()
         # Create workers
-        #get default queue size and adjust queuesize per worker if there are large # workers
+        # get default queue size and adjust queuesize per worker if there are large # workers
         queue_size = get_prefetch_size()
         queue_size = min(queue_size, queue_size * 4 // num_worker)
         queue_size = max(2, queue_size)
-
         for _ in range(num_worker):
             if multi_process is True:
                 worker = _GeneratorWorkerMp(dataset, self.eof, max_rowsize, queue_size)
diff --git a/mindspore/dataset/engine/samplers.py b/mindspore/dataset/engine/samplers.py
index 98c1fb1ae86..673cf51e829 100644
--- a/mindspore/dataset/engine/samplers.py
+++ b/mindspore/dataset/engine/samplers.py
@@ -121,6 +121,7 @@ class BuiltinSampler:
         self.child_sampler = sampler
 
     def get_child(self):
+        """ add a child sampler """
         return self.child_sampler
 
     def parse_child(self):
@@ -136,9 +137,11 @@ class BuiltinSampler:
         return c_child_sampler
 
     def is_shuffled(self):
+        """ not implemented """
         raise NotImplementedError("Sampler must implement is_shuffled.")
 
     def is_sharded(self):
+        """ not implemented """
         raise NotImplementedError("Sampler must implement is_sharded.")
 
     def get_num_samples(self):
diff --git a/mindspore/dataset/engine/serializer_deserializer.py b/mindspore/dataset/engine/serializer_deserializer.py
index c8770e7e75b..61ec30f947f 100644
--- a/mindspore/dataset/engine/serializer_deserializer.py
+++ b/mindspore/dataset/engine/serializer_deserializer.py
@@ -392,6 +392,7 @@ def construct_tensor_ops(operations):
 
 
 def to_policy(op_list):
+    """ op_list to policy """
     policy_tensor_ops = []
     for policy_list in op_list:
         sub_policy_tensor_ops = []
@@ -403,12 +404,17 @@ def to_policy(op_list):
 
 
 def to_shuffle_mode(shuffle):
-    if shuffle == 2: return "global"
-    if shuffle == 1: return "files"
-    return False
+    """ int to shuffle mode """
+    ret_val = False
+    if shuffle == 2:
+        ret_val = "global"
+    elif shuffle == 1:
+        ret_val = "files"
+    return ret_val
 
 
 def to_interpolation_mode(inter):
+    """ int to interpolation mode """
     return {
         0: Inter.LINEAR,
         1: Inter.NEAREST,
@@ -418,6 +424,7 @@ def to_interpolation_mode(inter):
 
 
 def to_border_mode(border):
+    """ int to border mode """
     return {
         0: Border.CONSTANT,
         1: Border.EDGE,
@@ -427,6 +434,7 @@ def to_border_mode(border):
 
 
 def to_mstype(data_type):
+    """ str to mstype """
     return {
         "bool": mstype.bool_,
         "int8": mstype.int8,
@@ -445,6 +453,7 @@ def to_mstype(data_type):
 
 
 def to_image_batch_format(image_batch_format):
+    """ int to image batch format """
     return {
         0: ImageBatchFormat.NHWC,
         1: ImageBatchFormat.NCHW
@@ -452,4 +461,5 @@ def to_image_batch_format(image_batch_format):
 
 
 def check_and_replace_input(input_value, expect, replace):
+    """ check and replace input arg """
     return replace if input_value == expect else input_value
diff --git a/mindspore/dataset/engine/validators.py b/mindspore/dataset/engine/validators.py
index f06adbebcd7..9ee57241f17 100644
--- a/mindspore/dataset/engine/validators.py
+++ b/mindspore/dataset/engine/validators.py
@@ -465,7 +465,8 @@ def check_pad_info(key, val):
         for dim in val[0]:
             if dim is not None:
                 type_check(dim, (int,), "dim in pad_shape")
-                assert dim > 0, "pad shape should be positive integers"
+                if dim <= 0:
+                    raise ValueError("pad shape should be positive integers")
 
     if val[1] is not None:
         type_check(val[1], (int, float, str, bytes), "pad_value")
diff --git a/mindspore/dataset/text/__init__.py b/mindspore/dataset/text/__init__.py
index 0ee47b7cd00..803ba030a0c 100644
--- a/mindspore/dataset/text/__init__.py
+++ b/mindspore/dataset/text/__init__.py
@@ -18,12 +18,11 @@ NLP text processing module which is developed with ICU4C and cppjieba.
 utils provides some general methods for NLP text processing.
 """
 import platform
-from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \
-    ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer
+from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \
+    TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer
 from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm, SentencePieceVocab, SentencePieceModel, \
     SPieceTokenizerOutType, SPieceTokenizerLoadType
 
-
 __all__ = [
     "Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram", "to_str", "to_bytes", "Vocab", "WordpieceTokenizer",
     "TruncateSequencePair", "ToNumber",
diff --git a/mindspore/dataset/text/utils.py b/mindspore/dataset/text/utils.py
index 5fcafb16205..7dde91a4c95 100644
--- a/mindspore/dataset/text/utils.py
+++ b/mindspore/dataset/text/utils.py
@@ -79,7 +79,7 @@ class Vocab(cde.Vocab):
             word_list(list): a list of string where each element is a word of type string.
             special_tokens(list, optional): a list of strings, each one is a special token. for example
                 special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
-            special_first(bool, optional): whether special_tokens will be prepended/appended to vocab, If special_tokens
+            special_first(bool, optional): whether special_tokens is prepended or appended to vocab. If special_tokens
                 is specified and special_first is set to True, special_tokens will be prepended (default=True).
 
         Returns:
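
The recurring C++ change in this patch is a single small refactor applied in many files: a bare numeric literal is lifted into a named constexpr constant right before its use, so the comparison or cap reads as intent rather than as a magic number. A minimal standalone sketch of that pattern follows; it is not code from the MindSpore tree, only an illustration that mirrors the shape of AdjustNumWorkers in cache_server.cc, and the example values in main are made up.

#include <algorithm>
#include <cstdint>
#include <iostream>

// Illustrative sketch of the "name the magic number" pattern used throughout the patch.
int32_t AdjustNumWorkers(int32_t num_workers, int32_t num_cpus, int32_t num_numa_nodes) {
  constexpr int32_t kThreadsPerCore = 2;  // cap workers at twice the hardware concurrency
  num_workers = std::max(num_numa_nodes, num_workers);
  num_workers = std::min(kThreadsPerCore * num_cpus, num_workers);
  // round num_workers up to a multiple of the numa node count
  int32_t remainder = num_workers % num_numa_nodes;
  if (remainder > 0) num_workers += (num_numa_nodes - remainder);
  return num_workers;
}

int main() {
  // 16 requested workers on 4 CPUs with 2 numa nodes -> capped to 8, already a multiple of 2
  std::cout << AdjustNumWorkers(16, 4, 2) << std::endl;
  return 0;
}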