forked from mindspore-Ecosystem/mindspore
!17161 FIX selected CI
From: @ziruiwu
Reviewed-by: @robingrosman, @john_tzanakakis
Signed-off-by: @robingrosman
commit 583857799f
@@ -1281,7 +1281,8 @@ int32_t CacheServer::Builder::AdjustNumWorkers(int32_t num_workers) {
   num_workers = std::max(num_numa_nodes, num_workers);
   // But also it shouldn't be too many more than the hardware concurrency
   int32_t num_cpus = hw_info_->GetCpuCount();
-  num_workers = std::min(2 * num_cpus, num_workers);
+  constexpr int32_t kThreadsPerCore = 2;
+  num_workers = std::min(kThreadsPerCore * num_cpus, num_workers);
   // Round up num_workers to a multiple of numa nodes.
   auto remainder = num_workers % num_numa_nodes;
   if (remainder > 0) num_workers += (num_numa_nodes - remainder);
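
The hunk above only names the magic number 2, but the sizing policy it touches is worth spelling out: clamp the requested worker count into [num_numa_nodes, kThreadsPerCore * num_cpus], then round up to a multiple of the NUMA node count so workers divide evenly across nodes. A minimal standalone sketch of that policy (function and parameter names here are illustrative, not the MindSpore API):

#include <algorithm>
#include <cstdint>

int32_t AdjustWorkers(int32_t requested, int32_t num_numa_nodes, int32_t num_cpus) {
  constexpr int32_t kThreadsPerCore = 2;
  // At least one worker per NUMA node, at most kThreadsPerCore workers per CPU.
  int32_t workers = std::max(num_numa_nodes, requested);
  workers = std::min(kThreadsPerCore * num_cpus, workers);
  // Round up to the next multiple of the NUMA node count.
  int32_t remainder = workers % num_numa_nodes;
  if (remainder > 0) workers += num_numa_nodes - remainder;
  return workers;
}

// Example: AdjustWorkers(5, 4, 16) == 8 -- 5 is clamped to [4, 32], then rounded up to 8.
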
@@ -63,13 +63,13 @@ Status BucketBatchByLengthOp::Builder::SanityCheck() const {
   return Status::OK();
 }
 
-Status BucketBatchByLengthOp::Builder::Build(std::shared_ptr<BucketBatchByLengthOp> *new_bucket_batch_by_length_op) {
+Status BucketBatchByLengthOp::Builder::Build(std::shared_ptr<BucketBatchByLengthOp> *bucket_batch_by_length_op) {
   RETURN_IF_NOT_OK(SanityCheck());
 
   // insert 0 for the first bucket
   (void)builder_bucket_boundaries_.insert(builder_bucket_boundaries_.begin(), 0);
 
-  *new_bucket_batch_by_length_op = std::make_shared<BucketBatchByLengthOp>(
+  *bucket_batch_by_length_op = std::make_shared<BucketBatchByLengthOp>(
     builder_length_dependent_columns_, builder_bucket_boundaries_, builder_bucket_batch_sizes_,
     builder_element_length_function_, builder_pad_info_, builder_pad_to_bucket_boundary_, builder_drop_remainder_,
     builder_op_connector_size_);
@@ -41,7 +41,7 @@ BuildVocabOp::BuildVocabOp(std::shared_ptr<Vocab> vocab, std::vector<std::string
   // init two queues for thread sync
   distributor_queue_ = std::make_unique<Queue<TensorRow>>(num_workers * op_conn_size);
   collector_queue_ =
-    std::make_unique<Queue<std::unique_ptr<std::unordered_map<std::string, int64_t>>>>(num_workers * op_conn_size);
+    std::make_unique<Queue<std::unique_ptr<std::unordered_map<std::string, int64_t>>>>((num_workers * op_conn_size));
 }
 
 Status BuildVocabOp::WorkerEntry(int32_t worker_id) {
@@ -35,8 +35,9 @@ ParallelOp::ParallelOp(int32_t num_workers, int32_t op_connector_size, std::shar
       epoch_sync_flag_(false) {
   // reduce excessive memory usage with high parallelism
   // when num_workers > 4, reduce op_connector_size to have similar total size if there were only 4 workers
-  if (num_workers_ > 4) {
-    oc_queue_size_ = std::max(1, op_connector_size * 4 / num_workers_);
+  constexpr int32_t worker_limit = 4;
+  if (num_workers_ > worker_limit) {
+    oc_queue_size_ = std::max(1, op_connector_size * worker_limit / num_workers_);
   }
 }
 
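
The constant introduced above encodes a memory cap: past four workers, the per-worker connector queue shrinks so that total queue capacity stays near the four-worker baseline. A sketch of the same rule in isolation (hypothetical names, not the MindSpore API):

#include <algorithm>
#include <cstdint>

int32_t PerWorkerQueueSize(int32_t num_workers, int32_t op_connector_size) {
  constexpr int32_t worker_limit = 4;
  if (num_workers > worker_limit) {
    // Keep num_workers * queue_size close to worker_limit * op_connector_size,
    // but never drop below one slot per worker.
    return std::max(1, op_connector_size * worker_limit / num_workers);
  }
  return op_connector_size;
}

// Example: PerWorkerQueueSize(16, 16) == 4, so 16 workers x 4 slots ~= 4 workers x 16 slots.
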
@@ -193,11 +193,15 @@ bool CelebAOp::CheckDatasetTypeValid() {
     return false;
   }
   // train:0, valid=1, test=2
-  if (usage_ == "train" && (type == 0)) {
+  constexpr int32_t train_type = 0;
+  constexpr int32_t valid_type = 1;
+  constexpr int32_t test_type = 2;
+
+  if (usage_ == "train" && (type == train_type)) {
     return true;
-  } else if (usage_ == "valid" && (type == 1)) {
+  } else if (usage_ == "valid" && (type == valid_type)) {
     return true;
-  } else if (usage_ == "test" && (type == 2)) {
+  } else if (usage_ == "test" && (type == test_type)) {
     return true;
   }
 
@@ -205,7 +205,9 @@ Status CifarOp::ReadCifar10BlockData() {
 Status CifarOp::ReadCifar100BlockData() {
   // CIFAR 100 has 2 bin files. train.bin (60K imgs) 153,700KB and test.bin (30,740KB) (10K imgs)
   // each img has two labels. Each row then is 32 * 32 *5 + 2 = 3,074 Bytes
   uint32_t num_cifar100_records = 0;  // test:10000, train:50000
+  constexpr uint32_t num_cifar100_test_records = 10000;
+  constexpr uint32_t num_cifar100_train_records = 50000;
   uint32_t block_size = (kCifarImageSize + 2) * kCifarBlockImageNum;  // about 2M
   std::vector<unsigned char> image_data(block_size * sizeof(unsigned char), 0);
   for (auto &file : cifar_files_) {
@@ -220,9 +222,9 @@ Status CifarOp::ReadCifar100BlockData() {
     if (usage_ == "test" && file_name.find("test") == std::string::npos) continue;
 
     if (file_name.find("test") != std::string::npos) {
-      num_cifar100_records = 10000;
+      num_cifar100_records = num_cifar100_test_records;
     } else if (file_name.find("train") != std::string::npos) {
-      num_cifar100_records = 50000;
+      num_cifar100_records = num_cifar100_train_records;
     } else {
       RETURN_STATUS_UNEXPECTED("Invalid file, Cifar100 train/test file not found in: " + file_name);
     }
@@ -165,10 +165,8 @@ Status ExecutionTree::Launch() {
     std::string err_msg = "Invalid thread number.";
     RETURN_STATUS_UNEXPECTED(err_msg);
   }
-  if (thread_num > 8)
-    cv::setNumThreads(8);
-  else
-    cv::setNumThreads(thread_num);
+  constexpr int32_t max_cv_threads_cnt = 8;
+  cv::setNumThreads(thread_num > max_cv_threads_cnt ? max_cv_threads_cnt : thread_num);
 #endif
 
   // Tree must be built and prepared before it can be launched!
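
The replacement collapses the if/else into a ternary that caps OpenCV's thread pool at eight. The same clamp can also be written with std::min; a sketch assuming only that cv::setNumThreads is available from OpenCV core:

#include <algorithm>
#include <opencv2/core/utility.hpp>  // cv::setNumThreads

void LimitCvThreads(int thread_num) {
  constexpr int max_cv_threads_cnt = 8;
  // Cap OpenCV's internal pool so image ops don't oversubscribe the CPU.
  cv::setNumThreads(std::min(thread_num, max_cv_threads_cnt));
}
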
@@ -109,7 +109,7 @@ Status CSVNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) {
         std::make_shared<CsvOp::Record<float>>(CsvOp::FLOAT, std::dynamic_pointer_cast<CsvRecord<float>>(v)->value));
     } else if (v->type == CsvType::STRING) {
       column_default_list.push_back(std::make_shared<CsvOp::Record<std::string>>(
-        CsvOp::STRING, std::dynamic_pointer_cast<CsvRecord<std::string>>(v)->value));
+        CsvOp::STRING, (std::dynamic_pointer_cast<CsvRecord<std::string>>(v))->value));
     }
   }
 
@@ -81,7 +81,8 @@ void MindDataNode::Print(std::ostream &out) const { out << Name() + "(file:" + d
 
 Status MindDataNode::ValidateParams() {
   RETURN_IF_NOT_OK(DatasetNode::ValidateParams());
-  if (!search_for_pattern_ && dataset_files_.size() > 4096) {
+  constexpr size_t max_len = 4096;
+  if (!search_for_pattern_ && dataset_files_.size() > max_len) {
     std::string err_msg =
       "MindDataNode: length of dataset_file must be less than or equal to 4096, dataset_file length: " +
       std::to_string(dataset_file_.size());
@@ -40,8 +40,9 @@ Status TensorOpFusionPass::Visit(std::shared_ptr<MapNode> node, bool *const modi
                          [](auto op, const std::string &nm) { return op->Name() == nm; });
   if (itr != ops.end()) {
     MS_LOG(WARNING) << "Fusing pre-build Decode and RandomCropResize into one pre-build.";
-    auto op = dynamic_cast<RandomCropAndResizeOp *>((*(itr + 1))->Build().get());
-    (*itr) = std::make_shared<transforms::PreBuiltOperation>(std::make_shared<RandomCropDecodeResizeOp>(*op));
+    auto fused_op = dynamic_cast<RandomCropAndResizeOp *>((*(itr + 1))->Build().get());
+    RETURN_UNEXPECTED_IF_NULL(fused_op);
+    (*itr) = std::make_shared<transforms::PreBuiltOperation>(std::make_shared<RandomCropDecodeResizeOp>(*fused_op));
     ops.erase(itr + 1);
     node->setOperations(ops);
     *modified = true;
@@ -55,10 +56,10 @@ Status TensorOpFusionPass::Visit(std::shared_ptr<MapNode> node, bool *const modi
 
   // return here if no pattern is found
   RETURN_OK_IF_TRUE(itr == ops.end());
-  auto *op = dynamic_cast<vision::RandomResizedCropOperation *>((itr + 1)->get());
-  RETURN_UNEXPECTED_IF_NULL(op);
+  auto *fused_ir = dynamic_cast<vision::RandomResizedCropOperation *>((itr + 1)->get());
+  RETURN_UNEXPECTED_IF_NULL(fused_ir);
   // fuse the two ops
-  (*itr) = std::make_shared<vision::RandomCropDecodeResizeOperation>(*op);
+  (*itr) = std::make_shared<vision::RandomCropDecodeResizeOperation>(*fused_ir);
   ops.erase(itr + 1);
   node->setOperations(ops);
   *modified = true;
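
Beyond the renames, the substantive fix in these two fusion hunks is the null check: dynamic_cast returns nullptr when the runtime type doesn't match, and dereferencing that result is undefined behavior. A generic sketch of the pattern with hypothetical types (not the MindSpore class hierarchy):

#include <iostream>

struct TensorOperation { virtual ~TensorOperation() = default; };
struct RandomResizedCrop : TensorOperation { int size = 224; };

bool FuseNext(TensorOperation *candidate) {
  // dynamic_cast yields nullptr if candidate is not a RandomResizedCrop,
  // so the result must be checked before it is dereferenced.
  auto *crop = dynamic_cast<RandomResizedCrop *>(candidate);
  if (crop == nullptr) return false;  // mirrors RETURN_UNEXPECTED_IF_NULL
  std::cout << "fusing crop of size " << crop->size << '\n';
  return true;
}
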
@@ -47,7 +47,8 @@ Status AutoWorkerPass::RunOnTree(std::shared_ptr<DatasetNode> root_ir, bool *con
   float max_weight = 0;
   for (const auto &p : pass.weight_profile_) max_weight = std::max(max_weight, p.second);
   RETURN_IF_NOT_OK(pass.Run(root_ir, modified));
-  if (pass.parallel_ops_.size() > 3) {
+  constexpr size_t max_num_ops = 3;
+  if (pass.parallel_ops_.size() > max_num_ops) {
     MS_LOG(WARNING) << "AutoNumWorker right now is only suitable for simple dataset pipelines that has at most, 1 leaf "
                     << "1 batch and 1 map. AutoNumWorker may not be optimal for usage on complex pipelines.";
   }
@@ -360,15 +360,16 @@ Status OperatorCpu::Analyze(std::string *name, double *utilization, std::string
 
   // Only analyze the middle half of the samples
   // Starting and ending may be impacted by startup or ending pipeline activities
-  int start_analyze = total_samples / 4;
-  int end_analyze = total_samples - start_analyze;
+  constexpr int64_t sample_sections = 4;
+  int64 start_analyze = total_samples / sample_sections;
+  int64 end_analyze = total_samples - start_analyze;
   double op_util = 0;
   *utilization = 0;
 
   // start loop from 0 was as don't want to analyze op -1
   for (auto op_id = 0; op_id < id_count_; op_id++) {
-    int sum = 0;
-    int index = op_id + 1;
+    int64 sum = 0;
+    int64 index = op_id + 1;
     for (int i = start_analyze; i < end_analyze; i++) {
       sum += cpu_op_util_[i][index].user_utilization_;
       sum += cpu_op_util_[i][index].sys_utilization_;
@@ -517,11 +518,12 @@ Status ProcessCpu::Analyze(std::string *name, double *utilization, std::string *
   name->clear();
   name->append("process_info");
   int total_samples = process_util_.size();
-  int sum = 0;
+  int64 sum = 0;
   // Only analyze the middle half of the samples
   // Starting and ending may be impacted by startup or ending pipeline activities
-  int start_analyze = total_samples / 4;
-  int end_analyze = total_samples - start_analyze;
+  constexpr int64_t sample_sections = 4;
+  int64 start_analyze = total_samples / sample_sections;
+  int64 end_analyze = total_samples - start_analyze;
 
   for (int i = start_analyze; i < end_analyze; i++) {
     sum += process_util_[i].user_utilization_;
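
Both Analyze() hunks widen counters to 64 bits and name the constant 4, but the underlying sampling trick is unchanged: drop the first and last quarter of the samples so pipeline startup and teardown don't skew the average. A self-contained sketch of that windowing (hypothetical names; plain int64_t in place of the in-tree int64 alias):

#include <cstdint>
#include <vector>

// Average a utilization series over its middle half.
double MiddleHalfAverage(const std::vector<double> &samples) {
  constexpr int64_t sample_sections = 4;
  const int64_t total = static_cast<int64_t>(samples.size());
  const int64_t start = total / sample_sections;  // skip the first quarter
  const int64_t end = total - start;              // skip the last quarter
  if (end <= start) return 0.0;                   // too few samples to analyze
  double sum = 0.0;
  for (int64_t i = start; i < end; ++i) sum += samples[i];
  return sum / static_cast<double>(end - start);
}
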
@@ -614,7 +616,8 @@ Status CpuSampling::SaveSamplingItervalToFile() {
 Status CpuSampling::Analyze() {
   std::string name;
   double utilization = 0;
+  constexpr double total_cpu_thold = 90;
+  constexpr double op_cpu_thold = 80;
   // Keep track of specific information returned by differentn CPU sampling types
   double total_utilization = 0;
   double max_op_utilization = 0;
@@ -633,7 +636,7 @@ Status CpuSampling::Analyze() {
       detailed_op_cpu_message = extra_message;
     }
   }
-  if ((total_utilization < 90) && (max_op_utilization > 80)) {
+  if ((total_utilization < total_cpu_thold) && (max_op_utilization > op_cpu_thold)) {
     MS_LOG(WARNING) << "Operator " << max_op_name << " is using " << max_op_utilization << "% CPU per thread. "
                     << "This operator may benefit from increasing num_parallel_workers."
                     << "Full Operator CPU utiliization for all operators: " << detailed_op_cpu_message << std::endl;
@@ -43,7 +43,8 @@ Status CenterCropOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_p
   int32_t left = crop_wid_ - input->shape()[1];
   std::shared_ptr<Tensor> pad_image;
 
-  CHECK_FAIL_RETURN_UNEXPECTED((top < input->shape()[0] * 3 && left < input->shape()[1] * 3),
+  constexpr int64_t pad_limit = 3;
+  CHECK_FAIL_RETURN_UNEXPECTED((top < input->shape()[0] * pad_limit && left < input->shape()[1] * pad_limit),
                                "CenterCrop: CenterCropOp padding size is more than 3 times the original size.");
 
   if (top > 0 && left > 0) {  // padding only
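
Context for the bound being named here: when the requested crop is larger than the input, CenterCrop pads the image instead of cropping, and the check rejects padding beyond three times the original dimension. A sketch of that guard on its own (hypothetical names, a plain exception in place of the Status macros):

#include <cstdint>
#include <stdexcept>

void CheckCenterCropPad(int64_t crop_h, int64_t crop_w, int64_t in_h, int64_t in_w) {
  constexpr int64_t pad_limit = 3;
  const int64_t top = crop_h - in_h;   // > 0 means vertical padding is needed
  const int64_t left = crop_w - in_w;  // > 0 means horizontal padding is needed
  if (top >= in_h * pad_limit || left >= in_w * pad_limit) {
    throw std::invalid_argument("CenterCrop: padding size is more than 3 times the original size.");
  }
}
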
@@ -60,7 +60,8 @@ Status ValidateVectorColorAttribute(const std::string &op_name, const std::strin
   for (auto &attr_val : attr) {
     RETURN_IF_NOT_OK(ValidateScalar(op_name, attr_name, attr_val, range, false, false));
   }
-  if (attr.size() == 2 && (attr[0] > attr[1])) {
+  constexpr size_t attr_size_two = 2;
+  if (attr.size() == attr_size_two && (attr[0] > attr[1])) {
     std::string err_msg = op_name + ":" + attr_name +
                           " lower bound must be less or equal to upper bound, got lb: " + std::to_string(attr[0]) +
                           ", ub: " + std::to_string(attr[1]);
@@ -44,7 +44,7 @@ Status NgramOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Te
   offsets.reserve(1 + l_len_ + r_len_ + input->shape().NumOfElements());
   str_buffer.reserve(l_pad_with_sp_.size() * l_len_ + r_pad_with_sp_.size() * r_len_ + input->SizeInBytes());
   offsets.push_back(str_buffer.size());  // insert 0 as the starting pos
-  for (int i = 0; i < l_len_; i++) offsets.push_back((str_buffer += l_pad_with_sp_).size());
+  for (int l_i = 0; l_i < l_len_; l_i++) offsets.push_back((str_buffer += l_pad_with_sp_).size());
 
   for (auto itr = input->begin<std::string_view>(); itr != input->end<std::string_view>(); itr++) {
     str_buffer += (*itr);
@@ -52,7 +52,7 @@ Status NgramOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Te
     offsets.push_back(str_buffer.size());
   }
 
-  for (int i = 0; i < r_len_; i++) offsets.push_back((str_buffer += r_pad_with_sp_).size());
+  for (int r_i = 0; r_i < r_len_; r_i++) offsets.push_back((str_buffer += r_pad_with_sp_).size());
 
   for (auto n : ngrams_) {
     CHECK_FAIL_RETURN_UNEXPECTED(n > 0, "Ngram: ngrams needs to be a positive number.\n");
@@ -63,8 +63,8 @@ Status NgramOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Te
     } else {
       CHECK_FAIL_RETURN_UNEXPECTED(end_ind - n >= 0, "Ngram: get offsets failed.");
 
-      for (int i = start_ind; i < end_ind - n; i++) {
-        res.emplace_back(str_buffer.substr(offsets[i], offsets[i + n] - offsets[i] - separator_.size()));
+      for (int ind = start_ind; ind < end_ind - n; ind++) {
+        res.emplace_back(str_buffer.substr(offsets[ind], offsets[ind + n] - offsets[ind] - separator_.size()));
       }
     }
   }
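
The three Ngram hunks only rename loop indices, but the offsets scheme they touch is worth spelling out: tokens (plus left/right padding) are appended to one flat buffer, an offset table records each token's boundary in that buffer, and every n-gram is then a single substr over the buffer, trimming the trailing separator. A simplified sketch of the technique (hypothetical names, no padding handling):

#include <iostream>
#include <string>
#include <vector>

std::vector<std::string> MakeNgrams(const std::vector<std::string> &tokens, size_t n,
                                    const std::string &sep) {
  std::string buf;
  std::vector<size_t> offsets{0};  // token k occupies [offsets[k], offsets[k + 1])
  for (const auto &tok : tokens) {
    buf += tok;
    buf += sep;
    offsets.push_back(buf.size());
  }
  std::vector<std::string> res;
  for (size_t i = 0; i + n < offsets.size(); ++i) {
    // Slice n consecutive tokens in one go, dropping the trailing separator.
    res.push_back(buf.substr(offsets[i], offsets[i + n] - offsets[i] - sep.size()));
  }
  return res;
}

int main() {
  for (const auto &gram : MakeNgrams({"the", "quick", "brown", "fox"}, 2, " ")) {
    std::cout << gram << '\n';  // the quick / quick brown / brown fox
  }
}
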
@@ -53,12 +53,12 @@ from .iterators import DictIterator, TupleIterator, DummyIterator, check_iterato
     ITERATORS_LIST, _unset_iterator_cleanup
 from .queue import _SharedQueue
 from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
-    check_rename, check_numpyslicesdataset, check_device_send, \
-    check_take, check_project, check_imagefolderdataset, check_mnist_cifar_dataset, check_manifestdataset, \
-    check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset, \
-    check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \
-    check_random_dataset, check_split, check_bucket_batch_by_length, check_cluedataset, check_save, check_csvdataset, \
-    check_paddeddataset, check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send
+    check_rename, check_numpyslicesdataset, check_device_send, check_take, check_project, check_imagefolderdataset, \
+    check_mnist_cifar_dataset, check_manifestdataset, check_tfrecorddataset, check_vocdataset, check_cocodataset, \
+    check_celebadataset, check_minddataset, check_generatordataset, check_sync_wait, check_zip_dataset, \
+    check_add_column, check_textfiledataset, check_concat, check_random_dataset, check_split, \
+    check_bucket_batch_by_length, check_cluedataset, check_save, check_csvdataset, check_paddeddataset, \
+    check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send
 from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \
     get_prefetch_size, get_dynamic_columns
 from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist
@@ -88,7 +88,7 @@ def shuffle_to_shuffle_mode(shuffle):
         if shuffle is None or shuffle:
-            shuffle_mode = cde.ShuffleMode.GLOBAL # Global shuffle
+            shuffle_mode = cde.ShuffleMode.GLOBAL  # Global shuffle
         else:
-            shuffle_mode = cde.ShuffleMode.FALSE # No shuffle
+            shuffle_mode = cde.ShuffleMode.FALSE  # No shuffle
     else:
         shuffle_mode = ShuffleToShuffleMode[shuffle]
     return shuffle_mode
@@ -156,6 +156,7 @@ def _get_operator_process():
         fetched_all = fetched_all and item_full
     return op_process, fetched_all
 
+
 def _set_dataset_permissions(file_name, num_files):
     """
     set saved dataset files' permissions to 600
@@ -174,6 +175,7 @@ def _set_dataset_permissions(file_name, num_files):
     if os.path.exists(index_file):
         os.chmod(index_file, stat.S_IRUSR | stat.S_IWUSR)
 
+
 class Dataset:
     """
     Abstract class to represent a dataset in DataEngine's data pipeline.
@@ -1593,7 +1595,7 @@ class Dataset:
         for col in data.keys():
             if col in dynamic_columns:
                 shape_mismatch = "dynamic column [" + col + "] with shape " + str(dynamic_columns[col]) + \
-                    " does not match dataset column [" + col + "] with shape " + str(list(data[col].shape))
+                                 " does not match dataset column [" + col + "] with shape " + str(list(data[col].shape))
                 if data[col].ndim != len(dynamic_columns[col]):
                     raise RuntimeError(shape_mismatch)
                 for dim in range(len(dynamic_columns[col])):
@@ -1850,6 +1852,7 @@ class MappableDataset(SourceDataset):
         self.sampler = samplers.select_sampler(num_samples, sampler, shuffle, num_shards, shard_id)
 
     def add_sampler(self, new_sampler):
+        """ add a sampler """
         # note: By adding a sampler, the sampled IDs will flow to new_sampler
         # after first passing through the current samplers attached to this dataset.
         self.dataset_size = None
@@ -2365,7 +2368,6 @@ def _pyfunc_worker_init(pyfunc_list, args_queue, ret_queue):
     _RET_QUEUE = ret_queue
 
 
-
 # Pyfunc worker execution function
 # All exceptions will be raised to main processes
 def _pyfunc_worker_exec(index, qid, *args):
@@ -2388,6 +2390,7 @@ def _pyfunc_worker_exec(index, qid, *args):
     ## not using shared memory for passing arguments, call function directly
     return _GLOBAL_PYFUNC_LIST[index](*args)
 
+
 # PythonCallable wrapper for multiprocess pyfunc
 class _PythonCallable:
     """
@@ -3416,12 +3419,11 @@ class SamplerFn:
         self.eof = threading.Event()
         # Create workers
 
-        #get default queue size and adjust queuesize per worker if there are large # workers
+        # get default queue size and adjust queuesize per worker if there are large # workers
         queue_size = get_prefetch_size()
         queue_size = min(queue_size, queue_size * 4 // num_worker)
         queue_size = max(2, queue_size)
 
-
         for _ in range(num_worker):
             if multi_process is True:
                 worker = _GeneratorWorkerMp(dataset, self.eof, max_rowsize, queue_size)
@@ -121,6 +121,7 @@ class BuiltinSampler:
         self.child_sampler = sampler
 
     def get_child(self):
+        """ add a child sampler """
        return self.child_sampler
 
     def parse_child(self):
@@ -136,9 +137,11 @@ class BuiltinSampler:
         return c_child_sampler
 
     def is_shuffled(self):
+        """ not implemented """
         raise NotImplementedError("Sampler must implement is_shuffled.")
 
     def is_sharded(self):
+        """ not implemented """
         raise NotImplementedError("Sampler must implement is_sharded.")
 
     def get_num_samples(self):
@@ -392,6 +392,7 @@ def construct_tensor_ops(operations):
 
 
 def to_policy(op_list):
+    """ op_list to policy """
     policy_tensor_ops = []
     for policy_list in op_list:
         sub_policy_tensor_ops = []
@@ -403,12 +404,17 @@ def to_policy(op_list):
 
 
 def to_shuffle_mode(shuffle):
-    if shuffle == 2: return "global"
-    if shuffle == 1: return "files"
-    return False
+    """ int to shuffle mode """
+    ret_val = False
+    if shuffle == 2:
+        ret_val = "global"
+    elif shuffle == 1:
+        ret_val = "files"
+    return ret_val
 
 
 def to_interpolation_mode(inter):
+    """ int to interpolation mode """
     return {
         0: Inter.LINEAR,
         1: Inter.NEAREST,
@@ -418,6 +424,7 @@ def to_interpolation_mode(inter):
 
 
 def to_border_mode(border):
+    """ int to border mode """
     return {
         0: Border.CONSTANT,
         1: Border.EDGE,
|
||||||
|
|
||||||
|
|
||||||
def to_mstype(data_type):
|
def to_mstype(data_type):
|
||||||
|
""" str to mstype """
|
||||||
return {
|
return {
|
||||||
"bool": mstype.bool_,
|
"bool": mstype.bool_,
|
||||||
"int8": mstype.int8,
|
"int8": mstype.int8,
|
||||||
|
@@ -445,6 +453,7 @@ def to_mstype(data_type):
 
 
 def to_image_batch_format(image_batch_format):
+    """ int to image batch format """
     return {
         0: ImageBatchFormat.NHWC,
         1: ImageBatchFormat.NCHW
@@ -452,4 +461,5 @@ def to_image_batch_format(image_batch_format):
 
 
 def check_and_replace_input(input_value, expect, replace):
+    """ check and replace input arg """
     return replace if input_value == expect else input_value
@@ -465,7 +465,8 @@ def check_pad_info(key, val):
         for dim in val[0]:
             if dim is not None:
                 type_check(dim, (int,), "dim in pad_shape")
-                assert dim > 0, "pad shape should be positive integers"
+                if dim <= 0:
+                    raise ValueError("pad shape should be positive integers")
         if val[1] is not None:
             type_check(val[1], (int, float, str, bytes), "pad_value")
 
@@ -18,12 +18,11 @@ NLP text processing module which is developed with ICU4C and cppjieba.
 utils provides some general methods for NLP text processing.
 """
 import platform
-from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \
-    ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer
+from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \
+    TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer
 from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm, SentencePieceVocab, SentencePieceModel, \
     SPieceTokenizerOutType, SPieceTokenizerLoadType
 
-
 __all__ = [
     "Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
     "to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber",
@@ -79,7 +79,7 @@ class Vocab(cde.Vocab):
         word_list(list): a list of string where each element is a word of type string.
         special_tokens(list, optional): a list of strings, each one is a special token. for example
             special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
-        special_first(bool, optional): whether special_tokens will be prepended/appended to vocab, If special_tokens
+        special_first(bool, optional): whether special_tokens is prepended or appended to vocab. If special_tokens
             is specified and special_first is set to True, special_tokens will be prepended (default=True).
 
     Returns: