diff --git a/build.sh b/build.sh
index 9e032688358..c2b4698276e 100755
--- a/build.sh
+++ b/build.sh
@@ -393,7 +393,7 @@ build_mindspore()
         CMAKE_VERBOSE="--verbose"
     fi
     cmake --build . --target package ${CMAKE_VERBOSE} -j$THREAD_NUM
-    echo "success to build mindspore project!"
+    echo "success building mindspore project!"
 }
 
 checkndk() {
diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
index 8ace68d2a3c..375f7b521b8 100644
--- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
@@ -21,6 +21,7 @@
 #include "minddata/dataset/include/transforms.h"
 #include "minddata/dataset/engine/dataset_iterator.h"
 // Source dataset headers (in alphabetical order)
+#include "minddata/dataset/engine/datasetops/source/album_op.h"
 #include "minddata/dataset/engine/datasetops/source/celeba_op.h"
 #include "minddata/dataset/engine/datasetops/source/cifar_op.h"
 #include "minddata/dataset/engine/datasetops/source/clue_op.h"
@@ -117,6 +118,15 @@ std::shared_ptr Schema(const std::string &schema_file) {
 // FUNCTIONS TO CREATE DATASETS FOR LEAF-NODE DATASETS
 // (In alphabetical order)
 
+// Function to create an AlbumDataset; returns nullptr when ValidateParams() rejects the arguments.
+std::shared_ptr Album(const std::string &dataset_dir, const std::string &data_schema,
+                      const std::vector &column_names, bool decode,
+                      const std::shared_ptr &sampler) {
+  auto ds = std::make_shared(dataset_dir, data_schema, column_names, decode, sampler);
+
+  return ds->ValidateParams() ? ds : nullptr;
+}
+
 // Function to create a CelebADataset.
std::shared_ptr CelebA(const std::string &dataset_dir, const std::string &dataset_type,
                        const std::shared_ptr &sampler, bool decode,
@@ -687,6 +697,49 @@ bool ValidateDatasetShardParams(const std::string &dataset_name, int32_t num_sha
 // DERIVED DATASET CLASSES LEAF-NODE DATASETS
 // (In alphabetical order)
 
+// Constructor for AlbumDataset: stores the arguments as-is; nothing is validated or opened here
+AlbumDataset::AlbumDataset(const std::string &dataset_dir, const std::string &data_schema,
+                           const std::vector &column_names, bool decode,
+                           const std::shared_ptr &sampler)
+    : dataset_dir_(dataset_dir),
+      schema_path_(data_schema),
+      column_names_(column_names),
+      decode_(decode),
+      sampler_(sampler) {}
+
+// Returns true only if both the dataset directory and the schema file pass the shared validators
+bool AlbumDataset::ValidateParams() {
+  if (!ValidateDatasetDirParam("AlbumDataset", dataset_dir_)) {
+    return false;
+  }
+
+  if (!ValidateDatasetFilesParam("AlbumDataset", {schema_path_})) {
+    return false;
+  }
+
+  return true;
+}
+
+// Function to build AlbumDataset: loads the schema and wraps a single op in the returned vector
+std::vector> AlbumDataset::Build() {
+  // A vector containing shared pointer to the Dataset Ops that this object will create
+  std::vector> node_ops;
+
+  // If user does not specify Sampler, create a default sampler, i.e., RandomSampler.
+  if (sampler_ == nullptr) {
+    sampler_ = CreateDefaultSampler();
+  }
+
+  auto schema = std::make_unique();
+  // RETURN_EMPTY_IF_ERROR: presumably returns an empty vector when the schema cannot be loaded - confirm
+  RETURN_EMPTY_IF_ERROR(schema->LoadSchemaFile(schema_path_, column_names_));
+
+  // Argument that is not exposed to user in the API.
+  // An empty extension set makes AlbumOp::PrescanEntry accept every file in the directory.
+  std::set extensions = {};
+
+  // NOTE(review): std::move() around sampler_->Build() looks redundant if Build() returns by value - confirm
+  node_ops.push_back(std::make_shared(num_workers_, rows_per_buffer_, dataset_dir_, connector_que_size_,
+                                      decode_, extensions, std::move(schema), std::move(sampler_->Build())));
+  return node_ops;
+}
+
 // Constructor for CelebADataset
 CelebADataset::CelebADataset(const std::string &dataset_dir, const std::string &dataset_type,
                              const std::shared_ptr &sampler, const bool &decode,
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/CMakeLists.txt
index 868c6fdb891..ae97e2f3f52 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/CMakeLists.txt
@@ -13,6 +13,7 @@ set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES
         text_file_op.cc
         clue_op.cc
         csv_op.cc
+        album_op.cc
         )
 
 set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc
new file mode 100644
index 00000000000..d48759b66cc
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc
@@ -0,0 +1,508 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "minddata/dataset/engine/datasetops/source/album_op.h"
+#include
+#include
+#include "minddata/dataset/core/config_manager.h"
+#include "minddata/dataset/core/tensor_shape.h"
+#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
+#include "minddata/dataset/engine/db_connector.h"
+#include "minddata/dataset/engine/execution_tree.h"
+#include "minddata/dataset/engine/opt/pass.h"
+#include "minddata/dataset/kernels/image/image_utils.h"
+
+namespace mindspore {
+namespace dataset {
+// Builder defaults: decoding off, no sampler, no schema file; worker/buffer/connector sizes come from the
+// global config manager. The initializer list follows the member declaration order in album_op.h.
+AlbumOp::Builder::Builder() : builder_decode_(false), builder_schema_file_(""), builder_sampler_(nullptr) {
+  std::shared_ptr cfg = GlobalContext::config_manager();
+  builder_num_workers_ = cfg->num_parallel_workers();
+  builder_rows_per_buffer_ = cfg->rows_per_buffer();
+  builder_op_connector_size_ = cfg->op_connector_size();
+}
+
+// Validates the builder settings, loads the schema and creates the AlbumOp in *ptr.
+// Fails if SanityCheck() fails, if the schema file is unset or missing, or if the schema cannot be loaded.
+Status AlbumOp::Builder::Build(std::shared_ptr *ptr) {
+  RETURN_IF_NOT_OK(SanityCheck());
+  if (builder_sampler_ == nullptr) {
+    int64_t num_samples = 0;  // default num samples of 0 means to sample entire set of data
+    int64_t start_index = 0;
+    builder_sampler_ = std::make_shared(start_index, num_samples);
+  }
+
+  builder_schema_ = std::make_unique();
+  Path schema_file(builder_schema_file_);
+  if (builder_schema_file_ == "" || !schema_file.Exists()) {
+    RETURN_STATUS_UNEXPECTED("Schema not provided");
+  }
+  MS_LOG(INFO) << "Schema file provided: " << builder_schema_file_ << ".";
+  // Propagate a schema load failure instead of building an op around a half-initialised schema
+  RETURN_IF_NOT_OK(builder_schema_->LoadSchemaFile(builder_schema_file_, builder_columns_to_load_));
+  *ptr = std::make_shared(builder_num_workers_, builder_rows_per_buffer_, builder_dir_,
+                          builder_op_connector_size_, builder_decode_, builder_extensions_,
+                          std::move(builder_schema_), std::move(builder_sampler_));
+  return Status::OK();
+}
+
+// Collects every problem with the builder settings into one error message; OK when there is none
+Status AlbumOp::Builder::SanityCheck() {
+  Path dir(builder_dir_);
+  std::string err_msg;
+  err_msg += dir.IsDirectory() == false ? "Album path is invalid or not set\n" : "";
+  err_msg += builder_num_workers_ <= 0 ? "Num of parallel workers is set to 0\n" : "";
+  return err_msg.empty() ? Status::OK() : Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg);
+}
+
+// Takes ownership of the schema and sampler, fills the column name map from the schema and
+// sizes one IOBlock queue per worker
+AlbumOp::AlbumOp(int32_t num_wkrs, int32_t rows_per_buffer, std::string file_dir, int32_t queue_size, bool do_decode,
+                 const std::set &exts, std::unique_ptr data_schema,
+                 std::shared_ptr sampler)
+    : ParallelOp(num_wkrs, queue_size),
+      rows_per_buffer_(rows_per_buffer),
+      folder_path_(file_dir),
+      decode_(do_decode),
+      extensions_(exts),
+      data_schema_(std::move(data_schema)),
+      sampler_(std::move(sampler)),
+      row_cnt_(0),
+      buf_cnt_(0),
+      sampler_ind_(0),
+      dirname_offset_(0) {
+  // Set the column name map (base class field)
+  for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
+    column_name_id_map_[data_schema_->column(i).name()] = i;
+  }
+  io_block_queues_.Init(num_workers_, queue_size);
+}
+
+// Helper function for string comparison
+bool StrComp(const std::string &a, const std::string &b) {
+  // returns true if string a is alphabetically
+  // less than string b
+  // quite similar to strcmp operation
+  return a < b;
+}
+
+// Single thread to go through the folder directory and gets all file names
+// calculate numRows then return
+Status AlbumOp::PrescanEntry() {
+  Path folder(folder_path_);
+  // File names are stored relative to the album folder, so remember how many characters to cut off
+  dirname_offset_ = folder_path_.length();
+  std::shared_ptr dirItr = Path::DirIterator::OpenDirectory(&folder);
+  if (folder.Exists() == false || dirItr == nullptr) {
+    RETURN_STATUS_UNEXPECTED("Error unable to open: " + folder_path_);
+  }
+  MS_LOG(INFO) << "Album folder Path found: " << folder_path_ << ".";
+
+  while (dirItr->hasNext()) {
+    Path file = dirItr->next();
+    // An empty extension set accepts every file
+    if (extensions_.empty() || extensions_.find(file.Extension()) != extensions_.end()) {
+      (void)image_rows_.push_back(file.toString().substr(dirname_offset_));
+    } else {
+      MS_LOG(INFO) << "Album operator unsupported file found: " << file.toString()
+                   << ", extension: " << file.Extension() << ".";
+    }
+  }
+
+  // Sort so that the row index of a file does not depend on directory iteration order
+  std::sort(image_rows_.begin(), image_rows_.end(), StrComp);
+  num_rows_ = image_rows_.size();
+  return Status::OK();
+}
+
+// Main logic, Register Queue with TaskGroup, launch all threads and do the functor's work
+Status AlbumOp::operator()() {
+  RETURN_IF_NOT_OK(this->PrescanEntry());
+  RETURN_IF_NOT_OK(LaunchThreadsAndInitOp());
+  std::unique_ptr sampler_buffer;
+  RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer));
+  while (true) {  // each iterator is 1 epoch
+    std::vector keys;
+    keys.reserve(rows_per_buffer_);
+    while (sampler_buffer->eoe() == false) {
+      TensorRow sample_row;
+      RETURN_IF_NOT_OK(sampler_buffer->PopRow(&sample_row));
+      std::shared_ptr sample_ids = sample_row[0];
+      for (auto itr = sample_ids->begin(); itr != sample_ids->end(); ++itr) {
+        if ((*itr) >= num_rows_) continue;  // index out of bound, skipping
+        keys.push_back(*itr);
+        row_cnt_++;
+        // A full buffer worth of keys goes to the next worker queue in round-robin order
+        if (row_cnt_ % rows_per_buffer_ == 0) {
+          RETURN_IF_NOT_OK(
+            io_block_queues_[buf_cnt_++ % num_workers_]->Add(std::make_unique(keys, IOBlock::kDeIoBlockNone)));
+          keys.clear();
+        }
+      }
+      RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer));
+    }
+    // Flush the last, partially filled block of the epoch
+    if (keys.empty() == false) {
+      RETURN_IF_NOT_OK(
+        io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique(keys, IOBlock::kDeIoBlockNone)));
+    }
+    if (IsLastIteration()) {
+      std::unique_ptr eoe_block = std::make_unique(IOBlock::kDeIoBlockFlagEoe);
+      std::unique_ptr eof_block = std::make_unique(IOBlock::kDeIoBlockFlagEof);
+      RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eoe_block)));
+      RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eof_block)));
+      // An IOBlock with no keys tells each worker to quit (see WorkerEntry)
+      for (int32_t i = 0; i < num_workers_; ++i) {
+        RETURN_IF_NOT_OK(
+          io_block_queues_[i]->Add(std::make_unique(std::vector(), IOBlock::kDeIoBlockNone)));
+      }
+      return Status::OK();
+    } else {  // not the last repeat. Sleep master thread, wait for the wake-up from reset
+      RETURN_IF_NOT_OK(
+        io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique(IOBlock::kDeIoBlockFlagEoe)));
+      RETURN_IF_NOT_OK(wp_.Wait());  // Master thread goes to sleep after it has made all the IOBlocks
+      wp_.Clear();
+      RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer));
+    }
+    UpdateRepeatAndEpochCounter();
+  }
+}
+
+// contains the main logic of pulling a IOBlock from IOBlockQueue, load a buffer and push the buffer to out_connector_
+// IMPORTANT: 1 IOBlock produces 1 DataBuffer
+Status AlbumOp::WorkerEntry(int32_t worker_id) {
+  TaskManager::FindMe()->Post();
+  int64_t buffer_id = worker_id;
+  std::unique_ptr io_block;
+  RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block));
+  while (io_block != nullptr) {
+    if (io_block->eoe() == true) {
+      RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique(0, DataBuffer::kDeBFlagEOE)));
+      buffer_id = worker_id;  // buffer ids restart with every epoch
+    } else if (io_block->eof() == true) {
+      RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique(0, DataBuffer::kDeBFlagEOF)));
+    } else {
+      std::vector keys;
+      RETURN_IF_NOT_OK(io_block->GetKeys(&keys));
+      if (keys.empty() == true) return Status::OK();  // empty key is a quit signal for workers
+      std::unique_ptr db = std::make_unique(buffer_id, DataBuffer::kDeBFlagNone);
+      RETURN_IF_NOT_OK(LoadBuffer(keys, &db));
+      RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(db)));
+      buffer_id += num_workers_;
+    }
+    RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block));
+  }
+  RETURN_STATUS_UNEXPECTED("Unexpected nullptr received in worker");
+}
+
+// Only support JPEG/PNG/GIF/BMP
+// Optimization: Could take in a tensor
+// Sets *valid to true when the first bytes of the file match one of the supported image signatures.
+// Returns an error if the file cannot be opened or its first 3 bytes cannot be read.
+Status AlbumOp::CheckImageType(const std::string &file_name, bool *valid) {
+  std::ifstream file_handle;
+  constexpr int read_num = 3;
+  *valid = false;
+  file_handle.open(file_name, std::ios::binary | std::ios::in);
+  if (!file_handle.is_open()) {
+    RETURN_STATUS_UNEXPECTED("Can not open image file " + file_name);
+  }
+  unsigned char file_type[read_num];
+  (void)file_handle.read(reinterpret_cast(file_type), read_num);
+
+  if (file_handle.fail()) {
+    file_handle.close();
+    RETURN_STATUS_UNEXPECTED("Read image file failed " + file_name);
+  }
+  file_handle.close();
+  if (file_type[0] == 0xff && file_type[1] == 0xd8 && file_type[2] == 0xff) {
+    // Normal JPEGs start with \xff\xd8\xff\xe0
+    // JPEG with EXIF starts with \xff\xd8\xff\xe1
+    // Use \xff\xd8\xff to cover both.
+    *valid = true;
+  } else if (file_type[0] == 0x89 && file_type[1] == 0x50 && file_type[2] == 0x4e) {
+    // It's a PNG
+    *valid = true;
+  } else if (file_type[0] == 0x47 && file_type[1] == 0x49 && file_type[2] == 0x46) {
+    // It's a GIF
+    *valid = true;
+  } else if (file_type[0] == 0x42 && file_type[1] == 0x4d) {
+    // It's a BMP
+    *valid = true;
+  }
+  return Status::OK();
+}
+
+// Appends the content of image_file_path to the row: decoded when decode_ is set and the file has a
+// supported image signature, raw file content otherwise. A file that cannot be opened yields an empty tensor.
+Status AlbumOp::LoadImageTensor(const std::string &image_file_path, uint32_t col_num, TensorRow *row) {
+  std::shared_ptr image;
+  std::ifstream fs;
+  fs.open(image_file_path, std::ios::binary | std::ios::in);
+  if (fs.fail()) {
+    MS_LOG(INFO) << "Image file not found:" << image_file_path << ".";
+    // A missing file is not flagged as an error, but the column still needs a tensor so that the row
+    // keeps one entry per schema column (the same placeholder LoadTensorRow uses for a missing JSON key)
+    RETURN_IF_NOT_OK(LoadEmptyTensor(col_num, row));
+    return Status::OK();
+  }
+
+  MS_LOG(INFO) << "Image file found: " << image_file_path << ".";
+
+  // check that the file is an image before decoding
+  bool valid = false;
+  RETURN_IF_NOT_OK(CheckImageType(image_file_path, &valid));
+  RETURN_IF_NOT_OK(Tensor::CreateFromFile(image_file_path, &image));
+  if (decode_ && valid) {
+    Status rc = Decode(image, &image);
+    if (rc.IsError()) {
+      std::string err = "Fail to decode image:" + image_file_path;
+      RETURN_STATUS_UNEXPECTED(err);
+    }
+  }
+  row->push_back(std::move(image));
+  return Status::OK();
+}
+
+// Appends a tensor built from a JSON array of strings
+Status AlbumOp::LoadStringArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) {
+  std::vector data = json_obj;
+
+  MS_LOG(INFO) << "String array label found: " << data << ".";
+  std::shared_ptr label;
+  RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &label));
+  row->push_back(std::move(label));
+  return Status::OK();
+}
+
+// Appends a scalar string tensor built from a JSON string
+Status AlbumOp::LoadStringTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) {
+  std::string data = json_obj;
+  // now we iterate over the elements in json
+
+  MS_LOG(INFO) << "String label found: " << data << ".";
+  std::shared_ptr label;
+  RETURN_IF_NOT_OK(Tensor::CreateScalar(data, &label));
+  row->push_back(std::move(label));
+  return Status::OK();
+}
+
+// Appends a tensor built from a JSON array of integers; the element width follows the schema type of the
+// column (int64 or int32), any other schema type is an error
+Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) {
+  std::shared_ptr label;
+  // consider templating this function to handle all ints
+  if (data_schema_->column(col_num).type() == DataType(DataType::DE_INT64)) {
+    std::vector data;
+
+    // Iterate over the integer list and add those values to the output shape tensor
+    auto items = json_obj.items();
+    using it_type = decltype(items.begin());
+    (void)std::transform(items.begin(), items.end(), std::back_inserter(data), [](it_type j) { return j.value(); });
+
+    RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &label));
+  } else if (data_schema_->column(col_num).type() == DataType(DataType::DE_INT32)) {
+    std::vector data;
+
+    // Iterate over the integer list and add those values to the output shape tensor
+    auto items = json_obj.items();
+    using it_type = decltype(items.begin());
+    (void)std::transform(items.begin(), items.end(), std::back_inserter(data), [](it_type j) { return j.value(); });
+
+    MS_LOG(INFO) << "Int array found: " << data << ".";
+    RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &label));
+  } else {
+    RETURN_STATUS_UNEXPECTED("Error in Load Int Tensor");
+  }
+  row->push_back(std::move(label));
+  return Status::OK();
+}
+
+// Appends the "id" column: the relative file name itself for a string column, otherwise the leading
+// number of the file name
+Status AlbumOp::LoadIDTensor(const std::string &file, uint32_t col_num, TensorRow *row) {
+  if (data_schema_->column(col_num).type() == DataType(DataType::DE_STRING)) {
+    std::shared_ptr id;
+    RETURN_IF_NOT_OK(Tensor::CreateScalar(file, &id));
+    row->push_back(std::move(id));
+    return Status::OK();
+  }
+  // hack to get the file name without extension: skip the first character (presumably the path separator
+  // left over from PrescanEntry's substr) and parse the leading digits; parsing stops at the first non-digit.
+  // strtoll keeps the full 64-bit range of image_id, which atoi (int) cannot represent.
+  int64_t image_id = std::strtoll(file.substr(1, file.find('.')).c_str(), nullptr, 10);
+  std::shared_ptr id;
+  RETURN_IF_NOT_OK(Tensor::CreateScalar(image_id, &id));
+  MS_LOG(INFO) << "File ID " << image_id << ".";
+  row->push_back(std::move(id));
+  return Status::OK();
+}
+
+// Appends a placeholder tensor created with an empty-list shape and the schema type of the column
+Status AlbumOp::LoadEmptyTensor(uint32_t col_num, TensorRow *row) {
+  std::shared_ptr empty_tensor;
+  RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({}), data_schema_->column(col_num).type(), &empty_tensor));
+  row->push_back(std::move(empty_tensor));
+  return Status::OK();
+}
+
+// Loads a tensor with float value, issue with float64, we don't have reverse look up to the type
+// So we actually have to check what type we want to fill the tensor with.
+// Float64 doesn't work with reinterpret cast here. Otherwise we limit the float in the schema to
+// only be float32, seems like a weird limitation to impose
+Status AlbumOp::LoadFloatTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) {
+  std::shared_ptr float_tensor;
+  if (data_schema_->column(col_num).type() == DataType(DataType::DE_FLOAT64)) {
+    double data = json_obj;
+    MS_LOG(INFO) << "double found: " << json_obj << ".";
+    RETURN_IF_NOT_OK(Tensor::CreateScalar(data, &float_tensor));
+  } else if (data_schema_->column(col_num).type() == DataType(DataType::DE_FLOAT32)) {
+    float data = json_obj;
+    RETURN_IF_NOT_OK(Tensor::CreateScalar(data, &float_tensor));
+    MS_LOG(INFO) << "float found: " << json_obj << ".";
+  } else {
+    // Mirror LoadIntArrayTensor: never append a null tensor for a column type this loader cannot fill
+    RETURN_STATUS_UNEXPECTED("Error in Load Float Tensor");
+  }
+  row->push_back(std::move(float_tensor));
+  return Status::OK();
+}
+
+// Load 1 TensorRow (image,label) using 1 ImageColumns.
+// 1 function call produces 1 TensorRow in a DataBuffer
+// possible optimization: the helper functions of LoadTensorRow should be optimized
+// to take a reference to a column descriptor?
+//
+// Reads the json file `file` (a path relative to folder_path_) line by line and appends one tensor per
+// schema column to *row for every line. Returns an error if the file cannot be opened, if a line is not
+// valid json, or if one of the column loaders fails.
+Status AlbumOp::LoadTensorRow(const std::string &file, TensorRow *row) {
+  // Start from an empty row; the caller reuses the same TensorRow object for every key
+  (*row) = {};
+  MS_LOG(INFO) << "Image row file: " << file << ".";
+
+  const std::string json_path = folder_path_ + file;
+  // The stream closes itself on every return path, including the early returns of RETURN_IF_NOT_OK
+  std::ifstream file_handle(json_path);
+  if (!file_handle.is_open()) {
+    RETURN_STATUS_UNEXPECTED("Json file " + json_path + " can not open.");
+  }
+  std::string line;
+  while (getline(file_handle, line)) {
+    try {
+      nlohmann::json js = nlohmann::json::parse(line);
+      MS_LOG(INFO) << "This Line: " << line << ".";
+
+      // note if take a schema here, then we have to iterate over all column descriptors in schema and check for key
+      // get columns in schema:
+      const int32_t columns = data_schema_->NumColumns();
+
+      // loop over each column descriptor, this can be optimized by switch cases
+      for (int32_t i = 0; i < columns; i++) {
+        // Look the descriptor up once per column instead of once per comparison
+        const auto &col = data_schema_->column(i);
+        const std::string col_name = col.name();
+        const auto col_type = col.type();
+        const bool is_string_col = col_type == DataType(DataType::DE_STRING);
+
+        // special case to handle
+        if (col_name == "id") {
+          // id is internal, special case to load from file
+          RETURN_IF_NOT_OK(LoadIDTensor(file, i, row));
+          continue;
+        }
+        // One lookup serves both the existence check and the value access
+        const auto found = js.find(col_name);
+        if (found == js.end()) {
+          // key not present in this json line, push an empty tensor as placeholder
+          MS_LOG(INFO) << "Pushing empty tensor for column: " << col_name << ".";
+          RETURN_IF_NOT_OK(LoadEmptyTensor(i, row));
+          continue;
+        }
+        const nlohmann::json &column_value = *found;
+        MS_LOG(INFO) << "This column is: " << col_name << ".";
+        const bool is_array = column_value.is_array();
+        // load single string
+        if (column_value.is_string() && is_string_col) {
+          RETURN_IF_NOT_OK(LoadStringTensor(column_value, i, row));
+          continue;
+        }
+        // load string array
+        if (is_array && is_string_col) {
+          RETURN_IF_NOT_OK(LoadStringArrayTensor(column_value, i, row));
+          continue;
+        }
+        // load image file: a json string in a non-string column is treated as a path to an image
+        if (column_value.is_string() && !is_string_col) {
+          std::string image_file_path = column_value;
+          RETURN_IF_NOT_OK(LoadImageTensor(image_file_path, i, row));
+          continue;
+        }
+        // load single float value
+        if (!is_array &&
+            (col_type == DataType(DataType::DE_FLOAT32) || col_type == DataType(DataType::DE_FLOAT64))) {
+          RETURN_IF_NOT_OK(LoadFloatTensor(column_value, i, row));
+          continue;
+        }
+        // int array
+        if (is_array && (col_type == DataType(DataType::DE_INT64) || col_type == DataType(DataType::DE_INT32))) {
+          RETURN_IF_NOT_OK(LoadIntArrayTensor(column_value, i, row));
+          continue;
+        }
+        // NOTE(review): nothing is appended for an unsupported value, so the row ends up with fewer tensors
+        // than the schema has columns - confirm that downstream ops tolerate this
+        MS_LOG(WARNING) << "Value type for column: " << col_name << " is not supported.";
+      }
+    } catch (const std::exception &err) {
+      // Report which file failed and why instead of dropping the exception text
+      RETURN_STATUS_UNEXPECTED("Parse Json file failed: " + json_path + ", " + err.what());
+    }
+  }
+  return Status::OK();
+}
+
+// Looping over LoadTensorRow to make 1 DataBuffer.
+// 1 function call produces 1 buffer
+// Each key is used as an index into image_rows_; the loaded rows become the tensor table of *db.
+// Stops at, and returns, the first LoadTensorRow error.
+Status AlbumOp::LoadBuffer(const std::vector &keys, std::unique_ptr *db) {
+  std::unique_ptr deq = std::make_unique();
+  TensorRow trow;
+
+  for (const int64_t &key : keys) {
+    // LoadTensorRow resets trow first, so reusing it after the std::move below is safe
+    RETURN_IF_NOT_OK(this->LoadTensorRow(image_rows_[key], &trow));
+    deq->push_back(std::move(trow));
+  }
+  (*db)->set_tensor_table(std::move(deq));
+  return Status::OK();
+}
+
+// Prints the operator id, then the ParallelOp info; with show_all also the row count and album directory
+void AlbumOp::Print(std::ostream &out, bool show_all) const {
+  // Always show the id and name as first line regardless if this summary or detailed print
+  out << "(" << std::setw(2) << operator_id_ << ") :";
+  if (!show_all) {
+    // Call the super class for displaying any common 1-liner info
+    ParallelOp::Print(out, show_all);
+    // Then show any custom derived-internal 1-liner info for this op
+    out << "\n";
+  } else {
+    // Call the super class for displaying any common detailed info
+    ParallelOp::Print(out, show_all);
+    // Then show any custom derived-internal stuff
+    out << "\nNumber of rows:" << num_rows_ << "\nAlbum directory: " << folder_path_ << "\n\n";
+  }
+}
+
+// Reset Sampler and wakeup Master thread (functor)
+Status AlbumOp::Reset() {
+  RETURN_IF_NOT_OK(sampler_->ResetSampler());
+  row_cnt_ = 0;
+  wp_.Set();  // wake up master thread after reset is done
+  return Status::OK();
+}
+
+// hand shake with Sampler, allow Sampler to call RandomAccessOp's functions to get NumRows
+Status AlbumOp::InitSampler() {
+  RETURN_IF_NOT_OK(sampler_->HandshakeRandomAccessOp(this));
+  return Status::OK();
+}
+
+// Registers the IOBlock queues and the wait-post with the tree's task group, starts the workers and
+// finally hands this op to the sampler. Fails if the op is not attached to a tree.
+Status AlbumOp::LaunchThreadsAndInitOp() {
+  RETURN_UNEXPECTED_IF_NULL(tree_);
+  // registers QueueList and individual Queues for interrupt services
+  RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks()));
+  RETURN_IF_NOT_OK(wp_.Register(tree_->AllTasks()));
+  // launch main workers that load DataBuffers by reading all images
+  RETURN_IF_NOT_OK(tree_->LaunchWorkers(num_workers_, std::bind(&AlbumOp::WorkerEntry, this, std::placeholders::_1)));
+  TaskManager::FindMe()->Post();
+  RETURN_IF_NOT_OK(this->InitSampler());  // pass numRows to Sampler
+  return Status::OK();
+}
+
+// Visitor accept method for NodePass
+Status AlbumOp::Accept(NodePass *p, bool *modified) {
+  // Downcast shared pointer then call visitor
+  return p->RunOnNode(shared_from_base(), modified);
+}
+
+// Fills the column name map from the schema if it is still empty.
+// NOTE(review): the AlbumOp constructor already fills column_name_id_map_, so unless something clears the
+// map in between this takes the warning branch - confirm whether that is intended.
+Status AlbumOp::ComputeColMap() {
+  // Set the column name map (base class field)
+  if (column_name_id_map_.empty()) {
+    for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
+      column_name_id_map_[data_schema_->column(i).name()] = i;
+    }
+  } else {
+    MS_LOG(WARNING) << "Column name map is already set!";
+  }
+  return Status::OK();
+}
+}  // namespace dataset
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.h
new file mode 100644
index 00000000000..3ef4e7bf894
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.h
@@ -0,0 +1,298 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_ALBUM_OP_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_ALBUM_OP_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/engine/data_buffer.h"
+#include "minddata/dataset/engine/data_schema.h"
+#include "minddata/dataset/engine/datasetops/parallel_op.h"
+#include "minddata/dataset/engine/datasetops/source/io_block.h"
+#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
+#include "minddata/dataset/util/path.h"
+#include "minddata/dataset/util/queue.h"
+#include "minddata/dataset/util/services.h"
+#include "minddata/dataset/util/status.h"
+#include "minddata/dataset/util/wait_post.h"
+
+namespace mindspore {
+namespace dataset {
+// Forward declares
+template
+class Queue;
+
+// Define row information as a list of file objects to read
+// NOTE(review): this alias is not used anywhere in album_op.h or album_op.cc - confirm it is needed
+using FolderImages = std::shared_ptr>>;
+
+/// \class AlbumOp album_op.h
+/// \brief Leaf operator that lists the files of one directory, reads each file as json and builds one
+///     tensor row per file according to the columns of a data schema
+class AlbumOp : public ParallelOp, public RandomAccessOp {
+ public:
+  class Builder {
+   public:
+    /// \brief Constructor for Builder class of AlbumOp
+    Builder();
+
+    /// \brief Destructor.
+    ~Builder() = default;
+
+    /// \brief Setter method
+    /// \param[in] rows_per_buffer
+    /// \return Builder setter method returns reference to the builder
+    Builder &SetRowsPerBuffer(int32_t rows_per_buffer) {
+      builder_rows_per_buffer_ = rows_per_buffer;
+      return *this;
+    }
+
+    /// \brief Setter method
+    /// \param[in] size
+    /// \return Builder setter method returns reference to the builder
+    Builder &SetOpConnectorSize(int32_t size) {
+      builder_op_connector_size_ = size;
+      return *this;
+    }
+
+    /// \brief Setter method
+    /// \param[in] exts - file extensions to be read
+    /// \return Builder setter method returns reference to the builder
+    Builder &SetExtensions(const std::set &exts) {
+      builder_extensions_ = exts;
+      return *this;
+    }
+
+    /// \brief Setter method
+    /// \param[in] do_decode
+    /// \return Builder setter method returns reference to the builder
+    Builder &SetDecode(bool do_decode) {
+      builder_decode_ = do_decode;
+      return *this;
+    }
+
+    /// \brief Setter method
+    /// \param[in] num_workers
+    /// \return Builder setter method returns reference to the builder
+    Builder &SetNumWorkers(int32_t num_workers) {
+      builder_num_workers_ = num_workers;
+      return *this;
+    }
+
+    /// \brief Setter method
+    /// \param[in] sampler
+    /// \return Builder setter method returns reference to the builder
+    Builder &SetSampler(std::shared_ptr sampler) {
+      builder_sampler_ = std::move(sampler);
+      return *this;
+    }
+
+    /// \brief Setter method
+    /// \param[in] dir - dataset directory
+    /// \return Builder setter method returns reference to the builder
+    Builder &SetAlbumDir(const std::string &dir) {
+      builder_dir_ = dir;
+      return *this;
+    }
+
+    /// \brief Setter method
+    /// \param[in] file - schema file to load
+    /// \return Builder setter method returns reference to the builder
+    Builder &SetSchemaFile(const std::string &file) {
+      builder_schema_file_ = file;
+      return *this;
+    }
+
+    /// \brief Setter method
+    /// \param[in] columns - input columns
+    /// \return Builder setter method returns reference to the builder
+    Builder &SetColumnsToLoad(const std::vector &columns) {
+      builder_columns_to_load_ = columns;
+      return *this;
+    }
+
+    /// \brief Check validity of input args
+    /// \return - The error code return
+    Status SanityCheck();
+
+    /// \brief The builder "build" method creates the final object.
+    /// \param[inout] std::shared_ptr *op - DatasetOp
+    /// \return - The error code return
+    Status Build(std::shared_ptr *op);
+
+   private:
+    bool builder_decode_;
+    std::vector builder_columns_to_load_;
+    std::string builder_dir_;
+    std::string builder_schema_file_;
+    int32_t builder_num_workers_;
+    int32_t builder_rows_per_buffer_;
+    int32_t builder_op_connector_size_;
+    std::set builder_extensions_;
+    std::shared_ptr builder_sampler_;
+    std::unique_ptr builder_schema_;
+  };
+
+  /// \brief Constructor
+  /// \param[in] num_wkrs - Num of workers reading images in parallel
+  /// \param[in] rows_per_buffer Number of images (rows) in each buffer
+  /// \param[in] file_dir - directory of Album
+  /// \param[in] queue_size - connector size
+  /// \param[in] do_decode - decode image files
+  /// \param[in] exts - set of file extensions to read, if empty, read everything under the dir
+  /// \param[in] data_schema - schema of dataset
+  /// \param[in] sampler - sampler tells AlbumOp what to read
+  AlbumOp(int32_t num_wkrs, int32_t rows_per_buffer, std::string file_dir, int32_t queue_size, bool do_decode,
+          const std::set &exts, std::unique_ptr data_schema, std::shared_ptr sampler);
+
+  /// \brief Destructor.
+  ~AlbumOp() = default;
+
+  /// \brief Initialize AlbumOp related var, calls the function to walk all files
+  /// \return - The error code return
+  Status PrescanEntry();
+
+  /// \brief Worker thread pulls a number of IOBlock from IOBlock Queue, make a buffer and push it to Connector
+  /// \param[in] int32_t workerId - id of each worker
+  /// \return Status - The error code return
+  Status WorkerEntry(int32_t worker_id) override;
+
+  /// \brief Main Loop of AlbumOp
+  ///     Master thread: Fill IOBlockQueue, then goes to sleep
+  ///     Worker thread: pulls IOBlock from IOBlockQueue, work on it then put buffer to mOutConnector
+  /// \return Status - The error code return
+  Status operator()() override;
+
+  /// \brief A print method typically used for debugging
+  /// \param[in] out
+  /// \param[in] show_all
+  void Print(std::ostream &out, bool show_all) const override;
+
+  /// \brief Check if image is valid. Only supports JPEG/PNG/GIF/BMP
+  ///     This function could be optimized to return the tensor to reduce open/closing files
+  /// \param[in] file_name Path of the file to check
+  /// \param[out] valid Set to true when the file starts with a supported image signature
+  /// \return Status - The error code return
+  Status CheckImageType(const std::string &file_name, bool *valid);
+
+  // Base-class override for NodePass visitor acceptor.
+  // @param p - Pointer to the NodePass to be accepted.
+  // @param modified - Whether this node visit modified the pipeline.
+  // @return - Status of the node visit.
+  Status Accept(NodePass *p, bool *modified) override;
+
+  // Op name getter
+  // @return Name of the current Op
+  std::string Name() const override { return "AlbumOp"; }
+
+ private:
+  /// \brief Initialize Sampler by handing this op to sampler->HandshakeRandomAccessOp()
+  /// \return Status The error code return
+  Status InitSampler();
+
+  /// \brief Load image to tensor row
+  /// \param[in] image_file Image name of file
+  /// \param[in] col_num Column num in schema
+  /// \param[inout] row Tensor row to push to
+  /// \return Status The error code return
+  Status LoadImageTensor(const std::string &image_file, uint32_t col_num, TensorRow *row);
+
+  /// \brief Load vector of ints to tensor, append tensor to tensor row
+  /// \param[in] json_obj Json object containing multi-dimensional label
+  /// \param[in] col_num Column num in schema
+  /// \param[inout] row Tensor row to push to
+  /// \return Status The error code return
+  Status LoadIntArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row);
+
+  /// \brief Load string array into a tensor, append tensor to tensor row
+  /// \param[in] json_obj Json object containing string tensor
+  /// \param[in] col_num Column num in schema
+  /// \param[inout] row Tensor row to push to
+  /// \return Status The error code return
+  Status LoadStringArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row);
+
+  /// \brief Load string into a tensor, append tensor to tensor row
+  /// \param[in] json_obj Json object containing string tensor
+  /// \param[in] col_num Column num in schema
+  /// \param[inout] row Tensor row to push to
+  /// \return Status The error code return
+  Status LoadStringTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row);
+
+  /// \brief Load float value to tensor row
+  /// \param[in] json_obj Json object containing float
+  /// \param[in] col_num Column num in schema
+  /// \param[inout] row Tensor row to push to
+  /// \return Status The error code return
+  Status LoadFloatTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row);
+
+  /// \brief Load empty tensor to tensor row
+  /// \param[in] col_num Column num in schema
+  /// \param[inout] row Tensor row to push to
+  /// \return Status The error code return
+  Status LoadEmptyTensor(uint32_t col_num, TensorRow *row);
+
+  /// \brief Load id from file name to tensor row
+  /// \param[in] file The file name to get ID from
+  /// \param[in] col_num Column num in schema
+  /// \param[inout] row Tensor row to push to
+  /// \return Status The error code return
+  Status LoadIDTensor(const std::string &file, uint32_t col_num, TensorRow *row);
+
+  /// \brief Load a tensor row according to a json file
+  /// \param[in] file Json file location, relative to the album directory
+  /// \param[inout] row Json content stored into a tensor row
+  /// \return Status The error code return
+  Status LoadTensorRow(const std::string &file, TensorRow *row);
+
+  /// \brief Load one tensor row per key and store the rows as the tensor table of the buffer
+  /// \param[in] const std::vector &keys Keys in ioblock
+  /// \param[inout] std::unique_ptr db Databuffer to push to
+  /// \return Status The error code return
+  Status LoadBuffer(const std::vector &keys, std::unique_ptr *db);
+
+  /// \brief Called at the start of operator()(): registers the queues and the wait-post, launches the
+  ///     workers, then initializes the sampler
+  /// \return The error code return
+  Status LaunchThreadsAndInitOp();
+
+  /// \brief reset Op
+  /// \return Status The error code return
+  Status Reset() override;
+
+  // Private function for computing the assignment of the column name map.
+  // @return - Status
+  Status ComputeColMap() override;
+
+  int32_t rows_per_buffer_;
+  std::string folder_path_;  // directory of image folder
+  bool decode_;
+  std::set extensions_;  // extensions allowed
+  std::unordered_map col_name_map_;  // NOTE(review): never referenced in album_op.cc - confirm it is needed
+  std::unique_ptr data_schema_;
+  std::shared_ptr sampler_;
+  int64_t row_cnt_;  // rows handed to the workers since the last Reset()
+  int64_t buf_cnt_;  // IOBlocks pushed so far, used to pick the worker queue round-robin
+  int64_t sampler_ind_;  // NOTE(review): only initialized in the constructor, never read in album_op.cc
+  int64_t dirname_offset_;  // length of folder_path_, used to cut file paths down to relative names
+  WaitPost wp_;
+  std::vector image_rows_;  // sorted relative file names, one per row
+  QueueList> io_block_queues_;  // queues of IOBlocks
+};
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_ALBUM_OP_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc
index 78dfae5dbe2..7959c35371e 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc
@@ -134,7 +134,6 @@ Status ImageFolderOp::operator()() {
       TensorRow sample_row;
       RETURN_IF_NOT_OK(sampler_buffer->PopRow(&sample_row));
       std::shared_ptr sample_ids = sample_row[0];
-      if (sample_ids->type() != DataType(DataType::DE_INT64)) RETURN_STATUS_UNEXPECTED("Sampler Tensor isn't int64");
       for (auto itr = sample_ids->begin(); itr != sample_ids->end(); ++itr) {
         if ((*itr) >= num_rows_) continue;  // index out of bound, skipping
         keys.push_back(*itr);
diff --git a/mindspore/ccsrc/minddata/dataset/engine/opt/pass.cc b/mindspore/ccsrc/minddata/dataset/engine/opt/pass.cc
index 4a2041e63d0..43fff5925ea 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/opt/pass.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/opt/pass.cc
@@ -30,6 +30,7 @@
 #include "minddata/dataset/engine/datasetops/repeat_op.h"
 #include "minddata/dataset/engine/datasetops/skip_op.h"
 #include "minddata/dataset/engine/datasetops/shuffle_op.h"
+#include "minddata/dataset/engine/datasetops/source/album_op.h"
 #include
"minddata/dataset/engine/datasetops/source/celeba_op.h" #include "minddata/dataset/engine/datasetops/source/cifar_op.h" #include "minddata/dataset/engine/datasetops/source/coco_op.h" @@ -199,6 +200,11 @@ Status NodePass::RunOnNode(std::shared_ptr node, bool *modified) return RunOnNode(std::static_pointer_cast(node), modified); } +Status NodePass::RunOnNode(std::shared_ptr node, bool *modified) { + // Fallback to base class visitor by default + return RunOnNode(std::static_pointer_cast(node), modified); +} + Status NodePass::RunOnNode(std::shared_ptr node, bool *modified) { // Fallback to base class visitor by default return RunOnNode(std::static_pointer_cast(node), modified); diff --git a/mindspore/ccsrc/minddata/dataset/engine/opt/pass.h b/mindspore/ccsrc/minddata/dataset/engine/opt/pass.h index f154b6c2054..b4c676589ea 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/opt/pass.h +++ b/mindspore/ccsrc/minddata/dataset/engine/opt/pass.h @@ -49,6 +49,8 @@ class FilterOp; class GeneratorOp; #endif +class AlbumOp; + class RandomDataOp; class RepeatOp; @@ -178,6 +180,8 @@ class NodePass : public Pass { virtual Status RunOnNode(std::shared_ptr node, bool *modified); + virtual Status RunOnNode(std::shared_ptr node, bool *modified); + virtual Status RunOnNode(std::shared_ptr node, bool *modified); virtual Status RunOnNode(std::shared_ptr node, bool *modified); diff --git a/mindspore/ccsrc/minddata/dataset/engine/opt/pre/cache_transform_pass.cc b/mindspore/ccsrc/minddata/dataset/engine/opt/pre/cache_transform_pass.cc index d8e1ddfc32c..12e47a3c514 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/opt/pre/cache_transform_pass.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/opt/pre/cache_transform_pass.cc @@ -21,6 +21,7 @@ #include "minddata/dataset/engine/datasetops/cache_lookup_op.h" #include "minddata/dataset/engine/datasetops/cache_merge_op.h" #include "minddata/dataset/engine/datasetops/cache_op.h" +#include "minddata/dataset/engine/datasetops/source/album_op.h" 
#include "minddata/dataset/engine/datasetops/source/celeba_op.h" #include "minddata/dataset/engine/datasetops/source/cifar_op.h" #include "minddata/dataset/engine/datasetops/source/coco_op.h" @@ -152,6 +153,11 @@ Status CacheTransformPass::CachePass::RunOnNode(std::shared_ptr n return MappableCacheLeafSetup(std::static_pointer_cast(node)); } +// Perform leaf node cache transform identification +Status CacheTransformPass::CachePass::RunOnNode(std::shared_ptr node, bool *modified) { + return MappableCacheLeafSetup(std::static_pointer_cast(node)); +} + // Perform leaf node cache transform identification Status CacheTransformPass::CachePass::RunOnNode(std::shared_ptr node, bool *modified) { return MappableCacheLeafSetup(std::static_pointer_cast(node)); diff --git a/mindspore/ccsrc/minddata/dataset/engine/opt/pre/cache_transform_pass.h b/mindspore/ccsrc/minddata/dataset/engine/opt/pre/cache_transform_pass.h index 970461d48f2..89525d07e89 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/opt/pre/cache_transform_pass.h +++ b/mindspore/ccsrc/minddata/dataset/engine/opt/pre/cache_transform_pass.h @@ -79,6 +79,12 @@ class CacheTransformPass : public TreePass { /// \return Status The error code return Status RunOnNode(std::shared_ptr node, bool *modified) override; + /// \brief Perform leaf node cache tranform identifications + /// \param[in] node The node being visited + /// \param[inout] modified Indicator if the node was changed at all + /// \return Status The error code return + Status RunOnNode(std::shared_ptr node, bool *modified) override; + /// \brief Perform leaf node cache tranform identifications /// \param[in] node The node being visited /// \param[inout] modified Indicator if the node was changed at all diff --git a/mindspore/ccsrc/minddata/dataset/engine/opt/util/printer_pass.cc b/mindspore/ccsrc/minddata/dataset/engine/opt/util/printer_pass.cc index 02f7bf8dfaa..4d68558d7af 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/opt/util/printer_pass.cc +++ 
b/mindspore/ccsrc/minddata/dataset/engine/opt/util/printer_pass.cc @@ -111,5 +111,11 @@ Status PrinterPass::RunOnNode(std::shared_ptr node, bool *modifie std::cout << "Visiting ImageFolderOp" << '\n'; return Status::OK(); } + +Status PrinterPass::RunOnNode(std::shared_ptr node, bool *modified) { + *modified = false; + std::cout << "Visiting AlbumOp" << '\n'; + return Status::OK(); +} } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/engine/opt/util/printer_pass.h b/mindspore/ccsrc/minddata/dataset/engine/opt/util/printer_pass.h index d469554a93f..74a50e217b7 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/opt/util/printer_pass.h +++ b/mindspore/ccsrc/minddata/dataset/engine/opt/util/printer_pass.h @@ -58,6 +58,8 @@ class PrinterPass : public NodePass { Status RunOnNode(std::shared_ptr node, bool *modified) override; Status RunOnNode(std::shared_ptr node, bool *modified) override; + + Status RunOnNode(std::shared_ptr node, bool *modified) override; }; } // namespace dataset diff --git a/mindspore/ccsrc/minddata/dataset/include/datasets.h b/mindspore/ccsrc/minddata/dataset/include/datasets.h index ea3f65a5ed9..8564827b4d5 100644 --- a/mindspore/ccsrc/minddata/dataset/include/datasets.h +++ b/mindspore/ccsrc/minddata/dataset/include/datasets.h @@ -48,6 +48,7 @@ class TensorOperation; class SchemaObj; class SamplerObj; // Datasets classes (in alphabetical order) +class AlbumDataset; class CelebADataset; class Cifar10Dataset; class Cifar100Dataset; @@ -79,13 +80,27 @@ class ZipDataset; /// \return Shared pointer to the current schema std::shared_ptr Schema(const std::string &schema_file = ""); +/// \brief Function to create an AlbumDataset +/// \notes The generated dataset is specified through setting a schema +/// \param[in] dataset_dir Path to the root directory that contains the dataset +/// \param[in] data_schema Path to dataset schema file +/// \param[in] column_names Column names used to specify columns to
load, if empty, will read all columns. +/// (default = {}) +/// \param[in] decode the option to decode the images in dataset (default = false) +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, +/// A `RandomSampler` will be used to randomly iterate the entire dataset (default = nullptr) +/// \return Shared pointer to the current Dataset +std::shared_ptr Album(const std::string &dataset_dir, const std::string &data_schema, + const std::vector &column_names = {}, bool decode = false, + const std::shared_ptr &sampler = nullptr); + /// \brief Function to create a CelebADataset /// \notes The generated dataset has two columns ['image', 'attr']. -// The type of the image tensor is uint8. The attr tensor is uint32 and one hot type. +// The type of the image tensor is uint8. The attr tensor is uint32 and one hot type. /// \param[in] dataset_dir Path to the root directory that contains the dataset. /// \param[in] dataset_type One of 'all', 'train', 'valid' or 'test'. /// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// will be used to randomly iterate the entire dataset /// \param[in] decode Decode the images after reading (default=false). /// \param[in] extensions Set of file extensions to be included in the dataset (default={}). /// \return Shared pointer to the current Dataset @@ -97,7 +112,7 @@ std::shared_ptr CelebA(const std::string &dataset_dir, const std: /// \notes The generated dataset has two columns ['image', 'label'] /// \param[in] dataset_dir Path to the root directory that contains the dataset /// \param[in] sampler Object used to choose samples from the dataset. 
If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// will be used to randomly iterate the entire dataset /// \return Shared pointer to the current Dataset std::shared_ptr Cifar10(const std::string &dataset_dir, const std::shared_ptr &sampler = nullptr); @@ -106,7 +121,7 @@ std::shared_ptr Cifar10(const std::string &dataset_dir, /// \notes The generated dataset has three columns ['image', 'coarse_label', 'fine_label'] /// \param[in] dataset_dir Path to the root directory that contains the dataset /// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// will be used to randomly iterate the entire dataset /// \return Shared pointer to the current Dataset std::shared_ptr Cifar100(const std::string &dataset_dir, const std::shared_ptr &sampler = nullptr); @@ -114,19 +129,19 @@ std::shared_ptr Cifar100(const std::string &dataset_dir, /// \brief Function to create a CLUEDataset /// \notes The generated dataset has a variable number of columns depending on the task and usage /// \param[in] dataset_files List of files to be read to search for a pattern of files. The list -/// will be sorted in a lexicographical order. +/// will be sorted in a lexicographical order. /// \param[in] task The kind of task, one of "AFQMC", "TNEWS", "IFLYTEK", "CMNLI", "WSC" and "CSL" (default="AFQMC"). /// \param[in] usage Be used to "train", "test" or "eval" data (default="train"). /// \param[in] num_samples The number of samples to be included in the dataset. -/// (Default = 0 means all samples.) -/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode::kGlobal) -/// Can be any of: -/// ShuffleMode::kFalse - No shuffling is performed. -/// ShuffleMode::kFiles - Shuffle files only. -/// ShuffleMode::kGlobal - Shuffle both the files and samples. +/// (Default = 0 means all samples.) 
+/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode.kGlobal) +/// Can be any of: +/// ShuffleMode.kFalse - No shuffling is performed. +/// ShuffleMode.kFiles - Shuffle files only. +/// ShuffleMode.kGlobal - Shuffle both the files and samples. /// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1) /// \param[in] shard_id The shard ID within num_shards. This argument should be -/// specified only when num_shards is also specified. (Default = 0) +/// specified only when num_shards is also specified. (Default = 0) /// \return Shared pointer to the current CLUEDataset std::shared_ptr CLUE(const std::vector &dataset_files, const std::string &task = "AFQMC", const std::string &usage = "train", int64_t num_samples = 0, @@ -135,19 +150,19 @@ std::shared_ptr CLUE(const std::vector &dataset_files, /// \brief Function to create a CocoDataset /// \notes The generated dataset has multi-columns : -/// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32], -/// ['iscrowd', dtype=uint32]]. -/// - task='Stuff', column: [['image', dtype=uint8], ['segmentation',dtype=float32], ['iscrowd', dtype=uint32]]. -/// - task='Keypoint', column: [['image', dtype=uint8], ['keypoints', dtype=float32], -/// ['num_keypoints', dtype=uint32]]. -/// - task='Panoptic', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32], -/// ['iscrowd', dtype=uint32], ['area', dtype=uitn32]]. +/// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32], +/// ['iscrowd', dtype=uint32]]. +/// - task='Stuff', column: [['image', dtype=uint8], ['segmentation',dtype=float32], ['iscrowd', dtype=uint32]]. +/// - task='Keypoint', column: [['image', dtype=uint8], ['keypoints', dtype=float32], +/// ['num_keypoints', dtype=uint32]]. 
+/// - task='Panoptic', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32], +/// ['iscrowd', dtype=uint32], ['area', dtype=uitn32]]. /// \param[in] dataset_dir Path to the root directory that contains the dataset /// \param[in] annotation_file Path to the annotation json /// \param[in] task Set the task type of reading coco data, now support 'Detection'/'Stuff'/'Panoptic'/'Keypoint' /// \param[in] decode Decode the images after reading /// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// will be used to randomly iterate the entire dataset /// \return Shared pointer to the current Dataset std::shared_ptr Coco(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task = "Detection", const bool &decode = false, @@ -181,12 +196,12 @@ std::shared_ptr CSV(const std::vector &dataset_files, c /// \brief Function to create an ImageFolderDataset /// \notes A source dataset that reads images from a tree of directories -/// All images within one folder have the same label -/// The generated dataset has two columns ['image', 'label'] +/// All images within one folder have the same label +/// The generated dataset has two columns ['image', 'label'] /// \param[in] dataset_dir Path to the root directory that contains the dataset /// \param[in] decode A flag to decode in ImageFolder /// \param[in] sampler Object used to choose samples from the dataset. 
If sampler is `nullptr`, -/// A `RandomSampler` will be used to randomly iterate the entire dataset +/// A `RandomSampler` will be used to randomly iterate the entire dataset /// \param[in] extensions File extensions to be read /// \param[in] class_indexing a class name to label map /// \return Shared pointer to the current ImageFolderDataset @@ -200,9 +215,9 @@ std::shared_ptr ImageFolder(const std::string &dataset_dir, /// \param[in] dataset_file The dataset file to be read /// \param[in] usage Need "train", "eval" or "inference" data (default="train") /// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, -/// A `RandomSampler` will be used to randomly iterate the entire dataset +/// A `RandomSampler` will be used to randomly iterate the entire dataset /// \param[in] class_indexing A str-to-int mapping from label name to index (default={}, the folder -/// names will be sorted alphabetically and each class will be given a unique index starting from 0). +/// names will be sorted alphabetically and each class will be given a unique index starting from 0). /// \param[in] decode Decode the images after reading (default=false). /// \return Shared pointer to the current ManifestDataset std::shared_ptr Manifest(std::string dataset_file, std::string usage = "train", @@ -214,7 +229,7 @@ std::shared_ptr Manifest(std::string dataset_file, std::string /// \notes The generated dataset has two columns ['image', 'label'] /// \param[in] dataset_dir Path to the root directory that contains the dataset /// \param[in] sampler Object used to choose samples from the dataset. 
If sampler is `nullptr`, -/// A `RandomSampler` will be used to randomly iterate the entire dataset +/// A `RandomSampler` will be used to randomly iterate the entire dataset /// \return Shared pointer to the current MnistDataset std::shared_ptr Mnist(const std::string &dataset_dir, const std::shared_ptr &sampler = nullptr); @@ -245,17 +260,17 @@ std::shared_ptr RandomData(const int32_t &total_rows = 0, T schem /// \brief Function to create a TextFileDataset /// \notes The generated dataset has one column ['text'] /// \param[in] dataset_files List of files to be read to search for a pattern of files. The list -/// will be sorted in a lexicographical order. +/// will be sorted in a lexicographical order. /// \param[in] num_samples The number of samples to be included in the dataset. -/// (Default = 0 means all samples.) -/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode::kGlobal) -/// Can be any of: -/// ShuffleMode::kFalse - No shuffling is performed. -/// ShuffleMode::kFiles - Shuffle files only. -/// ShuffleMode::kGlobal - Shuffle both the files and samples. +/// (Default = 0 means all samples.) +/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode.kGlobal) +/// Can be any of: +/// ShuffleMode.kFalse - No shuffling is performed. +/// ShuffleMode.kFiles - Shuffle files only. +/// ShuffleMode.kGlobal - Shuffle both the files and samples. /// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1) /// \param[in] shard_id The shard ID within num_shards. This argument should be -/// specified only when num_shards is also specified. (Default = 0) +/// specified only when num_shards is also specified. 
(Default = 0) /// \return Shared pointer to the current TextFileDataset std::shared_ptr TextFile(const std::vector &dataset_files, int64_t num_samples = 0, ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, @@ -263,16 +278,16 @@ std::shared_ptr TextFile(const std::vector &datase /// \brief Function to create a VOCDataset /// \notes The generated dataset has multi-columns : -/// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['label', dtype=uint32], -/// ['difficult', dtype=uint32], ['truncate', dtype=uint32]]. -/// - task='Segmentation', column: [['image', dtype=uint8], ['target',dtype=uint8]]. +/// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['label', dtype=uint32], +/// ['difficult', dtype=uint32], ['truncate', dtype=uint32]]. +/// - task='Segmentation', column: [['image', dtype=uint8], ['target',dtype=uint8]]. /// \param[in] dataset_dir Path to the root directory that contains the dataset /// \param[in] task Set the task type of reading voc data, now only support "Segmentation" or "Detection" /// \param[in] mode Set the data list txt file to be readed /// \param[in] class_indexing A str-to-int mapping from label name to index /// \param[in] decode Decode the images after reading /// \param[in] sampler Object used to choose samples from the dataset. 
If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// will be used to randomly iterate the entire dataset /// \return Shared pointer to the current Dataset std::shared_ptr VOC(const std::string &dataset_dir, const std::string &task = "Segmentation", const std::string &mode = "train", @@ -335,9 +350,9 @@ class Dataset : public std::enable_shared_from_this { /// \notes Combines batch_size number of consecutive rows into batches /// \param[in] batch_size Path to the root directory that contains the dataset /// \param[in] drop_remainder Determines whether or not to drop the last possibly incomplete - /// batch. If true, and if there are less than batch_size rows - /// available to make the last batch, then those rows will - /// be dropped and not propagated to the next node + /// batch. If true, and if there are less than batch_size rows + /// available to make the last batch, then those rows will + /// be dropped and not propagated to the next node /// \return Shared pointer to the current BatchDataset std::shared_ptr Batch(int32_t batch_size, bool drop_remainder = false); @@ -368,16 +383,16 @@ class Dataset : public std::enable_shared_from_this { /// \brief Function to create a MapDataset /// \notes Applies each operation in operations to this dataset /// \param[in] operations Vector of operations to be applied on the dataset. Operations are - /// applied in the order they appear in this list + /// applied in the order they appear in this list /// \param[in] input_columns Vector of the names of the columns that will be passed to the first - /// operation as input. The size of this list must match the number of - /// input columns expected by the first operator. The default input_columns - /// is the first column + /// operation as input. The size of this list must match the number of + /// input columns expected by the first operator. 
The default input_columns + /// is the first column /// \param[in] output_columns Vector of names assigned to the columns outputted by the last operation - /// This parameter is mandatory if len(input_columns) != len(output_columns) - /// The size of this list must match the number of output columns of the - /// last operation. The default output_columns will have the same - /// name as the input columns, i.e., the columns will be replaced + /// This parameter is mandatory if len(input_columns) != len(output_columns) + /// The size of this list must match the number of output columns of the + /// last operation. The default output_columns will have the same + /// name as the input columns, i.e., the columns will be replaced /// \param[in] project_columns A list of column names to project /// \return Shared pointer to the current MapDataset std::shared_ptr Map(std::vector> operations, @@ -404,7 +419,7 @@ class Dataset : public std::enable_shared_from_this { /// \param[in] count Number of times the dataset should be repeated /// \return Shared pointer to the current Dataset /// \note Repeat will return shared pointer to `Dataset` instead of `RepeatDataset` - /// due to a limitation in the current implementation + /// due to a limitation in the current implementation std::shared_ptr Repeat(int32_t count = -1); /// \brief Function to create a Shuffle Dataset @@ -506,6 +521,31 @@ class SchemaObj { // DERIVED DATASET CLASSES FOR LEAF-NODE DATASETS // (In alphabetical order) +class AlbumDataset : public Dataset { + public: + /// \brief Constructor + AlbumDataset(const std::string &dataset_dir, const std::string &data_schema, + const std::vector &column_names, bool decode, const std::shared_ptr &sampler); + + /// \brief Destructor + ~AlbumDataset() = default; + + /// \brief a base class override function to create a runtime dataset op object from this class + /// \return shared pointer to the newly created DatasetOp + std::vector> Build() override; + + /// \brief 
Parameters validation + /// \return bool true if all the params are valid + bool ValidateParams() override; + + private: + std::string dataset_dir_; + std::string schema_path_; + std::vector column_names_; + bool decode_; + std::shared_ptr sampler_; +}; + class CelebADataset : public Dataset { public: /// \brief Constructor diff --git a/tests/ut/cpp/dataset/CMakeLists.txt b/tests/ut/cpp/dataset/CMakeLists.txt index 29454767910..53406423508 100644 --- a/tests/ut/cpp/dataset/CMakeLists.txt +++ b/tests/ut/cpp/dataset/CMakeLists.txt @@ -5,6 +5,7 @@ SET(DE_UT_SRCS common/cvop_common.cc common/bboxop_common.cc auto_contrast_op_test.cc + album_op_test.cc batch_op_test.cc bit_functions_test.cc storage_container_test.cc @@ -101,6 +102,7 @@ SET(DE_UT_SRCS c_api_samplers_test.cc c_api_transforms_test.cc c_api_dataset_ops_test.cc + c_api_dataset_album_test.cc c_api_dataset_cifar_test.cc c_api_dataset_clue_test.cc c_api_dataset_coco_test.cc diff --git a/tests/ut/cpp/dataset/album_op_test.cc b/tests/ut/cpp/dataset/album_op_test.cc new file mode 100644 index 00000000000..fcd81ed19ed --- /dev/null +++ b/tests/ut/cpp/dataset/album_op_test.cc @@ -0,0 +1,208 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include "common/common.h" +#include "minddata/dataset/core/client.h" +#include "minddata/dataset/core/global_context.h" +#include "minddata/dataset/engine/datasetops/source/album_op.h" +#include "minddata/dataset/engine/datasetops/source/sampler/distributed_sampler.h" +#include "minddata/dataset/engine/datasetops/source/sampler/pk_sampler.h" +#include "minddata/dataset/engine/datasetops/source/sampler/random_sampler.h" +#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" +#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h" +#include "minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.h" +#include "minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h" +#include "minddata/dataset/util/path.h" +#include "minddata/dataset/util/status.h" +#include "gtest/gtest.h" +#include "utils/log_adapter.h" +#include "securec.h" +#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/include/transforms.h" + +using namespace mindspore::dataset; +using mindspore::MsLogLevel::ERROR; +using mindspore::ExceptionType::NoExceptionType; +using mindspore::LogStream; + +std::shared_ptr Batch(int batch_size = 1, bool drop = false, int rows_per_buf = 2); + +std::shared_ptr Repeat(int repeat_cnt); + +std::shared_ptr Build(std::vector> ops); + +std::shared_ptr Album(int64_t num_works, int64_t rows, int64_t conns, std::string path, + bool shuf = false, std::unique_ptr sampler = nullptr, + bool decode = false) { + std::shared_ptr so; + AlbumOp::Builder builder; + Status rc = builder.SetNumWorkers(num_works) + .SetAlbumDir(path) + .SetRowsPerBuffer(rows) + .SetOpConnectorSize(conns) + .SetExtensions({".json"}) + .SetSampler(std::move(sampler)) + .SetDecode(decode) + .Build(&so); + return so; +} + +std::shared_ptr AlbumSchema(int64_t num_works, int64_t rows, int64_t conns, std::string path, + std::string schema_file, std::vector column_names = 
{}, + bool shuf = false, std::unique_ptr sampler = nullptr, + bool decode = false) { + std::shared_ptr so; + AlbumOp::Builder builder; + Status rc = builder.SetNumWorkers(num_works) + .SetSchemaFile(schema_file) + .SetColumnsToLoad(column_names) + .SetAlbumDir(path) + .SetRowsPerBuffer(rows) + .SetOpConnectorSize(conns) + .SetExtensions({".json"}) + .SetSampler(std::move(sampler)) + .SetDecode(decode) + .Build(&so); + return so; +} + +class MindDataTestAlbum : public UT::DatasetOpTesting { + protected: +}; + +TEST_F(MindDataTestAlbum, TestSequentialAlbumWithSchema) { + std::string folder_path = datasets_root_path_ + "/testAlbum/images"; + std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json"; + std::vector column_names = {"image", "label", "id"}; + auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file, column_names, false), Repeat(2)}); + tree->Prepare(); + Status rc = tree->Launch(); + if (rc.IsError()) { + MS_LOG(ERROR) << "Return code error detected during tree launch: " << rc.ToString() << "."; + EXPECT_TRUE(false); + } else { + DatasetIterator di(tree); + TensorMap tensor_map; + di.GetNextAsMap(&tensor_map); + EXPECT_TRUE(rc.IsOk()); + uint64_t i = 0; + int32_t label = 0; + while (tensor_map.size() != 0) { + tensor_map["label"]->GetItemAt(&label, {}); + MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape" + << tensor_map["label"] << "\n"; + i++; + di.GetNextAsMap(&tensor_map); + } + MS_LOG(INFO) << "got rows" << i << "\n"; + EXPECT_TRUE(i == 14); + } +} + +TEST_F(MindDataTestAlbum, TestSequentialAlbumWithSchemaNoOrder) { + std::string folder_path = datasets_root_path_ + "/testAlbum/images"; + std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json"; + auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file), Repeat(2)}); + tree->Prepare(); + Status rc = tree->Launch(); + if (rc.IsError()) { + MS_LOG(ERROR) << "Return code error detected during tree
launch: " << rc.ToString() << "."; + EXPECT_TRUE(false); + } else { + DatasetIterator di(tree); + TensorMap tensor_map; + di.GetNextAsMap(&tensor_map); + EXPECT_TRUE(rc.IsOk()); + uint64_t i = 0; + int32_t label = 0; + while (tensor_map.size() != 0) { + tensor_map["label"]->GetItemAt(&label, {}); + MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape" + << tensor_map["label"] << "\n"; + i++; + di.GetNextAsMap(&tensor_map); + } + MS_LOG(INFO) << "got rows" << i << "\n"; + EXPECT_TRUE(i == 14); + } +} + +TEST_F(MindDataTestAlbum, TestSequentialAlbumWithSchemaFloat) { + std::string folder_path = datasets_root_path_ + "/testAlbum/images"; + // add the priority column + std::string schema_file = datasets_root_path_ + "/testAlbum/floatSchema.json"; + auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file), Repeat(2)}); + tree->Prepare(); + Status rc = tree->Launch(); + if (rc.IsError()) { + MS_LOG(ERROR) << "Return code error detected during tree launch: " << rc.ToString() << "."; + EXPECT_TRUE(false); + } else { + DatasetIterator di(tree); + TensorMap tensor_map; + di.GetNextAsMap(&tensor_map); + EXPECT_TRUE(rc.IsOk()); + uint64_t i = 0; + int32_t label = 0; + double priority = 0; + while (tensor_map.size() != 0) { + tensor_map["label"]->GetItemAt(&label, {}); + tensor_map["_priority"]->GetItemAt(&priority, {}); + MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape" + << tensor_map["label"] << "priority: " << priority << "\n"; + i++; + di.GetNextAsMap(&tensor_map); + } + MS_LOG(INFO) << "got rows" << i << "\n"; + EXPECT_TRUE(i == 14); + } +} + +TEST_F(MindDataTestAlbum, TestSequentialAlbumWithFullSchema) { + std::string folder_path = datasets_root_path_ + "/testAlbum/images"; + // add the priority column + std::string schema_file = datasets_root_path_ + "/testAlbum/fullSchema.json"; + auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file), Repeat(2)}); + tree->Prepare();
+ Status rc = tree->Launch(); + if (rc.IsError()) { + MS_LOG(ERROR) << "Return code error detected during tree launch: " << rc.ToString() << "."; + EXPECT_TRUE(false); + } else { + DatasetIterator di(tree); + TensorMap tensor_map; + di.GetNextAsMap(&tensor_map); + EXPECT_TRUE(rc.IsOk()); + uint64_t i = 0; + int32_t label = 0; + double priority = 0; + while (tensor_map.size() != 0) { + tensor_map["label"]->GetItemAt(&label, {}); + tensor_map["_priority"]->GetItemAt(&priority, {}); + MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape" + << tensor_map["label"] << "priority: " << priority << " embedding : " << + tensor_map["_embedding"]->shape() << "\n"; + i++; + di.GetNextAsMap(&tensor_map); + } + MS_LOG(INFO) << "got rows" << i << "\n"; + EXPECT_TRUE(i == 14); + } +} + diff --git a/tests/ut/cpp/dataset/c_api_dataset_album_test.cc b/tests/ut/cpp/dataset/c_api_dataset_album_test.cc new file mode 100644 index 00000000000..820909d8e83 --- /dev/null +++ b/tests/ut/cpp/dataset/c_api_dataset_album_test.cc @@ -0,0 +1,136 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */
+#include "common/common.h"
+#include "minddata/dataset/include/datasets.h"
+
+using namespace mindspore::dataset::api;
+using mindspore::dataset::Tensor;
+
+class MindDataTestPipeline : public UT::DatasetOpTesting {  // fixture for the Album C++ API tests below
+ protected:
+};
+
+TEST_F(MindDataTestPipeline, TestAlbumBasic) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumBasic.";
+
+  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
+  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
+  std::vector<std::string> column_names = {"image", "label", "id"};
+  // Create an Album dataset, relying on the API defaults for the decode flag and the sampler
+  std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names);
+  ASSERT_NE(ds, nullptr);  // fatal: ds is dereferenced on the next statement
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  ASSERT_NE(iter, nullptr);  // fatal: iter is dereferenced below
+
+  // Iterate the dataset and get each row; an empty row map ends the loop
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    i++;
+    auto image = row["image"];
+    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
+    iter->GetNextRow(&row);
+  }
+
+  EXPECT_EQ(i, 7);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestAlbumDecode) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumDecode.";
+  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
+  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
+  std::vector<std::string> column_names = {"image", "label", "id"};
+  // Create an Album dataset with decode enabled (4th argument)
+  std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true);
+  ASSERT_NE(ds, nullptr);  // fatal: ds is dereferenced on the next statement
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  ASSERT_NE(iter, nullptr);  // fatal: iter is dereferenced below
+
+  // Iterate the dataset and get each row; an empty row map ends the loop
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    i++;
+    auto image = row["image"];
+    auto shape = image->shape();
+    MS_LOG(INFO) << "Tensor image shape size: " << shape.Size();
+    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
+    EXPECT_GT(shape.Size(), 1);  // Verify decode=true took effect
+    iter->GetNextRow(&row);
+  }
+
+  EXPECT_EQ(i, 7);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestAlbumNumSamplers) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumNumSamplers.";
+
+  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
+  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
+  std::vector<std::string> column_names = {"image", "label", "id"};
+  // Create an Album dataset limited by SequentialSampler(0, 1); exactly one row is expected below
+  std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true, SequentialSampler(0, 1));
+  ASSERT_NE(ds, nullptr);  // fatal: ds is dereferenced on the next statement
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  ASSERT_NE(iter, nullptr);  // fatal: iter is dereferenced below
+
+  // Iterate the dataset and get each row; an empty row map ends the loop
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    i++;
+    auto image = row["image"];
+    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
+    iter->GetNextRow(&row);
+  }
+
+  EXPECT_EQ(i, 1);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestAlbumError) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumError.";
+  std::string folder_path = datasets_root_path_ + "/testAlbum/ima";
+  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
+  std::vector<std::string> column_names = {"image", "label", "id"};
+  // Album() is expected to reject this directory ("ima", presumably nonexistent) and return nullptr
+  std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true, SequentialSampler(0, 1));
+
+  EXPECT_EQ(ds, nullptr);
+}
diff --git a/tests/ut/cpp/runtest.sh b/tests/ut/cpp/runtest.sh index 283d6bc6399..a0503a0e94e 100755 --- a/tests/ut/cpp/runtest.sh +++ b/tests/ut/cpp/runtest.sh @@ -32,6 +32,8 @@ export GLOG_v=2 ## prepare data for dataset & mindrecord cp -fr $PROJECT_PATH/tests/ut/data ${PROJECT_PATH}/build/mindspore/tests/ut/cpp/ +## prepare album dataset, uses absolute path so has to be generated +python ${PROJECT_PATH}/build/mindspore/tests/ut/cpp/data/dataset/testAlbum/gen_json.py if [ $# -gt 0 ]; then ./ut_tests --gtest_filter=$1 diff --git a/tests/ut/data/dataset/testAlbum/bin/sample.bin b/tests/ut/data/dataset/testAlbum/bin/sample.bin new file mode 100644 index 00000000000..497fabf7b0f --- /dev/null +++ b/tests/ut/data/dataset/testAlbum/bin/sample.bin @@ -0,0 +1 @@ +just some random stuff diff --git a/tests/ut/data/dataset/testAlbum/datasetSchema.json b/tests/ut/data/dataset/testAlbum/datasetSchema.json new file mode 100644 index 00000000000..9c6ff994e6f --- /dev/null +++ b/tests/ut/data/dataset/testAlbum/datasetSchema.json @@ -0,0 +1,16 @@ +{ + "columns": { + "image": { + "type":
"uint8", + "rank": 1 + }, + "label" : { + "type": "string", + "rank": 1 + }, + "id" : { + "type": "int64", + "rank": 0 + } + } +} diff --git a/tests/ut/data/dataset/testAlbum/floatSchema.json b/tests/ut/data/dataset/testAlbum/floatSchema.json index a6856b4ed7b..16d1b9b29d1 100644 --- a/tests/ut/data/dataset/testAlbum/floatSchema.json +++ b/tests/ut/data/dataset/testAlbum/floatSchema.json @@ -5,7 +5,7 @@ "rank": 1 }, "label" : { - "type": "int32", + "type": "string", "rank": 1 }, "id" : { diff --git a/tests/ut/data/dataset/testAlbum/fullSchema.json b/tests/ut/data/dataset/testAlbum/fullSchema.json index ca040f76b12..6e9f497468b 100644 --- a/tests/ut/data/dataset/testAlbum/fullSchema.json +++ b/tests/ut/data/dataset/testAlbum/fullSchema.json @@ -5,7 +5,7 @@ "rank": 1 }, "label" : { - "type": "int32", + "type": "string", "rank": 1 }, "id" : { diff --git a/tests/ut/data/dataset/testAlbum/gen_json.py b/tests/ut/data/dataset/testAlbum/gen_json.py index b7f1bf01850..3b741805888 100644 --- a/tests/ut/data/dataset/testAlbum/gen_json.py +++ b/tests/ut/data/dataset/testAlbum/gen_json.py @@ -2,21 +2,21 @@ import json import os def dump_json_from_dict(structure, file_name): - with open(file_name + '.json', 'w') as file_path: - json.dump(structure, file_path) + with open(file_name + '.json', 'w') as fp: + json.dump(structure, fp) if __name__ == '__main__': - # iterate over directory - DIRECTORY = "imagefolder" - i = 0 + # iterate over DIRECTORY + DIRECTORY = os.path.dirname(os.path.realpath(__file__)) + "/original" + PARENT_DIR = os.path.dirname(DIRECTORY) + i = -1 for filename in os.listdir(DIRECTORY): default_dict = {} default_dict.update(dataset='') - default_dict.update(image=(os.path.join(DIRECTORY, filename))) - default_dict.update(label=[1, 2]) + default_dict.update(image=os.path.abspath(os.path.join(DIRECTORY, filename))) + default_dict.update(label=['3', '2']) default_dict.update(_priority=0.8) - default_dict.update(_embedding='sample.bin') - 
default_dict.update(_segmented_image=(os.path.join(DIRECTORY, filename))) - default_dict.update(_processed_image=(os.path.join(DIRECTORY, filename))) + default_dict.update(_embedding=os.path.abspath(os.path.join(PARENT_DIR, 'sample.bin'))) + default_dict.update(_processed_image=os.path.abspath(os.path.join(DIRECTORY, filename))) i = i + 1 - dump_json_from_dict(default_dict, 'images/'+str(i)) + dump_json_from_dict(default_dict, PARENT_DIR + '/images/'+str(i)) diff --git a/tests/ut/data/dataset/testAlbum/images/0.json b/tests/ut/data/dataset/testAlbum/images/0.json new file mode 100644 index 00000000000..d55e6c6172f --- /dev/null +++ b/tests/ut/data/dataset/testAlbum/images/0.json @@ -0,0 +1 @@ +{"dataset": "", "image": "original/apple_expect_decoded.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "sample.bin", "_processed_image": "original/apple_expect_decoded.jpg"} diff --git a/tests/ut/data/dataset/testAlbum/images/1.json b/tests/ut/data/dataset/testAlbum/images/1.json index 24006176445..e6cf6a9ad7b 100644 --- a/tests/ut/data/dataset/testAlbum/images/1.json +++ b/tests/ut/data/dataset/testAlbum/images/1.json @@ -1 +1 @@ -{"dataset": "", "image": "imagefolder/apple_expect_decoded.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_decoded.jpg", "_processed_image": "imagefolder/apple_expect_decoded.jpg"} \ No newline at end of file +{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_resize_bilinear.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_resize_bilinear.jpg"} diff --git a/tests/ut/data/dataset/testAlbum/images/2.json b/tests/ut/data/dataset/testAlbum/images/2.json index 1f8c7facb69..102e545a6d8 100644 --- a/tests/ut/data/dataset/testAlbum/images/2.json +++ b/tests/ut/data/dataset/testAlbum/images/2.json @@ -1 +1 @@ -{"dataset": "", "image": 
"imagefolder/apple_expect_resize_bilinear.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_resize_bilinear.jpg", "_processed_image": "imagefolder/apple_expect_resize_bilinear.jpg"} \ No newline at end of file +{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_changemode.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_changemode.jpg"} diff --git a/tests/ut/data/dataset/testAlbum/images/3.json b/tests/ut/data/dataset/testAlbum/images/3.json index 16152dd4541..69b64bee80a 100644 --- a/tests/ut/data/dataset/testAlbum/images/3.json +++ b/tests/ut/data/dataset/testAlbum/images/3.json @@ -1 +1 @@ -{"dataset": "", "image": "imagefolder/apple_expect_changemode.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_changemode.jpg", "_processed_image": "imagefolder/apple_expect_changemode.jpg"} \ No newline at end of file +{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_not_flip.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_not_flip.jpg"} diff --git a/tests/ut/data/dataset/testAlbum/images/4.json b/tests/ut/data/dataset/testAlbum/images/4.json index bea1d60fb88..cbf9bfbf412 100644 --- a/tests/ut/data/dataset/testAlbum/images/4.json +++ b/tests/ut/data/dataset/testAlbum/images/4.json @@ -1 +1 @@ -{"dataset": "", "image": "imagefolder/apple_expect_not_flip.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_not_flip.jpg", "_processed_image": "imagefolder/apple_expect_not_flip.jpg"} \ No newline at end of file +{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_flipped_horizontal.jpg", "label": ["3", "2"], "_priority": 
0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_flipped_horizontal.jpg"} diff --git a/tests/ut/data/dataset/testAlbum/images/5.json b/tests/ut/data/dataset/testAlbum/images/5.json index 7806df3621e..220e2a712f4 100644 --- a/tests/ut/data/dataset/testAlbum/images/5.json +++ b/tests/ut/data/dataset/testAlbum/images/5.json @@ -1 +1 @@ -{"dataset": "", "image": "imagefolder/apple_expect_flipped_horizontal.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_flipped_horizontal.jpg", "_processed_image": "imagefolder/apple_expect_flipped_horizontal.jpg"} \ No newline at end of file +{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_rescaled.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_rescaled.jpg"} diff --git a/tests/ut/data/dataset/testAlbum/images/6.json b/tests/ut/data/dataset/testAlbum/images/6.json index 0bf9757ebd3..d0d503fc7fc 100644 --- a/tests/ut/data/dataset/testAlbum/images/6.json +++ b/tests/ut/data/dataset/testAlbum/images/6.json @@ -1 +1 @@ -{"dataset": "", "image": "imagefolder/apple_expect_rescaled.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_rescaled.jpg", "_processed_image": "imagefolder/apple_expect_rescaled.jpg"} \ No newline at end of file +{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_flipped_vertical.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_flipped_vertical.jpg"} diff --git a/tests/ut/data/dataset/testAlbum/images/7.json b/tests/ut/data/dataset/testAlbum/images/7.json deleted file mode 100644 index d0bbb9df61c..00000000000 --- a/tests/ut/data/dataset/testAlbum/images/7.json +++ 
/dev/null @@ -1 +0,0 @@ -{"dataset": "", "image": "imagefolder/apple_expect_flipped_vertical.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_flipped_vertical.jpg", "_processed_image": "imagefolder/apple_expect_flipped_vertical.jpg"} \ No newline at end of file diff --git a/tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_changemode.jpg b/tests/ut/data/dataset/testAlbum/original/apple_expect_changemode.jpg similarity index 100% rename from tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_changemode.jpg rename to tests/ut/data/dataset/testAlbum/original/apple_expect_changemode.jpg diff --git a/tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_decoded.jpg b/tests/ut/data/dataset/testAlbum/original/apple_expect_decoded.jpg similarity index 100% rename from tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_decoded.jpg rename to tests/ut/data/dataset/testAlbum/original/apple_expect_decoded.jpg diff --git a/tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_flipped_horizontal.jpg b/tests/ut/data/dataset/testAlbum/original/apple_expect_flipped_horizontal.jpg similarity index 100% rename from tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_flipped_horizontal.jpg rename to tests/ut/data/dataset/testAlbum/original/apple_expect_flipped_horizontal.jpg diff --git a/tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_flipped_vertical.jpg b/tests/ut/data/dataset/testAlbum/original/apple_expect_flipped_vertical.jpg similarity index 100% rename from tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_flipped_vertical.jpg rename to tests/ut/data/dataset/testAlbum/original/apple_expect_flipped_vertical.jpg diff --git a/tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_not_flip.jpg b/tests/ut/data/dataset/testAlbum/original/apple_expect_not_flip.jpg similarity index 100% rename from tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_not_flip.jpg rename 
to tests/ut/data/dataset/testAlbum/original/apple_expect_not_flip.jpg diff --git a/tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_rescaled.jpg b/tests/ut/data/dataset/testAlbum/original/apple_expect_rescaled.jpg similarity index 100% rename from tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_rescaled.jpg rename to tests/ut/data/dataset/testAlbum/original/apple_expect_rescaled.jpg diff --git a/tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_resize_bilinear.jpg b/tests/ut/data/dataset/testAlbum/original/apple_expect_resize_bilinear.jpg similarity index 100% rename from tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_resize_bilinear.jpg rename to tests/ut/data/dataset/testAlbum/original/apple_expect_resize_bilinear.jpg diff --git a/tests/ut/data/dataset/testAlbum/processed/apple_expect_changemode.jpg b/tests/ut/data/dataset/testAlbum/processed/apple_expect_changemode.jpg new file mode 100644 index 00000000000..d7a0624a6f9 Binary files /dev/null and b/tests/ut/data/dataset/testAlbum/processed/apple_expect_changemode.jpg differ diff --git a/tests/ut/data/dataset/testAlbum/processed/apple_expect_decoded.jpg b/tests/ut/data/dataset/testAlbum/processed/apple_expect_decoded.jpg new file mode 100644 index 00000000000..d7a0624a6f9 Binary files /dev/null and b/tests/ut/data/dataset/testAlbum/processed/apple_expect_decoded.jpg differ diff --git a/tests/ut/data/dataset/testAlbum/processed/apple_expect_flipped_horizontal.jpg b/tests/ut/data/dataset/testAlbum/processed/apple_expect_flipped_horizontal.jpg new file mode 100644 index 00000000000..144f6a02e09 Binary files /dev/null and b/tests/ut/data/dataset/testAlbum/processed/apple_expect_flipped_horizontal.jpg differ diff --git a/tests/ut/data/dataset/testAlbum/processed/apple_expect_flipped_vertical.jpg b/tests/ut/data/dataset/testAlbum/processed/apple_expect_flipped_vertical.jpg new file mode 100644 index 00000000000..cc0f26ac922 Binary files /dev/null and 
b/tests/ut/data/dataset/testAlbum/processed/apple_expect_flipped_vertical.jpg differ diff --git a/tests/ut/data/dataset/testAlbum/processed/apple_expect_not_flip.jpg b/tests/ut/data/dataset/testAlbum/processed/apple_expect_not_flip.jpg new file mode 100644 index 00000000000..d7a0624a6f9 Binary files /dev/null and b/tests/ut/data/dataset/testAlbum/processed/apple_expect_not_flip.jpg differ diff --git a/tests/ut/data/dataset/testAlbum/processed/apple_expect_rescaled.jpg b/tests/ut/data/dataset/testAlbum/processed/apple_expect_rescaled.jpg new file mode 100644 index 00000000000..0a74fdf4d24 Binary files /dev/null and b/tests/ut/data/dataset/testAlbum/processed/apple_expect_rescaled.jpg differ diff --git a/tests/ut/data/dataset/testAlbum/processed/apple_expect_resize_bilinear.jpg b/tests/ut/data/dataset/testAlbum/processed/apple_expect_resize_bilinear.jpg new file mode 100644 index 00000000000..9925508ac90 Binary files /dev/null and b/tests/ut/data/dataset/testAlbum/processed/apple_expect_resize_bilinear.jpg differ diff --git a/tests/ut/data/dataset/testAlbum/segmented/apple_expect_changemode.jpg b/tests/ut/data/dataset/testAlbum/segmented/apple_expect_changemode.jpg new file mode 100644 index 00000000000..d7a0624a6f9 Binary files /dev/null and b/tests/ut/data/dataset/testAlbum/segmented/apple_expect_changemode.jpg differ diff --git a/tests/ut/data/dataset/testAlbum/segmented/apple_expect_decoded.jpg b/tests/ut/data/dataset/testAlbum/segmented/apple_expect_decoded.jpg new file mode 100644 index 00000000000..d7a0624a6f9 Binary files /dev/null and b/tests/ut/data/dataset/testAlbum/segmented/apple_expect_decoded.jpg differ diff --git a/tests/ut/data/dataset/testAlbum/segmented/apple_expect_flipped_horizontal.jpg b/tests/ut/data/dataset/testAlbum/segmented/apple_expect_flipped_horizontal.jpg new file mode 100644 index 00000000000..144f6a02e09 Binary files /dev/null and b/tests/ut/data/dataset/testAlbum/segmented/apple_expect_flipped_horizontal.jpg differ diff --git 
a/tests/ut/data/dataset/testAlbum/segmented/apple_expect_flipped_vertical.jpg b/tests/ut/data/dataset/testAlbum/segmented/apple_expect_flipped_vertical.jpg new file mode 100644 index 00000000000..cc0f26ac922 Binary files /dev/null and b/tests/ut/data/dataset/testAlbum/segmented/apple_expect_flipped_vertical.jpg differ diff --git a/tests/ut/data/dataset/testAlbum/segmented/apple_expect_not_flip.jpg b/tests/ut/data/dataset/testAlbum/segmented/apple_expect_not_flip.jpg new file mode 100644 index 00000000000..d7a0624a6f9 Binary files /dev/null and b/tests/ut/data/dataset/testAlbum/segmented/apple_expect_not_flip.jpg differ diff --git a/tests/ut/data/dataset/testAlbum/segmented/apple_expect_rescaled.jpg b/tests/ut/data/dataset/testAlbum/segmented/apple_expect_rescaled.jpg new file mode 100644 index 00000000000..0a74fdf4d24 Binary files /dev/null and b/tests/ut/data/dataset/testAlbum/segmented/apple_expect_rescaled.jpg differ diff --git a/tests/ut/data/dataset/testAlbum/segmented/apple_expect_resize_bilinear.jpg b/tests/ut/data/dataset/testAlbum/segmented/apple_expect_resize_bilinear.jpg new file mode 100644 index 00000000000..9925508ac90 Binary files /dev/null and b/tests/ut/data/dataset/testAlbum/segmented/apple_expect_resize_bilinear.jpg differ