From a88273bd3f1bfa846ba8d650b93b777caf4b8208 Mon Sep 17 00:00:00 2001
From: jiangzhiwen
Date: Thu, 6 Aug 2020 09:12:01 +0800
Subject: [PATCH] c++ api support for Manifest Dataset

---
 .../ccsrc/minddata/dataset/api/datasets.cc    |  57 +++++
 .../ccsrc/minddata/dataset/include/datasets.h |  41 ++++
 .../dataset/c_api_dataset_manifest_test.cc    | 206 ++++++++++++++++++
 3 files changed, 304 insertions(+)
 create mode 100644 tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc

diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
index eb68832675f..da1478538e3 100644
--- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
@@ -26,6 +26,7 @@
 #include "minddata/dataset/engine/datasetops/source/clue_op.h"
 #include "minddata/dataset/engine/datasetops/source/coco_op.h"
 #include "minddata/dataset/engine/datasetops/source/image_folder_op.h"
+#include "minddata/dataset/engine/datasetops/source/manifest_op.h"
 #include "minddata/dataset/engine/datasetops/source/mnist_op.h"
 #include "minddata/dataset/engine/datasetops/source/text_file_op.h"
 #include "minddata/dataset/engine/datasetops/source/voc_op.h"
@@ -44,6 +45,7 @@
 // Sampler headers (in alphabetical order)
 #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
 #include "minddata/dataset/engine/datasetops/source/sampler/random_sampler.h"
+#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
 #include "minddata/dataset/core/config_manager.h"
 #include "minddata/dataset/util/random.h"
@@ -164,6 +166,16 @@ std::shared_ptr<ImageFolderDataset> ImageFolder(const std::string &dataset_dir,
   return ds->ValidateParams() ? ds : nullptr;
 }
 
+// Function to create a ManifestDataset.
+std::shared_ptr<ManifestDataset> Manifest(std::string dataset_file, std::string usage,
+                                          std::shared_ptr<SamplerObj> sampler,
+                                          const std::map<std::string, int32_t> &class_indexing, bool decode) {
+  auto ds = std::make_shared<ManifestDataset>(dataset_file, usage, sampler, class_indexing, decode);
+
+  // Call derived class validation method.
+  return ds->ValidateParams() ? ds : nullptr;
+}
+
 // Function to create a MnistDataset.
 std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir, const std::shared_ptr<SamplerObj> &sampler) {
   auto ds = std::make_shared<MnistDataset>(dataset_dir, sampler);
@@ -877,6 +889,51 @@ std::vector<std::shared_ptr<DatasetOp>> ImageFolderDataset::Build() {
   return node_ops;
 }
 
+ManifestDataset::ManifestDataset(std::string dataset_file, std::string usage, std::shared_ptr<SamplerObj> sampler,
+                                 const std::map<std::string, int32_t> &class_indexing, bool decode)
+    : dataset_file_(dataset_file), usage_(usage), decode_(decode), class_index_(class_indexing), sampler_(sampler) {}
+
+bool ManifestDataset::ValidateParams() {
+  Path manifest_file(dataset_file_);
+  if (!manifest_file.Exists()) {
+    MS_LOG(ERROR) << "dataset file: [" << dataset_file_ << "] is invalid or does not exist";
+    return false;
+  }
+
+  std::vector<std::string> usage_list = {"train", "eval", "inference"};
+  if (find(usage_list.begin(), usage_list.end(), usage_) == usage_list.end()) {
+    MS_LOG(ERROR) << "usage should be train, eval or inference.";
+    return false;
+  }
+
+  return true;
+}
+
+std::vector<std::shared_ptr<DatasetOp>> ManifestDataset::Build() {
+  // A vector containing shared pointers to the Dataset Ops that this object will create
+  std::vector<std::shared_ptr<DatasetOp>> node_ops;
+
+  // If the user does not specify a Sampler, create a default sampler based on the shuffle variable.
+  if (sampler_ == nullptr) {
+    sampler_ = CreateDefaultSampler();
+  }
+
+  // Do internal Schema generation.
+  auto schema = std::make_unique<DataSchema>();
+  RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1)));
+  TensorShape scalar = TensorShape::CreateScalar();
+  RETURN_EMPTY_IF_ERROR(
+    schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar)));
+
+  std::shared_ptr<ManifestOp> manifest_op;
+  manifest_op =
+    std::make_shared<ManifestOp>(num_workers_, rows_per_buffer_, dataset_file_, connector_que_size_, decode_,
+                                 class_index_, std::move(schema), std::move(sampler_->Build()), usage_);
+
+  node_ops.push_back(manifest_op);
+  return node_ops;
+}
+
 MnistDataset::MnistDataset(std::string dataset_dir, std::shared_ptr<SamplerObj> sampler)
     : dataset_dir_(dataset_dir), sampler_(sampler) {}
 
diff --git a/mindspore/ccsrc/minddata/dataset/include/datasets.h b/mindspore/ccsrc/minddata/dataset/include/datasets.h
index b193179c1fd..e487e335f6f 100644
--- a/mindspore/ccsrc/minddata/dataset/include/datasets.h
+++ b/mindspore/ccsrc/minddata/dataset/include/datasets.h
@@ -49,6 +49,7 @@ class Cifar100Dataset;
 class CLUEDataset;
 class CocoDataset;
 class ImageFolderDataset;
+class ManifestDataset;
 class MnistDataset;
 class TextFileDataset;
 class VOCDataset;
@@ -154,6 +155,21 @@ std::shared_ptr<ImageFolderDataset> ImageFolder(const std::string &dataset_dir,
                                                 const std::set<std::string> &extensions = {},
                                                 const std::map<std::string, int32_t> &class_indexing = {});
 
+/// \brief Function to create a ManifestDataset
+/// \notes The generated dataset has two columns ['image', 'label']
+/// \param[in] dataset_file The dataset file to be read
+/// \param[in] usage Need "train", "eval" or "inference" data (default="train")
+/// \param[in] decode Decode the images after reading (default=false)
+/// \param[in] class_indexing A str-to-int mapping from label name to index (default={}, the label names
+///     will be sorted alphabetically and each class will be given a unique index starting from 0)
+/// \param[in] sampler Object used to choose samples from the dataset.
+///     If sampler is `nullptr`, a `RandomSampler` will be used to randomly iterate the entire dataset
+/// \return Shared pointer to the current ManifestDataset
+std::shared_ptr<ManifestDataset> Manifest(std::string dataset_file, std::string usage = "train",
+                                          std::shared_ptr<SamplerObj> sampler = nullptr,
+                                          const std::map<std::string, int32_t> &class_indexing = {},
+                                          bool decode = false);
+
 /// \brief Function to create a MnistDataset
 /// \notes The generated dataset has two columns ['image', 'label']
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
@@ -500,6 +516,31 @@ class ImageFolderDataset : public Dataset {
   std::set<std::string> exts_;
 };
 
+class ManifestDataset : public Dataset {
+ public:
+  /// \brief Constructor
+  ManifestDataset(std::string dataset_file, std::string usage, std::shared_ptr<SamplerObj> sampler,
+                  const std::map<std::string, int32_t> &class_indexing, bool decode);
+
+  /// \brief Destructor
+  ~ManifestDataset() = default;
+
+  /// \brief a base class override function to create the required runtime dataset op objects for this class
+  /// \return The list of shared pointers to the newly created DatasetOps
+  std::vector<std::shared_ptr<DatasetOp>> Build() override;
+
+  /// \brief Parameters validation
+  /// \return bool true if all the params are valid
+  bool ValidateParams() override;
+
+ private:
+  std::string dataset_file_;
+  std::string usage_;
+  bool decode_;
+  std::map<std::string, int32_t> class_index_;
+  std::shared_ptr<SamplerObj> sampler_;
+};
+
 class MnistDataset : public Dataset {
  public:
   /// \brief Constructor
diff --git a/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc b/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc
new file mode 100644
index 00000000000..5e4c91c7651
--- /dev/null
+++ b/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc
@@ -0,0 +1,206 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "common/common.h"
+#include "minddata/dataset/include/datasets.h"
+
+using namespace mindspore::dataset::api;
+using mindspore::dataset::Tensor;
+
+class MindDataTestPipeline : public UT::DatasetOpTesting {
+ protected:
+};
+
+TEST_F(MindDataTestPipeline, TestManifestBasic) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestBasic.";
+
+  std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
+  // Create a Manifest Dataset
+  std::shared_ptr<Dataset> ds = Manifest(file_path);
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    i++;
+    auto image = row["image"];
+    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
+    iter->GetNextRow(&row);
+  }
+
+  EXPECT_EQ(i, 2);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestManifestDecode) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestDecode.";
+
+  std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
+  // Create a Manifest Dataset
+  std::shared_ptr<Dataset> ds = Manifest(file_path, "train", nullptr, {}, true);
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    i++;
+    auto image = row["image"];
+    auto shape = image->shape();
+    MS_LOG(INFO) << "Tensor image shape size: " << shape.Size();
+    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
+    EXPECT_GT(shape.Size(), 1);  // Verify decode=true took effect
+    iter->GetNextRow(&row);
+  }
+
+  EXPECT_EQ(i, 2);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestManifestEval) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestEval.";
+
+  std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
+  // Create a Manifest Dataset
+  std::shared_ptr<Dataset> ds = Manifest(file_path, "eval");
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    i++;
+    auto image = row["image"];
+    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
+    iter->GetNextRow(&row);
+  }
+
+  EXPECT_EQ(i, 1);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestManifestClassIndex) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestClassIndex.";
+
+  std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
+  std::map<std::string, int32_t> map;
+  map["cat"] = 111;                 // remap the "cat" label to index 111
+  map["dog"] = 222;                 // remap the "dog" label to index 222
+  map["wrong folder name"] = 1234;  // this entry matches no label in the dataset and is skipped
+  std::vector<int32_t> expected_label = {111, 222};
+
+  // Create a Manifest Dataset
+  std::shared_ptr<Dataset> ds = Manifest(file_path, "train", nullptr, map, true);
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  uint64_t i = 0;
+  int32_t label_idx = 0;
+  while (row.size() != 0) {
+    i++;
+    auto image = row["image"];
+    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
+    row["label"]->GetItemAt<int32_t>(&label_idx, {});
+    MS_LOG(INFO) << "Tensor label value: " << label_idx;
+    auto label_it = std::find(expected_label.begin(), expected_label.end(), label_idx);
+    EXPECT_NE(label_it, expected_label.end());
+    iter->GetNextRow(&row);
+  }
+
+  EXPECT_EQ(i, 2);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestManifestNumSamplers) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestNumSamplers.";
+
+  std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
+  // Create a Manifest Dataset
+  std::shared_ptr<Dataset> ds = Manifest(file_path, "train", SequentialSampler(0, 1), {}, true);
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    i++;
+    auto image = row["image"];
+    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
+    iter->GetNextRow(&row);
+  }
+
+  EXPECT_EQ(i, 1);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestManifestError) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestError.";
+
+  std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
+  // Create a Manifest Dataset with a file that does not exist
+  std::shared_ptr<Dataset> ds0 = Manifest("NotExistFile", "train");
+  EXPECT_EQ(ds0, nullptr);
+
+  // Create a Manifest Dataset with invalid usage
+  std::shared_ptr<Dataset> ds1 = Manifest(file_path, "invalid_usage");
+  EXPECT_EQ(ds1, nullptr);
+}
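
For reference, a minimal usage sketch of the API this patch adds, assembled from the test cases above. It is not part of the patch itself: the manifest path is a placeholder, and the program simply builds a `ManifestDataset`, iterates it, and counts rows.

```cpp
// Sketch only: exercises the Manifest C++ API added by this patch.
// "path/to/manifest.json" is a hypothetical placeholder; the unit tests use
// datasets_root_path_ + "/testManifestData/cpp.json".
#include <cstdint>
#include <memory>
#include <unordered_map>

#include "minddata/dataset/include/datasets.h"

using namespace mindspore::dataset::api;
using mindspore::dataset::Tensor;

int main() {
  // usage = "train", sampler = nullptr (defaults to RandomSampler),
  // no class_indexing override, decode = true.
  std::shared_ptr<Dataset> ds = Manifest("path/to/manifest.json", "train", nullptr, {}, true);
  if (ds == nullptr) {
    return 1;  // ValidateParams() rejected the file path or the usage string
  }

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t rows = 0;
  while (!row.empty()) {
    rows++;                     // each row exposes the 'image' and 'label' columns
    auto image = row["image"];  // decoded image tensor because decode = true
    iter->GetNextRow(&row);
  }

  iter->Stop();  // manually terminate the pipeline
  return 0;
}
```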