!4208 C++ API Support for Manifest Dataset

Merge pull request !4208 from jiangzhiwen/jzw/c_api_manifest
This commit is contained in:
mindspore-ci-bot 2020-08-18 10:50:41 +08:00 committed by Gitee
commit b92e776052
3 changed files with 304 additions and 0 deletions

View File

@ -26,6 +26,7 @@
#include "minddata/dataset/engine/datasetops/source/clue_op.h" #include "minddata/dataset/engine/datasetops/source/clue_op.h"
#include "minddata/dataset/engine/datasetops/source/coco_op.h" #include "minddata/dataset/engine/datasetops/source/coco_op.h"
#include "minddata/dataset/engine/datasetops/source/image_folder_op.h" #include "minddata/dataset/engine/datasetops/source/image_folder_op.h"
#include "minddata/dataset/engine/datasetops/source/manifest_op.h"
#include "minddata/dataset/engine/datasetops/source/mnist_op.h" #include "minddata/dataset/engine/datasetops/source/mnist_op.h"
#include "minddata/dataset/engine/datasetops/source/text_file_op.h" #include "minddata/dataset/engine/datasetops/source/text_file_op.h"
#include "minddata/dataset/engine/datasetops/source/voc_op.h" #include "minddata/dataset/engine/datasetops/source/voc_op.h"
@ -44,6 +45,7 @@
// Sampler headers (in alphabetical order) // Sampler headers (in alphabetical order)
#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
#include "minddata/dataset/engine/datasetops/source/sampler/random_sampler.h" #include "minddata/dataset/engine/datasetops/source/sampler/random_sampler.h"
#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
#include "minddata/dataset/core/config_manager.h" #include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/util/random.h" #include "minddata/dataset/util/random.h"
@ -164,6 +166,16 @@ std::shared_ptr<ImageFolderDataset> ImageFolder(const std::string &dataset_dir,
return ds->ValidateParams() ? ds : nullptr; return ds->ValidateParams() ? ds : nullptr;
} }
// Function to create a ManifestDataset.
std::shared_ptr<ManifestDataset> Manifest(std::string dataset_file, std::string usage,
std::shared_ptr<SamplerObj> sampler,
const std::map<std::string, int32_t> &class_indexing, bool decode) {
auto ds = std::make_shared<ManifestDataset>(dataset_file, usage, sampler, class_indexing, decode);
// Call derived class validation method.
return ds->ValidateParams() ? ds : nullptr;
}
// Function to create a MnistDataset. // Function to create a MnistDataset.
std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir, const std::shared_ptr<SamplerObj> &sampler) { std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir, const std::shared_ptr<SamplerObj> &sampler) {
auto ds = std::make_shared<MnistDataset>(dataset_dir, sampler); auto ds = std::make_shared<MnistDataset>(dataset_dir, sampler);
@ -877,6 +889,51 @@ std::vector<std::shared_ptr<DatasetOp>> ImageFolderDataset::Build() {
return node_ops; return node_ops;
} }
ManifestDataset::ManifestDataset(std::string dataset_file, std::string usage, std::shared_ptr<SamplerObj> sampler,
const std::map<std::string, int32_t> &class_indexing, bool decode)
: dataset_file_(dataset_file), usage_(usage), decode_(decode), class_index_(class_indexing), sampler_(sampler) {}
bool ManifestDataset::ValidateParams() {
Path manifest_file(dataset_file_);
if (!manifest_file.Exists()) {
MS_LOG(ERROR) << "dataset file: [" << dataset_file_ << "] is invalid or not exist";
return false;
}
std::vector<std::string> usage_list = {"train", "eval", "inference"};
if (find(usage_list.begin(), usage_list.end(), usage_) == usage_list.end()) {
MS_LOG(ERROR) << "usage should be train, eval or inference.";
return false;
}
return true;
}
std::vector<std::shared_ptr<DatasetOp>> ManifestDataset::Build() {
// A vector containing shared pointer to the Dataset Ops that this object will create
std::vector<std::shared_ptr<DatasetOp>> node_ops;
// If user does not specify Sampler, create a default sampler based on the shuffle variable.
if (sampler_ == nullptr) {
sampler_ = CreateDefaultSampler();
}
// Do internal Schema generation.
auto schema = std::make_unique<DataSchema>();
RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1)));
TensorShape scalar = TensorShape::CreateScalar();
RETURN_EMPTY_IF_ERROR(
schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar)));
std::shared_ptr<ManifestOp> manifest_op;
manifest_op =
std::make_shared<ManifestOp>(num_workers_, rows_per_buffer_, dataset_file_, connector_que_size_, decode_,
class_index_, std::move(schema), std::move(sampler_->Build()), usage_);
node_ops.push_back(manifest_op);
return node_ops;
}
MnistDataset::MnistDataset(std::string dataset_dir, std::shared_ptr<SamplerObj> sampler) MnistDataset::MnistDataset(std::string dataset_dir, std::shared_ptr<SamplerObj> sampler)
: dataset_dir_(dataset_dir), sampler_(sampler) {} : dataset_dir_(dataset_dir), sampler_(sampler) {}

View File

@ -49,6 +49,7 @@ class Cifar100Dataset;
class CLUEDataset; class CLUEDataset;
class CocoDataset; class CocoDataset;
class ImageFolderDataset; class ImageFolderDataset;
class ManifestDataset;
class MnistDataset; class MnistDataset;
class TextFileDataset; class TextFileDataset;
class VOCDataset; class VOCDataset;
@ -154,6 +155,21 @@ std::shared_ptr<ImageFolderDataset> ImageFolder(const std::string &dataset_dir,
const std::set<std::string> &extensions = {}, const std::set<std::string> &extensions = {},
const std::map<std::string, int32_t> &class_indexing = {}); const std::map<std::string, int32_t> &class_indexing = {});
/// \brief Function to create a ManifestDataset
/// \notes The generated dataset has two columns ['image', 'label']
/// \param[in] dataset_file The dataset file to be read
/// \param[in] usage Need "train", "eval" or "inference" data (default="train")
/// \param[in] decode Decode the images after reading (default=false).
/// \param[in] class_indexing A str-to-int mapping from label name to index (default={}, the folder
/// names will be sorted alphabetically and each class will be given a unique index starting from 0).
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
/// A `RandomSampler` will be used to randomly iterate the entire dataset
/// \return Shared pointer to the current ManifestDataset
std::shared_ptr<ManifestDataset> Manifest(std::string dataset_file, std::string usage = "train",
std::shared_ptr<SamplerObj> sampler = nullptr,
const std::map<std::string, int32_t> &class_indexing = {},
bool decode = false);
/// \brief Function to create a MnistDataset /// \brief Function to create a MnistDataset
/// \notes The generated dataset has two columns ['image', 'label'] /// \notes The generated dataset has two columns ['image', 'label']
/// \param[in] dataset_dir Path to the root directory that contains the dataset /// \param[in] dataset_dir Path to the root directory that contains the dataset
@ -500,6 +516,31 @@ class ImageFolderDataset : public Dataset {
std::set<std::string> exts_; std::set<std::string> exts_;
}; };
class ManifestDataset : public Dataset {
public:
/// \brief Constructor
ManifestDataset(std::string dataset_file, std::string usage, std::shared_ptr<SamplerObj> sampler,
const std::map<std::string, int32_t> &class_indexing, bool decode);
/// \brief Destructor
~ManifestDataset() = default;
/// \brief a base class override function to create the required runtime dataset op objects for this class
/// \return The list of shared pointers to the newly created DatasetOps
std::vector<std::shared_ptr<DatasetOp>> Build() override;
/// \brief Parameters validation
/// \return bool true if all the params are valid
bool ValidateParams() override;
private:
std::string dataset_file_;
std::string usage_;
bool decode_;
std::map<std::string, int32_t> class_index_;
std::shared_ptr<SamplerObj> sampler_;
};
class MnistDataset : public Dataset { class MnistDataset : public Dataset {
public: public:
/// \brief Constructor /// \brief Constructor

View File

@ -0,0 +1,206 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "common/common.h"
#include "minddata/dataset/include/datasets.h"
using namespace mindspore::dataset::api;
using mindspore::dataset::Tensor;
class MindDataTestPipeline : public UT::DatasetOpTesting {
protected:
};
TEST_F(MindDataTestPipeline, TestManifestBasic) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestBasic.";
std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
// Create a Manifest Dataset
std::shared_ptr<Dataset> ds = Manifest(file_path);
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
uint64_t i = 0;
while (row.size() != 0) {
i++;
auto image = row["image"];
MS_LOG(INFO) << "Tensor image shape: " << image->shape();
iter->GetNextRow(&row);
}
EXPECT_EQ(i, 2);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestManifestDecode) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestDecode.";
std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
// Create a Manifest Dataset
std::shared_ptr<Dataset> ds = Manifest(file_path, "train", nullptr, {}, true);
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
uint64_t i = 0;
while (row.size() != 0) {
i++;
auto image = row["image"];
auto shape = image->shape();
MS_LOG(INFO) << "Tensor image shape size: " << shape.Size();
MS_LOG(INFO) << "Tensor image shape: " << image->shape();
EXPECT_GT(shape.Size(), 1); // Verify decode=true took effect
iter->GetNextRow(&row);
}
EXPECT_EQ(i, 2);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestManifestEval) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestEval.";
std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
// Create a Manifest Dataset
std::shared_ptr<Dataset> ds = Manifest(file_path, "eval");
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
uint64_t i = 0;
while (row.size() != 0) {
i++;
auto image = row["image"];
MS_LOG(INFO) << "Tensor image shape: " << image->shape();
iter->GetNextRow(&row);
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestManifestClassIndex) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestClassIndex.";
std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
std::map<std::string, int32_t> map;
map["cat"] = 111; // forward slash is not good, but we need to add this somewhere, also in windows, its a '\'
map["dog"] = 222; // forward slash is not good, but we need to add this somewhere, also in windows, its a '\'
map["wrong folder name"] = 1234; // this is skipped
std::vector<int> expected_label = {111, 222};
// Create a Manifest Dataset
std::shared_ptr<Dataset> ds = Manifest(file_path, "train", nullptr, map, true);
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
uint64_t i = 0;
int32_t label_idx = 0;
while (row.size() != 0) {
i++;
auto image = row["image"];
MS_LOG(INFO) << "Tensor image shape: " << image->shape();
row["label"]->GetItemAt<int32_t>(&label_idx, {});
MS_LOG(INFO) << "Tensor label value: " << label_idx;
auto label_it = std::find(expected_label.begin(), expected_label.end(), label_idx);
EXPECT_NE(label_it, expected_label.end());
iter->GetNextRow(&row);
}
EXPECT_EQ(i, 2);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestManifestNumSamplers) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestNumSamplers.";
std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
// Create a Manifest Dataset
std::shared_ptr<Dataset> ds = Manifest(file_path, "train", SequentialSampler(0, 1), {}, true);
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
uint64_t i = 0;
while (row.size() != 0) {
i++;
auto image = row["image"];
MS_LOG(INFO) << "Tensor image shape: " << image->shape();
iter->GetNextRow(&row);
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestManifestError) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestError.";
std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
// Create a Manifest Dataset with not exist file
std::shared_ptr<Dataset> ds0 = Manifest("NotExistFile", "train");
EXPECT_EQ(ds0, nullptr);
// Create a Manifest Dataset with invalid usage
std::shared_ptr<Dataset> ds1 = Manifest(file_path, "invalid_usage");
EXPECT_EQ(ds1, nullptr);
}