forked from mindspore-Ecosystem/mindspore
!4208 C++ API Support for Manifest Dataset
Merge pull request !4208 from jiangzhiwen/jzw/c_api_manifest
This commit is contained in:
commit
b92e776052
|
@ -26,6 +26,7 @@
|
|||
#include "minddata/dataset/engine/datasetops/source/clue_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/coco_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/image_folder_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/manifest_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/mnist_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/text_file_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/voc_op.h"
|
||||
|
@ -44,6 +45,7 @@
|
|||
// Sampler headers (in alphabetical order)
|
||||
#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/sampler/random_sampler.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
|
||||
|
||||
#include "minddata/dataset/core/config_manager.h"
|
||||
#include "minddata/dataset/util/random.h"
|
||||
|
@ -164,6 +166,16 @@ std::shared_ptr<ImageFolderDataset> ImageFolder(const std::string &dataset_dir,
|
|||
return ds->ValidateParams() ? ds : nullptr;
|
||||
}
|
||||
|
||||
// Function to create a ManifestDataset.
|
||||
std::shared_ptr<ManifestDataset> Manifest(std::string dataset_file, std::string usage,
|
||||
std::shared_ptr<SamplerObj> sampler,
|
||||
const std::map<std::string, int32_t> &class_indexing, bool decode) {
|
||||
auto ds = std::make_shared<ManifestDataset>(dataset_file, usage, sampler, class_indexing, decode);
|
||||
|
||||
// Call derived class validation method.
|
||||
return ds->ValidateParams() ? ds : nullptr;
|
||||
}
|
||||
|
||||
// Function to create a MnistDataset.
|
||||
std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir, const std::shared_ptr<SamplerObj> &sampler) {
|
||||
auto ds = std::make_shared<MnistDataset>(dataset_dir, sampler);
|
||||
|
@ -877,6 +889,51 @@ std::vector<std::shared_ptr<DatasetOp>> ImageFolderDataset::Build() {
|
|||
return node_ops;
|
||||
}
|
||||
|
||||
// Constructor: only stores the creation parameters. No validation happens here;
// that is done by ValidateParams().
ManifestDataset::ManifestDataset(std::string dataset_file, std::string usage, std::shared_ptr<SamplerObj> sampler,
                                 const std::map<std::string, int32_t> &class_indexing, bool decode)
    // The by-value parameters are moved into the members to avoid an extra copy of each.
    : dataset_file_(std::move(dataset_file)),
      usage_(std::move(usage)),
      decode_(decode),
      class_index_(class_indexing),
      sampler_(std::move(sampler)) {}
|
||||
|
||||
bool ManifestDataset::ValidateParams() {
|
||||
Path manifest_file(dataset_file_);
|
||||
if (!manifest_file.Exists()) {
|
||||
MS_LOG(ERROR) << "dataset file: [" << dataset_file_ << "] is invalid or not exist";
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<std::string> usage_list = {"train", "eval", "inference"};
|
||||
if (find(usage_list.begin(), usage_list.end(), usage_) == usage_list.end()) {
|
||||
MS_LOG(ERROR) << "usage should be train, eval or inference.";
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<std::shared_ptr<DatasetOp>> ManifestDataset::Build() {
|
||||
// A vector containing shared pointer to the Dataset Ops that this object will create
|
||||
std::vector<std::shared_ptr<DatasetOp>> node_ops;
|
||||
|
||||
// If user does not specify Sampler, create a default sampler based on the shuffle variable.
|
||||
if (sampler_ == nullptr) {
|
||||
sampler_ = CreateDefaultSampler();
|
||||
}
|
||||
|
||||
// Do internal Schema generation.
|
||||
auto schema = std::make_unique<DataSchema>();
|
||||
RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1)));
|
||||
TensorShape scalar = TensorShape::CreateScalar();
|
||||
RETURN_EMPTY_IF_ERROR(
|
||||
schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar)));
|
||||
|
||||
std::shared_ptr<ManifestOp> manifest_op;
|
||||
manifest_op =
|
||||
std::make_shared<ManifestOp>(num_workers_, rows_per_buffer_, dataset_file_, connector_que_size_, decode_,
|
||||
class_index_, std::move(schema), std::move(sampler_->Build()), usage_);
|
||||
|
||||
node_ops.push_back(manifest_op);
|
||||
return node_ops;
|
||||
}
|
||||
|
||||
// Constructor: stores the dataset directory and the sampler.
// Both arguments arrive by value, so they are moved into the members instead of copied again.
MnistDataset::MnistDataset(std::string dataset_dir, std::shared_ptr<SamplerObj> sampler)
    : dataset_dir_(std::move(dataset_dir)), sampler_(std::move(sampler)) {}
|
||||
|
||||
|
|
|
@ -49,6 +49,7 @@ class Cifar100Dataset;
|
|||
class CLUEDataset;
|
||||
class CocoDataset;
|
||||
class ImageFolderDataset;
|
||||
class ManifestDataset;
|
||||
class MnistDataset;
|
||||
class TextFileDataset;
|
||||
class VOCDataset;
|
||||
|
@ -154,6 +155,21 @@ std::shared_ptr<ImageFolderDataset> ImageFolder(const std::string &dataset_dir,
|
|||
const std::set<std::string> &extensions = {},
|
||||
const std::map<std::string, int32_t> &class_indexing = {});
|
||||
|
||||
/// \brief Function to create a ManifestDataset
|
||||
/// \notes The generated dataset has two columns ['image', 'label']
|
||||
/// \param[in] dataset_file The dataset file to be read
|
||||
/// \param[in] usage Need "train", "eval" or "inference" data (default="train")
|
||||
/// \param[in] decode Decode the images after reading (default=false).
|
||||
/// \param[in] class_indexing A str-to-int mapping from label name to index (default={}, the folder
|
||||
/// names will be sorted alphabetically and each class will be given a unique index starting from 0).
|
||||
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
|
||||
/// A `RandomSampler` will be used to randomly iterate the entire dataset
|
||||
/// \return Shared pointer to the current ManifestDataset
|
||||
std::shared_ptr<ManifestDataset> Manifest(std::string dataset_file, std::string usage = "train",
|
||||
std::shared_ptr<SamplerObj> sampler = nullptr,
|
||||
const std::map<std::string, int32_t> &class_indexing = {},
|
||||
bool decode = false);
|
||||
|
||||
/// \brief Function to create a MnistDataset
|
||||
/// \notes The generated dataset has two columns ['image', 'label']
|
||||
/// \param[in] dataset_dir Path to the root directory that contains the dataset
|
||||
|
@ -500,6 +516,31 @@ class ImageFolderDataset : public Dataset {
|
|||
std::set<std::string> exts_;
|
||||
};
|
||||
|
||||
class ManifestDataset : public Dataset {
|
||||
public:
|
||||
/// \brief Constructor
|
||||
ManifestDataset(std::string dataset_file, std::string usage, std::shared_ptr<SamplerObj> sampler,
|
||||
const std::map<std::string, int32_t> &class_indexing, bool decode);
|
||||
|
||||
/// \brief Destructor
|
||||
~ManifestDataset() = default;
|
||||
|
||||
/// \brief a base class override function to create the required runtime dataset op objects for this class
|
||||
/// \return The list of shared pointers to the newly created DatasetOps
|
||||
std::vector<std::shared_ptr<DatasetOp>> Build() override;
|
||||
|
||||
/// \brief Parameters validation
|
||||
/// \return bool true if all the params are valid
|
||||
bool ValidateParams() override;
|
||||
|
||||
private:
|
||||
std::string dataset_file_;
|
||||
std::string usage_;
|
||||
bool decode_;
|
||||
std::map<std::string, int32_t> class_index_;
|
||||
std::shared_ptr<SamplerObj> sampler_;
|
||||
};
|
||||
|
||||
class MnistDataset : public Dataset {
|
||||
public:
|
||||
/// \brief Constructor
|
||||
|
|
|
@ -0,0 +1,206 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "common/common.h"
|
||||
#include "minddata/dataset/include/datasets.h"
|
||||
|
||||
using namespace mindspore::dataset::api;
|
||||
using mindspore::dataset::Tensor;
|
||||
|
||||
// Fixture shared by the Manifest pipeline tests below; it adds nothing on top of
// UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};
|
||||
|
||||
// Reads the manifest with all default arguments and expects two rows.
TEST_F(MindDataTestPipeline, TestManifestBasic) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestBasic.";

  std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
  // Create a Manifest Dataset
  std::shared_ptr<Dataset> ds = Manifest(file_path);
  // Fatal assertion: ds is dereferenced right below, so the test must stop here if it is null.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal assertion: iter is dereferenced right below.
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  // An empty row ends the iteration.
  while (!row.empty()) {
    i++;
    auto image = row["image"];
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    iter->GetNextRow(&row);
  }

  EXPECT_EQ(i, 2);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
// Reads the manifest with decode=true and checks every image has more than one element.
TEST_F(MindDataTestPipeline, TestManifestDecode) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestDecode.";

  std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
  // Create a Manifest Dataset
  std::shared_ptr<Dataset> ds = Manifest(file_path, "train", nullptr, {}, true);
  // Fatal assertion: ds is dereferenced right below, so the test must stop here if it is null.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal assertion: iter is dereferenced right below.
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  // An empty row ends the iteration.
  while (!row.empty()) {
    i++;
    auto image = row["image"];
    auto shape = image->shape();
    MS_LOG(INFO) << "Tensor image shape size: " << shape.Size();
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    EXPECT_GT(shape.Size(), 1);  // Verify decode=true took effect
    iter->GetNextRow(&row);
  }

  EXPECT_EQ(i, 2);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
// Reads the manifest with usage "eval" and expects a single row.
TEST_F(MindDataTestPipeline, TestManifestEval) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestEval.";

  std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
  // Create a Manifest Dataset
  std::shared_ptr<Dataset> ds = Manifest(file_path, "eval");
  // Fatal assertion: ds is dereferenced right below, so the test must stop here if it is null.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal assertion: iter is dereferenced right below.
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  // An empty row ends the iteration.
  while (!row.empty()) {
    i++;
    auto image = row["image"];
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    iter->GetNextRow(&row);
  }

  EXPECT_EQ(i, 1);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
// Reads the manifest with a custom class_indexing map and checks that every label
// read back is one of the expected mapped values.
TEST_F(MindDataTestPipeline, TestManifestClassIndex) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestClassIndex.";

  std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
  // Custom label-name to index mapping handed to the dataset.
  std::map<std::string, int32_t> map;
  map["cat"] = 111;
  map["dog"] = 222;
  map["wrong folder name"] = 1234;  // this is skipped
  // Only the values mapped for "cat" and "dog" are expected to show up as labels.
  std::vector<int> expected_label = {111, 222};

  // Create a Manifest Dataset
  std::shared_ptr<Dataset> ds = Manifest(file_path, "train", nullptr, map, true);
  // Fatal assertion: ds is dereferenced right below, so the test must stop here if it is null.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal assertion: iter is dereferenced right below.
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  int32_t label_idx = 0;
  // An empty row ends the iteration.
  while (!row.empty()) {
    i++;
    auto image = row["image"];
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    row["label"]->GetItemAt<int32_t>(&label_idx, {});
    MS_LOG(INFO) << "Tensor label value: " << label_idx;
    auto label_it = std::find(expected_label.begin(), expected_label.end(), label_idx);
    EXPECT_NE(label_it, expected_label.end());
    iter->GetNextRow(&row);
  }

  EXPECT_EQ(i, 2);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
// Reads the manifest through SequentialSampler(0, 1) and expects a single row.
TEST_F(MindDataTestPipeline, TestManifestNumSamplers) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestNumSamplers.";

  std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
  // Create a Manifest Dataset
  std::shared_ptr<Dataset> ds = Manifest(file_path, "train", SequentialSampler(0, 1), {}, true);
  // Fatal assertion: ds is dereferenced right below, so the test must stop here if it is null.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal assertion: iter is dereferenced right below.
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  // An empty row ends the iteration.
  while (!row.empty()) {
    i++;
    auto image = row["image"];
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    iter->GetNextRow(&row);
  }

  EXPECT_EQ(i, 1);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
// Invalid arguments must make Manifest() return nullptr instead of a dataset.
TEST_F(MindDataTestPipeline, TestManifestError) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestError.";

  std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";

  // Case 1: the manifest file does not exist.
  std::shared_ptr<Dataset> missing_file_ds = Manifest("NotExistFile", "train");
  EXPECT_EQ(missing_file_ds, nullptr);

  // Case 2: the file exists but the usage string is not an accepted one.
  std::shared_ptr<Dataset> bad_usage_ds = Manifest(file_path, "invalid_usage");
  EXPECT_EQ(bad_usage_ds, nullptr);
}
|
Loading…
Reference in New Issue