forked from mindspore-Ecosystem/mindspore
!7295 [MD] C++ API: add MindDataset
Merge pull request !7295 from luoyang/c-api-pyfunc
This commit is contained in:
commit
48601f51b0
|
@ -31,6 +31,7 @@
|
|||
#include "minddata/dataset/engine/datasetops/source/image_folder_op.h"
|
||||
#ifndef ENABLE_ANDROID
|
||||
#include "minddata/dataset/engine/datasetops/source/manifest_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/mindrecord_op.h"
|
||||
#endif
|
||||
#include "minddata/dataset/engine/datasetops/source/mnist_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/random_data_op.h"
|
||||
|
@ -223,6 +224,27 @@ std::shared_ptr<ManifestDataset> Manifest(const std::string &dataset_file, const
|
|||
}
|
||||
#endif
|
||||
|
||||
// Function to create a MindDataDataset.
|
||||
std::shared_ptr<MindDataDataset> MindData(const std::string &dataset_file, const std::vector<std::string> &columns_list,
|
||||
const std::shared_ptr<SamplerObj> &sampler, nlohmann::json padded_sample,
|
||||
int64_t num_padded) {
|
||||
auto ds = std::make_shared<MindDataDataset>(dataset_file, columns_list, sampler, padded_sample, num_padded);
|
||||
|
||||
// Call derived class validation method.
|
||||
return ds->ValidateParams() ? ds : nullptr;
|
||||
}
|
||||
|
||||
// Function to create a MindDataDataset.
|
||||
std::shared_ptr<MindDataDataset> MindData(const std::vector<std::string> &dataset_files,
|
||||
const std::vector<std::string> &columns_list,
|
||||
const std::shared_ptr<SamplerObj> &sampler, nlohmann::json padded_sample,
|
||||
int64_t num_padded) {
|
||||
auto ds = std::make_shared<MindDataDataset>(dataset_files, columns_list, sampler, padded_sample, num_padded);
|
||||
|
||||
// Call derived class validation method.
|
||||
return ds->ValidateParams() ? ds : nullptr;
|
||||
}
|
||||
|
||||
// Function to create a MnistDataset.
|
||||
std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir, const std::string &usage,
|
||||
const std::shared_ptr<SamplerObj> &sampler) {
|
||||
|
@ -709,6 +731,11 @@ Status ValidateDatasetFilesParam(const std::string &dataset_name, const std::vec
|
|||
MS_LOG(ERROR) << err_msg;
|
||||
RETURN_STATUS_SYNTAX_ERROR(err_msg);
|
||||
}
|
||||
if (access(dataset_file.toString().c_str(), R_OK) == -1) {
|
||||
std::string err_msg = dataset_name + ": No access to specified dataset file: " + f;
|
||||
MS_LOG(ERROR) << err_msg;
|
||||
RETURN_STATUS_SYNTAX_ERROR(err_msg);
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
|
@ -1388,6 +1415,146 @@ std::vector<std::shared_ptr<DatasetOp>> ManifestDataset::Build() {
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifndef ENABLE_ANDROID
|
||||
// Constructor for the explicit file-list mode: the given files are read
// directly and search_for_pattern_ is fixed to false.
MindDataDataset::MindDataDataset(const std::vector<std::string> &dataset_files,
                                 const std::vector<std::string> &columns_list,
                                 const std::shared_ptr<SamplerObj> &sampler, nlohmann::json padded_sample,
                                 int64_t num_padded)
    : dataset_file_(std::string()),  // single-file member unused in this mode
      dataset_files_(dataset_files),
      search_for_pattern_(false),
      columns_list_(columns_list),
      sampler_(sampler),
      padded_sample_(padded_sample),
      sample_bytes_({}),  // populated later via SetSampleBytes() on the python path
      num_padded_(num_padded) {}
|
||||
|
||||
// Constructor for the single-file pattern mode: dataset_file is treated as a
// pattern and search_for_pattern_ is fixed to true, so related shard files are
// discovered automatically at Build() time.
MindDataDataset::MindDataDataset(const std::string &dataset_file, const std::vector<std::string> &columns_list,
                                 const std::shared_ptr<SamplerObj> &sampler, nlohmann::json padded_sample,
                                 int64_t num_padded)
    : dataset_file_(dataset_file),
      dataset_files_({}),  // file-list member unused in this mode
      search_for_pattern_(true),
      columns_list_(columns_list),
      sampler_(sampler),
      padded_sample_(padded_sample),
      sample_bytes_({}),  // populated later via SetSampleBytes() on the python path
      num_padded_(num_padded) {}
|
||||
|
||||
// Validate the parameters supplied to the MindDataDataset constructors.
// Checks: file-list size cap, file existence/readability, sampler validity,
// column names, and the padded_sample / num_padded contract.
// \return Status::OK() if all parameters are valid, otherwise a syntax-error status.
Status MindDataDataset::ValidateParams() {
  // File-list mode caps the number of files at 4096.
  if (!search_for_pattern_ && dataset_files_.size() > 4096) {
    std::string err_msg =
      "MindDataDataset: length of dataset_file must be less than or equal to 4096, dataset_file length: " +
      // BUGFIX: report the size of the vector actually being checked; the old code
      // printed dataset_file_.size(), which is the (empty) single-file string in this mode.
      std::to_string(dataset_files_.size());
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  // Both modes share the same per-file existence/permission validation.
  std::vector<std::string> dataset_file_vec =
    search_for_pattern_ ? std::vector<std::string>{dataset_file_} : dataset_files_;
  RETURN_IF_NOT_OK(ValidateDatasetFilesParam("MindDataDataset", dataset_file_vec));

  RETURN_IF_NOT_OK(ValidateDatasetSampler("MindDataDataset", sampler_));

  if (!columns_list_.empty()) {
    RETURN_IF_NOT_OK(ValidateDatasetColumnParam("MindDataDataset", "columns_list", columns_list_));
  }

  if (padded_sample_ != nullptr) {
    if (num_padded_ < 0) {
      std::string err_msg =
        "MindDataDataset: num_padded must be greater than or equal to zero, num_padded: " + std::to_string(num_padded_);
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
    if (columns_list_.empty()) {
      std::string err_msg = "MindDataDataset: padded_sample is specified and requires columns_list as well";
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
    // Every requested column must have a corresponding key in the padded sample.
    for (const std::string &column : columns_list_) {  // const ref: read-only iteration
      if (padded_sample_.find(column) == padded_sample_.end()) {
        std::string err_msg =
          "MindDataDataset: " + column + " in columns_list does not match any column in padded_sample";
        MS_LOG(ERROR) << err_msg << ", padded_sample: " << padded_sample_;
        RETURN_STATUS_SYNTAX_ERROR(err_msg);
      }
    }
  }
  // num_padded without a padded_sample is meaningless — reject it.
  if (num_padded_ > 0) {
    if (padded_sample_ == nullptr) {
      std::string err_msg = "MindDataDataset: num_padded is specified but padded_sample is not";
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
  }

  return Status::OK();
}
|
||||
|
||||
// Helper function to create runtime sampler for minddata dataset.
// Converts the SamplerObj chain into mindrecord ShardOperator ops, reversing the
// outermost-to-child traversal via a stack so *operators_ ends up child-first.
Status MindDataDataset::BuildMindDatasetSamplerChain(
  const std::shared_ptr<SamplerObj> &sampler, std::vector<std::shared_ptr<mindrecord::ShardOperator>> *operators_,
  int64_t num_padded) {
  std::shared_ptr<mindrecord::ShardOperator> op = sampler->BuildForMindDataset();
  if (op == nullptr) {
    // BuildForMindDataset() returns nullptr for sampler types with no mindrecord equivalent.
    std::string err_msg =
      "MindDataDataset: Unsupported sampler is supplied for MindDataset. Supported sampler list: "
      "SubsetRandomSampler, PkSampler, RandomSampler, SequentialSampler and DistributedSampler";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  // Walk the op chain top-down, pushing each op onto a stack.
  std::stack<std::shared_ptr<mindrecord::ShardOperator>> stack_ops;
  while (op != nullptr) {
    // A distributed-sample op must be told the padded-sample count before it runs.
    auto sampler_op = std::dynamic_pointer_cast<mindrecord::ShardDistributedSample>(op);
    if (sampler_op && num_padded > 0) {
      sampler_op->SetNumPaddedSamples(num_padded);
      stack_ops.push(sampler_op);
    } else {
      stack_ops.push(op);
    }
    op = op->GetChildOp();
  }
  // Pop the stack to reverse the order: innermost child first, outermost op last.
  while (!stack_ops.empty()) {
    operators_->push_back(stack_ops.top());
    stack_ops.pop();
  }
  return Status::OK();
}
|
||||
|
||||
// Helper function to set sample_bytes from py::byte type.
// Called from the pybind layer after construction, since py::bytes values in
// padded_sample cannot be carried through nlohmann::json directly.
void MindDataDataset::SetSampleBytes(std::map<std::string, std::string> *sample_bytes) {
  sample_bytes_ = *sample_bytes;  // copy; caller retains ownership of the input map
}
|
||||
|
||||
// Creates the runtime DatasetOp list for this MindDataDataset.
// Returns an empty vector on failure (sampler-chain build or MindRecordOp init error).
std::vector<std::shared_ptr<DatasetOp>> MindDataDataset::Build() {
  // A vector containing shared pointer to the Dataset Ops that this object will create
  std::vector<std::shared_ptr<DatasetOp>> node_ops;

  // Translate the sampler chain into mindrecord shard operators (in execution order).
  std::vector<std::shared_ptr<ShardOperator>> operators_;
  RETURN_EMPTY_IF_ERROR(BuildMindDatasetSamplerChain(sampler_, &operators_, num_padded_));

  std::shared_ptr<MindRecordOp> mindrecord_op;
  // If pass a string to MindData(), it will be treated as a pattern to search for matched files,
  // else if pass a vector to MindData(), it will be treated as specified files to be read
  if (search_for_pattern_) {
    std::vector<std::string> dataset_file_vec_ = {dataset_file_};
    mindrecord_op = std::make_shared<MindRecordOp>(num_workers_, rows_per_buffer_, dataset_file_vec_,
                                                   search_for_pattern_, connector_que_size_, columns_list_, operators_,
                                                   num_padded_, padded_sample_, sample_bytes_);
  } else {
    mindrecord_op = std::make_shared<MindRecordOp>(num_workers_, rows_per_buffer_, dataset_files_, search_for_pattern_,
                                                   connector_que_size_, columns_list_, operators_, num_padded_,
                                                   padded_sample_, sample_bytes_);
  }

  // Init() prepares the op (presumably reads file headers — confirm against MindRecordOp);
  // failure yields an empty op list.
  RETURN_EMPTY_IF_ERROR(mindrecord_op->Init());
  node_ops.push_back(mindrecord_op);

  return node_ops;
}
|
||||
#endif
|
||||
|
||||
// Constructor for MnistDataset: stores the dataset directory, usage selector and
// sampler for later use at Build() time (valid usage values are defined by the
// runtime MnistOp — confirm there).
MnistDataset::MnistDataset(std::string dataset_dir, std::string usage, std::shared_ptr<SamplerObj> sampler)
    : dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {}
|
||||
|
||||
|
|
|
@ -69,7 +69,7 @@ PYBIND_REGISTER(ShardSequentialSample, 0, ([](const py::module *m) {
|
|||
(void)py::class_<mindrecord::ShardSequentialSample, mindrecord::ShardSample,
|
||||
std::shared_ptr<mindrecord::ShardSequentialSample>>(*m,
|
||||
"MindrecordSequentialSampler")
|
||||
.def(py::init([](int num_samples, int start_index) {
|
||||
.def(py::init([](int64_t num_samples, int64_t start_index) {
|
||||
return std::make_shared<mindrecord::ShardSequentialSample>(num_samples, start_index);
|
||||
}));
|
||||
}));
|
||||
|
|
|
@ -23,10 +23,28 @@
|
|||
#include "minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/sampler/pk_sampler.h"
|
||||
|
||||
#include "minddata/mindrecord/include/shard_distributed_sample.h"
|
||||
#include "minddata/mindrecord/include/shard_operator.h"
|
||||
#include "minddata/mindrecord/include/shard_pk_sample.h"
|
||||
#include "minddata/mindrecord/include/shard_sample.h"
|
||||
#include "minddata/mindrecord/include/shard_sequential_sample.h"
|
||||
#include "minddata/mindrecord/include/shard_shuffle.h"
|
||||
#include "minddata/dataset/util/random.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
namespace api {
|
||||
|
||||
// Log the error status and return nullptr from the enclosing function.
// Wrapped in do { } while (false) so it behaves as a single statement after an
// unbraced `if`.
#define RETURN_NULL_IF_ERROR(_s) \
  do {                           \
    Status __rc = (_s);          \
    if (__rc.IsError()) {        \
      MS_LOG(ERROR) << __rc;     \
      return nullptr;            \
    }                            \
  } while (false)
|
||||
|
||||
// Constructor
|
||||
SamplerObj::SamplerObj() {}
|
||||
|
||||
/// Function to create a Distributed Sampler.
|
||||
|
@ -126,8 +144,17 @@ bool DistributedSamplerObj::ValidateParams() {
|
|||
}
|
||||
|
||||
std::shared_ptr<Sampler> DistributedSamplerObj::Build() {
|
||||
return std::make_shared<dataset::DistributedSampler>(num_samples_, num_shards_, shard_id_, shuffle_, seed_, offset_,
|
||||
even_dist_);
|
||||
// runtime sampler object
|
||||
auto sampler = std::make_shared<dataset::DistributedSampler>(num_samples_, num_shards_, shard_id_, shuffle_, seed_,
|
||||
offset_, even_dist_);
|
||||
return sampler;
|
||||
}
|
||||
|
||||
std::shared_ptr<mindrecord::ShardOperator> DistributedSamplerObj::BuildForMindDataset() {
|
||||
// runtime mindrecord sampler object
|
||||
auto mind_sampler = std::make_shared<mindrecord::ShardDistributedSample>(num_shards_, shard_id_, shuffle_, seed_,
|
||||
num_samples_, offset_);
|
||||
return mind_sampler;
|
||||
}
|
||||
|
||||
// PKSampler
|
||||
|
@ -148,7 +175,23 @@ bool PKSamplerObj::ValidateParams() {
|
|||
}
|
||||
|
||||
std::shared_ptr<Sampler> PKSamplerObj::Build() {
|
||||
return std::make_shared<dataset::PKSampler>(num_samples_, num_val_, shuffle_);
|
||||
// runtime sampler object
|
||||
auto sampler = std::make_shared<dataset::PKSampler>(num_samples_, num_val_, shuffle_);
|
||||
|
||||
return sampler;
|
||||
}
|
||||
|
||||
std::shared_ptr<mindrecord::ShardOperator> PKSamplerObj::BuildForMindDataset() {
|
||||
// runtime mindrecord sampler object
|
||||
std::shared_ptr<mindrecord::ShardOperator> mind_sampler;
|
||||
if (shuffle_ == true) {
|
||||
mind_sampler = std::make_shared<mindrecord::ShardPkSample>("label", num_val_, std::numeric_limits<int64_t>::max(),
|
||||
GetSeed(), num_samples_);
|
||||
} else {
|
||||
mind_sampler = std::make_shared<mindrecord::ShardPkSample>("label", num_val_, num_samples_);
|
||||
}
|
||||
|
||||
return mind_sampler;
|
||||
}
|
||||
|
||||
// RandomSampler
|
||||
|
@ -164,11 +207,22 @@ bool RandomSamplerObj::ValidateParams() {
|
|||
}
|
||||
|
||||
std::shared_ptr<Sampler> RandomSamplerObj::Build() {
|
||||
// runtime sampler object
|
||||
bool reshuffle_each_epoch = true;
|
||||
auto sampler = std::make_shared<dataset::RandomSampler>(num_samples_, replacement_, reshuffle_each_epoch);
|
||||
|
||||
return sampler;
|
||||
}
|
||||
|
||||
std::shared_ptr<mindrecord::ShardOperator> RandomSamplerObj::BuildForMindDataset() {
|
||||
// runtime mindrecord sampler object
|
||||
bool reshuffle_each_epoch_ = true;
|
||||
auto mind_sampler =
|
||||
std::make_shared<mindrecord::ShardShuffle>(GetSeed(), num_samples_, replacement_, reshuffle_each_epoch_);
|
||||
|
||||
return mind_sampler;
|
||||
}
|
||||
|
||||
// SequentialSampler
|
||||
SequentialSamplerObj::SequentialSamplerObj(int64_t start_index, int64_t num_samples)
|
||||
: start_index_(start_index), num_samples_(num_samples) {}
|
||||
|
@ -188,10 +242,19 @@ bool SequentialSamplerObj::ValidateParams() {
|
|||
}
|
||||
|
||||
std::shared_ptr<Sampler> SequentialSamplerObj::Build() {
|
||||
// runtime sampler object
|
||||
auto sampler = std::make_shared<dataset::SequentialSampler>(num_samples_, start_index_);
|
||||
|
||||
return sampler;
|
||||
}
|
||||
|
||||
std::shared_ptr<mindrecord::ShardOperator> SequentialSamplerObj::BuildForMindDataset() {
|
||||
// runtime mindrecord sampler object
|
||||
auto mind_sampler = std::make_shared<mindrecord::ShardSequentialSample>(num_samples_, start_index_);
|
||||
|
||||
return mind_sampler;
|
||||
}
|
||||
|
||||
// SubsetRandomSampler
|
||||
SubsetRandomSamplerObj::SubsetRandomSamplerObj(std::vector<int64_t> indices, int64_t num_samples)
|
||||
: indices_(std::move(indices)), num_samples_(num_samples) {}
|
||||
|
@ -206,10 +269,19 @@ bool SubsetRandomSamplerObj::ValidateParams() {
|
|||
}
|
||||
|
||||
std::shared_ptr<Sampler> SubsetRandomSamplerObj::Build() {
|
||||
// runtime sampler object
|
||||
auto sampler = std::make_shared<dataset::SubsetRandomSampler>(num_samples_, indices_);
|
||||
|
||||
return sampler;
|
||||
}
|
||||
|
||||
std::shared_ptr<mindrecord::ShardOperator> SubsetRandomSamplerObj::BuildForMindDataset() {
|
||||
// runtime mindrecord sampler object
|
||||
auto mind_sampler = std::make_shared<mindrecord::ShardSample>(indices_, GetSeed());
|
||||
|
||||
return mind_sampler;
|
||||
}
|
||||
|
||||
// WeightedRandomSampler
|
||||
WeightedRandomSamplerObj::WeightedRandomSamplerObj(std::vector<double> weights, int64_t num_samples, bool replacement)
|
||||
: weights_(std::move(weights)), num_samples_(num_samples), replacement_(replacement) {}
|
||||
|
|
|
@ -66,6 +66,7 @@ class CsvBase;
|
|||
class ImageFolderDataset;
|
||||
#ifndef ENABLE_ANDROID
|
||||
class ManifestDataset;
|
||||
class MindDataDataset;
|
||||
#endif
|
||||
class MnistDataset;
|
||||
class RandomDataset;
|
||||
|
@ -244,6 +245,37 @@ std::shared_ptr<ManifestDataset> Manifest(const std::string &dataset_file, const
|
|||
bool decode = false);
|
||||
#endif
|
||||
|
||||
#ifndef ENABLE_ANDROID
|
||||
/// \brief Function to create a MindDataDataset
|
||||
/// \param[in] dataset_file File name of one component of a mindrecord source. Other files with identical source
|
||||
/// in the same path will be found and loaded automatically.
|
||||
/// \param[in] columns_list List of columns to be read (default={})
|
||||
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
|
||||
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()),
|
||||
/// supported sampler list: SubsetRandomSampler, PkSampler, RandomSampler, SequentialSampler, DistributedSampler.
|
||||
/// \param[in] padded_sample Samples will be appended to dataset, where keys are the same as column_list.
|
||||
/// \param[in] num_padded Number of padding samples. Dataset size plus num_padded should be divisible by num_shards.
|
||||
/// \return Shared pointer to the current MindDataDataset
|
||||
std::shared_ptr<MindDataDataset> MindData(const std::string &dataset_file,
|
||||
const std::vector<std::string> &columns_list = {},
|
||||
const std::shared_ptr<SamplerObj> &sampler = RandomSampler(),
|
||||
nlohmann::json padded_sample = nullptr, int64_t num_padded = 0);
|
||||
|
||||
/// \brief Function to create a MindDataDataset
|
||||
/// \param[in] dataset_files List of dataset files to be read directly.
|
||||
/// \param[in] columns_list List of columns to be read (default={})
|
||||
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
|
||||
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()),
|
||||
/// supported sampler list: SubsetRandomSampler, PkSampler, RandomSampler, SequentialSampler, DistributedSampler.
|
||||
/// \param[in] padded_sample Samples will be appended to dataset, where keys are the same as column_list.
|
||||
/// \param[in] num_padded Number of padding samples. Dataset size plus num_padded should be divisible by num_shards.
|
||||
/// \return Shared pointer to the current MindDataDataset
|
||||
std::shared_ptr<MindDataDataset> MindData(const std::vector<std::string> &dataset_files,
|
||||
const std::vector<std::string> &columns_list = {},
|
||||
const std::shared_ptr<SamplerObj> &sampler = RandomSampler(),
|
||||
nlohmann::json padded_sample = nullptr, int64_t num_padded = 0);
|
||||
#endif
|
||||
|
||||
/// \brief Function to create a MnistDataset
|
||||
/// \notes The generated dataset has two columns ["image", "label"]
|
||||
/// \param[in] dataset_dir Path to the root directory that contains the dataset
|
||||
|
@ -938,6 +970,50 @@ class ManifestDataset : public Dataset {
|
|||
};
|
||||
#endif
|
||||
|
||||
#ifndef ENABLE_ANDROID
|
||||
// Dataset node for reading MindRecord files. Operates in one of two mutually
// exclusive modes selected by the constructor used: single-file pattern search
// (search_for_pattern_ == true) or explicit file list (== false).
class MindDataDataset : public Dataset {
 public:
  /// \brief Constructor
  MindDataDataset(const std::vector<std::string> &dataset_files, const std::vector<std::string> &columns_list,
                  const std::shared_ptr<SamplerObj> &sampler, nlohmann::json padded_sample, int64_t num_padded);

  /// \brief Constructor
  MindDataDataset(const std::string &dataset_file, const std::vector<std::string> &columns_list,
                  const std::shared_ptr<SamplerObj> &sampler, nlohmann::json padded_sample, int64_t num_padded);

  /// \brief Destructor
  ~MindDataDataset() = default;

  /// \brief a base class override function to create the required runtime dataset op objects for this class
  /// \return The list of shared pointers to the newly created DatasetOps
  std::vector<std::shared_ptr<DatasetOp>> Build() override;

  /// \brief Parameters validation
  /// \return Status Status::OK() if all the parameters are valid
  Status ValidateParams() override;

  /// \brief Build sampler chain for minddata dataset
  /// \return Status Status::OK() if input sampler is valid
  Status BuildMindDatasetSamplerChain(const std::shared_ptr<SamplerObj> &sampler,
                                      std::vector<std::shared_ptr<mindrecord::ShardOperator>> *operators_,
                                      int64_t num_padded);

  /// \brief Set sample_bytes when padded_sample has py::byte value
  /// \note Pybind will use this function to set sample_bytes into MindDataDataset
  void SetSampleBytes(std::map<std::string, std::string> *sample_bytes);

 private:
  std::string dataset_file_;               // search_for_pattern_ will be true in this mode
  std::vector<std::string> dataset_files_;  // search_for_pattern_ will be false in this mode
  bool search_for_pattern_;
  std::vector<std::string> columns_list_;
  std::shared_ptr<SamplerObj> sampler_;
  nlohmann::json padded_sample_;
  std::map<std::string, std::string> sample_bytes_;  // enable in python
  int64_t num_padded_;
};
|
||||
#endif
|
||||
|
||||
class MnistDataset : public Dataset {
|
||||
public:
|
||||
/// \brief Constructor
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "minddata/dataset/engine/datasetops/source/mindrecord_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
@ -30,12 +31,24 @@ namespace api {
|
|||
|
||||
class SamplerObj : public std::enable_shared_from_this<SamplerObj> {
|
||||
public:
|
||||
/// \brief Constructor
|
||||
SamplerObj();
|
||||
|
||||
/// \brief Destructor
|
||||
~SamplerObj() = default;
|
||||
|
||||
virtual std::shared_ptr<Sampler> Build() = 0;
|
||||
/// \brief Pure virtual function for derived class to implement parameters validation
|
||||
/// \return bool true if all the parameters are valid
|
||||
virtual bool ValidateParams() = 0;
|
||||
|
||||
/// \brief Pure virtual function to convert a SamplerObj class into a runtime sampler object
|
||||
/// \return Shared pointers to the newly created Sampler
|
||||
virtual std::shared_ptr<Sampler> Build() = 0;
|
||||
|
||||
/// \brief Virtual function to convert a SamplerObj class into a runtime mindrecord sampler object,
|
||||
/// only override by SubsetRandomSampler, PkSampler, RandomSampler, SequentialSampler, DistributedSampler
|
||||
/// \return Shared pointers to the newly created Sampler
|
||||
virtual std::shared_ptr<mindrecord::ShardOperator> BuildForMindDataset() { return nullptr; }
|
||||
};
|
||||
|
||||
class DistributedSamplerObj;
|
||||
|
@ -110,6 +123,8 @@ class DistributedSamplerObj : public SamplerObj {
|
|||
|
||||
std::shared_ptr<Sampler> Build() override;
|
||||
|
||||
std::shared_ptr<mindrecord::ShardOperator> BuildForMindDataset() override;
|
||||
|
||||
bool ValidateParams() override;
|
||||
|
||||
private:
|
||||
|
@ -130,6 +145,8 @@ class PKSamplerObj : public SamplerObj {
|
|||
|
||||
std::shared_ptr<Sampler> Build() override;
|
||||
|
||||
std::shared_ptr<mindrecord::ShardOperator> BuildForMindDataset() override;
|
||||
|
||||
bool ValidateParams() override;
|
||||
|
||||
private:
|
||||
|
@ -146,6 +163,8 @@ class RandomSamplerObj : public SamplerObj {
|
|||
|
||||
std::shared_ptr<Sampler> Build() override;
|
||||
|
||||
std::shared_ptr<mindrecord::ShardOperator> BuildForMindDataset() override;
|
||||
|
||||
bool ValidateParams() override;
|
||||
|
||||
private:
|
||||
|
@ -161,6 +180,8 @@ class SequentialSamplerObj : public SamplerObj {
|
|||
|
||||
std::shared_ptr<Sampler> Build() override;
|
||||
|
||||
std::shared_ptr<mindrecord::ShardOperator> BuildForMindDataset() override;
|
||||
|
||||
bool ValidateParams() override;
|
||||
|
||||
private:
|
||||
|
@ -176,6 +197,8 @@ class SubsetRandomSamplerObj : public SamplerObj {
|
|||
|
||||
std::shared_ptr<Sampler> Build() override;
|
||||
|
||||
std::shared_ptr<mindrecord::ShardOperator> BuildForMindDataset() override;
|
||||
|
||||
bool ValidateParams() override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -27,7 +27,7 @@ namespace mindspore {
|
|||
namespace mindrecord {
|
||||
class ShardSequentialSample : public ShardSample {
|
||||
public:
|
||||
ShardSequentialSample(int n, int offset);
|
||||
ShardSequentialSample(int64_t n, int64_t offset);
|
||||
|
||||
ShardSequentialSample(float per, float per_offset);
|
||||
|
||||
|
@ -38,7 +38,7 @@ class ShardSequentialSample : public ShardSample {
|
|||
int64_t GetNumSamples(int64_t dataset_size, int64_t num_classes) override;
|
||||
|
||||
private:
|
||||
int offset_;
|
||||
int64_t offset_;
|
||||
float per_;
|
||||
float per_offset_;
|
||||
};
|
||||
|
|
|
@ -22,7 +22,7 @@ using mindspore::MsLogLevel::ERROR;
|
|||
|
||||
namespace mindspore {
|
||||
namespace mindrecord {
|
||||
ShardSequentialSample::ShardSequentialSample(int n, int offset)
|
||||
ShardSequentialSample::ShardSequentialSample(int64_t n, int64_t offset)
|
||||
: ShardSample(n), offset_(offset), per_(0.0f), per_offset_(0.0f) {}
|
||||
|
||||
ShardSequentialSample::ShardSequentialSample(float per, float per_offset)
|
||||
|
|
|
@ -3047,7 +3047,10 @@ class MindDataset(MappableDataset):
|
|||
A source dataset that reads MindRecord files.
|
||||
|
||||
Args:
|
||||
dataset_file (Union[str, list[str]]): One of file names or file list in dataset.
|
||||
dataset_file (Union[str, list[str]]): If dataset_file is a str, it represents for
|
||||
a file name of one component of a mindrecord source, other files with identical source
|
||||
in the same path will be found and loaded automatically. If dataset_file is a list,
|
||||
it represents for a list of dataset files to be read directly.
|
||||
columns_list (list[str], optional): List of columns to be read (default=None).
|
||||
num_parallel_workers (int, optional): The number of readers (default=None).
|
||||
shuffle (bool, optional): Whether or not to perform shuffle on the dataset
|
||||
|
@ -3059,7 +3062,7 @@ class MindDataset(MappableDataset):
|
|||
dataset (default=None, sampler is exclusive
|
||||
with shuffle and block_reader). Support list: SubsetRandomSampler,
|
||||
PkSampler, RandomSampler, SequentialSampler, DistributedSampler.
|
||||
padded_sample (dict, optional): Samples will be appended to dataset, which
|
||||
padded_sample (dict, optional): Samples will be appended to dataset, where
|
||||
keys are the same as column_list.
|
||||
num_padded (int, optional): Number of padding samples. Dataset size
|
||||
plus num_padded should be divisible by num_shards.
|
||||
|
|
|
@ -0,0 +1,411 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "common/common.h"
|
||||
#include "minddata/dataset/include/datasets.h"
|
||||
|
||||
using namespace mindspore::dataset::api;
|
||||
using mindspore::dataset::Tensor;
|
||||
|
||||
class MindDataTestPipeline : public UT::DatasetOpTesting {
|
||||
protected:
|
||||
};
|
||||
|
||||
// Verifies pattern mode: one shard file name expands to all sibling shards.
TEST_F(MindDataTestPipeline, TestMindDataSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMindDataSuccess1 with string file pattern.";

  // Create a MindData Dataset
  // Pass one mindrecord shard file to parse dataset info, and search for other mindrecord files with same dataset info,
  // thus all records in imagenet.mindrecord0 ~ imagenet.mindrecord3 will be read
  std::string file_path = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::shared_ptr<Dataset> ds = MindData(file_path);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;  // rows seen so far
  while (row.size() != 0) {
    i++;
    auto image = row["file_name"];
    MS_LOG(INFO) << "Tensor image file name: " << *image;
    iter->GetNextRow(&row);
  }

  // Each *.mindrecord file has 5 rows, so there are 20 rows in total(imagenet.mindrecord0 ~ imagenet.mindrecord3)
  EXPECT_EQ(i, 20);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
|
||||
// Verifies list mode with a single file: only that file's records are read.
TEST_F(MindDataTestPipeline, TestMindDataSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMindDataSuccess2 with a vector of single mindrecord file.";

  // Create a MindData Dataset
  // Pass a list of mindrecord file name, files in list will be read directly but not search for related files
  std::string file_path1 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::shared_ptr<Dataset> ds = MindData(std::vector<std::string>{file_path1});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;  // rows seen so far
  while (row.size() != 0) {
    i++;
    auto image = row["file_name"];
    MS_LOG(INFO) << "Tensor image file name: " << *image;
    iter->GetNextRow(&row);
  }

  // Only records in imagenet.mindrecord0 are read
  EXPECT_EQ(i, 5);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
// Verifies list mode with multiple files: exactly the listed files are read.
TEST_F(MindDataTestPipeline, TestMindDataSuccess3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMindDataSuccess3 with a vector of multiple mindrecord files.";

  // Create a MindData Dataset
  // Pass a list of mindrecord file name, files in list will be read directly but not search for related files
  std::string file_path1 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::string file_path2 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord1";
  std::vector<std::string> file_list = {file_path1, file_path2};
  std::shared_ptr<Dataset> ds = MindData(file_list);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;  // rows seen so far
  while (row.size() != 0) {
    i++;
    auto image = row["file_name"];
    MS_LOG(INFO) << "Tensor image file name: " << *image;
    iter->GetNextRow(&row);
  }

  // Only records in imagenet.mindrecord0 and imagenet.mindrecord1 are read
  EXPECT_EQ(i, 10);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
// Verifies column selection in pattern mode: only the "label" column is returned.
TEST_F(MindDataTestPipeline, TestMindDataSuccess4) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMindDataSuccess4 with specified column.";

  // Create a MindData Dataset
  // Pass one mindrecord shard file to parse dataset info, and search for other mindrecord files with same dataset info,
  // thus all records in imagenet.mindrecord0 ~ imagenet.mindrecord3 will be read
  std::string file_path1 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord1";
  std::shared_ptr<Dataset> ds = MindData(file_path1, {"label"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;  // rows seen so far
  while (row.size() != 0) {
    i++;
    auto label = row["label"];
    MS_LOG(INFO) << "Tensor label: " << *label;
    iter->GetNextRow(&row);
  }

  // Shard file "mindrecord0/mindrecord1/mindrecord2/mindrecord3" have same dataset info,
  // thus if input file is any of them, all records in imagenet.mindrecord* will be read
  EXPECT_EQ(i, 20);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestMindDataSuccess5) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMindDataSuccess5 with specified sampler.";

  // Create a MindData Dataset
  // Pass one mindrecord shard file to parse dataset info, and search for other mindrecord files with same dataset info,
  // thus all records in imagenet.mindrecord0 ~ imagenet.mindrecord3 will be read
  std::string file_path1 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::shared_ptr<Dataset> ds = MindData(file_path1, {}, SequentialSampler(0, 3));
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  while (row.size() != 0) {
    i++;
    auto label = row["label"];

    // The first 3 sequential samples are all expected to carry label 0.
    std::shared_ptr<Tensor> expected_item;
    Tensor::CreateScalar((int64_t)0, &expected_item);
    EXPECT_EQ(*expected_item, *label);

    iter->GetNextRow(&row);
  }

  // SequentialSampler will return 3 samples
  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestMindDataSuccess6) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMindDataSuccess6 with num_samples out of range.";

  // Create a MindData Dataset
  // Pass a list of mindrecord file name, files in list will be read directly but not search for related files
  // imagenet.mindrecord0 file has 5 rows, but num_samples is larger than 5
  std::string file_path1 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::vector<std::string> file_list = {file_path1};

  // Check sequential sampler, output number is 10, with duplicate samples(a little weird, wait to fix)
  std::shared_ptr<Dataset> ds1 = MindData(file_list, {}, SequentialSampler(0, 10));
  EXPECT_NE(ds1, nullptr);

  // Check random sampler, output number is 5, same rows with file
  std::shared_ptr<Dataset> ds2 = MindData(file_list, {}, RandomSampler(false, 10));
  EXPECT_NE(ds2, nullptr);

  // Check pk sampler, output number is 2, get 2 samples with label 0
  std::shared_ptr<Dataset> ds3 = MindData(file_list, {}, PKSampler(2, false, 10));
  EXPECT_NE(ds3, nullptr);

  // Check distributed sampler, output number is 3, get 3 samples in shard 0
  std::shared_ptr<Dataset> ds4 = MindData(file_list, {}, DistributedSampler(2, 0, false, 10));
  EXPECT_NE(ds4, nullptr);

  // Check distributed sampler get 3 samples with indice 0, 1 ,2
  std::shared_ptr<Dataset> ds5 = MindData(file_list, {}, SubsetRandomSampler({0, 1, 2}, 10));
  EXPECT_NE(ds5, nullptr);

  std::vector<std::shared_ptr<Dataset>> ds = {ds1, ds2, ds3, ds4, ds5};
  std::vector<int32_t> expected_samples = {10, 5, 2, 3, 3};

  // Use size_t for the index to avoid a signed/unsigned comparison with ds.size().
  for (size_t i = 0; i < ds.size(); i++) {
    // Create an iterator over the result of the above dataset.
    // This will trigger the creation of the Execution Tree and launch it.
    std::shared_ptr<Iterator> iter = ds[i]->CreateIterator();
    EXPECT_NE(iter, nullptr);

    // Iterate the dataset and get each row
    std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
    iter->GetNextRow(&row);

    uint64_t j = 0;
    while (row.size() != 0) {
      j++;
      MS_LOG(INFO) << "Tensor label: " << *row["label"];
      iter->GetNextRow(&row);
    }
    EXPECT_EQ(j, expected_samples[i]);

    // Manually terminate the pipeline
    iter->Stop();
  }
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestMindDataSuccess7) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMindDataSuccess7 with padded sample.";

  // Create pad sample for MindDataset
  auto pad = nlohmann::json::object();
  pad["file_name"] = "does_not_exist.jpg";
  pad["label"] = 999;

  // Create a MindData Dataset
  // Pass a list of mindrecord file name, files in list will be read directly but not search for related files
  std::string file_path1 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::vector<std::string> file_list = {file_path1};
  std::shared_ptr<Dataset> ds = MindData(file_list, {"file_name", "label"}, SequentialSampler(), pad, 4);
  EXPECT_NE(ds, nullptr);

  // Create a Skip operation on ds, skip original data in mindrecord and get padded samples
  ds = ds->Skip(5);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  while (row.size() != 0) {
    i++;
    auto image = row["file_name"];
    auto label = row["label"];
    MS_LOG(INFO) << "Tensor file name: " << *image;
    MS_LOG(INFO) << "Tensor label: " << *label;

    // After skipping the 5 real rows, every remaining row is a padded sample with label 999.
    std::shared_ptr<Tensor> expected_item;
    Tensor::CreateScalar((int64_t)999, &expected_item);
    EXPECT_EQ(*expected_item, *label);

    iter->GetNextRow(&row);
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestMindDataFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMindDataFail1 with incorrect file path.";

  // Create a MindData Dataset with incorrect pattern
  std::string file_path1 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/apple.mindrecord0";
  std::shared_ptr<Dataset> ds1 = MindData(file_path1);
  EXPECT_EQ(ds1, nullptr);

  // Create a MindData Dataset with incorrect file path
  std::string file_path2 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/apple.mindrecord0";
  std::vector<std::string> file_list = {file_path2};
  std::shared_ptr<Dataset> ds2 = MindData(file_list);
  EXPECT_EQ(ds2, nullptr);

  // Create a MindData Dataset with incorrect file path
  // ATTENTION: file_path3 is not a pattern to search for ".mindrecord*"
  std::string file_path3 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord";
  std::shared_ptr<Dataset> ds3 = MindData(file_path3);
  EXPECT_EQ(ds3, nullptr);
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestMindDataFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMindDataFail2 with incorrect column name.";

  // Create a MindData Dataset with incorrect column name
  std::string file_path1 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::shared_ptr<Dataset> ds1 = MindData(file_path1, {""});
  EXPECT_EQ(ds1, nullptr);

  // Create a MindData Dataset with duplicate column name
  std::string file_path2 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::shared_ptr<Dataset> ds2 = MindData(file_path2, {"label", "label"});
  EXPECT_EQ(ds2, nullptr);

  // Create a MindData Dataset with unexpected column name.
  // Construction succeeds; the failure only surfaces when the execution tree is built.
  std::string file_path3 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::vector<std::string> file_list = {file_path3};
  std::shared_ptr<Dataset> ds3 = MindData(file_list, {"label", "not_exist"});
  EXPECT_NE(ds3, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds3->CreateIterator();
  EXPECT_EQ(iter, nullptr);
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestMindDataFail3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMindDataFail3 with unsupported sampler.";

  // Create a MindData Dataset with unsupported sampler.
  // Construction succeeds; the failure only surfaces when the execution tree is built.
  std::string file_path1 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::shared_ptr<Dataset> ds1 = MindData(file_path1, {}, WeightedRandomSampler({1, 1, 1, 1}));
  EXPECT_NE(ds1, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter1 = ds1->CreateIterator();
  EXPECT_EQ(iter1, nullptr);

  // Create a MindData Dataset with incorrect sampler
  std::string file_path2 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::shared_ptr<Dataset> ds2 = MindData(file_path2, {}, nullptr);
  EXPECT_EQ(ds2, nullptr);
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestMindDataFail4) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMindData with padded sample.";

  // Create a MindData Dataset
  std::string file_path1 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::shared_ptr<Dataset> ds1 = MindData(file_path1, {}, RandomSampler(), nullptr, 2);

  // num_padded is specified but padded_sample is not
  EXPECT_EQ(ds1, nullptr);

  // Create padded sample for MindDataset
  auto pad = nlohmann::json::object();
  pad["file_name"] = "1.jpg";
  pad["label"] = 123456;

  // Create a MindData Dataset
  std::string file_path2 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::shared_ptr<Dataset> ds2 = MindData(file_path2, {"label"}, RandomSampler(), pad, -2);

  // num_padded must be greater than or equal to zero
  EXPECT_EQ(ds2, nullptr);

  // Create a MindData Dataset
  std::string file_path3 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::shared_ptr<Dataset> ds3 = MindData(file_path3, {}, RandomSampler(), pad, 1);

  // padded_sample is specified and requires columns_list as well
  EXPECT_EQ(ds3, nullptr);

  // Create padded sample with unmatched column name
  auto pad2 = nlohmann::json::object();
  pad2["a"] = "1.jpg";
  pad2["b"] = 123456;

  // Create a MindData Dataset
  std::string file_path4 = datasets_root_path_ + "/../mindrecord/testMindDataSet/testImageNetData/imagenet.mindrecord0";
  std::shared_ptr<Dataset> ds4 = MindData(file_path4, {"file_name", "label"}, RandomSampler(), pad2, 1);

  // columns_list does not match any column in padded_sample
  EXPECT_EQ(ds4, nullptr);
}
|
Loading…
Reference in New Issue