forked from mindspore-Ecosystem/mindspore
add randomdataset and schema
This commit is contained in:
parent
2cc6230f81
commit
b91e56375e
|
@ -27,6 +27,7 @@
|
|||
#include "minddata/dataset/engine/datasetops/source/coco_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/image_folder_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/mnist_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/random_data_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/text_file_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/voc_op.h"
|
||||
// Dataset operator headers (in alphabetical order)
|
||||
|
@ -100,6 +101,15 @@ Dataset::Dataset() {
|
|||
worker_connector_size_ = cfg->worker_connector_size();
|
||||
}
|
||||
|
||||
/// \brief Function to create a SchemaObj
|
||||
/// \param[in] schema_file Path of schema file
|
||||
/// \return Shared pointer to the current schema
|
||||
std::shared_ptr<SchemaObj> Schema(const std::string &schema_file) {
|
||||
auto schema = std::make_shared<SchemaObj>(schema_file);
|
||||
|
||||
return schema->init() ? schema : nullptr;
|
||||
}
|
||||
|
||||
// FUNCTIONS TO CREATE DATASETS FOR LEAF-NODE DATASETS
|
||||
// (In alphabetical order)
|
||||
|
||||
|
@ -353,6 +363,163 @@ std::shared_ptr<ZipDataset> Dataset::Zip(const std::vector<std::shared_ptr<Datas
|
|||
return ds->ValidateParams() ? ds : nullptr;
|
||||
}
|
||||
|
||||
// SchemaObj constructor: only records the schema file path; the file itself is read by init().
// Initializers are listed in member declaration order (num_rows_, dataset_type_, schema_file_),
// which is the order the members are actually initialized in, so -Wreorder stays quiet.
SchemaObj::SchemaObj(const std::string &schema_file)
    : num_rows_(0), dataset_type_(""), schema_file_(schema_file) {}
|
||||
|
||||
// SchemaObj init function
|
||||
bool SchemaObj::init() {
|
||||
if (schema_file_ != "") {
|
||||
Path schema_file(schema_file_);
|
||||
if (!schema_file.Exists()) {
|
||||
MS_LOG(ERROR) << "The file " << schema_file << " does not exist or permission denied!";
|
||||
return false;
|
||||
}
|
||||
|
||||
nlohmann::json js;
|
||||
try {
|
||||
std::ifstream in(schema_file_);
|
||||
in >> js;
|
||||
} catch (const std::exception &err) {
|
||||
MS_LOG(ERROR) << "Schema file failed to load";
|
||||
return false;
|
||||
}
|
||||
return from_json(js);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Function to add a column to schema with a mstype de_type
|
||||
bool SchemaObj::add_column(std::string name, TypeId de_type, std::vector<int32_t> shape) {
|
||||
nlohmann::json new_column;
|
||||
new_column["name"] = name;
|
||||
// if de_type is mstype
|
||||
DataType data_type = dataset::MSTypeToDEType(de_type);
|
||||
new_column["type"] = data_type.ToString();
|
||||
if (shape.size() > 0) {
|
||||
new_column["shape"] = shape;
|
||||
new_column["rank"] = shape.size();
|
||||
} else {
|
||||
new_column["rank"] = 1;
|
||||
}
|
||||
columns_.push_back(new_column);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Function to add a column to schema with a string de_type
|
||||
bool SchemaObj::add_column(std::string name, std::string de_type, std::vector<int32_t> shape) {
|
||||
nlohmann::json new_column;
|
||||
new_column["name"] = name;
|
||||
DataType data_type(de_type);
|
||||
new_column["type"] = data_type.ToString();
|
||||
if (shape.size() > 0) {
|
||||
new_column["shape"] = shape;
|
||||
new_column["rank"] = shape.size();
|
||||
} else {
|
||||
new_column["rank"] = 1;
|
||||
}
|
||||
columns_.push_back(new_column);
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string SchemaObj::to_json() {
|
||||
nlohmann::json json_file;
|
||||
json_file["columns"] = columns_;
|
||||
if (dataset_type_ != "") {
|
||||
json_file["datasetType"] = dataset_type_;
|
||||
}
|
||||
|
||||
if (num_rows_ > 0) {
|
||||
json_file["numRows"] = num_rows_;
|
||||
}
|
||||
|
||||
return json_file.dump(2);
|
||||
}
|
||||
|
||||
// Rebuild columns_ from the "columns" entry of a schema.
// Two layouts are accepted:
//   - a JSON array whose elements carry "name", "type" and an optional "shape";
//   - a JSON object keyed by column name whose values carry "type" and an optional "shape".
// Returns false, after logging, when a required field is missing or `columns` is neither an
// array nor an object. Columns added before the failure remain in columns_.
bool SchemaObj::parse_column(nlohmann::json columns) {
  // Shared tail of both layouts: read "type" and the optional "shape" of one column description
  // and append the column under the given name.
  auto append_column = [this](const std::string &name, const nlohmann::json &column) -> bool {
    auto key_type = column.find("type");
    if (key_type == column.end()) {
      MS_LOG(ERROR) << "Column's type is missing";
      return false;
    }
    const std::string de_type = key_type->get<std::string>();

    std::vector<int32_t> shape;
    auto key_shape = column.find("shape");
    if (key_shape != column.end()) {
      shape.insert(shape.end(), key_shape->begin(), key_shape->end());
    }
    return add_column(name, de_type, shape);
  };

  columns_.clear();
  if (columns.type() == nlohmann::json::value_t::array) {
    // reference to python list
    // Iterate by const reference: copying each element would deep-copy its JSON subtree.
    for (const auto &column : columns) {
      auto key_name = column.find("name");
      if (key_name == column.end()) {
        MS_LOG(ERROR) << "Column's name is missing";
        return false;
      }
      if (!append_column(key_name->get<std::string>(), column)) {
        return false;
      }
    }
  } else if (columns.type() == nlohmann::json::value_t::object) {
    for (const auto &it_child : columns.items()) {
      if (!append_column(it_child.key(), it_child.value())) {
        return false;
      }
    }
  } else {
    MS_LOG(ERROR) << "columns must be dict or list, columns contain name, type, shape(optional).";
    return false;
  }
  return true;
}
|
||||
|
||||
// Populate this schema from a parsed JSON object.
// Recognized top-level fields are "datasetType", "numRows" and "columns"; any other field is
// rejected. After all fields are consumed the schema must contain at least one column and a
// positive row count. Returns false, after logging, on the first violation.
bool SchemaObj::from_json(nlohmann::json json_obj) {
  for (const auto &entry : json_obj.items()) {
    const auto &field = entry.key();
    if (field == "columns") {
      if (!parse_column(entry.value())) {
        MS_LOG(ERROR) << "parse columns failed";
        return false;
      }
      continue;
    }
    if (field == "numRows") {
      num_rows_ = entry.value();
      continue;
    }
    if (field == "datasetType") {
      dataset_type_ = entry.value();
      continue;
    }
    MS_LOG(ERROR) << "Unknown field " << entry.key();
    return false;
  }

  if (columns_.empty()) {
    MS_LOG(ERROR) << "Columns are missing.";
    return false;
  }

  const bool has_rows = num_rows_ > 0;
  if (!has_rows) {
    MS_LOG(ERROR) << "numRows must be greater than 0";
    return false;
  }
  return true;
}
|
||||
|
||||
// OTHER FUNCTIONS
|
||||
// (In alphabetical order)
|
||||
|
||||
|
@ -864,6 +1031,67 @@ std::vector<std::shared_ptr<DatasetOp>> MnistDataset::Build() {
|
|||
return node_ops;
|
||||
}
|
||||
|
||||
// ValideParams for RandomDataset
|
||||
bool RandomDataset::ValidateParams() {
|
||||
if (total_rows_ < 0) {
|
||||
MS_LOG(ERROR) << "RandomDataset: total_rows must be greater than 0, now get " << total_rows_;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int32_t RandomDataset::GenRandomInt(int32_t min, int32_t max) {
|
||||
std::uniform_int_distribution<int32_t> uniDist(min, max);
|
||||
return uniDist(rand_gen_);
|
||||
}
|
||||
|
||||
// Build for RandomDataset
|
||||
std::vector<std::shared_ptr<DatasetOp>> RandomDataset::Build() {
|
||||
// A vector containing shared pointer to the Dataset Ops that this object will create
|
||||
std::vector<std::shared_ptr<DatasetOp>> node_ops;
|
||||
|
||||
rand_gen_.seed(GetSeed()); // seed the random generator
|
||||
// If total rows was not given, then randomly pick a number
|
||||
std::shared_ptr<SchemaObj> schema_obj;
|
||||
if (!schema_path_.empty()) schema_obj = std::make_shared<SchemaObj>(schema_path_);
|
||||
|
||||
if (schema_obj != nullptr && total_rows_ == 0) {
|
||||
total_rows_ = schema_obj->get_num_rows();
|
||||
}
|
||||
|
||||
// If user does not specify Sampler, create a default sampler based on the shuffle variable.
|
||||
if (sampler_ == nullptr) {
|
||||
sampler_ = CreateDefaultSampler();
|
||||
}
|
||||
|
||||
std::string schema_json_string, schema_file_path;
|
||||
if (schema_ != nullptr) {
|
||||
schema_->set_dataset_type("Random");
|
||||
if (total_rows_ != 0) {
|
||||
schema_->set_num_rows(total_rows_);
|
||||
}
|
||||
schema_json_string = schema_->to_json();
|
||||
} else {
|
||||
schema_file_path = schema_path_;
|
||||
}
|
||||
|
||||
std::unique_ptr<DataSchema> data_schema;
|
||||
std::vector<std::string> columns_to_load;
|
||||
if (!schema_file_path.empty() || !schema_json_string.empty()) {
|
||||
data_schema = std::make_unique<DataSchema>();
|
||||
if (!schema_file_path.empty()) {
|
||||
data_schema->LoadSchemaFile(schema_file_path, columns_to_load);
|
||||
} else if (!schema_json_string.empty()) {
|
||||
data_schema->LoadSchemaString(schema_json_string, columns_to_load);
|
||||
}
|
||||
}
|
||||
std::shared_ptr<RandomDataOp> op;
|
||||
op = std::make_shared<RandomDataOp>(num_workers_, connector_que_size_, rows_per_buffer_, total_rows_,
|
||||
std::move(data_schema), std::move(sampler_->Build()));
|
||||
node_ops.push_back(op);
|
||||
return node_ops;
|
||||
}
|
||||
|
||||
// Constructor for TextFileDataset
|
||||
TextFileDataset::TextFileDataset(std::vector<std::string> dataset_files, int32_t num_samples, ShuffleMode shuffle,
|
||||
int32_t num_shards, int32_t shard_id)
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
*/
|
||||
|
||||
#include "minddata/dataset/include/de_tensor.h"
|
||||
#include "minddata/dataset/include/type_id.h"
|
||||
#include "minddata/dataset/core/constants.h"
|
||||
#include "minddata/dataset/core/data_type.h"
|
||||
#include "mindspore/core/ir/dtype/type_id.h"
|
||||
|
@ -23,68 +24,6 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace tensor {
|
||||
// Translate a MindSpore TypeId into the corresponding dataset::DataType.
// Any TypeId without a case below yields DE_UNKNOWN.
dataset::DataType MSTypeToDEType(TypeId data_type) {
  auto de_type = dataset::DataType::DE_UNKNOWN;
  switch (data_type) {
    case kNumberTypeBool:
      de_type = dataset::DataType::DE_BOOL;
      break;
    case kNumberTypeInt8:
      de_type = dataset::DataType::DE_INT8;
      break;
    case kNumberTypeUInt8:
      de_type = dataset::DataType::DE_UINT8;
      break;
    case kNumberTypeInt16:
      de_type = dataset::DataType::DE_INT16;
      break;
    case kNumberTypeUInt16:
      de_type = dataset::DataType::DE_UINT16;
      break;
    case kNumberTypeInt32:
      de_type = dataset::DataType::DE_INT32;
      break;
    case kNumberTypeUInt32:
      de_type = dataset::DataType::DE_UINT32;
      break;
    case kNumberTypeInt64:
      de_type = dataset::DataType::DE_INT64;
      break;
    case kNumberTypeUInt64:
      de_type = dataset::DataType::DE_UINT64;
      break;
    case kNumberTypeFloat16:
      de_type = dataset::DataType::DE_FLOAT16;
      break;
    case kNumberTypeFloat32:
      de_type = dataset::DataType::DE_FLOAT32;
      break;
    case kNumberTypeFloat64:
      de_type = dataset::DataType::DE_FLOAT64;
      break;
    default:
      break;
  }
  return dataset::DataType(de_type);
}
|
||||
|
||||
// Translate a dataset::DataType into the corresponding MindSpore TypeId.
// Any DataType value without a case below yields kTypeUnknown.
TypeId DETypeToMSType(dataset::DataType data_type) {
  TypeId ms_type = kTypeUnknown;
  switch (data_type.value()) {
    case dataset::DataType::DE_BOOL:
      ms_type = mindspore::TypeId::kNumberTypeBool;
      break;
    case dataset::DataType::DE_INT8:
      ms_type = mindspore::TypeId::kNumberTypeInt8;
      break;
    case dataset::DataType::DE_UINT8:
      ms_type = mindspore::TypeId::kNumberTypeUInt8;
      break;
    case dataset::DataType::DE_INT16:
      ms_type = mindspore::TypeId::kNumberTypeInt16;
      break;
    case dataset::DataType::DE_UINT16:
      ms_type = mindspore::TypeId::kNumberTypeUInt16;
      break;
    case dataset::DataType::DE_INT32:
      ms_type = mindspore::TypeId::kNumberTypeInt32;
      break;
    case dataset::DataType::DE_UINT32:
      ms_type = mindspore::TypeId::kNumberTypeUInt32;
      break;
    case dataset::DataType::DE_INT64:
      ms_type = mindspore::TypeId::kNumberTypeInt64;
      break;
    case dataset::DataType::DE_UINT64:
      ms_type = mindspore::TypeId::kNumberTypeUInt64;
      break;
    case dataset::DataType::DE_FLOAT16:
      ms_type = mindspore::TypeId::kNumberTypeFloat16;
      break;
    case dataset::DataType::DE_FLOAT32:
      ms_type = mindspore::TypeId::kNumberTypeFloat32;
      break;
    case dataset::DataType::DE_FLOAT64:
      ms_type = mindspore::TypeId::kNumberTypeFloat64;
      break;
    default:
      break;
  }
  return ms_type;
}
|
||||
|
||||
// Allocate a new DETensor with the given element type and shape and hand it back through the
// MSTensor base pointer. The object is created with `new` and returned as a raw pointer.
// NOTE(review): presumably the caller is responsible for deleting it - confirm against callers.
MSTensor *DETensor::CreateTensor(TypeId data_type, const std::vector<int> &shape) {
  MSTensor *created = new DETensor(data_type, shape);
  return created;
}
|
||||
|
@ -100,7 +39,7 @@ DETensor::DETensor(TypeId data_type, const std::vector<int> &shape) {
|
|||
t_shape.reserve(shape.size());
|
||||
std::transform(shape.begin(), shape.end(), std::back_inserter(t_shape),
|
||||
[](int s) -> dataset::dsize_t { return static_cast<dataset::dsize_t>(s); });
|
||||
dataset::Tensor::CreateEmpty(dataset::TensorShape(t_shape), MSTypeToDEType(data_type), &this->tensor_impl_);
|
||||
dataset::Tensor::CreateEmpty(dataset::TensorShape(t_shape), dataset::MSTypeToDEType(data_type), &this->tensor_impl_);
|
||||
}
|
||||
|
||||
// Wrap an existing dataset tensor: the shared pointer passed in is moved into tensor_impl_.
DETensor::DETensor(std::shared_ptr<dataset::Tensor> tensor_ptr) {
  tensor_impl_ = std::move(tensor_ptr);
}
|
||||
|
@ -120,14 +59,14 @@ std::shared_ptr<dataset::Tensor> DETensor::tensor() const {
|
|||
|
||||
// Report the element type of the wrapped tensor as a MindSpore TypeId.
// The conversion helper is called with its dataset:: qualification; a second, unreachable
// return statement that followed the first one has been dropped.
TypeId DETensor::data_type() const {
  MS_ASSERT(this->tensor_impl_ != nullptr);
  return dataset::DETypeToMSType(this->tensor_impl_->type());
}
|
||||
|
||||
TypeId DETensor::set_data_type(TypeId data_type) {
|
||||
MS_ASSERT(this->tensor_impl_ != nullptr);
|
||||
if (data_type != this->data_type()) {
|
||||
std::shared_ptr<dataset::Tensor> temp;
|
||||
dataset::Tensor::CreateFromMemory(this->tensor_impl_->shape(), MSTypeToDEType(data_type),
|
||||
dataset::Tensor::CreateFromMemory(this->tensor_impl_->shape(), dataset::MSTypeToDEType(data_type),
|
||||
this->tensor_impl_->GetBuffer(), &temp);
|
||||
this->tensor_impl_ = temp;
|
||||
}
|
||||
|
|
|
@ -50,13 +50,6 @@ Status RandomDataOp::Builder::Build(std::shared_ptr<RandomDataOp> *out_op) {
|
|||
std::make_shared<RandomDataOp>(builder_num_workers_, builder_op_connector_size_, builder_rows_per_buffer_,
|
||||
builder_total_rows_, std::move(builder_data_schema_), std::move(builder_sampler_));
|
||||
|
||||
// If the user did not provide a schema, then we will ask the op to generate a pseudo-random
|
||||
// schema.
|
||||
// See details of generateSchema function to learn what type of schema it will create.
|
||||
if ((*out_op)->data_schema_ == nullptr) {
|
||||
RETURN_IF_NOT_OK((*out_op)->GenerateSchema());
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
@ -85,6 +78,12 @@ RandomDataOp::RandomDataOp(int32_t num_workers, int32_t op_connector_size, int64
|
|||
if (total_rows_ == 0) {
|
||||
total_rows_ = GenRandomInt(1, kMaxTotalRows);
|
||||
}
|
||||
// If the user did not provide a schema, then we will ask the op to generate a pseudo-random
|
||||
// schema.
|
||||
// See details of generateSchema function to learn what type of schema it will create.
|
||||
if (data_schema_ == nullptr) {
|
||||
GenerateSchema();
|
||||
}
|
||||
// Everyone is already out from the sync area.
|
||||
all_out_.Set();
|
||||
}
|
||||
|
@ -106,11 +105,7 @@ void RandomDataOp::Print(std::ostream &out, bool show_all) const {
|
|||
}
|
||||
|
||||
// Helper function to produce a default/random schema if one didn't exist
|
||||
Status RandomDataOp::GenerateSchema() {
|
||||
if (data_schema_ != nullptr) {
|
||||
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Generating a schema but one already exists!");
|
||||
}
|
||||
|
||||
void RandomDataOp::GenerateSchema() {
|
||||
// To randomly create a schema, we need to choose:
|
||||
// a) how many columns
|
||||
// b) the type of each column
|
||||
|
@ -144,8 +139,6 @@ Status RandomDataOp::GenerateSchema() {
|
|||
|
||||
data_schema_->AddColumn(*newCol);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Class functor operator () override.
|
||||
|
|
|
@ -213,9 +213,8 @@ class RandomDataOp : public ParallelOp {
|
|||
|
||||
/**
|
||||
* Helper function to produce a default/random schema if one didn't exist
|
||||
@return Status - The error code return
|
||||
*/
|
||||
Status GenerateSchema();
|
||||
void GenerateSchema();
|
||||
|
||||
/**
|
||||
* Performs a synchronization between workers at the end of an epoch
|
||||
|
|
|
@ -24,9 +24,11 @@
|
|||
#include <utility>
|
||||
#include <string>
|
||||
#include "minddata/dataset/core/constants.h"
|
||||
#include "minddata/dataset/engine/data_schema.h"
|
||||
#include "minddata/dataset/include/tensor.h"
|
||||
#include "minddata/dataset/include/iterator.h"
|
||||
#include "minddata/dataset/include/samplers.h"
|
||||
#include "minddata/dataset/include/type_id.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
@ -40,6 +42,7 @@ class TensorShape;
|
|||
namespace api {
|
||||
|
||||
class TensorOperation;
|
||||
class SchemaObj;
|
||||
class SamplerObj;
|
||||
// Datasets classes (in alphabetical order)
|
||||
class CelebADataset;
|
||||
|
@ -49,6 +52,7 @@ class CLUEDataset;
|
|||
class CocoDataset;
|
||||
class ImageFolderDataset;
|
||||
class MnistDataset;
|
||||
class RandomDataset;
|
||||
class TextFileDataset;
|
||||
class VOCDataset;
|
||||
// Dataset Op classes (in alphabetical order)
|
||||
|
@ -63,6 +67,11 @@ class SkipDataset;
|
|||
class TakeDataset;
|
||||
class ZipDataset;
|
||||
|
||||
/// \brief Function to create a SchemaObj
|
||||
/// \param[in] schema_file Path of schema file
|
||||
/// \return Shared pointer to the current schema
|
||||
std::shared_ptr<SchemaObj> Schema(const std::string &schema_file = "");
|
||||
|
||||
/// \brief Function to create a CelebADataset
|
||||
/// \notes The generated dataset has two columns ['image', 'attr'].
|
||||
// The type of the image tensor is uint8. The attr tensor is uint32 and one hot type.
|
||||
|
@ -167,6 +176,21 @@ std::shared_ptr<MnistDataset> Mnist(std::string dataset_dir, std::shared_ptr<Sam
|
|||
std::shared_ptr<ConcatDataset> operator+(const std::shared_ptr<Dataset> &datasets1,
|
||||
const std::shared_ptr<Dataset> &datasets2);
|
||||
|
||||
/// \brief Function to create a RandomDataset
|
||||
/// \param[in] total_rows Number of rows for the dataset to generate (default=0, number of rows is random)
|
||||
/// \param[in] schema SchemaObj to set column type, data type and data shape
|
||||
/// \param[in] columns_list List of columns to be read (default=None, read all columns)
|
||||
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
|
||||
/// will be used to randomly iterate the entire dataset
|
||||
/// \return Shared pointer to the current Dataset
|
||||
template <typename T = std::shared_ptr<SchemaObj>>
|
||||
std::shared_ptr<RandomDataset> RandomData(const int32_t &total_rows = 0, T schema = nullptr,
|
||||
std::vector<std::string> columns_list = {},
|
||||
std::shared_ptr<SamplerObj> sampler = nullptr) {
|
||||
auto ds = std::make_shared<RandomDataset>(total_rows, schema, std::move(columns_list), std::move(sampler));
|
||||
return ds->ValidateParams() ? ds : nullptr;
|
||||
}
|
||||
|
||||
/// \brief Function to create a TextFileDataset
|
||||
/// \notes The generated dataset has one column ['text']
|
||||
/// \param[in] dataset_files List of files to be read to search for a pattern of files. The list
|
||||
|
@ -335,6 +359,66 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
|
|||
int32_t worker_connector_size_;
|
||||
};
|
||||
|
||||
class SchemaObj {
|
||||
public:
|
||||
/// \brief Constructor
|
||||
explicit SchemaObj(const std::string &schema_file = "");
|
||||
|
||||
/// \brief Destructor
|
||||
~SchemaObj() = default;
|
||||
|
||||
/// \brief SchemaObj init function
|
||||
/// \return bool true if schema init success
|
||||
bool init();
|
||||
|
||||
/// \brief Add new column to the schema
|
||||
/// \param[in] name name of the column.
|
||||
/// \param[in] de_type data type of the column(TypeId).
|
||||
/// \param[in] shape shape of the column.
|
||||
/// \return bool true if schema init success
|
||||
bool add_column(std::string name, TypeId de_type, std::vector<int32_t> shape);
|
||||
|
||||
/// \brief Add new column to the schema
|
||||
/// \param[in] name name of the column.
|
||||
/// \param[in] de_type data type of the column(std::string).
|
||||
/// \param[in] shape shape of the column.
|
||||
/// \return bool true if schema init success
|
||||
bool add_column(std::string name, std::string de_type, std::vector<int32_t> shape);
|
||||
|
||||
/// \brief Get a JSON string of the schema
|
||||
/// \return JSON string of the schema
|
||||
std::string to_json();
|
||||
|
||||
/// \brief Get a JSON string of the schema
|
||||
std::string to_string() { return to_json(); }
|
||||
|
||||
/// \brief set a new value to dataset_type
|
||||
inline void set_dataset_type(std::string dataset_type) { dataset_type_ = dataset_type; }
|
||||
|
||||
/// \brief set a new value to num_rows
|
||||
inline void set_num_rows(int32_t num_rows) { num_rows_ = num_rows; }
|
||||
|
||||
/// \brief get the current num_rows
|
||||
inline int32_t get_num_rows() { return num_rows_; }
|
||||
|
||||
private:
|
||||
/// \brief Parse the columns and add it to columns
|
||||
/// \param[in] columns dataset attribution information, decoded from schema file.
|
||||
/// support both nlohmann::json::value_t::array and nlohmann::json::value_t::onject.
|
||||
/// \return JSON string of the schema
|
||||
bool parse_column(nlohmann::json columns);
|
||||
|
||||
/// \brief Get schema file from json file
|
||||
/// \param[in] json_obj object of json parsed.
|
||||
/// \return bool true if json dump success
|
||||
bool from_json(nlohmann::json json_obj);
|
||||
|
||||
int32_t num_rows_;
|
||||
std::string dataset_type_;
|
||||
std::string schema_file_;
|
||||
nlohmann::json columns_;
|
||||
};
|
||||
|
||||
/* ####################################### Derived Dataset classes ################################# */
|
||||
|
||||
// DERIVED DATASET CLASSES FOR LEAF-NODE DATASETS
|
||||
|
@ -517,6 +601,53 @@ class MnistDataset : public Dataset {
|
|||
std::shared_ptr<SamplerObj> sampler_;
|
||||
};
|
||||
|
||||
class RandomDataset : public Dataset {
|
||||
public:
|
||||
// Some constants to provide limits to random generation.
|
||||
static constexpr int32_t kMaxNumColumns = 4;
|
||||
static constexpr int32_t kMaxRank = 4;
|
||||
static constexpr int32_t kMaxDimValue = 32;
|
||||
|
||||
/// \brief Constructor
|
||||
RandomDataset(const int32_t &total_rows, std::shared_ptr<SchemaObj> schema, std::vector<std::string> columns_list,
|
||||
std::shared_ptr<SamplerObj> sampler)
|
||||
: total_rows_(total_rows),
|
||||
schema_path_(""),
|
||||
schema_(std::move(schema)),
|
||||
columns_list_(columns_list),
|
||||
sampler_(std::move(sampler)) {}
|
||||
|
||||
/// \brief Constructor
|
||||
RandomDataset(const int32_t &total_rows, std::string schema_path, std::vector<std::string> columns_list,
|
||||
std::shared_ptr<SamplerObj> sampler)
|
||||
: total_rows_(total_rows), schema_path_(schema_path), columns_list_(columns_list), sampler_(std::move(sampler)) {}
|
||||
|
||||
/// \brief Destructor
|
||||
~RandomDataset() = default;
|
||||
|
||||
/// \brief a base class override function to create the required runtime dataset op objects for this class
|
||||
/// \return The list of shared pointers to the newly created DatasetOps
|
||||
std::vector<std::shared_ptr<DatasetOp>> Build() override;
|
||||
|
||||
/// \brief Parameters validation
|
||||
/// \return bool true if all the params are valid
|
||||
bool ValidateParams() override;
|
||||
|
||||
private:
|
||||
/// \brief A quick inline for producing a random number between (and including) min/max
|
||||
/// \param[in] min minimum number that can be generated.
|
||||
/// \param[in] max maximum number that can be generated.
|
||||
/// \return The generated random number
|
||||
int32_t GenRandomInt(int32_t min, int32_t max);
|
||||
|
||||
int32_t total_rows_;
|
||||
std::string schema_path_;
|
||||
std::shared_ptr<SchemaObj> schema_;
|
||||
std::vector<std::string> columns_list_;
|
||||
std::shared_ptr<SamplerObj> sampler_;
|
||||
std::mt19937 rand_gen_;
|
||||
};
|
||||
|
||||
/// \class TextFileDataset
|
||||
/// \brief A Dataset derived class to represent TextFile dataset
|
||||
class TextFileDataset : public Dataset {
|
||||
|
|
|
@ -0,0 +1,88 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
|
||||
|
||||
#include "minddata/dataset/core/data_type.h"
|
||||
#include "mindspore/core/ir/dtype/type_id.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
// Translate a MindSpore TypeId into the corresponding dataset::DataType.
// Any TypeId without a case below yields DE_UNKNOWN.
inline dataset::DataType MSTypeToDEType(TypeId data_type) {
  auto de_type = dataset::DataType::DE_UNKNOWN;
  switch (data_type) {
    case kNumberTypeBool:
      de_type = dataset::DataType::DE_BOOL;
      break;
    case kNumberTypeInt8:
      de_type = dataset::DataType::DE_INT8;
      break;
    case kNumberTypeUInt8:
      de_type = dataset::DataType::DE_UINT8;
      break;
    case kNumberTypeInt16:
      de_type = dataset::DataType::DE_INT16;
      break;
    case kNumberTypeUInt16:
      de_type = dataset::DataType::DE_UINT16;
      break;
    case kNumberTypeInt32:
      de_type = dataset::DataType::DE_INT32;
      break;
    case kNumberTypeUInt32:
      de_type = dataset::DataType::DE_UINT32;
      break;
    case kNumberTypeInt64:
      de_type = dataset::DataType::DE_INT64;
      break;
    case kNumberTypeUInt64:
      de_type = dataset::DataType::DE_UINT64;
      break;
    case kNumberTypeFloat16:
      de_type = dataset::DataType::DE_FLOAT16;
      break;
    case kNumberTypeFloat32:
      de_type = dataset::DataType::DE_FLOAT32;
      break;
    case kNumberTypeFloat64:
      de_type = dataset::DataType::DE_FLOAT64;
      break;
    default:
      break;
  }
  return dataset::DataType(de_type);
}
|
||||
|
||||
// Translate a dataset::DataType into the corresponding MindSpore TypeId.
// Any DataType value without a case below yields kTypeUnknown.
inline TypeId DETypeToMSType(dataset::DataType data_type) {
  TypeId ms_type = kTypeUnknown;
  switch (data_type.value()) {
    case dataset::DataType::DE_BOOL:
      ms_type = mindspore::TypeId::kNumberTypeBool;
      break;
    case dataset::DataType::DE_INT8:
      ms_type = mindspore::TypeId::kNumberTypeInt8;
      break;
    case dataset::DataType::DE_UINT8:
      ms_type = mindspore::TypeId::kNumberTypeUInt8;
      break;
    case dataset::DataType::DE_INT16:
      ms_type = mindspore::TypeId::kNumberTypeInt16;
      break;
    case dataset::DataType::DE_UINT16:
      ms_type = mindspore::TypeId::kNumberTypeUInt16;
      break;
    case dataset::DataType::DE_INT32:
      ms_type = mindspore::TypeId::kNumberTypeInt32;
      break;
    case dataset::DataType::DE_UINT32:
      ms_type = mindspore::TypeId::kNumberTypeUInt32;
      break;
    case dataset::DataType::DE_INT64:
      ms_type = mindspore::TypeId::kNumberTypeInt64;
      break;
    case dataset::DataType::DE_UINT64:
      ms_type = mindspore::TypeId::kNumberTypeUInt64;
      break;
    case dataset::DataType::DE_FLOAT16:
      ms_type = mindspore::TypeId::kNumberTypeFloat16;
      break;
    case dataset::DataType::DE_FLOAT32:
      ms_type = mindspore::TypeId::kNumberTypeFloat32;
      break;
    case dataset::DataType::DE_FLOAT64:
      ms_type = mindspore::TypeId::kNumberTypeFloat64;
      break;
    default:
      break;
  }
  return ms_type;
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
|
|
@ -100,6 +100,7 @@ SET(DE_UT_SRCS
|
|||
c_api_dataset_clue_test.cc
|
||||
c_api_dataset_coco_test.cc
|
||||
c_api_dataset_filetext_test.cc
|
||||
c_api_dataset_randomdata_test.cc
|
||||
c_api_dataset_voc_test.cc
|
||||
c_api_datasets_test.cc
|
||||
c_api_dataset_iterator_test.cc
|
||||
|
|
|
@ -0,0 +1,271 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "common/common.h"
|
||||
#include "minddata/dataset/include/datasets.h"
|
||||
#include "minddata/dataset/core/config_manager.h"
|
||||
#include "minddata/dataset/core/global_context.h"
|
||||
|
||||
#include "mindspore/core/ir/dtype/type_id.h"
|
||||
|
||||
using namespace mindspore::dataset;
|
||||
using namespace mindspore::dataset::api;
|
||||
using mindspore::dataset::Tensor;
|
||||
using mindspore::dataset::TensorShape;
|
||||
using mindspore::dataset::DataType;
|
||||
|
||||
// Test fixture for the pipeline tests in this file; it adds nothing to UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {};
|
||||
|
||||
// RandomData with an explicit two-column schema: 50 rows repeated 4 times must yield 200 rows.
TEST_F(MindDataTestPipeline, TestRandomDatasetBasic1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetBasic1.";

  // Create a RandomDataset
  // Pointers that are dereferenced right afterwards are checked with ASSERT_NE: EXPECT_NE would
  // record the failure and then carry on into a null dereference.
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_NE(schema, nullptr);
  schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {2});
  schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1});
  std::shared_ptr<Dataset> ds = RandomData(50, schema);
  ASSERT_NE(ds, nullptr);

  ds = ds->SetNumWorkers(4);
  ASSERT_NE(ds, nullptr);

  // Create a Repeat operation on ds
  ds = ds->Repeat(4);
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // Check if RandomDataOp read correct columns
  uint64_t i = 0;
  while (row.size() != 0) {
    auto image = row["image"];
    auto label = row["label"];
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    MS_LOG(INFO) << "Tensor label shape: " << label->shape();

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 200);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
// RandomData without a schema: 10 rows repeated twice must yield 20 rows.
TEST_F(MindDataTestPipeline, TestRandomDatasetBasic2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetBasic2.";

  // Create a RandomDataset
  // Pointers that are dereferenced right afterwards are checked with ASSERT_NE: EXPECT_NE would
  // record the failure and then carry on into a null dereference.
  std::shared_ptr<Dataset> ds = RandomData(10);
  ASSERT_NE(ds, nullptr);

  ds = ds->SetNumWorkers(1);
  ASSERT_NE(ds, nullptr);

  // Create a Repeat operation on ds
  ds = ds->Repeat(2);
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // No schema was given, so the columns are whatever the op generates and their names are not
  // known to this test. Looking up "image"/"label" with row[...] would default-insert null
  // pointers and dereference them, so only the number of rows is checked here.
  uint64_t i = 0;
  while (row.size() != 0) {
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 20);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
/// \brief Builds a RandomData dataset from a SchemaObj loaded from a JSON schema file,
///     repeats it twice and validates shape, rank and type of every schema column
///     on each row, as well as the total number of rows produced.
TEST_F(MindDataTestPipeline, TestRandomDatasetBasic3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetBasic3.";

  // Restores the global seed when the test body exits. This also covers the early
  // return taken by a failed ASSERT_*, so the fixed seed never leaks into other tests.
  struct SeedRestorer {
    uint32_t seed;
    ~SeedRestorer() { GlobalContext::config_manager()->set_seed(seed); }
  };
  const SeedRestorer seed_restorer{GlobalContext::config_manager()->seed()};
  // Fixed seed: the expected row count checked at the end was obtained with this
  // seed (presumably the row count is drawn at random when total_rows is 0 -- TODO confirm).
  GlobalContext::config_manager()->set_seed(246);

  // Create a RandomDataset from a schema object
  const std::string schema_file = datasets_root_path_ + "/testTFTestAllTypes/datasetSchema.json";
  std::shared_ptr<SchemaObj> schema = Schema(schema_file);
  // Schema() returns nullptr when the schema file cannot be loaded; stop here
  // instead of silently building a dataset without the expected columns.
  ASSERT_NE(schema, nullptr);
  std::shared_ptr<Dataset> ds = RandomData(0, schema);
  ASSERT_NE(ds, nullptr);

  // Create a Repeat operation on ds
  ds = ds->Repeat(2);
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // Check if RandomDataOp read correct columns
  uint64_t i = 0;
  while (!row.empty()) {
    auto col_sint16 = row["col_sint16"];
    auto col_sint32 = row["col_sint32"];
    auto col_sint64 = row["col_sint64"];
    auto col_float = row["col_float"];
    auto col_1d = row["col_1d"];
    auto col_2d = row["col_2d"];
    auto col_3d = row["col_3d"];
    auto col_binary = row["col_binary"];

    // validate shape
    ASSERT_EQ(col_sint16->shape(), TensorShape({1}));
    ASSERT_EQ(col_sint32->shape(), TensorShape({1}));
    ASSERT_EQ(col_sint64->shape(), TensorShape({1}));
    ASSERT_EQ(col_float->shape(), TensorShape({1}));
    ASSERT_EQ(col_1d->shape(), TensorShape({2}));
    ASSERT_EQ(col_2d->shape(), TensorShape({2, 2}));
    ASSERT_EQ(col_3d->shape(), TensorShape({2, 2, 2}));
    ASSERT_EQ(col_binary->shape(), TensorShape({1}));

    // validate Rank
    ASSERT_EQ(col_sint16->Rank(), 1);
    ASSERT_EQ(col_sint32->Rank(), 1);
    ASSERT_EQ(col_sint64->Rank(), 1);
    ASSERT_EQ(col_float->Rank(), 1);
    ASSERT_EQ(col_1d->Rank(), 1);
    ASSERT_EQ(col_2d->Rank(), 2);
    ASSERT_EQ(col_3d->Rank(), 3);
    ASSERT_EQ(col_binary->Rank(), 1);

    // validate type
    ASSERT_EQ(col_sint16->type(), DataType::DE_INT16);
    ASSERT_EQ(col_sint32->type(), DataType::DE_INT32);
    ASSERT_EQ(col_sint64->type(), DataType::DE_INT64);
    ASSERT_EQ(col_float->type(), DataType::DE_FLOAT32);
    ASSERT_EQ(col_1d->type(), DataType::DE_INT64);
    ASSERT_EQ(col_2d->type(), DataType::DE_INT64);
    ASSERT_EQ(col_3d->type(), DataType::DE_INT64);
    ASSERT_EQ(col_binary->type(), DataType::DE_UINT8);

    iter->GetNextRow(&row);
    i++;
  }

  // Total rows over both repeats with seed 246
  EXPECT_EQ(i, 984);

  // Manually terminate the pipeline
  iter->Stop();
  // The previous seed is restored by seed_restorer when it goes out of scope.
}
|
||||
|
||||
/// \brief Builds a RandomData dataset directly from the path of a JSON schema file,
///     repeats it twice and validates shape, rank and type of every schema column
///     on each row, as well as the total number of rows produced.
TEST_F(MindDataTestPipeline, TestRandomDatasetBasic4) {
  // The log line names this test (it previously reported "Basic3" by copy-paste).
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetBasic4.";

  // Restores the global seed when the test body exits. This also covers the early
  // return taken by a failed ASSERT_*, so the fixed seed never leaks into other tests.
  struct SeedRestorer {
    uint32_t seed;
    ~SeedRestorer() { GlobalContext::config_manager()->set_seed(seed); }
  };
  const SeedRestorer seed_restorer{GlobalContext::config_manager()->seed()};
  // Fixed seed: the expected row count checked at the end was obtained with this
  // seed (presumably the row count is drawn at random when total_rows is 0 -- TODO confirm).
  GlobalContext::config_manager()->set_seed(246);

  // Create a RandomDataset from the schema file path
  const std::string schema_file = datasets_root_path_ + "/testTFTestAllTypes/datasetSchema.json";
  std::shared_ptr<Dataset> ds = RandomData(0, schema_file);
  // ASSERT_* (fatal) because the pointer is dereferenced on the next statement.
  ASSERT_NE(ds, nullptr);

  // Create a Repeat operation on ds
  ds = ds->Repeat(2);
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // Check if RandomDataOp read correct columns
  uint64_t i = 0;
  while (!row.empty()) {
    auto col_sint16 = row["col_sint16"];
    auto col_sint32 = row["col_sint32"];
    auto col_sint64 = row["col_sint64"];
    auto col_float = row["col_float"];
    auto col_1d = row["col_1d"];
    auto col_2d = row["col_2d"];
    auto col_3d = row["col_3d"];
    auto col_binary = row["col_binary"];

    // validate shape
    ASSERT_EQ(col_sint16->shape(), TensorShape({1}));
    ASSERT_EQ(col_sint32->shape(), TensorShape({1}));
    ASSERT_EQ(col_sint64->shape(), TensorShape({1}));
    ASSERT_EQ(col_float->shape(), TensorShape({1}));
    ASSERT_EQ(col_1d->shape(), TensorShape({2}));
    ASSERT_EQ(col_2d->shape(), TensorShape({2, 2}));
    ASSERT_EQ(col_3d->shape(), TensorShape({2, 2, 2}));
    ASSERT_EQ(col_binary->shape(), TensorShape({1}));

    // validate Rank
    ASSERT_EQ(col_sint16->Rank(), 1);
    ASSERT_EQ(col_sint32->Rank(), 1);
    ASSERT_EQ(col_sint64->Rank(), 1);
    ASSERT_EQ(col_float->Rank(), 1);
    ASSERT_EQ(col_1d->Rank(), 1);
    ASSERT_EQ(col_2d->Rank(), 2);
    ASSERT_EQ(col_3d->Rank(), 3);
    ASSERT_EQ(col_binary->Rank(), 1);

    // validate type
    ASSERT_EQ(col_sint16->type(), DataType::DE_INT16);
    ASSERT_EQ(col_sint32->type(), DataType::DE_INT32);
    ASSERT_EQ(col_sint64->type(), DataType::DE_INT64);
    ASSERT_EQ(col_float->type(), DataType::DE_FLOAT32);
    ASSERT_EQ(col_1d->type(), DataType::DE_INT64);
    ASSERT_EQ(col_2d->type(), DataType::DE_INT64);
    ASSERT_EQ(col_3d->type(), DataType::DE_INT64);
    ASSERT_EQ(col_binary->type(), DataType::DE_UINT8);

    iter->GetNextRow(&row);
    i++;
  }

  // Total rows over both repeats with seed 246
  EXPECT_EQ(i, 984);

  // Manually terminate the pipeline
  iter->Stop();
  // The previous seed is restored by seed_restorer when it goes out of scope.
}
|
Loading…
Reference in New Issue