forked from mindspore-Ecosystem/mindspore
Random Data Op
This commit is contained in:
parent 05676676e9
commit 270bf831a9
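For orientation before the file-by-file changes: the commit exposes the new operator in Python as ds.RandomDataset. A minimal usage sketch, lifted from the Python test added at the end of this commit (the column names and sizes are just the test's choices):

import mindspore.common.dtype as mstype
import mindspore.dataset as ds

# Two-column schema; RandomDataOp fills both columns with random repeated bytes.
schema = ds.Schema()
schema.add_column('image', de_type=mstype.uint8, shape=[2])
schema.add_column('label', de_type=mstype.uint8, shape=[1])

data = ds.RandomDataset(schema=schema, num_samples=50, num_parallel_workers=4)
for row in data.create_dict_iterator():  # each row is a dict: column name -> value
    print(row["image"], row["label"])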
@@ -28,6 +28,7 @@
#include "dataset/engine/datasetops/source/manifest_op.h"
#include "dataset/engine/datasetops/source/cifar_op.h"
#include "dataset/engine/datasetops/source/celeba_op.h"
#include "dataset/engine/datasetops/source/random_data_op.h"
#include "dataset/engine/datasetops/source/text_file_op.h"
#include "dataset/engine/datasetops/filter_op.h"
#include "mindrecord/include/shard_category.h"
@@ -65,6 +66,7 @@ static std::unordered_map<uint32_t, pFunction> g_parse_op_func_ = {{kStorage, &D
    {kCifar10, &DEPipeline::ParseCifar10Op},
    {kCifar100, &DEPipeline::ParseCifar100Op},
    {kCelebA, &DEPipeline::ParseCelebAOp},
    {kRandomData, &DEPipeline::ParseRandomDataOp},
    {kTextFile, &DEPipeline::ParseTextFileOp}};

DEPipeline::DEPipeline() : iterator_(nullptr) {
@@ -972,6 +974,45 @@ Status DEPipeline::ParseCifar100Op(const py::dict &args, std::shared_ptr<Datase
  return Status::OK();
}

Status DEPipeline::ParseRandomDataOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
  RandomDataOp::Builder builder;

  // Required arguments
  if (args["num_samples"].is_none()) {
    std::string err_msg = "Error: num_samples is a required argument";
    RETURN_STATUS_UNEXPECTED(err_msg);
  }
  std::vector<std::string> columns_to_load;
  bool schema_exists = false;
  // Optional arguments
  for (auto arg : args) {
    std::string key = py::str(arg.first);
    py::handle value = arg.second;
    if (key == "num_parallel_workers") {
      (void)builder.SetNumWorkers(ToInt(value));
    } else if (key == "schema_file_path" || key == "schema_json_string") {
      schema_exists = true;
    } else if (key == "num_samples") {
      (void)builder.SetTotalRows(ToInt(value));
    } else if (key == "columns_list") {
      columns_to_load = ToStringVector(value);
    }
  }
  if (schema_exists) {
    std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
    if (args.contains("schema_file_path")) {
      RETURN_IF_NOT_OK(schema->LoadSchemaFile(ToString(args["schema_file_path"]), columns_to_load));
    } else {
      RETURN_IF_NOT_OK(schema->LoadSchemaString(ToString(args["schema_json_string"]), columns_to_load));
    }
    (void)builder.SetDataSchema(std::move(schema));
  }
  std::shared_ptr<RandomDataOp> op;
  RETURN_IF_NOT_OK(builder.Build(&op));
  *ptr = op;
  return Status::OK();
}

int32_t DEPipeline::GetNumClasses() const { return num_classes_; }

Status DEPipeline::ParseMnistOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
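The dict consumed above is produced by the Python layer's get_args() (see the RandomDataset class later in this commit). A sketch of its shape with illustrative values; at most one of schema_file_path / schema_json_string is present:

# Hypothetical args dict as it arrives at ParseRandomDataOp.
args = {
    "num_samples": 50,                         # required; becomes builder.SetTotalRows(...)
    "num_parallel_workers": 4,                 # optional; becomes builder.SetNumWorkers(...)
    "schema_file_path": "datasetSchema.json",  # or "schema_json_string": "{...}"
    "columns_list": ["image", "label"],        # optional filter applied when loading the schema
}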
@@ -60,6 +60,7 @@ enum OpName {
  kCifar10,
  kCifar100,
  kCelebA,
  kRandomData,
  kTextFile
};
@@ -142,6 +143,8 @@ class DEPipeline {

  Status ParseCifar100Op(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);

  Status ParseRandomDataOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);

  void PrintTree();

  int32_t GetNumClasses() const;
@@ -47,6 +47,7 @@
#include "dataset/engine/datasetops/source/mnist_op.h"
#include "dataset/engine/datasetops/source/manifest_op.h"
#include "dataset/engine/datasetops/source/mindrecord_op.h"
#include "dataset/engine/datasetops/source/random_data_op.h"
#include "dataset/engine/datasetops/source/sampler/distributed_sampler.h"
#include "dataset/engine/datasetops/source/sampler/pk_sampler.h"
#include "dataset/engine/datasetops/source/sampler/random_sampler.h"
@@ -489,6 +490,7 @@ PYBIND11_MODULE(_c_dataengine, m) {
    .value("VOC", OpName::kVoc)
    .value("CIFAR10", OpName::kCifar10)
    .value("CIFAR100", OpName::kCifar100)
    .value("RANDOMDATA", OpName::kRandomData)
    .value("CELEBA", OpName::kCelebA)
    .value("TEXTFILE", OpName::kTextFile);
@@ -466,5 +466,24 @@ Status DataSchema::PreLoadExceptionCheck(const nlohmann::json &js) {
                               "\"columns\" node is required in the schema json file.");
  return Status::OK();
}

// Loops through all columns in the schema and returns a map with the column
// name to column index number.
Status DataSchema::GetColumnNameMap(std::unordered_map<std::string, int32_t> *out_column_name_map) {
  if (out_column_name_map == nullptr) {
    return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
                  "unexpected null output column name map.");
  }

  for (int32_t i = 0; i < col_descs_.size(); ++i) {
    if (col_descs_[i].name().empty()) {
      return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
                    "Constructing column name map from schema, but found empty column name.");
    }
    (*out_column_name_map)[col_descs_[i].name()] = i;
  }

  return Status::OK();
}
}  // namespace dataset
}  // namespace mindspore
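The same name-to-index construction, sketched in Python for clarity (the column names are illustrative):

def column_name_map(column_names):
    # Map each column name to its index, rejecting empty names as GetColumnNameMap does.
    out = {}
    for i, name in enumerate(column_names):
        if not name:
            raise ValueError("Constructing column name map from schema, but found empty column name.")
        out[name] = i
    return out

print(column_name_map(["image", "label"]))  # {'image': 0, 'label': 1}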
@@ -20,6 +20,7 @@
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include <nlohmann/json.hpp>
#include "dataset/core/constants.h"
@@ -180,6 +181,12 @@ class DataSchema {

  static const char DEFAULT_DATA_SCHEMA_FILENAME[];

  // Loops through all columns in the schema and returns a map with the column
  // name to column index number.
  // @param out_column_name_map - The output map of column names to column index
  // @return Status - The error code return
  Status GetColumnNameMap(std::unordered_map<std::string, int32_t> *out_column_name_map);

 private:
  // Internal helper function. Parses the json schema file in any order and produces a schema that
  // does not follow any particular order (json standard does not enforce any ordering protocol).
@@ -17,6 +17,7 @@ add_library(engine-datasetops-source OBJECT
    ${FEATURE_SRCS}
    manifest_op.cc
    cifar_op.cc
    random_data_op.cc
    celeba_op.cc
    text_file_op.cc
)
@@ -0,0 +1,414 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dataset/engine/datasetops/source/random_data_op.h"
#include <iomanip>
#include <random>
#include "dataset/engine/execution_tree.h"
#include "dataset/core/config_manager.h"
#include "dataset/util/random.h"
#include "dataset/util/wait_post.h"

namespace mindspore {
namespace dataset {
// Builder constructor. Creates the builder object.
RandomDataOp::Builder::Builder()
    : builder_data_schema_(nullptr),
      builder_num_workers_(0),
      builder_op_connector_size_(0),
      builder_rows_per_buffer_(0),
      builder_total_rows_(0) {
  // Some arguments to the RandomDataOp have a default argument that is taken from the config.
  // The user may override these defaults by using the builder set methods.
  std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
  builder_rows_per_buffer_ = cfg->rows_per_buffer();
  builder_num_workers_ = cfg->num_parallel_workers();
  builder_op_connector_size_ = cfg->op_connector_size();
}
// The build method that produces the instantiated RandomDataOp as a shared pointer
Status RandomDataOp::Builder::Build(std::shared_ptr<RandomDataOp> *out_op) {
  RETURN_IF_NOT_OK(SanityCheck());

  *out_op = std::make_shared<RandomDataOp>(builder_num_workers_, builder_op_connector_size_, builder_rows_per_buffer_,
                                           builder_total_rows_, std::move(builder_data_schema_));

  // If the user did not provide a schema, then we will ask the op to generate a pseudo-random
  // schema.
  // See details of the GenerateSchema function to learn what type of schema it will create.
  if ((*out_op)->data_schema_ == nullptr) {
    RETURN_IF_NOT_OK((*out_op)->GenerateSchema());
  }

  // Extract the column name mapping from the schema and save it in the class.
  // This will be needed when constructing buffers.
  RETURN_IF_NOT_OK((*out_op)->data_schema_->GetColumnNameMap(&((*out_op)->column_name_map_)));

  return Status::OK();
}

// Check if the required parameters are set by the builder.
Status RandomDataOp::Builder::SanityCheck() const {
  // There are actually no required arguments for the random data op at all.
  // Some arguments are preset with global values from config, and if they are not given by the user
  // then we create them randomly. Leaving this function here for consistency with other operators.
  return Status::OK();
}
// Constructor for RandomDataOp
RandomDataOp::RandomDataOp(int32_t num_workers, int32_t op_connector_size, int64_t rows_per_buffer, int64_t total_rows,
                           std::unique_ptr<DataSchema> data_schema)
    : ParallelOp(num_workers, op_connector_size),
      buffer_id_(0),
      rows_per_buffer_(rows_per_buffer),
      total_rows_(total_rows),
      epoch_buffers_sent_(0),
      guys_in_(0),
      guys_out_(num_workers_),
      eoe_worker_id_(0),
      data_schema_(std::move(data_schema)) {
  rand_gen_.seed(GetSeed());  // seed the random generator
  // If total rows was not given, then randomly pick a number
  if (total_rows_ == 0) {
    total_rows_ = GenRandomInt(1, kMaxTotalRows);
  }
  // Everyone is already out from the sync area.
  all_out_.Set();
}
// A print method typically used for debugging
void RandomDataOp::Print(std::ostream &out, bool show_all) const {
  // Always show the id and name as the first line, regardless of whether this is a summary or detailed print
  out << "(" << std::setw(2) << operator_id_ << ") <RandomDataOp>:";
  if (!show_all) {
    // Call the super class for displaying any common 1-liner info
    ParallelOp::Print(out, show_all);
    // Then show any custom derived-internal 1-liner info for this op
    out << " [total rows: " << total_rows_ << "]\n";
  } else {
    // Call the super class for displaying any common detailed info
    ParallelOp::Print(out, show_all);
    // Then show any custom derived-internal stuff
    out << "\nTotal_rows: " << total_rows_
        << "\nRows per buffer: " << rows_per_buffer_
        << "\nSchema:\n" << *data_schema_ << "\n\n";
  }
}
// Helper function to produce a default/random schema if one didn't exist
Status RandomDataOp::GenerateSchema() {
  if (data_schema_ != nullptr) {
    return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Generating a schema but one already exists!");
  }

  // To randomly create a schema, we need to choose:
  // a) how many columns
  // b) the type of each column
  // c) the shape of each column (number of dimensions i.e. rank)
  // d) the shape of each column (dimension values)
  data_schema_ = std::make_unique<DataSchema>();
  std::unique_ptr<TensorShape> newShape;
  std::unique_ptr<ColDescriptor> newCol;

  // Loop over the number of chosen columns
  int32_t numColumns = GenRandomInt(1, kMaxNumColumns);
  for (int32_t i = 0; i < numColumns; i++) {
    // For each column:
    // - choose a datatype
    // - generate a shape that randomly chooses the number of dimensions and the dimension values.
    DataType::Type newType = static_cast<DataType::Type>(GenRandomInt(0, kMaxDataType));
    int32_t rank = GenRandomInt(1, kMaxRank);
    std::vector<dsize_t> dims;
    for (int32_t d = 0; d < rank; d++) {
      // 0 is not a valid dimension value. However, we can support "*" or unknown, so map the random
      // 0 value to the unknown attribute if 0 is chosen.
      dsize_t dim_value = static_cast<dsize_t>(GenRandomInt(0, kMaxDimValue));
      if (dim_value == 0) dim_value = TensorShape::kDimUnknown;
      dims.push_back(dim_value);
    }
    newShape = std::make_unique<TensorShape>(dims);

    // Create the column descriptor
    std::string colName = "c" + std::to_string(i);
    newCol = std::make_unique<ColDescriptor>(colName, DataType(newType), TensorImpl::kFlexible, rank, newShape.get());

    data_schema_->AddColumn(*newCol);
  }

  return Status::OK();
}
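A Python sketch of the same randomization logic, assuming the limits declared in random_data_op.h below (kMaxNumColumns = 4, kMaxRank = 4, kMaxDimValue = 2048) and using -1 as a stand-in for TensorShape::kDimUnknown:

import random

K_MAX_NUM_COLUMNS, K_MAX_RANK, K_MAX_DIM_VALUE = 4, 4, 2048
K_DIM_UNKNOWN = -1  # stand-in for TensorShape::kDimUnknown

def generate_schema(rng=random):
    # Choose a column count, then a rank and dimension values per column.
    columns = []
    for i in range(rng.randint(1, K_MAX_NUM_COLUMNS)):  # randint is inclusive, like GenRandomInt
        rank = rng.randint(1, K_MAX_RANK)
        # 0 is not a valid dimension, so a randomly drawn 0 maps to "unknown".
        dims = [rng.randint(0, K_MAX_DIM_VALUE) or K_DIM_UNKNOWN for _ in range(rank)]
        columns.append(("c" + str(i), dims))
    return columns

print(generate_schema())  # e.g. [('c0', [512, -1]), ('c1', [7, 2048, 33])]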
// Class functor operator () override.
// All DatasetOps operate by launching a thread (see ExecutionTree). This class functor will
// provide the master loop that drives the logic for performing the work.
Status RandomDataOp::operator()() {
  // First, compute how many buffers we'll need to satisfy the total row count.
  // The only reason we do this is for the purpose of throttling the worker count if needed.
  int64_t buffers_needed = total_rows_ / rows_per_buffer_;
  if (total_rows_ % rows_per_buffer_ != 0) {
    buffers_needed++;
  }

  // If the number of workers we have exceeds the number of buffers to produce, then we'll have
  // idle workers doing nothing. In that case, let's throttle the worker count.
  if (num_workers_ > buffers_needed) {
    MS_LOG(INFO) << "RandomDataOp throttling worker count from " << num_workers_ << " to " << buffers_needed;
    num_workers_ = buffers_needed;
    num_producers_ = num_workers_;
    guys_out_ = num_workers_;
    // The output connector was already created with a different worker count. We have to drop and recreate
    // that connector.
    DatasetOp::CreateConnector(num_producers_, num_workers_);
  }

  // Assign the number of rows to each worker in a round robin fashion.
  worker_max_rows_.reserve(num_workers_);
  worker_rows_packed_.reserve(num_workers_);
  // Init the counts to zero to start.
  for (int32_t w = 0; w < num_workers_; w++) {
    worker_max_rows_.push_back(0);
    worker_rows_packed_.push_back(0);
  }
  // Then assign round robin row counts.
  int32_t currentWorker = 0;
  for (int64_t r = 0; r < total_rows_; r++) {
    worker_max_rows_[currentWorker]++;
    currentWorker = (currentWorker + 1) % num_workers_;
  }

  // Next, compute the total buffer count. This stat is needed during reset logic.
  for (int32_t w = 0; w < num_workers_; w++) {
    int64_t worker_buffers = 0;
    worker_buffers = worker_max_rows_[w] / rows_per_buffer_;
    if (worker_max_rows_[w] % rows_per_buffer_ != 0) worker_buffers++;
    epoch_buffers_sent_ += worker_buffers;
  }

  // For the connector to work, we need to target the correct worker channel for the eoe.
  // This will initialize it for the first epoch. Reset() handles it for the rest of the epochs.
  eoe_worker_id_ = epoch_buffers_sent_ % num_workers_;
  epoch_buffers_sent_++;  // Add the eoe buffer to the count for subsequent epochs

  // RandomDataOp doesn't need the master thread to stay around. Kick off the workers and then the master exits.
  RETURN_IF_NOT_OK(
      tree_->LaunchWorkers(num_workers_, std::bind(&RandomDataOp::WorkerEntry, this, std::placeholders::_1)));

  // Required task group setup after launching workers.
  TaskManager::FindMe()->Post();
  RETURN_IF_NOT_OK(epoch_sync_wait_post_.Register(tree_->AllTasks()));

  return Status::OK();
}
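The bookkeeping above reduces to simple arithmetic; a worked sketch in Python (the worker and row counts are illustrative):

import math

total_rows, rows_per_buffer, num_workers = 25, 2, 4

# Throttle: never keep more workers than there are buffers to produce.
buffers_needed = math.ceil(total_rows / rows_per_buffer)
num_workers = min(num_workers, buffers_needed)

# Round robin row assignment.
worker_max_rows = [0] * num_workers
for r in range(total_rows):
    worker_max_rows[r % num_workers] += 1

# Per-epoch buffer count, then one more for the eoe buffer.
epoch_buffers_sent = sum(math.ceil(rows / rows_per_buffer) for rows in worker_max_rows)
eoe_worker_id = epoch_buffers_sent % num_workers
epoch_buffers_sent += 1

print(worker_max_rows, eoe_worker_id, epoch_buffers_sent)  # [7, 6, 6, 6] 1 14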
// Performs a synchronization between workers at the end of an epoch
Status RandomDataOp::EpochSync(int32_t worker_id, bool *quitting) {
  MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " syncing at end of epoch";

  // Sync on the guys_in counter.
  // We have to wait until the last guy is out.
  all_out_.Wait();
  // If we are not in a repeat loop, or that was the last repeat already, then set up our exit
  // condition from the master loop.
  if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) {
    *quitting = true;
  }

  auto prev = guys_in_.fetch_add(1);
  bool last_guy_in = (prev + 1) == num_workers_;
  // If we are the last worker to hit this sync point, we have some extra tasks
  if (last_guy_in) {
    MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " is the last one to sync. eoe sent as worker "
                 << eoe_worker_id_;
    // Prepare for sync
    all_out_.Clear();
    // Always flow eoe at the end
    std::unique_ptr<DataBuffer> eoe_buffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE);
    RETURN_IF_NOT_OK(out_connector_->Add(eoe_worker_id_, std::move(eoe_buffer)));
    // If we're done then also flow the eof
    if (*quitting) {
      // The eof needs to be sent from the next sender in the round robin, so +1
      int32_t eof_worker_id = (eoe_worker_id_ + 1) % num_workers_;
      MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " has no more epochs. sending eof as worker "
                   << eof_worker_id;
      std::unique_ptr<DataBuffer> eof_buffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF);
      RETURN_IF_NOT_OK(out_connector_->Add(eof_worker_id, std::move(eof_buffer)));
    }
  }

  // Wait for the reset to wake us up if we're not quitting
  if (!(*quitting)) {
    MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " entering sync wait.";
    RETURN_IF_NOT_OK(epoch_sync_wait_post_.Wait());
    prev = guys_out_.fetch_add(1);
    bool last_guy_out = (prev + 1) == num_workers_;
    // Last guy out will clear the wait post and set the row counts
    if (last_guy_out) {
      MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " last guy out clearing wait post.";
      epoch_sync_wait_post_.Clear();
      guys_in_ = 0;
      all_out_.Set();
    }
  }

  MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " epoch sync complete.";
  return Status::OK();
}
// The entry point code for when workers are launched
Status RandomDataOp::WorkerEntry(int32_t worker_id) {
  MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " entry";

  // Handshake with the master first to tell it we're alive.
  TaskManager::FindMe()->Post();

  bool quitting = false;
  std::unique_ptr<TensorQTable> new_tensor_table = nullptr;

  // Loop until the quitting variable gets set to true
  do {
    // If we have not yet reached the row count for this worker then produce another record
    if (worker_rows_packed_[worker_id] < worker_max_rows_[worker_id]) {
      TensorRow new_row;

      // Start a new tensor table if needed
      if (new_tensor_table == nullptr) {
        new_tensor_table = std::make_unique<TensorQTable>();
      }

      // Create the data for the row
      RETURN_IF_NOT_OK(CreateRandomRow(worker_id, &new_row));

      // Add the row to our table
      new_tensor_table->push_back(std::move(new_row));
      worker_rows_packed_[worker_id]++;

      // If the tensor table is at capacity then it's time to send it to output
      if (new_tensor_table->size() == rows_per_buffer_) {
        RETURN_IF_NOT_OK(PackAndSend(worker_id, std::move(new_tensor_table)));
      }
    } else {
      // We've reached the total row count for this worker, so it's time for epoch sync.
      // There are likely some records built but not sent yet, so take care of those first
      // (this buffer will be smaller than rows_per_buffer).
      if (new_tensor_table != nullptr && new_tensor_table->size() > 0) {
        RETURN_IF_NOT_OK(PackAndSend(worker_id, std::move(new_tensor_table)));
      }

      // Now, let's enter the epoch sync
      RETURN_IF_NOT_OK(EpochSync(worker_id, &quitting));
    }
  } while (!quitting);

  MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " is now quitting.";

  return Status::OK();
}
// A helper function to stuff the tensor table into a buffer and send it to the output connector
Status RandomDataOp::PackAndSend(int32_t worker_id, std::unique_ptr<TensorQTable> in_table) {
  auto new_buffer = std::make_unique<DataBuffer>(GetNextBufferId(), DataBuffer::kDeBFlagNone);
  new_buffer->set_tensor_table(std::move(in_table));
  new_buffer->set_column_name_map(column_name_map_);
  RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(new_buffer)));
  return Status::OK();
}
// A helper function to create random data for the row
Status RandomDataOp::CreateRandomRow(int32_t worker_id, TensorRow *new_row) {
  if (new_row == nullptr) {
    return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Missing tensor row output");
  }

  // Create a tensor for each column, then add the tensor to the row
  for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
    const ColDescriptor current_col = data_schema_->column(i);
    std::vector<dsize_t> current_shape = current_col.shape().AsVector();
    std::unique_ptr<TensorShape> new_shape = nullptr;
    std::unique_ptr<unsigned char[]> buf = nullptr;
    std::shared_ptr<Tensor> new_tensor = nullptr;

    // We need to resolve the shape to fill in any unknown dimensions with random
    // values, then use that as our shape for this tensor.
    for (int j = 0; j < current_shape.size(); ++j) {
      if (current_shape[j] == TensorShape::kDimUnknown) {
        current_shape[j] = static_cast<dsize_t>(GenRandomInt(1, kMaxDimValue));
      }
    }

    new_shape = std::make_unique<TensorShape>(current_shape);
    int64_t size_in_bytes = new_shape->NumOfElements() * current_col.type().SizeInBytes();

    // Generate a random byte of data. This may cause some funny data for things like doubles, floats, and bools;
    // however, the random data op is not too concerned about the physical data itself.
    std::uniform_int_distribution<uint8_t> uniDist(0, 255);
    uint8_t random_byte = uniDist(rand_gen_);

    // Now, create a chunk of memory for the entire tensor and copy this byte in repeatedly.
    buf = std::make_unique<unsigned char[]>(size_in_bytes);
    int ret_code = memset_s(buf.get(), size_in_bytes, random_byte, size_in_bytes);
    if (ret_code != 0) {
      return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Failed to set random bytes for a tensor.");
    }

    RETURN_IF_NOT_OK(
        Tensor::CreateTensor(&new_tensor, current_col.tensorImpl(), *new_shape, current_col.type(), buf.get()));

    // Add this tensor to the tensor row for output
    (*new_row).push_back(std::move(new_tensor));
  }
  return Status::OK();
}
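The per-tensor fill above is one random byte replicated across the whole allocation; a numpy sketch of the same idea (the shape, dtype, and the 2048 dimension cap mirror this commit's constants):

import numpy as np

rng = np.random.default_rng()

def random_tensor(shape, dtype=np.uint8):
    # Resolve any unknown (-1) dimensions with a random positive length, as CreateRandomRow does.
    resolved = [int(rng.integers(1, 2049)) if d == -1 else d for d in shape]
    # One random byte value, replicated across the entire tensor (the memset_s above).
    random_byte = int(rng.integers(0, 256))
    return np.full(resolved, random_byte, dtype=dtype)

t = random_tensor([-1, 3])
print(t.shape, t.flat[0])  # first dimension chosen at random; all elements are equal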
// Overrides the base class reset method. When an operator does a reset, it cleans up any state
// info from its previous execution and then initializes itself so that it can be executed
// again.
Status RandomDataOp::Reset() {
  MS_LOG(INFO) << "RandomDataOp resetting.";

  // Ensure all guys are in the waitpost
  if (guys_in_ != num_workers_) {
    return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
                  "Issuing a reset, but some workers are missing from epochSync!");
  }

  // Reset the row counters for all workers.
  for (int32_t w = 0; w < num_workers_; w++) {
    worker_rows_packed_[w] = 0;
    worker_max_rows_[w] = 0;
  }
  buffer_id_ = 0;

  // Re-assign round robin row counts, starting from the worker after the one that gave
  // the eoe last time.
  int32_t currentWorker = (eoe_worker_id_ + 1) % num_workers_;
  for (int64_t r = 0; r < total_rows_; r++) {
    worker_max_rows_[currentWorker]++;
    currentWorker = (currentWorker + 1) % num_workers_;
  }

  // Compute which worker should get the eoe for the next epoch
  eoe_worker_id_ = ((epoch_buffers_sent_ % num_workers_) + eoe_worker_id_) % num_workers_;

  // Wake up the workers to get them going again in a new epoch
  guys_out_ = 0;
  epoch_sync_wait_post_.Set();

  return Status::OK();
}
}  // namespace dataset
}  // namespace mindspore
@@ -0,0 +1,271 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef DATASET_ENGINE_DATASETOPS_SOURCE_RANDOM_DATA_OP_
#define DATASET_ENGINE_DATASETOPS_SOURCE_RANDOM_DATA_OP_

#include <atomic>
#include <map>
#include <memory>
#include <mutex>
#include <random>
#include <string>
#include <vector>
#include <unordered_map>
#include <utility>
#include "dataset/util/status.h"
#include "dataset/core/tensor.h"
#include "dataset/core/data_type.h"
#include "dataset/engine/data_schema.h"
#include "dataset/engine/datasetops/parallel_op.h"
#include "dataset/util/wait_post.h"

namespace mindspore {
namespace dataset {
// The RandomDataOp is a leaf node storage operator that generates random data based
// on the schema specifications. Typically, it's used for testing and demonstrating
// various dataset operator pipelines. It is not "real" data to train with.
// The data that is randomly created is just random and repeated bytes; there is no
// "meaning" behind what these bytes are.
class RandomDataOp : public ParallelOp {
 public:
  // Some constants to provide limits to random generation.
  static constexpr int32_t kMaxNumColumns = 4;
  static constexpr int32_t kMaxRank = 4;
  static constexpr int32_t kMaxDimValue = 2048;
  static constexpr int32_t kMaxDataType = (DataType::DE_UNKNOWN - 1);
  static constexpr int32_t kMaxTotalRows = 1024;
  // A nested builder class to aid in the construction of a RandomDataOp
  class Builder {
   public:
    /**
     * Builder constructor. Creates the builder object.
     * @note No default args.
     * @return This is a constructor.
     */
    Builder();

    /**
     * Default destructor
     */
    ~Builder() = default;

    /**
     * The build method that produces the instantiated RandomDataOp as a shared pointer
     * @param out_op - The output RandomDataOp that was constructed
     * @return Status - The error code return
     */
    Status Build(std::shared_ptr<RandomDataOp> *out_op);

    /**
     * Builder set method
     * @param data_schema - A user-provided schema
     * @return Builder - The modified builder by reference
     */
    Builder &SetDataSchema(std::unique_ptr<DataSchema> data_schema) {
      builder_data_schema_ = std::move(data_schema);
      return *this;
    }

    /**
     * Builder set method
     * @param num_workers - The number of workers
     * @return Builder - The modified builder by reference
     */
    Builder &SetNumWorkers(int32_t num_workers) {
      builder_num_workers_ = num_workers;
      return *this;
    }

    /**
     * Builder set method
     * @param op_connector_size - The size of the output connector
     * @return Builder - The modified builder by reference
     */
    Builder &SetOpConnectorSize(int32_t op_connector_size) {
      builder_op_connector_size_ = op_connector_size;
      return *this;
    }

    /**
     * Builder set method
     * @param rows_per_buffer - The number of rows in each DataBuffer
     * @return Builder - The modified builder by reference
     */
    Builder &SetRowsPerBuffer(int64_t rows_per_buffer) {
      builder_rows_per_buffer_ = rows_per_buffer;
      return *this;
    }

    /**
     * Builder set method
     * @param total_rows - The total number of rows in the dataset
     * @return Builder - The modified builder by reference
     */
    Builder &SetTotalRows(int64_t total_rows) {
      builder_total_rows_ = total_rows;
      return *this;
    }

   private:
    /**
     * Check if the required parameters are set by the builder.
     * @return Status - The error code return
     */
    Status SanityCheck() const;

    std::unique_ptr<DataSchema> builder_data_schema_;
    int32_t builder_num_workers_;
    int32_t builder_op_connector_size_;
    int64_t builder_rows_per_buffer_;
    int64_t builder_total_rows_;
  };  // class Builder
  /**
   * Constructor for RandomDataOp
   * @param num_workers - The number of workers
   * @param op_connector_size - The size of the output connector
   * @param rows_per_buffer - The number of rows in each DataBuffer
   * @param total_rows - The total number of rows in the dataset
   * @param data_schema - A user-provided schema
   */
  RandomDataOp(int32_t num_workers, int32_t op_connector_size, int64_t rows_per_buffer, int64_t total_rows,
               std::unique_ptr<DataSchema> data_schema);

  /**
   * Destructor
   */
  ~RandomDataOp() = default;

  /**
   * A print method typically used for debugging
   * @param out - The output stream to write output to
   * @param show_all - A bool to control if you want to show all info or just a summary
   */
  void Print(std::ostream &out, bool show_all) const override;

  /**
   * << Stream output operator overload
   * @note This allows you to write the debug print info using stream operators
   * @param out - reference to the output stream being overloaded
   * @param op - reference to the RandomDataOp to display
   * @return - the output stream must be returned
   */
  friend std::ostream &operator<<(std::ostream &out, const RandomDataOp &op) {
    op.Print(out, false);
    return out;
  }
  /**
   * Class functor operator () override.
   * All DatasetOps operate by launching a thread (see ExecutionTree). This class functor will
   * provide the master loop that drives the logic for performing the work.
   * @return Status - The error code return
   */
  Status operator()() override;

  /**
   * Overrides the base class reset method. When an operator does a reset, it cleans up any state
   * info from its previous execution and then initializes itself so that it can be executed
   * again.
   * @return Status - The error code return
   */
  Status Reset() override;

  /**
   * Quick getter for total rows.
   */
  int64_t GetTotalRows() const { return total_rows_; }
 private:
  /**
   * The entry point code for when workers are launched
   * @param worker_id - The worker id
   * @return Status - The error code return
   */
  Status WorkerEntry(int32_t worker_id) override;

  /**
   * Helper function to produce a default/random schema if one didn't exist
   * @return Status - The error code return
   */
  Status GenerateSchema();

  /**
   * Performs a synchronization between workers at the end of an epoch
   * @param worker_id - The worker id
   * @param quitting - Output flag set to true when there are no more epochs to run
   * @return Status - The error code return
   */
  Status EpochSync(int32_t worker_id, bool *quitting);

  /**
   * A helper function to stuff the tensor table into a buffer and send it to the output connector
   * @param worker_id - The worker id
   * @param in_table - The tensor table to pack and send
   * @return Status - The error code return
   */
  Status PackAndSend(int32_t worker_id, std::unique_ptr<TensorQTable> in_table);

  /**
   * A helper function to create random data for the row
   * @param worker_id - The worker id
   * @param new_row - The output row to produce
   * @return Status - The error code return
   */
  Status CreateRandomRow(int32_t worker_id, TensorRow *new_row);

  /**
   * A quick inline for producing a random number between (and including) min/max
   * @param min - minimum number that can be generated
   * @param max - maximum number that can be generated
   * @return - The generated random number
   */
  inline int32_t GenRandomInt(int32_t min, int32_t max) {
    std::uniform_int_distribution<int32_t> uniDist(min, max);
    return uniDist(rand_gen_);
  }

  /**
   * A quick inline for producing the next buffer id in sequence, threadsafe
   * @return - The next buffer id.
   */
  inline int32_t GetNextBufferId() {
    std::unique_lock<std::mutex> lock(buffer_id_mutex_);
    return ++buffer_id_;
  }

  int32_t buffer_id_;
  int64_t rows_per_buffer_;
  int64_t total_rows_;
  int64_t epoch_buffers_sent_;
  std::atomic<int32_t> guys_in_;
  std::atomic<int32_t> guys_out_;
  int32_t eoe_worker_id_;
  std::unique_ptr<DataSchema> data_schema_;
  std::vector<int64_t> worker_max_rows_;
  std::vector<int64_t> worker_rows_packed_;
  std::unordered_map<std::string, int32_t> column_name_map_;
  std::mt19937 rand_gen_;
  WaitPost epoch_sync_wait_post_;
  WaitPost all_out_;
  std::mutex buffer_id_mutex_;
};  // class RandomDataOp
}  // namespace dataset
}  // namespace mindspore

#endif  // DATASET_ENGINE_DATASETOPS_SOURCE_RANDOM_DATA_OP_
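GetNextBufferId above is a lock-guarded counter; for readers more at home in Python, an equivalent sketch:

import threading

class BufferIdGen:
    # Thread-safe, monotonically increasing buffer id (mirrors GetNextBufferId).
    def __init__(self):
        self._id = 0
        self._lock = threading.Lock()

    def next_id(self):
        with self._lock:
            self._id += 1
            return self._id

gen = BufferIdGen()
print(gen.next_id(), gen.next_id())  # 1 2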
@@ -21,7 +21,7 @@ can also create samplers with this module to sample data.
 from .core.configuration import config
 from .engine.datasets import StorageDataset, TFRecordDataset, ImageFolderDatasetV2, MnistDataset, MindDataset, \
     GeneratorDataset, ManifestDataset, Cifar10Dataset, Cifar100Dataset, VOCDataset, CelebADataset, TextFileDataset, \
-    Schema, Shuffle, zip
+    Schema, Shuffle, zip, RandomDataset
 from .engine.samplers import DistributedSampler, PKSampler, RandomSampler, SequentialSampler, SubsetRandomSampler, \
     WeightedRandomSampler, Sampler
 from .engine.serializer_deserializer import serialize, deserialize, show
@@ -3146,6 +3146,57 @@ class Cifar100Dataset(SourceDataset):
        return get_num_rows(num_rows, self.num_shards)


class RandomDataset(SourceDataset):
    """
    A source dataset that generates random data.

    Args:
        num_samples (int): number of samples to generate.
        schema (str or Schema, optional): Path to the json schema file or schema object (default=None).
            If the schema is not provided, a random schema will be generated.
        columns_list (list[str], optional): List of columns to be read (default=None, read all columns).
        num_parallel_workers (int, optional): number of workers to read the data
            (default=None, number set in the config).
    """

    def __init__(self, schema=None, columns_list=None, num_samples=None, num_parallel_workers=None):
        super().__init__(num_parallel_workers)
        schema_obj = None
        if (schema is not None) and (not isinstance(schema, Schema)):
            schema_obj = Schema(schema)  # read the schema file and convert to schema object to validate it
        self.schema = schema
        self.columns_list = columns_list
        self.num_samples = num_samples
        if schema_obj is not None and num_samples is None:
            self.num_samples = schema_obj.num_rows

    def get_args(self):
        args = super().get_args()
        if self.schema is not None:
            if isinstance(self.schema, Schema):
                self.schema.datasetType = 'Random'
                if self.num_samples is not None:
                    self.schema.num_rows = self.num_samples
                args["schema_json_string"] = self.schema.to_json()
            else:
                args["schema_file_path"] = self.schema
            args["schema"] = self.schema
        if self.columns_list is not None:
            args["columns_list"] = self.columns_list
        if self.num_samples is not None:
            args["num_samples"] = self.num_samples
        return args

    def get_dataset_size(self):
        """
        Get the number of batches in an epoch.

        Return:
            Number, number of batches.
        """
        return self.num_samples


class Schema:
    """
    Class to represent a schema of dataset.
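A quick sketch of the two get_args() branches above: with a Schema object the schema is serialized inline as JSON, while with a path string it is handed over as-is (this assumes the schema file from the commit's test data is on disk):

import mindspore.common.dtype as mstype
import mindspore.dataset as ds

# Branch 1: Schema object -> args carries "schema_json_string".
schema = ds.Schema()
schema.add_column('image', de_type=mstype.uint8, shape=[28, 28])
inline = ds.RandomDataset(schema=schema, num_samples=10)
print(inline.get_args()["schema_json_string"][:40])

# Branch 2: file path -> args carries "schema_file_path" (path is illustrative).
from_file = ds.RandomDataset(schema="testRandomData/datasetSchema.json", num_samples=10)
print(from_file.get_args()["schema_file_path"])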
@@ -192,6 +192,8 @@ class Iterator:
            op_type = OpName.CIFAR100
        elif isinstance(dataset, de.CelebADataset):
            op_type = OpName.CELEBA
        elif isinstance(dataset, de.RandomDataset):
            op_type = OpName.RANDOMDATA
        elif isinstance(dataset, de.TextFileDataset):
            op_type = OpName.TEXTFILE
        else:
@@ -0,0 +1,457 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dataset/core/client.h"
#include "common/common.h"
#include "gtest/gtest.h"
#include <memory>
#include <vector>
#include <iostream>
#include "dataset/core/tensor_shape.h"
#include "dataset/engine/datasetops/source/random_data_op.h"
#include "dataset/engine/data_schema.h"

using namespace mindspore::dataset;
using mindspore::MsLogLevel::INFO;
using mindspore::ExceptionType::NoExceptionType;
using mindspore::LogStream;

class MindDataTestRandomDataOp : public UT::DatasetOpTesting {
};
// Test info:
// - Simple test with a user-provided schema generated purely from the DataSchema C API
// - has an iteration loop
//
// Tree: single node tree with RandomDataOp
//
//    RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic1) {
  Status rc;
  int32_t rank = 0;  // not used
  MS_LOG(INFO) << "UT test RandomDataOpBasic1";

  // Start with an empty execution tree
  auto myTree = std::make_shared<ExecutionTree>();

  // Create a schema using the C api's
  std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();

  // RandomDataOp can randomly fill in unknown dimension lengths of a shape.
  // Most other ops cannot do that as they are limited by the physical data itself. We're
  // more flexible with random data since it is just making stuff up on the fly.
  TensorShape c1Shape({TensorShape::kDimUnknown, TensorShape::kDimUnknown, 3});
  ColDescriptor c1("image",
                   DataType(DataType::DE_INT8),
                   TensorImpl::kFlexible,
                   rank,  // not used
                   &c1Shape);

  // Column 2 will just be a scalar label number
  TensorShape c2Shape({});  // empty shape is a 1-value scalar Tensor
  ColDescriptor c2("label",
                   DataType(DataType::DE_UINT32),
                   TensorImpl::kFlexible,
                   rank,
                   &c2Shape);

  testSchema->AddColumn(c1);
  testSchema->AddColumn(c2);

  std::shared_ptr<RandomDataOp> myRandomDataOp;
  RandomDataOp::Builder builder;

  rc = builder.SetRowsPerBuffer(2)
           .SetNumWorkers(1)
           .SetDataSchema(std::move(testSchema))
           .SetTotalRows(25)
           .Build(&myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssociateNode(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssignRoot(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  std::ostringstream ss;
  ss << *myRandomDataOp;
  MS_LOG(INFO) << "RandomDataOp print: " << ss.str();

  MS_LOG(INFO) << "Launching tree and begin iteration";
  rc = myTree->Prepare();
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->Launch();
  EXPECT_TRUE(rc.IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator dI(myTree);
  TensorRow tensorList;
  rc = dI.FetchNextTensorRow(&tensorList);
  EXPECT_TRUE(rc.IsOk());
  int rowCount = 0;
  while (!tensorList.empty()) {
    // Don't display these rows...too big to show
    MS_LOG(INFO) << "Row fetched #: " << rowCount;

    rc = dI.FetchNextTensorRow(&tensorList);
    EXPECT_TRUE(rc.IsOk());
    rowCount++;
  }
  ASSERT_EQ(rowCount, 25);
}
// Test info:
// - Simple test with a randomly generated schema
// - no iteration loop on this one, just create the op
//
// Tree: single node tree with RandomDataOp
//
//    RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic2) {
  Status rc;
  MS_LOG(INFO) << "UT test RandomDataOpBasic2";

  // Start with an empty execution tree
  auto myTree = std::make_shared<ExecutionTree>();

  std::shared_ptr<RandomDataOp> myRandomDataOp;
  RandomDataOp::Builder builder;

  rc = builder.SetRowsPerBuffer(2)
           .SetNumWorkers(1)
           .Build(&myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssociateNode(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssignRoot(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  std::ostringstream ss;
  ss << *myRandomDataOp;
  MS_LOG(INFO) << "RandomDataOp print: " << ss.str();
}
// Test info:
// - json file test with iteration
//
// Tree: single node tree with RandomDataOp
//
//    RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic3) {
  Status rc;
  MS_LOG(INFO) << "UT test RandomDataOpBasic3";

  // Start with an empty execution tree
  auto myTree = std::make_shared<ExecutionTree>();

  std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();
  rc = testSchema->LoadSchemaFile(datasets_root_path_ + "/testRandomData/datasetSchema.json", {});
  EXPECT_TRUE(rc.IsOk());

  std::shared_ptr<RandomDataOp> myRandomDataOp;
  RandomDataOp::Builder builder;

  rc = builder.SetRowsPerBuffer(2)
           .SetNumWorkers(1)
           .SetDataSchema(std::move(testSchema))
           .SetTotalRows(10)
           .Build(&myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssociateNode(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssignRoot(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  std::ostringstream ss;
  ss << *myRandomDataOp;
  MS_LOG(INFO) << "RandomDataOp print: " << ss.str();

  MS_LOG(INFO) << "Launching tree and begin iteration";
  rc = myTree->Prepare();
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->Launch();
  EXPECT_TRUE(rc.IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator dI(myTree);
  TensorRow tensorList;
  rc = dI.FetchNextTensorRow(&tensorList);
  EXPECT_TRUE(rc.IsOk());
  int rowCount = 0;
  while (!tensorList.empty()) {
    // Don't display these rows...too big to show
    MS_LOG(INFO) << "Row fetched #: " << rowCount;

    rc = dI.FetchNextTensorRow(&tensorList);
    EXPECT_TRUE(rc.IsOk());
    rowCount++;
  }
  ASSERT_EQ(rowCount, 10);
}
// Test info:
// - json schema input; it's a fairly simple one
// - has an iteration loop
//
// Tree: RepeatOp over RandomDataOp
//
//     RepeatOp
//        |
//    RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic4) {
  Status rc;
  MS_LOG(INFO) << "UT test RandomDataOpBasic4";

  // Start with an empty execution tree
  auto myTree = std::make_shared<ExecutionTree>();

  std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();
  rc = testSchema->LoadSchemaFile(datasets_root_path_ + "/testRandomData/datasetSchema2.json", {});
  EXPECT_TRUE(rc.IsOk());

  std::shared_ptr<RandomDataOp> myRandomDataOp;
  RandomDataOp::Builder builder;

  rc = builder.SetRowsPerBuffer(2)
           .SetNumWorkers(1)
           .SetDataSchema(std::move(testSchema))
           .SetTotalRows(10)
           .Build(&myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssociateNode(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  uint32_t numRepeats = 2;
  std::shared_ptr<RepeatOp> myRepeatOp;
  rc = RepeatOp::Builder(numRepeats)
           .Build(&myRepeatOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->AssociateNode(myRepeatOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myRepeatOp->AddChild(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssignRoot(myRepeatOp);
  EXPECT_TRUE(rc.IsOk());

  MS_LOG(INFO) << "Launching tree and begin iteration";
  rc = myTree->Prepare();
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->Launch();
  EXPECT_TRUE(rc.IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator dI(myTree);
  TensorRow tensorList;
  rc = dI.FetchNextTensorRow(&tensorList);
  EXPECT_TRUE(rc.IsOk());
  int rowCount = 0;
  while (!tensorList.empty()) {
    MS_LOG(INFO) << "Row display for row #: " << rowCount;

    // Display the tensor by calling the printer on it
    for (int i = 0; i < tensorList.size(); i++) {
      std::ostringstream ss;
      ss << *tensorList[i] << std::endl;
      MS_LOG(INFO) << "Tensor print: " << ss.str();
    }

    rc = dI.FetchNextTensorRow(&tensorList);
    EXPECT_TRUE(rc.IsOk());
    rowCount++;
  }
  ASSERT_EQ(rowCount, 20);
}
// Test info:
// - json schema input; it's a fairly simple one
// - has an iteration loop
// - same as RandomDataOpBasic4 except that this one will have parallel workers
//
// Tree: RepeatOp over RandomDataOp
//
//     RepeatOp
//        |
//    RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic5) {
  Status rc;
  MS_LOG(INFO) << "UT test RandomDataOpBasic5";

  // Start with an empty execution tree
  auto myTree = std::make_shared<ExecutionTree>();

  std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();
  rc = testSchema->LoadSchemaFile(datasets_root_path_ + "/testRandomData/datasetSchema2.json", {});
  EXPECT_TRUE(rc.IsOk());

  std::shared_ptr<RandomDataOp> myRandomDataOp;
  RandomDataOp::Builder builder;

  rc = builder.SetRowsPerBuffer(2)
           .SetNumWorkers(4)
           .SetDataSchema(std::move(testSchema))
           .SetTotalRows(10)
           .Build(&myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssociateNode(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  uint32_t numRepeats = 3;
  std::shared_ptr<RepeatOp> myRepeatOp;
  rc = RepeatOp::Builder(numRepeats)
           .Build(&myRepeatOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->AssociateNode(myRepeatOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myRepeatOp->AddChild(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssignRoot(myRepeatOp);
  EXPECT_TRUE(rc.IsOk());

  MS_LOG(INFO) << "Launching tree and begin iteration";
  rc = myTree->Prepare();
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->Launch();
  EXPECT_TRUE(rc.IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator dI(myTree);
  TensorRow tensorList;
  rc = dI.FetchNextTensorRow(&tensorList);
  EXPECT_TRUE(rc.IsOk());
  int rowCount = 0;
  while (!tensorList.empty()) {
    MS_LOG(INFO) << "Row display for row #: " << rowCount;

    // Display the tensor by calling the printer on it
    for (int i = 0; i < tensorList.size(); i++) {
      std::ostringstream ss;
      ss << *tensorList[i] << std::endl;
      MS_LOG(INFO) << "Tensor print: " << ss.str();
    }

    rc = dI.FetchNextTensorRow(&tensorList);
    EXPECT_TRUE(rc.IsOk());
    rowCount++;
  }
  ASSERT_EQ(rowCount, 30);
}
// Test info:
// - repeat shuffle random
//
// Tree: RepeatOp over ShuffleOp over RandomDataOp
//
//     RepeatOp
//        |
//     ShuffleOp
//        |
//    RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpTree1) {
  Status rc;
  MS_LOG(INFO) << "UT test RandomDataOpTree1";

  // Start with an empty execution tree
  auto myTree = std::make_shared<ExecutionTree>();

  std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();
  rc = testSchema->LoadSchemaFile(datasets_root_path_ + "/testRandomData/datasetSchema2.json", {});
  EXPECT_TRUE(rc.IsOk());

  std::shared_ptr<RandomDataOp> myRandomDataOp;
  RandomDataOp::Builder builder;

  rc = builder.SetRowsPerBuffer(2)
           .SetNumWorkers(4)
           .SetDataSchema(std::move(testSchema))
           .SetTotalRows(10)
           .Build(&myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssociateNode(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  std::shared_ptr<ShuffleOp> myShuffleOp;
  rc = ShuffleOp::Builder()
           .SetRowsPerBuffer(2)
           .SetShuffleSize(4)
           .Build(&myShuffleOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->AssociateNode(myShuffleOp);
  EXPECT_TRUE(rc.IsOk());

  uint32_t numRepeats = 3;
  std::shared_ptr<RepeatOp> myRepeatOp;
  rc = RepeatOp::Builder(numRepeats)
           .Build(&myRepeatOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->AssociateNode(myRepeatOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myRepeatOp->AddChild(myShuffleOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myShuffleOp->AddChild(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssignRoot(myRepeatOp);
  EXPECT_TRUE(rc.IsOk());

  MS_LOG(INFO) << "Launching tree and begin iteration";
  rc = myTree->Prepare();
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->Launch();
  EXPECT_TRUE(rc.IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator dI(myTree);
  TensorRow tensorList;
  rc = dI.FetchNextTensorRow(&tensorList);
  EXPECT_TRUE(rc.IsOk());
  int rowCount = 0;
  while (!tensorList.empty()) {
    MS_LOG(INFO) << "Row display for row #: " << rowCount;

    // Display the tensor by calling the printer on it
    for (int i = 0; i < tensorList.size(); i++) {
      std::ostringstream ss;
      ss << *tensorList[i] << std::endl;
      MS_LOG(INFO) << "Tensor print: " << ss.str();
    }

    rc = dI.FetchNextTensorRow(&tensorList);
    EXPECT_TRUE(rc.IsOk());
    rowCount++;
  }
  ASSERT_EQ(rowCount, 30);
}
@@ -0,0 +1,14 @@
{
  "columns": {
    "image": {
      "type": "uint8",
      "rank": 3,
      "shape": [1920, 1080, 3]
    },
    "label": {
      "type": "int32",
      "rank": 1,
      "shape": [1]
    }
  }
}
@@ -0,0 +1,14 @@
{
  "columns": {
    "image": {
      "type": "uint8",
      "rank": 2,
      "shape": [28, 28]
    },
    "label": {
      "type": "uint8",
      "rank": 1,
      "shape": [1]
    }
  }
}
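For sizing intuition, the per-row payload implied by these two schemas is easy to compute; a sketch (dtype sizes follow numpy):

import numpy as np

def row_bytes(columns):
    # Total bytes per generated row for a {name: (type, shape)} schema.
    return sum(np.dtype(t).itemsize * int(np.prod(shape))
               for t, shape in columns.values())

schema1 = {"image": ("uint8", [1920, 1080, 3]), "label": ("int32", [1])}
schema2 = {"image": ("uint8", [28, 28]), "label": ("uint8", [1])}
print(row_bytes(schema1))  # 6220804 bytes, roughly 5.9 MiB per row
print(row_bytes(schema2))  # 785 bytes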
@@ -0,0 +1,70 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import mindspore.common.dtype as mstype
import mindspore.dataset as ds


# just a basic test with the parallel random data op
def test_randomdataset_basic1():
    print("Test randomdataset basic")

    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8, shape=[2])
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # apply dataset operations
    ds1 = ds.RandomDataset(schema=schema, num_samples=50, num_parallel_workers=4)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        print("{} image: {}".format(num_iter, data["image"]))
        print("{} label: {}".format(num_iter, data["label"]))
        num_iter += 1

    print("Number of data in ds1: ", num_iter)
    assert num_iter == 200


# Another simple test, this time with larger per-row images
def test_randomdataset_basic2():
    print("Test randomdataset basic 2")

    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8, shape=[640, 480, 3])  # 921600 bytes (a bit less than 1 MB per image)
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # Make up about 10 samples and repeat the pipeline 4 times
    ds1 = ds.RandomDataset(schema=schema, num_samples=10, num_parallel_workers=1)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label";
        # the image is too large to display, so only show the label
        print("printing the label: {}".format(data["label"]))
        num_iter += 1

    print("Number of data in ds1: ", num_iter)
    assert num_iter == 40


if __name__ == '__main__':
    test_randomdataset_basic1()
    test_randomdataset_basic2()
    print('test_randomdataset_basic Ended.\n')