!869 Random data op

Merge pull request !869 from JesseKLee/random_data_op
This commit is contained in:
mindspore-ci-bot 2020-04-30 02:56:39 +08:00 committed by Gitee
commit 2303453753
15 changed files with 1367 additions and 1 deletion

View File

@@ -28,6 +28,7 @@
#include "dataset/engine/datasetops/source/manifest_op.h"
#include "dataset/engine/datasetops/source/cifar_op.h"
#include "dataset/engine/datasetops/source/celeba_op.h"
#include "dataset/engine/datasetops/source/random_data_op.h"
#include "dataset/engine/datasetops/source/text_file_op.h"
#include "dataset/engine/datasetops/filter_op.h"
#include "mindrecord/include/shard_category.h"
@@ -65,6 +66,7 @@ static std::unordered_map<uint32_t, pFunction> g_parse_op_func_ = {{kStorage, &D
{kCifar10, &DEPipeline::ParseCifar10Op},
{kCifar100, &DEPipeline::ParseCifar100Op},
{kCelebA, &DEPipeline::ParseCelebAOp},
{kRandomData, &DEPipeline::ParseRandomDataOp},
{kTextFile, &DEPipeline::ParseTextFileOp}};
DEPipeline::DEPipeline() : iterator_(nullptr) {
@@ -972,6 +974,45 @@ Status DEPipeline::ParseCifar100Op(const py::dict &args, std::shared_ptr<Dataset
return Status::OK();
}
Status DEPipeline::ParseRandomDataOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
// Required arguments
RandomDataOp::Builder builder;
if (args["num_samples"].is_none()) {
std::string err_msg = "Error: num_samples is a required argument";
RETURN_STATUS_UNEXPECTED(err_msg);
}
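// Note: the args dict arrives from the Python layer. An illustrative (not exhaustive) example
// might look like {"num_samples": 50, "schema_json_string": "...", "num_parallel_workers": 4},
// where the schema keys are optional.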
std::vector<std::string> columns_to_load;
bool schema_exists = false;
// Optional arguments
for (auto arg : args) {
std::string key = py::str(arg.first);
py::handle value = arg.second;
if (key == "num_parallel_workers") {
(void)builder.SetNumWorkers(ToInt(value));
} else if (key == "schema_file_path" || key == "schema_json_string") {
schema_exists = true;
} else if (key == "num_samples") {
(void)builder.SetTotalRows(ToInt(value));
} else if (key == "columns_list") {
columns_to_load = ToStringVector(value);
}
}
if (schema_exists) {
std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
if (args.contains("schema_file_path")) {
RETURN_IF_NOT_OK(schema->LoadSchemaFile(ToString(args["schema_file_path"]), columns_to_load));
} else {
RETURN_IF_NOT_OK(schema->LoadSchemaString(ToString(args["schema_json_string"]), columns_to_load));
}
(void)builder.SetDataSchema(std::move(schema));
}
std::shared_ptr<RandomDataOp> op;
RETURN_IF_NOT_OK(builder.Build(&op));
*ptr = op;
return Status::OK();
}
int32_t DEPipeline::GetNumClasses() const { return num_classes_; }
Status DEPipeline::ParseMnistOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {

View File

@@ -60,6 +60,7 @@ enum OpName {
kCifar10,
kCifar100,
kCelebA,
kRandomData,
kTextFile
};
@@ -142,6 +143,8 @@ class DEPipeline {
Status ParseCifar100Op(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseRandomDataOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
void PrintTree();
int32_t GetNumClasses() const;

View File

@@ -47,6 +47,7 @@
#include "dataset/engine/datasetops/source/mnist_op.h"
#include "dataset/engine/datasetops/source/manifest_op.h"
#include "dataset/engine/datasetops/source/mindrecord_op.h"
#include "dataset/engine/datasetops/source/random_data_op.h"
#include "dataset/engine/datasetops/source/sampler/distributed_sampler.h"
#include "dataset/engine/datasetops/source/sampler/pk_sampler.h"
#include "dataset/engine/datasetops/source/sampler/random_sampler.h"
@@ -489,6 +490,7 @@ PYBIND11_MODULE(_c_dataengine, m) {
.value("VOC", OpName::kVoc)
.value("CIFAR10", OpName::kCifar10)
.value("CIFAR100", OpName::kCifar100)
.value("RANDOMDATA", OpName::kRandomData)
.value("CELEBA", OpName::kCelebA)
.value("TEXTFILE", OpName::kTextFile);

View File

@@ -466,5 +466,23 @@ Status DataSchema::PreLoadExceptionCheck(const nlohmann::json &js) {
"\"columns\" node is required in the schema json file.");
return Status::OK();
}
// Loops through all columns in the schema and returns a map from column name to
// column index number.
Status DataSchema::GetColumnNameMap(std::unordered_map<std::string, int32_t> *out_column_name_map) {
if (out_column_name_map == nullptr) {
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "unexpected null output column name map.");
}
for (int32_t i = 0; i < col_descs_.size(); ++i) {
if (col_descs_[i].name().empty()) {
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
"Constructing column name map from schema, but found empty column name.");
}
(*out_column_name_map)[col_descs_[i].name()] = i;
}
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@@ -20,6 +20,7 @@
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include <nlohmann/json.hpp>
#include "dataset/core/constants.h"
@@ -180,6 +181,12 @@ class DataSchema {
static const char DEFAULT_DATA_SCHEMA_FILENAME[];
// Loops through all columns in the schema and returns a map from column name to
// column index number.
// @param out_column_name_map - The output map of column names to column indexes
// @return Status - The error code return
Status GetColumnNameMap(std::unordered_map<std::string, int32_t> *out_column_name_map);
private:
// Internal helper function. Parses the json schema file in any order and produces a schema that
// does not follow any particular order (json standard does not enforce any ordering protocol).

View File

@@ -17,6 +17,7 @@ add_library(engine-datasetops-source OBJECT
${FEATURE_SRCS}
manifest_op.cc
cifar_op.cc
random_data_op.cc
celeba_op.cc
text_file_op.cc
)

View File

@@ -0,0 +1,414 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/engine/datasetops/source/random_data_op.h"
#include <iomanip>
#include <random>
#include "dataset/engine/execution_tree.h"
#include "dataset/core/config_manager.h"
#include "dataset/util/random.h"
#include "dataset/util/wait_post.h"
namespace mindspore {
namespace dataset {
// Builder constructor. Creates the builder object.
RandomDataOp::Builder::Builder()
: builder_data_schema_(nullptr),
builder_num_workers_(0),
builder_op_connector_size_(0),
builder_rows_per_buffer_(0),
builder_total_rows_(0) {
// Some arguments to the RandomDataOp have defaults that are taken from the config.
// The user may override these defaults by using the builder set methods.
std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
builder_rows_per_buffer_ = cfg->rows_per_buffer();
builder_num_workers_ = cfg->num_parallel_workers();
builder_op_connector_size_ = cfg->op_connector_size();
}
// The build method that produces the instantiated RandomDataOp as a shared pointer
Status RandomDataOp::Builder::Build(std::shared_ptr<RandomDataOp> *out_op) {
RETURN_IF_NOT_OK(SanityCheck());
*out_op = std::make_shared<RandomDataOp>(builder_num_workers_, builder_op_connector_size_, builder_rows_per_buffer_,
builder_total_rows_, std::move(builder_data_schema_));
// If the user did not provide a schema, then we will ask the op to generate a pseudo-random
// schema.
// See the GenerateSchema function for details on the type of schema it will create.
if ((*out_op)->data_schema_ == nullptr) {
RETURN_IF_NOT_OK((*out_op)->GenerateSchema());
}
// Extract the column name mapping from the schema and save it in the class.
// This will be needed when constructing buffers.
RETURN_IF_NOT_OK((*out_op)->data_schema_->GetColumnNameMap(&((*out_op)->column_name_map_)));
return Status::OK();
}
// Check if the required parameters are set by the builder.
Status RandomDataOp::Builder::SanityCheck() const {
// There are actually no required arguments for the random data op at all.
// Some arguments are preset with global values from config, and if they are not given by the user
// then we create them randomly. This function is left here for consistency with other operators.
return Status::OK();
}
// Constructor for RandomDataOp
RandomDataOp::RandomDataOp(int32_t num_workers, int32_t op_connector_size, int64_t rows_per_buffer, int64_t total_rows,
std::unique_ptr<DataSchema> data_schema)
: ParallelOp(num_workers, op_connector_size),
buffer_id_(0),
rows_per_buffer_(rows_per_buffer),
total_rows_(total_rows),
epoch_buffers_sent_(0),
guys_in_(0),
guys_out_(num_workers_),
eoe_worker_id_(0),
data_schema_(std::move(data_schema)) {
rand_gen_.seed(GetSeed()); // seed the random generator
// If total rows was not given, then randomly pick a number
if (total_rows_ == 0) {
total_rows_ = GenRandomInt(1, kMaxTotalRows);
}
// Everyone is already out from the sync area.
all_out_.Set();
}
// A print method typically used for debugging
void RandomDataOp::Print(std::ostream &out, bool show_all) const {
// Always show the id and name as first line regardless if this summary or detailed print
out << "(" << std::setw(2) << operator_id_ << ") <RandomDataOp>:";
if (!show_all) {
// Call the super class for displaying any common 1-liner info
ParallelOp::Print(out, show_all);
// Then show any custom derived-internal 1-liner info for this op
out << " [total rows: " << total_rows_ << "]\n";
} else {
// Call the super class for displaying any common detailed info
ParallelOp::Print(out, show_all);
// Then show any custom derived-internal stuff
out << "\nTotal_rows: " << total_rows_
<< "\nRows per buffer: " << rows_per_buffer_
<< "\nSchema:\n" << *data_schema_ << "\n\n";
}
}
// Helper function to produce a default/random schema if one didn't exist
Status RandomDataOp::GenerateSchema() {
if (data_schema_ != nullptr) {
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Generating a schema but one already exists!");
}
// To randomly create a schema, we need to choose:
// a) how many columns
// b) the type of each column
// c) the shape of each column (number of dimensions i.e. rank)
// d) the shape of each column (dimension values)
data_schema_ = std::make_unique<DataSchema>();
std::unique_ptr<TensorShape> newShape;
std::unique_ptr<ColDescriptor> newCol;
// Loop over the number of chosen columns
int32_t numColumns = GenRandomInt(1, kMaxNumColumns);
for (int32_t i = 0; i < numColumns; i++) {
// For each column:
// - choose a datatype
// - generate a shape that randomly chooses the number of dimensions and the dimension values.
DataType::Type newType = static_cast<DataType::Type>(GenRandomInt(0, kMaxDataType));
int32_t rank = GenRandomInt(1, kMaxRank);
std::vector<dsize_t> dims;
for (int32_t d = 0; d < rank; d++) {
// 0 is not a valid dimension value. However, we can support "*" or unknown, so map the random
// 0 value to the unknown attribute if 0 is chosen.
dsize_t dim_value = static_cast<dsize_t>(GenRandomInt(0, kMaxDimValue));
if (dim_value == 0) dim_value = TensorShape::kDimUnknown;
dims.push_back(dim_value);
}
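// Example (illustrative numbers only): a randomly chosen rank of 3 might yield dims {*, 57, 300},
// where the first dim came up 0 and was remapped to the unknown value.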
newShape = std::make_unique<TensorShape>(dims);
// Create the column descriptor
std::string colName = "c" + std::to_string(i);
newCol = std::make_unique<ColDescriptor>(colName, DataType(newType), TensorImpl::kFlexible, rank,
newShape.get());
data_schema_->AddColumn(*newCol);
}
return Status::OK();
}
// Class functor operator () override.
// All DatasetOps operate by launching a thread (see ExecutionTree). This class functor will
// provide the master loop that drives the logic for performing the work.
Status RandomDataOp::operator()() {
// First, compute how many buffers we'll need to satisfy the total row count.
// The only reason we do this is for the purpose of throttling worker count if needed.
int64_t buffers_needed = total_rows_ / rows_per_buffer_;
if (total_rows_ % rows_per_buffer_ != 0) {
buffers_needed++;
}
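// Example (illustrative numbers only): total_rows_ = 10 with rows_per_buffer_ = 2 yields
// buffers_needed = 5, while total_rows_ = 11 leaves a remainder and rounds up to 6.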
// If the amount of workers we have exceeds the number of buffers to produce, then we'll have
// idle workers doing nothing. In that case, let's throttle the worker count.
if (num_workers_ > buffers_needed) {
MS_LOG(INFO) << "RandomDataOp throttling worker count from " << num_workers_ << "to " << buffers_needed;
num_workers_ = buffers_needed;
num_producers_ = num_workers_;
guys_out_ = num_workers_;
// The output connector was already created with a different worker count. We have to drop and recreate
// that connector.
DatasetOp::CreateConnector(num_producers_, num_workers_);
}
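// Example (illustrative numbers only): 8 configured workers with only 5 buffers to produce
// would be throttled down to 5 workers so that none sit idle.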
// Assign the number of rows to each worker in a round robin fashion.
worker_max_rows_.reserve(num_workers_);
worker_rows_packed_.reserve(num_workers_);
// init the counts to zero to start.
for (int32_t w = 0; w < num_workers_; w++) {
worker_max_rows_.push_back(0);
worker_rows_packed_.push_back(0);
}
// then assign round robin row counts
int32_t currentWorker = 0;
for (int64_t r = 0; r < total_rows_; r++) {
worker_max_rows_[currentWorker]++;
currentWorker = (currentWorker + 1) % num_workers_;
}
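// Example (illustrative numbers only): 10 rows over 4 workers assigns {3, 3, 2, 2}, with
// workers 0 and 1 each picking up one extra row from the remainder.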
// Next, compute the total buffer count. This stat is needed during reset logic
for (int32_t w = 0; w < num_workers_; w++) {
int64_t worker_buffers = 0;
worker_buffers = worker_max_rows_[w] / rows_per_buffer_;
if (worker_max_rows_[w] % rows_per_buffer_ != 0) worker_buffers++;
epoch_buffers_sent_ += worker_buffers;
}
// For the connector to work, we need to target the correct worker channel for the eoe.
// This will initialize it for the first epoch. Reset() handles it for the subsequent epochs.
eoe_worker_id_ = epoch_buffers_sent_ % num_workers_;
epoch_buffers_sent_++; // Add the eoe buffer to the count for subsequent epochs
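// Example (illustrative numbers only): row counts {3, 3, 2, 2} at 2 rows per buffer give
// {2, 2, 1, 1} buffers, so 6 data buffers total; the eoe targets worker 6 % 4 = 2 and the
// count becomes 7 once the eoe buffer is included.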
// RandomDataOp doesn't need the master thread to stay around. Kick off the workers and then master exits.
RETURN_IF_NOT_OK(
tree_->LaunchWorkers(num_workers_, std::bind(&RandomDataOp::WorkerEntry, this, std::placeholders::_1)));
// required task group setup after launching workers
TaskManager::FindMe()->Post();
RETURN_IF_NOT_OK(epoch_sync_wait_post_.Register(tree_->AllTasks()));
return Status::OK();
}
// Performs a synchronization between workers at the end of an epoch
Status RandomDataOp::EpochSync(int32_t worker_id, bool *quitting) {
MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " syncing at end of epoch";
// Sync on the guys_in counter.
// We have to wait until the last guy is out.
all_out_.Wait();
// If we are not in a repeat loop, or that was the last repeat already, then setup our exit
// condition from the master loop.
if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) {
*quitting = true;
}
auto prev = guys_in_.fetch_add(1);
bool last_guy_in = (prev + 1) == num_workers_;
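// Example (illustrative): with 4 workers, the fourth arrival observes prev == 3 and is the
// last guy in.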
// If we are the last worker to hit this sync point, we have some extra tasks
if (last_guy_in) {
MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " is the last one to sync. eoe sent as worker "
<< eoe_worker_id_;
// Prepare for sync
all_out_.Clear();
// Always flow eoe at the end
std::unique_ptr<DataBuffer> eoe_buffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE);
RETURN_IF_NOT_OK(out_connector_->Add(eoe_worker_id_, std::move(eoe_buffer)));
// If we're done then also flow the eof
if (*quitting) {
// The eof needs to be sent from the next sender in the round robin, so +1
int32_t eof_worker_id = (eoe_worker_id_ + 1) % num_workers_;
MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " has no more epochs. sending eof as worker "
<< eof_worker_id;
std::unique_ptr<DataBuffer> eof_buffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF);
RETURN_IF_NOT_OK(out_connector_->Add(eof_worker_id, std::move(eof_buffer)));
}
}
// Wait for the reset to wake us up if we're not quitting
if (!(*quitting)) {
MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " entering sync wait.";
RETURN_IF_NOT_OK(epoch_sync_wait_post_.Wait());
prev = guys_out_.fetch_add(1);
bool last_guy_out = (prev + 1) == num_workers_;
// Last guy out will clear the wait post and set the row counts
if (last_guy_out) {
MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " last guy out clearing wait post.";
epoch_sync_wait_post_.Clear();
guys_in_ = 0;
all_out_.Set();
}
}
MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " epoch sync complete.";
return Status::OK();
}
// The entry point code for when workers are launched
Status RandomDataOp::WorkerEntry(int32_t worker_id) {
MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " entry";
// handshake with the master first to tell it we're alive
TaskManager::FindMe()->Post();
bool quitting = false;
std::unique_ptr<TensorQTable> new_tensor_table = nullptr;
// Loop until the quitting variable gets set to true
do {
// If we have not yet reached the row count for this worker then produce another record
if (worker_rows_packed_[worker_id] < worker_max_rows_[worker_id]) {
TensorRow new_row;
// Start a new tensor table if needed
if (new_tensor_table == nullptr) {
new_tensor_table = std::make_unique<TensorQTable>();
}
// Create the data for the row
RETURN_IF_NOT_OK(CreateRandomRow(worker_id, &new_row));
// Add the row to our table
new_tensor_table->push_back(std::move(new_row));
worker_rows_packed_[worker_id]++;
// If the tensor table is at capacity then it's time to send it to output
if (new_tensor_table->size() == rows_per_buffer_) {
RETURN_IF_NOT_OK(PackAndSend(worker_id, std::move(new_tensor_table)));
}
} else {
// We've reached the total row count for this worker, so it's time for epoch sync.
// There are likely some records built but not sent yet, so take care of those first
// (this buffer will be smaller than rows_per_buffer)
if (new_tensor_table != nullptr && new_tensor_table->size() > 0) {
RETURN_IF_NOT_OK(PackAndSend(worker_id, std::move(new_tensor_table)));
}
// Now, let's enter the epoch sync
RETURN_IF_NOT_OK(EpochSync(worker_id, &quitting));
}
} while (!quitting);
MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " is now quitting.";
return Status::OK();
}
// A helper function to stuff the tensor table into a buffer and send it to output connector
Status RandomDataOp::PackAndSend(int32_t worker_id, std::unique_ptr<TensorQTable> in_table) {
auto new_buffer = std::make_unique<DataBuffer>(GetNextBufferId(), DataBuffer::kDeBFlagNone);
new_buffer->set_tensor_table(std::move(in_table));
new_buffer->set_column_name_map(column_name_map_);
RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(new_buffer)));
return Status::OK();
}
// A helper function to create random data for the row
Status RandomDataOp::CreateRandomRow(int32_t worker_id, TensorRow *new_row) {
if (new_row == nullptr) {
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Missing tensor row output");
}
// Create a tensor for each column, then add the tensor to the row
for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
const ColDescriptor current_col = data_schema_->column(i);
std::vector<dsize_t> current_shape = current_col.shape().AsVector();
std::unique_ptr<TensorShape> new_shape = nullptr;
std::unique_ptr<unsigned char[]> buf = nullptr;
std::shared_ptr<Tensor> new_tensor = nullptr;
// We need to resolve the shape to fill in any unknown dimensions with random
// values, then use that as our shape for this tensor.
for (int j = 0; j < current_shape.size(); ++j) {
if (current_shape[j] == TensorShape::kDimUnknown) {
current_shape[j] = static_cast<dsize_t>(GenRandomInt(1, kMaxDimValue));
}
}
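// Example (illustrative): a schema shape of {*, *, 3} might resolve to {64, 480, 3} for
// this particular row.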
new_shape = std::make_unique<TensorShape>(current_shape);
int64_t size_in_bytes = new_shape->NumOfElements() * current_col.type().SizeInBytes();
// Generate a random byte of data. This may cause some funny data for things like doubles, floats and bools;
// however, the random data op is not too concerned about the physical data itself.
std::uniform_int_distribution<uint8_t> uniDist(0, 255);
uint8_t random_byte = uniDist(rand_gen_);
// Now, create a chunk of memory for the entire tensor and copy this byte in repeatedly.
buf = std::make_unique<unsigned char[]>(size_in_bytes);
int ret_code = memset_s(buf.get(), size_in_bytes, random_byte, size_in_bytes);
if (ret_code != 0) {
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Failed to set random bytes for a tensor.");
}
RETURN_IF_NOT_OK(
Tensor::CreateTensor(&new_tensor, current_col.tensorImpl(), *new_shape, current_col.type(), buf.get()));
// Add this tensor to the tensor row for output
(*new_row).push_back(std::move(new_tensor));
}
return Status::OK();
}
// Overrides base class reset method. When an operator does a reset, it cleans up any state
// info from its previous execution and then initializes itself so that it can be executed
// again.
Status RandomDataOp::Reset() {
MS_LOG(INFO) << "RandomDataOp resetting.";
// Ensure all guys are in the waitpost
if (guys_in_ != num_workers_) {
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
"Issuing a reset, but some workers are missing from epochSync!");
}
// reset the row counters for all workers
for (int32_t w = 0; w < num_workers_; w++) {
worker_rows_packed_[w] = 0;
worker_max_rows_[w] = 0;
}
buffer_id_ = 0;
// Re-assign round robin row counts, starting from the worker after the one that gave
// the eoe last time
int32_t currentWorker = (eoe_worker_id_ + 1) % num_workers_;
for (int64_t r = 0; r < total_rows_; r++) {
worker_max_rows_[currentWorker]++;
currentWorker = (currentWorker + 1) % num_workers_;
}
// Compute which worker should get the eoe for the next epoch
eoe_worker_id_ = ((epoch_buffers_sent_ % num_workers_) + eoe_worker_id_) % num_workers_;
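// Example (illustrative numbers only): 7 buffers per epoch (6 data + 1 eoe) over 4 workers
// advances the eoe sender by 7 % 4 = 3 each epoch: worker 2, then 1, then 0, and so on.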
// Wake up the workers to get them going again in a new epoch
guys_out_ = 0;
epoch_sync_wait_post_.Set();
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@@ -0,0 +1,271 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_ENGINE_DATASETOPS_SOURCE_RANDOM_DATA_OP_
#define DATASET_ENGINE_DATASETOPS_SOURCE_RANDOM_DATA_OP_
#include <atomic>
#include <map>
#include <memory>
#include <mutex>
#include <random>
#include <string>
#include <vector>
#include <unordered_map>
#include <utility>
#include "dataset/util/status.h"
#include "dataset/core/tensor.h"
#include "dataset/core/data_type.h"
#include "dataset/engine/data_schema.h"
#include "dataset/engine/datasetops/parallel_op.h"
#include "dataset/util/wait_post.h"
namespace mindspore {
namespace dataset {
// The RandomDataOp is a leaf node storage operator that generates random data based
// on the schema specifications. Typically, it's used for testing and demonstrating
// various dataset operator pipelines. It is not "real" data to train with.
// The data that is created is just random and repeated bytes; there is no
// "meaning" behind what these bytes are.
class RandomDataOp : public ParallelOp {
public:
// Some constants to provide limits to random generation.
static constexpr int32_t kMaxNumColumns = 4;
static constexpr int32_t kMaxRank = 4;
static constexpr int32_t kMaxDimValue = 2048;
static constexpr int32_t kMaxDataType = (DataType::DE_UNKNOWN - 1);
static constexpr int32_t kMaxTotalRows = 1024;
// A nested builder class to aid in the construction of a RandomDataOp
class Builder {
public:
/**
* Builder constructor. Creates the builder object.
* @note No default args.
* @return This is a constructor.
*/
Builder();
/**
* Default destructor
*/
~Builder() = default;
/**
* The build method that produces the instantiated RandomDataOp as a shared pointer
* @param out_op - The output RandomDataOp that was constructed
* @return Status - The error code return
*/
Status Build(std::shared_ptr<RandomDataOp> *out_op);
/**
* Builder set method
* @param data_schema - A user-provided schema
* @return Builder - The modified builder by reference
*/
Builder &SetDataSchema(std::unique_ptr<DataSchema> data_schema) {
builder_data_schema_ = std::move(data_schema);
return *this;
}
/**
* Builder set method
* @param num_workers - The number of workers
* @return Builder - The modified builder by reference
*/
Builder &SetNumWorkers(int32_t num_workers) {
builder_num_workers_ = num_workers;
return *this;
}
/**
* Builder set method
* @param op_connector_size - The size of the output connector
* @return Builder - The modified builder by reference
*/
Builder &SetOpConnectorSize(int32_t op_connector_size) {
builder_op_connector_size_ = op_connector_size;
return *this;
}
/**
* Builder set method
* @param rows_per_buffer - The number of rows in each DataBuffer
* @return Builder - The modified builder by reference
*/
Builder &SetRowsPerBuffer(int64_t rows_per_buffer) {
builder_rows_per_buffer_ = rows_per_buffer;
return *this;
}
/**
* Builder set method
* @param total_rows - The total number of rows in the dataset
* @return Builder - The modified builder by reference
*/
Builder &SetTotalRows(int64_t total_rows) {
builder_total_rows_ = total_rows;
return *this;
}
private:
/**
* Check if the required parameters are set by the builder.
* @return Status - The error code return
*/
Status SanityCheck() const;
std::unique_ptr<DataSchema> builder_data_schema_;
int32_t builder_num_workers_;
int32_t builder_op_connector_size_;
int64_t builder_rows_per_buffer_;
int64_t builder_total_rows_;
}; // class Builder
/**
* Constructor for RandomDataOp
* @note Private constructor. Must use builder to construct.
* @param num_workers - The number of workers
* @param op_connector_size - The size of the output connector
* @param rows_per_buffer - The number of rows in each DataBuffer
* @param total_rows - The total number of rows in the dataset
* @param data_schema - A user-provided schema
* @return This is a constructor.
*/
RandomDataOp(int32_t num_workers, int32_t op_connector_size, int64_t rows_per_buffer, int64_t total_rows,
std::unique_ptr<DataSchema> data_schema);
/**
* Destructor
*/
~RandomDataOp() = default;
/**
* A print method typically used for debugging
* @param out - The output stream to write output to
* @param show_all - A bool to control if you want to show all info or just a summary
*/
void Print(std::ostream &out, bool show_all) const override;
/**
* << Stream output operator overload
* @note This allows you to write the debug print info using stream operators
* @param out - reference to the output stream being overloaded
* @param op - reference to the RandomDataOp to display
* @return - the output stream must be returned
*/
friend std::ostream &operator<<(std::ostream &out, const RandomDataOp &op) {
op.Print(out, false);
return out;
}
/**
* Class functor operator () override.
* All DatasetOps operate by launching a thread (see ExecutionTree). This class functor will
* provide the master loop that drives the logic for performing the work.
* @return Status - The error code return
*/
Status operator()() override;
/**
* Overrides base class reset method. When an operator does a reset, it cleans up any state
* info from its previous execution and then initializes itself so that it can be executed
* again.
* @return Status - The error code return
*/
Status Reset() override;
/**
* Quick getter for total rows.
*/
int64_t GetTotalRows() const { return total_rows_; }
private:
/**
* The entry point code for when workers are launched
* @param worker_id - The worker id
* @return Status - The error code return
*/
Status WorkerEntry(int32_t worker_id) override;
/**
* Helper function to produce a default/random schema if one didn't exist
* @return Status - The error code return
*/
Status GenerateSchema();
/**
* Performs a synchronization between workers at the end of an epoch
* @param worker_id - The worker id
* @param quitting - Output flag, set to true when there are no more epochs so the worker should exit
* @return Status - The error code return
*/
Status EpochSync(int32_t worker_id, bool *quitting);
/**
* A helper function to stuff the tensor table into a buffer and send it to output connector
* @param worker_id - The worker id
* @param in_table - The tensor table to pack and send
* @return Status - The error code return
*/
Status PackAndSend(int32_t worker_id, std::unique_ptr<TensorQTable> in_table);
/**
* A helper function to create random data for the row
* @param worker_id - The worker id
* @param new_row - The output row to produce
* @return Status - The error code return
*/
Status CreateRandomRow(int32_t worker_id, TensorRow *new_row);
/**
* A quick inline for producing a random number between (and including) min/max
* @param min - minimum number that can be generated
* @param max - maximum number that can be generated
* @return - The generated random number
*/
inline int32_t GenRandomInt(int32_t min, int32_t max) {
std::uniform_int_distribution<int32_t> uniDist(min, max);
return uniDist(rand_gen_);
}
/**
* A quick inline for producing the next buffer id in sequence, threadsafe
* @return - The next buffer id.
*/
inline int32_t GetNextBufferId() {
std::unique_lock<std::mutex> lock(buffer_id_mutex_);
return ++buffer_id_;
}
int32_t buffer_id_;
int64_t rows_per_buffer_;
int64_t total_rows_;
int64_t epoch_buffers_sent_;
std::atomic<int32_t> guys_in_;
std::atomic<int32_t> guys_out_;
int32_t eoe_worker_id_;
std::unique_ptr<DataSchema> data_schema_;
std::vector<int64_t> worker_max_rows_;
std::vector<int64_t> worker_rows_packed_;
std::unordered_map<std::string, int32_t> column_name_map_;
std::mt19937 rand_gen_;
WaitPost epoch_sync_wait_post_;
WaitPost all_out_;
std::mutex buffer_id_mutex_;
}; // class RandomDataOp
} // namespace dataset
} // namespace mindspore
#endif // DATASET_ENGINE_DATASETOPS_SOURCE_RANDOM_DATA_OP_

View File

@@ -21,7 +21,7 @@ can also create samplers with this module to sample data.
from .core.configuration import config
from .engine.datasets import StorageDataset, TFRecordDataset, ImageFolderDatasetV2, MnistDataset, MindDataset, \
GeneratorDataset, ManifestDataset, Cifar10Dataset, Cifar100Dataset, VOCDataset, CelebADataset, TextFileDataset, \
Schema, Shuffle, zip
Schema, Shuffle, zip, RandomDataset
from .engine.samplers import DistributedSampler, PKSampler, RandomSampler, SequentialSampler, SubsetRandomSampler, \
WeightedRandomSampler, Sampler
from .engine.serializer_deserializer import serialize, deserialize, show

View File

@@ -3148,6 +3148,57 @@ class Cifar100Dataset(SourceDataset):
return get_num_rows(num_rows, self.num_shards)
class RandomDataset(SourceDataset):
"""
A source dataset that generates random data.
Args:
num_samples (int): number of samples to generate.
schema (str or Schema, optional): Path to the json schema file or schema object (default=None).
If the schema is not provided, a random schema will be generated.
columns_list (list[str], optional): List of columns to be read (default=None, read all columns)
num_parallel_workers (int, optional): number of workers to read the data
(default=None, number set in the config).
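Examples:
>>> import mindspore.dataset as ds
>>> # A minimal sketch: with no schema given, a random schema is generated internally
>>> data = ds.RandomDataset(num_samples=10)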
"""
def __init__(self, schema=None, columns_list=None, num_samples=None, num_parallel_workers=None):
super().__init__(num_parallel_workers)
schema_obj = None
if (schema is not None) and (not isinstance(schema, Schema)):
schema_obj = Schema(schema) # read the schema file and convert to schema object to validate it
self.schema = schema
self.columns_list = columns_list
self.num_samples = num_samples
if schema_obj is not None and num_samples is None:
self.num_samples = schema_obj.num_rows
def get_args(self):
args = super().get_args()
if self.schema is not None:
if isinstance(self.schema, Schema):
self.schema.datasetType = 'Random'
if self.num_samples is not None:
self.schema.num_rows = self.num_samples
args["schema_json_string"] = self.schema.to_json()
else:
args["schema_file_path"] = self.schema
args["schema"] = self.schema
if self.columns_list is not None:
args["columns_list"] = self.columns_list
if self.num_samples is not None:
args["num_samples"] = self.num_samples
return args
def get_dataset_size(self):
"""
Get the number of batches in an epoch.
Return:
Number, number of batches.
"""
return self.num_samples
class Schema:
"""
Class to represent a schema of dataset.

View File

@@ -192,6 +192,8 @@ class Iterator:
op_type = OpName.CIFAR100
elif isinstance(dataset, de.CelebADataset):
op_type = OpName.CELEBA
elif isinstance(dataset, de.RandomDataset):
op_type = OpName.RANDOMDATA
elif isinstance(dataset, de.TextFileDataset):
op_type = OpName.TEXTFILE
else:

View File

@@ -0,0 +1,457 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/core/client.h"
#include "common/common.h"
#include "gtest/gtest.h"
#include <memory>
#include <vector>
#include <iostream>
#include "dataset/core/tensor_shape.h"
#include "dataset/engine/datasetops/source/random_data_op.h"
#include "dataset/engine/data_schema.h"
using namespace mindspore::dataset;
using mindspore::MsLogLevel::INFO;
using mindspore::ExceptionType::NoExceptionType;
using mindspore::LogStream;
class MindDataTestRandomDataOp : public UT::DatasetOpTesting {
};
// Test info:
// - Simple test with a user-provided schema generated purely from DataSchema C API
// - has an iteration loop
//
// Tree: single node tree with RandomDataOp
//
// RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic1) {
Status rc;
int32_t rank = 0; // not used
MS_LOG(INFO) << "UT test RandomDataOpBasic1";
// Start with an empty execution tree
auto myTree = std::make_shared<ExecutionTree>();
// Create a schema using the C APIs
std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();
// RandomDataOp can randomly fill in unknown dimension lengths of a shape.
// Most other ops cannot do that as they are limited by the physical data itself. We're
// more flexible with random data since it is just making stuff up on the fly.
TensorShape c1Shape({TensorShape::kDimUnknown, TensorShape::kDimUnknown, 3});
ColDescriptor c1("image",
DataType(DataType::DE_INT8),
TensorImpl::kFlexible,
rank, // not used
&c1Shape);
// Column 2 will just be a scalar label number
TensorShape c2Shape({}); // empty shape is a 1-value scalar Tensor
ColDescriptor c2("label",
DataType(DataType::DE_UINT32),
TensorImpl::kFlexible,
rank,
&c2Shape);
testSchema->AddColumn(c1);
testSchema->AddColumn(c2);
std::shared_ptr<RandomDataOp> myRandomDataOp;
RandomDataOp::Builder builder;
rc = builder.SetRowsPerBuffer(2)
.SetNumWorkers(1)
.SetDataSchema(std::move(testSchema))
.SetTotalRows(25)
.Build(&myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssociateNode(myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssignRoot(myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
std::ostringstream ss;
ss << *myRandomDataOp;
MS_LOG(INFO) << "RandomDataOp print: %s" << ss.str();
MS_LOG(INFO) << "Launching tree and begin iteration";
rc = myTree->Prepare();
EXPECT_TRUE(rc.IsOk());
rc = myTree->Launch();
EXPECT_TRUE(rc.IsOk());
// Start the loop of reading tensors from our pipeline
DatasetIterator dI(myTree);
TensorRow tensorList;
rc = dI.FetchNextTensorRow(&tensorList);
EXPECT_TRUE(rc.IsOk());
int rowCount = 0;
while (!tensorList.empty()) {
// Don't display these rows...too big to show
MS_LOG(INFO) << "Row fetched #: " << rowCount;
rc = dI.FetchNextTensorRow(&tensorList);
EXPECT_TRUE(rc.IsOk());
rowCount++;
}
ASSERT_EQ(rowCount, 25);
}
// Test info:
// - Simple test with a randomly generated schema
// - no iteration loop on this one, just create the op
//
// Tree: single node tree with RandomDataOp
//
// RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic2) {
Status rc;
MS_LOG(INFO) << "UT test RandomDataOpBasic2";
// Start with an empty execution tree
auto myTree = std::make_shared<ExecutionTree>();
std::shared_ptr<RandomDataOp> myRandomDataOp;
RandomDataOp::Builder builder;
rc = builder.SetRowsPerBuffer(2)
.SetNumWorkers(1)
.Build(&myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssociateNode(myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssignRoot(myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
std::ostringstream ss;
ss << *myRandomDataOp;
MS_LOG(INFO) << "RandomDataOp print: " << ss.str();
}
// Test info:
// - json file test with iteration
//
// Tree: single node tree with RandomDataOp
//
// RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic3) {
Status rc;
MS_LOG(INFO) << "UT test RandomDataOpBasic3";
// Start with an empty execution tree
auto myTree = std::make_shared<ExecutionTree>();
std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();
rc = testSchema->LoadSchemaFile(datasets_root_path_ + "/testRandomData/datasetSchema.json", {});
EXPECT_TRUE(rc.IsOk());
std::shared_ptr<RandomDataOp> myRandomDataOp;
RandomDataOp::Builder builder;
rc = builder.SetRowsPerBuffer(2)
.SetNumWorkers(1)
.SetDataSchema(std::move(testSchema))
.SetTotalRows(10)
.Build(&myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssociateNode(myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssignRoot(myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
std::ostringstream ss;
ss << *myRandomDataOp;
MS_LOG(INFO) << "RandomDataOp print: " << ss.str();
MS_LOG(INFO) << "Launching tree and begin iteration";
rc = myTree->Prepare();
EXPECT_TRUE(rc.IsOk());
rc = myTree->Launch();
EXPECT_TRUE(rc.IsOk());
// Start the loop of reading tensors from our pipeline
DatasetIterator dI(myTree);
TensorRow tensorList;
rc = dI.FetchNextTensorRow(&tensorList);
EXPECT_TRUE(rc.IsOk());
int rowCount = 0;
while (!tensorList.empty()) {
// Don't display these rows...too big to show
MS_LOG(INFO) << "Row fetched #: " << rowCount;
rc = dI.FetchNextTensorRow(&tensorList);
EXPECT_TRUE(rc.IsOk());
rowCount++;
}
ASSERT_EQ(rowCount, 10);
}
// Test info:
// - json schema input; it's a fairly simple one
// - has an iteration loop
//
// Tree: RepeatOp over RandomDataOp
//
// RepeatOp
// |
// RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic4) {
Status rc;
MS_LOG(INFO) << "UT test RandomDataOpBasic4";
// Start with an empty execution tree
auto myTree = std::make_shared<ExecutionTree>();
std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();
rc = testSchema->LoadSchemaFile(datasets_root_path_ + "/testRandomData/datasetSchema2.json", {});
EXPECT_TRUE(rc.IsOk());
std::shared_ptr<RandomDataOp> myRandomDataOp;
RandomDataOp::Builder builder;
rc = builder.SetRowsPerBuffer(2)
.SetNumWorkers(1)
.SetDataSchema(std::move(testSchema))
.SetTotalRows(10)
.Build(&myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssociateNode(myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
uint32_t numRepeats = 2;
std::shared_ptr<RepeatOp> myRepeatOp;
rc = RepeatOp::Builder(numRepeats)
.Build(&myRepeatOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssociateNode(myRepeatOp);
EXPECT_TRUE(rc.IsOk());
rc = myRepeatOp->AddChild(myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssignRoot(myRepeatOp);
EXPECT_TRUE(rc.IsOk());
MS_LOG(INFO) << "Launching tree and begin iteration";
rc = myTree->Prepare();
EXPECT_TRUE(rc.IsOk());
rc = myTree->Launch();
EXPECT_TRUE(rc.IsOk());
// Start the loop of reading tensors from our pipeline
DatasetIterator dI(myTree);
TensorRow tensorList;
rc = dI.FetchNextTensorRow(&tensorList);
EXPECT_TRUE(rc.IsOk());
int rowCount = 0;
while (!tensorList.empty()) {
MS_LOG(INFO) << "Row display for row #: " << rowCount;
// Display the tensor by calling the printer on it
for (int i = 0; i < tensorList.size(); i++) {
std::ostringstream ss;
ss << *tensorList[i] << std::endl;
MS_LOG(INFO) << "Tensor print: %s" << ss.str();
}
rc = dI.FetchNextTensorRow(&tensorList);
EXPECT_TRUE(rc.IsOk());
rowCount++;
}
ASSERT_EQ(rowCount, 20);
}
// Test info:
// - json schema input; it's a fairly simple one
// - has an iteration loop
// - same as RandomDataOpBasic4 except that this one will have parallel workers
//
// Tree: RepeatOp over RandomDataOp
//
// RepeatOp
// |
// RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic5) {
Status rc;
MS_LOG(INFO) << "UT test RandomDataOpBasic5";
// Start with an empty execution tree
auto myTree = std::make_shared<ExecutionTree>();
std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();
rc = testSchema->LoadSchemaFile(datasets_root_path_ + "/testRandomData/datasetSchema2.json", {});
EXPECT_TRUE(rc.IsOk());
std::shared_ptr<RandomDataOp> myRandomDataOp;
RandomDataOp::Builder builder;
rc = builder.SetRowsPerBuffer(2)
.SetNumWorkers(4)
.SetDataSchema(std::move(testSchema))
.SetTotalRows(10)
.Build(&myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssociateNode(myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
uint32_t numRepeats = 3;
std::shared_ptr<RepeatOp> myRepeatOp;
rc = RepeatOp::Builder(numRepeats)
.Build(&myRepeatOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssociateNode(myRepeatOp);
EXPECT_TRUE(rc.IsOk());
rc = myRepeatOp->AddChild(myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssignRoot(myRepeatOp);
EXPECT_TRUE(rc.IsOk());
MS_LOG(INFO) << "Launching tree and begin iteration";
rc = myTree->Prepare();
EXPECT_TRUE(rc.IsOk());
rc = myTree->Launch();
EXPECT_TRUE(rc.IsOk());
// Start the loop of reading tensors from our pipeline
DatasetIterator dI(myTree);
TensorRow tensorList;
rc = dI.FetchNextTensorRow(&tensorList);
EXPECT_TRUE(rc.IsOk());
int rowCount = 0;
while (!tensorList.empty()) {
MS_LOG(INFO) << "Row display for row #: " << rowCount;
// Display the tensor by calling the printer on it
for (int i = 0; i < tensorList.size(); i++) {
std::ostringstream ss;
ss << *tensorList[i] << std::endl;
MS_LOG(INFO) << "Tensor print: ", ss.str();
}
rc = dI.FetchNextTensorRow(&tensorList);
EXPECT_TRUE(rc.IsOk());
rowCount++;
}
ASSERT_EQ(rowCount, 30);
}
// Test info:
// - repeat shuffle random
//
// Tree: RepeatOp over ShuffleOp over RandomDataOp
//
// RepeatOp
// |
// ShuffleOp
// |
// RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpTree1) {
Status rc;
MS_LOG(INFO) << "UT test RandomDataOpTree1";
// Start with an empty execution tree
auto myTree = std::make_shared<ExecutionTree>();
std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();
rc = testSchema->LoadSchemaFile(datasets_root_path_ + "/testRandomData/datasetSchema2.json", {});
EXPECT_TRUE(rc.IsOk());
std::shared_ptr<RandomDataOp> myRandomDataOp;
RandomDataOp::Builder builder;
rc = builder.SetRowsPerBuffer(2)
.SetNumWorkers(4)
.SetDataSchema(std::move(testSchema))
.SetTotalRows(10)
.Build(&myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssociateNode(myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
std::shared_ptr<ShuffleOp> myShuffleOp;
rc = ShuffleOp::Builder()
.SetRowsPerBuffer(2)
.SetShuffleSize(4)
.Build(&myShuffleOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssociateNode(myShuffleOp);
EXPECT_TRUE(rc.IsOk());
uint32_t numRepeats = 3;
std::shared_ptr<RepeatOp> myRepeatOp;
rc = RepeatOp::Builder(numRepeats)
.Build(&myRepeatOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssociateNode(myRepeatOp);
EXPECT_TRUE(rc.IsOk());
rc = myRepeatOp->AddChild(myShuffleOp);
EXPECT_TRUE(rc.IsOk());
rc = myShuffleOp->AddChild(myRandomDataOp);
EXPECT_TRUE(rc.IsOk());
rc = myTree->AssignRoot(myRepeatOp);
EXPECT_TRUE(rc.IsOk());
MS_LOG(INFO) << "Launching tree and begin iteration";
rc = myTree->Prepare();
EXPECT_TRUE(rc.IsOk());
rc = myTree->Launch();
EXPECT_TRUE(rc.IsOk());
// Start the loop of reading tensors from our pipeline
DatasetIterator dI(myTree);
TensorRow tensorList;
rc = dI.FetchNextTensorRow(&tensorList);
EXPECT_TRUE(rc.IsOk());
int rowCount = 0;
while (!tensorList.empty()) {
MS_LOG(INFO) << "Row display for row #: " << rowCount;
// Display the tensor by calling the printer on it
for (int i = 0; i < tensorList.size(); i++) {
std::ostringstream ss;
ss << *tensorList[i] << std::endl;
MS_LOG(INFO) << "Tensor print: " << ss.str();
}
rc = dI.FetchNextTensorRow(&tensorList);
EXPECT_TRUE(rc.IsOk());
rowCount++;
}
ASSERT_EQ(rowCount, 30);
}

View File

@@ -0,0 +1,14 @@
{
"columns": {
"image": {
"type": "uint8",
"rank": 3,
"shape": [1920,1080,3]
},
"label": {
"type": "int32",
"rank": 1,
"shape": [1]
}
}
}

View File

@@ -0,0 +1,14 @@
{
"columns": {
"image": {
"type": "uint8",
"rank": 2,
"shape": [28,28]
},
"label": {
"type": "uint8",
"rank": 1,
"shape": [1]
}
}
}

View File

@@ -0,0 +1,71 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import mindspore.common.dtype as mstype
import mindspore.dataset as ds
from mindspore import log as logger
from pathlib import Path
# just a basic test with parallel random data op
def test_randomdataset_basic1():
logger.info("Test randomdataset basic")
schema = ds.Schema()
schema.add_column('image', de_type=mstype.uint8, shape=[2])
schema.add_column('label', de_type=mstype.uint8, shape=[1])
# apply dataset operations
ds1 = ds.RandomDataset(schema=schema, num_samples=50, num_parallel_workers=4)
ds1 = ds1.repeat(4)
num_iter = 0
for data in ds1.create_dict_iterator(): # each data is a dictionary
# in this example, each dictionary has keys "image" and "label"
logger.info("{} image: {}".format(num_iter, data["image"]))
logger.info("{} label: {}".format(num_iter, data["label"]))
num_iter += 1
logger.info("Number of data in ds1: ", num_iter)
assert(num_iter == 200)
# Another simple test
def test_randomdataset_basic2():
logger.info("Test randomdataset basic 2")
schema = ds.Schema()
schema.add_column('image', de_type=mstype.uint8, shape=[640,480,3]) # 921600 bytes (a bit less than 1 MB per image)
schema.add_column('label', de_type=mstype.uint8, shape=[1])
# Make up about 10 samples
ds1 = ds.RandomDataset(schema=schema, num_samples=10, num_parallel_workers=1)
# cache size allows for about 4 images since each image is just a bit less than 1MB; after that we will have to spill
ds1 = ds1.repeat(4)
num_iter = 0
for data in ds1.create_dict_iterator(): # each data is a dictionary
# in this example, each dictionary has keys "image" and "label"
#logger.info(data["image"])
logger.info("printing the label: {}".format(data["label"]))
num_iter += 1
logger.info("Number of data in ds1: ", num_iter)
assert(num_iter == 40)
if __name__ == '__main__':
test_randomdataset_basic1()
test_randomdataset_basic2()
logger.info('test_randomdataset_basic Ended.\n')