forked from mindspore-Ecosystem/mindspore
support padding samples
This commit is contained in:
parent
c51d90d84e
commit
feff8899ac
|
@ -32,6 +32,7 @@
|
|||
#include "dataset/engine/datasetops/source/text_file_op.h"
|
||||
#include "dataset/engine/datasetops/filter_op.h"
|
||||
#include "mindrecord/include/shard_category.h"
|
||||
#include "mindrecord/include/shard_distributed_sample.h"
|
||||
#include "mindrecord/include/shard_sample.h"
|
||||
#include "mindrecord/include/shard_shuffle.h"
|
||||
#include "dataset/util/random.h"
|
||||
|
@ -400,7 +401,7 @@ Status DEPipeline::CheckMindRecordPartitionInfo(const py::dict &args, std::vecto
|
|||
RETURN_STATUS_UNEXPECTED(err_msg);
|
||||
}
|
||||
|
||||
constexpr int kMaxPartitions = 64;
|
||||
constexpr int kMaxPartitions = 1024;
|
||||
if (in_partitions->at(0) <= 0 || in_partitions->at(0) > kMaxPartitions) {
|
||||
std::string err_msg = "Error: partitions is invalid or not set.";
|
||||
RETURN_STATUS_UNEXPECTED(err_msg);
|
||||
|
@ -438,6 +439,10 @@ Status DEPipeline::ParseMindRecordOp(const py::dict &args, std::shared_ptr<Datas
|
|||
(void)builder->SetColumnsToLoad(in_col_names);
|
||||
}
|
||||
|
||||
if (!args["padded_sample"].is_none()) {
|
||||
(void)builder->SetPaddedSample(args["padded_sample"]);
|
||||
(void)builder->SetNumToPadSamples(ToInt(args["num_padded"]));
|
||||
}
|
||||
std::vector<std::shared_ptr<mindrecord::ShardOperator>> operators;
|
||||
for (auto arg : args) {
|
||||
std::string key = py::str(arg.first);
|
||||
|
@ -447,14 +452,15 @@ Status DEPipeline::ParseMindRecordOp(const py::dict &args, std::shared_ptr<Datas
|
|||
(void)builder->SetNumMindRecordWorkers(ToInt(value));
|
||||
} else if (key == "block_reader" && ToBool(value) == true) {
|
||||
(void)builder->SetBlockReader();
|
||||
} else if (key == "global_shuffle" && ToBool(value) == true) {
|
||||
uint32_t seed = args["partitions"].is_none() ? GetSeed() : 0;
|
||||
} else if (key == "shuffle_option" && ToBool(value) == true) {
|
||||
if (!args["partitions"].is_none()) continue;
|
||||
uint32_t seed = GetSeed();
|
||||
operators.push_back(std::make_shared<mindrecord::ShardShuffle>(seed));
|
||||
} else if (key == "sampler") {
|
||||
auto create = py::reinterpret_borrow<py::object>(value).attr("_create_for_minddataset");
|
||||
std::shared_ptr<mindrecord::ShardOperator> sample_op =
|
||||
create().cast<std::shared_ptr<mindrecord::ShardOperator>>();
|
||||
operators.push_back(sample_op);
|
||||
auto sampler = py::reinterpret_borrow<py::object>(value);
|
||||
auto create = sampler.attr("_create_for_minddataset");
|
||||
auto op = create().cast<std::shared_ptr<mindrecord::ShardOperator>>();
|
||||
operators.push_back(op);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -465,7 +471,13 @@ Status DEPipeline::ParseMindRecordOp(const py::dict &args, std::shared_ptr<Datas
|
|||
if (Status::OK() != ret) {
|
||||
return ret;
|
||||
}
|
||||
operators.push_back(std::make_shared<mindrecord::ShardSample>(1, in_partitions[0], in_partitions[1]));
|
||||
auto shuffle = ToBool(args["shuffle_option"]);
|
||||
int num_padded = 0;
|
||||
if (!args["num_padded"].is_none()) {
|
||||
num_padded = ToInt(args["num_padded"]);
|
||||
}
|
||||
operators.push_back(
|
||||
std::make_shared<mindrecord::ShardDistributedSample>(in_partitions[0], in_partitions[1], num_padded, shuffle, 0));
|
||||
}
|
||||
|
||||
if (!operators.empty()) {
|
||||
|
|
|
@ -66,6 +66,7 @@
|
|||
#include "dataset/util/random.h"
|
||||
#include "mindrecord/include/shard_operator.h"
|
||||
#include "mindrecord/include/shard_pk_sample.h"
|
||||
#include "mindrecord/include/shard_distributed_sample.h"
|
||||
#include "mindrecord/include/shard_sample.h"
|
||||
#include "pybind11/pybind11.h"
|
||||
#include "pybind11/stl.h"
|
||||
|
@ -157,17 +158,17 @@ void bindDatasetOps(py::module *m) {
|
|||
});
|
||||
|
||||
(void)py::class_<MindRecordOp, DatasetOp, std::shared_ptr<MindRecordOp>>(*m, "MindRecordOp")
|
||||
.def_static("get_num_rows",
|
||||
[](const std::vector<std::string> &paths, bool load_dataset, const py::object &sampler) {
|
||||
int64_t count = 0;
|
||||
std::shared_ptr<mindrecord::ShardOperator> op;
|
||||
if (py::hasattr(sampler, "_create_for_minddataset")) {
|
||||
auto create = sampler.attr("_create_for_minddataset");
|
||||
op = create().cast<std::shared_ptr<mindrecord::ShardOperator>>();
|
||||
}
|
||||
THROW_IF_ERROR(MindRecordOp::CountTotalRows(paths, load_dataset, op, &count));
|
||||
return count;
|
||||
});
|
||||
.def_static("get_num_rows", [](const std::vector<std::string> &paths, bool load_dataset, const py::object &sampler,
|
||||
const int64_t num_padded) {
|
||||
int64_t count = 0;
|
||||
std::shared_ptr<mindrecord::ShardOperator> op;
|
||||
if (py::hasattr(sampler, "_create_for_minddataset")) {
|
||||
auto create = sampler.attr("_create_for_minddataset");
|
||||
op = create().cast<std::shared_ptr<mindrecord::ShardOperator>>();
|
||||
}
|
||||
THROW_IF_ERROR(MindRecordOp::CountTotalRows(paths, load_dataset, op, &count, num_padded));
|
||||
return count;
|
||||
});
|
||||
|
||||
(void)py::class_<ManifestOp, DatasetOp, std::shared_ptr<ManifestOp>>(*m, "ManifestOp")
|
||||
.def_static("get_num_rows_and_classes",
|
||||
|
@ -472,6 +473,7 @@ void bindSamplerOps(py::module *m) {
|
|||
(void)py::class_<mindrecord::ShardSample, mindrecord::ShardOperator, std::shared_ptr<mindrecord::ShardSample>>(
|
||||
*m, "MindrecordSubsetRandomSampler")
|
||||
.def(py::init<std::vector<int64_t>, uint32_t>(), py::arg("indices"), py::arg("seed") = GetSeed());
|
||||
|
||||
(void)py::class_<mindrecord::ShardPkSample, mindrecord::ShardOperator, std::shared_ptr<mindrecord::ShardPkSample>>(
|
||||
*m, "MindrecordPkSampler")
|
||||
.def(py::init([](int64_t kVal, std::string kColumn, bool shuffle) {
|
||||
|
|
|
@ -53,6 +53,8 @@ MindRecordOp::Builder::Builder() : build_dataset_file_({}) {
|
|||
build_op_connector_queue_size_ = cfg->op_connector_size();
|
||||
build_block_reader_ = false;
|
||||
builder_num_workers_ = 0;
|
||||
build_num_padded_ = 0;
|
||||
build_sample_ = nullptr;
|
||||
}
|
||||
|
||||
// The builder "build" method creates the final object.
|
||||
|
@ -63,24 +65,57 @@ Status MindRecordOp::Builder::Build(std::shared_ptr<MindRecordOp> *ptr) {
|
|||
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
|
||||
"Building a MindRecordOp that has not provided a file.");
|
||||
}
|
||||
|
||||
mindrecord::json sample_json;
|
||||
if (build_num_padded_ > 0) {
|
||||
sample_json = ToJson(build_sample_);
|
||||
}
|
||||
new_mind_record_op = std::make_shared<MindRecordOp>(
|
||||
build_num_mind_record_workers_, build_rows_per_buffer_, build_dataset_file_, build_load_dataset_,
|
||||
build_op_connector_queue_size_, build_columns_to_load_, build_operators_, build_block_reader_);
|
||||
build_op_connector_queue_size_, build_columns_to_load_, build_operators_, build_block_reader_, build_num_padded_,
|
||||
sample_json, build_sample_bytes_);
|
||||
|
||||
RETURN_IF_NOT_OK(new_mind_record_op->Init());
|
||||
|
||||
*ptr = std::move(new_mind_record_op);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Validation hook for the builder settings. It currently performs no checks and always returns Status::OK().
Status MindRecordOp::Builder::SanityCheck() const { return Status::OK(); }
|
||||
|
||||
// Convert a Python object describing the padded sample into a mindrecord::json value.
//
// Supported inputs:
//   - None   -> JSON null
//   - int    -> int64_t number
//   - float  -> double number
//   - str    -> string
//   - dict   -> JSON object, converted recursively. Values that are Python `bytes` are NOT placed in the
//               JSON; they are stored in build_sample_bytes_ under the stringified key instead. Nested
//               dicts share that same flat map, so a repeated key overwrites the earlier entry.
//
// Any other object type is reported through MS_LOG(ERROR) and a default-constructed json is returned.
//
// @param obj - borrowed handle to the Python object to convert
// @return the converted json value
mindrecord::json MindRecordOp::Builder::ToJson(const py::handle &obj) {
  if (obj.is_none()) {
    return nullptr;
  }
  if (py::isinstance<py::int_>(obj)) {
    return obj.cast<int64_t>();
  }
  if (py::isinstance<py::float_>(obj)) {
    return obj.cast<double>();
  }
  if (py::isinstance<py::str>(obj)) {  // also catch py::bytes (NOTE(review): depends on the pybind11 version)
    return obj.cast<std::string>();
  }
  if (py::isinstance<py::dict>(obj)) {
    auto out = mindrecord::json::object();
    for (const py::handle &key : obj) {
      const std::string name = py::str(key).cast<std::string>();
      // Look the item up once; the object keeps it alive while it is inspected and converted.
      const py::object value = obj[key];
      // The bytes check must come first: raw bytes are kept out of the JSON document.
      if (py::isinstance<py::bytes>(value)) {
        build_sample_bytes_[name] = value.cast<std::string>();
      } else {
        out[name] = ToJson(value);
      }
    }
    return out;
  }
  // Use str(obj) for the message: a direct cast to std::string throws for objects that are not
  // strings (list, tuple, ...), which are exactly the objects that reach this point.
  MS_LOG(ERROR) << "Python object convert to json failed, object is: " << py::str(obj).cast<std::string>();
  return mindrecord::json();
}
|
||||
|
||||
// Constructor of the MindRecordOp.
|
||||
MindRecordOp::MindRecordOp(int32_t num_mind_record_workers, int32_t rows_per_buffer,
|
||||
std::vector<std::string> dataset_file, bool load_dataset, int32_t op_connector_queue_size,
|
||||
const std::vector<std::string> &columns_to_load,
|
||||
const std::vector<std::shared_ptr<ShardOperator>> &operators, const bool &block_reader)
|
||||
const std::vector<std::shared_ptr<ShardOperator>> &operators, const bool &block_reader,
|
||||
int64_t num_padded, const mindrecord::json &sample_json,
|
||||
const std::map<std::string, std::string> &sample_bytes)
|
||||
: ParallelOp(num_mind_record_workers, op_connector_queue_size),
|
||||
rows_per_buffer_(rows_per_buffer),
|
||||
dataset_file_(dataset_file),
|
||||
|
@ -92,7 +127,10 @@ MindRecordOp::MindRecordOp(int32_t num_mind_record_workers, int32_t rows_per_buf
|
|||
buffers_needed_(0),
|
||||
buf_cnt_(0),
|
||||
ended_worker_(0),
|
||||
buffer_water_mark_(0) {
|
||||
buffer_water_mark_(0),
|
||||
num_padded_(num_padded),
|
||||
sample_json_(sample_json),
|
||||
sample_bytes_(sample_bytes) {
|
||||
io_blk_queues_.Init(num_workers_, op_connector_queue_size);
|
||||
if (!block_reader_) return;
|
||||
for (int32_t i = 0; i < num_workers_; ++i) {
|
||||
|
@ -104,7 +142,7 @@ MindRecordOp::MindRecordOp(int32_t num_mind_record_workers, int32_t rows_per_buf
|
|||
Status MindRecordOp::Init() {
|
||||
shard_reader_ = std::make_unique<ShardReader>();
|
||||
auto rc = shard_reader_->Open(dataset_file_, load_dataset_, num_mind_record_workers_, columns_to_load_, operators_,
|
||||
block_reader_);
|
||||
block_reader_, num_padded_);
|
||||
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(rc == MSRStatus::SUCCESS,
|
||||
"MindRecordOp init failed. Error message: " + ErrnoToMessage(rc));
|
||||
|
@ -161,10 +199,6 @@ Status MindRecordOp::Init() {
|
|||
column_name_id_map_[columns_to_load_[i]] = i;
|
||||
}
|
||||
|
||||
num_rows_ = shard_reader_->GetNumRows();
|
||||
// Compute how many buffers we would need to accomplish rowsPerBuffer
|
||||
buffers_needed_ = (num_rows_ + rows_per_buffer_ - 1) / rows_per_buffer_;
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
@ -261,20 +295,30 @@ Status MindRecordOp::GetBufferFromReader(std::unique_ptr<DataBuffer> *fetched_bu
|
|||
std::unique_ptr<TensorQTable> tensor_table = std::make_unique<TensorQTable>();
|
||||
for (int32_t i = 0; i < rows_per_buffer_; ++i) {
|
||||
ShardTuple tupled_buffer;
|
||||
mindrecord::TaskType task_type = mindrecord::TaskType::kCommonTask;
|
||||
if (block_reader_) {
|
||||
if (i >= block_buffer_[buffer_id % num_workers_]->size()) break;
|
||||
tupled_buffer = block_buffer_[buffer_id % num_workers_]->at(i);
|
||||
} else {
|
||||
int32_t row_id = buffer_id * rows_per_buffer_ + i;
|
||||
tupled_buffer = shard_reader_->GetNextById(row_id, worker_id);
|
||||
auto rc = shard_reader_->GetNextById(row_id, worker_id);
|
||||
task_type = rc.first;
|
||||
tupled_buffer = rc.second;
|
||||
if (task_type == mindrecord::TaskType::kPaddedTask) {
|
||||
TensorRow tensor_row;
|
||||
RETURN_IF_NOT_OK(LoadTensorRow(&tensor_row, {}, mindrecord::json(), task_type));
|
||||
tensor_table->push_back(std::move(tensor_row));
|
||||
}
|
||||
if (tupled_buffer.empty()) break;
|
||||
}
|
||||
for (const auto &tupled_row : tupled_buffer) {
|
||||
std::vector<uint8_t> columns_blob = std::get<0>(tupled_row);
|
||||
mindrecord::json columns_json = std::get<1>(tupled_row);
|
||||
TensorRow tensor_row;
|
||||
RETURN_IF_NOT_OK(LoadTensorRow(&tensor_row, columns_blob, columns_json));
|
||||
tensor_table->push_back(std::move(tensor_row));
|
||||
if (task_type == mindrecord::TaskType::kCommonTask) {
|
||||
for (const auto &tupled_row : tupled_buffer) {
|
||||
std::vector<uint8_t> columns_blob = std::get<0>(tupled_row);
|
||||
mindrecord::json columns_json = std::get<1>(tupled_row);
|
||||
TensorRow tensor_row;
|
||||
RETURN_IF_NOT_OK(LoadTensorRow(&tensor_row, columns_blob, columns_json, task_type));
|
||||
tensor_table->push_back(std::move(tensor_row));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -284,7 +328,7 @@ Status MindRecordOp::GetBufferFromReader(std::unique_ptr<DataBuffer> *fetched_bu
|
|||
}
|
||||
|
||||
Status MindRecordOp::LoadTensorRow(TensorRow *tensor_row, const std::vector<uint8_t> &columns_blob,
|
||||
const mindrecord::json &columns_json) {
|
||||
const mindrecord::json &columns_json, const mindrecord::TaskType task_type) {
|
||||
for (uint32_t i_col = 0; i_col < columns_to_load_.size(); i_col++) {
|
||||
auto column_name = columns_to_load_[i_col];
|
||||
|
||||
|
@ -297,11 +341,39 @@ Status MindRecordOp::LoadTensorRow(TensorRow *tensor_row, const std::vector<uint
|
|||
std::vector<int64_t> column_shape;
|
||||
|
||||
// Get column data
|
||||
auto has_column = shard_reader_->GetShardColumn()->GetColumnValueByName(
|
||||
column_name, columns_blob, columns_json, &data, &data_ptr, &n_bytes, &column_data_type, &column_data_type_size,
|
||||
&column_shape);
|
||||
if (has_column == MSRStatus::FAILED) {
|
||||
RETURN_STATUS_UNEXPECTED("Failed to retrieve data from mindrecord reader.");
|
||||
auto shard_column = shard_reader_->GetShardColumn();
|
||||
if (num_padded_ > 0 && task_type == mindrecord::TaskType::kPaddedTask) {
|
||||
auto rc =
|
||||
shard_column->GetColumnTypeByName(column_name, &column_data_type, &column_data_type_size, &column_shape);
|
||||
if (rc.first != MSRStatus::SUCCESS) {
|
||||
RETURN_STATUS_UNEXPECTED("Failed to retrieve data type.");
|
||||
}
|
||||
if (rc.second == mindrecord::ColumnInRaw) {
|
||||
auto has_column = shard_column->GetColumnFromJson(column_name, sample_json_, &data_ptr, &n_bytes);
|
||||
if (has_column == MSRStatus::FAILED) {
|
||||
RETURN_STATUS_UNEXPECTED("Failed to retrieve raw data from padding sample.");
|
||||
}
|
||||
} else if (rc.second == mindrecord::ColumnInBlob) {
|
||||
if (sample_bytes_.find(column_name) == sample_bytes_.end()) {
|
||||
RETURN_STATUS_UNEXPECTED("Failed to retrieve blob data from padding sample.");
|
||||
}
|
||||
std::string ss(sample_bytes_[column_name]);
|
||||
n_bytes = ss.size();
|
||||
data_ptr = std::make_unique<unsigned char[]>(n_bytes);
|
||||
std::copy(ss.begin(), ss.end(), data_ptr.get());
|
||||
} else {
|
||||
RETURN_STATUS_UNEXPECTED("Retrieved data type is unknown.");
|
||||
}
|
||||
if (data == nullptr) {
|
||||
data = reinterpret_cast<const unsigned char *>(data_ptr.get());
|
||||
}
|
||||
} else {
|
||||
auto has_column =
|
||||
shard_column->GetColumnValueByName(column_name, columns_blob, columns_json, &data, &data_ptr, &n_bytes,
|
||||
&column_data_type, &column_data_type_size, &column_shape);
|
||||
if (has_column == MSRStatus::FAILED) {
|
||||
RETURN_STATUS_UNEXPECTED("Failed to retrieve data from mindrecord reader.");
|
||||
}
|
||||
}
|
||||
|
||||
std::shared_ptr<Tensor> tensor;
|
||||
|
@ -334,7 +406,8 @@ Status MindRecordOp::FetchBlockBuffer(const int32_t &buffer_id) {
|
|||
}
|
||||
for (int32_t i = 0; i < rows_per_buffer_; i++) {
|
||||
// Block reader does NOT care about argument
|
||||
ShardTuple tuple_buffer = shard_reader_->GetNextById(i, i);
|
||||
auto rc = shard_reader_->GetNextById(i, i);
|
||||
ShardTuple tuple_buffer = rc.second;
|
||||
if (tuple_buffer.empty()) break;
|
||||
block_buffer_[buffer_id % num_workers_]->push_back(std::move(tuple_buffer));
|
||||
}
|
||||
|
@ -348,11 +421,8 @@ Status MindRecordOp::FetchBlockBuffer(const int32_t &buffer_id) {
|
|||
Status MindRecordOp::operator()() {
|
||||
RETURN_IF_NOT_OK(LaunchThreadAndInitOp());
|
||||
num_rows_ = shard_reader_->GetNumRows();
|
||||
|
||||
buffers_needed_ = num_rows_ / rows_per_buffer_;
|
||||
if (num_rows_ % rows_per_buffer_ != 0) {
|
||||
buffers_needed_++;
|
||||
}
|
||||
// Compute how many buffers we would need to accomplish rowsPerBuffer
|
||||
buffers_needed_ = (num_rows_ + rows_per_buffer_ - 1) / rows_per_buffer_;
|
||||
|
||||
while (true) { // each iterator is 1 epoch
|
||||
for (int32_t i = 0; i < buffers_needed_; ++i) {
|
||||
|
@ -417,9 +487,9 @@ Status MindRecordOp::LaunchThreadAndInitOp() {
|
|||
}
|
||||
|
||||
Status MindRecordOp::CountTotalRows(const std::vector<std::string> dataset_path, bool load_dataset,
|
||||
const std::shared_ptr<ShardOperator> &op, int64_t *count) {
|
||||
const std::shared_ptr<ShardOperator> &op, int64_t *count, int64_t num_padded) {
|
||||
std::unique_ptr<ShardReader> shard_reader = std::make_unique<ShardReader>();
|
||||
MSRStatus rc = shard_reader->CountTotalRows(dataset_path, load_dataset, op, count);
|
||||
MSRStatus rc = shard_reader->CountTotalRows(dataset_path, load_dataset, op, count, num_padded);
|
||||
if (rc == MSRStatus::FAILED) {
|
||||
RETURN_STATUS_UNEXPECTED("MindRecordOp count total rows failed.");
|
||||
}
|
||||
|
|
|
@ -104,10 +104,22 @@ class MindRecordOp : public ParallelOp {
|
|||
return *this;
|
||||
}
|
||||
|
||||
// Setter method: number of padded samples to request from the MindRecordOp being built.
// Build() only converts the padded sample to json when this value is greater than zero.
// @param num_padded - number of padded samples
// @return Builder - setter method returns reference to the builder.
Builder &SetNumToPadSamples(int64_t num_padded) {
  build_num_padded_ = num_padded;
  return *this;
}
|
||||
|
||||
// Setter method: Python object describing the sample used for padding.
// NOTE(review): the value is stored as a py::handle, which does not take a reference on the Python
// object, so the caller must keep the object alive until Build() has converted it.
// @param sample - handle to the Python padded-sample object
// @return Builder - setter method returns reference to the builder.
Builder &SetPaddedSample(const py::handle &sample) {
  build_sample_ = sample;
  return *this;
}
|
||||
|
||||
Status SanityCheck() const;
|
||||
|
||||
static int32_t num_mind_record_workers() { return kDefaultMindRecordWorkers; }
|
||||
|
||||
mindrecord::json ToJson(const py::handle &obj);
|
||||
|
||||
private:
|
||||
static constexpr int32_t kDefaultMindRecordWorkers = 4;
|
||||
// The builder saves all MindRecordOp construction arguments internally.
|
||||
|
@ -121,6 +133,9 @@ class MindRecordOp : public ParallelOp {
|
|||
std::vector<std::string> build_columns_to_load_;
|
||||
std::vector<std::shared_ptr<ShardOperator>> build_operators_;
|
||||
bool build_block_reader_;
|
||||
int64_t build_num_padded_;
|
||||
py::handle build_sample_;
|
||||
std::map<std::string, std::string> build_sample_bytes_;
|
||||
};
|
||||
|
||||
// Constructor of the MindRecordOp.
|
||||
|
@ -133,7 +148,9 @@ class MindRecordOp : public ParallelOp {
|
|||
// @param operators - ShardOperators for Shuffle, Category, Sample
|
||||
MindRecordOp(int32_t num_mind_record_workers, int32_t rows_per_buffer, std::vector<std::string> dataset_file,
|
||||
bool load_dataset, int32_t op_connector_queue_size, const std::vector<std::string> &columns_to_load,
|
||||
const std::vector<std::shared_ptr<ShardOperator>> &operators, const bool &block_reader);
|
||||
const std::vector<std::shared_ptr<ShardOperator>> &operators, const bool &block_reader,
|
||||
int64_t num_padded_, const mindrecord::json &sample_json,
|
||||
const std::map<std::string, std::string> &sample_bytes_);
|
||||
|
||||
// Destructor
|
||||
~MindRecordOp() override;
|
||||
|
@ -178,7 +195,7 @@ class MindRecordOp : public ParallelOp {
|
|||
int32_t num_rows() const { return num_rows_; }
|
||||
|
||||
static Status CountTotalRows(const std::vector<std::string> dataset_path, bool load_dataset,
|
||||
const std::shared_ptr<ShardOperator> &op, int64_t *count);
|
||||
const std::shared_ptr<ShardOperator> &op, int64_t *count, int64_t num_padded);
|
||||
|
||||
// Getter method
|
||||
int32_t rows_per_buffer() const { return rows_per_buffer_; }
|
||||
|
@ -209,7 +226,7 @@ class MindRecordOp : public ParallelOp {
|
|||
// @param columns_blob - the blob data received from the reader
|
||||
// @param columns_json - the data for fields received from the reader
|
||||
Status LoadTensorRow(TensorRow *tensor_row, const std::vector<uint8_t> &columns_blob,
|
||||
const mindrecord::json &columns_json);
|
||||
const mindrecord::json &columns_json, const mindrecord::TaskType task_type);
|
||||
|
||||
Status FetchBlockBuffer(const int32_t &buffer_id);
|
||||
|
||||
|
@ -226,6 +243,10 @@ class MindRecordOp : public ParallelOp {
|
|||
std::atomic<int32_t> ended_worker_;
|
||||
std::atomic<int32_t> buffer_water_mark_;
|
||||
|
||||
int64_t num_padded_;
|
||||
mindrecord::json sample_json_;
|
||||
std::map<std::string, std::string> sample_bytes_;
|
||||
|
||||
std::unique_ptr<DataSchema> data_schema_; // Data schema for column typing
|
||||
std::vector<std::string> columns_blob_; // Blob Columns to load from dataset
|
||||
std::vector<int32_t> columns_blob_index_; // Blob Columns to load from dataset
|
||||
|
|
|
@ -203,7 +203,8 @@ Status GraphLoader::LoadFeatureIndex(const std::string &key, const std::vector<u
|
|||
Status GraphLoader::WorkerEntry(int32_t worker_id) {
|
||||
// Handshake
|
||||
TaskManager::FindMe()->Post();
|
||||
ShardTuple rows = shard_reader_->GetNextById(row_id_++, worker_id);
|
||||
auto ret = shard_reader_->GetNextById(row_id_++, worker_id);
|
||||
ShardTuple rows = ret.second;
|
||||
while (rows.empty() == false) {
|
||||
RETURN_IF_INTERRUPTED();
|
||||
for (const auto &tupled_row : rows) {
|
||||
|
@ -224,7 +225,8 @@ Status GraphLoader::WorkerEntry(int32_t worker_id) {
|
|||
MS_LOG(WARNING) << "attribute:" << attr << " is neither edge nor node.";
|
||||
}
|
||||
}
|
||||
rows = shard_reader_->GetNextById(row_id_++, worker_id);
|
||||
auto rc = shard_reader_->GetNextById(row_id_++, worker_id);
|
||||
rows = rc.second;
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
|
|
@ -73,6 +73,10 @@ enum ShardType {
|
|||
kCV = 1,
|
||||
};
|
||||
|
||||
// Kind of task handled by the shard reader.
enum TaskType {
  kCommonTask = 0,  // task created from row groups / row offsets of the dataset files
  kPaddedTask = 1,  // task appended for a padded sample (inserted num_padded times when tasks are created)
};
|
||||
enum SamplerType { kCustomTopNSampler, kCustomTopPercentSampler, kSubsetRandomSampler, kPKSampler };
|
||||
|
||||
enum ShuffleType { kShuffleCategory, kShuffleSample };
|
||||
|
|
|
@ -89,12 +89,16 @@ class ShardColumn {
|
|||
MSRStatus GetColumnFromBlob(const std::string &column_name, const std::vector<uint8_t> &columns_blob,
|
||||
const unsigned char **data, std::unique_ptr<unsigned char[]> *data_ptr,
|
||||
uint64_t *n_bytes);
|
||||
std::pair<MSRStatus, ColumnCategory> GetColumnTypeByName(const std::string &column_name,
|
||||
ColumnDataType *column_data_type,
|
||||
uint64_t *column_data_type_size,
|
||||
std::vector<int64_t> *column_shape);
|
||||
|
||||
private:
|
||||
/// \brief get column value from json
|
||||
MSRStatus GetColumnFromJson(const std::string &column_name, const json &columns_json,
|
||||
std::unique_ptr<unsigned char[]> *data_ptr, uint64_t *n_bytes);
|
||||
|
||||
private:
|
||||
/// \brief get float value from json
|
||||
template <typename T>
|
||||
MSRStatus GetFloat(std::unique_ptr<unsigned char[]> *data_ptr, const json &json_column_value, bool use_double);
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDRECORD_INCLUDE_SHARD_DISTRIBUTED_SAMPLE_H_
|
||||
#define MINDRECORD_INCLUDE_SHARD_DISTRIBUTED_SAMPLE_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "mindrecord/include/shard_operator.h"
|
||||
#include "mindrecord/include/shard_shuffle.h"
|
||||
#include "mindrecord/include/shard_sample.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace mindrecord {
|
||||
class ShardDistributedSample : public ShardSample {
|
||||
public:
|
||||
ShardDistributedSample(int num_shards, int shard_id, int no_of_padded_samples, bool shuffle, uint32_t seed);
|
||||
|
||||
~ShardDistributedSample() override{};
|
||||
|
||||
MSRStatus PreExecute(ShardTask &tasks) override;
|
||||
|
||||
int64_t GetNumSamples(int64_t dataset_size, int64_t num_classes) override;
|
||||
|
||||
private:
|
||||
bool shuffle_;
|
||||
int no_of_padded_samples_;
|
||||
};
|
||||
} // namespace mindrecord
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDRECORD_INCLUDE_SHARD_DISTRIBUTED_SAMPLE_H_
|
|
@ -58,7 +58,8 @@ using ROW_GROUPS =
|
|||
std::tuple<MSRStatus, std::vector<std::vector<std::vector<uint64_t>>>, std::vector<std::vector<json>>>;
|
||||
using ROW_GROUP_BRIEF =
|
||||
std::tuple<MSRStatus, std::string, int, uint64_t, std::vector<std::vector<uint64_t>>, std::vector<json>>;
|
||||
using TASK_RETURN_CONTENT = std::pair<MSRStatus, std::vector<std::tuple<std::vector<uint8_t>, json>>>;
|
||||
using TASK_RETURN_CONTENT =
|
||||
std::pair<MSRStatus, std::pair<TaskType, std::vector<std::tuple<std::vector<uint8_t>, json>>>>;
|
||||
const int kNumBatchInMap = 1000; // iterator buffer size in row-reader mode
|
||||
const int kNumPageInBuffer = 16; // page buffer size in block-reader mode
|
||||
|
||||
|
@ -78,7 +79,8 @@ class ShardReader {
|
|||
/// \return MSRStatus the status of MSRStatus
|
||||
MSRStatus Open(const std::vector<std::string> &file_paths, bool load_dataset, int n_consumer = 4,
|
||||
const std::vector<std::string> &selected_columns = {},
|
||||
const std::vector<std::shared_ptr<ShardOperator>> &operators = {}, const bool &block_reader = false);
|
||||
const std::vector<std::shared_ptr<ShardOperator>> &operators = {}, const bool &block_reader = false,
|
||||
const int num_padded = 0);
|
||||
|
||||
/// \brief open files and initialize reader, python API
|
||||
/// \param[in] file_paths the path of ONE file, any file in dataset is fine or file list
|
||||
|
@ -127,7 +129,7 @@ class ShardReader {
|
|||
/// \param[out] count # of rows
|
||||
/// \return MSRStatus the status of MSRStatus
|
||||
MSRStatus CountTotalRows(const std::vector<std::string> &file_paths, bool load_dataset,
|
||||
const std::shared_ptr<ShardOperator> &op, int64_t *count);
|
||||
const std::shared_ptr<ShardOperator> &op, int64_t *count, const int num_padded);
|
||||
|
||||
/// \brief shuffle task with incremental seed
|
||||
/// \return void
|
||||
|
@ -182,7 +184,8 @@ class ShardReader {
|
|||
|
||||
/// \brief return a row by id
|
||||
/// \return a batch of images and image data
|
||||
std::vector<std::tuple<std::vector<uint8_t>, json>> GetNextById(const int64_t &task_id, const int32_t &consumer_id);
|
||||
std::pair<TaskType, std::vector<std::tuple<std::vector<uint8_t>, json>>> GetNextById(const int64_t &task_id,
|
||||
const int32_t &consumer_id);
|
||||
|
||||
/// \brief return a batch in block-reader mode, given that one is ready
|
||||
/// \return a batch of images and image data
|
||||
|
@ -330,6 +333,8 @@ class ShardReader {
|
|||
bool all_in_index_ = true; // if all columns are stored in index-table
|
||||
bool interrupt_ = false; // reader interrupted
|
||||
|
||||
int num_padded_; // number of padding samples
|
||||
|
||||
// Delivery/Iterator mode begin
|
||||
const std::string kThreadName = "THRD_ITER_"; // prefix of thread name
|
||||
std::vector<std::thread> thread_set_; // thread list
|
||||
|
|
|
@ -38,22 +38,22 @@ class ShardSample : public ShardOperator {
|
|||
|
||||
~ShardSample() override{};
|
||||
|
||||
const std::pair<int, int> GetPartitions() const;
|
||||
|
||||
MSRStatus Execute(ShardTask &tasks) override;
|
||||
|
||||
MSRStatus SufExecute(ShardTask &tasks) override;
|
||||
|
||||
int64_t GetNumSamples(int64_t dataset_size, int64_t num_classes) override;
|
||||
|
||||
private:
|
||||
protected:
|
||||
int numerator_;
|
||||
int denominator_;
|
||||
int no_of_samples_;
|
||||
int partition_id_;
|
||||
std::shared_ptr<ShardShuffle> shuffle_op_;
|
||||
|
||||
private:
|
||||
int no_of_samples_;
|
||||
std::vector<int64_t> indices_;
|
||||
SamplerType sampler_type_;
|
||||
std::shared_ptr<ShardShuffle> shuffle_op_;
|
||||
};
|
||||
} // namespace mindrecord
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -29,9 +29,10 @@ class ShardTask {
|
|||
public:
|
||||
void MakePerm();
|
||||
|
||||
void InsertTask(int shard_id, int group_id, const std::vector<uint64_t> &offset, const json &label);
|
||||
void InsertTask(TaskType task_type, int shard_id, int group_id, const std::vector<uint64_t> &offset,
|
||||
const json &label);
|
||||
|
||||
void InsertTask(std::tuple<std::tuple<int, int>, std::vector<uint64_t>, json> task);
|
||||
void InsertTask(std::tuple<TaskType, std::tuple<int, int>, std::vector<uint64_t>, json> task);
|
||||
|
||||
void PopBack();
|
||||
|
||||
|
@ -39,15 +40,15 @@ class ShardTask {
|
|||
|
||||
uint32_t SizeOfRows() const;
|
||||
|
||||
std::tuple<std::tuple<int, int>, std::vector<uint64_t>, json> &GetTaskByID(size_t id);
|
||||
std::tuple<TaskType, std::tuple<int, int>, std::vector<uint64_t>, json> &GetTaskByID(size_t id);
|
||||
|
||||
std::tuple<std::tuple<int, int>, std::vector<uint64_t>, json> &GetRandomTask();
|
||||
std::tuple<TaskType, std::tuple<int, int>, std::vector<uint64_t>, json> &GetRandomTask();
|
||||
|
||||
static ShardTask Combine(std::vector<ShardTask> &category_tasks, bool replacement, int64_t num_elements);
|
||||
|
||||
uint32_t categories = 1;
|
||||
|
||||
std::vector<std::tuple<std::tuple<int, int>, std::vector<uint64_t>, json>> task_list_;
|
||||
std::vector<std::tuple<TaskType, std::tuple<int, int>, std::vector<uint64_t>, json>> task_list_;
|
||||
std::vector<int> permutation_;
|
||||
};
|
||||
} // namespace mindrecord
|
||||
|
|
|
@ -45,6 +45,7 @@ ShardReader::ShardReader() {
|
|||
row_id_ = 0;
|
||||
num_blocks_ = 0;
|
||||
block_reader_ = false;
|
||||
num_padded_ = 0;
|
||||
}
|
||||
|
||||
std::pair<MSRStatus, std::vector<std::string>> ShardReader::GetMeta(const std::string &file_path, json &meta_data) {
|
||||
|
@ -790,7 +791,7 @@ int64_t ShardReader::GetNumClasses(const std::string &category_field) {
|
|||
}
|
||||
|
||||
MSRStatus ShardReader::CountTotalRows(const std::vector<std::string> &file_paths, bool load_dataset,
|
||||
const std::shared_ptr<ShardOperator> &op, int64_t *count) {
|
||||
const std::shared_ptr<ShardOperator> &op, int64_t *count, const int num_padded) {
|
||||
if (SUCCESS != Init(file_paths, load_dataset)) {
|
||||
return FAILED;
|
||||
}
|
||||
|
@ -802,11 +803,12 @@ MSRStatus ShardReader::CountTotalRows(const std::vector<std::string> &file_paths
|
|||
num_samples = category_op->GetNumSamples(num_rows_, num_classes);
|
||||
} else if (std::dynamic_pointer_cast<ShardSample>(op)) {
|
||||
num_samples = op->GetNumSamples(num_rows_, 0);
|
||||
if (-1 == num_samples) {
|
||||
MS_LOG(ERROR) << "Dataset size plus number of padded samples is not divisible by number of shards.";
|
||||
return FAILED;
|
||||
}
|
||||
} else {
|
||||
}
|
||||
if (-1 == num_samples) {
|
||||
MS_LOG(ERROR) << "Failed to get dataset size.";
|
||||
return FAILED;
|
||||
if (num_padded > 0) num_samples += num_padded;
|
||||
}
|
||||
*count = num_samples;
|
||||
return SUCCESS;
|
||||
|
@ -814,7 +816,8 @@ MSRStatus ShardReader::CountTotalRows(const std::vector<std::string> &file_paths
|
|||
|
||||
MSRStatus ShardReader::Open(const std::vector<std::string> &file_paths, bool load_dataset, int n_consumer,
|
||||
const std::vector<std::string> &selected_columns,
|
||||
const std::vector<std::shared_ptr<ShardOperator>> &operators, const bool &block_reader) {
|
||||
const std::vector<std::shared_ptr<ShardOperator>> &operators, const bool &block_reader,
|
||||
int num_padded) {
|
||||
// Open file and set header by ShardReader
|
||||
auto ret = Init(file_paths, load_dataset);
|
||||
if (SUCCESS != ret) {
|
||||
|
@ -844,6 +847,7 @@ MSRStatus ShardReader::Open(const std::vector<std::string> &file_paths, bool loa
|
|||
// Initialize argument
|
||||
shard_count_ = static_cast<int>(file_paths_.size());
|
||||
n_consumer_ = n_consumer;
|
||||
num_padded_ = num_padded;
|
||||
|
||||
operators_ = operators;
|
||||
|
||||
|
@ -935,7 +939,7 @@ MSRStatus ShardReader::CreateTasksByBlock(const std::vector<std::tuple<int, int,
|
|||
auto shard_id = std::get<0>(rg);
|
||||
auto group_id = std::get<1>(rg);
|
||||
auto n_Rows = std::get<3>(rg);
|
||||
tasks_.InsertTask(shard_id, group_id, std::vector<uint64_t>{n_Rows}, json{});
|
||||
tasks_.InsertTask(TaskType::kCommonTask, shard_id, group_id, std::vector<uint64_t>{n_Rows}, json{});
|
||||
}
|
||||
return SUCCESS;
|
||||
}
|
||||
|
@ -986,7 +990,7 @@ MSRStatus ShardReader::CreateTasksByCategory(const std::vector<std::tuple<int, i
|
|||
auto number_of_rows = offsets.size();
|
||||
for (uint32_t iStart = 0; iStart < number_of_rows; iStart += 1) {
|
||||
if (category_index < num_elements) {
|
||||
categoryTasks[categoryNo].InsertTask(shard_id, group_id, std::get<4>(details)[iStart],
|
||||
categoryTasks[categoryNo].InsertTask(TaskType::kCommonTask, shard_id, group_id, std::get<4>(details)[iStart],
|
||||
std::get<5>(details)[iStart]);
|
||||
category_index++;
|
||||
}
|
||||
|
@ -1014,7 +1018,7 @@ MSRStatus ShardReader::CreateTasksByRow(const std::vector<std::tuple<int, int, i
|
|||
if (shard_count_ <= kMaxShardCount) {
|
||||
for (int shard_id = 0; shard_id < shard_count_; shard_id++) {
|
||||
for (uint32_t i = 0; i < offsets[shard_id].size(); i += 1) {
|
||||
tasks_.InsertTask(offsets[shard_id][i][0], offsets[shard_id][i][1],
|
||||
tasks_.InsertTask(TaskType::kCommonTask, offsets[shard_id][i][0], offsets[shard_id][i][1],
|
||||
std::vector<uint64_t>{offsets[shard_id][i][2], offsets[shard_id][i][3]},
|
||||
local_columns[shard_id][i]);
|
||||
}
|
||||
|
@ -1044,6 +1048,11 @@ MSRStatus ShardReader::CreateTasks(const std::vector<std::tuple<int, int, int, u
|
|||
if (SUCCESS != CreateTasksByRow(row_group_summary, operators)) {
|
||||
return FAILED;
|
||||
}
|
||||
if (num_padded_ > 0) {
|
||||
for (int i = 0; i < num_padded_; ++i) {
|
||||
tasks_.InsertTask(TaskType::kPaddedTask, 0, 0, {}, json());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (SUCCESS != CreateTasksByCategory(row_group_summary, operators[category_operator])) {
|
||||
return FAILED;
|
||||
|
@ -1070,18 +1079,27 @@ MSRStatus ShardReader::CreateTasks(const std::vector<std::tuple<int, int, int, u
|
|||
TASK_RETURN_CONTENT ShardReader::ConsumerOneTask(int task_id, uint32_t consumer_id) {
|
||||
// All tasks are done
|
||||
if (task_id >= static_cast<int>(tasks_.Size())) {
|
||||
return std::make_pair(FAILED, std::vector<std::tuple<std::vector<uint8_t>, json>>());
|
||||
return std::make_pair(FAILED,
|
||||
std::make_pair(TaskType::kCommonTask, std::vector<std::tuple<std::vector<uint8_t>, json>>()));
|
||||
}
|
||||
|
||||
// Pick up task from task list
|
||||
auto task = tasks_.GetTaskByID(tasks_.permutation_[task_id]);
|
||||
|
||||
auto shard_id = std::get<0>(std::get<0>(task));
|
||||
auto group_id = std::get<1>(std::get<0>(task));
|
||||
auto addr = std::get<1>(task);
|
||||
// check task type
|
||||
auto task_type = std::get<0>(task);
|
||||
if (task_type == TaskType::kPaddedTask) {
|
||||
return std::make_pair(SUCCESS,
|
||||
std::make_pair(TaskType::kPaddedTask, std::vector<std::tuple<std::vector<uint8_t>, json>>()));
|
||||
}
|
||||
|
||||
auto shard_id = std::get<0>(std::get<1>(task));
|
||||
auto group_id = std::get<1>(std::get<1>(task));
|
||||
auto addr = std::get<2>(task);
|
||||
const auto &ret = shard_header_->GetPageByGroupId(group_id, shard_id);
|
||||
if (SUCCESS != ret.first) {
|
||||
return std::make_pair(FAILED, std::vector<std::tuple<std::vector<uint8_t>, json>>());
|
||||
return std::make_pair(FAILED,
|
||||
std::make_pair(TaskType::kCommonTask, std::vector<std::tuple<std::vector<uint8_t>, json>>()));
|
||||
}
|
||||
const std::shared_ptr<Page> &page = ret.second;
|
||||
|
||||
|
@ -1093,7 +1111,8 @@ TASK_RETURN_CONTENT ShardReader::ConsumerOneTask(int task_id, uint32_t consumer_
|
|||
if (!io_seekg.good() || io_seekg.fail() || io_seekg.bad()) {
|
||||
MS_LOG(ERROR) << "File seekg failed";
|
||||
file_streams_random_[consumer_id][shard_id]->close();
|
||||
return std::make_pair(FAILED, std::vector<std::tuple<std::vector<uint8_t>, json>>());
|
||||
return std::make_pair(FAILED,
|
||||
std::make_pair(TaskType::kCommonTask, std::vector<std::tuple<std::vector<uint8_t>, json>>()));
|
||||
}
|
||||
|
||||
auto &io_read =
|
||||
|
@ -1101,14 +1120,15 @@ TASK_RETURN_CONTENT ShardReader::ConsumerOneTask(int task_id, uint32_t consumer_
|
|||
if (!io_read.good() || io_read.fail() || io_read.bad()) {
|
||||
MS_LOG(ERROR) << "File read failed";
|
||||
file_streams_random_[consumer_id][shard_id]->close();
|
||||
return std::make_pair(FAILED, std::vector<std::tuple<std::vector<uint8_t>, json>>());
|
||||
return std::make_pair(FAILED,
|
||||
std::pair(TaskType::kCommonTask, std::vector<std::tuple<std::vector<uint8_t>, json>>()));
|
||||
}
|
||||
|
||||
// Deliver batch data to output map
|
||||
std::vector<std::tuple<std::vector<uint8_t>, json>> batch;
|
||||
batch.emplace_back(std::move(images), std::move(std::get<2>(task)));
|
||||
batch.emplace_back(std::move(images), std::move(std::get<3>(task)));
|
||||
|
||||
return std::make_pair(SUCCESS, std::move(batch));
|
||||
return std::make_pair(SUCCESS, std::make_pair(TaskType::kCommonTask, std::move(batch)));
|
||||
}
|
||||
|
||||
MSRStatus ShardReader::ConsumerByRow(int consumer_id) {
|
||||
|
@ -1133,7 +1153,7 @@ MSRStatus ShardReader::ConsumerByRow(int consumer_id) {
|
|||
if (SUCCESS != ret.first) {
|
||||
return FAILED;
|
||||
}
|
||||
const auto &batch = ret.second;
|
||||
const auto &batch = (ret.second).second;
|
||||
// Hanging if maximum map size exceeded
|
||||
// otherwise, set batch data in map
|
||||
{
|
||||
|
@ -1193,8 +1213,8 @@ MSRStatus ShardReader::ConsumerByBlock(int consumer_id) {
|
|||
// Pick up task from task list
|
||||
auto task = tasks_.GetTaskByID(tasks_.permutation_[task_id]);
|
||||
|
||||
auto shard_id = std::get<0>(std::get<0>(task));
|
||||
auto group_id = std::get<1>(std::get<0>(task));
|
||||
auto shard_id = std::get<0>(std::get<1>(task));
|
||||
auto group_id = std::get<1>(std::get<1>(task));
|
||||
auto row_group_brief = ReadRowGroupBrief(group_id, shard_id, selected_columns_);
|
||||
if (SUCCESS != std::get<0>(row_group_brief)) {
|
||||
return FAILED;
|
||||
|
@ -1302,17 +1322,17 @@ std::vector<std::tuple<std::vector<uint8_t>, json>> ShardReader::GetNext() {
|
|||
return *res;
|
||||
}
|
||||
|
||||
std::vector<std::tuple<std::vector<uint8_t>, json>> ShardReader::GetNextById(const int64_t &task_id,
|
||||
const int32_t &consumer_id) {
|
||||
std::pair<TaskType, std::vector<std::tuple<std::vector<uint8_t>, json>>> ShardReader::GetNextById(
|
||||
const int64_t &task_id, const int32_t &consumer_id) {
|
||||
if (interrupt_) {
|
||||
return std::vector<std::tuple<std::vector<uint8_t>, json>>();
|
||||
return std::make_pair(TaskType::kCommonTask, std::vector<std::tuple<std::vector<uint8_t>, json>>());
|
||||
}
|
||||
if (block_reader_) {
|
||||
return GetBlockNext();
|
||||
return std::make_pair(TaskType::kCommonTask, GetBlockNext());
|
||||
}
|
||||
const auto &ret = ConsumerOneTask(task_id, consumer_id);
|
||||
if (SUCCESS != ret.first) {
|
||||
return std::vector<std::tuple<std::vector<uint8_t>, json>>();
|
||||
return std::make_pair(TaskType::kCommonTask, std::vector<std::tuple<std::vector<uint8_t>, json>>());
|
||||
}
|
||||
return std::move(ret.second);
|
||||
}
|
||||
|
|
|
@ -41,7 +41,7 @@ int64_t ShardCategory::GetNumSamples(int64_t dataset_size, int64_t num_classes)
|
|||
if (dataset_size > 0 && num_classes > 0 && num_categories_ > 0 && num_elements_ > 0) {
|
||||
return std::min(num_categories_, num_classes) * num_elements_;
|
||||
}
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
} // namespace mindrecord
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -66,6 +66,25 @@ ShardColumn::ShardColumn(const std::shared_ptr<ShardHeader> &shard_header, bool
|
|||
num_blob_column_ = blob_column_.size();
|
||||
}
|
||||
|
||||
std::pair<MSRStatus, ColumnCategory> ShardColumn::GetColumnTypeByName(const std::string &column_name,
|
||||
ColumnDataType *column_data_type,
|
||||
uint64_t *column_data_type_size,
|
||||
std::vector<int64_t> *column_shape) {
|
||||
// Skip if column not found
|
||||
auto column_category = CheckColumnName(column_name);
|
||||
if (column_category == ColumnNotFound) {
|
||||
return {FAILED, ColumnNotFound};
|
||||
}
|
||||
|
||||
// Get data type and size
|
||||
auto column_id = column_name_id_[column_name];
|
||||
*column_data_type = column_data_type_[column_id];
|
||||
*column_data_type_size = ColumnDataTypeSize[*column_data_type];
|
||||
*column_shape = column_shape_[column_id];
|
||||
|
||||
return {SUCCESS, column_category};
|
||||
}
|
||||
|
||||
MSRStatus ShardColumn::GetColumnValueByName(const std::string &column_name, const std::vector<uint8_t> &columns_blob,
|
||||
const json &columns_json, const unsigned char **data,
|
||||
std::unique_ptr<unsigned char[]> *data_ptr, uint64_t *n_bytes,
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "mindrecord/include/shard_distributed_sample.h"
|
||||
|
||||
using mindspore::LogStream;
|
||||
using mindspore::ExceptionType::NoExceptionType;
|
||||
using mindspore::MsLogLevel::ERROR;
|
||||
|
||||
namespace mindspore {
|
||||
namespace mindrecord {
|
||||
// Build a sampler for one shard of a distributed read.
//
// num_shards            forwarded to ShardSample as the denominator (numerator is fixed to 1).
// shard_id              forwarded to ShardSample as the partition id.
// no_of_padded_samples  number of padded samples taken into account by GetNumSamples/PreExecute.
// shuffle               whether PreExecute runs the shuffle operator.
// seed                  seed handed to the ShardShuffle operator.
ShardDistributedSample::ShardDistributedSample(int num_shards, int shard_id, int no_of_padded_samples, bool shuffle,
                                               uint32_t seed)
    : ShardSample(1, num_shards, shard_id),
      shuffle_(shuffle),
      no_of_padded_samples_(no_of_padded_samples) {
  // The shuffle operator is always created; PreExecute only invokes it when shuffle_ is set.
  shuffle_op_ = std::make_shared<ShardShuffle>(seed, kShuffleSample);
}
|
||||
|
||||
int64_t ShardDistributedSample::GetNumSamples(int64_t dataset_size, int64_t num_classes) {
|
||||
if (no_of_padded_samples_ <= 0) {
|
||||
if (dataset_size % denominator_ == 0) {
|
||||
return dataset_size / denominator_ * numerator_;
|
||||
} else {
|
||||
return dataset_size / denominator_ * numerator_ + 1;
|
||||
}
|
||||
} else {
|
||||
auto padded_size = dataset_size + no_of_padded_samples_;
|
||||
if (padded_size % denominator_ == 0) {
|
||||
return padded_size / denominator_ * numerator_;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
// Validate the task list against the padding setting and optionally shuffle it.
//
// tasks  task list to check and, when shuffle_ is set, to pass to the shuffle operator.
//
// Returns FAILED when padding is enabled and the task count is not divisible by the shard
// count, or when the shuffle operator fails; SUCCESS otherwise.
MSRStatus ShardDistributedSample::PreExecute(ShardTask &tasks) {
  const auto task_count = tasks.Size();
  const bool padding_enabled = no_of_padded_samples_ > 0;
  if (padding_enabled && task_count % denominator_ != 0) {
    MS_LOG(ERROR) << "Dataset size plus number of padded samples is not divisible by number of shards.";
    return FAILED;
  }

  if (!shuffle_) {
    return SUCCESS;
  }
  return (SUCCESS == (*shuffle_op_)(tasks)) ? SUCCESS : FAILED;
}
|
||||
} // namespace mindrecord
|
||||
} // namespace mindspore
|
|
@ -25,32 +25,32 @@ namespace mindrecord {
|
|||
ShardSample::ShardSample(int n)
|
||||
: numerator_(0),
|
||||
denominator_(0),
|
||||
no_of_samples_(n),
|
||||
partition_id_(0),
|
||||
no_of_samples_(n),
|
||||
indices_({}),
|
||||
sampler_type_(kCustomTopNSampler) {}
|
||||
|
||||
ShardSample::ShardSample(int num, int den)
|
||||
: numerator_(num),
|
||||
denominator_(den),
|
||||
no_of_samples_(0),
|
||||
partition_id_(0),
|
||||
no_of_samples_(0),
|
||||
indices_({}),
|
||||
sampler_type_(kCustomTopPercentSampler) {}
|
||||
|
||||
ShardSample::ShardSample(int num, int den, int par)
|
||||
: numerator_(num),
|
||||
denominator_(den),
|
||||
no_of_samples_(0),
|
||||
partition_id_(par),
|
||||
no_of_samples_(0),
|
||||
indices_({}),
|
||||
sampler_type_(kCustomTopPercentSampler) {}
|
||||
|
||||
ShardSample::ShardSample(const std::vector<int64_t> &indices, uint32_t seed)
|
||||
: numerator_(0),
|
||||
denominator_(0),
|
||||
no_of_samples_(0),
|
||||
partition_id_(0),
|
||||
no_of_samples_(0),
|
||||
indices_(indices),
|
||||
sampler_type_(kSubsetRandomSampler) {
|
||||
shuffle_op_ = std::make_shared<ShardShuffle>(seed);
|
||||
|
@ -71,19 +71,12 @@ int64_t ShardSample::GetNumSamples(int64_t dataset_size, int64_t num_classes) {
|
|||
if (sampler_type_ == kSubsetRandomSampler) {
|
||||
return indices_.size();
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
const std::pair<int, int> ShardSample::GetPartitions() const {
|
||||
if (numerator_ == 1 && denominator_ > 1) {
|
||||
return std::pair<int, int>(denominator_, partition_id_);
|
||||
}
|
||||
return std::pair<int, int>(-1, -1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
MSRStatus ShardSample::Execute(ShardTask &tasks) {
|
||||
int no_of_categories = static_cast<int>(tasks.categories);
|
||||
int total_no = static_cast<int>(tasks.Size());
|
||||
int total_no = static_cast<int>(tasks.Size()); // make sure task_size
|
||||
|
||||
int taking = 0;
|
||||
if (sampler_type_ == kCustomTopNSampler) { // non sharding case constructor #1
|
||||
|
@ -97,7 +90,7 @@ MSRStatus ShardSample::Execute(ShardTask &tasks) {
|
|||
} else { // constructor TopPercent
|
||||
if (numerator_ > 0 && denominator_ > 0 && numerator_ <= denominator_) {
|
||||
if (numerator_ == 1 && denominator_ > 1) { // sharding
|
||||
taking = (total_no / denominator_) + (total_no % denominator_ == 0 ? 0 : 1);
|
||||
taking = (total_no + denominator_ - 1) / denominator_;
|
||||
} else { // non sharding
|
||||
taking = total_no * numerator_ / denominator_;
|
||||
taking -= (taking % no_of_categories);
|
||||
|
|
|
@ -31,16 +31,18 @@ void ShardTask::MakePerm() {
|
|||
}
|
||||
}
|
||||
|
||||
void ShardTask::InsertTask(int shard_id, int group_id, const std::vector<uint64_t> &offset, const json &label) {
|
||||
void ShardTask::InsertTask(TaskType task_type, int shard_id, int group_id, const std::vector<uint64_t> &offset,
|
||||
const json &label) {
|
||||
MS_LOG(DEBUG) << "Into insert task, shard_id: " << shard_id << ", group_id: " << group_id
|
||||
<< ", label: " << label.dump() << ", size of task_list_: " << task_list_.size() << ".";
|
||||
task_list_.emplace_back(std::make_tuple(shard_id, group_id), offset, label);
|
||||
task_list_.emplace_back(task_type, std::make_tuple(shard_id, group_id), offset, label);
|
||||
}
|
||||
|
||||
void ShardTask::InsertTask(std::tuple<std::tuple<int, int>, std::vector<uint64_t>, json> task) {
|
||||
MS_LOG(DEBUG) << "Into insert task, shard_id: " << std::get<0>(std::get<0>(task))
|
||||
<< ", group_id: " << std::get<1>(std::get<0>(task)) << ", label: " << std::get<2>(task).dump()
|
||||
void ShardTask::InsertTask(std::tuple<TaskType, std::tuple<int, int>, std::vector<uint64_t>, json> task) {
|
||||
MS_LOG(DEBUG) << "Into insert task, shard_id: " << std::get<0>(std::get<1>(task))
|
||||
<< ", group_id: " << std::get<1>(std::get<1>(task)) << ", label: " << std::get<3>(task).dump()
|
||||
<< ", size of task_list_: " << task_list_.size() << ".";
|
||||
|
||||
task_list_.push_back(std::move(task));
|
||||
}
|
||||
|
||||
|
@ -52,19 +54,19 @@ uint32_t ShardTask::SizeOfRows() const {
|
|||
if (task_list_.size() == 0) return static_cast<uint32_t>(0);
|
||||
|
||||
// 1 task is 1 page
|
||||
auto sum_num_rows = [](int x, std::tuple<std::tuple<int, int>, std::vector<uint64_t>, json> y) {
|
||||
return x + std::get<1>(y)[0];
|
||||
auto sum_num_rows = [](int x, std::tuple<TaskType, std::tuple<int, int>, std::vector<uint64_t>, json> y) {
|
||||
return x + std::get<2>(y)[0];
|
||||
};
|
||||
uint32_t nRows = std::accumulate(task_list_.begin(), task_list_.end(), 0, sum_num_rows);
|
||||
return nRows;
|
||||
}
|
||||
|
||||
std::tuple<std::tuple<int, int>, std::vector<uint64_t>, json> &ShardTask::GetTaskByID(size_t id) {
|
||||
std::tuple<TaskType, std::tuple<int, int>, std::vector<uint64_t>, json> &ShardTask::GetTaskByID(size_t id) {
|
||||
MS_ASSERT(id < task_list_.size());
|
||||
return task_list_[id];
|
||||
}
|
||||
|
||||
std::tuple<std::tuple<int, int>, std::vector<uint64_t>, json> &ShardTask::GetRandomTask() {
|
||||
std::tuple<TaskType, std::tuple<int, int>, std::vector<uint64_t>, json> &ShardTask::GetRandomTask() {
|
||||
std::random_device rd;
|
||||
std::mt19937 gen(rd());
|
||||
std::uniform_int_distribution<> dis(0, task_list_.size() - 1);
|
||||
|
|
|
@ -2548,7 +2548,11 @@ class MindDataset(SourceDataset):
|
|||
sampler (Sampler, optional): Object used to choose samples from the
|
||||
dataset (default=None, sampler is exclusive
|
||||
with shuffle and block_reader). Support list: SubsetRandomSampler,
|
||||
PkSampler
|
||||
PkSampler.
|
||||
padded_sample (dict, optional): Samples will be appended to dataset, whose
|
||||
keys are the same as columns_list.
|
||||
num_padded (int, optional): Number of padding samples. Dataset size
|
||||
plus num_padded should be divisible by num_shards.
|
||||
|
||||
Raises:
|
||||
ValueError: If num_shards is specified but shard_id is None.
|
||||
|
@ -2559,7 +2563,8 @@ class MindDataset(SourceDataset):
|
|||
@check_minddataset
|
||||
def __init__(self, dataset_file, columns_list=None, num_parallel_workers=None,
|
||||
shuffle=None, num_shards=None, shard_id=None,
|
||||
block_reader=False, sampler=None):
|
||||
block_reader=False, sampler=None, padded_sample=None,
|
||||
num_padded=None):
|
||||
super().__init__(num_parallel_workers)
|
||||
if isinstance(dataset_file, list):
|
||||
self.load_dataset = False
|
||||
|
@ -2567,7 +2572,7 @@ class MindDataset(SourceDataset):
|
|||
self.load_dataset = True
|
||||
self.dataset_file = dataset_file
|
||||
self.columns_list = columns_list
|
||||
self.global_shuffle = shuffle
|
||||
self.shuffle_option = shuffle
|
||||
self.distribution = ""
|
||||
self.sampler = sampler
|
||||
|
||||
|
@ -2598,22 +2603,36 @@ class MindDataset(SourceDataset):
|
|||
raise ValueError("shuffle not allowed when use sampler")
|
||||
|
||||
if block_reader is False and sampler is None:
|
||||
self.global_shuffle = not bool(shuffle is False)
|
||||
self.shuffle_option = not bool(shuffle is False)
|
||||
|
||||
if num_padded is None:
|
||||
num_padded = 0
|
||||
|
||||
self.num_shards = num_shards
|
||||
self.shard_id = shard_id
|
||||
self.block_reader = block_reader
|
||||
self.padded_sample = padded_sample
|
||||
self.num_padded = num_padded
|
||||
|
||||
def get_args(self):
|
||||
args = super().get_args()
|
||||
padded_sample = {}
|
||||
if self.padded_sample:
|
||||
for k, v in self.padded_sample.items():
|
||||
if isinstance(v, np.ndarray):
|
||||
padded_sample[k] = v.tobytes()
|
||||
else:
|
||||
padded_sample[k] = v
|
||||
args["dataset_file"] = self.dataset_file
|
||||
args["load_dataset"] = self.load_dataset
|
||||
args["columns_list"] = self.columns_list
|
||||
args["global_shuffle"] = self.global_shuffle
|
||||
args["shuffle_option"] = self.shuffle_option
|
||||
args["partitions"] = self.partitions
|
||||
args["block_reader"] = self.block_reader
|
||||
args["num_shards"] = self.num_shards
|
||||
args["shard_id"] = self.shard_id
|
||||
args["num_padded"] = self.num_padded
|
||||
args["padded_sample"] = padded_sample
|
||||
args["sampler"] = self.sampler
|
||||
return args
|
||||
|
||||
|
@ -2628,19 +2647,22 @@ class MindDataset(SourceDataset):
|
|||
dataset_file = [self.dataset_file]
|
||||
else:
|
||||
dataset_file = self.dataset_file
|
||||
num_rows = MindRecordOp.get_num_rows(dataset_file, self.load_dataset, self.sampler)
|
||||
num_rows = MindRecordOp.get_num_rows(dataset_file, self.load_dataset, self.sampler, self.num_padded)
|
||||
if self.partitions is not None and self.partitions[0] > 0:
|
||||
if num_rows % self.partitions[0] == 0:
|
||||
num_rows = num_rows // self.partitions[0]
|
||||
else:
|
||||
if self.num_padded > 0:
|
||||
raise RuntimeError(
|
||||
"Dataset size plus number of padded samples is not divisible by number of shards.")
|
||||
num_rows = num_rows // self.partitions[0] + 1
|
||||
return num_rows
|
||||
|
||||
def is_shuffled(self):
|
||||
if self.global_shuffle is None:
|
||||
if self.shuffle_option is None:
|
||||
return True
|
||||
|
||||
return self.global_shuffle or self.sampler.is_shuffled()
|
||||
return self.shuffle_option or self.sampler.is_shuffled()
|
||||
|
||||
def is_sharded(self):
|
||||
if self.num_shards is not None:
|
||||
|
|
|
@ -323,6 +323,27 @@ def check_sampler_shuffle_shard_options(param_dict):
|
|||
raise RuntimeError("shard_id is specified but num_shards is not.")
|
||||
|
||||
|
||||
def check_padding_options(param_dict):
    """Validate the padded_sample / num_padded options of a dataset constructor.

    Args:
        param_dict (dict): Constructor parameters; the keys 'columns_list',
            'block_reader', 'padded_sample' and 'num_padded' are read.

    Raises:
        RuntimeError: If only one of padded_sample / num_padded is given, if
            padded_sample is given without columns_list, or if padded_sample
            is combined with block_reader.
        ValueError: If num_padded is negative, or if a name in columns_list is
            missing from padded_sample.
    """
    columns_list = param_dict.get('columns_list')
    block_reader = param_dict.get('block_reader')
    padded_sample = param_dict.get('padded_sample')
    num_padded = param_dict.get('num_padded')

    # Without a padded sample the only possible mistake is a stray num_padded.
    if padded_sample is None:
        if num_padded is not None:
            raise RuntimeError("num_padded is specified but padded_sample is not.")
        return

    if num_padded is None:
        raise RuntimeError("padded_sample is specified and requires num_padded as well.")
    if num_padded < 0:
        raise ValueError("num_padded is invalid, num_padded={}.".format(num_padded))
    if columns_list is None:
        raise RuntimeError("padded_sample is specified and requires columns_list as well.")
    # Every requested column must have a value in the padded sample.
    if any(column not in padded_sample for column in columns_list):
        raise ValueError("padded_sample cannot match columns_list.")
    if block_reader:
        raise RuntimeError("block_reader and padded_sample cannot be specified at the same time.")
|
||||
|
||||
def check_imagefolderdatasetv2(method):
|
||||
"""A wrapper that wrap a parameter checker to the original Dataset(ImageFolderDatasetV2)."""
|
||||
|
||||
|
@ -549,9 +570,10 @@ def check_minddataset(method):
|
|||
def new_method(*args, **kwargs):
|
||||
param_dict = make_param_dict(method, args, kwargs)
|
||||
|
||||
nreq_param_int = ['num_samples', 'num_parallel_workers', 'seed', 'num_shards', 'shard_id']
|
||||
nreq_param_int = ['num_samples', 'num_parallel_workers', 'seed', 'num_shards', 'shard_id', 'num_padded']
|
||||
nreq_param_list = ['columns_list']
|
||||
nreq_param_bool = ['block_reader']
|
||||
nreq_param_dict = ['padded_sample']
|
||||
|
||||
# check dataset_file; required argument
|
||||
dataset_file = param_dict.get('dataset_file')
|
||||
|
@ -569,12 +591,11 @@ def check_minddataset(method):
|
|||
|
||||
check_param_type(nreq_param_bool, param_dict, bool)
|
||||
|
||||
num_shards, shard_id = param_dict.get('num_shards'), param_dict.get('shard_id')
|
||||
if (num_shards is not None and shard_id is None) or (num_shards is None and shard_id is not None):
|
||||
raise ValueError("num_shards and shard_id need to be set or not set at the same time")
|
||||
check_param_type(nreq_param_dict, param_dict, dict)
|
||||
|
||||
check_sampler_shuffle_shard_options(param_dict)
|
||||
|
||||
check_padding_options(param_dict)
|
||||
return method(*args, **kwargs)
|
||||
|
||||
return new_method
|
||||
|
|
|
@ -139,9 +139,6 @@ TEST_F(TestShardOperator, TestShardSamplePartition) {
|
|||
const int kPar = 2;
|
||||
std::vector<std::shared_ptr<ShardOperator>> ops;
|
||||
ops.push_back(std::make_shared<ShardSample>(kNum, kDen, kPar));
|
||||
auto partitions = std::dynamic_pointer_cast<ShardSample>(ops[0])->GetPartitions();
|
||||
ASSERT_TRUE(partitions.first == 4);
|
||||
ASSERT_TRUE(partitions.second == 2);
|
||||
|
||||
ShardReader dataset;
|
||||
dataset.Open({file_name}, true, 4, column_list, ops);
|
||||
|
|
|
@ -227,10 +227,9 @@ def test_cv_minddataset_partition_tutorial(add_and_remove_cv_file):
|
|||
num_shards=num_shards, shard_id=partition_id)
|
||||
num_iter = 0
|
||||
for item in data_set.create_dict_iterator():
|
||||
logger.info(
|
||||
"-------------- partition : {} ------------------------".format(partition_id))
|
||||
logger.info(
|
||||
"-------------- item[label]: {} -----------------------".format(item["label"]))
|
||||
logger.info("-------------- partition : {} ------------------------".format(partition_id))
|
||||
logger.info("-------------- item[file_name]: {}-----------------------".format(item["file_name"]))
|
||||
logger.info("-------------- item[label]: {} -----------------------".format(item["label"]))
|
||||
num_iter += 1
|
||||
return num_iter
|
||||
|
||||
|
@ -321,12 +320,11 @@ def test_cv_minddataset_issue_888(add_and_remove_cv_file):
|
|||
"""issue 888 test."""
|
||||
columns_list = ["data", "label"]
|
||||
num_readers = 2
|
||||
data = ds.MindDataset(CV_FILE_NAME + "0", columns_list,
|
||||
num_readers, shuffle=False, num_shards=5, shard_id=1)
|
||||
data = data.shuffle(2)
|
||||
data = data.repeat(9)
|
||||
data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, shuffle=False, num_shards=5, shard_id=1)
|
||||
data_set = data_set.shuffle(2)
|
||||
data_set = data_set.repeat(9)
|
||||
num_iter = 0
|
||||
for _ in data.create_dict_iterator():
|
||||
for _ in data_set.create_dict_iterator():
|
||||
num_iter += 1
|
||||
assert num_iter == 18
|
||||
|
||||
|
@ -335,8 +333,7 @@ def test_cv_minddataset_blockreader_tutorial(add_and_remove_cv_file):
|
|||
"""tutorial for cv minddataset."""
|
||||
columns_list = ["data", "label"]
|
||||
num_readers = 4
|
||||
data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers,
|
||||
block_reader=True)
|
||||
data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, block_reader=True)
|
||||
assert data_set.get_dataset_size() == 10
|
||||
repeat_num = 2
|
||||
data_set = data_set.repeat(repeat_num)
|
||||
|
@ -544,7 +541,6 @@ def test_cv_minddataset_reader_basic_tutorial(add_and_remove_cv_file):
|
|||
num_iter += 1
|
||||
assert num_iter == 10
|
||||
|
||||
|
||||
def test_nlp_minddataset_reader_basic_tutorial(add_and_remove_nlp_file):
|
||||
"""tutorial for nlp minderdataset."""
|
||||
num_readers = 4
|
||||
|
|
|
@ -0,0 +1,444 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
This is the test module for mindrecord
|
||||
"""
|
||||
import collections
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import pytest
|
||||
import re
|
||||
import string
|
||||
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.vision.c_transforms as vision
|
||||
from mindspore import log as logger
|
||||
from mindspore.dataset.transforms.vision import Inter
|
||||
from mindspore.mindrecord import FileWriter
|
||||
|
||||
FILES_NUM = 4
|
||||
CV_FILE_NAME = "../data/mindrecord/imagenet.mindrecord"
|
||||
CV1_FILE_NAME = "../data/mindrecord/imagenet1.mindrecord"
|
||||
CV2_FILE_NAME = "../data/mindrecord/imagenet2.mindrecord"
|
||||
CV_DIR_NAME = "../data/mindrecord/testImageNetData"
|
||||
NLP_FILE_NAME = "../data/mindrecord/aclImdb.mindrecord"
|
||||
NLP_FILE_POS = "../data/mindrecord/testAclImdbData/pos"
|
||||
NLP_FILE_VOCAB = "../data/mindrecord/testAclImdbData/vocab.txt"
|
||||
|
||||
|
||||
@pytest.fixture
def add_and_remove_cv_file():
    """Write the CV MindRecord test files before a test and delete them afterwards.

    Any file (or its ".db" companion) left over from a previous run is removed
    first; the data comes from get_data(CV_DIR_NAME).
    """
    paths = ["{}{}".format(CV_FILE_NAME, index) for index in range(FILES_NUM)]

    # Clear stale output so the writer starts from a clean state.
    for path in paths:
        if os.path.exists(path):
            os.remove(path)
        db_path = "{}.db".format(path)
        if os.path.exists(db_path):
            os.remove(db_path)

    cv_schema_json = {
        "id": {"type": "int32"},
        "file_name": {"type": "string"},
        "label": {"type": "int32"},
        "data": {"type": "bytes"},
    }
    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
    data = get_data(CV_DIR_NAME)
    writer.add_schema(cv_schema_json, "img_schema")
    writer.add_index(["file_name", "label"])
    writer.write_raw_data(data)
    writer.commit()

    yield "yield_cv_data"

    # Teardown: the files are expected to exist here, so remove unconditionally.
    for path in paths:
        os.remove(path)
        os.remove("{}.db".format(path))
|
||||
|
||||
|
||||
@pytest.fixture
def add_and_remove_nlp_file():
    """Write the NLP MindRecord test files before a test and delete them afterwards.

    Any file (or its ".db" companion) left over from a previous run is removed
    first; the data comes from get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10).
    """
    paths = ["{}{}".format(NLP_FILE_NAME, index) for index in range(FILES_NUM)]

    # Clear stale output so the writer starts from a clean state.
    for path in paths:
        db_path = "{}.db".format(path)
        if os.path.exists(path):
            os.remove(path)
        if os.path.exists(db_path):
            os.remove(db_path)

    writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
    data = list(get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10))
    nlp_schema_json = {
        "id": {"type": "string"},
        "label": {"type": "int32"},
        "rating": {"type": "float32"},
        "input_ids": {"type": "int64", "shape": [-1]},
        "input_mask": {"type": "int64", "shape": [1, -1]},
        "segment_ids": {"type": "int64", "shape": [2, -1]},
    }
    writer.set_header_size(1 << 14)
    writer.set_page_size(1 << 15)
    writer.add_schema(nlp_schema_json, "nlp_schema")
    writer.add_index(["id", "rating"])
    writer.write_raw_data(data)
    writer.commit()

    yield "yield_nlp_data"

    # Teardown: the files are expected to exist here, so remove unconditionally.
    for path in paths:
        os.remove(path)
        os.remove("{}.db".format(path))
|
||||
|
||||
def test_cv_minddataset_reader_basic_padded_samples(add_and_remove_cv_file):
    """Read the CV dataset with 5 padded samples and check that they are returned.

    The padded sample is the first record from get_data() with its label set to
    -1 and its file name set to 'dummy.jpg'; rows with label -1 are counted as
    padded rows and compared field by field against that sample.
    """
    columns_list = ["label", "file_name", "data"]

    padded_sample = get_data(CV_DIR_NAME)[0]
    padded_sample['label'] = -1
    padded_sample['file_name'] = 'dummy.jpg'

    num_readers = 4
    data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers,
                              padded_sample=padded_sample, num_padded=5)
    assert data_set.get_dataset_size() == 15

    rows_seen = 0
    padded_rows_seen = 0
    for item in data_set.create_dict_iterator():
        logger.info("-------------- cv reader basic: {} ------------------------".format(rows_seen))
        logger.info("-------------- item[file_name]: {} ------------------------".format(item["file_name"]))
        logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
        rows_seen += 1
        if item['label'] != -1:
            continue
        # A padded row must reproduce the padded sample exactly.
        padded_rows_seen += 1
        assert item['file_name'] == bytes(padded_sample['file_name'], encoding='utf8')
        assert item['label'] == padded_sample['label']
        assert (item['data'] == np.array(list(padded_sample['data']))).all()

    assert padded_rows_seen == 5
    assert rows_seen == 15
|
||||
|
||||
|
||||
def test_cv_minddataset_partition_padded_samples(add_and_remove_cv_file):
    """Read the CV dataset shard by shard with padded samples appended.

    The padded sample is the first record from get_data() with its label set to
    -2 and its file name set to 'dummy.jpg'; rows with label -2 are compared
    field by field against that sample.
    """
    columns_list = ["data", "file_name", "label"]

    padded_sample = get_data(CV_DIR_NAME)[0]
    padded_sample['label'] = -2
    padded_sample['file_name'] = 'dummy.jpg'
    num_readers = 4

    def partitions(num_shards, num_padded, dataset_size):
        """Iterate every shard; return the row count of the last shard read."""
        for partition_id in range(num_shards):
            data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers,
                                      num_shards=num_shards,
                                      shard_id=partition_id,
                                      padded_sample=padded_sample,
                                      num_padded=num_padded)
            assert data_set.get_dataset_size() == dataset_size
            num_iter = 0
            num_padded_iter = 0
            for item in data_set.create_dict_iterator():
                logger.info("-------------- partition : {} ------------------------".format(partition_id))
                logger.info("-------------- len(item[data]): {} ------------------------".format(len(item["data"])))
                logger.info("-------------- item[data]: {} -----------------------------".format(item["data"]))
                logger.info("-------------- item[file_name]: {} ------------------------".format(item["file_name"]))
                logger.info("-------------- item[label]: {} -----------------------".format(item["label"]))
                num_iter += 1
                if item['label'] != -2:
                    continue
                # A padded row must reproduce the padded sample exactly.
                num_padded_iter += 1
                assert item['file_name'] == bytes(padded_sample['file_name'], encoding='utf8')
                assert item['label'] == padded_sample['label']
                assert (item['data'] == np.array(list(padded_sample['data']))).all()
        return num_iter

    assert partitions(4, 2, 3) == 3
    assert partitions(5, 5, 3) == 3
    assert partitions(9, 8, 2) == 2
|
||||
|
||||
def test_cv_minddataset_partition_padded_samples_no_dividsible(add_and_remove_cv_file):
    """Iterating 4 shards with a single padded sample is expected to raise RuntimeError.

    Presumably the failure is because the padded total cannot be split evenly
    across the shards (going by the test name) -- confirm against the reader.
    """
    columns_list = ["data", "file_name", "label"]

    pad_row = get_data(CV_DIR_NAME)[0]
    pad_row['label'] = -2
    pad_row['file_name'] = 'dummy.jpg'
    num_readers = 4

    def partitions(num_shards, num_padded):
        """Drain every shard and return the row count of the last one."""
        for shard in range(num_shards):
            reader = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers,
                                    num_shards=num_shards,
                                    shard_id=shard,
                                    padded_sample=pad_row,
                                    num_padded=num_padded)
            num_iter = sum(1 for _ in reader.create_dict_iterator())
        return num_iter

    with pytest.raises(RuntimeError):
        partitions(4, 1)
|
||||
|
||||
def test_cv_minddataset_partition_padded_samples_dataset_size_no_divisible(add_and_remove_cv_file):
    """get_dataset_size() is expected to raise RuntimeError on each of 4 shards with one padded sample."""
    columns_list = ["data", "file_name", "label"]

    pad_row = get_data(CV_DIR_NAME)[0]
    pad_row['label'] = -2
    pad_row['file_name'] = 'dummy.jpg'
    num_readers = 4

    def partitions(num_shards, num_padded):
        """Build every shard and require that querying its size fails."""
        for shard in range(num_shards):
            reader = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers,
                                    num_shards=num_shards,
                                    shard_id=shard,
                                    padded_sample=pad_row,
                                    num_padded=num_padded)
            with pytest.raises(RuntimeError):
                # Only the call matters; its result is never inspected.
                reader.get_dataset_size()

    partitions(4, 1)
|
||||
|
||||
def test_cv_minddataset_partition_padded_samples_no_equal_column_list(add_and_remove_cv_file):
    """A padded sample lacking the requested 'label' column must be rejected."""
    columns_list = ["data", "file_name", "label"]

    pad_row = get_data(CV_DIR_NAME)[0]
    # Remove a requested column so the template no longer matches columns_list.
    pad_row.pop('label', None)
    pad_row['file_name'] = 'dummy.jpg'
    num_readers = 4

    def partitions(num_shards, num_padded):
        """Build and drain every shard, logging each row."""
        for shard in range(num_shards):
            reader = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers,
                                    num_shards=num_shards,
                                    shard_id=shard,
                                    padded_sample=pad_row,
                                    num_padded=num_padded)
            for row in reader.create_dict_iterator():
                logger.info("-------------- partition : {} ------------------------".format(shard))
                logger.info("-------------- len(item[data]): {} ------------------------".format(len(row["data"])))
                logger.info("-------------- item[data]: {} -----------------------------".format(row["data"]))
                logger.info("-------------- item[file_name]: {} ------------------------".format(row["file_name"]))

    with pytest.raises(Exception, match="padded_sample cannot match columns_list."):
        partitions(4, 2)
|
||||
|
||||
def test_cv_minddataset_partition_padded_samples_no_column_list(add_and_remove_cv_file):
    """Passing padded_sample with columns_list=None must be rejected."""
    pad_row = get_data(CV_DIR_NAME)[0]
    pad_row['label'] = -2
    pad_row['file_name'] = 'dummy.jpg'
    num_readers = 4

    def partitions(num_shards, num_padded):
        """Build and drain every shard, logging each row."""
        for shard in range(num_shards):
            reader = ds.MindDataset(CV_FILE_NAME + "0", None, num_readers,
                                    num_shards=num_shards,
                                    shard_id=shard,
                                    padded_sample=pad_row,
                                    num_padded=num_padded)
            for row in reader.create_dict_iterator():
                logger.info("-------------- partition : {} ------------------------".format(shard))
                logger.info("-------------- len(item[data]): {} ------------------------".format(len(row["data"])))
                logger.info("-------------- item[data]: {} -----------------------------".format(row["data"]))
                logger.info("-------------- item[file_name]: {} ------------------------".format(row["file_name"]))

    with pytest.raises(Exception, match="padded_sample is specified and requires columns_list as well."):
        partitions(4, 2)
|
||||
|
||||
def test_cv_minddataset_partition_padded_samples_no_num_padded(add_and_remove_cv_file):
    """Passing padded_sample without num_padded must be rejected."""
    columns_list = ["data", "file_name", "label"]
    pad_row = get_data(CV_DIR_NAME)[0]
    pad_row['file_name'] = 'dummy.jpg'
    num_readers = 4

    def partitions(num_shards, num_padded):
        """Build and drain every shard; num_padded is deliberately not forwarded."""
        for shard in range(num_shards):
            reader = ds.MindDataset(CV_FILE_NAME + "0", None, num_readers,
                                    num_shards=num_shards,
                                    shard_id=shard,
                                    padded_sample=pad_row)
            for row in reader.create_dict_iterator():
                logger.info("-------------- partition : {} ------------------------".format(shard))
                logger.info("-------------- len(item[data]): {} ------------------------".format(len(row["data"])))
                logger.info("-------------- item[data]: {} -----------------------------".format(row["data"]))
                logger.info("-------------- item[file_name]: {} ------------------------".format(row["file_name"]))

    with pytest.raises(Exception, match="padded_sample is specified and requires num_padded as well."):
        partitions(4, 2)
|
||||
|
||||
def test_cv_minddataset_partition_padded_samples_no_padded_samples(add_and_remove_cv_file):
    """Passing num_padded without padded_sample must be rejected."""
    columns_list = ["data", "file_name", "label"]
    pad_row = get_data(CV_DIR_NAME)[0]
    pad_row['file_name'] = 'dummy.jpg'
    num_readers = 4

    def partitions(num_shards, num_padded):
        """Build and drain every shard; no padded_sample is supplied on purpose."""
        for shard in range(num_shards):
            reader = ds.MindDataset(CV_FILE_NAME + "0", None, num_readers,
                                    num_shards=num_shards,
                                    shard_id=shard,
                                    num_padded=num_padded)
            for row in reader.create_dict_iterator():
                logger.info("-------------- partition : {} ------------------------".format(shard))
                logger.info("-------------- len(item[data]): {} ------------------------".format(len(row["data"])))
                logger.info("-------------- item[data]: {} -----------------------------".format(row["data"]))
                logger.info("-------------- item[file_name]: {} ------------------------".format(row["file_name"]))

    with pytest.raises(Exception, match="num_padded is specified but padded_sample is not."):
        partitions(4, 2)
|
||||
|
||||
|
||||
|
||||
def test_nlp_minddataset_reader_basic_padded_samples(add_and_remove_nlp_file):
    """Read a sharded NLP MindDataset with padded samples and check every shard."""
    columns_list = ["input_ids", "id", "rating"]

    # The first real record doubles as the padding template; id "-1" marks it.
    data = [x for x in get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10)]
    padded_sample = data[0]
    padded_sample['id'] = "-1"
    padded_sample['input_ids'] = np.array([-1, -1, -1, -1], dtype=np.int64)
    padded_sample['rating'] = 1.0
    num_readers = 4
    # The CV tests in this file compare string columns against bytes, so the
    # sentinel id is encoded the same way before comparing.
    padded_id = bytes(padded_sample['id'], encoding='utf8')

    def partitions(num_shards, num_padded, dataset_size):
        """Iterate every shard, validate padded rows, return the last shard's row count."""
        for partition_id in range(num_shards):
            data_set = ds.MindDataset(NLP_FILE_NAME + "0", columns_list, num_readers,
                                      num_shards=num_shards,
                                      shard_id=partition_id,
                                      padded_sample=padded_sample,
                                      num_padded=num_padded)
            assert data_set.get_dataset_size() == dataset_size
            num_iter = 0
            for item in data_set.create_dict_iterator():
                logger.info("-------------- item[id]: {} ------------------------".format(item["id"]))
                logger.info("-------------- item[rating]: {} --------------------".format(item["rating"]))
                logger.info("-------------- item[input_ids]: {}, shape: {} -----------------".format(item["input_ids"], item["input_ids"].shape))
                if item['id'] == padded_id:
                    # Whole-array comparison: a bare `==` on arrays has no single truth value.
                    assert np.array_equal(item['input_ids'], padded_sample['input_ids'])
                    assert item['rating'] == padded_sample['rating']
                num_iter += 1
            # Check each shard here; the return value only carries the last shard's count.
            assert num_iter == dataset_size
        return num_iter

    assert partitions(4, 6, 4) == 4
    assert partitions(5, 5, 3) == 3
    assert partitions(9, 8, 2) == 2
|
||||
|
||||
def get_data(dir_name):
    """
    Load image records listed in ``<dir_name>/annotation.txt``.

    Each annotation line has the form ``<file name>,<integer label>``; the image
    bytes are read from ``<dir_name>/images/<file name>``. Lines whose image
    file does not exist are skipped.

    Args:
        dir_name (str): directory holding the ``images`` folder and ``annotation.txt``.

    Returns:
        list of dict with keys ``id`` (annotation line index), ``file_name``,
        ``data`` (raw bytes) and ``label`` (int).

    Raises:
        IOError: if ``dir_name`` is not a directory.
    """
    if not os.path.isdir(dir_name):
        raise IOError("Directory {} not exists".format(dir_name))

    image_root = os.path.join(dir_name, "images")
    with open(os.path.join(dir_name, "annotation.txt"), "r") as annotation_file:
        annotations = annotation_file.readlines()

    records = []
    for idx, entry in enumerate(annotations):
        name, raw_label = entry.split(",")
        raw_label = raw_label.strip("\n")
        try:
            with open(os.path.join(image_root, name), "rb") as image_file:
                payload = image_file.read()
        except FileNotFoundError:
            # Annotated image is missing on disk: drop the record.
            continue
        records.append({"id": idx,
                        "file_name": name,
                        "data": payload,
                        "label": int(raw_label)})
    return records
|
||||
|
||||
|
||||
def get_nlp_data(dir_name, vocab_file, num):
    """
    Generate raw samples of the aclImdb dataset, one per review file.

    File names are expected to look like ``<id>_<rating>.<ext>``. At most
    ``num`` files are read from each directory visited by ``os.walk``.

    Args:
        dir_name (str): String of aclImdb dataset's path.
        vocab_file (str): String of dictionary's path.
        num (int): Number of sample files to read per directory.

    Yields:
        dict with keys ``label``, ``id``, ``rating``, ``input_ids``,
        ``input_mask`` and ``segment_ids``.

    Raises:
        IOError: if ``dir_name`` is not a directory (raised on first iteration,
            since this is a generator).
    """
    if not os.path.isdir(dir_name):
        raise IOError("Directory {} not exists".format(dir_name))

    token_pattern = re.compile(r"[\w']+|[{}]".format(string.punctuation))
    # The vocabulary is identical for every sample, so it is parsed once, on
    # the first sample, rather than once per file.
    dictionary = None

    for root, dirs, files in os.walk(dir_name):
        for index, file_name_extension in enumerate(files):
            if index >= num:
                break
            file_path = os.path.join(root, file_name_extension)
            file_name, _ = file_name_extension.split('.', 1)
            id_, rating = file_name.split('_', 1)
            with open(file_path, 'r') as f:
                raw_content = f.read()

            if dictionary is None:
                dictionary = load_vocab(vocab_file)
            unknown = dictionary.get('[UNK]')

            # [CLS] + one id per word/punctuation token + [SEP]
            vectors = [dictionary.get('[CLS]')]
            vectors += [dictionary.get(token) if token in dictionary else unknown
                        for token in token_pattern.findall(raw_content)]
            vectors += [dictionary.get('[SEP]')]

            input_, mask, segment = inputs(vectors)
            input_ids = np.reshape(np.array(input_), [-1])
            input_mask = np.reshape(np.array(mask), [1, -1])
            segment_ids = np.reshape(np.array(segment), [2, -1])
            data = {
                "label": 1,
                "id": id_,
                "rating": float(rating),
                "input_ids": input_ids,
                "input_mask": input_mask,
                "segment_ids": segment_ids
            }
            yield data
|
||||
|
||||
|
||||
def convert_to_uni(text):
    """Return ``text`` as ``str``; bytes are decoded as UTF-8 with invalid bytes dropped.

    Raises:
        Exception: if ``text`` is neither ``str`` nor ``bytes``.
    """
    if not isinstance(text, str):
        if not isinstance(text, bytes):
            raise Exception("The type %s does not convert!" % type(text))
        return text.decode('utf-8', 'ignore')
    return text
|
||||
|
||||
|
||||
def load_vocab(vocab_file):
    """Build a token -> index mapping from a vocabulary file with one token per line.

    The mapping starts with ``'blank' -> 2``. Each line, stripped of surrounding
    whitespace, is then mapped to its zero-based line number; a token that
    appears again overwrites the earlier index.
    """
    vocab = collections.OrderedDict()
    vocab.setdefault('blank', 2)
    with open(vocab_file) as reader:
        for index, line in enumerate(reader):
            vocab[convert_to_uni(line).strip()] = index
    return vocab
|
||||
|
||||
|
||||
def inputs(vectors, maxlen=50):
    """Truncate or zero-pad ``vectors`` to ``maxlen`` and build its mask and segment lists.

    Returns:
        tuple ``(input_, mask, segment)``: the ids cut or padded to ``maxlen``,
        a mask with 1 for real positions and 0 for padding, and an all-zero
        segment list of length ``maxlen``.
    """
    used = len(vectors)
    segment = [0] * maxlen
    if used <= maxlen:
        padding = [0] * (maxlen - used)
        return vectors + padding, [1] * used + padding, segment
    # Too long: keep the first maxlen ids, all of them real.
    return vectors[0:maxlen], [1] * maxlen, segment
|
Loading…
Reference in New Issue