Initial commit for album
Squashed commit messages:
- Added linter fix for album dataset
- Added testDataset
- Adding signature
- Added JsonDataset example API
- Example dataset
- Resolving format
- More fixing
- Refactor
- Small fix
- Added compiling album dataset
- Running tests
- Added linter fix #1
- Passing UT
- Added dataset API
- Addressing clang
- Clang part 2
- Fixing pass
- Fixed tree check
- lint fix
- Added lint fix part 2
build.sh
@@ -393,7 +393,7 @@ build_mindspore()
        CMAKE_VERBOSE="--verbose"
    fi
    cmake --build . --target package ${CMAKE_VERBOSE} -j$THREAD_NUM
-   echo "success to build mindspore project!"
+   echo "success building mindspore project!"
}

checkndk() {
@@ -21,6 +21,7 @@
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/engine/dataset_iterator.h"
// Source dataset headers (in alphabetical order)
+#include "minddata/dataset/engine/datasetops/source/album_op.h"
#include "minddata/dataset/engine/datasetops/source/celeba_op.h"
#include "minddata/dataset/engine/datasetops/source/cifar_op.h"
#include "minddata/dataset/engine/datasetops/source/clue_op.h"
@@ -117,6 +118,15 @@ std::shared_ptr<SchemaObj> Schema(const std::string &schema_file) {
// FUNCTIONS TO CREATE DATASETS FOR LEAF-NODE DATASETS
// (In alphabetical order)

+// Function to create an AlbumDataset.
+std::shared_ptr<AlbumDataset> Album(const std::string &dataset_dir, const std::string &data_schema,
+                                    const std::vector<std::string> &column_names, bool decode,
+                                    const std::shared_ptr<SamplerObj> &sampler) {
+  auto ds = std::make_shared<AlbumDataset>(dataset_dir, data_schema, column_names, decode, sampler);
+
+  return ds->ValidateParams() ? ds : nullptr;
+}
+
// Function to create a CelebADataset.
std::shared_ptr<CelebADataset> CelebA(const std::string &dataset_dir, const std::string &dataset_type,
                                      const std::shared_ptr<SamplerObj> &sampler, bool decode,
@@ -687,6 +697,49 @@ bool ValidateDatasetShardParams(const std::string &dataset_name, int32_t num_sha
// DERIVED DATASET CLASSES FOR LEAF-NODE DATASETS
// (In alphabetical order)

+// Constructor for AlbumDataset
+AlbumDataset::AlbumDataset(const std::string &dataset_dir, const std::string &data_schema,
+                           const std::vector<std::string> &column_names, bool decode,
+                           const std::shared_ptr<SamplerObj> &sampler)
+    : dataset_dir_(dataset_dir),
+      schema_path_(data_schema),
+      column_names_(column_names),
+      decode_(decode),
+      sampler_(sampler) {}
+
+bool AlbumDataset::ValidateParams() {
+  if (!ValidateDatasetDirParam("AlbumDataset", dataset_dir_)) {
+    return false;
+  }
+
+  if (!ValidateDatasetFilesParam("AlbumDataset", {schema_path_})) {
+    return false;
+  }
+
+  return true;
+}
+
+// Function to build AlbumDataset
+std::vector<std::shared_ptr<DatasetOp>> AlbumDataset::Build() {
+  // A vector containing shared pointers to the DatasetOps that this object will create
+  std::vector<std::shared_ptr<DatasetOp>> node_ops;
+
+  // If the user does not specify a Sampler, create a default one, i.e., a RandomSampler.
+  if (sampler_ == nullptr) {
+    sampler_ = CreateDefaultSampler();
+  }
+
+  auto schema = std::make_unique<DataSchema>();
+  RETURN_EMPTY_IF_ERROR(schema->LoadSchemaFile(schema_path_, column_names_));
+
+  // Argument that is not exposed to the user in the API.
+  std::set<std::string> extensions = {};
+
+  node_ops.push_back(std::make_shared<AlbumOp>(num_workers_, rows_per_buffer_, dataset_dir_, connector_que_size_,
+                                               decode_, extensions, std::move(schema), std::move(sampler_->Build())));
+  return node_ops;
+}
+
// Constructor for CelebADataset
CelebADataset::CelebADataset(const std::string &dataset_dir, const std::string &dataset_type,
                             const std::shared_ptr<SamplerObj> &sampler, const bool &decode,
@@ -13,6 +13,7 @@ set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES
    text_file_op.cc
    clue_op.cc
    csv_op.cc
+   album_op.cc
    )

set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES
@@ -0,0 +1,508 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "minddata/dataset/engine/datasetops/source/album_op.h"
#include <fstream>
#include <iomanip>
#include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/core/tensor_shape.h"
#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
#include "minddata/dataset/engine/db_connector.h"
#include "minddata/dataset/engine/execution_tree.h"
#include "minddata/dataset/engine/opt/pass.h"
#include "minddata/dataset/kernels/image/image_utils.h"

namespace mindspore {
namespace dataset {
AlbumOp::Builder::Builder() : builder_decode_(false), builder_sampler_(nullptr), builder_schema_file_("") {
  std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
  builder_num_workers_ = cfg->num_parallel_workers();
  builder_rows_per_buffer_ = cfg->rows_per_buffer();
  builder_op_connector_size_ = cfg->op_connector_size();
}

Status AlbumOp::Builder::Build(std::shared_ptr<AlbumOp> *ptr) {
  RETURN_IF_NOT_OK(SanityCheck());
  if (builder_sampler_ == nullptr) {
    int64_t num_samples = 0;  // a default num_samples of 0 means to sample the entire set of data
    int64_t start_index = 0;
    builder_sampler_ = std::make_shared<SequentialSampler>(start_index, num_samples);
  }

  builder_schema_ = std::make_unique<DataSchema>();
  Path schema_file(builder_schema_file_);
  if (builder_schema_file_ == "" || !schema_file.Exists()) {
    RETURN_STATUS_UNEXPECTED("Schema not provided");
  } else {
    MS_LOG(INFO) << "Schema file provided: " << builder_schema_file_ << ".";
    builder_schema_->LoadSchemaFile(builder_schema_file_, builder_columns_to_load_);
  }
  *ptr = std::make_shared<AlbumOp>(builder_num_workers_, builder_rows_per_buffer_, builder_dir_,
                                   builder_op_connector_size_, builder_decode_, builder_extensions_,
                                   std::move(builder_schema_), std::move(builder_sampler_));
  return Status::OK();
}

Status AlbumOp::Builder::SanityCheck() {
  Path dir(builder_dir_);
  std::string err_msg;
  err_msg += dir.IsDirectory() == false ? "Album path is invalid or not set\n" : "";
  err_msg += builder_num_workers_ <= 0 ? "Number of parallel workers is set to 0 or less\n" : "";
  return err_msg.empty() ? Status::OK() : Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg);
}

AlbumOp::AlbumOp(int32_t num_wkrs, int32_t rows_per_buffer, std::string file_dir, int32_t queue_size, bool do_decode,
                 const std::set<std::string> &exts, std::unique_ptr<DataSchema> data_schema,
                 std::shared_ptr<Sampler> sampler)
    : ParallelOp(num_wkrs, queue_size),
      rows_per_buffer_(rows_per_buffer),
      folder_path_(file_dir),
      decode_(do_decode),
      extensions_(exts),
      data_schema_(std::move(data_schema)),
      sampler_(std::move(sampler)),
      row_cnt_(0),
      buf_cnt_(0),
      sampler_ind_(0),
      dirname_offset_(0) {
  // Set the column name map (base class field)
  for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
    column_name_id_map_[data_schema_->column(i).name()] = i;
  }
  io_block_queues_.Init(num_workers_, queue_size);
}

// Helper function for string comparison; returns true if string a is
// alphabetically less than string b, much like the strcmp operation
bool StrComp(const std::string &a, const std::string &b) {
  return a < b;
}

// Single thread walks the folder directory and collects all file names,
// then calculates num_rows and returns
Status AlbumOp::PrescanEntry() {
  Path folder(folder_path_);
  dirname_offset_ = folder_path_.length();
  std::shared_ptr<Path::DirIterator> dirItr = Path::DirIterator::OpenDirectory(&folder);
  if (folder.Exists() == false || dirItr == nullptr) {
    RETURN_STATUS_UNEXPECTED("Error: unable to open " + folder_path_);
  }
  MS_LOG(INFO) << "Album folder path found: " << folder_path_ << ".";

  while (dirItr->hasNext()) {
    Path file = dirItr->next();
    if (extensions_.empty() || extensions_.find(file.Extension()) != extensions_.end()) {
      (void)image_rows_.push_back(file.toString().substr(dirname_offset_));
    } else {
      MS_LOG(INFO) << "Album operator unsupported file found: " << file.toString()
                   << ", extension: " << file.Extension() << ".";
    }
  }

  std::sort(image_rows_.begin(), image_rows_.end(), StrComp);
  num_rows_ = image_rows_.size();
  return Status::OK();
}

// Main logic: register the queues with the TaskGroup, launch all threads and do the functor's work
Status AlbumOp::operator()() {
  RETURN_IF_NOT_OK(this->PrescanEntry());
  RETURN_IF_NOT_OK(LaunchThreadsAndInitOp());
  std::unique_ptr<DataBuffer> sampler_buffer;
  RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer));
  while (true) {  // each iteration is 1 epoch
    std::vector<int64_t> keys;
    keys.reserve(rows_per_buffer_);
    while (sampler_buffer->eoe() == false) {
      TensorRow sample_row;
      RETURN_IF_NOT_OK(sampler_buffer->PopRow(&sample_row));
      std::shared_ptr<Tensor> sample_ids = sample_row[0];
      for (auto itr = sample_ids->begin<int64_t>(); itr != sample_ids->end<int64_t>(); ++itr) {
        if ((*itr) >= num_rows_) continue;  // index out of bound, skipping
        keys.push_back(*itr);
        row_cnt_++;
        if (row_cnt_ % rows_per_buffer_ == 0) {
          RETURN_IF_NOT_OK(
            io_block_queues_[buf_cnt_++ % num_workers_]->Add(std::make_unique<IOBlock>(keys, IOBlock::kDeIoBlockNone)));
          keys.clear();
        }
      }
      RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer));
    }
    if (keys.empty() == false) {
      RETURN_IF_NOT_OK(
        io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(keys, IOBlock::kDeIoBlockNone)));
    }
    if (IsLastIteration()) {
      std::unique_ptr<IOBlock> eoe_block = std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe);
      std::unique_ptr<IOBlock> eof_block = std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof);
      RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eoe_block)));
      RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eof_block)));
      for (int32_t i = 0; i < num_workers_; ++i) {
        RETURN_IF_NOT_OK(
          io_block_queues_[i]->Add(std::make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone)));
      }
      return Status::OK();
    } else {  // not the last repeat; put the master thread to sleep and wait for the wake-up from reset
      RETURN_IF_NOT_OK(
        io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe)));
      RETURN_IF_NOT_OK(wp_.Wait());  // Master thread goes to sleep after it has made all the IOBlocks
      wp_.Clear();
      RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer));
    }
    UpdateRepeatAndEpochCounter();
  }
}

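// Illustrative walkthrough (assumed example values, not prescribed by the op): with
// rows_per_buffer_ = 2, num_workers_ = 2 and sampled ids {0, 1, 2, 3, 4}, the loop above produces
//   after id 1: row_cnt_ = 2 -> IOBlock{0, 1} added to io_block_queues_[0] (buf_cnt_ -> 1)
//   after id 3: row_cnt_ = 4 -> IOBlock{2, 3} added to io_block_queues_[1] (buf_cnt_ -> 2)
// and the leftover {4} is flushed by the keys.empty() check once the sampler signals EOE,
// landing on io_block_queues_[0] as buf_cnt_ keeps round-robining over the workers.
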
// Contains the main logic of pulling an IOBlock from the IOBlockQueue, loading a buffer and pushing it to out_connector_
// IMPORTANT: 1 IOBlock produces 1 DataBuffer
Status AlbumOp::WorkerEntry(int32_t worker_id) {
  TaskManager::FindMe()->Post();
  int64_t buffer_id = worker_id;
  std::unique_ptr<IOBlock> io_block;
  RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block));
  while (io_block != nullptr) {
    if (io_block->eoe() == true) {
      RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE)));
      buffer_id = worker_id;
    } else if (io_block->eof() == true) {
      RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)));
    } else {
      std::vector<int64_t> keys;
      RETURN_IF_NOT_OK(io_block->GetKeys(&keys));
      if (keys.empty() == true) return Status::OK();  // empty key is a quit signal for workers
      std::unique_ptr<DataBuffer> db = std::make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone);
      RETURN_IF_NOT_OK(LoadBuffer(keys, &db));
      RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(db)));
      buffer_id += num_workers_;
    }
    RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block));
  }
  RETURN_STATUS_UNEXPECTED("Unexpected nullptr received in worker");
}

// Only JPEG/PNG/GIF/BMP are supported
// Optimization: could take in a tensor
Status AlbumOp::CheckImageType(const std::string &file_name, bool *valid) {
  std::ifstream file_handle;
  constexpr int read_num = 3;
  *valid = false;
  file_handle.open(file_name, std::ios::binary | std::ios::in);
  if (!file_handle.is_open()) {
    RETURN_STATUS_UNEXPECTED("Cannot open image file " + file_name);
  }
  unsigned char file_type[read_num];
  (void)file_handle.read(reinterpret_cast<char *>(file_type), read_num);

  if (file_handle.fail()) {
    file_handle.close();
    RETURN_STATUS_UNEXPECTED("Failed to read image file " + file_name);
  }
  file_handle.close();
  if (file_type[0] == 0xff && file_type[1] == 0xd8 && file_type[2] == 0xff) {
    // Normal JPEGs start with \xff\xd8\xff\xe0,
    // JPEG with EXIF starts with \xff\xd8\xff\xe1.
    // Use \xff\xd8\xff to cover both.
    *valid = true;
  } else if (file_type[0] == 0x89 && file_type[1] == 0x50 && file_type[2] == 0x4e) {
    // It's a PNG
    *valid = true;
  } else if (file_type[0] == 0x47 && file_type[1] == 0x49 && file_type[2] == 0x46) {
    // It's a GIF
    *valid = true;
  } else if (file_type[0] == 0x42 && file_type[1] == 0x4d) {
    // It's a BMP
    *valid = true;
  }
  return Status::OK();
}

Status AlbumOp::LoadImageTensor(const std::string &image_file_path, uint32_t col_num, TensorRow *row) {
  std::shared_ptr<Tensor> image;
  std::ifstream fs;
  fs.open(image_file_path, std::ios::binary | std::ios::in);
  if (fs.fail()) {
    MS_LOG(INFO) << "Image file not found: " << image_file_path << ".";
    // If the file doesn't exist, we don't flag this as an error in the input check; simply skip it
    return Status::OK();
  }

  MS_LOG(INFO) << "Image file found: " << image_file_path << ".";

  // check that the file is an image before decoding
  bool valid = false;
  RETURN_IF_NOT_OK(CheckImageType(image_file_path, &valid));
  RETURN_IF_NOT_OK(Tensor::CreateFromFile(image_file_path, &image));
  if (decode_ && valid) {
    Status rc = Decode(image, &image);
    if (rc.IsError()) {
      std::string err = "Failed to decode image: " + image_file_path;
      RETURN_STATUS_UNEXPECTED(err);
    }
  }
  row->push_back(std::move(image));
  return Status::OK();
}

Status AlbumOp::LoadStringArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) {
  std::vector<std::string> data = json_obj;

  MS_LOG(INFO) << "String array label found: " << data << ".";
  std::shared_ptr<Tensor> label;
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &label));
  row->push_back(std::move(label));
  return Status::OK();
}

Status AlbumOp::LoadStringTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) {
  std::string data = json_obj;

  MS_LOG(INFO) << "String label found: " << data << ".";
  std::shared_ptr<Tensor> label;
  RETURN_IF_NOT_OK(Tensor::CreateScalar<std::string>(data, &label));
  row->push_back(std::move(label));
  return Status::OK();
}

Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) {
  std::shared_ptr<Tensor> label;
  // consider templating this function to handle all int types
  if (data_schema_->column(col_num).type() == DataType(DataType::DE_INT64)) {
    std::vector<int64_t> data;

    // Iterate over the integer list and add those values to the output shape tensor
    auto items = json_obj.items();
    using it_type = decltype(items.begin());
    (void)std::transform(items.begin(), items.end(), std::back_inserter(data), [](it_type j) { return j.value(); });

    RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &label));
  } else if (data_schema_->column(col_num).type() == DataType(DataType::DE_INT32)) {
    std::vector<int32_t> data;

    // Iterate over the integer list and add those values to the output shape tensor
    auto items = json_obj.items();
    using it_type = decltype(items.begin());
    (void)std::transform(items.begin(), items.end(), std::back_inserter(data), [](it_type j) { return j.value(); });

    MS_LOG(INFO) << "Int array found: " << data << ".";
    RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &label));
  } else {
    RETURN_STATUS_UNEXPECTED("Error in LoadIntArrayTensor: unsupported int type");
  }
  row->push_back(std::move(label));
  return Status::OK();
}

Status AlbumOp::LoadIDTensor(const std::string &file, uint32_t col_num, TensorRow *row) {
  if (data_schema_->column(col_num).type() == DataType(DataType::DE_STRING)) {
    std::shared_ptr<Tensor> id;
    RETURN_IF_NOT_OK(Tensor::CreateScalar<std::string>(file, &id));
    row->push_back(std::move(id));
    return Status::OK();
  }
  // hack to get the file name without the extension; the 1 is to get rid of the leading separator character
  int64_t image_id = std::atoi(file.substr(1, file.find(".")).c_str());
  std::shared_ptr<Tensor> id;
  RETURN_IF_NOT_OK(Tensor::CreateScalar<int64_t>(image_id, &id));
  MS_LOG(INFO) << "File ID " << image_id << ".";
  row->push_back(std::move(id));
  return Status::OK();
}

Status AlbumOp::LoadEmptyTensor(uint32_t col_num, TensorRow *row) {
  std::shared_ptr<Tensor> empty_tensor;
  RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({}), data_schema_->column(col_num).type(), &empty_tensor));
  row->push_back(std::move(empty_tensor));
  return Status::OK();
}

// Loads a tensor with a float value. There is an issue with float64: we don't have a reverse lookup to the type,
// so we actually have to check what type we want to fill the tensor with.
// Float64 doesn't work with reinterpret_cast here. The alternative would be to limit floats in the schema to
// float32 only, which seems like a weird limitation to impose.
Status AlbumOp::LoadFloatTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) {
  std::shared_ptr<Tensor> float_tensor;
  if (data_schema_->column(col_num).type() == DataType(DataType::DE_FLOAT64)) {
    double data = json_obj;
    MS_LOG(INFO) << "double found: " << json_obj << ".";
    RETURN_IF_NOT_OK(Tensor::CreateScalar<double>(data, &float_tensor));
  } else if (data_schema_->column(col_num).type() == DataType(DataType::DE_FLOAT32)) {
    float data = json_obj;
    RETURN_IF_NOT_OK(Tensor::CreateScalar<float>(data, &float_tensor));
    MS_LOG(INFO) << "float found: " << json_obj << ".";
  }
  row->push_back(std::move(float_tensor));
  return Status::OK();
}

// Load 1 TensorRow (image, label) from 1 json file. 1 function call produces 1 TensorRow in a DataBuffer
// Possible optimization: the helper functions of LoadTensorRow could be changed
// to take a reference to a column descriptor
Status AlbumOp::LoadTensorRow(const std::string &file, TensorRow *row) {
  (*row) = {};
  MS_LOG(INFO) << "Image row file: " << file << ".";

  std::ifstream file_handle(folder_path_ + file);
  if (!file_handle.is_open()) {
    RETURN_STATUS_UNEXPECTED("Cannot open json file " + folder_path_ + file + ".");
  }
  std::string line;
  while (getline(file_handle, line)) {
    try {
      nlohmann::json js = nlohmann::json::parse(line);
      MS_LOG(INFO) << "This line: " << line << ".";

      // Note: since we take a schema here, we have to iterate over all column descriptors in the schema
      // and check for each key. Get the number of columns in the schema:
      int32_t columns = data_schema_->NumColumns();

      // Loop over each column descriptor; this could be optimized with switch cases
      for (int32_t i = 0; i < columns; i++) {
        // special case to handle
        if (data_schema_->column(i).name() == "id") {
          // id is internal, special case to load from file
          RETURN_IF_NOT_OK(LoadIDTensor(file, i, row));
          continue;
        }
        // check whether the key exists; insert a placeholder if it is not found
        if (js.find(data_schema_->column(i).name()) == js.end()) {
          // iterator not found, push an empty tensor as a placeholder
          MS_LOG(INFO) << "Pushing empty tensor for column: " << data_schema_->column(i).name() << ".";
          RETURN_IF_NOT_OK(LoadEmptyTensor(i, row));
          continue;
        }
        nlohmann::json column_value = js.at(data_schema_->column(i).name());
        MS_LOG(INFO) << "This column is: " << data_schema_->column(i).name() << ".";
        bool is_array = column_value.is_array();
        // load single string
        if (column_value.is_string() && data_schema_->column(i).type() == DataType(DataType::DE_STRING)) {
          RETURN_IF_NOT_OK(LoadStringTensor(column_value, i, row));
          continue;
        }
        // load string array
        if (is_array && data_schema_->column(i).type() == DataType(DataType::DE_STRING)) {
          RETURN_IF_NOT_OK(LoadStringArrayTensor(column_value, i, row));
          continue;
        }
        // load image file
        if (column_value.is_string() && data_schema_->column(i).type() != DataType(DataType::DE_STRING)) {
          std::string image_file_path = column_value;
          RETURN_IF_NOT_OK(LoadImageTensor(image_file_path, i, row));
          continue;
        }
        // load scalar float
        if (!is_array && (data_schema_->column(i).type() == DataType(DataType::DE_FLOAT32) ||
                          data_schema_->column(i).type() == DataType(DataType::DE_FLOAT64))) {
          RETURN_IF_NOT_OK(LoadFloatTensor(column_value, i, row));
          continue;
        }
        // load int array
        if (is_array && (data_schema_->column(i).type() == DataType(DataType::DE_INT64) ||
                         data_schema_->column(i).type() == DataType(DataType::DE_INT32))) {
          RETURN_IF_NOT_OK(LoadIntArrayTensor(column_value, i, row));
          continue;
        } else {
          MS_LOG(WARNING) << "Value type for column: " << data_schema_->column(i).name() << " is not supported.";
          continue;
        }
      }
    } catch (const std::exception &err) {
      file_handle.close();
      RETURN_STATUS_UNEXPECTED("Failed to parse json file " + folder_path_ + file + ".");
    }
  }
  file_handle.close();
  return Status::OK();
}

// Looping over LoadTensorRow to make 1 DataBuffer. 1 function call produces 1 buffer
Status AlbumOp::LoadBuffer(const std::vector<int64_t> &keys, std::unique_ptr<DataBuffer> *db) {
  std::unique_ptr<TensorQTable> deq = std::make_unique<TensorQTable>();
  TensorRow trow;

  for (const int64_t &key : keys) {
    RETURN_IF_NOT_OK(this->LoadTensorRow(image_rows_[key], &trow));
    deq->push_back(std::move(trow));
  }
  (*db)->set_tensor_table(std::move(deq));
  return Status::OK();
}

void AlbumOp::Print(std::ostream &out, bool show_all) const {
  // Always show the id and name as the first line, regardless of whether this is a summary or detailed print
  out << "(" << std::setw(2) << operator_id_ << ") <AlbumOp>:";
  if (!show_all) {
    // Call the super class for displaying any common 1-liner info
    ParallelOp::Print(out, show_all);
    // Then show any custom derived-internal 1-liner info for this op
    out << "\n";
  } else {
    // Call the super class for displaying any common detailed info
    ParallelOp::Print(out, show_all);
    // Then show any custom derived-internal stuff
    out << "\nNumber of rows: " << num_rows_ << "\nAlbum directory: " << folder_path_ << "\n\n";
  }
}

// Reset the Sampler and wake up the master thread (functor)
Status AlbumOp::Reset() {
  RETURN_IF_NOT_OK(sampler_->ResetSampler());
  row_cnt_ = 0;
  wp_.Set();  // wake up master thread after reset is done
  return Status::OK();
}

// Handshake with the Sampler; allow the Sampler to call RandomAccessOp's functions to get NumRows
Status AlbumOp::InitSampler() {
  RETURN_IF_NOT_OK(sampler_->HandshakeRandomAccessOp(this));
  return Status::OK();
}

Status AlbumOp::LaunchThreadsAndInitOp() {
  RETURN_UNEXPECTED_IF_NULL(tree_);
  // registers QueueList and individual Queues for interrupt services
  RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks()));
  RETURN_IF_NOT_OK(wp_.Register(tree_->AllTasks()));
  // launch main workers that load DataBuffers by reading all images
  RETURN_IF_NOT_OK(tree_->LaunchWorkers(num_workers_, std::bind(&AlbumOp::WorkerEntry, this, std::placeholders::_1)));
  TaskManager::FindMe()->Post();
  RETURN_IF_NOT_OK(this->InitSampler());  // pass num_rows to the Sampler
  return Status::OK();
}

// Visitor accept method for NodePass
Status AlbumOp::Accept(NodePass *p, bool *modified) {
  // Downcast shared pointer then call visitor
  return p->RunOnNode(shared_from_base<AlbumOp>(), modified);
}

Status AlbumOp::ComputeColMap() {
  // Set the column name map (base class field)
  if (column_name_id_map_.empty()) {
    for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
      column_name_id_map_[data_schema_->column(i).name()] = i;
    }
  } else {
    MS_LOG(WARNING) << "Column name map is already set!";
  }
  return Status::OK();
}
}  // namespace dataset
}  // namespace mindspore
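To make the schema-driven dispatch in LoadTensorRow concrete, here is a sketch of one sample file an AlbumOp could consume; the file name, keys and values are illustrative assumptions rather than data shipped with this patch:

// images/sample1.json -- one json object per line, keys mirroring the schema column names:
//   {"image": "original/images/1.JPEG", "label": [1, 2], "_priority": 0.8}
// Dispatch per schema column:
//   "image"      string value on a non-DE_STRING column -> LoadImageTensor (read bytes, optionally decode)
//   "label"      array of ints                          -> LoadIntArrayTensor
//   "_priority"  scalar float                           -> LoadFloatTensor
//   "id"         not in the json; derived from the file name by LoadIDTensor
//   any schema column missing from the json             -> LoadEmptyTensor placeholder
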
@@ -0,0 +1,298 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_ALBUM_OP_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_ALBUM_OP_H_

#include <deque>
#include <memory>
#include <queue>
#include <string>
#include <algorithm>
#include <map>
#include <set>
#include <utility>
#include <vector>
#include <unordered_map>
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/engine/data_buffer.h"
#include "minddata/dataset/engine/data_schema.h"
#include "minddata/dataset/engine/datasetops/parallel_op.h"
#include "minddata/dataset/engine/datasetops/source/io_block.h"
#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
#include "minddata/dataset/util/path.h"
#include "minddata/dataset/util/queue.h"
#include "minddata/dataset/util/services.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/util/wait_post.h"

namespace mindspore {
namespace dataset {
// Forward declares
template <typename T>
class Queue;

// Define row information as a list of file objects to read
using FolderImages = std::shared_ptr<std::pair<std::string, std::queue<std::string>>>;

/// \class AlbumOp album_op.h
class AlbumOp : public ParallelOp, public RandomAccessOp {
 public:
  class Builder {
   public:
    /// \brief Constructor for Builder class of AlbumOp
    Builder();

    /// \brief Destructor.
    ~Builder() = default;

    /// \brief Setter method
    /// \param[in] rows_per_buffer
    /// \return Builder setter method returns reference to the builder
    Builder &SetRowsPerBuffer(int32_t rows_per_buffer) {
      builder_rows_per_buffer_ = rows_per_buffer;
      return *this;
    }

    /// \brief Setter method
    /// \param[in] size
    /// \return Builder setter method returns reference to the builder
    Builder &SetOpConnectorSize(int32_t size) {
      builder_op_connector_size_ = size;
      return *this;
    }

    /// \brief Setter method
    /// \param[in] exts - file extensions to be read
    /// \return Builder setter method returns reference to the builder
    Builder &SetExtensions(const std::set<std::string> &exts) {
      builder_extensions_ = exts;
      return *this;
    }

    /// \brief Setter method
    /// \param[in] do_decode
    /// \return Builder setter method returns reference to the builder
    Builder &SetDecode(bool do_decode) {
      builder_decode_ = do_decode;
      return *this;
    }

    /// \brief Setter method
    /// \param[in] num_workers
    /// \return Builder setter method returns reference to the builder
    Builder &SetNumWorkers(int32_t num_workers) {
      builder_num_workers_ = num_workers;
      return *this;
    }

    /// \brief Setter method
    /// \param[in] sampler
    /// \return Builder setter method returns reference to the builder
    Builder &SetSampler(std::shared_ptr<Sampler> sampler) {
      builder_sampler_ = std::move(sampler);
      return *this;
    }

    /// \brief Setter method
    /// \param[in] dir - dataset directory
    /// \return Builder setter method returns reference to the builder
    Builder &SetAlbumDir(const std::string &dir) {
      builder_dir_ = dir;
      return *this;
    }

    /// \brief Setter method
    /// \param[in] file - schema file to load
    /// \return Builder setter method returns reference to the builder
    Builder &SetSchemaFile(const std::string &file) {
      builder_schema_file_ = file;
      return *this;
    }

    /// \brief Setter method
    /// \param[in] columns - input columns
    /// \return Builder setter method returns reference to the builder
    Builder &SetColumnsToLoad(const std::vector<std::string> &columns) {
      builder_columns_to_load_ = columns;
      return *this;
    }

    /// \brief Check validity of input args
    /// \return - The error code return
    Status SanityCheck();

    /// \brief The builder "build" method creates the final object.
    /// \param[inout] std::shared_ptr<AlbumOp> *op - DatasetOp
    /// \return - The error code return
    Status Build(std::shared_ptr<AlbumOp> *op);

   private:
    bool builder_decode_;
    std::vector<std::string> builder_columns_to_load_;
    std::string builder_dir_;
    std::string builder_schema_file_;
    int32_t builder_num_workers_;
    int32_t builder_rows_per_buffer_;
    int32_t builder_op_connector_size_;
    std::set<std::string> builder_extensions_;
    std::shared_ptr<Sampler> builder_sampler_;
    std::unique_ptr<DataSchema> builder_schema_;
  };

  /// \brief Constructor
  /// \param[in] num_wkrs - Number of workers reading images in parallel
  /// \param[in] rows_per_buffer - Number of images (rows) in each buffer
  /// \param[in] file_dir - directory of Album
  /// \param[in] queue_size - connector size
  /// \param[in] do_decode - decode image files
  /// \param[in] exts - set of file extensions to read; if empty, read everything under the dir
  /// \param[in] data_schema - schema of dataset
  /// \param[in] sampler - sampler that tells AlbumOp what to read
  AlbumOp(int32_t num_wkrs, int32_t rows_per_buffer, std::string file_dir, int32_t queue_size, bool do_decode,
          const std::set<std::string> &exts, std::unique_ptr<DataSchema> data_schema, std::shared_ptr<Sampler> sampler);

  /// \brief Destructor.
  ~AlbumOp() = default;

  /// \brief Initialize AlbumOp-related variables; calls the function to walk all files
  /// \return - The error code return
  Status PrescanEntry();

  /// \brief Worker thread pulls a number of IOBlocks from the IOBlock Queue, makes a buffer and pushes it to the Connector
  /// \param[in] worker_id - id of each worker
  /// \return Status - The error code return
  Status WorkerEntry(int32_t worker_id) override;

  /// \brief Main loop of AlbumOp
  ///     Master thread: fills the IOBlockQueue, then goes to sleep
  ///     Worker thread: pulls an IOBlock from the IOBlockQueue, works on it, then puts a buffer to the out connector
  /// \return Status - The error code return
  Status operator()() override;

  /// \brief A print method typically used for debugging
  /// \param[in] out
  /// \param[in] show_all
  void Print(std::ostream &out, bool show_all) const override;

  /// \brief Check if the image is valid. Only JPEG/PNG/GIF/BMP are supported
  ///     This function could be optimized to return the tensor to reduce opening/closing files
  /// \return Status - The error code return
  Status CheckImageType(const std::string &file_name, bool *valid);

  // Base-class override for NodePass visitor acceptor.
  // @param p - Pointer to the NodePass to be accepted.
  // @param modified - Whether this node visit modified the pipeline.
  // @return - Status of the node visit.
  Status Accept(NodePass *p, bool *modified) override;

  // Op name getter
  // @return Name of the current Op
  std::string Name() const override { return "AlbumOp"; }

 private:
  /// \brief Initialize Sampler; calls sampler->Init() within
  /// \return Status The error code return
  Status InitSampler();

  /// \brief Load image to tensor row
  /// \param[in] image_file Image name of file
  /// \param[in] col_num Column num in schema
  /// \param[inout] row Tensor row to push to
  /// \return Status The error code return
  Status LoadImageTensor(const std::string &image_file, uint32_t col_num, TensorRow *row);

  /// \brief Load vector of ints to tensor, append tensor to tensor row
  /// \param[in] json_obj Json object containing multi-dimensional label
  /// \param[in] col_num Column num in schema
  /// \param[inout] row Tensor row to push to
  /// \return Status The error code return
  Status LoadIntArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row);

  /// \brief Load string array into a tensor, append tensor to tensor row
  /// \param[in] json_obj Json object containing string tensor
  /// \param[in] col_num Column num in schema
  /// \param[inout] row Tensor row to push to
  /// \return Status The error code return
  Status LoadStringArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row);

  /// \brief Load string into a tensor, append tensor to tensor row
  /// \param[in] json_obj Json object containing string tensor
  /// \param[in] col_num Column num in schema
  /// \param[inout] row Tensor row to push to
  /// \return Status The error code return
  Status LoadStringTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row);

  /// \brief Load float value to tensor row
  /// \param[in] json_obj Json object containing float
  /// \param[in] col_num Column num in schema
  /// \param[inout] row Tensor row to push to
  /// \return Status The error code return
  Status LoadFloatTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row);

  /// \brief Load empty tensor to tensor row
  /// \param[in] col_num Column num in schema
  /// \param[inout] row Tensor row to push to
  /// \return Status The error code return
  Status LoadEmptyTensor(uint32_t col_num, TensorRow *row);

  /// \brief Load id from file name to tensor row
  /// \param[in] file The file name to get ID from
  /// \param[in] col_num Column num in schema
  /// \param[inout] row Tensor row to push to
  /// \return Status The error code return
  Status LoadIDTensor(const std::string &file, uint32_t col_num, TensorRow *row);

  /// \brief Load a tensor row according to a json file
  /// \param[in] file Json file location
  /// \param[inout] row Json content stored into a tensor row
  /// \return Status The error code return
  Status LoadTensorRow(const std::string &file, TensorRow *row);

  /// \brief Load a buffer of tensor rows from the given keys
  /// \param[in] keys Keys in ioblock
  /// \param[inout] db Databuffer to push to
  /// \return Status The error code return
  Status LoadBuffer(const std::vector<int64_t> &keys, std::unique_ptr<DataBuffer> *db);

  /// \brief Register queues, launch workers and initialize the sampler; called first when the functor runs
  /// \return The error code return
  Status LaunchThreadsAndInitOp();

  /// \brief Reset Op
  /// \return Status The error code return
  Status Reset() override;

  // Private function for computing the assignment of the column name map.
  // @return - Status
  Status ComputeColMap() override;

  int32_t rows_per_buffer_;
  std::string folder_path_;  // directory of image folder
  bool decode_;
  std::set<std::string> extensions_;  // extensions allowed
  std::unordered_map<std::string, int32_t> col_name_map_;
  std::unique_ptr<DataSchema> data_schema_;
  std::shared_ptr<Sampler> sampler_;
  int64_t row_cnt_;
  int64_t buf_cnt_;
  int64_t sampler_ind_;
  int64_t dirname_offset_;
  WaitPost wp_;
  std::vector<std::string> image_rows_;
  QueueList<std::unique_ptr<IOBlock>> io_block_queues_;  // queues of IOBlocks
};
}  // namespace dataset
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_ALBUM_OP_H_
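A minimal sketch of driving this Builder (the directory and schema paths are hypothetical; the unit tests below exercise the same chain with more options):

std::shared_ptr<AlbumOp> op;
AlbumOp::Builder builder;
Status rc = builder.SetAlbumDir("/path/to/testAlbum/images")           // must be an existing directory
              .SetSchemaFile("/path/to/testAlbum/datasetSchema.json")  // a schema file is mandatory in Build()
              .SetNumWorkers(2)
              .Build(&op);  // returns a non-OK Status if SanityCheck() or the schema load fails
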
@@ -134,7 +134,6 @@ Status ImageFolderOp::operator()() {
      TensorRow sample_row;
      RETURN_IF_NOT_OK(sampler_buffer->PopRow(&sample_row));
      std::shared_ptr<Tensor> sample_ids = sample_row[0];
-     if (sample_ids->type() != DataType(DataType::DE_INT64)) RETURN_STATUS_UNEXPECTED("Sampler Tensor isn't int64");
      for (auto itr = sample_ids->begin<int64_t>(); itr != sample_ids->end<int64_t>(); ++itr) {
        if ((*itr) >= num_rows_) continue;  // index out of bound, skipping
        keys.push_back(*itr);
@@ -30,6 +30,7 @@
#include "minddata/dataset/engine/datasetops/repeat_op.h"
#include "minddata/dataset/engine/datasetops/skip_op.h"
#include "minddata/dataset/engine/datasetops/shuffle_op.h"
+#include "minddata/dataset/engine/datasetops/source/album_op.h"
#include "minddata/dataset/engine/datasetops/source/celeba_op.h"
#include "minddata/dataset/engine/datasetops/source/cifar_op.h"
#include "minddata/dataset/engine/datasetops/source/coco_op.h"
@@ -199,6 +200,11 @@ Status NodePass::RunOnNode(std::shared_ptr<ImageFolderOp> node, bool *modified)
  return RunOnNode(std::static_pointer_cast<DatasetOp>(node), modified);
}

+Status NodePass::RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) {
+  // Fallback to base class visitor by default
+  return RunOnNode(std::static_pointer_cast<DatasetOp>(node), modified);
+}
+
Status NodePass::RunOnNode(std::shared_ptr<CacheOp> node, bool *modified) {
  // Fallback to base class visitor by default
  return RunOnNode(std::static_pointer_cast<DatasetOp>(node), modified);
@@ -49,6 +49,8 @@ class FilterOp;
class GeneratorOp;
#endif

+class AlbumOp;
+
class RandomDataOp;

class RepeatOp;
@@ -178,6 +180,8 @@ class NodePass : public Pass {

  virtual Status RunOnNode(std::shared_ptr<RandomDataOp> node, bool *modified);

+  virtual Status RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified);
+
  virtual Status RunOnNode(std::shared_ptr<TakeOp> node, bool *modified);

  virtual Status RunOnNode(std::shared_ptr<ZipOp> node, bool *modified);
@@ -21,6 +21,7 @@
#include "minddata/dataset/engine/datasetops/cache_lookup_op.h"
#include "minddata/dataset/engine/datasetops/cache_merge_op.h"
#include "minddata/dataset/engine/datasetops/cache_op.h"
+#include "minddata/dataset/engine/datasetops/source/album_op.h"
#include "minddata/dataset/engine/datasetops/source/celeba_op.h"
#include "minddata/dataset/engine/datasetops/source/cifar_op.h"
#include "minddata/dataset/engine/datasetops/source/coco_op.h"
@@ -152,6 +153,11 @@ Status CacheTransformPass::CachePass::RunOnNode(std::shared_ptr<ImageFolderOp> n
  return MappableCacheLeafSetup(std::static_pointer_cast<DatasetOp>(node));
}

+// Perform leaf node cache transform identification
+Status CacheTransformPass::CachePass::RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) {
+  return MappableCacheLeafSetup(std::static_pointer_cast<DatasetOp>(node));
+}
+
// Perform leaf node cache transform identification
Status CacheTransformPass::CachePass::RunOnNode(std::shared_ptr<MnistOp> node, bool *modified) {
  return MappableCacheLeafSetup(std::static_pointer_cast<DatasetOp>(node));
@@ -79,6 +79,12 @@ class CacheTransformPass : public TreePass {
  /// \return Status The error code return
  Status RunOnNode(std::shared_ptr<ImageFolderOp> node, bool *modified) override;

+  /// \brief Perform leaf node cache transform identifications
+  /// \param[in] node The node being visited
+  /// \param[inout] modified Indicator if the node was changed at all
+  /// \return Status The error code return
+  Status RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) override;
+
  /// \brief Perform leaf node cache transform identifications
  /// \param[in] node The node being visited
  /// \param[inout] modified Indicator if the node was changed at all
@@ -111,5 +111,11 @@ Status PrinterPass::RunOnNode(std::shared_ptr<ImageFolderOp> node, bool *modifie
  std::cout << "Visiting ImageFolderOp" << '\n';
  return Status::OK();
}

+Status PrinterPass::RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) {
+  *modified = false;
+  std::cout << "Visiting AlbumOp" << '\n';
+  return Status::OK();
+}
}  // namespace dataset
}  // namespace mindspore
@@ -58,6 +58,8 @@ class PrinterPass : public NodePass {
  Status RunOnNode(std::shared_ptr<DeviceQueueOp> node, bool *modified) override;

  Status RunOnNode(std::shared_ptr<ImageFolderOp> node, bool *modified) override;

+  Status RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) override;
};

}  // namespace dataset
@@ -48,6 +48,7 @@ class TensorOperation;
class SchemaObj;
class SamplerObj;
// Dataset classes (in alphabetical order)
+class AlbumDataset;
class CelebADataset;
class Cifar10Dataset;
class Cifar100Dataset;
@@ -79,6 +80,20 @@ class ZipDataset;
/// \return Shared pointer to the current schema
std::shared_ptr<SchemaObj> Schema(const std::string &schema_file = "");

+/// \brief Function to create an AlbumDataset
+/// \notes The generated dataset is specified through setting a schema
+/// \param[in] dataset_dir Path to the root directory that contains the dataset
+/// \param[in] data_schema Path to the dataset schema file
+/// \param[in] column_names Column names used to specify the columns to load; if empty, all columns will be read.
+///     (default = {})
+/// \param[in] decode The option to decode the images in the dataset (default = false)
+/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
+///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = nullptr)
+/// \return Shared pointer to the current Dataset
+std::shared_ptr<AlbumDataset> Album(const std::string &dataset_dir, const std::string &data_schema,
+                                    const std::vector<std::string> &column_names = {}, bool decode = false,
+                                    const std::shared_ptr<SamplerObj> &sampler = nullptr);
+
/// \brief Function to create a CelebADataset
/// \notes The generated dataset has two columns ['image', 'attr'].
///     The type of the image tensor is uint8. The attr tensor is uint32 and one hot type.
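A minimal usage sketch of the factory above (the dataset and schema paths are hypothetical):

auto ds = Album("/path/to/album", "/path/to/datasetSchema.json", {"image", "label", "id"});
if (ds == nullptr) {
  // ValidateParams() rejected the dataset directory or the schema file
}
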
@@ -119,11 +134,11 @@ std::shared_ptr<Cifar100Dataset> Cifar100(const std::string &dataset_dir,
/// \param[in] usage The data usage, one of "train", "test" or "eval" (default = "train").
/// \param[in] num_samples The number of samples to be included in the dataset.
///     (Default = 0 means all samples.)
-/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode::kGlobal)
+/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode.kGlobal)
///     Can be any of:
-///     ShuffleMode::kFalse - No shuffling is performed.
-///     ShuffleMode::kFiles - Shuffle files only.
-///     ShuffleMode::kGlobal - Shuffle both the files and samples.
+///     ShuffleMode.kFalse - No shuffling is performed.
+///     ShuffleMode.kFiles - Shuffle files only.
+///     ShuffleMode.kGlobal - Shuffle both the files and samples.
/// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1)
/// \param[in] shard_id The shard ID within num_shards. This argument should be
///     specified only when num_shards is also specified. (Default = 0)
@@ -248,11 +263,11 @@ std::shared_ptr<RandomDataset> RandomData(const int32_t &total_rows = 0, T schem
///     will be sorted in a lexicographical order.
/// \param[in] num_samples The number of samples to be included in the dataset.
///     (Default = 0 means all samples.)
-/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode::kGlobal)
+/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode.kGlobal)
///     Can be any of:
-///     ShuffleMode::kFalse - No shuffling is performed.
-///     ShuffleMode::kFiles - Shuffle files only.
-///     ShuffleMode::kGlobal - Shuffle both the files and samples.
+///     ShuffleMode.kFalse - No shuffling is performed.
+///     ShuffleMode.kFiles - Shuffle files only.
+///     ShuffleMode.kGlobal - Shuffle both the files and samples.
/// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1)
/// \param[in] shard_id The shard ID within num_shards. This argument should be
///     specified only when num_shards is also specified. (Default = 0)
@@ -506,6 +521,31 @@ class SchemaObj {
// DERIVED DATASET CLASSES FOR LEAF-NODE DATASETS
// (In alphabetical order)

+class AlbumDataset : public Dataset {
+ public:
+  /// \brief Constructor
+  AlbumDataset(const std::string &dataset_dir, const std::string &data_schema,
+               const std::vector<std::string> &column_names, bool decode, const std::shared_ptr<SamplerObj> &sampler);
+
+  /// \brief Destructor
+  ~AlbumDataset() = default;
+
+  /// \brief A base class override function to create a runtime dataset op object from this class
+  /// \return Shared pointer to the newly created DatasetOp
+  std::vector<std::shared_ptr<DatasetOp>> Build() override;
+
+  /// \brief Parameters validation
+  /// \return bool true if all the params are valid
+  bool ValidateParams() override;
+
+ private:
+  std::string dataset_dir_;
+  std::string schema_path_;
+  std::vector<std::string> column_names_;
+  bool decode_;
+  std::shared_ptr<SamplerObj> sampler_;
+};
+
class CelebADataset : public Dataset {
 public:
  /// \brief Constructor
@@ -5,6 +5,7 @@ SET(DE_UT_SRCS
    common/cvop_common.cc
    common/bboxop_common.cc
    auto_contrast_op_test.cc
+   album_op_test.cc
    batch_op_test.cc
    bit_functions_test.cc
    storage_container_test.cc
@@ -101,6 +102,7 @@ SET(DE_UT_SRCS
    c_api_samplers_test.cc
    c_api_transforms_test.cc
    c_api_dataset_ops_test.cc
+   c_api_dataset_album_test.cc
    c_api_dataset_cifar_test.cc
    c_api_dataset_clue_test.cc
    c_api_dataset_coco_test.cc
@ -0,0 +1,208 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include "common/common.h"
|
||||
#include "minddata/dataset/core/client.h"
|
||||
#include "minddata/dataset/core/global_context.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/album_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/sampler/distributed_sampler.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/sampler/pk_sampler.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/sampler/random_sampler.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h"
|
||||
#include "minddata/dataset/util/path.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "utils/log_adapter.h"
|
||||
#include "securec.h"
|
||||
#include "minddata/dataset/include/datasets.h"
|
||||
#include "minddata/dataset/include/transforms.h"
|
||||
|
||||
using namespace mindspore::dataset;
|
||||
using mindspore::MsLogLevel::ERROR;
|
||||
using mindspore::ExceptionType::NoExceptionType;
|
||||
using mindspore::LogStream;
|
||||
|
||||
std::shared_ptr<BatchOp> Batch(int batch_size = 1, bool drop = false, int rows_per_buf = 2);
|
||||
|
||||
std::shared_ptr<RepeatOp> Repeat(int repeat_cnt);
|
||||
|
||||
std::shared_ptr<ExecutionTree> Build(std::vector<std::shared_ptr<DatasetOp>> ops);
|
||||
|
||||
std::shared_ptr<AlbumOp> Album(int64_t num_works, int64_t rows, int64_t conns, std::string path,
|
||||
bool shuf = false, std::unique_ptr<Sampler> sampler = nullptr,
|
||||
bool decode = false) {
|
||||
std::shared_ptr<AlbumOp> so;
|
||||
AlbumOp::Builder builder;
|
||||
Status rc = builder.SetNumWorkers(num_works)
|
||||
.SetAlbumDir(path)
|
||||
.SetRowsPerBuffer(rows)
|
||||
.SetOpConnectorSize(conns)
|
||||
.SetExtensions({".json"})
|
||||
.SetSampler(std::move(sampler))
|
||||
.SetDecode(decode)
|
||||
.Build(&so);
|
||||
return so;
|
||||
}
|
||||
|
||||
std::shared_ptr<AlbumOp> AlbumSchema(int64_t num_works, int64_t rows, int64_t conns, std::string path,
|
||||
std::string schema_file, std::vector<std::string> column_names = {},
|
||||
bool shuf = false, std::unique_ptr<Sampler> sampler = nullptr,
|
||||
bool decode = false) {
|
||||
std::shared_ptr<AlbumOp> so;
|
||||
AlbumOp::Builder builder;
|
||||
Status rc = builder.SetNumWorkers(num_works)
|
||||
.SetSchemaFile(schema_file)
|
||||
.SetColumnsToLoad(column_names)
|
||||
.SetAlbumDir(path)
|
||||
.SetRowsPerBuffer(rows)
|
||||
.SetOpConnectorSize(conns)
|
||||
.SetExtensions({".json"})
|
||||
.SetSampler(std::move(sampler))
|
||||
.SetDecode(decode)
|
||||
.Build(&so);
|
||||
return so;
|
||||
}
|
||||
|
||||
class MindDataTestAlbum : public UT::DatasetOpTesting {
|
||||
protected:
|
||||
};
|
||||
|
||||
TEST_F(MindDataTestAlbum, TestSequentialAlbumWithSchema) {
|
||||
std::string folder_path = datasets_root_path_ + "/testAlbum/images";
|
||||
std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
|
||||
std::vector<std::string> column_names = {"image", "label", "id"};
|
||||
auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file, column_names, false), Repeat(2)});
|
||||
tree->Prepare();
|
||||
Status rc = tree->Launch();
|
||||
if (rc.IsError()) {
|
||||
MS_LOG(ERROR) << "Return code error detected during tree launch: " << ".";
|
||||
EXPECT_TRUE(false);
|
||||
} else {
|
||||
DatasetIterator di(tree);
|
||||
TensorMap tensor_map;
|
||||
di.GetNextAsMap(&tensor_map);
|
||||
EXPECT_TRUE(rc.IsOk());
|
||||
uint64_t i = 0;
|
||||
int32_t label = 0;
|
||||
while (tensor_map.size() != 0) {
|
||||
tensor_map["label"]->GetItemAt<int32_t>(&label, {});
|
||||
MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape"
|
||||
<< tensor_map["label"] << "\n";
|
||||
i++;
|
||||
di.GetNextAsMap(&tensor_map);
|
||||
}
|
||||
MS_LOG(INFO) << "got rows" << i << "\n";
|
||||
EXPECT_TRUE(i == 14);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestAlbum, TestSequentialAlbumWithSchemaNoOrder) {
  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
  auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file), Repeat(2)});
  tree->Prepare();
  Status rc = tree->Launch();
  if (rc.IsError()) {
    MS_LOG(ERROR) << "Return code error detected during tree launch.";
    EXPECT_TRUE(false);
  } else {
    DatasetIterator di(tree);
    TensorMap tensor_map;
    rc = di.GetNextAsMap(&tensor_map);
    EXPECT_TRUE(rc.IsOk());
    uint64_t i = 0;
    int32_t label = 0;
    while (tensor_map.size() != 0) {
      tensor_map["label"]->GetItemAt<int32_t>(&label, {});
      MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << " label: " << label
                    << " label shape: " << tensor_map["label"]->shape() << "\n";
      i++;
      di.GetNextAsMap(&tensor_map);
    }
    MS_LOG(INFO) << "got rows: " << i << "\n";
    EXPECT_EQ(i, 14);
  }
}
TEST_F(MindDataTestAlbum, TestSequentialAlbumWithSchemaFloat) {
  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  // add the priority column
  std::string schema_file = datasets_root_path_ + "/testAlbum/floatSchema.json";
  auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file), Repeat(2)});
  tree->Prepare();
  Status rc = tree->Launch();
  if (rc.IsError()) {
    MS_LOG(ERROR) << "Return code error detected during tree launch.";
    EXPECT_TRUE(false);
  } else {
    DatasetIterator di(tree);
    TensorMap tensor_map;
    rc = di.GetNextAsMap(&tensor_map);
    EXPECT_TRUE(rc.IsOk());
    uint64_t i = 0;
    int32_t label = 0;
    double priority = 0;
    while (tensor_map.size() != 0) {
      tensor_map["label"]->GetItemAt<int32_t>(&label, {});
      tensor_map["_priority"]->GetItemAt<double>(&priority, {});
      MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << " label: " << label
                    << " label shape: " << tensor_map["label"]->shape() << " priority: " << priority << "\n";
      i++;
      di.GetNextAsMap(&tensor_map);
    }
    MS_LOG(INFO) << "got rows: " << i << "\n";
    EXPECT_EQ(i, 14);
  }
}
TEST_F(MindDataTestAlbum, TestSequentialAlbumWithFullSchema) {
  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  // add the priority column
  std::string schema_file = datasets_root_path_ + "/testAlbum/fullSchema.json";
  auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file), Repeat(2)});
  tree->Prepare();
  Status rc = tree->Launch();
  if (rc.IsError()) {
    MS_LOG(ERROR) << "Return code error detected during tree launch.";
    EXPECT_TRUE(false);
  } else {
    DatasetIterator di(tree);
    TensorMap tensor_map;
    rc = di.GetNextAsMap(&tensor_map);
    EXPECT_TRUE(rc.IsOk());
    uint64_t i = 0;
    int32_t label = 0;
    double priority = 0;
    while (tensor_map.size() != 0) {
      tensor_map["label"]->GetItemAt<int32_t>(&label, {});
      tensor_map["_priority"]->GetItemAt<double>(&priority, {});
      MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << " label: " << label
                    << " label shape: " << tensor_map["label"]->shape() << " priority: " << priority
                    << " embedding shape: " << tensor_map["_embedding"]->shape() << "\n";
      i++;
      di.GetNextAsMap(&tensor_map);
    }
    MS_LOG(INFO) << "got rows: " << i << "\n";
    EXPECT_EQ(i, 14);
  }
}
@ -0,0 +1,136 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "common/common.h"
#include "minddata/dataset/include/datasets.h"

using namespace mindspore::dataset::api;
using mindspore::dataset::Tensor;

class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};
TEST_F(MindDataTestPipeline, TestAlbumBasic) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumBasic.";

  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
  std::vector<std::string> column_names = {"image", "label", "id"};
  // Create an Album dataset
  std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate over the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  while (row.size() != 0) {
    i++;
    auto image = row["image"];
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    iter->GetNextRow(&row);
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
TEST_F(MindDataTestPipeline, TestAlbumDecode) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumDecode.";
  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
  std::vector<std::string> column_names = {"image", "label", "id"};
  // Create an Album dataset with decode enabled
  std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate over the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  while (row.size() != 0) {
    i++;
    auto image = row["image"];
    auto shape = image->shape();
    MS_LOG(INFO) << "Tensor image shape size: " << shape.Size();
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    EXPECT_GT(shape.Size(), 1);  // Verify decode=true took effect
    iter->GetNextRow(&row);
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
TEST_F(MindDataTestPipeline, TestAlbumNumSamplers) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumNumSamplers.";

  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
  std::vector<std::string> column_names = {"image", "label", "id"};
  // Create an Album dataset with a SequentialSampler that takes a single row
  std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true, SequentialSampler(0, 1));
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate over the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  while (row.size() != 0) {
    i++;
    auto image = row["image"];
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    iter->GetNextRow(&row);
  }

  EXPECT_EQ(i, 1);

  // Manually terminate the pipeline
  iter->Stop();
}
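
The same pattern extends to other sampler settings; the following is a hypothetical sketch (not part of the original test file) that reuses only the SequentialSampler(start_index, num_samples) shape seen above:

// Hypothetical variant: start at row 1 and draw two samples; iterating as
// above would then be expected to yield exactly 2 rows.
std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true, SequentialSampler(1, 2));
EXPECT_NE(ds, nullptr);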
TEST_F(MindDataTestPipeline, TestAlbumError) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumError.";
  std::string folder_path = datasets_root_path_ + "/testAlbum/ima";
  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
  std::vector<std::string> column_names = {"image", "label", "id"};
  // Create an Album dataset with a dataset directory that does not exist
  std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true, SequentialSampler(0, 1));

  EXPECT_EQ(ds, nullptr);
}
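A further negative case follows the same shape; this is a hypothetical sketch (not in the original file) that assumes a nonexistent schema file also fails parameter validation and yields a nullptr, just as the bad directory does above:

TEST_F(MindDataTestPipeline, TestAlbumBadSchema) {  // hypothetical extra case
  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  // Point at a schema file that does not exist; validation should fail.
  std::shared_ptr<Dataset> ds = Album(folder_path, folder_path + "/no_such_schema.json", {"image"});
  EXPECT_EQ(ds, nullptr);
}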
@ -32,6 +32,8 @@ export GLOG_v=2

## prepare data for dataset & mindrecord
cp -fr $PROJECT_PATH/tests/ut/data ${PROJECT_PATH}/build/mindspore/tests/ut/cpp/
## prepare the album dataset; it uses absolute paths, so the json files have to be generated
python ${PROJECT_PATH}/build/mindspore/tests/ut/cpp/data/dataset/testAlbum/gen_json.py

if [ $# -gt 0 ]; then
    ./ut_tests --gtest_filter=$1
@ -0,0 +1 @@
just some random stuff
@ -0,0 +1,16 @@
{
    "columns": {
        "image": {
            "type": "uint8",
            "rank": 1
        },
        "label" : {
            "type": "string",
            "rank": 1
        },
        "id" : {
            "type": "int64",
            "rank": 0
        }
    }
}
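This is the schema the pipeline tests above load. As a sketch of how it is consumed, the Album API can be pointed at it with an explicit column list (loading a subset such as {"image", "id"} is assumed here, not verified by these tests):

// Hypothetical snippet: load only two of the three columns declared above.
std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, {"image", "id"});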
@ -5,7 +5,7 @@
            "rank": 1
        },
        "label" : {
            "type": "int32",
            "type": "string",
            "rank": 1
        },
        "id" : {
@ -5,7 +5,7 @@
            "rank": 1
        },
        "label" : {
            "type": "int32",
            "type": "string",
            "rank": 1
        },
        "id" : {
@ -2,21 +2,21 @@ import json
import os

def dump_json_from_dict(structure, file_name):
    with open(file_name + '.json', 'w') as file_path:
        json.dump(structure, file_path)
    with open(file_name + '.json', 'w') as fp:
        json.dump(structure, fp)

if __name__ == '__main__':
    # iterate over directory
    DIRECTORY = "imagefolder"
    i = 0
    # iterate over DIRECTORY
    DIRECTORY = os.path.dirname(os.path.realpath(__file__)) + "/original"
    PARENT_DIR = os.path.dirname(DIRECTORY)
    i = -1
    for filename in os.listdir(DIRECTORY):
        default_dict = {}
        default_dict.update(dataset='')
        default_dict.update(image=(os.path.join(DIRECTORY, filename)))
        default_dict.update(label=[1, 2])
        default_dict.update(image=os.path.abspath(os.path.join(DIRECTORY, filename)))
        default_dict.update(label=['3', '2'])
        default_dict.update(_priority=0.8)
        default_dict.update(_embedding='sample.bin')
        default_dict.update(_segmented_image=(os.path.join(DIRECTORY, filename)))
        default_dict.update(_processed_image=(os.path.join(DIRECTORY, filename)))
        default_dict.update(_embedding=os.path.abspath(os.path.join(PARENT_DIR, 'sample.bin')))
        default_dict.update(_processed_image=os.path.abspath(os.path.join(DIRECTORY, filename)))
        i = i + 1
        dump_json_from_dict(default_dict, 'images/'+str(i))
        dump_json_from_dict(default_dict, PARENT_DIR + '/images/'+str(i))
@ -0,0 +1 @@
{"dataset": "", "image": "original/apple_expect_decoded.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "sample.bin", "_processed_image": "original/apple_expect_decoded.jpg"}
@ -1 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_decoded.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_decoded.jpg", "_processed_image": "imagefolder/apple_expect_decoded.jpg"}
{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_resize_bilinear.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_resize_bilinear.jpg"}
@ -1 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_resize_bilinear.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_resize_bilinear.jpg", "_processed_image": "imagefolder/apple_expect_resize_bilinear.jpg"}
{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_changemode.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_changemode.jpg"}
@ -1 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_changemode.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_changemode.jpg", "_processed_image": "imagefolder/apple_expect_changemode.jpg"}
{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_not_flip.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_not_flip.jpg"}
@ -1 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_not_flip.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_not_flip.jpg", "_processed_image": "imagefolder/apple_expect_not_flip.jpg"}
{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_flipped_horizontal.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_flipped_horizontal.jpg"}
@ -1 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_flipped_horizontal.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_flipped_horizontal.jpg", "_processed_image": "imagefolder/apple_expect_flipped_horizontal.jpg"}
{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_rescaled.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_rescaled.jpg"}
@ -1 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_rescaled.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_rescaled.jpg", "_processed_image": "imagefolder/apple_expect_rescaled.jpg"}
{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_flipped_vertical.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_flipped_vertical.jpg"}
@ -1 +0,0 @@
{"dataset": "", "image": "imagefolder/apple_expect_flipped_vertical.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_flipped_vertical.jpg", "_processed_image": "imagefolder/apple_expect_flipped_vertical.jpg"}
(Binary image diffs, not shown: 7 test images under testAlbum changed location with sizes unchanged, and 14 images were added; sizes range from 112 KiB to 832 KiB.)