!2733 Schema clean up

Merge pull request !2733 from EricZ/schema_fix
This commit is contained in:
mindspore-ci-bot 2020-06-30 05:31:27 +08:00 committed by Gitee
commit 3197f9fb35
23 changed files with 241 additions and 148 deletions

View File

@ -183,35 +183,7 @@ TensorShape ColDescriptor::shape() const {
const char DataSchema::DEFAULT_DATA_SCHEMA_FILENAME[] = "datasetSchema.json";
// Constructor 1: Default constructor. Starts with an unknown dataset type and a row count of
// zero; the constructor body itself is empty.
DataSchema::DataSchema() : dataset_type_(DatasetType::kUnknown), num_rows_(0) {}
// Maps the textual dataset type to its DatasetType enum value.
// Only the exact strings "ARROW" and "TF" are recognised; any other input yields
// DatasetType::kUnknown.
DatasetType DataSchema::GetDatasetTYpeFromString(const std::string &type) const {
  if (type == "ARROW") {
    return DatasetType::kArrow;
  }
  if (type == "TF") {
    return DatasetType::kTf;
  }
  return DatasetType::kUnknown;
}
// Reads the json schema file at schema_file_path and records only the dataset type (string and
// enum form) and the directory structure. Column information is not touched here.
// Any std::exception raised while reading or parsing is turned into a Status error.
Status DataSchema::LoadDatasetType(const std::string &schema_file_path) {
  try {
    std::ifstream schema_stream(schema_file_path);
    nlohmann::json schema_json;
    schema_stream >> schema_json;

    // Missing keys fall back to an empty string.
    dataset_type_str_ = schema_json.value("datasetType", "");
    dataset_type_ = GetDatasetTYpeFromString(dataset_type_str_);
    dir_structure_ = schema_json.value("directoryStructure", "");
  } catch (const std::exception &err) {
    // Convert the exception into a Status return code
    RETURN_STATUS_UNEXPECTED("Schema file failed to load");
  }
  return Status::OK();
}
// Default constructor: the schema starts with a row count of zero.
DataSchema::DataSchema() : num_rows_(0) {}
// Internal helper function. Parses the json schema file in any order and produces a schema that
// does not follow any particular order (json standard does not enforce any ordering protocol).
@ -399,8 +371,6 @@ Status DataSchema::LoadSchemaString(const std::string &schema_json_string,
nlohmann::json js = nlohmann::json::parse(schema_json_string);
RETURN_IF_NOT_OK(PreLoadExceptionCheck(js));
num_rows_ = js.value("numRows", 0);
dataset_type_str_ = js.value("datasetType", "");
dataset_type_ = GetDatasetTYpeFromString(dataset_type_str_);
nlohmann::json column_tree = js.at("columns");
if (column_tree.empty()) {
RETURN_STATUS_UNEXPECTED("columns is null");
@ -430,16 +400,10 @@ const ColDescriptor &DataSchema::column(int32_t idx) const {
// A print method typically used for debugging
void DataSchema::Print(std::ostream &out) const {
out << "Dataset type string : (";
if (dataset_type_str_.empty()) {
out << "none specified)\n";
} else {
out << dataset_type_str_ << ")\n";
}
out << "Dataset schema: (";
for (const auto &col_desc : col_descs_) {
out << col_desc << "\n";
}
out << "Dataset type: " << static_cast<uint32_t>(dataset_type_) << "\n";
}
// Adds a column descriptor to the schema

View File

@ -30,196 +30,176 @@
namespace mindspore {
namespace dataset {
/// \class ColDescriptor data_schema.h
/// \brief A simple class to provide meta info about a column.
class ColDescriptor {
 public:
  /// \brief Constructor 1: Simple constructor that leaves things uninitialized.
  ColDescriptor();

  /// \brief Constructor 2: Main constructor
  /// \param[in] col_name - The name of the column
  /// \param[in] col_type - The DE Datatype of the column
  /// \param[in] tensor_impl - The (initial) type of tensor implementation for the column
  /// \param[in] rank - The number of dimension of the data
  /// \param[in] in_shape - optional argument for input shape
  ColDescriptor(const std::string &col_name, DataType col_type, TensorImpl tensor_impl, int32_t rank,
                const TensorShape *in_shape = nullptr);

  /// \brief Explicit copy constructor is required (the unique_ptr member prevents an implicit one)
  /// \param[in] in_cd - the source ColDescriptor
  ColDescriptor(const ColDescriptor &in_cd);

  /// \brief Assignment overload
  /// \param[in] in_cd - the source ColDescriptor
  ColDescriptor &operator=(const ColDescriptor &in_cd);

  /// \brief Destructor
  ~ColDescriptor();

  /// \brief A print method typically used for debugging
  /// \param[in,out] out - The output stream to write output to
  void Print(std::ostream &out) const;

  /// \brief Given a number of elements, this function will compute what the actual Tensor shape would be.
  ///     If there is no starting TensorShape in this column, or if there is a shape but it contains
  ///     an unknown dimension, then the output shape returned shall resolve dimensions as needed.
  /// \param[in] num_elements - The number of elements in the data for a Tensor
  /// \param[in,out] out_shape - The materialized output Tensor shape
  /// \return Status - The error code return
  Status MaterializeTensorShape(int32_t num_elements, TensorShape *out_shape) const;

  /// \brief << Stream output operator overload
  ///     This allows you to write the debug print info using stream operators
  /// \param[in,out] out - reference to the output stream being overloaded
  /// \param[in] cd - reference to the ColDescriptor to display
  /// \return - the output stream must be returned
  friend std::ostream &operator<<(std::ostream &out, const ColDescriptor &cd) {
    cd.Print(out);
    return out;
  }

  /// \brief getter function
  /// \return The column's DataType
  DataType type() const { return type_; }

  /// \brief getter function
  /// \return The column's rank
  int32_t rank() const { return rank_; }

  /// \brief getter function
  /// \return The column's name
  std::string name() const { return col_name_; }

  /// \brief getter function
  /// \return The column's shape
  TensorShape shape() const;

  /// \brief getter function
  /// \return True if the column has an assigned fixed shape.
  bool hasShape() const { return tensor_shape_ != nullptr; }

  /// \brief getter function
  /// \return The column's tensor implementation type
  TensorImpl tensorImpl() const { return tensor_impl_; }

 private:
  DataType type_;                              // The columns type
  int32_t rank_;                               // The rank for this column (number of dimensions)
  TensorImpl tensor_impl_;                     // The initial flavour of the tensor for this column
  std::unique_ptr<TensorShape> tensor_shape_;  // The fixed shape (if given by user)
  std::string col_name_;                       // The name of the column
};
/// \class DataSchema data_schema.h
/// \brief A list of the columns.
class DataSchema {
public:
/// \brief Constructor
DataSchema();
/// \brief Destructor
~DataSchema();
/// \brief Populates the schema with a dataset type from a json file. It does not populate any of the
/// column info. To populate everything, use LoadSchemaFile() afterwards.
/// \param[in] schema_file_path - Absolute path to the schema file to use for getting dataset type info.
/// \return Status - The error code return
Status LoadDatasetType(const std::string &schema_file_path);
/// \brief Parses a schema json file and populates the columns and meta info.
/// \param[in] schema_file_path - the schema file that has the column's info to load
/// \param[in] columns_to_load - list of strings for columns to load. if empty, assumes all columns.
/// \return Status - The error code return
Status LoadSchemaFile(const std::string &schema_file_path, const std::vector<std::string> &columns_to_load);
/// \brief Parses a schema JSON string and populates the columns and meta info.
/// \param[in] schema_json_string - the schema JSON string that has the column's info to load
/// \param[in] columns_to_load - list of strings for columns to load. if empty, assumes all columns.
/// \return Status - The error code return
Status LoadSchemaString(const std::string &schema_json_string, const std::vector<std::string> &columns_to_load);
/// \brief A print method typically used for debugging
/// \param[in,out] out - The output stream to write output to
void Print(std::ostream &out) const;
/// \brief << Stream output operator overload. This allows you to write the debug print info using stream operators
/// \param[in,out] out - reference to the output stream being overloaded
/// \param[in] ds - reference to the DataSchema to display
/// \return - the output stream must be returned
friend std::ostream &operator<<(std::ostream &out, const DataSchema &ds) {
ds.Print(out);
return out;
}
/// \brief Adds a column descriptor to the schema
/// \param[in] cd - The ColDescriptor to add
/// \return Status - The error code return
Status AddColumn(const ColDescriptor &cd);
/// \brief Setter
/// \param[in] in_type - The Dataset type to set into the schema
void set_dataset_type(DatasetType in_type) { dataset_type_ = in_type; }
/// \brief getter
/// \return The dataset type of the schema
DatasetType dataset_type() const { return dataset_type_; }
/// \brief getter
/// \param[in] idx - The index of the column to get
/// \return The reference to a ColDescriptor to get (const version)
const ColDescriptor &column(int32_t idx) const;
/// \brief getter
/// \return The number of columns in the schema
int32_t NumColumns() const { return col_descs_.size(); }
/// \brief getter
/// \return True if the schema has no columns
bool Empty() const { return NumColumns() == 0; }
/// \brief getter
/// \return The directory structure string of the schema
std::string dir_structure() const { return dir_structure_; }
/// \brief getter
/// \return The dataset type of the schema in its string form
std::string dataset_type_str() const { return dataset_type_str_; }
/// \brief getter
/// \return The number of rows read from schema
int64_t num_rows() const { return num_rows_; }
/// \brief The default file name of a schema file
static const char DEFAULT_DATA_SCHEMA_FILENAME[];
/// \brief Loops through all columns in the schema and returns a map with the column name to column index number.
/// \param[in,out] out_column_name_map - The output map of columns names to column index
/// \return Status - The error code return
Status GetColumnNameMap(std::unordered_map<std::string, int32_t> *out_column_name_map);
private:
/// \brief Internal helper function. Parses the json schema file in any order and produces a schema that
/// does not follow any particular order (json standard does not enforce any ordering protocol).
/// This one produces a schema that contains all of the columns from the schema file.
/// \param[in] column_tree - The nlohmann tree from the json file to parse
/// \return Status - The error code return
Status AnyOrderLoad(nlohmann::json column_tree);
/// \brief Internal helper function. For each input column name, perform a lookup to the json document to
/// find the matching column. When the match is found, process that column to build the column
/// descriptor and add to the schema in the order in which the input column names are given.
/// \param[in] column_tree - The nlohmann tree from the json file to parse
/// \param[in] columns_to_load - list of strings for the columns to add to the schema
/// \return Status - The error code return
Status ColumnOrderLoad(nlohmann::json column_tree, const std::vector<std::string> &columns_to_load);
/// \brief Internal helper function. Given the json tree for a given column, load it into our schema.
/// \param[in] column_child_tree - The nlohmann child tree for a given column to load.
/// \param[in] col_name - The string name of the column for that subtree.
/// \return Status - The error code return
Status ColumnLoad(nlohmann::json column_child_tree, const std::string &col_name);
/// \brief Internal helper function. Performs sanity checks on the json file setup.
/// \param[in] js - The nlohmann tree for the schema file
/// \return Status - The error code return
Status PreLoadExceptionCheck(const nlohmann::json &js);
/// \brief Internal helper function. Converts the string form of a dataset type to its enum form.
/// \param[in] type - The dataset type as a string
/// \return The matching DatasetType enum value
DatasetType GetDatasetTYpeFromString(const std::string &type) const;
std::vector<ColDescriptor> col_descs_; // Vector of column descriptors
std::string dataset_type_str_; // A string that represents the type of dataset
DatasetType dataset_type_; // The numeric form of the dataset type from enum
std::string dir_structure_; // Implicit or flatten
int64_t num_rows_; // The number of rows read from the schema
};
} // namespace dataset

View File

@ -4019,8 +4019,6 @@ class Schema:
else:
raise RuntimeError("Unknown field %s" % k)
if self.dataset_type is None:
raise RuntimeError("DatasetType field is missing.")
if self.columns is None:
raise RuntimeError("Columns are missing.")
if self.num_rows is not None:

View File

@ -47,6 +47,7 @@ SET(DE_UT_SRCS
rescale_op_test.cc
resize_bilinear_op_test.cc
resize_op_test.cc
schema_test.cc
shuffle_op_test.cc
stand_alone_samplers_test.cc
status_test.cc

View File

@ -0,0 +1,68 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include "common/common.h"
#include "common/utils.h"
#include "dataset/core/client.h"
#include "dataset/core/global_context.h"
#include "dataset/engine/data_schema.h"
#include "dataset/util/path.h"
#include "dataset/util/status.h"
#include "gtest/gtest.h"
#include "utils/log_adapter.h"
#include "securec.h"
namespace common = mindspore::common;
using namespace mindspore::dataset;
using mindspore::MsLogLevel::ERROR;
using mindspore::ExceptionType::NoExceptionType;
using mindspore::LogStream;
// Test fixture for the DataSchema loading tests. It adds no members of its own on top of
// UT::DatasetOpTesting.
class MindDataTestSchema : public UT::DatasetOpTesting {
protected:
};
// Loads testDataset2/datasetSchema.json with an empty column list and expects the resulting
// schema to hold 4 columns.
TEST_F(MindDataTestSchema, TestOldSchema) {
  std::string schema_file = datasets_root_path_ + "/testDataset2/datasetSchema.json";
  std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
  Status rc = schema->LoadSchemaFile(schema_file, {});
  if (rc.IsError()) {
    MS_LOG(ERROR) << "Return code error detected during schema load: " << common::SafeCStr(rc.ToString()) << ".";
    // Record a non-fatal failure that says what went wrong instead of a bare EXPECT_TRUE(false).
    ADD_FAILURE() << "LoadSchemaFile failed for " << schema_file;
    return;
  }
  // EXPECT_EQ prints the actual column count when the expectation is not met.
  int32_t num_cols = schema->NumColumns();
  EXPECT_EQ(num_cols, 4);
}
// Loads testAlbum/fullSchema.json with an empty column list and expects the resulting schema
// to hold 7 columns.
TEST_F(MindDataTestSchema, TestAlbumSchema) {
  std::string schema_file = datasets_root_path_ + "/testAlbum/fullSchema.json";
  std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
  Status rc = schema->LoadSchemaFile(schema_file, {});
  if (rc.IsError()) {
    MS_LOG(ERROR) << "Return code error detected during schema load: " << common::SafeCStr(rc.ToString()) << ".";
    // Record a non-fatal failure that says what went wrong instead of a bare EXPECT_TRUE(false).
    ADD_FAILURE() << "LoadSchemaFile failed for " << schema_file;
    return;
  }
  int32_t num_cols = schema->NumColumns();
  MS_LOG(INFO) << "num_cols: " << num_cols << ".";
  // EXPECT_EQ prints the actual column count when the expectation is not met.
  EXPECT_EQ(num_cols, 7);
}

View File

@ -0,0 +1,20 @@
{
"columns": {
"image": {
"type": "uint8",
"rank": 1
},
"label" : {
"type": "int32",
"rank": 1
},
"id" : {
"type": "int64",
"rank": 0
},
"_priority" : {
"type": "float64",
"rank": 0
}
}
}

View File

@ -0,0 +1,32 @@
{
"columns": {
"image": {
"type": "uint8",
"rank": 1
},
"label" : {
"type": "int32",
"rank": 1
},
"id" : {
"type": "int64",
"rank": 0
},
"_priority" : {
"type": "float64",
"rank": 0
},
"_embedding" : {
"type": "uint8",
"rank": 1
},
"_segmented_image" : {
"type": "uint8",
"rank": 1
},
"_processed_image" : {
"type": "uint8",
"rank": 1
}
}
}

View File

@ -0,0 +1,22 @@
import json
import os
def dump_json_from_dict(structure, file_name):
    """Serialize ``structure`` as JSON into the file ``file_name`` + ``'.json'``.

    Args:
        structure: The JSON-serializable object to write.
        file_name: Output path without the ``.json`` extension.
    """
    target = file_name + '.json'
    with open(target, 'w') as handle:
        json.dump(structure, handle)
if __name__ == '__main__':
    # Write one numbered JSON annotation file (images/1.json, images/2.json, ...) for every
    # entry found in the image directory.
    DIRECTORY = "imagefolder"
    OUTPUT_DIR = "images"
    # open() does not create missing parent directories, so make sure the target exists.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # os.listdir() returns entries in arbitrary order; sort them so that the numbering of the
    # generated files is reproducible from run to run.
    for i, filename in enumerate(sorted(os.listdir(DIRECTORY)), start=1):
        image_path = os.path.join(DIRECTORY, filename)
        default_dict = {
            'dataset': '',
            'image': image_path,
            'label': [1, 2],
            '_priority': 0.8,
            '_embedding': 'sample.bin',
            '_segmented_image': image_path,
            '_processed_image': image_path,
        }
        dump_json_from_dict(default_dict, os.path.join(OUTPUT_DIR, str(i)))

Binary file not shown.

After

Width:  |  Height:  |  Size: 422 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 422 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 429 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 832 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 422 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 147 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 112 KiB

View File

@ -0,0 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_decoded.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_decoded.jpg", "_processed_image": "imagefolder/apple_expect_decoded.jpg"}

View File

@ -0,0 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_resize_bilinear.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_resize_bilinear.jpg", "_processed_image": "imagefolder/apple_expect_resize_bilinear.jpg"}

View File

@ -0,0 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_changemode.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_changemode.jpg", "_processed_image": "imagefolder/apple_expect_changemode.jpg"}

View File

@ -0,0 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_not_flip.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_not_flip.jpg", "_processed_image": "imagefolder/apple_expect_not_flip.jpg"}

View File

@ -0,0 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_flipped_horizontal.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_flipped_horizontal.jpg", "_processed_image": "imagefolder/apple_expect_flipped_horizontal.jpg"}

View File

@ -0,0 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_rescaled.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_rescaled.jpg", "_processed_image": "imagefolder/apple_expect_rescaled.jpg"}

View File

@ -0,0 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_flipped_vertical.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_flipped_vertical.jpg", "_processed_image": "imagefolder/apple_expect_flipped_vertical.jpg"}

View File

@ -0,0 +1 @@
just some random stuff