forked from mindspore-Ecosystem/mindspore
add TensorDataset and its ut
This commit is contained in:
parent 8de8289cfd
commit bc22c172b8
@@ -19,7 +19,7 @@ can also create samplers with this module to sample data.
 """
 
 from .core.configuration import config
-from .engine.datasets import TFRecordDataset, ImageFolderDatasetV2, MnistDataset, MindDataset, \
+from .engine.datasets import TFRecordDataset, ImageFolderDatasetV2, MnistDataset, MindDataset, NumpySlicesDataset, \
     GeneratorDataset, ManifestDataset, Cifar10Dataset, Cifar100Dataset, VOCDataset, CocoDataset, CelebADataset, \
     TextFileDataset, Schema, Shuffle, zip, RandomDataset
 from .engine.samplers import DistributedSampler, PKSampler, RandomSampler, SequentialSampler, SubsetRandomSampler, \
@@ -29,6 +29,6 @@ from .engine.graphdata import GraphData
 
 __all__ = ["config", "ImageFolderDatasetV2", "MnistDataset",
            "MindDataset", "GeneratorDataset", "TFRecordDataset",
-           "ManifestDataset", "Cifar10Dataset", "Cifar100Dataset", "CelebADataset",
+           "ManifestDataset", "Cifar10Dataset", "Cifar100Dataset", "CelebADataset", "NumpySlicesDataset",
            "VOCDataset", "CocoDataset", "TextFileDataset", "Schema", "DistributedSampler", "PKSampler", "RandomSampler",
            "SequentialSampler", "SubsetRandomSampler", "WeightedRandomSampler", "zip", "GraphData"]
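With the export in place, the new dataset is reachable from the package root; a quick sketch of the resulting import surface (assuming a build that includes this commit):

>>> import mindspore.dataset as ds
>>> # NumpySlicesDataset now sits alongside the other dataset classes
>>> dataset1 = ds.NumpySlicesDataset([1, 2, 3], shuffle=False)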
@@ -40,7 +40,7 @@ from mindspore import log as logger
 from . import samplers
 from .iterators import DictIterator, TupleIterator
 from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
-    check_rename, \
+    check_rename, check_numpyslicesdataset, \
     check_take, check_project, check_imagefolderdatasetv2, check_mnist_cifar_dataset, check_manifestdataset, \
     check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset, \
     check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \
@@ -4377,3 +4377,158 @@ class TextFileDataset(SourceDataset):
            return self.num_shards > 1

        return False


class _NumpySlicesDataset:
    """
    Handle several kinds of formats of Python data and return one row each time.
    """
    def __init__(self, data, column_list=None):
        self.column_list = None
        # Convert dict data into tuple
        if isinstance(data, dict) or isinstance(data[0], dict):
            data = self.process_dict(data)

        if isinstance(data[0], tuple) or isinstance(data, tuple):
            self.is_tuple = True
            # Keep a mutable list so the per-column conversion below can assign in place
            # (a plain tuple input would not support item assignment)
            self.data = list(data)
            if isinstance(data[0], tuple):
                for i in range(len(self.data)):
                    self.data[i] = np.array(self.data[i])
        else:
            self.is_tuple = False
            self.data = np.array(data)

        # Init column_name
        if column_list is not None:
            self.column_list = column_list
        elif self.column_list is None:
            self.column_list = []
            column_num = len(self.data) if self.is_tuple else 1
            for i in range(column_num):
                self.column_list.append("column_" + str(i))

    def __getitem__(self, index):
        if self.is_tuple:
            data_row = []
            for i in range(len(self.data)):
                data_row.append(self.data[i][index, ...])
            data_res = tuple(data_row)
        else:
            data_row = self.data[index, ...]
            data_row = [data_row]
            data_res = tuple(data_row)

        return data_res

    def __len__(self):
        if self.is_tuple:
            return len(self.data[0])
        return len(self.data)

    def process_dict(self, input_data):
        """
        Convert dict-like data into tuple format; when the input is a tuple of dicts, compose it into one dict first.
        """
        # When input is a tuple of dicts, compose it
        if isinstance(input_data, tuple) and isinstance(input_data[0], dict):
            data_dict = {}
            for d in input_data:
                data_dict.update(d)
            input_data = data_dict

        # Convert a pandas-like dict (whose columns expose a "values" attribute) into a plain dict
        data_keys = list(input_data.keys())
        data_col = input_data[data_keys[0]]
        if hasattr(data_col, "values"):
            new_dict = {}
            for key in data_keys:
                item1 = input_data.pop(key)
                new_dict[key] = item1.values
            input_data = new_dict

        # Convert the data in the dict into one tuple per column
        data = []
        self.column_list = []
        keys = input_data.keys()
        for key in keys:
            self.column_list.append(key)
            value = input_data[key]
            data.append(tuple(value))

        return data

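A quick sanity sketch of the row format produced by the helper above (hypothetical values, following the committed logic):

>>> inner = _NumpySlicesDataset({"a": [1, 2], "b": [3, 4]})
>>> inner.column_list
['a', 'b']
>>> len(inner)
2
>>> inner[0]  # one tuple per row, one entry per column
(1, 3)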
class NumpySlicesDataset(GeneratorDataset):
    """
    Create a dataset from the given data slices, mainly for loading Python data into a dataset.

    This dataset can take in a sampler. 'sampler' and 'shuffle' are mutually exclusive. The table
    below shows what input arguments are allowed and their expected behavior.

    .. list-table:: Expected Order Behavior of Using 'sampler' and 'shuffle'
       :widths: 25 25 50
       :header-rows: 1

       * - Parameter 'sampler'
         - Parameter 'shuffle'
         - Expected Order Behavior
       * - None
         - None
         - random order
       * - None
         - True
         - random order
       * - None
         - False
         - sequential order
       * - Sampler object
         - None
         - order defined by sampler
       * - Sampler object
         - True
         - not allowed
       * - Sampler object
         - False
         - not allowed

    Args:
        data (list, tuple or dict): Input data. Supported data types include list, tuple, dict and other NumPy
            formats. The input data will be sliced along its first dimension to generate rows. Loading large data
            this way is not recommended, since all of the data is loaded into memory.
        column_names (list[str], optional): List of column names of the dataset (default=None). If column_names is
            not provided and data is a dict, column_names will be the dict's keys; otherwise columns will be named
            column_0, column_1, and so on.
        num_samples (int, optional): The number of samples to be included in the dataset (default=None, all samples).
        num_parallel_workers (int, optional): Number of subprocesses used to fetch the dataset in parallel (default=1).
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset. Random-accessible input is required
            (default=None, expected order behavior shown in the table).
        sampler (Sampler/Iterable, optional): Object used to choose samples from the dataset. Random-accessible input
            is required (default=None, expected order behavior shown in the table).
        num_shards (int, optional): Number of shards that the dataset should be divided into (default=None).
            This argument should be specified only when 'num_samples' is None. Random-accessible input is required.
        shard_id (int, optional): The shard ID within num_shards (default=None). This argument should be specified
            only when num_shards is also specified. Random-accessible input is required.

    Examples:
        >>> import mindspore.dataset as ds
        >>> # 1) Input data can be a list
        >>> data = [1, 2, 3]
        >>> dataset1 = ds.NumpySlicesDataset(data, column_names=["column_1"])
        >>> # 2) Input data can be a dict, and column_names will be its keys
        >>> data = {"a": [1, 2], "b": [3, 4]}
        >>> dataset2 = ds.NumpySlicesDataset(data)
        >>> # 3) Input data can be a tuple (or list of tuples), where each tuple element is the data of one column
        >>> data = ((1, 2), (3, 4), (5, 6))
        >>> dataset3 = ds.NumpySlicesDataset(data, column_names=["column_1", "column_2", "column_3"])
        >>> # 4) Load data from a csv file
        >>> import pandas as pd
        >>> df = pd.read_csv("file.csv")
        >>> dataset4 = ds.NumpySlicesDataset(dict(df), shuffle=False)
    """

    @check_numpyslicesdataset
    def __init__(self, data, column_names=None, num_samples=None, num_parallel_workers=1, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None):
        dataset = _NumpySlicesDataset(data, column_names)
        super().__init__(dataset, column_names=dataset.column_list, num_samples=num_samples,
                         num_parallel_workers=num_parallel_workers, shuffle=shuffle, sampler=sampler,
                         num_shards=num_shards, shard_id=shard_id)
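The sampler rules in the table map directly onto the constructor arguments; a short sketch (assuming this commit):

>>> import mindspore.dataset as ds
>>> data = [[1, 2], [3, 4], [5, 6]]
>>> # order defined by the sampler; pairing a sampler with shuffle=True or shuffle=False is not allowed
>>> dataset = ds.NumpySlicesDataset(data, sampler=ds.SequentialSampler())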
@@ -1356,3 +1356,48 @@ def check_gnn_get_node_feature(method):
        return method(*args, **kwargs)

    return new_method


def check_numpyslicesdataset(method):
    """A wrapper that wraps a parameter checker around the original Dataset (NumpySlicesDataset)."""

    @wraps(method)
    def new_method(*args, **kwargs):
        param_dict = make_param_dict(method, args, kwargs)

        # check data; required argument
        data = param_dict.get('data')
        if not isinstance(data, (list, tuple, dict, np.ndarray)):
            raise TypeError("Unsupported data type: {}, only common Python data types such as list, tuple, "
                            "dict, and NumPy array are supported.".format(type(data)))
        # test emptiness via len(), since the truth value of a multi-element NumPy array is ambiguous
        if len(data) == 0:
            raise ValueError("Input data is empty.")

        # check column_names
        column_names = param_dict.get('column_names')
        if column_names is not None:
            check_columns(column_names, "column_names")

            # check the number of input columns in column_names
            column_num = 1 if isinstance(column_names, str) else len(column_names)
            if isinstance(data, dict):
                data_column = len(list(data.keys()))
                if column_num != data_column:
                    raise ValueError("Num of column is {0}, but required is {1}.".format(column_num, data_column))

            # Consider input that is a tuple of dicts
            elif isinstance(data[0], dict):
                data_column = sum(len(list(data[i].keys())) for i in range(len(data)))
                if column_num != data_column:
                    raise ValueError("Num of column is {0}, but required is {1}.".format(column_num, data_column))

            elif isinstance(data[0], tuple) or isinstance(data, tuple):
                if column_num != len(data):
                    raise ValueError("Num of column is {0}, but required is {1}.".format(column_num, len(data)))
            else:
                if column_num != 1:
                    raise ValueError("Num of column is {0}, but required is {1} as data is a list.".format(column_num, 1))

        return method(*args, **kwargs)

    return new_method
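In effect, the validator requires exactly one name per column implied by the data. A sketch of a rejected call (assuming this commit):

>>> import mindspore.dataset as ds
>>> # a dict with one key implies one column, but two column names are given
>>> ds.NumpySlicesDataset({"a": [1, 2]}, column_names=["a", "b"])  # raises ValueError: Num of column is 2, but required is 1.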
@@ -12,3 +12,4 @@ setuptools >= 40.8.0
 matplotlib >= 3.1.3 # for ut test
 opencv-python >= 4.2.0.32 # for ut test
 sklearn >= 0.0 # for st test
+pandas >= 1.0.2 # for ut test
@@ -0,0 +1,6 @@
age,sex,height,weight,slope,state,target
65,0,161,45,93,fixed,1
72,1,164,60,86,good,0
45,0,174,70,79,bad,1
73,1,173,65,70,good,1
55,1,182,80,104,good,0
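The "state" column holds strings, so the tests below drop it with df.pop("state") before slicing, presumably because a mixed string/number frame does not convert cleanly into numeric NumPy arrays. A sketch of the load step (path as used in the tests):

>>> import pandas as pd
>>> import mindspore.dataset as ds
>>> df = pd.read_csv("../data/dataset/testNumpySlicesDataset/heart.csv")
>>> df.pop("state")
>>> dataset = ds.NumpySlicesDataset(dict(df), shuffle=False)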
@@ -0,0 +1,201 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import pandas as pd

import mindspore.dataset as de
import mindspore.dataset.transforms.vision.c_transforms as vision
from mindspore import log as logger


def test_numpy_slices_list_1():
    logger.info("Test slicing a 1D list.")

    np_data = [1, 2, 3]
    ds = de.NumpySlicesDataset(np_data, shuffle=False)

    for i, data in enumerate(ds):
        assert data[0] == np_data[i]


def test_numpy_slices_list_2():
    logger.info("Test slicing a 2D list into rows of 1D lists.")

    np_data = [[1, 2], [3, 4]]
    ds = de.NumpySlicesDataset(np_data, column_names=["col1"], shuffle=False)

    for i, data in enumerate(ds):
        assert np.equal(data[0], np_data[i]).all()


def test_numpy_slices_list_3():
    logger.info("Test slicing a list in the first dimension.")

    np_data = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
    ds = de.NumpySlicesDataset(np_data, column_names=["col1"], shuffle=False)

    for i, data in enumerate(ds):
        assert np.equal(data[0], np_data[i]).all()


def test_numpy_slices_list_append():
    logger.info("Test reading data from a list of images.")

    DATA_DIR = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"]
    resize_height, resize_width = 2, 2

    data1 = de.TFRecordDataset(DATA_DIR)
    resize_op = vision.Resize((resize_height, resize_width))
    data1 = data1.map(input_columns=["image"], operations=[vision.Decode(True), resize_op])

    res = []
    for data in data1.create_dict_iterator():
        res.append(data["image"])

    ds = de.NumpySlicesDataset(res, column_names=["col1"], shuffle=False)

    for i, data in enumerate(ds):
        assert np.equal(data, res[i]).all()


def test_numpy_slices_dict_1():
    logger.info("Test dictionary-structured data.")

    np_data = {"a": [1, 2], "b": [3, 4]}
    ds = de.NumpySlicesDataset(np_data, shuffle=False)
    res = [[1, 3], [2, 4]]

    for i, data in enumerate(ds):
        assert data[0] == res[i][0]
        assert data[1] == res[i][1]


def test_numpy_slices_dict_2():
    logger.info("Test input data that is a tuple of dictionaries.")

    data1, data2 = {"a": [1, 2]}, {"b": [3, 4]}
    ds = de.NumpySlicesDataset((data1, data2), column_names=["col1", "col2"], shuffle=False)
    res = [[1, 3], [2, 4]]

    for i, data in enumerate(ds):
        assert data[0] == res[i][0]
        assert data[1] == res[i][1]


def test_numpy_slices_tuple_1():
    logger.info("Test slicing a list of tuples.")

    np_data = [([1, 2], [3, 4]), ([11, 12], [13, 14]), ([21, 22], [23, 24])]
    res = [[[1, 2], [11, 12], [21, 22]], [[3, 4], [13, 14], [23, 24]]]
    ds = de.NumpySlicesDataset(np_data, shuffle=False)

    for i, data in enumerate(ds):
        assert np.equal(data[0], res[i][0]).all()
        assert np.equal(data[1], res[i][1]).all()
        assert np.equal(data[2], res[i][2]).all()

    assert sum([1 for _ in ds]) == 2


def test_numpy_slices_tuple_2():
    logger.info("Test reading tuple data with columns of different dimensions.")
    features, labels = np.random.sample((5, 2)), np.random.sample((5, 1))
    data = (features, labels)

    ds = de.NumpySlicesDataset(data, column_names=["col1", "col2"], shuffle=False)

    for i, data in enumerate(ds):
        assert np.equal(data[0], features[i]).all()
        assert data[1] == labels[i]


def test_numpy_slices_csv_value():
    logger.info("Test loading the values of a csv file.")
    csv_file = "../data/dataset/testNumpySlicesDataset/heart.csv"

    df = pd.read_csv(csv_file)
    target = df.pop("target")
    df.pop("state")
    np_data = (df.values, target.values)

    ds = de.NumpySlicesDataset(np_data, column_names=["col1", "col2"], shuffle=False)

    for i, data in enumerate(ds):
        assert np.equal(np_data[0][i], data[0]).all()
        assert np.equal(np_data[1][i], data[1]).all()


def test_numpy_slices_csv_dict():
    logger.info("Test loading a csv file as a dict.")

    csv_file = "../data/dataset/testNumpySlicesDataset/heart.csv"
    df = pd.read_csv(csv_file)
    df.pop("state")
    res = df.values

    ds = de.NumpySlicesDataset(dict(df), shuffle=False)

    for i, data in enumerate(ds):
        assert np.equal(data, res[i]).all()


def test_numpy_slices_num_samples():
    logger.info("Test num_samples.")

    np_data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]
    ds = de.NumpySlicesDataset(np_data, shuffle=False, num_samples=2)

    for i, data in enumerate(ds):
        assert np.equal(data[0], np_data[i]).all()

    assert sum([1 for _ in ds]) == 2


def test_numpy_slices_distributed_sampler():
    logger.info("Test the distributed sharding arguments.")

    np_data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]
    ds = de.NumpySlicesDataset(np_data, shuffle=False, shard_id=0, num_shards=4)

    for i, data in enumerate(ds):
        assert np.equal(data[0], np_data[i * 4]).all()

    assert sum([1 for _ in ds]) == 2


def test_numpy_slices_sequential_sampler():
    logger.info("Test NumpySlicesDataset with a SequentialSampler and repeat.")

    np_data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]
    ds = de.NumpySlicesDataset(np_data, sampler=de.SequentialSampler()).repeat(2)

    for i, data in enumerate(ds):
        assert np.equal(data[0], np_data[i % 8]).all()


if __name__ == "__main__":
    test_numpy_slices_list_1()
    test_numpy_slices_list_2()
    test_numpy_slices_list_3()
    test_numpy_slices_list_append()
    test_numpy_slices_dict_1()
    test_numpy_slices_dict_2()
    test_numpy_slices_tuple_1()
    test_numpy_slices_tuple_2()
    test_numpy_slices_csv_value()
    test_numpy_slices_csv_dict()
    test_numpy_slices_num_samples()
    test_numpy_slices_distributed_sampler()
    test_numpy_slices_sequential_sampler()