forked from mindspore-Ecosystem/mindspore
add performance test for mindrecord
This commit is contained in:
parent
420ef2a352
commit
819b102ef8
|
@ -118,5 +118,8 @@ def mindrecord_dict_data(task_id):
|
|||
image_file = open(file_name, "rb")
|
||||
image_bytes = image_file.read()
|
||||
image_file.close()
|
||||
if not image_bytes:
|
||||
print("The image file: {} is invalid.".format(file_name))
|
||||
continue
|
||||
data["data"] = image_bytes
|
||||
yield data
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""use ImageNetToMR tool generate mindrecord"""
|
||||
import os
|
||||
from mindspore.mindrecord import ImageNetToMR
|
||||
|
||||
IMAGENET_MAP_FILE = "../../../ut/data/mindrecord/testImageNetDataWhole/labels_map.txt"
|
||||
IMAGENET_IMAGE_DIR = "../../../ut/data/mindrecord/testImageNetDataWhole/images"
|
||||
MINDRECORD_FILE = "./imagenet.mindrecord"
|
||||
PARTITION_NUMBER = 16
|
||||
|
||||
def imagenet_to_mindrecord():
|
||||
imagenet_transformer = ImageNetToMR(IMAGENET_MAP_FILE,
|
||||
IMAGENET_IMAGE_DIR,
|
||||
MINDRECORD_FILE,
|
||||
PARTITION_NUMBER)
|
||||
imagenet_transformer.transform()
|
||||
|
||||
if __name__ == '__main__':
|
||||
imagenet_to_mindrecord()
|
|
@ -0,0 +1,113 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""generate tfrecord"""
|
||||
import collections
|
||||
import os
|
||||
import tensorflow as tf
|
||||
|
||||
IMAGENET_MAP_FILE = "../../../ut/data/mindrecord/testImageNetDataWhole/labels_map.txt"
|
||||
IMAGENET_IMAGE_DIR = "../../../ut/data/mindrecord/testImageNetDataWhole/images"
|
||||
TFRECORD_FILE = "./imagenet.tfrecord"
|
||||
PARTITION_NUMBER = 16
|
||||
|
||||
def get_imagenet_filename_label_pic(map_file, image_dir):
|
||||
"""
|
||||
Get data from imagenet.
|
||||
|
||||
Yields:
|
||||
filename, label, image_bytes
|
||||
"""
|
||||
if not os.path.exists(map_file):
|
||||
raise IOError("map file {} not exists".format(map_file))
|
||||
|
||||
label_dict = {}
|
||||
with open(map_file) as fp:
|
||||
line = fp.readline()
|
||||
while line:
|
||||
labels = line.split(" ")
|
||||
label_dict[labels[1]] = labels[0]
|
||||
line = fp.readline()
|
||||
|
||||
# get all the dir which are n02087046, n02094114, n02109525
|
||||
dir_paths = {}
|
||||
for item in label_dict:
|
||||
real_path = os.path.join(image_dir, label_dict[item])
|
||||
if not os.path.isdir(real_path):
|
||||
print("{} dir is not exist".format(real_path))
|
||||
continue
|
||||
dir_paths[item] = real_path
|
||||
|
||||
if not dir_paths:
|
||||
raise PathNotExistsError("not valid image dir in {}".format(image_dir))
|
||||
|
||||
# get the filename, label and image binary as a dict
|
||||
for label in dir_paths:
|
||||
for item in os.listdir(dir_paths[label]):
|
||||
file_name = os.path.join(dir_paths[label], item)
|
||||
if not item.endswith("JPEG") and not item.endswith("jpg"):
|
||||
print("{} file is not suffix with JPEG/jpg, skip it.".format(file_name))
|
||||
continue
|
||||
|
||||
# get the image data
|
||||
image_file = open(file_name, "rb")
|
||||
image_bytes = image_file.read()
|
||||
image_file.close()
|
||||
if not image_bytes:
|
||||
print("The image file: {} is invalid.".format(file_name))
|
||||
continue
|
||||
yield str(file_name), int(label), image_bytes
|
||||
|
||||
def create_int_feature(values):
|
||||
feature = tf.train.Feature(int64_list=tf.train.Int64List(value=[values]))
|
||||
return feature
|
||||
|
||||
def create_string_feature(values):
|
||||
feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[bytes(values, encoding='utf-8')]))
|
||||
return feature
|
||||
|
||||
def create_bytes_feature(values):
|
||||
feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))
|
||||
return feature
|
||||
|
||||
def imagenet_to_tfrecord():
|
||||
writers = []
|
||||
for i in range(PARTITION_NUMBER):
|
||||
output_file = TFRECORD_FILE + str(i).rjust(2, '0')
|
||||
writers.append(tf.io.TFRecordWriter(output_file))
|
||||
|
||||
writer_index = 0
|
||||
total_written = 0
|
||||
|
||||
for file_name, label, image_bytes in get_imagenet_filename_label_pic(IMAGENET_MAP_FILE,
|
||||
IMAGENET_IMAGE_DIR):
|
||||
features = collections.OrderedDict()
|
||||
features["file_name"] = create_string_feature(file_name)
|
||||
features["label"] = create_int_feature(label)
|
||||
features["data"] = create_bytes_feature(image_bytes)
|
||||
|
||||
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
|
||||
|
||||
writers[writer_index].write(tf_example.SerializeToString())
|
||||
writer_index = (writer_index + 1) % len(writers)
|
||||
|
||||
total_written += 1
|
||||
|
||||
for writer in writers:
|
||||
writer.close()
|
||||
|
||||
print("Write {} total examples".format(total_written))
|
||||
|
||||
if __name__ == '__main__':
|
||||
imagenet_to_tfrecord()
|
|
@ -0,0 +1,106 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""test dataset performance about mindspore.MindDataset, mindspore.TFRecordDataset, tf.data.TFRecordDataset"""
|
||||
import time
|
||||
import mindspore.dataset as ds
|
||||
from mindspore.mindrecord import FileReader
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
print_step = 5000
|
||||
|
||||
def print_log(count):
|
||||
if count % print_step == 0:
|
||||
print("Read {} rows ...".format(count))
|
||||
|
||||
def use_filereader(mindrecord):
|
||||
start = time.time()
|
||||
columns_list = ["data", "label"]
|
||||
reader = FileReader(file_name=mindrecord,
|
||||
num_consumer=4,
|
||||
columns=columns_list)
|
||||
num_iter = 0
|
||||
for index, item in enumerate(reader.get_next()):
|
||||
num_iter += 1
|
||||
print_log(num_iter)
|
||||
end = time.time()
|
||||
print("Read by FileReader - total rows: {}, cost time: {}s".format(num_iter, end - start))
|
||||
|
||||
def use_minddataset(mindrecord):
|
||||
start = time.time()
|
||||
columns_list = ["data", "label"]
|
||||
data_set = ds.MindDataset(dataset_file=mindrecord,
|
||||
columns_list=columns_list,
|
||||
num_parallel_workers=4)
|
||||
num_iter = 0
|
||||
for item in data_set.create_dict_iterator():
|
||||
num_iter += 1
|
||||
print_log(num_iter)
|
||||
end = time.time()
|
||||
print("Read by MindDataset - total rows: {}, cost time: {}s".format(num_iter, end - start))
|
||||
|
||||
def use_tfrecorddataset(tfrecord):
|
||||
start = time.time()
|
||||
columns_list = ["data", "label"]
|
||||
data_set = ds.TFRecordDataset(dataset_files=tfrecord,
|
||||
columns_list=columns_list,
|
||||
num_parallel_workers=4,
|
||||
shuffle=ds.Shuffle.GLOBAL)
|
||||
data_set = data_set.shuffle(10000)
|
||||
num_iter = 0
|
||||
for item in data_set.create_dict_iterator():
|
||||
num_iter += 1
|
||||
print_log(num_iter)
|
||||
end = time.time()
|
||||
print("Read by TFRecordDataset - total rows: {}, cost time: {}s".format(num_iter, end - start))
|
||||
|
||||
def use_tensorflow_tfrecorddataset(tfrecord):
|
||||
start = time.time()
|
||||
def _parse_record(example_photo):
|
||||
features = {
|
||||
'file_name': tf.io.FixedLenFeature([], tf.string),
|
||||
'label': tf.io.FixedLenFeature([1], tf.int64),
|
||||
'data': tf.io.FixedLenFeature([], tf.string)}
|
||||
parsed_features = tf.io.parse_single_example(example_photo, features=features)
|
||||
return parsed_features
|
||||
|
||||
data_set = tf.data.TFRecordDataset(filenames=tfrecord,
|
||||
buffer_size=100000,
|
||||
num_parallel_reads=4)
|
||||
data_set = data_set.map(_parse_record, num_parallel_calls=4)
|
||||
num_iter = 0
|
||||
for item in data_set.__iter__():
|
||||
num_iter += 1
|
||||
print_log(num_iter)
|
||||
end = time.time()
|
||||
print("Read by TensorFlow TFRecordDataset - total rows: {}, cost time: {}s".format(num_iter, end - start))
|
||||
|
||||
if __name__ == '__main__':
|
||||
# use MindDataset
|
||||
mindrecord = './imagenet.mindrecord00'
|
||||
use_minddataset(mindrecord)
|
||||
|
||||
# use TFRecordDataset
|
||||
tfrecord = ['imagenet.tfrecord00', 'imagenet.tfrecord01', 'imagenet.tfrecord02', 'imagenet.tfrecord03',
|
||||
'imagenet.tfrecord04', 'imagenet.tfrecord05', 'imagenet.tfrecord06', 'imagenet.tfrecord07',
|
||||
'imagenet.tfrecord08', 'imagenet.tfrecord09', 'imagenet.tfrecord10', 'imagenet.tfrecord11',
|
||||
'imagenet.tfrecord12', 'imagenet.tfrecord13', 'imagenet.tfrecord14', 'imagenet.tfrecord15']
|
||||
use_tfrecorddataset(tfrecord)
|
||||
|
||||
# use TensorFlow TFRecordDataset
|
||||
use_tensorflow_tfrecorddataset(tfrecord)
|
||||
|
||||
# use FileReader
|
||||
# use_filereader(mindrecord)
|
|
@ -0,0 +1,18 @@
|
|||
{
|
||||
"datasetType": "TF",
|
||||
"numRows": 930059,
|
||||
"columns": {
|
||||
"file_name": {
|
||||
"type": "uint8",
|
||||
"rank": 0
|
||||
},
|
||||
"label": {
|
||||
"type": "int64",
|
||||
"rank": 0
|
||||
},
|
||||
"data": {
|
||||
"type": "uint8",
|
||||
"rank": 0
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue