!743 add imagenet perf test for mindrecord

Merge pull request !743 from guozhijian/add_perf_test_for_mindrecord
2020-04-28 10:18:10 +08:00 · 2020-04-28 10:18:10 +08:00 · 728876fc75
parent 40f9d6fcd2 819b102ef8
commit 728876fc75
5 changed files with 272 additions and 0 deletions
--- a/example/convert_to_mindrecord/imagenet/mr_api.py
+++ b/example/convert_to_mindrecord/imagenet/mr_api.py
@ -118,5 +118,8 @@ def mindrecord_dict_data(task_id):
        image_file = open(file_name, "rb")
        image_bytes = image_file.read()
        image_file.close()
        if not image_bytes:
            print("The image file: {} is invalid.".format(file_name))
            continue
        data["data"] = image_bytes
        yield data
--- a/tests/perf_test/mindrecord/imagenet/imagenet_to_mindrecord.py
+++ b/tests/perf_test/mindrecord/imagenet/imagenet_to_mindrecord.py
@ -0,0 +1,32 @@
 # Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """use ImageNetToMR tool generate mindrecord"""
 import os
 from mindspore.mindrecord import ImageNetToMR
 IMAGENET_MAP_FILE = "../../../ut/data/mindrecord/testImageNetDataWhole/labels_map.txt"
 IMAGENET_IMAGE_DIR = "../../../ut/data/mindrecord/testImageNetDataWhole/images"
 MINDRECORD_FILE = "./imagenet.mindrecord"
 PARTITION_NUMBER = 16
 def imagenet_to_mindrecord():
    imagenet_transformer = ImageNetToMR(IMAGENET_MAP_FILE,
                                        IMAGENET_IMAGE_DIR,
                                        MINDRECORD_FILE,
                                        PARTITION_NUMBER)
    imagenet_transformer.transform()
 if __name__ == '__main__':
    imagenet_to_mindrecord()
--- a/tests/perf_test/mindrecord/imagenet/imagenet_to_tfrecord.py
+++ b/tests/perf_test/mindrecord/imagenet/imagenet_to_tfrecord.py
@ -0,0 +1,113 @@
 # Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """generate tfrecord"""
 import collections
 import os
 import tensorflow as tf
 IMAGENET_MAP_FILE = "../../../ut/data/mindrecord/testImageNetDataWhole/labels_map.txt"
 IMAGENET_IMAGE_DIR = "../../../ut/data/mindrecord/testImageNetDataWhole/images"
 TFRECORD_FILE = "./imagenet.tfrecord"
 PARTITION_NUMBER = 16
 def get_imagenet_filename_label_pic(map_file, image_dir):
    """
    Get data from imagenet.
    Yields:
        filename, label, image_bytes
    """
    if not os.path.exists(map_file):
        raise IOError("map file {} not exists".format(map_file))
    label_dict = {}
    with open(map_file) as fp:
        line = fp.readline()
        while line:
            labels = line.split(" ")
            label_dict[labels[1]] = labels[0]
            line = fp.readline()
    # get all the dir which are n02087046, n02094114, n02109525
    dir_paths = {}
    for item in label_dict:
        real_path = os.path.join(image_dir, label_dict[item])
        if not os.path.isdir(real_path):
            print("{} dir is not exist".format(real_path))
            continue
        dir_paths[item] = real_path
    if not dir_paths:
        raise PathNotExistsError("not valid image dir in {}".format(image_dir))
    # get the filename, label and image binary as a dict
    for label in dir_paths:
        for item in os.listdir(dir_paths[label]):
            file_name = os.path.join(dir_paths[label], item)
            if not item.endswith("JPEG") and not item.endswith("jpg"):
                print("{} file is not suffix with JPEG/jpg, skip it.".format(file_name))
                continue
            # get the image data
            image_file = open(file_name, "rb")
            image_bytes = image_file.read()
            image_file.close()
            if not image_bytes:
                print("The image file: {} is invalid.".format(file_name))
                continue
            yield str(file_name), int(label), image_bytes
 def create_int_feature(values):
    feature = tf.train.Feature(int64_list=tf.train.Int64List(value=[values]))
    return feature
 def create_string_feature(values):
    feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[bytes(values, encoding='utf-8')]))
    return feature
 def create_bytes_feature(values):
    feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))
    return feature
 def imagenet_to_tfrecord():
    writers = []
    for i in range(PARTITION_NUMBER):
        output_file = TFRECORD_FILE + str(i).rjust(2, '0')
        writers.append(tf.io.TFRecordWriter(output_file))
    writer_index = 0
    total_written = 0
    for file_name, label, image_bytes in get_imagenet_filename_label_pic(IMAGENET_MAP_FILE,
                                                                         IMAGENET_IMAGE_DIR):
        features = collections.OrderedDict()
        features["file_name"] = create_string_feature(file_name)
        features["label"] = create_int_feature(label)
        features["data"] = create_bytes_feature(image_bytes)
        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writers[writer_index].write(tf_example.SerializeToString())
        writer_index = (writer_index + 1) % len(writers)
        total_written += 1
    for writer in writers:
        writer.close()
    print("Write {} total examples".format(total_written))
 if __name__ == '__main__':
    imagenet_to_tfrecord()
--- a/tests/perf_test/mindrecord/imagenet/perf_read_imagenet.py
+++ b/tests/perf_test/mindrecord/imagenet/perf_read_imagenet.py
@ -0,0 +1,106 @@
 # Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """test dataset performance about mindspore.MindDataset, mindspore.TFRecordDataset, tf.data.TFRecordDataset"""
 import time
 import mindspore.dataset as ds
 from mindspore.mindrecord import FileReader
 import tensorflow as tf
 print_step = 5000
 def print_log(count):
    if count % print_step == 0:
        print("Read {} rows ...".format(count))
 def use_filereader(mindrecord):
    start = time.time()
    columns_list = ["data", "label"]
    reader = FileReader(file_name=mindrecord,
                        num_consumer=4,
                        columns=columns_list)
    num_iter = 0
    for index, item in enumerate(reader.get_next()):
        num_iter += 1
        print_log(num_iter)
    end = time.time()
    print("Read by FileReader - total rows: {}, cost time: {}s".format(num_iter, end - start))
 def use_minddataset(mindrecord):
    start = time.time()
    columns_list = ["data", "label"]
    data_set = ds.MindDataset(dataset_file=mindrecord,
                              columns_list=columns_list,
                              num_parallel_workers=4)
    num_iter = 0
    for item in data_set.create_dict_iterator():
        num_iter += 1
        print_log(num_iter)
    end = time.time()
    print("Read by MindDataset - total rows: {}, cost time: {}s".format(num_iter, end - start))
 def use_tfrecorddataset(tfrecord):
    start = time.time()
    columns_list = ["data", "label"]
    data_set = ds.TFRecordDataset(dataset_files=tfrecord,
                                  columns_list=columns_list,
                                  num_parallel_workers=4,
                                  shuffle=ds.Shuffle.GLOBAL)
    data_set = data_set.shuffle(10000)
    num_iter = 0
    for item in data_set.create_dict_iterator():
        num_iter += 1
        print_log(num_iter)
    end = time.time()
    print("Read by TFRecordDataset - total rows: {}, cost time: {}s".format(num_iter, end - start))
 def use_tensorflow_tfrecorddataset(tfrecord):
    start = time.time()
    def _parse_record(example_photo):
        features = {
            'file_name': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([1], tf.int64),
            'data': tf.io.FixedLenFeature([], tf.string)}
        parsed_features = tf.io.parse_single_example(example_photo, features=features)
        return parsed_features
    data_set = tf.data.TFRecordDataset(filenames=tfrecord,
                                       buffer_size=100000,
                                       num_parallel_reads=4)
    data_set = data_set.map(_parse_record, num_parallel_calls=4)
    num_iter = 0
    for item in data_set.__iter__():
        num_iter += 1
        print_log(num_iter)
    end = time.time()
    print("Read by TensorFlow TFRecordDataset - total rows: {}, cost time: {}s".format(num_iter, end - start))
 if __name__ == '__main__':
    # use MindDataset
    mindrecord = './imagenet.mindrecord00'
    use_minddataset(mindrecord)
    # use TFRecordDataset
    tfrecord = ['imagenet.tfrecord00', 'imagenet.tfrecord01', 'imagenet.tfrecord02', 'imagenet.tfrecord03',
                'imagenet.tfrecord04', 'imagenet.tfrecord05', 'imagenet.tfrecord06', 'imagenet.tfrecord07',
                'imagenet.tfrecord08', 'imagenet.tfrecord09', 'imagenet.tfrecord10', 'imagenet.tfrecord11',
                'imagenet.tfrecord12', 'imagenet.tfrecord13', 'imagenet.tfrecord14', 'imagenet.tfrecord15']
    use_tfrecorddataset(tfrecord)
    # use TensorFlow TFRecordDataset
    use_tensorflow_tfrecorddataset(tfrecord)
    # use FileReader
    # use_filereader(mindrecord)
--- a/tests/perf_test/mindrecord/imagenet/schema.json
+++ b/tests/perf_test/mindrecord/imagenet/schema.json
@ -0,0 +1,18 @@
 {
  "datasetType": "TF",
  "numRows": 930059,
  "columns": {
    "file_name": {
      "type": "uint8",
      "rank": 0
    },
    "label": {
      "type": "int64",
      "rank": 0
    },
    "data": {
      "type": "uint8",
      "rank": 0
    }
  }
 }